NCBI C++ ToolKit
Search_func.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: Search_func.cpp 95024 2021-09-28 18:06:16Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: J. Chen
27  *
28  * File Description:
29  * Evaluate string match
30  *
31  * Remark:
32  * This code was originally generated by application DATATOOL
33  * using the following specifications:
34  * 'macro.asn'.
35  */
36 
37 #include <ncbi_pch.hpp>
40 
// Character sets shared by the string-matching helpers in this file.
// NOTE: this file is included in macro__.cpp (sic!), so these constants are
// visible to the rest of that translation unit as well.
namespace
{
    // All decimal digits.
    const char* digit_str = "0123456789";
    // All ASCII letters, lower case first, then upper case.
    const char* alpha_str = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
}
47 
49 BEGIN_objects_SCOPE // namespace ncbi::objects::
50 
51 
52 bool CSearch_func::Empty() const
53 {
54  switch (Which()) {
56  return GetString_constraint().Empty();
58  return (GetPrefix_and_numbers().empty());
59  default: return false;
60  }
61  return false;
62 };
63 
64 
65 bool CSearch_func::x_DoesStrContainPlural(const string& word, char last_letter, char second_to_last_letter, char next_letter) const
66 {
67  size_t len = word.size();
68  if (last_letter == 's') {
69  if (len >= 5 && word.substr(len-5) == "trans") {
70  return false; // not plural;
71  }
72  else if (len > 3) {
73  if (second_to_last_letter != 's'
74  && second_to_last_letter != 'i'
75  && second_to_last_letter != 'u'
76  && next_letter == ',') {
77  return true;
78  }
79  }
80  }
81 
82  return false;
83 };
84 
85 
87 {
88  char last_letter, second_to_last_letter, next_letter;
89  bool may_contain_plural = false;
90  string word_skip = " ,";
91  size_t len;
92 
93  if (str.empty()) {
94  return false;
95  }
96  vector <string> arr;
98  if (arr.size() == 1) { // doesn't have ', ', or the last char is ', '
99  len = arr[0].size();
100  if (len == 1) {
101  return false;
102  }
103  last_letter = arr[0][len-1];
104  second_to_last_letter = arr[0][len-2];
105  next_letter = (len == str.size()) ? ',' : str[len];
106  may_contain_plural = x_DoesStrContainPlural(arr[0], last_letter, second_to_last_letter, next_letter);
107  }
108  else {
109  string strtmp(str);
110  size_t pos;
111  vector <string>::const_iterator jt;
112  ITERATE (vector <string>, it, arr) {
113  pos = strtmp.find(*it);
114  len = (*it).size();
115  if (len == 1) {
116  strtmp = strtmp.substr(pos+len);
118  continue;
119  }
120  last_letter = (*it)[len-1];
121  second_to_last_letter = (*it)[len-2];
122  next_letter = (len == strtmp.size()) ? ',' : strtmp[pos+len];
123  may_contain_plural = x_DoesStrContainPlural(*it, last_letter, second_to_last_letter, next_letter);
124  if (may_contain_plural) {
125  break;
126  }
127  jt = it;
128  if (++jt != arr.end()) { // not jt++
129  strtmp = strtmp.substr(strtmp.find(*jt));
130  }
131  }
132  }
133  return may_contain_plural;
134 };
135 
136 
137 char CSearch_func::x_GetClose(char bp) const
138 {
139  if (bp == '(') return ')';
140  else if (bp == '[') return ']';
141  else if (bp == '{') return '}';
142  else return bp;
143 };
144 
145 
// Bracketed / parenthesized tokens that x_SkipBracketOrParen() steps over
// instead of treating them as a real bracket pair.
static const char* skip_bracket_paren[] = {
    "(NAD(P)H)",
    "(NAD(P))",
    "(I)",
    "(II)",
    "(III)",
    "(NADPH)",
    "(NAD+)",
    "(NAPPH/NADH)",   // legacy spelling, kept so existing matches do not change
    "(NADPH/NADH)",   // correctly spelled cofactor pair
    "(NADP+)",
    "[acyl-carrier protein]",
    "[acyl-carrier-protein]",
    "(acyl carrier protein)"
};
160 
161 
162 bool CSearch_func::x_SkipBracketOrParen(size_t idx, string& start) const
163 {
164  bool rval = false;
165  size_t ep, ns;
166 
167  if (idx > 2 && start.substr(idx-3, 6) == "NAD(P)") {
168  rval = true;
169  start = start.substr(idx + 3);
170  }
171  else {
172  size_t len;
173  for (size_t i = 0; i < ArraySize(skip_bracket_paren); i++) {
174  len = strlen(skip_bracket_paren[i]);
175  if (start.substr(idx, len) == skip_bracket_paren[i]) {
176  start = start.substr(idx + len);
177  rval = true;
178  break;
179  }
180  }
181  if (!rval) {
182  ns = start.find(start[idx], idx+1);
183  ep = start.find(x_GetClose(start[idx]), idx+1);
184  if (ep != string::npos && (ns == string::npos || ns > ep)) {
185  if (ep - idx < 5) {
186  rval = true;
187  start = start.substr(ep+1);
188  }
189  else if (ep - idx > 3 && start.substr(ep - 3, 3) == "ing") {
190  rval = true;
191  start = start.substr(ep + 1);
192  }
193  }
194  }
195  }
196  return rval;
197 };
198 
199 
201 {
202  size_t idx, end;
203  int num_found = 0;
204  string open_bp("(["), sch_src(str);
205 
206  if (sch_src.empty()) {
207  return false;
208  }
209 
210  idx = sch_src.find_first_of(open_bp);
211  while (idx != string::npos && num_found < n) {
212  end = sch_src.find(x_GetClose(sch_src[idx]), idx);
213  if (x_SkipBracketOrParen (idx, sch_src)) { // ignore it
214  idx = sch_src.find_first_of(open_bp);
215  }
216  else if (end == string::npos) { // skip, doesn't close the bracket
217  idx = sch_src.find_first_of(open_bp, idx+1);
218  }
219  else {
220  idx = sch_src.find_first_of(open_bp, end);
221  num_found++;
222  }
223  }
224 
225  if (num_found >= n) {
226  return true;
227  }
228  else {
229  return false;
230  }
231 };
232 
233 
// Tokens tested by x_PrecededByOkPrefix(): a string that ends with one of
// these is reported as having an acceptable prefix.
static const char* ok_num_prefix[] = {
    "DUF", "UPF", "IS", "TIGR", "UCP", "PUF", "CHP"
};
243 
244 
245 bool CSearch_func::x_PrecededByOkPrefix (const string& start_str) const
246 {
247  size_t len_str = start_str.size();
248  size_t len_i;
249  for (size_t i = 0; i < ArraySize(ok_num_prefix); i++) {
250  len_i = string(ok_num_prefix[i]).size();
251  if (len_str >= len_i && (start_str.substr(len_str-len_i) == ok_num_prefix[i])) {
252  return true;
253  }
254  }
255  return false;
256 };
257 
258 
259 bool CSearch_func::x_InWordBeforeCytochromeOrCoenzyme(const string& start_str) const
260 {
261  if (start_str.empty()) {
262  return false;
263  }
264  size_t pos = start_str.find_last_of(' ');
265  if (pos != string::npos) {
266  string strtmp = start_str.substr(0, pos);
267  pos = strtmp.find_last_not_of(' ');
268  if (pos != string::npos) {
269  if (strtmp.size() >= 8 && (NStr::EndsWith(strtmp, "cytochrome", NStr::eNocase) ||
270  NStr::EndsWith(strtmp, "coenzyme", NStr::eNocase))) {
271  return true;
272  }
273  }
274  }
275  return false;
276 };
277 
278 
279 bool CSearch_func::x_FollowedByFamily(string& after_str) const
280 {
281  size_t pos = after_str.find_first_of(' ');
282  if (pos != string::npos) {
283  after_str = after_str.substr(pos+1);
284  if (NStr::EqualNocase(after_str, 0, 6, "family")) {
285  if (after_str.size() >= 7) {
286  after_str = after_str.substr(7);
287  } else {
288  after_str.clear();
289  }
290  return true;
291  }
292  }
293 
294  return false;
295 };
296 
297 
299 {
300  size_t p=0, p2;
301  unsigned num_digits = 0;
302  string sch_str(str), strtmp;
303 
304  while (!sch_str.empty()) {
305  p = sch_str.find_first_of(digit_str);
306  if (p == string::npos) {
307  break;
308  }
309  strtmp = sch_str.substr(0, p);
311  p2 = sch_str.find_first_not_of(digit_str, p+1);
312  if (p2 != string::npos) {
313  sch_str = sch_str.substr(p2);
314  num_digits = 0;
315  }
316  else break;
317  }
318  else {
319  num_digits ++;
320  if (num_digits == 3) {
321  sch_str = sch_str.substr(p+1);
322  if (x_FollowedByFamily(sch_str)) {
323  num_digits = 0;
324  }
325  else return true;
326  }
327  if (p < sch_str.size() - 1) {
328  if (!sch_str.empty()) {
329  sch_str = sch_str.substr(p+1);
330  }
331  if (sch_str.empty() || !isdigit(sch_str.front())) {
332  num_digits = 0;
333  }
334  }
335  else break;
336  }
337  }
338  return false;
339 };
340 
341 
343 {
344  if (str.find('_') == string::npos) {
345  return false;
346  }
347 
348  string strtmp;
349  vector <string> arr;
350  arr = NStr::Split(str, "_", arr, 0);
351  for (unsigned i=0; i< arr.size() - 1; i++) {
352  strtmp = arr[i+1];
353  // strtmp was changed in the FollowedByFamily
354  if (x_FollowedByFamily(strtmp)) {
355  continue;
356  }
357  else if (arr[i].size() < 3 || str[arr[i].size()-1] == ' ') {
358  return true;
359  }
360  else {
361  strtmp = arr[i].substr(arr[i].size()-3);
362  if ( (strtmp == "MFS" || strtmp == "TPR" || strtmp == "AAA") && (isdigit(arr[i+1][0]) && !isdigit(arr[i+1][1])) ) {
363  continue;
364  }
365  else return true;
366  }
367  }
368 
369  return false;
370 };
371 
372 
373 bool CSearch_func::x_IsPrefixPlusNumbers(const string& str, const string& prefix) const
374 {
375  if (str.empty()) return false;
376 
377  size_t pattern_len = prefix.size();
378  if (pattern_len > 0 && !NStr::EqualCase(str, 0, pattern_len, prefix)) {
379  return false;
380  }
381 
382  size_t digit_len = str.find_first_not_of(digit_str, pattern_len);
383  if (digit_len != string::npos && digit_len == str.size()) {
384  return true;
385  }
386  else return false;
387 };
388 
389 
390 bool CSearch_func::x_IsPropClose(const string& str, char open_p) const
391 {
392  if (str.empty()) return false;
393  else if (str[str.size()-1] != open_p) return false;
394  else return true;
395 };
396 
397 
399 {
400  size_t pos = 0;
401  char ch_src;
402  string strtmp, sch_src(str);
403  while (!sch_src.empty()) {
404  pos = sch_src.find_first_of("()[]");
405  if (pos == string::npos) {
406  if (strtmp.empty()) {
407  return false;
408  }
409  else return true;
410  }
411  else {
412  ch_src = sch_src[pos];
413  if (ch_src == '(' || ch_src == '[') {
414  strtmp += ch_src;
415  }
416  else if (sch_src[pos] == ')') {
417  if (!x_IsPropClose(strtmp, '(')) {
418  return true;
419  }
420  else {
421  strtmp = strtmp.substr(0, strtmp.size()-1);
422  }
423  }
424  else if (sch_src[pos] == ']') {
425  if (!x_IsPropClose(strtmp, '[')) {
426  return true;
427  }
428  else {
429  strtmp = strtmp.substr(0, strtmp.size()-1);
430  }
431  }
432  }
433  sch_src = (pos < sch_src.size()-1) ? sch_src.substr(pos+1) : kEmptyStr;
434  }
435 
436  if (strtmp.empty()) {
437  return false;
438  }
439  else return true;
440 };
441 
442 
443 bool CSearch_func::x_ProductContainsTerm(const string& str, const string& pattern) const
444 {
445  // don't bother searching for c-term or n-term if product name contains "domain"
446  if (NStr::FindNoCase(str, "domain") != string::npos) {
447  return false;
448  }
449 
450  size_t pos = NStr::FindNoCase(str, pattern);
451  // c-term and n-term must be either first word or separated from other word
452  // by space, num, or punct
453  if (pos != string::npos && (!pos || !isalpha (str[pos-1]))) {
454  return true;
455  }
456  else return false;
457 }
458 
459 
461 {
462  const string& orig = str;
463 
464  switch (Which()){
465  case e_String_constraint:
466  {
467  const CString_constraint& str_cons = GetString_constraint();
468  return str_cons.Match(str);
469  }
470  case e_Contains_plural:
474  case e_Three_numbers:
476  case e_Underscore:
480  case e_All_caps:
481  if (orig.find_first_not_of(alpha_str) != string::npos) {
482  return false;
483  }
484  if (orig == str.original().uppercase()) {
485  return true;
486  }
487  else return false;
488  case e_Unbalanced_paren:
490  case e_Too_long:
491  if (NStr::FindNoCase (orig, "bifunctional") == string::npos && NStr::FindNoCase (orig, "multifunctional") == string::npos && orig.size() > (unsigned) GetToo_long()) {
492  return true;
493  }
494  else return false;
495  case e_Has_term:
497  default: break;
498  }
499  return false;
500 };
501 
502 
503 
505 {
506  switch (Which()) {
507  case e_String_constraint:
508  {
509  const CString_constraint& constr = GetString_constraint();
510  if (constr.IsSetMatch_text()) {
511  if (constr.GetNot_present()) {
512  NCBI_THROW(CException, eUnknown, "GetRegex is not implemented for NOT-PRESENT: " + constr.GetMatch_text());
513  }
515  if (constr.IsSetIgnore_words() && constr.GetIgnore_words().IsSet()) {
516  for (auto words : constr.GetIgnore_words().Get()) {
517  if (words->IsSetSynonyms()) {
518  for (auto syn : words->GetSynonyms()) {
520  }
521  }
522  }
523  }
524  str = "/" + str + "/i";
525  return str;
526  }
527  else {
528  NCBI_THROW(CException, eUnknown, "GetRegex Match text is not set");
529  }
530  }
531  case e_Contains_plural:
532  return "/[A-Za-hj-rtv-z]s\\b/";
533  //case e_N_or_more_brackets_or_parentheses:
534  case e_Three_numbers:
535  return "/\\d\\d\\d/";
536  case e_Underscore:
537  return "/_/";
539  return "/^" + CMultipatternSearch::QuoteString(GetPrefix_and_numbers()) + "\\d+$/";
540  case e_All_caps:
541  return "/^[A-Z]+$/";
542  case e_Unbalanced_paren:
543  return "/[\\(\\)\\[\\]]/";
544  case e_Too_long:
545  return "/.$/";
546  //case e_Has_term:
547  default: break;
548  }
549  NCBI_THROW(CException, eUnknown, "GetRegex is not implemented for subtype: " + NStr::IntToString(Which()));
550  return "";
551 }
552 
553 
554 END_objects_SCOPE // namespace ncbi::objects::
555 
557 
558 /* Original file checksum: lines: 57, chars: 1726, CRC32: 9fb286c1 */
static const char * ok_num_prefix[]
static const char * skip_bracket_paren[]
User-defined methods of the data storage class.
static string QuoteString(const string &str)
Quote special characters to insert string into regular expression.
bool x_ProductContainsTerm(const string &str, const string &pattern) const
char x_GetClose(char bp) const
bool x_IsPrefixPlusNumbers(const string &str, const string &prefix) const
bool Match(const CMatchString &str) const
bool x_StringMayContainPlural(const string &str) const
Definition: Search_func.cpp:86
bool Empty() const
Definition: Search_func.cpp:52
bool x_DoesStrContainPlural(const string &word, char last_letter, char second_to_last_letter, char next_letter) const
Definition: Search_func.cpp:65
bool x_ContainsNorMoreSetsOfBracketsOrParentheses(const string &str, const int &n) const
bool x_SkipBracketOrParen(size_t idx, string &start) const
bool x_ContainsThreeOrMoreNumbersTogether(const string &str) const
bool x_StringContainsUnbalancedParentheses(const string &str) const
bool x_InWordBeforeCytochromeOrCoenzyme(const string &start_str) const
bool x_IsPropClose(const string &str, char open_p) const
string GetRegex() const
bool x_StringContainsUnderscore(const string &str) const
bool x_PrecededByOkPrefix(const string &start_str) const
bool x_FollowedByFamily(string &after_str) const
bool Match(const CMatchString &str) const
static const char * str(char *buf, int n)
Definition: stats.c:84
constexpr size_t ArraySize(const Element(&)[Size])
Definition: ncbimisc.hpp:1532
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
string
Definition: cgiapp.hpp:687
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define kEmptyStr
Definition: ncbistr.hpp:123
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2993
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5430
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
static bool EqualCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive equality of a substring with another string.
Definition: ncbistr.hpp:5325
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5353
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
Definition: ncbistr.cpp:3186
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2508
@ eTrunc_Begin
Truncate leading spaces only.
Definition: ncbistr.hpp:2240
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
TToo_long GetToo_long(void) const
Get the variant data.
const TIgnore_words & GetIgnore_words(void) const
Get the Ignore_words member data.
const TPrefix_and_numbers & GetPrefix_and_numbers(void) const
Get the variant data.
TN_or_more_brackets_or_parentheses GetN_or_more_brackets_or_parentheses(void) const
Get the variant data.
const TMatch_text & GetMatch_text(void) const
Get the Match_text member data.
TNot_present GetNot_present(void) const
Get the Not_present member data.
const THas_term & GetHas_term(void) const
Get the variant data.
const Tdata & Get(void) const
Get the member data.
const TString_constraint & GetString_constraint(void) const
Get the variant data.
bool IsSet(void) const
Check if a value has been assigned to data member.
E_Choice Which(void) const
Which variant is currently selected.
bool IsSetMatch_text(void) const
Check if a value has been assigned to Match_text data member.
bool IsSetIgnore_words(void) const
Check if a value has been assigned to Ignore_words data member.
@ e_N_or_more_brackets_or_parentheses
where boath are integers</td > n< td ></td > n</tr > n< tr > n< td > tse</td > n< td > optional</td > n< td > String</td > n< td class=\"description\"> TSE option controls what blob is orig
int i
yy_size_t n
int len
Simultaneous search of multiple RegEx patterns in the input string.
constexpr bool empty(list< Ts... >) noexcept
const struct ncbi::grid::netcache::search::fields::SIZE size
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
static const char * prefix[]
Definition: pcregrep.c:405
Modified on Thu Apr 11 15:07:24 2024 by modify_doxy.py rev. 669887