NCBI C++ ToolKit
objutil.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: objutil.cpp 100324 2023-07-20 14:30:16Z vasilche $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Mati Shomrat, NCBI
27 *
28 * File Description:
29 * shared utility functions
30 *
31 */
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistd.hpp>
34 
35 #include <util/strsearch.hpp>
36 
37 #include <objects/general/Date.hpp>
41 #include <objects/general/Date.hpp>
42 #include <objects/seq/Bioseq.hpp>
43 #include <objects/seq/Seq_inst.hpp>
44 #include <objects/seq/Seq_ext.hpp>
48 #include <objects/seq/MolInfo.hpp>
51 #include <objmgr/scope.hpp>
52 #include <objmgr/bioseq_handle.hpp>
53 #include <objmgr/seqdesc_ci.hpp>
55 #include <objmgr/util/sequence.hpp>
57 #include <algorithm>
58 #include <objmgr/util/objutil.hpp>
59 
60 
63 
64 
65 SAFE_CONST_STATIC_STRING(kLegalPathChars, "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_-.");
66 
67 bool IsPartOfUrl(const string& sentence, size_t pos)
68 {
69  string separators( "( \t\r\n" );
70  const string& legal_path_chars = kLegalPathChars.Get();
71 
72  //
73  // Weed out silly input:
74  //
75  if ( sentence == "" || pos > sentence.length() - 1 ) {
76  return false;
77  }
78  if ( string::npos != separators.find( sentence[ pos ] ) ) {
79  return false;
80  }
81 
82  // Do easy tests first:
83 
84  // We require the tilde to show up in a pattern like
85  // "/~[0..9A..Za..z_-.]+". This is inherited from the C toolkit flat file
86  // generator:
87  //
88  if ( (pos < 1) || (sentence[ pos-1 ] != '/') ) {
89  return false;
90  }
91 
92  //
93  // Find the start of the "word" that surrounds the given position:
94  //
95  separators += '~';
96  string::size_type left_edge = sentence.find_last_of( separators, pos-1 );
97  if ( left_edge == string::npos ) {
98  left_edge = 0;
99  }
100  else {
101  ++left_edge;
102  }
103 
104  //
105  // If it's a URL, it better start with a protocol specifier we approve of:
106  //
107  static const char* sc_ProtocolSpecifiers[] = {
108  "URL:",
109  "http:",
110  "https:",
111  };
112  DEFINE_STATIC_ARRAY_MAP_WITH_COPY(CStaticArraySet<string>, vProtocolSpecifiers, sc_ProtocolSpecifiers);
113  size_t colon = sentence.find( ':', left_edge );
114  if ( colon == string::npos ) {
115  return false;
116  }
117  string strMaybeUrl = sentence.substr( left_edge, colon - left_edge + 1 );
118  if ( vProtocolSpecifiers.find( strMaybeUrl ) == vProtocolSpecifiers.end() ) {
119  return false;
120  }
121 
122  ++pos;
123  if ( string::npos == legal_path_chars.find( sentence[ pos ] ) ) {
124  return false;
125  }
126 
127  for ( ++pos; sentence[ pos ] != 0; ++pos ) {
128  if ( string::npos == legal_path_chars.find( sentence[ pos ] ) ) {
129  return ( sentence[ pos ] == '/' );
130  }
131  }
132 
133  return false; /* never found the terminating '/' */
134 };
135 
136 
// Return true when s[start .. start+66) consists solely of '*' characters.
// `length` is the caller-supplied length of `s`; the run has to end strictly
// before it (start + 66 < length), otherwise the answer is false.
static bool s_RunOfStars(string& s, SIZE_TYPE start, SIZE_TYPE length)
{
    static const SIZE_TYPE kStarsInRun = 66;

    const SIZE_TYPE stop = start + kStarsInRun;
    if (stop >= length) {
        return false;
    }

    SIZE_TYPE idx = start;
    while (idx < stop && s[idx] == '*') {
        ++idx;
    }
    return idx == stop;
}
150 
151 
152 void ExpandTildes(string& s, ETildeStyle style)
153 {
154  if ( style == eTilde_tilde ) {
155  return;
156  }
157 
158  SIZE_TYPE start = 0, tilde, length = s.length();
159 
160  tilde = s.find('~', start);
161  if (tilde == NPOS) { // no tilde
162  return;
163  }
164 
165  string result;
166 
167  while ( (start < length) && (tilde = s.find('~', start)) != NPOS ) {
168  result.append(s, start, tilde - start);
169  char next = (tilde + 1) < length ? s[tilde + 1] : 0;
170  switch ( style ) {
171  case eTilde_space:
172  if ( (tilde + 1 < length && isdigit((unsigned char) next) ) ||
173  (tilde + 2 < length && (next == ' ' || next == '(') &&
174  isdigit((unsigned char) s[tilde + 2]))) {
175  result += '~';
176  } else {
177  result += ' ';
178  }
179  start = tilde + 1;
180  break;
181 
182  case eTilde_newline:
183  if ( tilde + 1 < length && s[tilde + 1] == '~' ) {
184  result += '~';
185  start = tilde + 2;
186  } else {
187  result += "\n";
188  start = tilde + 1;
189  }
190  break;
191 
192  case eTilde_note:
193  if ( tilde + 1 < length && s[tilde + 1] == '~' ) {
194  result += '~';
195  start = tilde + 2;
196  } else {
197  // plain "~" expands to ";\n", unless it's after a space or semi-colon, in
198  // which case it becomes a plain "\n"
199  char prevChar = ( tilde >= 1 ? s[tilde - 1] : '\0' );
200 
201  if( ' ' == prevChar || ';' == prevChar ) {
202  result += '\n';
203  } else {
204  result += ";\n";
205  }
206  start = tilde + 1;
207  }
208  break;
209 
210  case eTilde_comment:
211  if (tilde > 0 && s[tilde - 1] == '`') {
212  result.replace(result.length() - 1, 1, 1,'~');
213  }
214  else if ( IsPartOfUrl( s, tilde ) ) {
215  result += '~';
216  }
217  else {
218  result += "\n";
219  }
220  start = tilde + 1;
221  if (s[start] == ' ' && s_RunOfStars(s, start+1, length)) {
222  start++;
223  result += '\n';
224  }
225  break;
226 
227  default: // just keep it, for lack of better ideas
228  result += '~';
229  start = tilde + 1;
230  break;
231  }
232  }
233  if (start < length) {
234  result.append(s, start, NPOS);
235  }
236  s.swap(result);
237 }
238 
239 
// Replace every double quote in `str` with a single quote, in place.
void ConvertQuotes(string& str)
{
    for (string::iterator it = str.begin(); it != str.end(); ++it) {
        if (*it == '\"') {
            *it = '\'';
        }
    }
}
244 
245 
// Return a copy of `str` in which every double quote has been replaced
// with a single quote.  The argument itself is not modified.
string ConvertQuotes(const string& str)
{
    string converted(str);
    replace(converted.begin(), converted.end(), '\"', '\'');
    return converted;
}
252 
// Normalizes the blanks of `str` in place:
//   - every tab is turned into a space;
//   - a run of several blanks (spaces and tabs) is reduced to one space;
//   - blanks directly after '(' are removed;
//   - the blank directly before ')' or ',' is removed.
// Returns true when the length of the string changed.  Note that a string
// in which tabs were merely replaced by spaces keeps its length, so false
// is returned for it even though its contents differ.
//
// The work is done in a single pass: `out` is the write position and never
// runs ahead of the read position, so str[out - 1] is always the last
// character that was kept.
bool StripSpaces(string& str)
{
    if (str.empty()) {
        return false;
    }
    const string::size_type orig_len = str.length();

    string::size_type out = 0;
    for (string::size_type in = 0; in < orig_len; ++in) {
        const char c = (str[in] == '\t') ? ' ' : str[in];

        if (c == ' ') {
            // Drop a blank that follows another blank or an opening bracket.
            if (out > 0 && (str[out - 1] == ' ' || str[out - 1] == '(')) {
                continue;
            }
        } else if ((c == ')' || c == ',') && out > 0 && str[out - 1] == ' ') {
            // Take back the single blank kept just before ')' or ','.
            --out;
        }
        str[out++] = c;
    }
    str.resize(out);

    return (orig_len != str.length());
}
297 
298 
// Remove a single trailing period from `str`.
//
// When `keep_ellipsis` is set and the string ends in "...", nothing is
// removed.  Returns true when a period was chopped off, false otherwise.
//
// NB: trimming the whole run of trailing punctuation (for instance with
// find_last_not_of) is likely a better solution; however, the C toolkit
// differs, so only one period is removed here.
bool RemovePeriodFromEnd(string& str, bool keep_ellipsis)
{
    const string::size_type len = str.length();

    // Nothing to do unless the very last character is a period.
    if (len == 0 || str[len - 1] != '.') {
        return false;
    }

    // Leave a trailing ellipsis alone when asked to.
    if (keep_ellipsis && len >= 3 && str[len - 2] == '.' && str[len - 3] == '.') {
        return false;
    }

    str.erase(len - 1);
    return true;
}
375 
376 
// Strip every trailing space, tab, tilde, period and newline from `str`,
// then terminate it with exactly one period.  A string made up only of
// such characters (or an empty one) becomes ".".
void AddPeriod(string& str)
{
    const string::size_type last_kept = str.find_last_not_of(" \t~.\n");
    if (last_kept == string::npos) {
        str.clear();
    } else {
        str.resize(last_kept + 1);
    }
    str += '.';
}
383 
384 
// Remove trailing whitespace from `str`, but never cut into the first
// `indent` characters: a string that is blank from position `indent`
// onwards is truncated to exactly `indent` characters.  Strings that are
// empty or no longer than `indent` are left untouched.
void TrimSpaces(string& str, size_t indent)
{
    if (str.empty() || str.length() <= indent) {
        return;
    }

    // `keep` is the number of characters to retain.  Counting characters
    // (rather than tracking the index of the last kept one) means the
    // unsigned value can never wrap below zero when indent == 0 and the
    // whole string is whitespace.
    size_t keep = str.length();
    while (keep > indent && isspace((unsigned char) str[keep - 1])) {
        --keep;
    }
    str.erase(keep);
}
401 
// Predicate wrapper around isgraph(); needed because not all compilers
// will just let you pass "isgraph" to STL find_if.
class CIsGraph
{
public:
    bool operator()( const char c ) {
        // Go through unsigned char before handing the value to <cctype>.
        const unsigned char uc = (unsigned char) c;
        return isgraph(uc) ? true : false;
    }
};
410 
// This will compress multiple spaces in a row.
// "Space" here means every character for which isgraph() is false, i.e.
// blanks as well as unprintable characters; each run of them is replaced
// by one ' '.
// If trim_beginning, strips all spaces and unprintables from beginning of string.
// If trim_end, strips all spaces and unprintables from end of string.
// The string is modified in place and a reference to it is returned.
string& CompressSpaces( string& str, const bool trim_beginning, const bool trim_end )
{
    if( str.empty() ) {
        return str;
    }

    // <cctype> functions must not be given a negative value, so every
    // character goes through unsigned char first.
    const auto is_graph = [](char c) {
        return isgraph(static_cast<unsigned char>(c)) != 0;
    };

    // Work out the [start_iter, end_iter) range that is to be kept.
    const string::iterator start_iter =
        trim_beginning ? find_if( str.begin(), str.end(), is_graph )
                       : str.begin();
    if( start_iter == str.end() ) {
        // nothing but spaces/unprintables
        str.clear();
        return str;
    }

    // base() of the reverse iterator points just past the last visible
    // character (or at begin() if there is none).
    const string::iterator end_iter =
        trim_end ? find_if( str.rbegin(), str.rend(), is_graph ).base()
                 : str.end();
    if( end_iter == str.begin() ) {
        str.clear();
        return str;
    }

    // The main part, where we compress spaces.
    string compressed; // result will end up here
    compressed.reserve( end_iter - start_iter );

    bool prev_was_visible = true;
    for( string::iterator iter = start_iter; iter < end_iter; ++iter ) {
        const char ch = *iter;
        if( is_graph(ch) ) {
            // visible characters get copied straight
            compressed += ch;
            prev_was_visible = true;
        } else {
            // anything else becomes a space, and only the first one of a
            // run is emitted
            if( prev_was_visible ) {
                compressed += ' ';
            }
            prev_was_visible = false;
        }
    }

    str.swap( compressed );
    return str;
}
472 
473 
// Trims leading blanks and trailing "junk" from `str`, in place.
// Returns true if it changed the string.
//
// Junk is the longest suffix made up of characters that compare <= ' ' and
// of '.', ',', '~' and ';'.  It is removed, except that:
//   - if the junk contains a period, a single "." is left behind, or "..."
//     when `allow_ellipsis` is set, the junk is at least three characters
//     long and its second and third characters are both periods;
//   - otherwise, if the junk starts with a tilde, "~" (or "~~" when the
//     junk starts with two tildes) is left behind;
//   - a ';' that terminates an HTML escape such as "&bgr;" is not junk.
// Finally every leading character that compares <= ' ' is removed.
//
// This is based on the C function TrimSpacesAndJunkFromEnds and should have
// the same output apart from the HTML escape handling and some changes in
// how tildes are handled (this version is less likely to remove them).
//
// TODO: for compatibility with C this uses a (arguably suboptimal) way of
// trimming an ellipsis.  The intended replacement would trim the leading
// unprintables, trim the trailing junk while remembering whether a period
// was dropped, and then re-add "..." if an ellipsis was chopped off (and
// allowed) or "." if any period was chopped off.  We can switch over later.
//
// NOTE(review): the junk test uses a plain `char`; where `char` is signed,
// bytes >= 0x80 also compare <= ' ' and are treated as junk - confirm that
// this is intended before relying on it for non-ASCII text.
bool TrimSpacesAndJunkFromEnds(string& str, bool allow_ellipsis)
{
    if ( str.empty() ) {
        return false;
    }

    // Find where the junk at the end begins.  While we're at it, record
    // whether the junk contains a period and/or a tilde.
    bool isPeriod = false;
    bool isTilde = false;
    string::size_type start_of_junk_pos = str.length();
    while ( start_of_junk_pos > 0 ) {
        const char ch = str[start_of_junk_pos - 1];
        if ( !(ch <= ' ' || ch == '.' || ch == ',' || ch == '~' || ch == ';') ) {
            break; // found the last non-junk character
        }
        isPeriod = (isPeriod || ch == '.');
        isTilde = (isTilde || ch == '~');
        --start_of_junk_pos;
    }

    // Check for a ';' that's part of an HTML escape char like "&bgr;" and
    // skip over it (i.e., don't remove it) if so.
    if ( start_of_junk_pos < str.length() && str[start_of_junk_pos] == ';' ) {
        // we assume no HTML escape char will be longer than this
        static const string::size_type kMaxCharsToLookAt = 20;

        // go backwards, looking for the ampersand
        for ( string::size_type dist = 1;
              dist <= start_of_junk_pos && dist < kMaxCharsToLookAt;
              ++dist ) {
            // unsigned char: isalnum() must not be given a negative value
            const unsigned char ch = str[start_of_junk_pos - dist];
            if ( isalnum(ch) || ch == '#' ) {
                continue; // still inside the entity name
            }
            if ( ch == '&' ) {
                // The semicolon ends an HTML escape character: keep it.
                ++start_of_junk_pos;
            }
            // Anything else means the semicolon is ordinary junk.
            break;
        }
    }

    bool changed = false;

    // If there's junk, chop it off (but leave period/tildes/ellipsis as
    // appropriate).
    const string::size_type len = str.length();
    if ( start_of_junk_pos < len ) {
        // what to put back after the junk is removed; nothing by default
        const char* suffix = "";
        const string::size_type chars_in_junk = len - start_of_junk_pos;

        if ( isPeriod ) {
            // allow one period at the end, or an ellipsis
            const bool putEllipsis = ( allow_ellipsis && (chars_in_junk >= 3) &&
                str[start_of_junk_pos + 1] == '.' &&
                str[start_of_junk_pos + 2] == '.' );
            suffix = ( putEllipsis ? "..." : "." );
        } else if ( isTilde && str[start_of_junk_pos] == '~' ) {
            // Allow tilde(s).  This should work on single- AND double-tildes
            // because we don't know whether or not tilde-expansion was
            // called before this point.
            const bool doubleTilde = ( (chars_in_junk >= 2) &&
                str[start_of_junk_pos + 1] == '~' );
            suffix = ( doubleTilde ? "~~" : "~" );
        }

        // Only touch the string when the junk is not already exactly the
        // suffix; with an empty suffix this always removes the junk.
        if ( str.compare( start_of_junk_pos, string::npos, suffix ) != 0 ) {
            str.erase( start_of_junk_pos );
            str += suffix;
            changed = true;
        }
    }

    // Drop the initial whitespace.
    string::size_type lead = 0;
    while ( lead < str.length() && str[lead] <= ' ' ) {
        ++lead;
    }
    if ( lead > 0 ) {
        str.erase( 0, lead );
        changed = true;
    }

    return changed;
}
645 
646 // this is copy-pasted method and optimized to use CTempString
647 void TrimSpacesAndJunkFromEnds(string& result, const CTempString& str, bool allow_ellipsis)
648 {
649  // TODO: This commented out code represents how ellipsis trimming
650  // should work. However, for compatibility with C, we're using a
651  // (in my opinion) suboptimal algorithm. We can switch over later.
652 
653  //if (str.empty()) {
654  // return;
655  //}
656 
657  //size_t strlen = str.length();
658  //size_t begin = 0;
659 
660  //// trim unprintable characters (and space) off the beginning
661  //while (begin != strlen) {
662  // unsigned char ch = str[begin];
663  // if (ch > ' ') {
664  // break;
665  // } else {
666  // ++begin;
667  // }
668  //}
669 
670  //// we're done if we trimmed the string to nothing
671  //if (begin == strlen) {
672  // str.erase();
673  // return;
674  //}
675 
676  //// trim junk off the end (while we're at it, record whether we're chopping off a period)
677  //size_t end = strlen - 1;
678  //bool has_period = false;
679  //while (end > begin) {
680  // unsigned char ch = str[end];
681  // if (ch <= ' ' || ch == '.' || ch == ',' || ch == '~' || ch == ';') {
682  // has_period = (has_period || ch == '.');
683  // --end;
684  // } else {
685  // break;
686  // }
687  //}
688 
689  //// check whether we're about to chop off an ellipsis, so we remember to add it back
690  //// TODO: There's got to be a more efficient way of doing this
691  //const bool weChoppedOffAnEllipsis = ( NPOS != NStr::Find(str, "...", end) );
692 
693  //// do the actual chopping here
694  //str = str.substr( begin, end + 1 );
695 
696  //// restore chopped off ellipsis or period, if any
697  //if ( allow_ellipsis && weChoppedOffAnEllipsis ) {
698  // str += "...";
699  //} else if (has_period) {
700  // // re-add any periods if we had one before
701  // str += '.';
702  //}
703 
704  // This is based on the C function TrimSpacesAndJunkFromEnds.
705  // Although it's updated to use iterators and such and to
706  // return whether it changed the string, it should
707  // have the same output, except:
708  // - We do NOT chop off a semicolon if we determine that it's
709  // part of an HTML escape char (e.g. "&bgr;" ).
710  // - There are some changes in how tildes are handled;
711  // this algo is less likely to remove them.
712 
713  if (str.empty()) {
714  result.clear();
715  return;
716  }
717 
718  // make start_of_junk_pos hold the beginning of the "junk" at the end
719  // (where junk is defined as one of several characters)
720  // while we're at it, also check if the junk contains a tilde and/or period
721  bool isPeriod = false;
722  bool isTilde = false;
723  size_t start_of_junk_pos = 0;
724  for (size_t len = str.length(); len && start_of_junk_pos == 0; len--)
725  {
726  char ch = str[len-1];
727  if (ch <= ' ') ch = ' ';
728  switch (ch)
729  {
730  case '.':
731  isPeriod = true;
732  break;
733  case '~':
734  isTilde = true;
735  break;
736  case ';':
737  case ',':
738  case ' ':
739  break;
740  default:
741  // found non-junk character. Last junk character is just after this
742  start_of_junk_pos = len;
743  break;
744  }
745  }
746 
747  // check for ';' that's part of an HTML escape char like "&bgr;" and
748  // skip over it (i.e., don't remove it) if so
749  if (start_of_junk_pos < str.length() && str[start_of_junk_pos] == ';') {
750  // we assume no HTML escape char will be longer than this
751  static const int kMaxCharsToLookAt = 20;
752 
753  // go backwards, looking for the ampersand
754  int amp_iter = ((int)start_of_junk_pos - 1);
755  for (; amp_iter >= 0 && ((start_of_junk_pos - amp_iter) < kMaxCharsToLookAt); --amp_iter) {
756  const unsigned char ch = str[amp_iter];
757  if (isalnum(ch) || ch == '#') {
758  // just keep going
759  }
760  else if (ch == '&') {
761  // The semicolon ends an HTML escape character, so we skip it
762  ++start_of_junk_pos;
763  break;
764  }
765  else {
766  // The semicolon does NOT end an HTML escape character, so we might remove it
767  break;
768  }
769  }
770  }
771 
772  // holds the suffix to add after we remove the junk
773  CTempString suffix; // by default, just remove junk
774 
775  // if there's junk, chop it off (but leave period/tildes/ellipsis as appropriate)
776  if (start_of_junk_pos < str.length()) {
777 
778  const int chars_in_junk = (int)(str.length() - start_of_junk_pos);
779  _ASSERT(chars_in_junk >= 1);
780 
781  // allow one period at end
782  if (isPeriod) {
783  // check if we should put an ellipsis, or just a period
784  const bool putEllipsis = (allow_ellipsis && (chars_in_junk >= 3) &&
785  str[start_of_junk_pos + 1] == '.' && str[start_of_junk_pos + 2] == '.');
786 
787  suffix = (putEllipsis ? "..." : ".");
788  }
789  else if (isTilde) {
790  // allow tilde(s)
791  // (This should work on single- AND double-tildes because
792  // we don't know whether or not tilde-expansion was called before this
793  // point )
794  if (str[start_of_junk_pos] == '~') {
795  const bool doubleTilde = ((chars_in_junk >= 2) && str[start_of_junk_pos + 1] == '~');
796  suffix = (doubleTilde ? "~~" : "~");
797  }
798  }
799  }
800  const char* ptr = str.data();
801  size_t len = start_of_junk_pos;
802  while (len && *ptr <= ' ')
803  {
804  len--; ptr++;
805  }
806  result.reserve(len + suffix.length());
807  result.assign(ptr, len);
808  result.append(suffix.data(), suffix.length());
809 }
810 
811 // two-bytes combinations we're looking to clean
812 #define twochars(a,b) Uint2((a) << 8 | (b))
813 #define twocommas twochars(',',',')
814 #define twospaces twochars(' ',' ')
815 #define twosemicolons twochars(';',';')
816 #define space_comma twochars(' ',',')
817 #define space_bracket twochars(' ',')')
818 #define bracket_space twochars('(',' ')
819 #define space_semicolon twochars(' ',';')
820 #define comma_space twochars(',',' ')
821 #define semicolon_space twochars(';',' ')
822 
823 void CleanAndCompress(string& dest, const CTempString& instr)
824 {
825  size_t left = instr.size();
826  // this is the input stream
827  const char* in = instr.data();
828 
829  // skip front white spaces
830  while (left && *in == ' ')
831  {
832  in++;
833  left--;
834  }
835  // forget end white spaces
836  while (left && in[left - 1] == ' ')
837  {
838  left--;
839  }
840 
841  dest.resize(left);
842 
843  if (left < 1) return;
844 
845  // this is where we write result
846  char* out = (char*)dest.c_str();
847 
848  char curr = *in++; // initialize with first character
849  left--;
850 
851  char next = 0;
852  Uint2 two_chars = curr; // this is two bytes storage where we see current and previous symbols
853 
854  while (left > 0) {
855  next = *in++;
856 
857  two_chars = Uint2((two_chars << 8) | next);
858 
859  switch (two_chars)
860  {
861  case twocommas: // replace double commas with comma+space
862  *out++ = curr;
863  next = ' ';
864  break;
865  case twospaces: // skip multiple spaces (only print last one)
866  break;
867  case twosemicolons: // skip multiple semicolons (only print last one)
868  break;
869  case bracket_space: // skip space after bracket
870  next = curr;
871  two_chars = curr;
872  break;
873  case space_bracket: // skip space before bracket
874  break;
875  case space_comma:
876  *out++ = next;
877  next = curr;
878  *out++ = ' ';
879  while ((next == ' ' || next == ',') && left > 0) {
880  next = *in;
881  in++;
882  left--;
883  }
884  two_chars = next;
885  break;
886  case space_semicolon:
887  *out++ = next;
888  next = curr;
889  *out++ = ' ';
890  while ((next == ' ' || next == ';') && left > 0) {
891  next = *in;
892  in++;
893  left--;
894  }
895  two_chars = next;
896  break;
897  case comma_space:
898  *out++ = curr;
899  *out++ = ' ';
900  while ((next == ' ' || next == ',') && left > 0) {
901  next = *in;
902  in++;
903  left--;
904  }
905  two_chars = next;
906  break;
907  case semicolon_space:
908  *out++ = curr;
909  *out++ = ' ';
910  while ((next == ' ' || next == ';') && left > 0) {
911  next = *in;
912  in++;
913  left--;
914  }
915  two_chars = next;
916  break;
917  default:
918  *out++ = curr;
919  break;
920  }
921 
922  curr = next;
923  if (left > 0) {
924  left--;
925  }
926  }
927 
928  if (curr > 0 && curr != ' ') {
929  *out++ = curr;
930  }
931 
932  dest.resize(out - dest.c_str());
933 }
934 
#if 0
// Disabled ad-hoc smoke test for CleanAndCompress(): when enabled, the
// file-scope instance below runs the constructor during static
// initialization and prints each input next to its cleaned form.
struct CleanAndCompress_unit_test
{
    CleanAndCompress_unit_test()
    {
        test("C( )C");
        test("xx,,xx");
        test("xx,, xx");
        test("xx,, xx");
        test(" xx xx ");
        test("xx , xx");
        test("xx , xx");
        test("xx(xx)");
        test("xx( xx )");
    }
    // Takes const char*: the callers pass string literals, which cannot be
    // bound to a plain char* in C++11 and later.
    void test(const char* s)
    {
        string str;
        CleanAndCompress(str, s);
        cout << s << "--->" << str << '.' << endl;
    }
};

CleanAndCompress_unit_test t;
#endif
960 
961 
962 /*
963 void CleanAndCompress (string& str)
964 {
965  if (str.empty()) {
966  return;
967  }
968 
969  size_t pos = str.find (" ,");
970  if (pos != NPOS) {
971  str [pos] = ',';
972  str [pos+1] = ' ';
973  }
974  pos = str.find (",,");
975  if (pos != NPOS) {
976  str [pos+1] = ' ';
977  }
978  pos = str.find (" ;");
979  if (pos != NPOS) {
980  str [pos] = ';';
981  str [pos+1] = ' ';
982  }
983  pos = str.find ("( ");
984  if (pos != NPOS) {
985  str [pos] = ' ';
986  str [pos+1] = '(';
987  }
988  pos = str.find (" )");
989  if (pos != NPOS) {
990  str [pos] = ')';
991  str [pos+1] = ' ';
992  }
993 
994  string::iterator end = str.end();
995  string::iterator it = str.begin();
996  string::iterator new_str = it;
997  while (it != end) {
998  *new_str++ = *it;
999  if ( (*it == ' ') || (*it == '\t') || (*it == '(') ) {
1000  for (++it; (it != end) && (*it == ' ' || *it == '\t'); ++it) continue;
1001  if ((it != end) && (*it == ')' || *it == ',') ) {
1002  // this "if" protects against the case "(...bunch of spaces and tabs...)".
1003  // Otherwise, the first '(' is unintentionally erased
1004  if( *(new_str - 1) != '(' ) {
1005  --new_str;
1006  }
1007  }
1008  } else {
1009  ++it;
1010  }
1011  }
1012  str.erase(new_str, str.end());
1013 }
1014 */
1015 
1016 
#if 0
// Disabled ad-hoc smoke test: when enabled, the file-scope instance below
// feeds a few sample strings to both TrimSpacesAndJunkFromEnds() overloads
// during static initialization.  The results are not checked.
struct CJunkUnitTest
{
    CJunkUnitTest()
    {
        test(" .", true);
        test(" aaa bbb.....", true);
        test(" aaa bbb.....", false);
        test(" aaa bbb~~~~~", true);
        test(" aaa bbb,,,,,", true);
        test(" aaa bbb;;;;;;", true);
    }

    // Runs the in-place overload on a copy of `input`, then the CTempString
    // overload, which overwrites that copy with its own result.
    void test(CTempString input, bool allow_ellipsis)
    {
        string trimmed(input);
        TrimSpacesAndJunkFromEnds(trimmed, allow_ellipsis);
        TrimSpacesAndJunkFromEnds(trimmed, input, allow_ellipsis);
    }
};

static CJunkUnitTest c;
#endif
1039 
// Report whether the text starting at `pos` in `str` begins a word, i.e.
// whether the character just before it is whitespace or punctuation.
// Position 0 and positions beyond the end of the string count as word
// starts.
//
// NB: To preserve the behavior of the C toolkit only the left side is
// tested.  This was an old bug in the C toolkit that was never fixed and
// by now has become the expected behavior.
static bool s_IsWholeWord(const string& str, size_t pos)
{
    if (pos == 0 || pos > str.size()) {
        return true;
    }
    const unsigned char before = (unsigned char) str[pos - 1];
    return isspace(before) || ispunct(before);
}
1048 
1049 
1050 void JoinString(string& to, const string& prefix, const string& str, bool noRedundancy)
1051 {
1052  if ( str.empty() ) {
1053  return;
1054  }
1055 
1056  if ( to.empty() ) {
1057  to += str;
1058  return;
1059  }
1060 
1061  size_t pos = NPOS;
1062  if (noRedundancy) {
1063  //for ( pos = NStr::Find(to, str); pos != NPOS; pos += str.length()) {
1064  for ( pos = NStr::Find(to, str);
1065  pos != NPOS; pos = NStr::Find(to, str, pos + 1)) {
1066  if (s_IsWholeWord(to, pos)) {
1067  return;
1068  }
1069  }
1070  }
1071 
1072  //LOG_POST(Error << "adding: to=" << to << " prefix=" << prefix << " str=" << str);
1073 
1074  if( NStr::StartsWith(prefix, ";") && NStr::EndsWith(to, ";") ) {
1075  to += prefix.substr(1);
1076  } else {
1077  to += prefix;
1078  }
1079  to += str;
1080 }
1081 
1082 
1083 string JoinString(const list<string>& l, const string& delim, bool noRedundancy)
1084 {
1085  if ( l.empty() ) {
1086  return kEmptyStr;
1087  }
1088 
1089  /**
1090  string result;
1091  set<CTempString> strings;
1092  ITERATE (list<string>, it, l) {
1093  if ( !noRedundancy ||
1094  strings.insert(CTempString(*it)).second) {
1095  if ( !result.empty() ) {
1096  result += delim;
1097  }
1098  result += *it;
1099  }
1100  }
1101  **/
1102 
1103  string result = l.front();
1104  list<string>::const_iterator it = l.begin();
1105  while ( ++it != l.end() ) {
1106  JoinString(result, delim, *it, noRedundancy);
1107  }
1108 
1109  return result;
1110 }
1111 
1112 /*
1113 // Validate the correct format of an accession string.
1114 static bool s_IsValidAccession(const string& acc)
1115 {
1116  static const size_t kMaxAccLength = 16;
1117 
1118  if ( acc.empty() ) {
1119  return false;
1120  }
1121 
1122  if ( acc.length() >= kMaxAccLength ) {
1123  return false;
1124  }
1125 
1126  // first character must be uppercase letter
1127  if ( !(isalpha((unsigned char) acc[0]) && isupper((unsigned char) acc[0])) ) {
1128  return false;
1129  }
1130 
1131  size_t num_alpha = 0,
1132  num_undersc = 0,
1133  num_digits = 0;
1134 
1135  const char* ptr = acc.c_str();
1136  if ( NStr::StartsWith(acc, "NZ_") ) {
1137  ptr += 3;
1138  }
1139  for ( ; isalpha((unsigned char)(*ptr)); ++ptr, ++num_alpha );
1140  for ( ; *ptr == '_'; ++ptr, ++num_undersc );
1141  for ( ; isdigit((unsigned char)(*ptr)); ++ptr, ++num_digits );
1142 
1143  if ( (*ptr != '\0') && (*ptr != ' ') && (*ptr != '.') ) {
1144  return false;
1145  }
1146 
1147  switch ( num_undersc ) {
1148  case 0:
1149  {{
1150  if ( (num_alpha == 1 && num_digits == 5) ||
1151  (num_alpha == 2 && num_digits == 6) ||
1152  (num_alpha == 3 && num_digits == 5) ||
1153  (num_alpha == 4 && num_digits == 8) ||
1154  (num_alpha == 4 && num_digits == 9) ) {
1155  return true;
1156  }
1157  }}
1158  break;
1159 
1160  case 1:
1161  {{
1162  if( num_alpha == 3 && num_digits == 6 &&
1163  NStr::StartsWith(acc, "MAP_") )
1164  {
1165  return true;
1166  }
1167 
1168  // RefSeq accession
1169  if ( (num_alpha != 2) ||
1170  (num_digits != 6 && num_digits != 8 && num_digits != 9) ) {
1171  return false;
1172  }
1173 
1174  char first_letter = acc[0];
1175  char second_letter = acc[1];
1176 
1177  if ( first_letter == 'N' ) {
1178  if ( second_letter == 'C' || second_letter == 'G' ||
1179  second_letter == 'M' || second_letter == 'R' ||
1180  second_letter == 'P' || second_letter == 'W' ||
1181  second_letter == 'T' ) {
1182  return true;
1183  }
1184  } else if ( first_letter == 'X' ) {
1185  if ( second_letter == 'M' || second_letter == 'R' ||
1186  second_letter == 'P' ) {
1187  return true;
1188  }
1189  } else if ( first_letter == 'Z' || first_letter == 'A' ||
1190  first_letter == 'Y' ) {
1191  return (second_letter == 'P');
1192  } else if ( first_letter == 'W' ) {
1193  if ( second_letter == 'P' ) {
1194  return true;
1195  }
1196  }
1197  }}
1198  break;
1199 
1200  default:
1201  return false;
1202  }
1203 
1204  return false;
1205 }
1206 */
1207 
// Check the ".version" suffix of an accession string.
// Returns true only when the text following the first '.' is non-empty
// and consists solely of decimal digits.
static bool s_IsValidDotVersion(const string& accn)
{
    const size_t dot = accn.find('.');
    if (dot == string::npos) {
        return false;
    }

    // At least one character must follow the dot.
    const size_t first_digit = dot + 1;
    if (first_digit >= accn.size()) {
        return false;
    }

    // Everything after the dot has to be a digit.
    for (size_t i = first_digit;  i < accn.size();  ++i) {
        if ( !isdigit((unsigned char) accn[i]) ) {
            return false;
        }
    }
    return true;
}
1225 
1226 
1227 bool IsValidAccession(const string& accn, EAccValFlag flag)
1228 {
1229  // bool valid = s_IsValidAccession(accn);
1230  bool valid = (CSeq_id::IdentifyAccession(accn) != CSeq_id::eAcc_unknown);
1231  if (valid && flag == eValidateAccDotVer) {
1232  valid = s_IsValidDotVersion(accn);
1233  }
1234  return valid;
1235 }
1236 
1237 
1238 void DateToString(const CDate& date, string& str, EDateToString format_choice )
1239 {
1240  // One day we should make regular format default to JAN, since "JUN" seems
1241  // kind of arbitrary.
1242  static const char* regular_format = "%{%2D%|01%}-%{%3N%|JUN%}-%Y";
1243  static const char* cit_sub_format = "%{%2D%|??%}-%{%3N%|???%}-%{%4Y%|/???%}";
1244  static const char* patent_format = "%{%2D%|01%}-%{%3N%|JAN%}-%Y";
1245 
1246  const char* format = ( format_choice == eDateToString_cit_sub ?
1247  cit_sub_format :
1248  ( format_choice == eDateToString_patent ? patent_format : regular_format ) );
1249 
1250  string date_str;
1251  date.GetDate(&date_str, format);
1252  NStr::ToUpper(date_str);
1253  str.append(date_str);
1254 }
1255 
1256 
1258 {
1259  if ( !seq.IsSetInst() ||
1260  !seq.IsSetInst_Repr() ||
1261  !(seq.GetInst_Repr() == CSeq_inst::eRepr_delta) ||
1262  !seq.IsSetInst_Ext() ||
1263  !seq.GetInst_Ext().IsDelta() ) {
1264  return;
1265  }
1266 
1267  SDeltaSeqSummary temp;
1268  CScope& scope = seq.GetScope();
1269 
1270  const CDelta_ext::Tdata& segs = seq.GetInst_Ext().GetDelta().Get();
1271  temp.num_segs = segs.size();
1272 
1273  size_t len = 0;
1274 
1276 
1277  CDelta_ext::Tdata::const_iterator curr = segs.begin();
1278  CDelta_ext::Tdata::const_iterator end = segs.end();
1279  CDelta_ext::Tdata::const_iterator next;
1280  for ( ; curr != end; curr = next ) {
1281  {{
1282  // set next to one after curr
1283  next = curr; ++next;
1284  }}
1285  size_t from = len + 1;
1286  switch ( (*curr)->Which() ) {
1287  case CDelta_seq::e_Loc:
1288  {{
1289  const CDelta_seq::TLoc& loc = (*curr)->GetLoc();
1290  if ( loc.IsNull() ) { // gap
1291  ++temp.num_gaps;
1292  text << "* " << from << ' ' << len
1293  << " gap of unknown length~";
1294  } else { // count length
1295  size_t tlen = sequence::GetLength(loc, &scope);
1296  len += tlen;
1297  temp.residues += tlen;
1298  text << "* " << setw(8) << from << ' ' << setw(8) << len
1299  << ": contig of " << tlen << " bp in length~";
1300  }
1301  }}
1302  break;
1303  case CDelta_seq::e_Literal:
1304  {{
1305  const CDelta_seq::TLiteral& lit = (*curr)->GetLiteral();
1306  size_t lit_len = lit.CanGetLength() ? lit.GetLength() : 0;
1307  len += lit_len;
1308  if ( lit.CanGetSeq_data() && lit.GetSeq_data().Which() != CSeq_data::e_Gap ) {
1309  temp.residues += lit_len;
1310  while ( next != end && (*next)->IsLiteral() &&
1311  (*next)->GetLiteral().CanGetSeq_data() &&
1312  (*next)->GetLiteral().GetSeq_data().Which() != CSeq_data::e_Gap ) {
1313  const CDelta_seq::TLiteral& next_lit = (*next)->GetLiteral();
1314  size_t next_len = next_lit.CanGetLength() ?
1315  next_lit.GetLength() : 0;
1316  lit_len += next_len;
1317  len += next_len;
1318  temp.residues += next_len;
1319  ++next;
1320  }
1321  text << "* " << setw(8) << from << ' ' << setw(8) << len
1322  << ": contig of " << lit_len << " bp in length~";
1323  } else {
1324  bool unk = false;
1325  ++temp.num_gaps;
1326  if ( lit.CanGetFuzz() ) {
1327  const CSeq_literal::TFuzz& fuzz = lit.GetFuzz();
1328  if ( fuzz.IsLim() &&
1329  fuzz.GetLim() == CInt_fuzz::eLim_unk ) {
1330  unk = true;
1331  ++temp.num_faked_gaps;
1332  if ( from > len ) {
1333  text << "* gap of unknown length~";
1334  } else {
1335  text << "* " << setw(8) << from << ' ' << setw(8) << len
1336  << ": gap of unknown length~";
1337  }
1338  }
1339  }
1340  if ( !unk ) {
1341  text << "* " << setw(8) << from << " " << setw(8) << len
1342  << ": gap of " << lit_len << " bp~";
1343  }
1344  }
1345  }}
1346  break;
1347 
1348  default:
1349  break;
1350  }
1351  }
1352  summary = temp;
1353  summary.text = CNcbiOstrstreamToString(text);
1354 }
1355 
1356 
// Display strings for the protein "tech" values handled by GetTechString(),
// which hands them out through each constant's Get() accessor.
1357 SAFE_CONST_STATIC_STRING(kTS_concept_trans, "conceptual translation");
1358 SAFE_CONST_STATIC_STRING(kTS_concept_trans_a, "conceptual translation supplied by author");
1359 SAFE_CONST_STATIC_STRING(kTS_both, "conceptual translation with partial peptide sequencing");
1360 SAFE_CONST_STATIC_STRING(kTS_seq_pept, "direct peptide sequencing");
1361 SAFE_CONST_STATIC_STRING(kTS_seq_pept_homol, "sequenced peptide, ordered by homology");
1362 SAFE_CONST_STATIC_STRING(kTS_seq_pept_overlap, "sequenced peptide, ordered by overlap");
1363 
1364 const string& GetTechString(int tech)
1365 {
1366 
1367  switch ( tech ) {
1369  return kTS_concept_trans.Get();
1370 
1372  return kTS_seq_pept.Get();
1373 
1374  case CMolInfo::eTech_both:
1375  return kTS_both.Get();
1376 
1378  return kTS_seq_pept_overlap.Get();
1379 
1381  return kTS_seq_pept_homol.Get();
1382 
1384  return kTS_concept_trans_a.Get();
1385 
1386  default:
1387  return kEmptyStr;
1388  }
1389 
1390  return kEmptyStr;
1391 }
1392 
1393 
1395 {
1396  return (uo.CanGetType() && uo.GetType().IsStr() &&
1397  uo.GetType().GetStr() == "ModelEvidence");
1398 }
1399 
1400 
1402 {
1403  if ( s_IsModelEvidanceUop(uo) ) {
1404  return &uo;
1405  }
1406 
1407  const CUser_object* temp = 0;
1408  ITERATE (CUser_object::TData, ufi, uo.GetData()) {
1409  const CUser_field& uf = **ufi;
1410  if ( !uf.CanGetData() ) {
1411  continue;
1412  }
1413  const CUser_field::TData& data = uf.GetData();
1414 
1415  switch ( data.Which() ) {
1417  temp = s_FindModelEvidanceUop(data.GetObject());
1418  break;
1419 
1421  ITERATE (CUser_field::TData::TObjects, obj, data.GetObjects()) {
1422  temp = s_FindModelEvidanceUop(**obj);
1423  if ( temp != 0 ) {
1424  break;
1425  }
1426  }
1427  break;
1428 
1429  default:
1430  break;
1431  }
1432  if ( temp != 0 ) {
1433  break;
1434  }
1435  }
1436 
1437  return temp;
1438 }
1439 
1440 
1442 {
1443  CConstRef<CUser_object> moduop;
1444  bool result = false;
1445 
1446  for (CSeqdesc_CI it(bsh, CSeqdesc::e_User); it; ++it) {
1447  moduop.Reset(s_FindModelEvidanceUop(it->GetUser()));
1448  if (moduop.NotEmpty()) {
1449  result = true;
1451  if( moduop->HasField("Contig Name") ) {
1452  ufp = &(moduop->GetField("Contig Name"));
1453  if ( ufp.NotEmpty() && ufp->IsSetData() && ufp->GetData().IsStr() ) {
1454  me.name = ufp->GetData().GetStr();
1455  }
1456  }
1457  if( moduop->HasField("Assembly") ) {
1458  ufp = &(moduop->GetField("Assembly"));
1459  if ( ufp.NotEmpty() && ufp->IsSetData() && ufp->GetData().IsFields() ) {
1461  const CUser_field& field = **fld_itr;
1462  ITERATE(CUser_field::C_Data::TFields, inr_itr, field.GetData().GetFields()) {
1463  const CUser_field& ufld = **inr_itr;
1464  if ( !ufld.IsSetLabel() || !ufld.GetLabel().IsStr() ) continue;
1465  const string& label = ufld.GetLabel().GetStr();
1466  if (label != "accession") continue;
1467  const CUser_field::C_Data& data = ufld.GetData();
1468  if (data.IsStr()) {
1469  const string& accn = data.GetStr();
1470  me.assembly.push_back(accn);
1471  }
1472  }
1473  }
1474  }
1475  }
1476  if ( moduop->HasField("Method") ) {
1477  ufp = &(moduop->GetField("Method"));
1478  if ( ufp.NotEmpty() && ufp->IsSetData() && ufp->GetData().IsStr() ) {
1479  me.method = ufp->GetData().GetStr();
1480  }
1481  }
1482  if ( moduop->HasField("Counts") ) {
1483  ufp = &(moduop->GetField("Counts"));
1484  if ( ufp->HasField("mRNA")) {
1485  me.mrnaEv = true;
1486  }
1487  if ( ufp->HasField("EST")) {
1488  me.estEv = true;
1489  }
1490  }
1491  if ( moduop->HasField("mRNA") ) {
1492  me.mrnaEv = true;
1493  }
1494  if ( moduop->HasField("EST") ) {
1495  me.estEv = true;
1496  }
1497  if( moduop->HasField("Contig Gi") ) {
1498  ufp = &(moduop->GetField("Contig Gi"));
1499  if ( ufp.NotEmpty() && ufp->IsSetData() && ufp->GetData().IsInt() ) {
1501  }
1502  }
1503  if( moduop->HasField("Contig Span") ) {
1504  ufp = &(moduop->GetField("Contig Span"));
1505  if ( ufp.NotEmpty() && ufp->IsSetData() && ufp->GetData().IsInts()
1506  && ufp->IsSetNum() && ufp->GetNum() == 2 && ufp->GetData().GetInts().size() == 2 )
1507  {
1508  const CUser_field::C_Data::TInts & int_list = ufp->GetData().GetInts();
1509  me.span.first = int_list[0];
1510  me.span.second = int_list[1];
1511  }
1512  }
1513  }
1514  }
1515 
1516  // if me.name is missing version, try to update from me.gi
1517  if( me.gi > ZERO_GI && me.name.find('.') == string::npos ) {
1519  if( accver_idh ) {
1520  CConstRef<CSeq_id> accver_seq_id = accver_idh.GetSeqIdOrNull();
1521  if( accver_seq_id ) {
1522  const CTextseq_id *text_id = accver_seq_id->GetTextseq_Id();
1523  if( text_id && text_id->IsSetAccession() && text_id->IsSetVersion() ) {
1524  me.name = text_id->GetAccession() + "." + NStr::IntToString(text_id->GetVersion());
1525  }
1526  }
1527  }
1528  }
1529 
1530  return result;
1531 }
1532 
1533 
1535 {
1536  if ( s_GetModelEvidance(bsh, me) ) {
1537  return true;
1538  }
1539 
1540  if ( CSeq_inst::IsAa(bsh.GetInst_Mol()) ) {
1542  if ( nuc ) {
1543  return s_GetModelEvidance(nuc, me);
1544  }
1545  }
1546 
1547  return false;
1548 }
1549 
1550 
1551 // in Ncbistdaa order
// Amino-acid display names, one per table slot. GetAAName() indexes this
// table directly and falls back to "OTHER" for any code past its end.
1552 static const char* kAANames[] = {
1553  "---", "Ala", "Asx", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile",
1554  "Lys", "Leu", "Met", "Asn", "Pro", "Gln", "Arg", "Ser", "Thr", "Val",
1555  "Trp", "OTHER", "Tyr", "Glx", "Sec", "TERM", "Pyl", "Xle"
1556 };
1557 
1558 
1559 const char* GetAAName(unsigned char aa, bool is_ascii)
1560 {
1561  if (is_ascii) {
1562  aa = (unsigned char)
1565  }
1566  return (aa < sizeof(kAANames)/sizeof(*kAANames)) ? kAANames[aa] : "OTHER";
1567 }
1568 
1569 //////////////////////////////////////////////////////////////////////////////
1570 
1572  const CSeq_id_Handle& mrna,
1573  const CSeq_id_Handle& prot,
1574  CBioseq_Handle& mrna_bsh,
1575  CBioseq_Handle& prot_bsh)
1576 {
1578 
1579  if (order == eResolve_NotFound) {
1580  CRef<CScope> local_scope(new CScope(*CObjectManager::GetInstance()));
1581  local_scope->AddDefaults();
1582 
1583  CBioseq_Handle possible_mrna = local_scope->GetBioseqHandle(mrna);
1584  CBioseq_Handle possible_prot;
1585  if (possible_mrna) {
1586  possible_prot =
1587  possible_mrna.GetTopLevelEntry().GetBioseqHandle(prot);
1588  }
1589  if (possible_mrna && possible_prot) {
1590  order = eResolve_RnaFirst;
1591  }
1592  }
1593 
1594  if (order == eResolve_NotFound) {
1595  CRef<CScope> local_scope(new CScope(*CObjectManager::GetInstance()));
1596  local_scope->AddDefaults();
1597 
1598  CBioseq_Handle possible_prot = local_scope->GetBioseqHandle(prot);
1599  CBioseq_Handle possible_mrna;
1600  if (possible_prot) {
1601  possible_mrna =
1602  possible_prot.GetTopLevelEntry().GetBioseqHandle(mrna);
1603  }
1604 
1605  if (possible_mrna && possible_prot) {
1606  order = eResolve_ProtFirst;
1607  }
1608  }
1609 
1610  switch (order) {
1611  case eResolve_NotFound:
1612  mrna_bsh = CBioseq_Handle();
1613  prot_bsh = CBioseq_Handle();
1614  break;
1615 
1616  case eResolve_RnaFirst:
1617  mrna_bsh = scope.GetBioseqHandle(mrna);
1618  prot_bsh = scope.GetBioseqHandle(prot);
1619  break;
1620 
1621  case eResolve_ProtFirst:
1622  prot_bsh = scope.GetBioseqHandle(prot);
1623  mrna_bsh = scope.GetBioseqHandle(mrna);
1624  break;
1625  }
1626 
1627  return order;
1628 }
1629 
1630 //////////////////////////////////////////////////////////////////////////////
1631 // HTML utils and strings
1632 
1633 // ============================================================================
1634 // Link locations:
1635 // ============================================================================
1637  "https://www.ncbi.nlm.nih.gov/nuccore/";
1639  "https://www.ncbi.nlm.nih.gov/protein/";
1640 
1642  "https://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?val="; // https forwarded to http
1643 
1645  "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?";
1647  "https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c#SG";
1649  "https://www.ncbi.nlm.nih.gov/pubmed/";
1651  "https://enzyme.expasy.org/EC/"; // not government site
1653  "https://www.ncbi.nlm.nih.gov/sites/entrez?db=Nucleotide&amp;cmd=Search&amp;term=";
1655  "https://www.ncbi.nlm.nih.gov/bioproject/";
1657  "https://www.ncbi.nlm.nih.gov/projects/Sequin/latlonview.html";
1659  "http://amigo.geneontology.org/amigo/term/GO:"; // not government site
1661  "http://www.geneontology.org/cgi-bin/references.cgi#GO_REF:"; // not government site
1663  "https://ppubs.uspto.gov/pubwebapp/external.html?q=";
1665  "https://www.uniprot.org/uniprot/";
1666 
1668  "https://www.ncbi.nlm.nih.gov/genome/annotation_euk/process/";
1669 
namespace {
    // Return true when the characters beginning at 'str_iter' spell out all
    // of 'literal' before 'str_iter_end' is reached.
    //
    // 'literal' is a NUL-terminated C string. Nothing is allocated, so this
    // is cheap enough to call for every '&' or '<' seen while sanitizing.
    // ============================================================================
    template<typename TIter>
    bool s_StartsWithLiteral(
        TIter str_iter, // copied on purpose: advanced locally
        const TIter &str_iter_end,
        const char *literal)
    // ============================================================================
    {
        for ( ; *literal != '\0'; ++literal, ++str_iter) {
            if (str_iter == str_iter_end || *str_iter != *literal) {
                return false;
            }
        }
        return true;
    }

    // make sure we're not "double-sanitizing"
    // (e.g. "&gt;" to "&amp;gt;")
    //
    // 'str_iter' must point at an '&'. Returns false when that '&' already
    // begins one of "&gt;", "&lt;", "&amp;" or "&quot;" (i.e. it must be left
    // alone), and true in every other case.
    // ============================================================================
    template<typename _T>
    bool s_ShouldWeEscapeAmpersand(
        _T str_iter, // yes, COPY not reference
        const _T &str_iter_end)
    // ============================================================================
    {
        _ASSERT(*str_iter == '&');

        // Plain prefix comparison against a fixed table; regexes were
        // deliberately avoided here for speed.
        static const char* const already_escaped[] = {
            "&gt;",
            "&lt;",
            "&amp;",
            "&quot;"
        };
        static const size_t num_already_escaped =
            (sizeof(already_escaped) / sizeof(already_escaped[0]));

        for (size_t idx = 0; idx < num_already_escaped; ++idx) {
            if (s_StartsWithLiteral(str_iter, str_iter_end, already_escaped[idx])) {
                return false;
            }
        }
        return true;
    }

    // see if the '<' opens an HTML tag (currently we
    // only check for a few kinds of tags )
    //
    // Returns true when the text at 'str_iter' begins with one of the
    // recognized tag openers listed below, false otherwise (including when
    // the range is empty).
    // ============================================================================
    template<typename _T>
    bool s_IsTagStart(
        const _T &str_iter,
        const _T &str_iter_end)
    // ============================================================================
    {
        static const char* const possible_tag_starts[] = {
            "<a href=",
            "<acronym title",
            "</a>",
            "</acronym"
        };
        static const size_t num_possible_tag_starts =
            (sizeof(possible_tag_starts) / sizeof(possible_tag_starts[0]));

        // check every string it might start with
        // (index is size_t so it is compared against the count without a
        // signed/unsigned mismatch)
        for (size_t idx = 0; idx < num_possible_tag_starts; ++idx) {
            if (s_StartsWithLiteral(str_iter, str_iter_end, possible_tag_starts[idx])) {
                // we're in a tag if we matched the whole expected string
                return true;
            }
        }

        return false;
    }

}
1773 
1775 {
1776  bool changes_made = false;
1777 
1778  bool in_tag = false;
1779  size_t idx = 0;
1780  for (; idx < str.length(); ++idx) {
1781  switch (str[idx]) {
1782  case '<':
1783  // heuristic
1784  in_tag = true;
1785  break;
1786  case '>':
1787  in_tag = false;
1788  break;
1789  case '"':
1790  if (!in_tag) {
1791  str[idx] = '\'';
1792  changes_made = true;
1793  }
1794  break;
1795  }
1796  }
1797 
1798  return changes_made;
1799 }
1800 
1801 
1802 // ============================================================================
1803 void TryToSanitizeHtml(string &str)
1804 {
1805  string result;
1806  // The "* 1.1" should keep up efficient in most cases since data tends not to have
1807  // too many characters that need escaping.
1808  result.reserve(1 + (int)((double)str.length() * 1.1));
1810 
1811  // swap is faster than assignment
1812  str.swap(result);
1813 }
1814 
1816 // ============================================================================
1817 {
1818  result.clear();
1819 
1820  // we only sanitize when we're not in an url
1821  bool in_html_tag = false;
1822  ITERATE(CTempString, str_iter, str) {
1823  // see if we're entering an HTML tag
1824  if (!in_html_tag && *str_iter == '<' && s_IsTagStart(str_iter, str.end())) {
1825  in_html_tag = true;
1826  }
1827 
1828  // now that we know whether we're in a tag,
1829  // process characters appropriately.
1830  if (in_html_tag) {
1831  switch (*str_iter) {
1832  case '&':
1833  // make sure we're not "double-sanitizing"
1834  // (e.g. "&gt;" to "&amp;gt;")
1835  if (s_ShouldWeEscapeAmpersand(str_iter, str.end())) {
1836  result += "&amp;";
1837  }
1838  else {
1839  result += '&';
1840  }
1841  break;
1842  default:
1843  result += *str_iter;
1844  break;
1845  }
1846  }
1847  else {
1848  switch (*str_iter) {
1849  case '<':
1850  result += "&lt;";
1851  break;
1852  case '>':
1853  result += "&gt;";
1854  break;
1855  default:
1856  result += *str_iter;
1857  break;
1858  }
1859  }
1860 
1861  // see if we're exiting an HTML tag
1862  if (in_html_tag && *str_iter == '>') {
1863  // tag is closed now
1864  // (Note: does this consider cases where '>' is in quotes?)
1865  in_html_tag = false;
1866  }
1867  }
1868 }
1869 
1870 void
1871 TryToSanitizeHtmlList( std::list<std::string> &strs )
1872 {
1873  NON_CONST_ITERATE( std::list<std::string>, str_iter, strs ) {
1874  TryToSanitizeHtml( *str_iter );
1875  }
1876 }
1877 
1878 bool
1880 {
1881  // list is not complete, still need to take proper precautions
1882  static const char* bad_html_strings[] = {
1883  "<script", "<object", "<applet", "<embed", "<form",
1884  "javascript:", "vbscript:"
1885  };
1886 
1887  // load matching fsa if not already done
1888  static CSafeStatic<CTextFsa> fsa;
1889  if( ! fsa->IsPrimed() ) {
1890  for( size_t ii = 0; ii < ArraySize(bad_html_strings); ++ii ) {
1891  fsa->AddWord( bad_html_strings[ii] );
1892  }
1893  fsa->Prime();
1894  }
1895 
1896  // do the match
1897  int current_state = 0;
1898  for ( SIZE_TYPE str_idx = 0 ; str_idx < str.length(); ++str_idx) {
1899  const char ch = str[str_idx];
1900  int next_state = fsa->GetNextState (current_state, ch);
1901  if (fsa->IsMatchFound (next_state)) {
1902  return true;
1903  }
1904  current_state = next_state;
1905  }
1906 
1907  return false;
1908 }
1909 
1910 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CBioseq_Handle –.
Definition: Date.hpp:53
void GetDate(string *label, bool year_only=false) const
Append a standardized string representation of the date to the label.
Definition: Date.hpp:149
bool operator()(const char c)
Definition: objutil.cpp:406
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
CSafeStatic<>::
CScope –.
Definition: scope.hpp:92
bool IsAa(void) const
Definition: Seq_inst.hpp:113
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
static TIndex GetMapToIndex(CSeq_data::E_Choice from_type, CSeq_data::E_Choice to_type, TIndex from_idx)
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
bool HasField(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Verify that a named field exists.
Definition: User_field.cpp:393
bool HasField(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Verify that a named field exists.
const CUser_field & GetField(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Access a named field in this user object.
Definition: User_object.cpp:71
Include a standard set of the NCBI C++ Toolkit most basic headers.
std::ofstream out("events_result.xml")
main entry point for tests
#define test(a, b, c, d, e)
Definition: numeric.c:170
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
static const char * str(char *buf, int n)
Definition: stats.c:84
char data[12]
Definition: iconv.c:80
Utility macros and typedefs for exploring NCBI objects from general.asn.
#define GI_FROM(T, value)
Definition: ncbimisc.hpp:1086
constexpr size_t ArraySize(const Element(&)[Size])
Definition: ncbimisc.hpp:1532
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define ZERO_GI
Definition: ncbimisc.hpp:1088
string
Definition: cgiapp.hpp:690
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
Definition: Seq_id.cpp:1634
CConstRef< CSeq_id > GetSeqIdOrNull(void) const
static CSeq_id_Handle GetGiHandle(TGi gi)
Faster way to create a handle for a gi.
const CTextseq_id * GetTextseq_Id(void) const
Return embedded CTextseq_id, if any.
Definition: Seq_id.cpp:169
@ eAcc_unknown
Definition: Seq_id.hpp:322
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
const CBioseq * GetNucleotideParent(const CBioseq &product, CScope *scope)
Get the encoding nucleotide sequnce of a protein.
Definition: sequence.cpp:2660
CSeq_id_Handle GetAccVer(const CSeq_id_Handle &idh, TGetFlags flags=0)
Get accession.version Seq-id Returns null CSeq_id_Handle if the sequence is not found or if it doesn'...
Definition: scope.cpp:413
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
Definition: scope.cpp:504
const TInst_Ext & GetInst_Ext(void) const
bool IsSetInst_Ext(void) const
TInst_Mol GetInst_Mol(void) const
CBioseq_Handle GetBioseqHandle(const CSeq_id &id) const
Get Bioseq handle from the TSE of this Seq-entry.
bool IsSetInst(void) const
bool IsSetInst_Repr(void) const
TInst_Repr GetInst_Repr(void) const
CScope & GetScope(void) const
Get scope this handle belongs to.
CSeq_entry_Handle GetTopLevelEntry(void) const
Get top level Seq-entry handle.
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1439
bool NotEmpty(void) const THROWS_NONE
Check if CConstRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:1392
uint16_t Uint2
2-byte (16-bit) unsigned integer
Definition: ncbitype.h:101
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
#define kEmptyStr
Definition: ncbistr.hpp:123
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5424
#define NPOS
Definition: ncbistr.hpp:133
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5078
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2882
const char * data(void) const
Return a pointer to the array represented.
Definition: tempstr.hpp:313
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5406
size_type length(void) const
Return the length of the represented array.
Definition: tempstr.hpp:320
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3396
static string & ToUpper(string &str)
Convert string to upper case – string& version.
Definition: ncbistr.cpp:424
size_type size(void) const
Return the length of the represented array.
Definition: tempstr.hpp:327
#define NCBI_XOBJEDIT_EXPORT
Definition: ncbi_export.h:1291
static const char label[]
const TStr & GetStr(void) const
Get the variant data.
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
bool CanGetType(void) const
Check if it is safe to call GetType method.
bool IsLim(void) const
Check if variant Lim is selected.
Definition: Int_fuzz_.hpp:636
const TData & GetData(void) const
Get the Data member data.
bool CanGetData(void) const
Check if it is safe to call GetData method.
const TFields & GetFields(void) const
Get the variant data.
vector< CRef< CUser_field > > TFields
TLim GetLim(void) const
Get the variant data.
Definition: Int_fuzz_.hpp:642
bool IsFields(void) const
Check if variant Fields is selected.
bool IsInt(void) const
Check if variant Int is selected.
bool IsStr(void) const
Check if variant Str is selected.
vector< CRef< CUser_object > > TObjects
bool IsSetLabel(void) const
field label Check if a value has been assigned to Label data member.
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
bool IsSetNum(void) const
required for strs, ints, reals, oss Check if a value has been assigned to Num data member.
TInt GetInt(void) const
Get the variant data.
bool IsInts(void) const
Check if variant Ints is selected.
const TData & GetData(void) const
Get the Data member data.
const TLabel & GetLabel(void) const
Get the Label member data.
const TType & GetType(void) const
Get the Type member data.
bool IsSetData(void) const
Check if a value has been assigned to Data data member.
vector< CRef< CUser_field > > TData
const TInts & GetInts(void) const
Get the variant data.
TNum GetNum(void) const
Get the Num member data.
@ e_Object
for using other definitions
@ eLim_unk
unknown
Definition: Int_fuzz_.hpp:210
bool IsSetAccession(void) const
Check if a value has been assigned to Accession data member.
TVersion GetVersion(void) const
Get the Version member data.
bool IsNull(void) const
Check if variant Null is selected.
Definition: Seq_loc_.hpp:504
bool IsSetVersion(void) const
Check if a value has been assigned to Version data member.
const TAccession & GetAccession(void) const
Get the Accession member data.
TLength GetLength(void) const
Get the Length member data.
bool CanGetLength(void) const
Check if it is safe to call GetLength method.
const TFuzz & GetFuzz(void) const
Get the Fuzz member data.
bool IsDelta(void) const
Check if variant Delta is selected.
Definition: Seq_ext_.hpp:336
const TDelta & GetDelta(void) const
Get the variant data.
Definition: Seq_ext_.cpp:180
bool CanGetFuzz(void) const
Check if it is safe to call GetFuzz method.
bool CanGetSeq_data(void) const
Check if it is safe to call GetSeq_data method.
const Tdata & Get(void) const
Get the member data.
Definition: Delta_ext_.hpp:164
list< CRef< CDelta_seq > > Tdata
Definition: Delta_ext_.hpp:89
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_data_.hpp:475
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eTech_both
concept transl. w/ partial pept. seq.
Definition: MolInfo_.hpp:133
@ eTech_seq_pept_homol
sequenced peptide, ordered by homology
Definition: MolInfo_.hpp:135
@ eTech_seq_pept_overlap
sequenced peptide, ordered by overlap
Definition: MolInfo_.hpp:134
@ eTech_concept_trans
conceptual translation
Definition: MolInfo_.hpp:131
@ eTech_seq_pept
peptide was sequenced
Definition: MolInfo_.hpp:132
@ eTech_concept_trans_a
conceptual transl. supplied by author
Definition: MolInfo_.hpp:136
@ e_Gap
gap types
Definition: Seq_data_.hpp:114
@ e_Ncbieaa
extended ASCII 1 letter aa codes
Definition: Seq_data_.hpp:111
@ e_Ncbistdaa
consecutive codes for std aas
Definition: Seq_data_.hpp:113
@ e_User
user defined object
Definition: Seqdesc_.hpp:124
@ e_Literal
a piece of sequence
Definition: Delta_seq_.hpp:90
@ e_Loc
point to a sequence
Definition: Delta_seq_.hpp:89
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
int len
static void text(MDB_val *v)
Definition: mdb_dump.c:62
EIPRangeType t
Definition: ncbi_localip.c:101
int isspace(Uchar c)
Definition: ncbictype.hpp:69
int isalnum(Uchar c)
Definition: ncbictype.hpp:62
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
int isgraph(Uchar c)
Definition: ncbictype.hpp:65
int ispunct(Uchar c)
Definition: ncbictype.hpp:68
T max(T x_, T y_)
static Format format
Definition: njn_ioutil.cpp:53
std::istream & in(std::istream &in_, double &x_)
The Object manager core.
const char * strLinkBaseGeneOntologyRef
Definition: objutil.cpp:1660
bool s_IsModelEvidanceUop(const CUser_object &uo)
Definition: objutil.cpp:1394
const char * strLinkBaseGenomePrj
Definition: objutil.cpp:1654
const char * strLinkBaseProt
Definition: objutil.cpp:1638
bool StripSpaces(string &str)
Definition: objutil.cpp:256
const char * strLinkBaseNucSearch
Definition: objutil.cpp:1652
void TryToSanitizeHtml(string &str)
Definition: objutil.cpp:1803
bool RemovePeriodFromEnd(string &str, bool keep_ellipsis)
Definition: objutil.cpp:299
SAFE_CONST_STATIC_STRING(kLegalPathChars, "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_-.")
const char * strLinkBaseNuc
Definition: objutil.cpp:1636
const char * strLinkBaseEntrezViewer
Definition: objutil.cpp:1641
#define comma_space
Definition: objutil.cpp:820
const char * strLinkBaseTaxonomy
Definition: objutil.cpp:1644
#define space_comma
Definition: objutil.cpp:816
const char * strLinkBaseGeneOntology
Definition: objutil.cpp:1658
#define space_semicolon
Definition: objutil.cpp:819
void TryToSanitizeHtmlList(std::list< std::string > &strs)
Definition: objutil.cpp:1871
const CUser_object * s_FindModelEvidanceUop(const CUser_object &uo)
Definition: objutil.cpp:1401
static bool s_IsWholeWord(const string &str, size_t pos)
Definition: objutil.cpp:1040
void ExpandTildes(string &s, ETildeStyle style)
Definition: objutil.cpp:152
void DateToString(const CDate &date, string &str, EDateToString format_choice)
Definition: objutil.cpp:1238
void TrimSpaces(string &str, size_t indent)
Definition: objutil.cpp:385
void JoinString(string &to, const string &prefix, const string &str, bool noRedundancy)
Definition: objutil.cpp:1050
#define semicolon_space
Definition: objutil.cpp:821
#define bracket_space
Definition: objutil.cpp:818
const char * strLinkBaseLatLon
Definition: objutil.cpp:1656
const char * strLinkBaseTransTable
Definition: objutil.cpp:1646
void GetDeltaSeqSummary(const CBioseq_Handle &seq, SDeltaSeqSummary &summary)
Definition: objutil.cpp:1257
bool TrimSpacesAndJunkFromEnds(string &str, bool allow_ellipsis)
Definition: objutil.cpp:475
bool IsPartOfUrl(const string &sentence, size_t pos)
Definition: objutil.cpp:67
const char * strDocLink
Definition: objutil.cpp:1667
bool ConvertQuotesNotInHTMLTags(string &str)
Definition: objutil.cpp:1774
bool IsValidAccession(const string &accn, EAccValFlag flag)
Definition: objutil.cpp:1227
const char * GetAAName(unsigned char aa, bool is_ascii)
Definition: objutil.cpp:1559
#define twocommas
Definition: objutil.cpp:813
#define twosemicolons
Definition: objutil.cpp:815
const char * strLinkBasePubmed
Definition: objutil.cpp:1648
const char * strLinkBaseUniProt
Definition: objutil.cpp:1664
#define space_bracket
Definition: objutil.cpp:817
bool CommentHasSuspiciousHtml(const string &str)
Definition: objutil.cpp:1879
#define twospaces
Definition: objutil.cpp:814
void AddPeriod(string &str)
Definition: objutil.cpp:377
static const char * kAANames[]
Definition: objutil.cpp:1552
string & CompressSpaces(string &str, const bool trim_beginning, const bool trim_end)
Definition: objutil.cpp:416
bool s_GetModelEvidance(const CBioseq_Handle &bsh, SModelEvidance &me)
Definition: objutil.cpp:1441
const string & GetTechString(int tech)
Definition: objutil.cpp:1364
void CleanAndCompress(string &dest, const CTempString &instr)
Definition: objutil.cpp:823
EResolveOrder GetResolveOrder(CScope &scope, const CSeq_id_Handle &mrna, const CSeq_id_Handle &prot, CBioseq_Handle &mrna_bsh, CBioseq_Handle &prot_bsh)
Definition: objutil.cpp:1571
const char * strLinkBaseUSPTO
Definition: objutil.cpp:1662
static bool s_IsValidDotVersion(const string &accn)
Definition: objutil.cpp:1208
bool GetModelEvidance(const CBioseq_Handle &bsh, SModelEvidance &me)
Definition: objutil.cpp:1534
static bool s_RunOfStars(string &s, SIZE_TYPE start, SIZE_TYPE length)
Definition: objutil.cpp:137
void ConvertQuotes(string &str)
Definition: objutil.cpp:240
const char * strLinkBaseExpasy
Definition: objutil.cpp:1650
ETildeStyle
Definition: objutil.hpp:47
@ eTilde_newline
Definition: objutil.hpp:50
@ eTilde_tilde
Definition: objutil.hpp:48
@ eTilde_space
Definition: objutil.hpp:49
@ eTilde_comment
Definition: objutil.hpp:51
@ eTilde_note
Definition: objutil.hpp:52
EAccValFlag
Definition: objutil.hpp:95
@ eValidateAccDotVer
Definition: objutil.hpp:97
EResolveOrder
Definition: objutil.hpp:160
@ eResolve_RnaFirst
Definition: objutil.hpp:162
@ eResolve_ProtFirst
Definition: objutil.hpp:163
@ eResolve_NotFound
Definition: objutil.hpp:161
EDateToString
Definition: objutil.hpp:103
@ eDateToString_cit_sub
Definition: objutil.hpp:105
@ eDateToString_patent
Definition: objutil.hpp:106
string indent(" ")
static SLJIT_INLINE sljit_ins l(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
#define DEFINE_STATIC_ARRAY_MAP_WITH_COPY(Type, Var, Array)
Definition: static_set.hpp:894
String search utilities.
size_t num_faked_gaps
Definition: objutil.hpp:117
TSpanType span
Definition: objutil.hpp:144
string method
Definition: objutil.hpp:140
list< string > assembly
Definition: objutil.hpp:139
#define _ASSERT
else result
Definition: token2.c:20
#define const
Definition: zconf.h:232
Modified on Fri Sep 20 14:58:19 2024 by modify_doxy.py rev. 669887