NCBI C++ ToolKit
cleanup_utils.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: cleanup_utils.cpp 102112 2024-04-02 18:07:29Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Mati Shomrat
27  *
28  * File Description:
29  * General utilities for data cleanup.
30  *
31  * ===========================================================================
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include "cleanup_utils.hpp"
36 
38 #include <objmgr/util/sequence.hpp>
39 #include <objects/seq/Pubdesc.hpp>
41 #include <objects/pub/Pub.hpp>
45 #include <objects/biblio/Affil.hpp>
48 #include <objects/general/Date.hpp>
51 
52 #include <objects/seq/Seqdesc.hpp>
53 #include <objects/seq/MolInfo.hpp>
57 
58 #include <objmgr/seqdesc_ci.hpp>
59 
61 
62 
65 
// ASCII-only case tests. Each macro expands its argument twice, so pass an
// expression without side effects.
#define IS_LOWER(c) ((c) >= 'a' && (c) <= 'z')
#define IS_UPPER(c) ((c) >= 'A' && (c) <= 'Z')
68 
69 using namespace sequence;
70 
// Strip leading and trailing "junk" (space, semicolon, comma) from str.
//
// A trailing semicolon is kept when it may terminate an HTML character
// entity such as "&nbsp;": that is, when an '&' precedes it with no space or
// comma in between.
//
// Returns true if str was modified, false otherwise (including when str is
// empty on entry).
bool CleanVisString( string &str )
{
    const char* const junk_chars = " ;,";

    if( str.empty() ) {
        return false;
    }

    // chop off initial junk
    const string::size_type first_good_char_pos = str.find_first_not_of(junk_chars);
    if( first_good_char_pos == string::npos ) {
        // string is completely junk
        str.clear();
        return true;
    }
    bool changed = false;
    if( first_good_char_pos > 0 ) {
        str.erase( 0, first_good_char_pos );
        changed = true;
    }

    // chop off end junk
    // (at least one non-junk character survived above, so this is never npos)
    const string::size_type last_good_char_pos = str.find_last_not_of(junk_chars);
    if( last_good_char_pos == (str.length() - 1) ) {
        // nothing to chop off the end
        return changed;
    }

    // By default everything after the last good character goes away.
    string::size_type new_length = last_good_char_pos + 1;

    if( str[new_length] == ';' ) {
        // The semicolon might close an HTML entity like "&nbsp;". Look
        // backwards for '&'; a ' ' or ',' found first breaks the entity and
        // makes the '&' irrelevant.
        const string::size_type stopper_pos =
            str.find_last_of( "& ,", last_good_char_pos );
        if( stopper_pos != string::npos && str[stopper_pos] == '&' ) {
            // can't chop the semicolon, so chop just after it
            new_length = last_good_char_pos + 2;
        }
    }

    if( new_length == str.length() ) {
        // the protected semicolon is already the last character
        return changed;
    }
    str.resize( new_length );
    return true;
}
139 
// Trim whitespace/control characters and trailing "junk" from both ends of
// str. Based on the C function TrimSpacesAndJunkFromEnds.
//
// Trailing junk is any run of characters that are <= ' ' or one of
// '.', ',', '~', ';'. The run is removed, except that:
//   - if it contains a period, a single "." is kept, or "..." when
//     allow_ellipses is set and the 2nd and 3rd junk characters are periods;
//   - otherwise, if it starts with two tildes, "~~" is kept.
// Leading characters <= ' ' are then removed.
//
// Characters are compared as unsigned char so that bytes >= 0x80 (e.g. UTF-8
// sequences) are never mistaken for control characters, whatever the
// platform's signedness of plain char.
//
// Returns true if str was modified.
//
// TODO: This function is copy-pasted from TrimSpacesAndJunkFromEnds,
// so we should do something about that since duplicate code is evil.
bool CleanVisStringJunk( string &str, bool allow_ellipses )
{
    if ( str.empty() ) {
        return false;
    }

    const auto is_blank_or_ctrl = [](char c) {
        return static_cast<unsigned char>(c) <= ' ';
    };

    // Walk backwards to find where the trailing junk starts, noting on the
    // way whether the junk contains a period and/or a tilde.
    bool has_period = false;
    bool has_tilde  = false;
    string::size_type junk_start = str.length();
    while ( junk_start > 0 ) {
        const char ch = str[junk_start - 1];
        const bool is_junk = is_blank_or_ctrl(ch) ||
            ch == '.' || ch == ',' || ch == '~' || ch == ';';
        if ( !is_junk ) {
            break;
        }
        has_period = has_period || (ch == '.');
        has_tilde  = has_tilde  || (ch == '~');
        --junk_start;
    }

    bool changed = false;

    // if there's junk, chop it off (but leave period/tildes/ellipsis as appropriate)
    const string::size_type chars_in_junk = str.length() - junk_start;
    if ( chars_in_junk > 0 ) {
        // what should remain in place of the junk; empty means "nothing"
        const char* suffix = "";
        if ( has_period ) {
            // allow one period at end
            suffix = ".";
            if ( allow_ellipses && chars_in_junk >= 3 &&
                 str[junk_start + 1] == '.' && str[junk_start + 2] == '.' ) {
                suffix = "...";
            }
        } else if ( has_tilde && str[junk_start] == '~' &&
                    chars_in_junk >= 2 && str[junk_start + 1] == '~' ) {
            // allow double tilde at the end
            suffix = "~~";
        }

        // Only touch the string when the tail is not already exactly the
        // suffix. A non-empty junk run never equals the empty suffix, so
        // plain junk is always removed.
        if ( str.compare( junk_start, string::npos, suffix ) != 0 ) {
            str.erase( junk_start );
            str += suffix;
            changed = true;
        }
    }

    // drop leading whitespace/control characters
    string::size_type lead = 0;
    while ( lead < str.length() && is_blank_or_ctrl(str[lead]) ) {
        ++lead;
    }
    if ( lead > 0 ) {
        str.erase( 0, lead );
        changed = true;
    }

    return changed;
}
227 
228 
// Remove white space (space, tab, newline, carriage return) that separates
// two tildes, so that "~ \t~" becomes "~~". White space between a tilde and
// any other character is left alone.
//
// Returns true if str was modified.
bool RemoveSpacesBetweenTildes(string& str)
{
    const char* const whites = " \t\n\r";

    bool changed = false;
    string::size_type tilde = str.find('~');
    while (tilde != string::npos) {
        // first character after the tilde that is not white space
        const string::size_type follower = str.find_first_not_of(whites, tilde + 1);
        if (follower == string::npos) {
            break; // only white space (or nothing) after the last tilde
        }
        if (str[follower] != '~') {
            // a tilde followed by something else: resume at the next tilde
            tilde = str.find('~', follower + 1);
            continue;
        }
        if (follower > tilde + 1) {
            // two tildes with only white space between them
            str.erase(tilde + 1, follower - tilde - 1);
            changed = true;
        }
        // Either way the second tilde now sits right after the first one;
        // continue from it so that runs like "~ ~ ~" collapse fully.
        ++tilde;
    }
    return changed;
}
261 
262 
// Replace every double quote (") in str with a single quote (').
// Returns true if at least one character was replaced.
bool CleanDoubleQuote(string& str)
{
    bool changed = false;
    for (char& ch : str) {
        if (ch == '\"') {
            ch = '\'';
            changed = true;
        }
    }
    return changed;
}
274 
275 
// Remove duplicate internal semicolons.
//
// For every ';' the run of ';', ' ' and '\t' that follows it is examined:
//   - an empty run, or a run that is exactly one space, is left alone;
//   - a run that reaches the end of the string is removed together with the
//     semicolon that starts it;
//   - any other run is collapsed to a single space if it contained a space,
//     or to nothing if it did not.
void TrimInternalSemicolons(string& str)
{
    string::size_type pos = str.find(';');
    while (pos != string::npos) {
        // measure the run of separators following this semicolon
        string::size_type run_end = pos + 1;
        bool has_space = false;
        while (run_end < str.length()) {
            const char ch = str[run_end];
            if (ch == ' ') {
                has_space = true;
            } else if (ch != ';' && ch != '\t') {
                break;
            }
            ++run_end;
        }
        const string::size_type run_len = run_end - (pos + 1);

        if (run_len == 0 || (has_space && run_len == 1)) {
            // nothing to fix, advance semicolon search
            pos = str.find(';', run_end);
        } else if (run_end == str.length()) {
            // nothing but semicolons, spaces, and tabs from here to the end
            // of the string: just truncate it
            str.erase(pos);
            break;
        } else {
            // collapse the run in place
            str.replace(pos + 1, run_len, has_space ? " " : "");
            pos = str.find(';', pos + 1);
        }
    }
}
308 
// Two adjacent characters packed into 16 bits (first character in the high
// byte). Asn2gnbkCompressSpaces compares its sliding two-character window
// against these values.
#define twocommas ((',') << 8 | (','))
#define twospaces ((' ') << 8 | (' '))
#define twosemicolons ((';') << 8 | (';'))
#define space_comma ((' ') << 8 | (','))
#define space_bracket ((' ') << 8 | (')'))
#define bracket_space (('(') << 8 | (' '))
#define space_semicolon ((' ') << 8 | (';'))
#define comma_space ((',') << 8 | (' '))
#define semicolon_space ((';') << 8 | (' '))

// Space compression carried over from the C Toolkit.
//
// Scans val (up to its first NUL character, if any) with a two-character
// window and rewrites it as follows:
//   ",,"          -> ", "
//   "  "  / ";;"  -> the first character of the pair is dropped
//   "( "          -> the space is dropped
//   " )"          -> the space is dropped
//   " ,"  / " ;"  -> ", " / "; " and the following run of spaces and that
//                    punctuation character is swallowed
//   ", "  / "; "  -> kept, and the following run of spaces and that
//                    punctuation character is swallowed
// Afterwards leading and trailing spaces (but not tabs or newlines) are
// removed.
//
// Returns true if val was modified.
bool Asn2gnbkCompressSpaces(string& val)
{
    if (val.empty()) {
        return false;
    }

    // Output never gets ahead of the input, so the result is at most as long
    // as val. Building it in a std::string keeps the buffer exception-safe.
    string compressed;
    compressed.reserve(val.length());

    const char* in = val.c_str();
    char curr = *in++;
    char next;
    // low byte = curr; after the shift below: (curr << 8) | next
    unsigned short two_chars = curr;

    while (curr != '\0') {
        next = *in++;
        two_chars = (two_chars << 8) | next;

        switch (two_chars) {
        case twocommas:
            // emit the first comma and treat the second one as a space
            compressed += curr;
            next = ' ';
            two_chars = next;
            break;

        case twospaces:
        case twosemicolons:
        case space_bracket:
            // drop curr; next becomes the current character
            break;

        case bracket_space:
            // drop the space by keeping '(' as the current character
            next = curr;
            two_chars = curr;
            break;

        case space_comma:
        case space_semicolon: {
            // move the punctuation in front of the space, then skip the
            // run of spaces / same punctuation that follows
            const char punct = next;
            compressed += punct;
            compressed += ' ';
            next = curr;
            while (next == ' ' || next == punct) {
                next = *in++;
            }
            two_chars = next;
            break;
        }

        case comma_space:
        case semicolon_space: {
            // keep "<punct> ", then skip the run of spaces / same
            // punctuation that follows
            const char punct = curr;
            compressed += punct;
            compressed += ' ';
            while (next == ' ' || next == punct) {
                next = *in++;
            }
            two_chars = next;
            break;
        }

        default:
            compressed += curr;
            break;
        }

        curr = next;
    }

    /* TrimSpacesAroundString but allow leading/trailing tabs/newlines */
    const string::size_type first_kept = compressed.find_first_not_of(' ');
    if (first_kept == string::npos) {
        compressed.clear();
    } else {
        compressed.erase(compressed.find_last_not_of(' ') + 1);
        compressed.erase(0, first_kept);
    }

    if (val == compressed) {
        return false;
    }
    val.swap(compressed);
    return true;
}
464 
// Trim junk from both ends of val (considering val only up to its first NUL
// character, if any).
//
// Leading characters that are <= ' ', ';' or ',' are removed. At the end,
// the trailing run of such characters is removed, except that a ';' is not
// treated as the start of that run while an '&' has been seen with no
// space/control character or comma after it (so a possible HTML entity such
// as "&nbsp;" keeps its semicolon).
//
// Returns true if val was modified.
bool TrimSpacesSemicolonsAndCommas(string& val)
{
    if (val.empty()) {
        return false;
    }

    string trimmed(val.c_str());

    // --- leading junk ---
    string::size_type lead = 0;
    while (lead < trimmed.length()) {
        /* to use 8bit characters in multibyte languages */
        const unsigned char ch = trimmed[lead];
        if (!(ch <= ' ' || ch == ';' || ch == ',')) {
            break;
        }
        ++lead;
    }
    trimmed.erase(0, lead);

    // --- trailing junk ---
    // junk_start: start of the junk run currently being tracked (npos: none)
    // amp_pending: an '&' was seen and not yet cancelled by a
    //              space/control character or a comma
    string::size_type junk_start = string::npos;
    bool amp_pending = false;
    for (string::size_type i = 0; i < trimmed.length(); ++i) {
        const unsigned char ch = trimmed[i];
        if (ch == '&') {
            amp_pending = true;
            junk_start = string::npos;
        } else if (ch <= ' ' || ch == ',') {
            if (junk_start == string::npos) {
                junk_start = i;
            }
            amp_pending = false;
        } else if (ch == ';') {
            if (junk_start == string::npos && !amp_pending) {
                junk_start = i;
            }
        } else {
            // ordinary character: whatever junk run was tracked is internal
            junk_start = string::npos;
        }
    }
    if (junk_start != string::npos) {
        trimmed.erase(junk_start);
    }

    if (val == trimmed) {
        return false;
    }
    val.swap(trimmed);
    return true;
}
546 
547 
// Remove every character for which isspace() is true from str, keeping the
// remaining characters in their original order.
// Returns true if at least one character was removed.
bool RemoveSpaces(string& str)
{
    // Compact in place: 'kept' is both the count of characters retained so
    // far and the index where the next retained character is written. It
    // never runs ahead of the read position.
    string::size_type kept = 0;
    for (const char ch : str) {
        if (!isspace((unsigned char)ch)) {
            str[kept++] = ch;
        }
    }
    if (kept == str.length()) {
        return false; // nothing removed (also covers the empty string)
    }
    str.resize(kept);
    return true;
}
567 
569 public:
571  : m_scope(scope) { }
572 
574  const CSeq_loc& loc1,
575  const CSeq_loc& loc2,
577  {
578  return sequence::Seq_loc_Add( loc1, loc2, flags, m_scope );
579  }
580 
581 private:
582  CScope *m_scope;
583 };
584 
585 CRef<CSeq_loc> ReadLocFromText(const string& text, const CSeq_id *id, CScope *scope)
586 {
588  return GetSeqLocFromString(text, id, &helper);
589 }
590 
// One row of the amino-acid abbreviation table: a multi-letter abbreviation
// and the single-letter code it maps to.
typedef struct proteinabbrev {
    string abbreviation;
    char letter;
} ProteinAbbrevData;

// Table searched by x_ValidAminoAcid. Note that 'X' appears twice ("Xxx" and
// "OTHER"); lookups return the first match.
static ProteinAbbrevData abbreviation_list[] =
{
    {"Ala", 'A'},
    {"Asx", 'B'},
    {"Cys", 'C'},
    {"Asp", 'D'},
    {"Glu", 'E'},
    {"Phe", 'F'},
    {"Gly", 'G'},
    {"His", 'H'},
    {"Ile", 'I'},
    {"Xle", 'J'}, /* was - notice no 'J', breaks naive meaning of index -Karl */
    {"Lys", 'K'},
    {"Leu", 'L'},
    {"Met", 'M'},
    {"Asn", 'N'},
    {"Pyl", 'O'}, /* was - no 'O' */
    {"Pro", 'P'},
    {"Gln", 'Q'},
    {"Arg", 'R'},
    {"Ser", 'S'},
    {"Thr", 'T'},
    {"Val", 'V'},
    {"Trp", 'W'},
    {"Sec", 'U'}, /* was - not in iupacaa */
    {"Xxx", 'X'},
    {"Tyr", 'Y'},
    {"Glx", 'Z'},
    {"TERM", '*'}, /* not in iupacaa */ /*changed by Tatiana 06.07.95?`*/
    {"OTHER", 'X'}
};
627 
628 // Find the single-letter abbreviation for either the single letter abbreviation
629 // or three-letter abbreviation.
630 // Use X if the abbreviation is not found.
631 
632 char x_ValidAminoAcid(string_view abbrev)
633 {
634  if (abbrev.length() >= 3) {
635  for (unsigned k = 0; k < ArraySize(abbreviation_list); ++k) {
636  if (NStr::EqualNocase(abbrev, abbreviation_list[k].abbreviation)) {
637  return abbreviation_list[k].letter;
638  }
639  }
640  }
641 
642  if (abbrev.length() == 1) {
643  for (unsigned k = 0; k < ArraySize(abbreviation_list); ++k) {
644  if (abbrev[0] == abbreviation_list[k].letter) {
645  return abbreviation_list[k].letter;
646  }
647  }
648  }
649 
650  return 'X';
651 }
652 
653 
654 bool s_DbtagCompare (const CRef<CDbtag>& dbt1, const CRef<CDbtag>& dbt2)
655 {
656  // is dbt1 < dbt2
657  return dbt1->Compare(*dbt2) < 0;
658 }
659 
660 
661 bool s_DbtagEqual (const CRef<CDbtag>& dbt1, const CRef<CDbtag>& dbt2)
662 {
663  // is dbt1 == dbt2
664  return dbt1->Compare(*dbt2) == 0;
665 }
666 
667 bool s_OrgrefSynCompare( const string & syn1, const string & syn2 )
668 {
669  return NStr::CompareNocase(syn1, syn2) < 0;
670 }
671 
672 bool s_OrgrefSynEqual( const string & syn1, const string & syn2 )
673 {
674  return NStr::EqualNocase(syn1, syn2);
675 }
676 
677 
User-defined methods of the data storage class.
int Compare(const CDbtag &dbt2) const
Definition: Dbtag.cpp:176
virtual CRef< CSeq_loc > Seq_loc_Add(const CSeq_loc &loc1, const CSeq_loc &loc2, CSeq_loc::TOpFlags flags)
CScope –.
Definition: scope.hpp:92
bool RemoveSpacesBetweenTildes(string &str)
remove white space between pairs of tildes.
bool CleanDoubleQuote(string &str)
Change double to single quotes.
bool CleanVisString(string &str)
#define comma_space
#define space_comma
#define space_semicolon
CRef< CSeq_loc > ReadLocFromText(const string &text, const CSeq_id *id, CScope *scope)
bool s_OrgrefSynCompare(const string &syn1, const string &syn2)
bool s_DbtagEqual(const CRef< CDbtag > &dbt1, const CRef< CDbtag > &dbt2)
void TrimInternalSemicolons(string &str)
remove duplicate internal semicolons.
bool s_DbtagCompare(const CRef< CDbtag > &dbt1, const CRef< CDbtag > &dbt2)
#define semicolon_space
char x_ValidAminoAcid(string_view abbrev)
#define bracket_space
bool s_OrgrefSynEqual(const string &syn1, const string &syn2)
struct proteinabbrev ProteinAbbrevData
static ProteinAbbrevData abbreviation_list[]
bool RemoveSpaces(string &str)
remove all spaces from a string
#define twocommas
#define twosemicolons
#define space_bracket
bool CleanVisStringJunk(string &str, bool allow_ellipses)
#define twospaces
bool Asn2gnbkCompressSpaces(string &val)
weird space compression from C Toolkit
bool TrimSpacesSemicolonsAndCommas(string &val)
Include a standard set of the NCBI C++ Toolkit most basic headers.
static uch flags
std::ofstream out("events_result.xml")
main entry point for tests
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
static const char * str(char *buf, int n)
Definition: stats.c:84
constexpr size_t ArraySize(const Element(&)[Size])
Definition: ncbimisc.hpp:1532
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
int TOpFlags
Definition: Seq_loc.hpp:336
CRef< CSeq_loc > Seq_loc_Add(const CSeq_loc &loc1, const CSeq_loc &loc2, CSeq_loc::TOpFlags flags, CScope *scope)
Add two seq-locs.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
#define NPOS
Definition: ncbistr.hpp:133
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2882
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5347
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5378
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
static void text(MDB_val *v)
Definition: mdb_dump.c:62
int isspace(Uchar c)
Definition: ncbictype.hpp:69
std::istream & in(std::istream &in_, double &x_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
CRef< CSeq_loc > GetSeqLocFromString(const string &text, const CSeq_id *id, CGetSeqLocFromStringHelper *helper)
#define _ASSERT
static Uint4 letter(char c)
Modified on Fri Sep 20 14:58:25 2024 by modify_doxy.py rev. 669887