NCBI C++ ToolKit
capitalization_string.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: capitalization_string.cpp 99172 2023-02-22 15:30:43Z asztalos $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Andrea Asztalos, Igor Filippov
27 *
28 * File Description:
29 * Implement capitalization change in strings.
30 */
31 
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistd.hpp>
34 #include <objmgr/bioseq_ci.hpp>
35 #include <objmgr/seqdesc_ci.hpp>
38 
39 #include <util/xregexp/regexp.hpp>
41 
44 
46 {
47  {"\\barabidopsis thaliana\\b","Arabidopsis thaliana"},
48  {"\\badp\\b", "ADP" },
49  {"\\batp\\b", "ATP" },
50  {"\\bbac\\b", "BAC" },
51  {"\\bcaenorhabditis elegans\\b", "Caenorhabditis elegans" },
52  {"\\bcdna\\b", "cDNA" },
53  {"\\bcdnas\\b", "cDNAs" },
54  {"\\bcoa\\b", "CoA" },
55  {"\\bcoi\\b", "COI" },
56  {"\\bcoii\\b", "COII" },
57  {"\\bdanio rerio\\b", "Danio rerio" },
58  {"\\bdna\\b", "DNA" },
59  {"\\bdrosophila melanogaster\\b", "Drosophila melanogaster" },
60  {"\\bdsrna\\b", "dsRNA" },
61  {"\\bescherichia coli\\b", "Escherichia coli" },
62  {"\\bhiv\\b", "HIV" },
63  {"\\bhiv\\-1\\b", "HIV-1" },
64  {"\\bhiv\\-2\\b", "HIV-2" },
65  {"\\bhnrna\\b", "hnRNA" },
66  {"\\bhomo sapiens\\b", "Homo sapiens" },
67  {"\\bmhc\\b", "MHC" },
68  {"\\bmrna\\b", "mRNA" },
69  {"\\bmtdna\\b", "mtDNA" },
70  {"\\bmus musculus\\b", "Mus musculus" },
71  {"\\bnadh\\b", "NADH" },
72  {"\\bnov\\.\\b", "nov." },
73  {"\\bnov\\.\\.\\b", "nov.." },
74  {"\\bpcr\\b", "PCR" },
75  {"\\brattus norvegicus\\b", "Rattus norvegicus" },
76  {"\\brapd\\b", "RAPD" },
77  {"\\brdna\\b", "rDNA" },
78  {"\\brna\\b", "RNA" },
79  {"\\brrna\\b", "rRNA" },
80  {"\\brt\\-pcr\\b", "RT-PCR" },
81  {"\\bsaccharomyces cerevisiae\\b", "Saccharomyces cerevisiae" },
82  {"\\bscrna\\b", "scRNA" },
83  {"\\bsiv\\-1\\b", "SIV-1" },
84  {"\\bsnp\\b", "SNP" },
85  {"\\bsnps\\b", "SNPs" },
86  {"\\bsnrna\\b", "snRNA" },
87  {"\\bsp\\.\\b", "sp." },
88  {"\\bsp\\.\\.\\b", "sp.." },
89  {"\\bssp\\.\\b", "ssp." },
90  {"\\bssp\\.\\.\\b", "ssp.." },
91  {"\\bssrna\\b", "ssRNA" },
92  {"\\bsubsp\\.\\b", "subsp." },
93  {"\\bsubsp\\.\\.\\b", "subsp.." },
94  {"\\btrna\\b", "tRNA" },
95  {"\\bvar\\.\\b", "var." },
96  {"\\bvar\\.\\.\\b", "var.." },
97  {"\\buk\\b", "UK" },
98  {"\\busa\\b", "USA" },
99  {"\\bU\\.S\\.A\\.\\b", "USA" },
100  {"\\bU\\.S\\.A\\b", "USA" },
101  {"\\bUnited States of America\\b", "USA" },
102  {"\\b\\(hiv\\)\\b", "(HIV)" },
103  {"\\b\\(hiv1\\)\\b", "(HIV1)" },
104  {"\\b\\(hiv\\-1\\)\\b", "(HIV-1)"},
105 
106  {"\0","\0"}
107 };
108 
110 {
111  {"\\bsp\\.$", "sp.." },
112  {"\\bnov\\.$", "nov.." },
113  {"\\bssp\\.$", "ssp.." },
114  {"\\bvar\\.$", "var.." },
115  {"\\bsubsp\\.$", "subsp.."},
116  {"\0","\0"}
117 };
118 
120 {
121 { "ala", "AL"},
122 { "alabama", "AL"},
123 { "alas", "AK"},
124 { "alaska", "AK"},
125 { "ariz", "AZ"},
126 { "arizona", "AZ"},
127 { "ark", "AR"},
128 { "arkansas", "AR"},
129 { "cal", "CA"},
130 { "cali", "CA"},
131 { "calif", "CA"},
132 { "california", "CA"},
133 { "col", "CO"},
134 { "colo", "CO"},
135 { "colorado", "CO"},
136 { "conn", "CT"},
137 { "connecticut", "CT"},
138 { "del", "DE"},
139 { "delaware", "DE"},
140 { "fla", "FL"},
141 { "florida", "FL"},
142 { "georgia", "GA"},
143 { "hawaii", "HI"},
144 { "ida", "ID"},
145 { "idaho", "ID"},
146 { "ill", "IL"},
147 { "illinois", "IL"},
148 { "ind", "IN"},
149 { "indiana", "IN"},
150 { "iowa", "IA"},
151 { "kan", "KS"},
152 { "kans", "KS"},
153 { "kansas", "KS"},
154 { "ken", "KY"},
155 { "kent", "KY"},
156 { "kentucky", "KY"},
157 { "louisiana", "LA"},
158 { "maine", "ME"},
159 { "maryland", "MD"},
160 { "mass", "MA"},
161 { "massachusetts", "MA"},
162 { "mich", "MI"},
163 { "michigan", "MI"},
164 { "minn", "MN"},
165 { "minnesota", "MN"},
166 { "miss", "MS"},
167 { "mississippi", "MS"},
168 { "missouri", "MO"},
169 { "mont", "MT"},
170 { "montana", "MT"},
171 { "n car", "NC"},
172 { "n dak", "ND"},
173 { "neb", "NE"},
174 { "nebr", "NE"},
175 { "nebraska", "NE"},
176 { "nev", "NV"},
177 { "nevada", "NV"},
178 { "new hampshire", "NH"},
179 { "new jersey", "NJ"},
180 { "new mexico", "NM"},
181 { "new york", "NY"},
182 { "north carolina", "NC"},
183 { "north dakota", "ND"},
184 { "ohio", "OH"},
185 { "okla", "OK"},
186 { "oklahoma", "OK"},
187 { "ore", "OR"},
188 { "oreg", "OR"},
189 { "oregon", "OR"},
190 { "penn", "PA"},
191 { "penna", "PA"},
192 { "pennsylvania", "PA"},
193 { "puerto rico", "PR"},
194 { "rhode island", "RI"},
195 { "s car", "SC"},
196 { "s dak", "SD"},
197 { "south carolina", "SC"},
198 { "south dakota", "SD"},
199 { "tenn", "TN"},
200 { "tennessee", "TN"},
201 { "tex", "TX"},
202 { "texas", "TX"},
203 { "utah", "UT"},
204 { "vermont", "VT"},
205 { "virg", "VA"},
206 { "virginia", "VA"},
207 { "wash", "WA"},
208 { "washington", "WA"},
209 { "west virginia", "WV"},
210 { "wis", "WI"},
211 { "wisc", "WI"},
212 { "wisconsin", "WI"},
213 { "wyo", "WY"},
214 { "wyoming", "WY"}
215 
216 };
217 
/// Canonical spellings of well-known mouse strain names.
///
/// FixupMouseStrain() walks this table in order, uses each entry both as a
/// case-insensitive whole-word regular expression and as its replacement
/// text, and stops at the first entry that matches.  Because of that
/// first-match rule a repeated entry can never take effect, so every name
/// is listed exactly once.  Entries are fed to the regex engine verbatim:
/// do not add names containing regex metacharacters without escaping them.
static const string mouse_strain_fixes[] = {
    "129/Sv",
    "129/SvJ",
    "BALB/c",
    "C57BL/6",
    "C57BL/6J",
    "CD-1",
    "CZECHII",
    "FVB/N",
    "FVB/N-3",
    "ICR",
    "NMRI",
    "NOD",
    "C3H",
    "C57BL",
    "DBA/2"
};
237 
240 
242 {
243  {"\\bA\\b", "a" },
244  {"\\bAbout\\b", "about" },
245  {"\\bAnd\\b", "and" },
246  {"\\bAt\\b", "at" },
247  {"\\bBut\\b", "but" },
248  {"\\bBy\\b", "by" },
249  {"\\bFor\\b", "for" },
250  {"\\bIn\\b", "in" },
251  {"\\bIs\\b", "is" },
252  {"\\bOf\\b", "of" },
253  {"\\bOn\\b", "on" },
254  {"\\bOr\\b", "or" },
255  {"\\bThe\\b", "the" },
256  {"\\bTo\\b", "to" },
257  {"\\bWith\\b", "with" },
258  {"\0","\0"}
259 };
260 
261 
263 {
264 
265  {"\\bchnia\\b", "China" },
266  {"\\bpr china\\b", "P.R. China" },
267  {"\\bprchina\\b", "P.R. China" },
268  {"\\bp\\.r\\.china\\b", "P.R. China" },
269  {"\\bp\\.r china\\b", "P.R. China" },
270  {"\\bp\\, r\\, china\\b", "P.R. China" },
271  {"\\brok\\b", "ROK" },
272  {"\\brsa\\b", "RSA" },
273  {"\\broc\\b", "ROC" },
274  {"\\buae\\b", "UAE" },
275  {"\\bK\\.S\\.A\\.\\b", "K.S.A." },
276  {"\\bk\\. s\\. a\\.\\b", "K. S. A." },
277  {"\\bksa\\b", "KSA" },
278  {"\0","\0"}
279 };
280 
282 {
283  {"\\bAu\\b", "au" },
284  {"\\bAux\\b", "aux" },
285  {"\\bA La\\b", "a la" },
286  {"\\bDe La\\b", "de la" },
287  {"\\bDe\\b", "de" },
288  {"\\bDel\\b", "del"},
289  {"\\bDes\\b", "des" },
290  {"\\bDu\\b", "du" },
291  {"\\bEt\\b", "et" },
292  {"\\bLa\\b", "la" },
293  {"\\bLe\\b", "le" },
294  {"\\bLes\\b", "les" },
295  {"\\bRue\\b", "rue" },
296  {"\\bPo Box\\b", "PO Box" },
297  {"\\bPobox\\b", "PO Box" },
298  {"\\bP\\.O box\\b", "P.O. Box" },
299  {"\\bP\\.Obox\\b", "P.O. Box" },
300  {"\\bY\\b", "y" },
301  {"\\bA\\&F\\b", "A&F" }, // Northwest A&F University
302  {"\0","\0"}
303 };
304 
/// Regular expressions matching an ordinal suffix that directly follows a
/// digit and ends a word ("th", "st", "nd", "rd").
/// The table is terminated by an entry whose first character is '\0';
/// consumers loop until they reach that sentinel.
static const char* set_ordinal_endings[] =
{
    "\\dth\\b",   // 4th, 11th, ...
    "\\dst\\b",   // 1st, 21st, ...
    "\\dnd\\b",   // 2nd, 22nd, ...
    "\\drd\\b",   // 3rd, 23rd, ...
    "\0"          // end-of-table sentinel
};
313 
315 {
316  {"\\bpo box\\b", "PO Box" },
317  {"\\bPobox\\b", "PO Box" },
318  {"\\bP\\.O box\\b", "P.O. Box" },
319  {"\\bP\\.Obox\\b", "P.O. Box" },
320  {"\\bPO\\.Box\\b", "P.O. Box" },
321  {"\\bPO\\. Box\\b", "P.O. Box" },
322  {"\\bpr china\\b", "P.R. China"},
323  {"\\bprchina\\b", "P.R. China" },
324  {"\\bp\\.r\\.china\\b", "P.R. China" },
325  {"\\bp\\.r china\\b", "P.R. China" },
326  {"\\bp\\, r\\, china\\b", "P.R. China" },
327  {"\\bp\\,r\\, china\\b", "P.R. China" },
328  {"\\bp\\,r\\,china\\b", "P.R. China" },
329  {"\0","\0"} // end of array
330 };
331 
332 const string& GetValidCountryCode(unsigned int i)
333 {
334  static vector<string> set_valid_country_codes
335  {
336  "Afghanistan",
337  "Albania",
338  "Algeria",
339  "American Samoa",
340  "Andorra",
341  "Angola",
342  "Anguilla",
343  "Antarctica",
344  "Antigua and Barbuda",
345  "Arctic Ocean",
346  "Argentina",
347  "Armenia",
348  "Aruba",
349  "Ashmore and Cartier Islands",
350  "Atlantic Ocean",
351  "Australia",
352  "Austria",
353  "Azerbaijan",
354  "Bahamas",
355  "Bahrain",
356  "Baker Island",
357  "Baltic Sea",
358  "Bangladesh",
359  "Barbados",
360  "Bassas da India",
361  "Belarus",
362  "Belgium",
363  "Belize",
364  "Benin",
365  "Bermuda",
366  "Bhutan",
367  "Bolivia",
368  "Borneo",
369  "Bosnia and Herzegovina",
370  "Botswana",
371  "Bouvet Island",
372  "Brazil",
373  "British Virgin Islands",
374  "Brunei",
375  "Bulgaria",
376  "Burkina Faso",
377  "Burundi",
378  "Cambodia",
379  "Cameroon",
380  "Canada",
381  "Cape Verde",
382  "Cayman Islands",
383  "Central African Republic",
384  "Chad",
385  "Chile",
386  "China",
387  "Christmas Island",
388  "Clipperton Island",
389  "Cocos Islands",
390  "Colombia",
391  "Comoros",
392  "Cook Islands",
393  "Coral Sea Islands",
394  "Costa Rica",
395  "Cote d'Ivoire",
396  "Croatia",
397  "Cuba",
398  "Curacao",
399  "Cyprus",
400  "Czech Republic",
401  "Democratic Republic of the Congo",
402  "Denmark",
403  "Djibouti",
404  "Dominica",
405  "Dominican Republic",
406  "East Timor",
407  "Ecuador",
408  "Egypt",
409  "El Salvador",
410  "Equatorial Guinea",
411  "Eritrea",
412  "Estonia",
413  "Ethiopia",
414  "Europa Island",
415  "Falkland Islands (Islas Malvinas)",
416  "Faroe Islands",
417  "Fiji",
418  "Finland",
419  "France",
420  "French Guiana",
421  "French Polynesia",
422  "French Southern and Antarctic Lands",
423  "Gabon",
424  "Gambia",
425  "Gaza Strip",
426  "Georgia",
427  "Germany",
428  "Ghana",
429  "Gibraltar",
430  "Glorioso Islands",
431  "Greece",
432  "Greenland",
433  "Grenada",
434  "Guadeloupe",
435  "Guam",
436  "Guatemala",
437  "Guernsey",
438  "Guinea",
439  "Guinea-Bissau",
440  "Guyana",
441  "Haiti",
442  "Heard Island and McDonald Islands",
443  "Honduras",
444  "Hong Kong",
445  "Howland Island",
446  "Hungary",
447  "Iceland",
448  "India",
449  "Indian Ocean",
450  "Indonesia",
451  "Iran",
452  "Iraq",
453  "Ireland",
454  "Isle of Man",
455  "Israel",
456  "Italy",
457  "Jamaica",
458  "Jan Mayen",
459  "Japan",
460  "Jarvis Island",
461  "Jersey",
462  "Johnston Atoll",
463  "Jordan",
464  "Juan de Nova Island",
465  "Kazakhstan",
466  "Kenya",
467  "Kerguelen Archipelago",
468  "Kingman Reef",
469  "Kiribati",
470  "Kosovo",
471  "Kuwait",
472  "Kyrgyzstan",
473  "Laos",
474  "Latvia",
475  "Lebanon",
476  "Lesotho",
477  "Liberia",
478  "Libya",
479  "Liechtenstein",
480  "Line Islands",
481  "Lithuania",
482  "Luxembourg",
483  "Macau",
484  "Macedonia",
485  "Madagascar",
486  "Malawi",
487  "Malaysia",
488  "Maldives",
489  "Mali",
490  "Malta",
491  "Marshall Islands",
492  "Martinique",
493  "Mauritania",
494  "Mauritius",
495  "Mayotte",
496  "Mediterranean Sea",
497  "Mexico",
498  "Micronesia",
499  "Midway Islands",
500  "Moldova",
501  "Monaco",
502  "Mongolia",
503  "Montenegro",
504  "Montserrat",
505  "Morocco",
506  "Mozambique",
507  "Myanmar",
508  "Namibia",
509  "Nauru",
510  "Navassa Island",
511  "Nepal",
512  "Netherlands",
513  "New Caledonia",
514  "New Zealand",
515  "Nicaragua",
516  "Niger",
517  "Nigeria",
518  "Niue",
519  "Norfolk Island",
520  "North Korea",
521  "North Sea",
522  "Northern Mariana Islands",
523  "Norway",
524  "Oman",
525  "Pacific Ocean",
526  "Pakistan",
527  "Palau",
528  "Palmyra Atoll",
529  "Panama",
530  "Papua New Guinea",
531  "Paracel Islands",
532  "Paraguay",
533  "Peru",
534  "Philippines",
535  "Pitcairn Islands",
536  "Poland",
537  "Portugal",
538  "Puerto Rico",
539  "Qatar",
540  "Republic of the Congo",
541  "Reunion",
542  "Romania",
543  "Ross Sea",
544  "Russia",
545  "Rwanda",
546  "Saint Helena",
547  "Saint Kitts and Nevis",
548  "Saint Lucia",
549  "Saint Pierre and Miquelon",
550  "Saint Vincent and the Grenadines",
551  "Samoa",
552  "San Marino",
553  "Sao Tome and Principe",
554  "Saudi Arabia",
555  "Senegal",
556  "Serbia",
557  "Seychelles",
558  "Sierra Leone",
559  "Singapore",
560  "Sint Maarten",
561  "Slovakia",
562  "Slovenia",
563  "Solomon Islands",
564  "Somalia",
565  "South Africa",
566  "South Georgia and the South Sandwich Islands",
567  "South Korea",
568  "South Sudan",
569  "Southern Ocean",
570  "Spain",
571  "Spratly Islands",
572  "Sri Lanka",
573  "Sudan",
574  "Suriname",
575  "Svalbard",
576  "Swaziland",
577  "Sweden",
578  "Switzerland",
579  "Syria",
580  "Taiwan",
581  "Tajikistan",
582  "Tanzania",
583  "Tasman Sea",
584  "Thailand",
585  "Togo",
586  "Tokelau",
587  "Tonga",
588  "Trinidad and Tobago",
589  "Tromelin Island",
590  "Tunisia",
591  "Turkey",
592  "Turkmenistan",
593  "Turks and Caicos Islands",
594  "Tuvalu",
595  "Uganda",
596  "Ukraine",
597  "United Arab Emirates",
598  "United Kingdom",
599  "Uruguay",
600  "USA",
601  "Uzbekistan",
602  "Vanuatu",
603  "Venezuela",
604  "Viet Nam",
605  "Virgin Islands",
606  "Wake Island",
607  "Wallis and Futuna",
608  "West Bank",
609  "Western Sahara",
610  "Yemen",
611  "Zambia",
612  "Zimbabwe"
613  };
614  return (i < set_valid_country_codes.size()) ? set_valid_country_codes[i] : kEmptyStr;
615 };
616 
617 
618 void FixCapitalizationInString (CSeq_entry_Handle seh, string& str, ECapChange capchange_opt)
619 {
620  if (NStr::IsBlank(str) || capchange_opt == eCapChange_none) {
621  return;
622  } else {
623  switch (capchange_opt) {
624  case eCapChange_tolower:
627  FixOrgNames(seh, str);
628  break;
629  case eCapChange_toupper:
632  FixOrgNames(seh, str);
633  break;
636  if ( isalpha(str[0]) ) {
637  str[0] = toupper(str[0]);
638  }
640  FixOrgNames(seh, str);
641  break;
643  if ( isalpha(str[0]) ) {
644  str[0] = toupper(str[0]);
645  }
646  break;
648  if ( isalpha(str[0]) ) {
649  str[0] = tolower(str[0]);
650  }
651  break;
654  {
656  vector<string> words;
657  NStr::Split(str, " \t\r\n", words);
658  for (vector<string>::iterator word = words.begin(); word != words.end(); ++word) {
659  if (!word->empty() && isalpha(word->at(0))) {
660  word->at(0) = toupper(word->at(0));
661  }
662  }
663  str = NStr::Join(words, " ");
664  if (capchange_opt == eCapChange_capword_afterspacepunc) {
665  bool found_punct = false;
666  for (SIZE_TYPE n = 0; n < str.size(); ++n) {
667  if (ispunct(str[n])) {
668  found_punct = true;
669  } else if (isalpha(str[n]) && found_punct) {
670  str[n] = toupper(str[n]);
671  found_punct = false;
672  }
673  }
674  }
676  FixOrgNames(seh, str);
677  }
678  break;
679  default:
680  break;
681  }
682  }
683 }
684 
685 void FixAbbreviationsInElement(string& result, bool fix_end_of_sentence)
686 {
687  for (int pat=0; set_abbreviation_list[pat].first[0]!='\0'; ++pat) {
688  CRegexpUtil replacer( result );
689  //int num_replacements =
690  replacer.Replace( set_abbreviation_list[pat].first, set_abbreviation_list[pat].second,
692  replacer.GetResult().swap( result );
693  }
694  if (fix_end_of_sentence)
695  {
696  for (int pat=0; set_abbreviation_list_end_of_sentence[pat].first[0]!='\0'; ++pat) {
697  CRegexpUtil replacer( result );
700  replacer.GetResult().swap( result );
701  }
702  }
703 }
704 
705 static bool s_ReplaceInPlaceWholeWordNoCase(string& str, const string& search, const string& replace)
706 {
707  bool modified = false;
708 
709  size_t pos = NStr::FindNoCase(str, search);
710  while (pos != string::npos) {
711  size_t right_end = pos + search.length();
712  if ((pos == 0 || !isalpha(str.c_str()[pos - 1]))
713  && (right_end == str.length() || !isalpha(str.c_str()[right_end]))) {
714  string this_replace = replace;
715  str = str.substr(0, pos) + this_replace + str.substr(right_end);
716  right_end = pos + this_replace.length();
717  modified = true;
718  }
719  pos = NStr::FindNoCase(str, search, right_end);
720  }
721 
722  return modified;
723 }
724 
726 {
727  vector<string> taxnames;
728  FindOrgNames(seh, taxnames);
729  for (vector<string>::const_iterator name = taxnames.begin(); name != taxnames.end(); ++name) {
730  bool modified = s_ReplaceInPlaceWholeWordNoCase(result, *name, *name);
731  if (!modified && (NStr::Find(*name, "]") != NPOS || NStr::Find(*name, "[") != NPOS)) {
732  string temp_taxname(*name);
733  NStr::ReplaceInPlace(temp_taxname, "]", "");
734  NStr::ReplaceInPlace(temp_taxname, "[", "");
735  modified = s_ReplaceInPlaceWholeWordNoCase(result, temp_taxname, temp_taxname);
736  }
737  }
738 }
739 
740 void FindOrgNames(CSeq_entry_Handle seh, vector<string>& taxnames)
741 {
742  if (!seh) return;
744  CBioseq_CI b_iter(seh, CSeq_inst::eMol_na);
745  for ( ; b_iter ; ++b_iter ) {
746  CSeqdesc_CI it (*b_iter, CSeqdesc::e_Source);
747  if (it && it->GetSource().IsSetTaxname()) {
748  auto& tax_name = it->GetSource().GetTaxname();
749  if (!NStr::IsBlank(tax_name)) {
750  names.insert(tax_name);
751  }
752  }
753  }
754  taxnames.assign(names.begin(), names.end());
755 }
756 
757 void RemoveFieldNameFromString( const string& field_name, string& str)
758 {
759  if (NStr::IsBlank(field_name) || NStr::IsBlank(str)) {
760  return;
761  }
762 
764  if (NStr::StartsWith(str, field_name, NStr::eNocase) && str.length() > field_name.length()
765  && str[field_name.length()] == ' ') {
766  NStr::ReplaceInPlace(str, field_name, kEmptyStr, 0, 1);
768  }
769 }
770 
772 {
773  NStr::ReplaceInPlace (state, " ", " ");
775  TCStringPairsMap::const_iterator found = k_state_abbrev.find(NStr::ToLower(state).c_str());
776  if (found != k_state_abbrev.end())
777  state = found->second;
778  else
780 }
781 
783 {
784  bool modified = false;
785  if (sub.IsSetAuthors() && sub.GetAuthors().IsSetAffil() && sub.GetAuthors().GetAffil().IsStd()) {
786  modified |= FixUSAAbbreviationInAffil(sub.SetAuthors().SetAffil());
787  modified |= FixStateAbbreviationsInAffil(sub.SetAuthors().SetAffil());
788  }
789  return modified;
790 }
791 
793 {
794  if (affil.IsStd() && affil.GetStd().IsSetCountry()) {
795  CAffil::C_Std& std = affil.SetStd();
796  string country = std.GetCountry();
797  NStr::ReplaceInPlace(country, " ", " ");
799 
800  if (NStr::CompareNocase(country, "United States of America") == 0 ||
801  NStr::CompareNocase(country, "United States") == 0 ||
802  NStr::CompareNocase(country, "U.S.A.") == 0 ||
803  NStr::CompareNocase(country, "U S A") == 0 ||
804  NStr::CompareNocase(country, "US") == 0)
805  {
806  std.SetCountry("USA");
807  return true;
808  }
809  }
810  return false;
811 }
812 
814 {
815  if (affil.IsStd()) {
816  CAffil::C_Std& std = affil.SetStd();
817  if (std.IsSetCountry() && NStr::EqualCase(std.GetCountry(), "USA")) {
818  if (std.IsSetSub() && !NStr::IsBlank(std.GetSub())) {
819  string state = std.GetSub();
820  GetStateAbbreviation(state); // update the state abbreviation
821  if (!NStr::IsBlank(state) && !NStr::EqualCase(std.GetSub(), state)) {
822  std.SetSub(state);
823  return true;
824  }
825  }
826  }
827  }
828  return false;
829 }
830 
831 bool FixupMouseStrain(string& strain)
832 {
833  if (NStr::IsBlank(strain))
834  return false;
835 
837 
838  bool whole_word = true;
839  for (unsigned int i = 0; i < sizeof(mouse_strain_fixes)/sizeof(mouse_strain_fixes[0]); ++i) {
840  CRegexpUtil replacer(strain);
841  string pattern = whole_word ? ("\\b" + mouse_strain_fixes[i] + "\\b") : mouse_strain_fixes[i];
842  // whole-word and case insensitive search
843  if (replacer.Replace(pattern, mouse_strain_fixes[i], CRegexp::fCompile_ignore_case) > 0) {
844  replacer.GetResult().swap(strain);
845  return true;
846  }
847  }
848  return false;
849 }
850 
852 {
853  CRegexpUtil replacer( result );
854  replacer.Replace( "\\,(\\S)", ", $1", CRegexp::fCompile_default, CRegexp::fMatch_default, 0);
855  replacer.GetResult().swap( result );
856 }
857 
859 {
860  CRegexpUtil replacer( result );
861  replacer.Replace( "No\\.(\\w)", "No. $1", CRegexp::fCompile_ignore_case, CRegexp::fMatch_default, 0);
862  replacer.GetResult().swap( result );
863 }
864 
866 {
868  bool capitalize = true;
869  for (unsigned int i=0; i<result.size(); i++)
870  {
871  char &a = result.at(i);
872  if (isalpha(a))
873  {
874  if (capitalize)
875  a = toupper(a);
876  capitalize = false;
877  }
878  else if (a != '\'')
879  capitalize = true;
880  }
881 }
882 
884 {
885  for (int pat=0; set_short_words[pat].first[0]!='\0'; ++pat)
886  {
887  CRegexpUtil replacer( result );
889  replacer.GetResult().swap( result );
890  }
891  result.at(0) = toupper(result.at(0));
892 }
893 
894 
896 {
897  for (int pat=0; set_country_fixes[pat].first[0] != '\0'; ++pat)
898  {
899  CRegexpUtil replacer( result );
901  replacer.GetResult().swap( result );
902  }
903 }
904 
906 {
907  string result;
908  CRegexp pattern("\\'\\w");
909  size_t start = 0;
910  for (;;) {
911  pattern.GetMatch(input, start, 0, CRegexp::fMatch_default, true);
912  if (pattern.NumFound() > 0) {
913  const auto* rslt = pattern.GetResults(0);
914  if (rslt[0] != start)
915  result += input.substr(start,rslt[0]-start);
916  string tmp = input.substr(rslt[0], rslt[1] - rslt[0]);
918  start = rslt[1];
919  } else {
920  result += input.substr(start,input.length()-start);
921  break;
922  }
923  }
924  input = result;
925 }
926 
928 {
929  if (result.empty()) return;
930  for (int pat=0; set_AffiliationShortWordList[pat].first[0]!='\0'; ++pat)
931  {
932  CRegexpUtil replacer( result );
933  //int num_replacements =
936  replacer.GetResult().swap( result );
937  }
938  result.at(0) = toupper(result.at(0));
939  // fix d'
940  {
941  CRegexpUtil replacer( result );
942  //int num_replacements =
943  replacer.Replace( "\\bD\\'", "d'", CRegexp::fCompile_default, CRegexp::fMatch_default, 0);
944  replacer.GetResult().swap( result );
945 
946  string temp;
947  CRegexp pattern("\\bd\\'\\w");
948  size_t start = 0;
949  for (;;) {
950  pattern.GetMatch(result, start, 0, CRegexp::fMatch_default, true);
951  if (pattern.NumFound() > 0) {
952  const auto* rslt = pattern.GetResults(0);
953  if (rslt[0] != start)
954  temp += result.substr(start,rslt[0]-start);
955  string tmp = result.substr(rslt[0], rslt[1] - rslt[0]);
956  tmp = NStr::ToUpper(tmp);
957  tmp.at(0) = 'd';
958  temp += tmp;
959  start = rslt[1];
960  } else {
961  temp += result.substr(start,result.length()-start);
962  break;
963  }
964  }
965  result = temp;
966  }
967 }
968 
970 {
971  for(int p = 0; set_ordinal_endings[p][0] != '\0'; ++p)
972  {
974  string temp;
975  size_t start = 0;
976  for (;;) {
977  pattern.GetMatch(result, start, 0, CRegexp::fMatch_default, true);
978  if (pattern.NumFound() > 0) {
979  const auto* rslt = pattern.GetResults(0);
980  if (rslt[0] != start)
981  temp += result.substr(start,rslt[0]-start);
982  string tmp = result.substr(rslt[0], rslt[1] - rslt[0]);
983  tmp = NStr::ToLower(tmp);
984  temp += tmp;
985  start = rslt[1];
986  } else {
987  temp += result.substr(start,result.length()-start);
988  break;
989  }
990  }
991  result = temp;
992  }
993 }
994 
996 {
997  if (result.empty()) return;
998  for (int pat=0; set_KnownAbbreviationList[pat].first[0] != '\0' ; ++pat)
999  {
1000  CRegexpUtil replacer( result );
1001  //int num_replacements =
1003  replacer.GetResult().swap( result );
1004  }
1005 }
1006 
1008 {
1009  CRegexpUtil replacer( result );
1010  //int num_replacements =
1011  replacer.Replace( "(\\d)s\\b", "$1S", CRegexp::fCompile_default, CRegexp::fMatch_default, 0);
1012  replacer.GetResult().swap( result );
1013 }
1014 
/// Normalize the capitalization of a string in place.
///
/// The first character is forced to upper or lower case according to
/// first_is_upper.  Every following character is lower-cased, with one
/// exception: an 's'/'S' that comes directly after a digit and ends a word
/// (it is the last character, or the next character is whitespace) is
/// upper-cased, so that e.g. "16s" becomes "16S".
///
/// @param result          text to modify; an empty string is left untouched
/// @param first_is_upper  true to upper-case the first character,
///                        false to lower-case it
void ResetCapitalization(string& result, bool first_is_upper)
{
    if (result.empty()) return;

    // Character classification goes through unsigned char so that bytes
    // with the high bit set are never passed on as negative values.
    const unsigned char first = static_cast<unsigned char>(result[0]);
    result[0] = static_cast<char>(first_is_upper ? toupper(first) : tolower(first));

    bool was_digit = isdigit(static_cast<unsigned char>(result[0])) != 0;

    for (size_t i = 1; i < result.size(); ++i) {
        char& ch = result[i];
        const unsigned char uch = static_cast<unsigned char>(ch);

        // The 's' must really end the word: either nothing follows it or
        // whitespace does.  (Comparing against size()-1 here would also
        // accept an 's' followed by one arbitrary final character.)
        const bool ends_word = (i + 1 >= result.size())
                            || isspace(static_cast<unsigned char>(result[i + 1]));

        if (was_digit && (ch == 'S' || ch == 's') && ends_word) {
            ch = static_cast<char>(toupper(uch));
            was_digit = false;
        }
        else if (isdigit(uch)) {
            was_digit = true;
        }
        else {
            was_digit = false;
            ch = static_cast<char>(tolower(uch));
        }
    }
}
1059 
1061 {
1062  for(unsigned int p = 0; !GetValidCountryCode(p).empty(); ++p)
1063  {
1064  string name = GetValidCountryCode(p);
1065  CRegexpUtil replacer( result );
1066  replacer.Replace( "\\b"+name+"\\b", name, CRegexp::fCompile_ignore_case, CRegexp::fMatch_default, 0);
1067  replacer.GetResult().swap( result );
1068  }
1069 }
1070 
1073 
1074 
1075 
#define static
bool FixupMouseStrain(string &strain)
This function does not check whether the taxname starts with "Mus musculus", it only corrects the mou...
void FindOrgNames(CSeq_entry_Handle seh, vector< string > &taxnames)
void FixShortWordsInElement(string &result)
void FixAbbreviationsInElement(string &result, bool fix_end_of_sentence)
void RemoveFieldNameFromString(const string &field_name, string &str)
bool FixStateAbbreviationsInAffil(CAffil &affil)
void InsertMissingSpacesAfterNo(string &result)
static const SStaticPair< const char *, const char * > set_country_fixes[]
void FixAffiliationShortWordsInElement(string &result)
void FixCountryCapitalization(string &result)
static const SStaticPair< const char *, const char * > set_AffiliationShortWordList[]
void GetStateAbbreviation(string &state)
void FindReplaceString_CountryFixes(string &result)
static const string mouse_strain_fixes[]
void CapitalizeAfterApostrophe(string &input)
bool FixUSAAbbreviationInAffil(CAffil &affil)
void FixKnownAbbreviationsInElement(string &result)
CStaticPairArrayMap< const char *, const char *, PCase_CStr > TCStringPairsMap
void InsertMissingSpacesAfterCommas(string &result)
static const SStaticPair< const char *, const char * > set_abbreviation_list_end_of_sentence[]
static const SStaticPair< const char *, const char * > set_KnownAbbreviationList[]
const string & GetValidCountryCode(unsigned int i)
void FixCapitalizationInString(CSeq_entry_Handle seh, string &str, ECapChange capchange_opt)
void CapitalizeSAfterNumber(string &result)
bool FixStateAbbreviationsInCitSub(CCit_sub &sub)
void FixCapitalizationInElement(string &result)
static const SStaticPair< const char *, const char * > set_abbreviation_list[]
static const SStaticPair< const char *, const char * > set_short_words[]
DEFINE_STATIC_ARRAY_MAP(TCStringPairsMap, k_state_abbrev, map_state_to_abbrev)
static const SStaticPair< const char *, const char * > map_state_to_abbrev[]
void FixOrgNames(CSeq_entry_Handle seh, string &result)
void FixOrdinalNumbers(string &result)
static const char * set_ordinal_endings[]
void ResetCapitalization(string &result, bool first_is_upper)
static bool s_ReplaceInPlaceWholeWordNoCase(string &str, const string &search, const string &replace)
std representation
Definition: Affil_.hpp:91
@Affil.hpp User-defined methods of the data storage class.
Definition: Affil.hpp:56
const string & GetTaxname(void) const
Definition: BioSource.cpp:340
bool IsSetTaxname(void) const
Definition: BioSource.cpp:335
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
CRegexpUtil –.
Definition: regexp.hpp:312
CRegexp –.
Definition: regexp.hpp:70
CSeq_entry_Handle –.
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
class CStaticArrayMap<> is an array adaptor that provides an STLish interface to statically-defined a...
Definition: static_map.hpp:105
TBase::const_iterator const_iterator
Definition: static_map.hpp:109
@ eCapChange_firstlower_restnochange
capitalize the first letter, the rest is not changed
@ eCapChange_capword_afterspacepunc
capitalize the first letter and letters after spaces
@ eCapChange_none
@ eCapChange_capword_afterspace
first letter is lower case, the rest is not changed
@ eCapChange_firstcap_restnochange
capitalize the first letter, the rest is lower case
@ eCapChange_firstcap_restlower
change each letter to upper case
@ eCapChange_tolower
no change
@ eCapChange_toupper
change each letter to lower case
Include a standard set of the NCBI C++ Toolkit most basic headers.
static const struct name_t names[]
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
const TOffset * GetResults(size_t idx) const
Get location of pattern/subpattern for the last GetMatch().
Definition: regexp.hpp:569
CTempString GetMatch(CTempString str, size_t offset=0, size_t idx=0, TMatch flags=fMatch_default, bool noreturn=false)
Get matching pattern and subpatterns.
Definition: regexp.cpp:182
size_t Replace(CTempStringEx search, CTempString replace, CRegexp::TCompile compile_flags=CRegexp::fCompile_default, CRegexp::TMatch match_flags=CRegexp::fMatch_default, size_t max_replace=0)
Replace occurrences of a substring within a string by pattern.
Definition: regexp.cpp:289
int NumFound() const
Get number of patterns + subpatterns.
Definition: regexp.hpp:562
string GetResult(void)
Get result string.
Definition: regexp.hpp:582
@ fCompile_default
Definition: regexp.hpp:102
@ fCompile_ignore_case
Definition: regexp.hpp:103
@ fMatch_default
Definition: regexp.hpp:127
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
#define kEmptyStr
Definition: ncbistr.hpp:123
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3457
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2989
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
#define NPOS
Definition: ncbistr.hpp:133
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3197
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2887
static string Join(const TContainer &arr, const CTempString &delim)
Join strings using the specified delimiter.
Definition: ncbistr.hpp:2697
static bool EqualCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive equality of a substring with another string.
Definition: ncbistr.hpp:5324
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5411
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3401
static string & ToUpper(string &str)
Convert string to upper case – string& version.
Definition: ncbistr.cpp:424
static string & ToLower(string &str)
Convert string to lower case – string& version.
Definition: ncbistr.cpp:405
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
bool IsSetAffil(void) const
author affiliation Check if a value has been assigned to Affil data member.
Definition: Auth_list_.hpp:498
void SetCountry(const TCountry &value)
Assign a value to Country data member.
Definition: Affil_.hpp:897
void SetSub(const TSub &value)
Assign a value to Sub data member.
Definition: Affil_.hpp:850
const TAffil & GetAffil(void) const
Get the Affil member data.
Definition: Auth_list_.hpp:510
const TAuthors & GetAuthors(void) const
Get the Authors member data.
Definition: Cit_sub_.hpp:357
bool IsSetAuthors(void) const
not necessarily authors of the paper Check if a value has been assigned to Authors data member.
Definition: Cit_sub_.hpp:345
void SetAuthors(TAuthors &value)
Assign a value to Authors data member.
Definition: Cit_sub_.cpp:74
const TSub & GetSub(void) const
Get the Sub member data.
Definition: Affil_.hpp:841
const TCountry & GetCountry(void) const
Get the Country member data.
Definition: Affil_.hpp:888
const TStd & GetStd(void) const
Get the variant data.
Definition: Affil_.cpp:214
bool IsStd(void) const
Check if variant Std is selected.
Definition: Affil_.hpp:1207
bool IsSetCountry(void) const
Author Affiliation, Country Check if a value has been assigned to Country data member.
Definition: Affil_.hpp:876
TStd & SetStd(void)
Select the variant.
Definition: Affil_.cpp:220
bool IsSetSub(void) const
Author Affiliation, County Sub Check if a value has been assigned to Sub data member.
Definition: Affil_.hpp:829
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
static int input()
int i
yy_size_t n
unsigned int a
Definition: ncbi_localip.c:102
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int isspace(Uchar c)
Definition: ncbictype.hpp:69
int tolower(Uchar c)
Definition: ncbictype.hpp:72
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
int toupper(Uchar c)
Definition: ncbictype.hpp:73
int ispunct(Uchar c)
Definition: ncbictype.hpp:68
static char tmp[2048]
Definition: utf8.c:42
static const char * str(char *buf, int n)
Definition: stats.c:84
Template structure SStaticPair is a simplified replacement of STL pair<>. Main reason of introducing this...
Definition: static_set.hpp:60
else result
Definition: token2.c:20
C++ wrappers for the Perl-compatible regular expression (PCRE) library.
#define const
Definition: zconf.h:230
Modified on Wed Dec 06 07:14:30 2023 by modify_doxy.py rev. 669887