NCBI C++ ToolKit
capitalization_string.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: capitalization_string.cpp 102169 2024-04-09 14:42:19Z asztalos $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Andrea Asztalos, Igor Filippov
27 *
28 * File Description:
29 * Implement capitalization change in strings.
30 */
31 
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistd.hpp>
34 #include <objmgr/bioseq_ci.hpp>
35 #include <objmgr/seqdesc_ci.hpp>
38 
39 #include <util/xregexp/regexp.hpp>
41 
44 
46 {
47  {"\\barabidopsis thaliana\\b","Arabidopsis thaliana"},
48  {"\\badp\\b", "ADP" },
49  {"\\batp\\b", "ATP" },
50  {"\\bbac\\b", "BAC" },
51  {"\\bcaenorhabditis elegans\\b", "Caenorhabditis elegans" },
52  {"\\bcdna\\b", "cDNA" },
53  {"\\bcdnas\\b", "cDNAs" },
54  {"\\bcoa\\b", "CoA" },
55  {"\\bcoi\\b", "COI" },
56  {"\\bcoii\\b", "COII" },
57  {"\\bdanio rerio\\b", "Danio rerio" },
58  {"\\bdna\\b", "DNA" },
59  {"\\bdrosophila melanogaster\\b", "Drosophila melanogaster" },
60  {"\\bdsrna\\b", "dsRNA" },
61  {"\\bescherichia coli\\b", "Escherichia coli" },
62  {"\\bhiv\\b", "HIV" },
63  {"\\bhiv\\-1\\b", "HIV-1" },
64  {"\\bhiv\\-2\\b", "HIV-2" },
65  {"\\bhnrna\\b", "hnRNA" },
66  {"\\bhomo sapiens\\b", "Homo sapiens" },
67  {"\\bmhc\\b", "MHC" },
68  {"\\bmrna\\b", "mRNA" },
69  {"\\bmtdna\\b", "mtDNA" },
70  {"\\bmus musculus\\b", "Mus musculus" },
71  {"\\bnadh\\b", "NADH" },
72  {"\\bnov\\.\\b", "nov." },
73  {"\\bnov\\.\\.\\b", "nov.." },
74  {"\\bpcr\\b", "PCR" },
75  {"\\brattus norvegicus\\b", "Rattus norvegicus" },
76  {"\\brapd\\b", "RAPD" },
77  {"\\brdna\\b", "rDNA" },
78  {"\\brna\\b", "RNA" },
79  {"\\brrna\\b", "rRNA" },
80  {"\\brt\\-pcr\\b", "RT-PCR" },
81  {"\\bsaccharomyces cerevisiae\\b", "Saccharomyces cerevisiae" },
82  {"\\bscrna\\b", "scRNA" },
83  {"\\bsiv\\-1\\b", "SIV-1" },
84  {"\\bsnp\\b", "SNP" },
85  {"\\bsnps\\b", "SNPs" },
86  {"\\bsnrna\\b", "snRNA" },
87  {"\\bsp\\.\\b", "sp." },
88  {"\\bsp\\.\\.\\b", "sp.." },
89  {"\\bssp\\.\\b", "ssp." },
90  {"\\bssp\\.\\.\\b", "ssp.." },
91  {"\\bssrna\\b", "ssRNA" },
92  {"\\bsubsp\\.\\b", "subsp." },
93  {"\\bsubsp\\.\\.\\b", "subsp.." },
94  {"\\btrna\\b", "tRNA" },
95  {"\\bvar\\.\\b", "var." },
96  {"\\bvar\\.\\.\\b", "var.." },
97  {"\\buk\\b", "UK" },
98  {"\\busa\\b", "USA" },
99  {"\\bU\\.S\\.A\\.\\b", "USA" },
100  {"\\bU\\.S\\.A\\b", "USA" },
101  {"\\bUnited States of America\\b", "USA" },
102  {"\\b\\(hiv\\)\\b", "(HIV)" },
103  {"\\b\\(hiv1\\)\\b", "(HIV1)" },
104  {"\\b\\(hiv\\-1\\)\\b", "(HIV-1)"},
105 
106  {"\0","\0"}
107 };
108 
110 {
111  {"\\bsp\\.$", "sp.." },
112  {"\\bnov\\.$", "nov.." },
113  {"\\bssp\\.$", "ssp.." },
114  {"\\bvar\\.$", "var.." },
115  {"\\bsubsp\\.$", "subsp.."},
116  {"\0","\0"}
117 };
118 
120 {
121 { "ala", "AL"},
122 { "alabama", "AL"},
123 { "alas", "AK"},
124 { "alaska", "AK"},
125 { "ariz", "AZ"},
126 { "arizona", "AZ"},
127 { "ark", "AR"},
128 { "arkansas", "AR"},
129 { "cal", "CA"},
130 { "cali", "CA"},
131 { "calif", "CA"},
132 { "california", "CA"},
133 { "col", "CO"},
134 { "colo", "CO"},
135 { "colorado", "CO"},
136 { "conn", "CT"},
137 { "connecticut", "CT"},
138 { "del", "DE"},
139 { "delaware", "DE"},
140 { "fla", "FL"},
141 { "florida", "FL"},
142 { "georgia", "GA"},
143 { "hawaii", "HI"},
144 { "ida", "ID"},
145 { "idaho", "ID"},
146 { "ill", "IL"},
147 { "illinois", "IL"},
148 { "ind", "IN"},
149 { "indiana", "IN"},
150 { "iowa", "IA"},
151 { "kan", "KS"},
152 { "kans", "KS"},
153 { "kansas", "KS"},
154 { "ken", "KY"},
155 { "kent", "KY"},
156 { "kentucky", "KY"},
157 { "louisiana", "LA"},
158 { "maine", "ME"},
159 { "maryland", "MD"},
160 { "mass", "MA"},
161 { "massachusetts", "MA"},
162 { "mich", "MI"},
163 { "michigan", "MI"},
164 { "minn", "MN"},
165 { "minnesota", "MN"},
166 { "miss", "MS"},
167 { "mississippi", "MS"},
168 { "missouri", "MO"},
169 { "mont", "MT"},
170 { "montana", "MT"},
171 { "n car", "NC"},
172 { "n dak", "ND"},
173 { "neb", "NE"},
174 { "nebr", "NE"},
175 { "nebraska", "NE"},
176 { "nev", "NV"},
177 { "nevada", "NV"},
178 { "new hampshire", "NH"},
179 { "new jersey", "NJ"},
180 { "new mexico", "NM"},
181 { "new york", "NY"},
182 { "north carolina", "NC"},
183 { "north dakota", "ND"},
184 { "ohio", "OH"},
185 { "okla", "OK"},
186 { "oklahoma", "OK"},
187 { "ore", "OR"},
188 { "oreg", "OR"},
189 { "oregon", "OR"},
190 { "penn", "PA"},
191 { "penna", "PA"},
192 { "pennsylvania", "PA"},
193 { "puerto rico", "PR"},
194 { "rhode island", "RI"},
195 { "s car", "SC"},
196 { "s dak", "SD"},
197 { "south carolina", "SC"},
198 { "south dakota", "SD"},
199 { "tenn", "TN"},
200 { "tennessee", "TN"},
201 { "tex", "TX"},
202 { "texas", "TX"},
203 { "utah", "UT"},
204 { "vermont", "VT"},
205 { "virg", "VA"},
206 { "virginia", "VA"},
207 { "wash", "WA"},
208 { "washington", "WA"},
209 { "west virginia", "WV"},
210 { "wis", "WI"},
211 { "wisc", "WI"},
212 { "wisconsin", "WI"},
213 { "wyo", "WY"},
214 { "wyoming", "WY"}
215 
216 };
217 
// Canonical spellings of well-known mouse strain names.
//
// FixupMouseStrain() walks this table in order, wraps each entry in \b...\b
// and runs a case-insensitive replace on the strain value; it returns as soon
// as one entry produces a replacement. Because of that early return every
// name is listed exactly once: a repeated entry could never be reached with a
// different outcome than its first occurrence.
static const string mouse_strain_fixes[] = {
    "129/Sv",
    "129/SvJ",
    "BALB/c",
    "C57BL/6",
    "C57BL/6J",
    "CD-1",
    "CZECHII",
    "FVB/N",
    "FVB/N-3",
    "ICR",
    "NMRI",
    "NOD",
    "C3H",
    "C57BL",
    "DBA/2"
};
237 
240 
// Lookup table from the full name of a Canadian province or territory to its
// two-letter abbreviation. Keys are written in lower case; key and value are
// both declared with ct::tagStrNocase (presumably making lookups
// case-insensitive -- confirm against the ct:: const-map documentation).
// Queried through Canada_map_state_to_abbrev.find(state) further down in
// this file.
241 MAKE_CONST_MAP(Canada_map_state_to_abbrev, ct::tagStrNocase, ct::tagStrNocase,
242 {
243  { "alberta", "AB"},
244  { "british columbia", "BC"},
245  { "manitoba", "MB"},
246  { "new brunswick", "NB"},
247  { "newfoundland and labrador", "NL"},
248  { "northwest territories", "NT"},
249  { "nova scotia", "NS"},
250  { "nunavut", "NU"},
251  { "ontario", "ON"},
252  { "prince edward island", "PE"},
253  { "quebec", "QC"},
254  { "saskatchewan", "SK"},
255  { "yukon", "YT"}
256 });
257 
258 
260 {
261  {"\\bA\\b", "a" },
262  {"\\bAbout\\b", "about" },
263  {"\\bAnd\\b", "and" },
264  {"\\bAt\\b", "at" },
265  {"\\bBut\\b", "but" },
266  {"\\bBy\\b", "by" },
267  {"\\bFor\\b", "for" },
268  {"\\bIn\\b", "in" },
269  {"\\bIs\\b", "is" },
270  {"\\bOf\\b", "of" },
271  {"\\bOn\\b", "on" },
272  {"\\bOr\\b", "or" },
273  {"\\bThe\\b", "the" },
274  {"\\bTo\\b", "to" },
275  {"\\bWith\\b", "with" },
276  {"\0","\0"}
277 };
278 
279 
281 {
282 
283  {"\\bchnia\\b", "China" },
284  {"\\bpr china\\b", "P.R. China" },
285  {"\\bprchina\\b", "P.R. China" },
286  {"\\bp\\.r\\.china\\b", "P.R. China" },
287  {"\\bp\\.r china\\b", "P.R. China" },
288  {"\\bp\\, r\\, china\\b", "P.R. China" },
289  {"\\brok\\b", "ROK" },
290  {"\\brsa\\b", "RSA" },
291  {"\\broc\\b", "ROC" },
292  {"\\buae\\b", "UAE" },
293  {"\\bK\\.S\\.A\\.\\b", "K.S.A." },
294  {"\\bk\\. s\\. a\\.\\b", "K. S. A." },
295  {"\\bksa\\b", "KSA" },
296  {"\0","\0"}
297 };
298 
300 {
301  {"\\bAu\\b", "au" },
302  {"\\bAux\\b", "aux" },
303  {"\\bA La\\b", "a la" },
304  {"\\bDe La\\b", "de la" },
305  {"\\bDe\\b", "de" },
306  {"\\bDel\\b", "del"},
307  {"\\bDes\\b", "des" },
308  {"\\bDu\\b", "du" },
309  {"\\bEt\\b", "et" },
310  {"\\bLa\\b", "la" },
311  {"\\bLe\\b", "le" },
312  {"\\bLes\\b", "les" },
313  {"\\bRue\\b", "rue" },
314  {"\\bPo Box\\b", "PO Box" },
315  {"\\bPobox\\b", "PO Box" },
316  {"\\bP\\.O box\\b", "P.O. Box" },
317  {"\\bP\\.Obox\\b", "P.O. Box" },
318  {"\\bY\\b", "y" },
319  {"\\bA\\&F\\b", "A&F" }, // Northwest A&F University
320  {"\0","\0"}
321 };
322 
// Regular expressions for an ordinal suffix ("th", "st", "nd", "rd") that
// directly follows a digit and ends at a word boundary.
// The table is terminated by a sentinel entry whose first character is '\0';
// code iterating over it stops when it reaches that entry.
// Both the strings and the pointers are const: the table is read-only data.
static const char* const set_ordinal_endings[] =
{
    "\\dth\\b",
    "\\dst\\b",
    "\\dnd\\b",
    "\\drd\\b",
    "\0"
};
331 
333 {
334  {"\\bpo box\\b", "PO Box" },
335  {"\\bPobox\\b", "PO Box" },
336  {"\\bP\\.O box\\b", "P.O. Box" },
337  {"\\bP\\.Obox\\b", "P.O. Box" },
338  {"\\bPO\\.Box\\b", "P.O. Box" },
339  {"\\bPO\\. Box\\b", "P.O. Box" },
340  {"\\bpr china\\b", "P.R. China"},
341  {"\\bprchina\\b", "P.R. China" },
342  {"\\bp\\.r\\.china\\b", "P.R. China" },
343  {"\\bp\\.r china\\b", "P.R. China" },
344  {"\\bp\\, r\\, china\\b", "P.R. China" },
345  {"\\bp\\,r\\, china\\b", "P.R. China" },
346  {"\\bp\\,r\\,china\\b", "P.R. China" },
347  {"\0","\0"} // end of array
348 };
349 
350 const string& GetValidCountryCode(unsigned int i)
351 {
352  static vector<string> set_valid_country_codes
353  {
354  "Afghanistan",
355  "Albania",
356  "Algeria",
357  "American Samoa",
358  "Andorra",
359  "Angola",
360  "Anguilla",
361  "Antarctica",
362  "Antigua and Barbuda",
363  "Arctic Ocean",
364  "Argentina",
365  "Armenia",
366  "Aruba",
367  "Ashmore and Cartier Islands",
368  "Atlantic Ocean",
369  "Australia",
370  "Austria",
371  "Azerbaijan",
372  "Bahamas",
373  "Bahrain",
374  "Baker Island",
375  "Baltic Sea",
376  "Bangladesh",
377  "Barbados",
378  "Bassas da India",
379  "Belarus",
380  "Belgium",
381  "Belize",
382  "Benin",
383  "Bermuda",
384  "Bhutan",
385  "Bolivia",
386  "Borneo",
387  "Bosnia and Herzegovina",
388  "Botswana",
389  "Bouvet Island",
390  "Brazil",
391  "British Virgin Islands",
392  "Brunei",
393  "Bulgaria",
394  "Burkina Faso",
395  "Burundi",
396  "Cambodia",
397  "Cameroon",
398  "Canada",
399  "Cape Verde",
400  "Cayman Islands",
401  "Central African Republic",
402  "Chad",
403  "Chile",
404  "China",
405  "Christmas Island",
406  "Clipperton Island",
407  "Cocos Islands",
408  "Colombia",
409  "Comoros",
410  "Cook Islands",
411  "Coral Sea Islands",
412  "Costa Rica",
413  "Cote d'Ivoire",
414  "Croatia",
415  "Cuba",
416  "Curacao",
417  "Cyprus",
418  "Czech Republic",
419  "Democratic Republic of the Congo",
420  "Denmark",
421  "Djibouti",
422  "Dominica",
423  "Dominican Republic",
424  "East Timor",
425  "Ecuador",
426  "Egypt",
427  "El Salvador",
428  "Equatorial Guinea",
429  "Eritrea",
430  "Estonia",
431  "Ethiopia",
432  "Europa Island",
433  "Falkland Islands (Islas Malvinas)",
434  "Faroe Islands",
435  "Fiji",
436  "Finland",
437  "France",
438  "French Guiana",
439  "French Polynesia",
440  "French Southern and Antarctic Lands",
441  "Gabon",
442  "Gambia",
443  "Gaza Strip",
444  "Georgia",
445  "Germany",
446  "Ghana",
447  "Gibraltar",
448  "Glorioso Islands",
449  "Greece",
450  "Greenland",
451  "Grenada",
452  "Guadeloupe",
453  "Guam",
454  "Guatemala",
455  "Guernsey",
456  "Guinea",
457  "Guinea-Bissau",
458  "Guyana",
459  "Haiti",
460  "Heard Island and McDonald Islands",
461  "Honduras",
462  "Hong Kong",
463  "Howland Island",
464  "Hungary",
465  "Iceland",
466  "India",
467  "Indian Ocean",
468  "Indonesia",
469  "Iran",
470  "Iraq",
471  "Ireland",
472  "Isle of Man",
473  "Israel",
474  "Italy",
475  "Jamaica",
476  "Jan Mayen",
477  "Japan",
478  "Jarvis Island",
479  "Jersey",
480  "Johnston Atoll",
481  "Jordan",
482  "Juan de Nova Island",
483  "Kazakhstan",
484  "Kenya",
485  "Kerguelen Archipelago",
486  "Kingman Reef",
487  "Kiribati",
488  "Kosovo",
489  "Kuwait",
490  "Kyrgyzstan",
491  "Laos",
492  "Latvia",
493  "Lebanon",
494  "Lesotho",
495  "Liberia",
496  "Libya",
497  "Liechtenstein",
498  "Line Islands",
499  "Lithuania",
500  "Luxembourg",
501  "Macau",
502  "Macedonia",
503  "Madagascar",
504  "Malawi",
505  "Malaysia",
506  "Maldives",
507  "Mali",
508  "Malta",
509  "Marshall Islands",
510  "Martinique",
511  "Mauritania",
512  "Mauritius",
513  "Mayotte",
514  "Mediterranean Sea",
515  "Mexico",
516  "Micronesia",
517  "Midway Islands",
518  "Moldova",
519  "Monaco",
520  "Mongolia",
521  "Montenegro",
522  "Montserrat",
523  "Morocco",
524  "Mozambique",
525  "Myanmar",
526  "Namibia",
527  "Nauru",
528  "Navassa Island",
529  "Nepal",
530  "Netherlands",
531  "New Caledonia",
532  "New Zealand",
533  "Nicaragua",
534  "Niger",
535  "Nigeria",
536  "Niue",
537  "Norfolk Island",
538  "North Korea",
539  "North Sea",
540  "Northern Mariana Islands",
541  "Norway",
542  "Oman",
543  "Pacific Ocean",
544  "Pakistan",
545  "Palau",
546  "Palmyra Atoll",
547  "Panama",
548  "Papua New Guinea",
549  "Paracel Islands",
550  "Paraguay",
551  "Peru",
552  "Philippines",
553  "Pitcairn Islands",
554  "Poland",
555  "Portugal",
556  "Puerto Rico",
557  "Qatar",
558  "Republic of the Congo",
559  "Reunion",
560  "Romania",
561  "Ross Sea",
562  "Russia",
563  "Rwanda",
564  "Saint Helena",
565  "Saint Kitts and Nevis",
566  "Saint Lucia",
567  "Saint Pierre and Miquelon",
568  "Saint Vincent and the Grenadines",
569  "Samoa",
570  "San Marino",
571  "Sao Tome and Principe",
572  "Saudi Arabia",
573  "Senegal",
574  "Serbia",
575  "Seychelles",
576  "Sierra Leone",
577  "Singapore",
578  "Sint Maarten",
579  "Slovakia",
580  "Slovenia",
581  "Solomon Islands",
582  "Somalia",
583  "South Africa",
584  "South Georgia and the South Sandwich Islands",
585  "South Korea",
586  "South Sudan",
587  "Southern Ocean",
588  "Spain",
589  "Spratly Islands",
590  "Sri Lanka",
591  "Sudan",
592  "Suriname",
593  "Svalbard",
594  "Swaziland",
595  "Sweden",
596  "Switzerland",
597  "Syria",
598  "Taiwan",
599  "Tajikistan",
600  "Tanzania",
601  "Tasman Sea",
602  "Thailand",
603  "Togo",
604  "Tokelau",
605  "Tonga",
606  "Trinidad and Tobago",
607  "Tromelin Island",
608  "Tunisia",
609  "Turkey",
610  "Turkmenistan",
611  "Turks and Caicos Islands",
612  "Tuvalu",
613  "Uganda",
614  "Ukraine",
615  "United Arab Emirates",
616  "United Kingdom",
617  "Uruguay",
618  "USA",
619  "Uzbekistan",
620  "Vanuatu",
621  "Venezuela",
622  "Viet Nam",
623  "Virgin Islands",
624  "Wake Island",
625  "Wallis and Futuna",
626  "West Bank",
627  "Western Sahara",
628  "Yemen",
629  "Zambia",
630  "Zimbabwe"
631  };
632  return (i < set_valid_country_codes.size()) ? set_valid_country_codes[i] : kEmptyStr;
633 };
634 
635 
636 void FixCapitalizationInString (CSeq_entry_Handle seh, string& str, ECapChange capchange_opt)
637 {
638  if (NStr::IsBlank(str) || capchange_opt == eCapChange_none) {
639  return;
640  } else {
641  switch (capchange_opt) {
642  case eCapChange_tolower:
645  FixOrgNames(seh, str);
646  break;
647  case eCapChange_toupper:
650  FixOrgNames(seh, str);
651  break;
654  if ( isalpha(str[0]) ) {
655  str[0] = toupper(str[0]);
656  }
658  FixOrgNames(seh, str);
659  break;
661  if ( isalpha(str[0]) ) {
662  str[0] = toupper(str[0]);
663  }
664  break;
666  if ( isalpha(str[0]) ) {
667  str[0] = tolower(str[0]);
668  }
669  break;
672  {
674  vector<string> words;
675  NStr::Split(str, " \t\r\n", words);
676  for (vector<string>::iterator word = words.begin(); word != words.end(); ++word) {
677  if (!word->empty() && isalpha(word->at(0))) {
678  word->at(0) = toupper(word->at(0));
679  }
680  }
681  str = NStr::Join(words, " ");
682  if (capchange_opt == eCapChange_capword_afterspacepunc) {
683  bool found_punct = false;
684  for (SIZE_TYPE n = 0; n < str.size(); ++n) {
685  if (ispunct(str[n])) {
686  found_punct = true;
687  } else if (isalpha(str[n]) && found_punct) {
688  str[n] = toupper(str[n]);
689  found_punct = false;
690  }
691  }
692  }
694  FixOrgNames(seh, str);
695  }
696  break;
697  default:
698  break;
699  }
700  }
701 }
702 
703 void FixAbbreviationsInElement(string& result, bool fix_end_of_sentence)
704 {
705  for (int pat=0; set_abbreviation_list[pat].first[0]!='\0'; ++pat) {
706  CRegexpUtil replacer( result );
707  //int num_replacements =
708  replacer.Replace( set_abbreviation_list[pat].first, set_abbreviation_list[pat].second,
710  replacer.GetResult().swap( result );
711  }
712  if (fix_end_of_sentence)
713  {
714  for (int pat=0; set_abbreviation_list_end_of_sentence[pat].first[0]!='\0'; ++pat) {
715  CRegexpUtil replacer( result );
718  replacer.GetResult().swap( result );
719  }
720  }
721 }
722 
723 static bool s_ReplaceInPlaceWholeWordNoCase(string& str, const string& search, const string& replace)
724 {
725  bool modified = false;
726 
727  size_t pos = NStr::FindNoCase(str, search);
728  while (pos != string::npos) {
729  size_t right_end = pos + search.length();
730  if ((pos == 0 || !isalpha(str.c_str()[pos - 1]))
731  && (right_end == str.length() || !isalpha(str.c_str()[right_end]))) {
732  string this_replace = replace;
733  str = str.substr(0, pos) + this_replace + str.substr(right_end);
734  right_end = pos + this_replace.length();
735  modified = true;
736  }
737  pos = NStr::FindNoCase(str, search, right_end);
738  }
739 
740  return modified;
741 }
742 
744 {
745  vector<string> taxnames;
746  FindOrgNames(seh, taxnames);
747  for (vector<string>::const_iterator name = taxnames.begin(); name != taxnames.end(); ++name) {
748  bool modified = s_ReplaceInPlaceWholeWordNoCase(result, *name, *name);
749  if (!modified && (NStr::Find(*name, "]") != NPOS || NStr::Find(*name, "[") != NPOS)) {
750  string temp_taxname(*name);
751  NStr::ReplaceInPlace(temp_taxname, "]", "");
752  NStr::ReplaceInPlace(temp_taxname, "[", "");
753  modified = s_ReplaceInPlaceWholeWordNoCase(result, temp_taxname, temp_taxname);
754  }
755  }
756 }
757 
758 void FindOrgNames(CSeq_entry_Handle seh, vector<string>& taxnames)
759 {
760  if (!seh) return;
762  CBioseq_CI b_iter(seh, CSeq_inst::eMol_na);
763  for ( ; b_iter ; ++b_iter ) {
764  CSeqdesc_CI it (*b_iter, CSeqdesc::e_Source);
765  if (it && it->GetSource().IsSetTaxname()) {
766  auto& tax_name = it->GetSource().GetTaxname();
767  if (!NStr::IsBlank(tax_name)) {
768  names.insert(tax_name);
769  }
770  }
771  }
772  taxnames.assign(names.begin(), names.end());
773 }
774 
775 void RemoveFieldNameFromString( const string& field_name, string& str)
776 {
777  if (NStr::IsBlank(field_name) || NStr::IsBlank(str)) {
778  return;
779  }
780 
782  if (NStr::StartsWith(str, field_name, NStr::eNocase) && str.length() > field_name.length()
783  && str[field_name.length()] == ' ') {
784  NStr::ReplaceInPlace(str, field_name, kEmptyStr, 0, 1);
786  }
787 }
788 
790 {
791  NStr::ReplaceInPlace (state, " ", " ");
793  TCStringPairsMap::const_iterator found = k_state_abbrev.find(NStr::ToLower(state).c_str());
794  if (found != k_state_abbrev.end())
795  state = found->second;
796  else
798 }
799 
801 {
802  NStr::ReplaceInPlace(state, " ", " ");
804  auto found = Canada_map_state_to_abbrev.find(state);
805  if (found != Canada_map_state_to_abbrev.end())
806  state = found->second;
807  else
809 }
810 
812 {
813  bool modified = false;
814  if (sub.IsSetAuthors() && sub.GetAuthors().IsSetAffil() && sub.GetAuthors().GetAffil().IsStd()) {
815  modified |= FixUSAAbbreviationInAffil(sub.SetAuthors().SetAffil());
816  modified |= FixStateAbbreviationsInAffil(sub.SetAuthors().SetAffil());
817  }
818  return modified;
819 }
820 
822 {
823  if (affil.IsStd() && affil.GetStd().IsSetCountry()) {
824  CAffil::C_Std& std = affil.SetStd();
825  string country = std.GetCountry();
826  NStr::ReplaceInPlace(country, " ", " ");
828 
829  if (NStr::CompareNocase(country, "United States of America") == 0 ||
830  NStr::CompareNocase(country, "United States") == 0 ||
831  NStr::CompareNocase(country, "U.S.A.") == 0 ||
832  NStr::CompareNocase(country, "U S A") == 0 ||
833  NStr::CompareNocase(country, "US") == 0)
834  {
835  std.SetCountry("USA");
836  return true;
837  }
838  }
839  return false;
840 }
841 
843 {
844  if (affil.IsStd()) {
845  CAffil::C_Std& std = affil.SetStd();
846  if (std.IsSetCountry() && NStr::EqualCase(std.GetCountry(), "USA")) {
847  if (std.IsSetSub() && !NStr::IsBlank(std.GetSub())) {
848  string state = std.GetSub();
849  GetStateAbbreviation(state); // update the state abbreviation
850  if (!NStr::IsBlank(state) && !NStr::EqualCase(std.GetSub(), state)) {
851  std.SetSub(state);
852  return true;
853  }
854  }
855  }
856  }
857  return false;
858 }
859 
860 bool FixupMouseStrain(string& strain)
861 {
862  if (NStr::IsBlank(strain))
863  return false;
864 
866 
867  bool whole_word = true;
868  for (unsigned int i = 0; i < sizeof(mouse_strain_fixes)/sizeof(mouse_strain_fixes[0]); ++i) {
869  CRegexpUtil replacer(strain);
870  string pattern = whole_word ? ("\\b" + mouse_strain_fixes[i] + "\\b") : mouse_strain_fixes[i];
871  // whole-word and case insensitive search
872  if (replacer.Replace(pattern, mouse_strain_fixes[i], CRegexp::fCompile_ignore_case) > 0) {
873  replacer.GetResult().swap(strain);
874  return true;
875  }
876  }
877  return false;
878 }
879 
881 {
882  CRegexpUtil replacer( result );
883  replacer.Replace( "\\,(\\S)", ", $1", CRegexp::fCompile_default, CRegexp::fMatch_default, 0);
884  replacer.GetResult().swap( result );
885 }
886 
888 {
889  CRegexpUtil replacer( result );
890  replacer.Replace( "No\\.(\\w)", "No. $1", CRegexp::fCompile_ignore_case, CRegexp::fMatch_default, 0);
891  replacer.GetResult().swap( result );
892 }
893 
895 {
897  bool capitalize = true;
898  for (unsigned int i=0; i<result.size(); i++)
899  {
900  char &a = result.at(i);
901  if (isalpha(a))
902  {
903  if (capitalize)
904  a = toupper(a);
905  capitalize = false;
906  }
907  else if (a != '\'')
908  capitalize = true;
909  }
910 }
911 
913 {
914  for (int pat=0; set_short_words[pat].first[0]!='\0'; ++pat)
915  {
916  CRegexpUtil replacer( result );
918  replacer.GetResult().swap( result );
919  }
920  result.at(0) = toupper(result.at(0));
921 }
922 
923 
925 {
926  for (int pat=0; set_country_fixes[pat].first[0] != '\0'; ++pat)
927  {
928  CRegexpUtil replacer( result );
930  replacer.GetResult().swap( result );
931  }
932 }
933 
935 {
936  string result;
937  CRegexp pattern("\\'\\w");
938  size_t start = 0;
939  for (;;) {
940  pattern.GetMatch(input, start, 0, CRegexp::fMatch_default, true);
941  if (pattern.NumFound() > 0) {
942  const auto* rslt = pattern.GetResults(0);
943  if (rslt[0] != start)
944  result += input.substr(start,rslt[0]-start);
945  string tmp = input.substr(rslt[0], rslt[1] - rslt[0]);
947  start = rslt[1];
948  } else {
949  result += input.substr(start,input.length()-start);
950  break;
951  }
952  }
953  input = result;
954 }
955 
957 {
958  if (result.empty()) return;
959  for (int pat=0; set_AffiliationShortWordList[pat].first[0]!='\0'; ++pat)
960  {
961  CRegexpUtil replacer( result );
962  //int num_replacements =
965  replacer.GetResult().swap( result );
966  }
967  result.at(0) = toupper(result.at(0));
968  // fix d'
969  {
970  CRegexpUtil replacer( result );
971  //int num_replacements =
972  replacer.Replace( "\\bD\\'", "d'", CRegexp::fCompile_default, CRegexp::fMatch_default, 0);
973  replacer.GetResult().swap( result );
974 
975  string temp;
976  CRegexp pattern("\\bd\\'\\w");
977  size_t start = 0;
978  for (;;) {
979  pattern.GetMatch(result, start, 0, CRegexp::fMatch_default, true);
980  if (pattern.NumFound() > 0) {
981  const auto* rslt = pattern.GetResults(0);
982  if (rslt[0] != start)
983  temp += result.substr(start,rslt[0]-start);
984  string tmp = result.substr(rslt[0], rslt[1] - rslt[0]);
985  tmp = NStr::ToUpper(tmp);
986  tmp.at(0) = 'd';
987  temp += tmp;
988  start = rslt[1];
989  } else {
990  temp += result.substr(start,result.length()-start);
991  break;
992  }
993  }
994  result = temp;
995  }
996 }
997 
999 {
1000  for(int p = 0; set_ordinal_endings[p][0] != '\0'; ++p)
1001  {
1003  string temp;
1004  size_t start = 0;
1005  for (;;) {
1006  pattern.GetMatch(result, start, 0, CRegexp::fMatch_default, true);
1007  if (pattern.NumFound() > 0) {
1008  const auto* rslt = pattern.GetResults(0);
1009  if (rslt[0] != start)
1010  temp += result.substr(start,rslt[0]-start);
1011  string tmp = result.substr(rslt[0], rslt[1] - rslt[0]);
1012  tmp = NStr::ToLower(tmp);
1013  temp += tmp;
1014  start = rslt[1];
1015  } else {
1016  temp += result.substr(start,result.length()-start);
1017  break;
1018  }
1019  }
1020  result = temp;
1021  }
1022 }
1023 
1025 {
1026  if (result.empty()) return;
1027  for (int pat=0; set_KnownAbbreviationList[pat].first[0] != '\0' ; ++pat)
1028  {
1029  CRegexpUtil replacer( result );
1030  //int num_replacements =
1032  replacer.GetResult().swap( result );
1033  }
1034 }
1035 
1037 {
1038  CRegexpUtil replacer( result );
1039  //int num_replacements =
1040  replacer.Replace( "(\\d)s\\b", "$1S", CRegexp::fCompile_default, CRegexp::fMatch_default, 0);
1041  replacer.GetResult().swap( result );
1042 }
1043 
// Normalize the capitalization of `result` in place.
//
// The first character is upper-cased when `first_is_upper` is true and
// lower-cased otherwise. Every following character is lower-cased, with one
// exception: an 's' or 'S' that comes directly after a digit and is either the
// last character of the string or followed by whitespace is written as 'S'
// (for example "16s rrna" becomes "16S rrna").
//
// @param result          String to normalize; an empty string is left alone.
// @param first_is_upper  Desired case of the first character.
void ResetCapitalization(string& result, bool first_is_upper)
{
    if (result.empty()) return;

    // All <cctype> calls below go through unsigned char: passing a negative
    // plain char (non-ASCII byte) to them is undefined behavior.
    const unsigned char first = static_cast<unsigned char>(result[0]);
    result[0] = static_cast<char>(first_is_upper ? toupper(first) : tolower(first));

    bool was_digit = isdigit(static_cast<unsigned char>(result[0])) != 0;

    for (size_t i = 1; i < result.size(); ++i) {
        char& ch = result[i];
        const unsigned char uch = static_cast<unsigned char>(ch);

        // The 's' must end the word: it is the final character, or the next
        // character is whitespace. (Testing against size()-1 here would also
        // accept an 's' that is merely the second-to-last character.)
        const bool ends_word =
            (i + 1 == result.size()) || isspace(static_cast<unsigned char>(result[i + 1]));

        if (was_digit && (ch == 'S' || ch == 's') && ends_word) {
            ch = 'S';
            was_digit = false;
        } else if (isdigit(uch)) {
            was_digit = true;
        } else {
            was_digit = false;
            ch = static_cast<char>(tolower(uch));
        }
    }
}
1088 
1090 {
1091  for(unsigned int p = 0; !GetValidCountryCode(p).empty(); ++p)
1092  {
1093  string name = GetValidCountryCode(p);
1094  CRegexpUtil replacer( result );
1095  replacer.Replace( "\\b"+name+"\\b", name, CRegexp::fCompile_ignore_case, CRegexp::fMatch_default, 0);
1096  replacer.GetResult().swap( result );
1097  }
1098 }
1099 
1102 
1103 
1104 
#define static
bool FixupMouseStrain(string &strain)
This function does not check whether the taxname starts with "Mus musculus", it only corrects the mou...
void FindOrgNames(CSeq_entry_Handle seh, vector< string > &taxnames)
void FixShortWordsInElement(string &result)
void FixAbbreviationsInElement(string &result, bool fix_end_of_sentence)
void RemoveFieldNameFromString(const string &field_name, string &str)
bool FixStateAbbreviationsInAffil(CAffil &affil)
void InsertMissingSpacesAfterNo(string &result)
static const SStaticPair< const char *, const char * > set_country_fixes[]
void FixAffiliationShortWordsInElement(string &result)
void FixCountryCapitalization(string &result)
static const SStaticPair< const char *, const char * > set_AffiliationShortWordList[]
void GetStateAbbreviation(string &state)
void FindReplaceString_CountryFixes(string &result)
static const string mouse_strain_fixes[]
void CapitalizeAfterApostrophe(string &input)
bool FixUSAAbbreviationInAffil(CAffil &affil)
void FixKnownAbbreviationsInElement(string &result)
CStaticPairArrayMap< const char *, const char *, PCase_CStr > TCStringPairsMap
void InsertMissingSpacesAfterCommas(string &result)
static const SStaticPair< const char *, const char * > set_abbreviation_list_end_of_sentence[]
static const SStaticPair< const char *, const char * > set_KnownAbbreviationList[]
const string & GetValidCountryCode(unsigned int i)
void FixCapitalizationInString(CSeq_entry_Handle seh, string &str, ECapChange capchange_opt)
void CapitalizeSAfterNumber(string &result)
bool FixStateAbbreviationsInCitSub(CCit_sub &sub)
void FixCapitalizationInElement(string &result)
static const SStaticPair< const char *, const char * > set_abbreviation_list[]
static const SStaticPair< const char *, const char * > set_short_words[]
DEFINE_STATIC_ARRAY_MAP(TCStringPairsMap, k_state_abbrev, map_state_to_abbrev)
MAKE_CONST_MAP(Canada_map_state_to_abbrev, ct::tagStrNocase, ct::tagStrNocase, { { "alberta", "AB"}, { "british columbia", "BC"}, { "manitoba", "MB"}, { "new brunswick", "NB"}, { "newfoundland and labrador", "NL"}, { "northwest territories", "NT"}, { "nova scotia", "NS"}, { "nunavut", "NU"}, { "ontario", "ON"}, { "prince edward island", "PE"}, { "quebec", "QC"}, { "saskatchewan", "SK"}, { "yukon", "YT"} })
static const SStaticPair< const char *, const char * > map_state_to_abbrev[]
void GetCanadaStateAbbreviation(string &state)
void FixOrgNames(CSeq_entry_Handle seh, string &result)
void FixOrdinalNumbers(string &result)
static const char * set_ordinal_endings[]
void ResetCapitalization(string &result, bool first_is_upper)
static bool s_ReplaceInPlaceWholeWordNoCase(string &str, const string &search, const string &replace)
std representation
Definition: Affil_.hpp:91
@Affil.hpp User-defined methods of the data storage class.
Definition: Affil.hpp:56
const string & GetTaxname(void) const
Definition: BioSource.cpp:340
bool IsSetTaxname(void) const
Definition: BioSource.cpp:335
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
CRegexpUtil –.
Definition: regexp.hpp:312
CRegexp –.
Definition: regexp.hpp:70
CSeq_entry_Handle –.
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
class CStaticArrayMap<> is an array adaptor that provides an STLish interface to statically-defined a...
Definition: static_map.hpp:105
TBase::const_iterator const_iterator
Definition: static_map.hpp:109
@ eCapChange_firstlower_restnochange
first letter is lower case, the rest is not changed
@ eCapChange_capword_afterspacepunc
capitalize the first letter and letters after spaces or punctuation
@ eCapChange_none
no change
@ eCapChange_capword_afterspace
capitalize the first letter and letters after spaces
@ eCapChange_firstcap_restnochange
capitalize the first letter, the rest is not changed
@ eCapChange_firstcap_restlower
capitalize the first letter, the rest is lower case
@ eCapChange_tolower
change each letter to lower case
@ eCapChange_toupper
change each letter to upper case
Include a standard set of the NCBI C++ Toolkit most basic headers.
static const struct name_t names[]
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static const char * str(char *buf, int n)
Definition: stats.c:84
static char tmp[3200]
Definition: utf8.c:42
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
const TOffset * GetResults(size_t idx) const
Get location of pattern/subpattern for the last GetMatch().
Definition: regexp.hpp:569
CTempString GetMatch(CTempString str, size_t offset=0, size_t idx=0, TMatch flags=fMatch_default, bool noreturn=false)
Get matching pattern and subpatterns.
Definition: regexp.cpp:182
size_t Replace(CTempStringEx search, CTempString replace, CRegexp::TCompile compile_flags=CRegexp::fCompile_default, CRegexp::TMatch match_flags=CRegexp::fMatch_default, size_t max_replace=0)
Replace occurrences of a substring within a string by pattern.
Definition: regexp.cpp:289
int NumFound() const
Get number of patterns + subpatterns.
Definition: regexp.hpp:562
string GetResult(void)
Get result string.
Definition: regexp.hpp:582
@ fCompile_default
Definition: regexp.hpp:102
@ fCompile_ignore_case
Definition: regexp.hpp:103
@ fMatch_default
Definition: regexp.hpp:127
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
#define kEmptyStr
Definition: ncbistr.hpp:123
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2993
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
#define NPOS
Definition: ncbistr.hpp:133
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3201
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2891
static string Join(const TContainer &arr, const CTempString &delim)
Join strings using the specified delimiter.
Definition: ncbistr.hpp:2697
static bool EqualCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive equality of a substring with another string.
Definition: ncbistr.hpp:5325
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3405
static string & ToUpper(string &str)
Convert string to upper case – string& version.
Definition: ncbistr.cpp:424
static string & ToLower(string &str)
Convert string to lower case – string& version.
Definition: ncbistr.cpp:405
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
bool IsSetAffil(void) const
Author affiliation. Check if a value has been assigned to Affil data member.
Definition: Auth_list_.hpp:498
void SetCountry(const TCountry &value)
Assign a value to Country data member.
Definition: Affil_.hpp:897
void SetSub(const TSub &value)
Assign a value to Sub data member.
Definition: Affil_.hpp:850
const TAffil & GetAffil(void) const
Get the Affil member data.
Definition: Auth_list_.hpp:510
const TAuthors & GetAuthors(void) const
Get the Authors member data.
Definition: Cit_sub_.hpp:357
bool IsSetAuthors(void) const
Not necessarily authors of the paper. Check if a value has been assigned to Authors data member.
Definition: Cit_sub_.hpp:345
void SetAuthors(TAuthors &value)
Assign a value to Authors data member.
Definition: Cit_sub_.cpp:74
const TSub & GetSub(void) const
Get the Sub member data.
Definition: Affil_.hpp:841
const TCountry & GetCountry(void) const
Get the Country member data.
Definition: Affil_.hpp:888
const TStd & GetStd(void) const
Get the variant data.
Definition: Affil_.cpp:214
bool IsStd(void) const
Check if variant Std is selected.
Definition: Affil_.hpp:1207
bool IsSetCountry(void) const
Author Affiliation, Country. Check if a value has been assigned to Country data member.
Definition: Affil_.hpp:876
TStd & SetStd(void)
Select the variant.
Definition: Affil_.cpp:220
bool IsSetSub(void) const
Author Affiliation, County Sub. Check if a value has been assigned to Sub data member.
Definition: Affil_.hpp:829
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
static int input()
int i
yy_size_t n
std::integral_constant< ncbi::NStr::ECase, ncbi::NStr::eNocase > tagStrNocase
unsigned int a
Definition: ncbi_localip.c:102
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int isspace(Uchar c)
Definition: ncbictype.hpp:69
int tolower(Uchar c)
Definition: ncbictype.hpp:72
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
int toupper(Uchar c)
Definition: ncbictype.hpp:73
int ispunct(Uchar c)
Definition: ncbictype.hpp:68
Template structure SStaticPair is a simplified replacement of STL pair<>. Main reason of introducing this...
Definition: static_set.hpp:60
else result
Definition: token2.c:20
C++ wrappers for the Perl-compatible regular expression (PCRE) library.
#define const
Definition: zconf.h:232
Modified on Wed Apr 17 13:09:39 2024 by modify_doxy.py rev. 669887