47 {
"\\barabidopsis thaliana\\b",
"Arabidopsis thaliana"},
48 {
"\\badp\\b",
"ADP" },
49 {
"\\batp\\b",
"ATP" },
50 {
"\\bbac\\b",
"BAC" },
51 {
"\\bcaenorhabditis elegans\\b",
"Caenorhabditis elegans" },
52 {
"\\bcdna\\b",
"cDNA" },
53 {
"\\bcdnas\\b",
"cDNAs" },
54 {
"\\bcoa\\b",
"CoA" },
55 {
"\\bcoi\\b",
"COI" },
56 {
"\\bcoii\\b",
"COII" },
57 {
"\\bdanio rerio\\b",
"Danio rerio" },
58 {
"\\bdna\\b",
"DNA" },
59 {
"\\bdrosophila melanogaster\\b",
"Drosophila melanogaster" },
60 {
"\\bdsrna\\b",
"dsRNA" },
61 {
"\\bescherichia coli\\b",
"Escherichia coli" },
62 {
"\\bhiv\\b",
"HIV" },
63 {
"\\bhiv\\-1\\b",
"HIV-1" },
64 {
"\\bhiv\\-2\\b",
"HIV-2" },
65 {
"\\bhnrna\\b",
"hnRNA" },
66 {
"\\bhomo sapiens\\b",
"Homo sapiens" },
67 {
"\\bmhc\\b",
"MHC" },
68 {
"\\bmrna\\b",
"mRNA" },
69 {
"\\bmtdna\\b",
"mtDNA" },
70 {
"\\bmus musculus\\b",
"Mus musculus" },
71 {
"\\bnadh\\b",
"NADH" },
72 {
"\\bnov\\.\\b",
"nov." },
73 {
"\\bnov\\.\\.\\b",
"nov.." },
74 {
"\\bpcr\\b",
"PCR" },
75 {
"\\brattus norvegicus\\b",
"Rattus norvegicus" },
76 {
"\\brapd\\b",
"RAPD" },
77 {
"\\brdna\\b",
"rDNA" },
78 {
"\\brna\\b",
"RNA" },
79 {
"\\brrna\\b",
"rRNA" },
80 {
"\\brt\\-pcr\\b",
"RT-PCR" },
81 {
"\\bsaccharomyces cerevisiae\\b",
"Saccharomyces cerevisiae" },
82 {
"\\bscrna\\b",
"scRNA" },
83 {
"\\bsiv\\-1\\b",
"SIV-1" },
84 {
"\\bsnp\\b",
"SNP" },
85 {
"\\bsnps\\b",
"SNPs" },
86 {
"\\bsnrna\\b",
"snRNA" },
87 {
"\\bsp\\.\\b",
"sp." },
88 {
"\\bsp\\.\\.\\b",
"sp.." },
89 {
"\\bssp\\.\\b",
"ssp." },
90 {
"\\bssp\\.\\.\\b",
"ssp.." },
91 {
"\\bssrna\\b",
"ssRNA" },
92 {
"\\bsubsp\\.\\b",
"subsp." },
93 {
"\\bsubsp\\.\\.\\b",
"subsp.." },
94 {
"\\btrna\\b",
"tRNA" },
95 {
"\\bvar\\.\\b",
"var." },
96 {
"\\bvar\\.\\.\\b",
"var.." },
98 {
"\\busa\\b",
"USA" },
99 {
"\\bU\\.S\\.A\\.\\b",
"USA" },
100 {
"\\bU\\.S\\.A\\b",
"USA" },
101 {
"\\bUnited States of America\\b",
"USA" },
102 {
"\\b\\(hiv\\)\\b",
"(HIV)" },
103 {
"\\b\\(hiv1\\)\\b",
"(HIV1)" },
104 {
"\\b\\(hiv\\-1\\)\\b",
"(HIV-1)"},
111 {
"\\bsp\\.$",
"sp.." },
112 {
"\\bnov\\.$",
"nov.." },
113 {
"\\bssp\\.$",
"ssp.." },
114 {
"\\bvar\\.$",
"var.." },
115 {
"\\bsubsp\\.$",
"subsp.."},
132 {
"california",
"CA"},
137 {
"connecticut",
"CT"},
157 {
"louisiana",
"LA"},
161 {
"massachusetts",
"MA"},
165 {
"minnesota",
"MN"},
167 {
"mississippi",
"MS"},
178 {
"new hampshire",
"NH"},
179 {
"new jersey",
"NJ"},
180 {
"new mexico",
"NM"},
182 {
"north carolina",
"NC"},
183 {
"north dakota",
"ND"},
192 {
"pennsylvania",
"PA"},
193 {
"puerto rico",
"PR"},
194 {
"rhode island",
"RI"},
197 {
"south carolina",
"SC"},
198 {
"south dakota",
"SD"},
200 {
"tennessee",
"TN"},
208 {
"washington",
"WA"},
209 {
"west virginia",
"WV"},
212 {
"wisconsin",
"WI"},
244 {
"british columbia",
"BC"},
246 {
"new brunswick",
"NB"},
247 {
"newfoundland and labrador",
"NL"},
248 {
"northwest territories",
"NT"},
249 {
"nova scotia",
"NS"},
252 {
"prince edward island",
"PE"},
254 {
"saskatchewan",
"SK"},
262 {
"\\bAbout\\b",
"about" },
263 {
"\\bAnd\\b",
"and" },
265 {
"\\bBut\\b",
"but" },
267 {
"\\bFor\\b",
"for" },
273 {
"\\bThe\\b",
"the" },
275 {
"\\bWith\\b",
"with" },
283 {
"\\bchnia\\b",
"China" },
284 {
"\\bpr china\\b",
"P.R. China" },
285 {
"\\bprchina\\b",
"P.R. China" },
286 {
"\\bp\\.r\\.china\\b",
"P.R. China" },
287 {
"\\bp\\.r china\\b",
"P.R. China" },
288 {
"\\bp\\, r\\, china\\b",
"P.R. China" },
289 {
"\\brok\\b",
"ROK" },
290 {
"\\brsa\\b",
"RSA" },
291 {
"\\broc\\b",
"ROC" },
292 {
"\\buae\\b",
"UAE" },
293 {
"\\bK\\.S\\.A\\.\\b",
"K.S.A." },
294 {
"\\bk\\. s\\. a\\.\\b",
"K. S. A." },
295 {
"\\bksa\\b",
"KSA" },
302 {
"\\bAux\\b",
"aux" },
303 {
"\\bA La\\b",
"a la" },
304 {
"\\bDe La\\b",
"de la" },
306 {
"\\bDel\\b",
"del"},
307 {
"\\bDes\\b",
"des" },
312 {
"\\bLes\\b",
"les" },
313 {
"\\bRue\\b",
"rue" },
314 {
"\\bPo Box\\b",
"PO Box" },
315 {
"\\bPobox\\b",
"PO Box" },
316 {
"\\bP\\.O box\\b",
"P.O. Box" },
317 {
"\\bP\\.Obox\\b",
"P.O. Box" },
319 {
"\\bA\\&F\\b",
"A&F" },
334 {
"\\bpo box\\b",
"PO Box" },
335 {
"\\bPobox\\b",
"PO Box" },
336 {
"\\bP\\.O box\\b",
"P.O. Box" },
337 {
"\\bP\\.Obox\\b",
"P.O. Box" },
338 {
"\\bPO\\.Box\\b",
"P.O. Box" },
339 {
"\\bPO\\. Box\\b",
"P.O. Box" },
340 {
"\\bpr china\\b",
"P.R. China"},
341 {
"\\bprchina\\b",
"P.R. China" },
342 {
"\\bp\\.r\\.china\\b",
"P.R. China" },
343 {
"\\bp\\.r china\\b",
"P.R. China" },
344 {
"\\bp\\, r\\, china\\b",
"P.R. China" },
345 {
"\\bp\\,r\\, china\\b",
"P.R. China" },
346 {
"\\bp\\,r\\,china\\b",
"P.R. China" },
352 static vector<string> set_valid_country_codes
362 "Antigua and Barbuda",
367 "Ashmore and Cartier Islands",
387 "Bosnia and Herzegovina",
391 "British Virgin Islands",
401 "Central African Republic",
419 "Democratic Republic of the Congo",
423 "Dominican Republic",
433 "Falkland Islands (Islas Malvinas)",
440 "French Southern and Antarctic Lands",
460 "Heard Island and McDonald Islands",
482 "Juan de Nova Island",
485 "Kerguelen Archipelago",
540 "Northern Mariana Islands",
558 "Republic of the Congo",
565 "Saint Kitts and Nevis",
567 "Saint Pierre and Miquelon",
568 "Saint Vincent and the Grenadines",
571 "Sao Tome and Principe",
584 "South Georgia and the South Sandwich Islands",
606 "Trinidad and Tobago",
611 "Turks and Caicos Islands",
615 "United Arab Emirates",
632 return (
i < set_valid_country_codes.size()) ? set_valid_country_codes[
i] :
kEmptyStr;
641 switch (capchange_opt) {
674 vector<string> words;
676 for (vector<string>::iterator word = words.begin(); word != words.end(); ++word) {
677 if (!word->empty() &&
isalpha(word->at(0))) {
678 word->at(0) =
toupper(word->at(0));
683 bool found_punct =
false;
712 if (fix_end_of_sentence)
725 bool modified =
false;
728 while (pos != string::npos) {
729 size_t right_end = pos + search.length();
730 if ((pos == 0 || !
isalpha(
str.c_str()[pos - 1]))
731 && (right_end ==
str.length() || !
isalpha(
str.c_str()[right_end]))) {
732 string this_replace = replace;
733 str =
str.substr(0, pos) + this_replace +
str.substr(right_end);
734 right_end = pos + this_replace.length();
745 vector<string> taxnames;
747 for (vector<string>::const_iterator name = taxnames.begin(); name != taxnames.end(); ++name) {
750 string temp_taxname(*name);
763 for ( ; b_iter ; ++b_iter ) {
768 names.insert(tax_name);
783 &&
str[field_name.length()] ==
' ') {
794 if (found != k_state_abbrev.end())
795 state = found->second;
804 auto found = Canada_map_state_to_abbrev.find(
state);
805 if (found != Canada_map_state_to_abbrev.end())
806 state = found->second;
813 bool modified =
false;
867 bool whole_word =
true;
897 bool capitalize =
true;
898 for (
unsigned int i=0;
i<
result.size();
i++)
943 if (rslt[0] != start)
945 string tmp =
input.substr(rslt[0], rslt[1] - rslt[0]);
958 if (
result.empty())
return;
982 if (rslt[0] != start)
983 temp +=
result.substr(start,rslt[0]-start);
984 string tmp =
result.substr(rslt[0], rslt[1] - rslt[0]);
1009 if (rslt[0] != start)
1010 temp +=
result.substr(start,rslt[0]-start);
1011 string tmp =
result.substr(rslt[0], rslt[1] - rslt[0]);
1026 if (
result.empty())
return;
1047 if (
result.empty())
return;
1049 bool was_digit =
false;
bool FixupMouseStrain(string &strain)
This function does not check whether the taxname starts with "Mus musculus", it only corrects the mou...
void FindOrgNames(CSeq_entry_Handle seh, vector< string > &taxnames)
void FixShortWordsInElement(string &result)
void FixAbbreviationsInElement(string &result, bool fix_end_of_sentence)
void RemoveFieldNameFromString(const string &field_name, string &str)
bool FixStateAbbreviationsInAffil(CAffil &affil)
void InsertMissingSpacesAfterNo(string &result)
static const SStaticPair< const char *, const char * > set_country_fixes[]
void FixAffiliationShortWordsInElement(string &result)
void FixCountryCapitalization(string &result)
static const SStaticPair< const char *, const char * > set_AffiliationShortWordList[]
void GetStateAbbreviation(string &state)
void FindReplaceString_CountryFixes(string &result)
static const string mouse_strain_fixes[]
void CapitalizeAfterApostrophe(string &input)
bool FixUSAAbbreviationInAffil(CAffil &affil)
void FixKnownAbbreviationsInElement(string &result)
CStaticPairArrayMap< const char *, const char *, PCase_CStr > TCStringPairsMap
void InsertMissingSpacesAfterCommas(string &result)
static const SStaticPair< const char *, const char * > set_abbreviation_list_end_of_sentence[]
static const SStaticPair< const char *, const char * > set_KnownAbbreviationList[]
const string & GetValidCountryCode(unsigned int i)
void FixCapitalizationInString(CSeq_entry_Handle seh, string &str, ECapChange capchange_opt)
void CapitalizeSAfterNumber(string &result)
bool FixStateAbbreviationsInCitSub(CCit_sub &sub)
void FixCapitalizationInElement(string &result)
static const SStaticPair< const char *, const char * > set_abbreviation_list[]
static const SStaticPair< const char *, const char * > set_short_words[]
DEFINE_STATIC_ARRAY_MAP(TCStringPairsMap, k_state_abbrev, map_state_to_abbrev)
MAKE_CONST_MAP(Canada_map_state_to_abbrev, ct::tagStrNocase, ct::tagStrNocase, { { "alberta", "AB"}, { "british columbia", "BC"}, { "manitoba", "MB"}, { "new brunswick", "NB"}, { "newfoundland and labrador", "NL"}, { "northwest territories", "NT"}, { "nova scotia", "NS"}, { "nunavut", "NU"}, { "ontario", "ON"}, { "prince edward island", "PE"}, { "quebec", "QC"}, { "saskatchewan", "SK"}, { "yukon", "YT"} })
static const SStaticPair< const char *, const char * > map_state_to_abbrev[]
void GetCanadaStateAbbreviation(string &state)
void FixOrgNames(CSeq_entry_Handle seh, string &result)
void FixOrdinalNumbers(string &result)
static const char * set_ordinal_endings[]
void ResetCapitalization(string &result, bool first_is_upper)
static bool s_ReplaceInPlaceWholeWordNoCase(string &str, const string &search, const string &replace)
@Affil.hpp User-defined methods of the data storage class.
const string & GetTaxname(void) const
bool IsSetTaxname(void) const
class CStaticArrayMap<> is an array adaptor that provides an STLish interface to statically-defined a...
TBase::const_iterator const_iterator
@ eCapChange_firstlower_restnochange
first letter is lower case, the rest is not changed
@ eCapChange_capword_afterspacepunc
capitalize the first letter and letters after spaces or punctuation
@ eCapChange_capword_afterspace
capitalize the first letter and letters after spaces
@ eCapChange_firstcap_restnochange
capitalize the first letter, the rest is not changed
@ eCapChange_firstcap_restlower
capitalize the first letter, the rest is lower case
@ eCapChange_tolower
change each letter to lower case
@ eCapChange_toupper
change each letter to upper case
Include a standard set of the NCBI C++ Toolkit most basic headers.
static const struct name_t names[]
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
static const char * str(char *buf, int n)
int32_t Int4
4-byte (32-bit) signed integer
const TOffset * GetResults(size_t idx) const
Get location of pattern/subpattern for the last GetMatch().
CTempString GetMatch(CTempString str, size_t offset=0, size_t idx=0, TMatch flags=fMatch_default, bool noreturn=false)
Get matching pattern and subpatterns.
size_t Replace(CTempStringEx search, CTempString replace, CRegexp::TCompile compile_flags=CRegexp::fCompile_default, CRegexp::TMatch match_flags=CRegexp::fMatch_default, size_t max_replace=0)
Replace occurrences of a substring within a string by pattern.
int NumFound() const
Get number of patterns + subpatterns.
string GetResult(void)
Get result string.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
NCBI_NS_STD::string::size_type SIZE_TYPE
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
static string Join(const TContainer &arr, const CTempString &delim)
Join strings using the specified delimiter.
static bool EqualCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive equality of a substring with another string.
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
static string & ToUpper(string &str)
Convert string to upper case – string& version.
static string & ToLower(string &str)
Convert string to lower case – string& version.
@ eNocase
Case insensitive compare.
bool IsSetAffil(void) const
author affiliation Check if a value has been assigned to Affil data member.
void SetCountry(const TCountry &value)
Assign a value to Country data member.
void SetSub(const TSub &value)
Assign a value to Sub data member.
const TAffil & GetAffil(void) const
Get the Affil member data.
const TAuthors & GetAuthors(void) const
Get the Authors member data.
bool IsSetAuthors(void) const
not necessarily authors of the paper Check if a value has been assigned to Authors data member.
void SetAuthors(TAuthors &value)
Assign a value to Authors data member.
const TSub & GetSub(void) const
Get the Sub member data.
const TCountry & GetCountry(void) const
Get the Country member data.
const TStd & GetStd(void) const
Get the variant data.
bool IsStd(void) const
Check if variant Std is selected.
bool IsSetCountry(void) const
Author Affiliation, Country Check if a value has been assigned to Country data member.
TStd & SetStd(void)
Select the variant.
bool IsSetSub(void) const
Author Affiliation, County Sub Check if a value has been assigned to Sub data member.
const TSource & GetSource(void) const
Get the variant data.
@ e_Source
source of materials, includes Org-ref
@ eMol_na
just a nucleic acid
std::integral_constant< ncbi::NStr::ECase, ncbi::NStr::eNocase > tagStrNocase
Template structure SStaticPair is a simplified replacement of STL pair<>. Main reason of introducing this...
C++ wrappers for the Perl-compatible regular expression (PCRE) library.