47 {
"\\barabidopsis thaliana\\b",
"Arabidopsis thaliana"},
48 {
"\\badp\\b",
"ADP" },
49 {
"\\batp\\b",
"ATP" },
50 {
"\\bbac\\b",
"BAC" },
51 {
"\\bcaenorhabditis elegans\\b",
"Caenorhabditis elegans" },
52 {
"\\bcdna\\b",
"cDNA" },
53 {
"\\bcdnas\\b",
"cDNAs" },
54 {
"\\bcoa\\b",
"CoA" },
55 {
"\\bcoi\\b",
"COI" },
56 {
"\\bcoii\\b",
"COII" },
57 {
"\\bdanio rerio\\b",
"Danio rerio" },
58 {
"\\bdna\\b",
"DNA" },
59 {
"\\bdrosophila melanogaster\\b",
"Drosophila melanogaster" },
60 {
"\\bdsrna\\b",
"dsRNA" },
61 {
"\\bescherichia coli\\b",
"Escherichia coli" },
62 {
"\\bhiv\\b",
"HIV" },
63 {
"\\bhiv\\-1\\b",
"HIV-1" },
64 {
"\\bhiv\\-2\\b",
"HIV-2" },
65 {
"\\bhnrna\\b",
"hnRNA" },
66 {
"\\bhomo sapiens\\b",
"Homo sapiens" },
67 {
"\\bmhc\\b",
"MHC" },
68 {
"\\bmrna\\b",
"mRNA" },
69 {
"\\bmtdna\\b",
"mtDNA" },
70 {
"\\bmus musculus\\b",
"Mus musculus" },
71 {
"\\bnadh\\b",
"NADH" },
72 {
"\\bnov\\.\\b",
"nov." },
73 {
"\\bnov\\.\\.\\b",
"nov.." },
74 {
"\\bpcr\\b",
"PCR" },
75 {
"\\brattus norvegicus\\b",
"Rattus norvegicus" },
76 {
"\\brapd\\b",
"RAPD" },
77 {
"\\brdna\\b",
"rDNA" },
78 {
"\\brna\\b",
"RNA" },
79 {
"\\brrna\\b",
"rRNA" },
80 {
"\\brt\\-pcr\\b",
"RT-PCR" },
81 {
"\\bsaccharomyces cerevisiae\\b",
"Saccharomyces cerevisiae" },
82 {
"\\bscrna\\b",
"scRNA" },
83 {
"\\bsiv\\-1\\b",
"SIV-1" },
84 {
"\\bsnp\\b",
"SNP" },
85 {
"\\bsnps\\b",
"SNPs" },
86 {
"\\bsnrna\\b",
"snRNA" },
87 {
"\\bsp\\.\\b",
"sp." },
88 {
"\\bsp\\.\\.\\b",
"sp.." },
89 {
"\\bssp\\.\\b",
"ssp." },
90 {
"\\bssp\\.\\.\\b",
"ssp.." },
91 {
"\\bssrna\\b",
"ssRNA" },
92 {
"\\bsubsp\\.\\b",
"subsp." },
93 {
"\\bsubsp\\.\\.\\b",
"subsp.." },
94 {
"\\btrna\\b",
"tRNA" },
95 {
"\\bvar\\.\\b",
"var." },
96 {
"\\bvar\\.\\.\\b",
"var.." },
98 {
"\\busa\\b",
"USA" },
99 {
"\\bU\\.S\\.A\\.\\b",
"USA" },
100 {
"\\bU\\.S\\.A\\b",
"USA" },
101 {
"\\bUnited States of America\\b",
"USA" },
102 {
"\\b\\(hiv\\)\\b",
"(HIV)" },
103 {
"\\b\\(hiv1\\)\\b",
"(HIV1)" },
104 {
"\\b\\(hiv\\-1\\)\\b",
"(HIV-1)"},
111 {
"\\bsp\\.$",
"sp.." },
112 {
"\\bnov\\.$",
"nov.." },
113 {
"\\bssp\\.$",
"ssp.." },
114 {
"\\bvar\\.$",
"var.." },
115 {
"\\bsubsp\\.$",
"subsp.."},
132 {
"california",
"CA"},
137 {
"connecticut",
"CT"},
157 {
"louisiana",
"LA"},
161 {
"massachusetts",
"MA"},
165 {
"minnesota",
"MN"},
167 {
"mississippi",
"MS"},
178 {
"new hampshire",
"NH"},
179 {
"new jersey",
"NJ"},
180 {
"new mexico",
"NM"},
182 {
"north carolina",
"NC"},
183 {
"north dakota",
"ND"},
192 {
"pennsylvania",
"PA"},
193 {
"puerto rico",
"PR"},
194 {
"rhode island",
"RI"},
197 {
"south carolina",
"SC"},
198 {
"south dakota",
"SD"},
200 {
"tennessee",
"TN"},
208 {
"washington",
"WA"},
209 {
"west virginia",
"WV"},
212 {
"wisconsin",
"WI"},
244 {
"\\bAbout\\b",
"about" },
245 {
"\\bAnd\\b",
"and" },
247 {
"\\bBut\\b",
"but" },
249 {
"\\bFor\\b",
"for" },
255 {
"\\bThe\\b",
"the" },
257 {
"\\bWith\\b",
"with" },
265 {
"\\bchnia\\b",
"China" },
266 {
"\\bpr china\\b",
"P.R. China" },
267 {
"\\bprchina\\b",
"P.R. China" },
268 {
"\\bp\\.r\\.china\\b",
"P.R. China" },
269 {
"\\bp\\.r china\\b",
"P.R. China" },
270 {
"\\bp\\, r\\, china\\b",
"P.R. China" },
271 {
"\\brok\\b",
"ROK" },
272 {
"\\brsa\\b",
"RSA" },
273 {
"\\broc\\b",
"ROC" },
274 {
"\\buae\\b",
"UAE" },
275 {
"\\bK\\.S\\.A\\.\\b",
"K.S.A." },
276 {
"\\bk\\. s\\. a\\.\\b",
"K. S. A." },
277 {
"\\bksa\\b",
"KSA" },
284 {
"\\bAux\\b",
"aux" },
285 {
"\\bA La\\b",
"a la" },
286 {
"\\bDe La\\b",
"de la" },
288 {
"\\bDel\\b",
"del"},
289 {
"\\bDes\\b",
"des" },
294 {
"\\bLes\\b",
"les" },
295 {
"\\bRue\\b",
"rue" },
296 {
"\\bPo Box\\b",
"PO Box" },
297 {
"\\bPobox\\b",
"PO Box" },
298 {
"\\bP\\.O box\\b",
"P.O. Box" },
299 {
"\\bP\\.Obox\\b",
"P.O. Box" },
301 {
"\\bA\\&F\\b",
"A&F" },
316 {
"\\bpo box\\b",
"PO Box" },
317 {
"\\bPobox\\b",
"PO Box" },
318 {
"\\bP\\.O box\\b",
"P.O. Box" },
319 {
"\\bP\\.Obox\\b",
"P.O. Box" },
320 {
"\\bPO\\.Box\\b",
"P.O. Box" },
321 {
"\\bPO\\. Box\\b",
"P.O. Box" },
322 {
"\\bpr china\\b",
"P.R. China"},
323 {
"\\bprchina\\b",
"P.R. China" },
324 {
"\\bp\\.r\\.china\\b",
"P.R. China" },
325 {
"\\bp\\.r china\\b",
"P.R. China" },
326 {
"\\bp\\, r\\, china\\b",
"P.R. China" },
327 {
"\\bp\\,r\\, china\\b",
"P.R. China" },
328 {
"\\bp\\,r\\,china\\b",
"P.R. China" },
334 static vector<string> set_valid_country_codes
344 "Antigua and Barbuda",
349 "Ashmore and Cartier Islands",
369 "Bosnia and Herzegovina",
373 "British Virgin Islands",
383 "Central African Republic",
401 "Democratic Republic of the Congo",
405 "Dominican Republic",
415 "Falkland Islands (Islas Malvinas)",
422 "French Southern and Antarctic Lands",
442 "Heard Island and McDonald Islands",
464 "Juan de Nova Island",
467 "Kerguelen Archipelago",
522 "Northern Mariana Islands",
540 "Republic of the Congo",
547 "Saint Kitts and Nevis",
549 "Saint Pierre and Miquelon",
550 "Saint Vincent and the Grenadines",
553 "Sao Tome and Principe",
566 "South Georgia and the South Sandwich Islands",
588 "Trinidad and Tobago",
593 "Turks and Caicos Islands",
597 "United Arab Emirates",
614 return (
i < set_valid_country_codes.size()) ? set_valid_country_codes[
i] :
kEmptyStr;
623 switch (capchange_opt) {
656 vector<string> words;
658 for (vector<string>::iterator word = words.begin(); word != words.end(); ++word) {
659 if (!word->empty() &&
isalpha(word->at(0))) {
660 word->at(0) =
toupper(word->at(0));
665 bool found_punct =
false;
694 if (fix_end_of_sentence)
707 bool modified =
false;
710 while (pos != string::npos) {
711 size_t right_end = pos + search.length();
712 if ((pos == 0 || !
isalpha(
str.c_str()[pos - 1]))
713 && (right_end ==
str.length() || !
isalpha(
str.c_str()[right_end]))) {
714 string this_replace = replace;
715 str =
str.substr(0, pos) + this_replace +
str.substr(right_end);
716 right_end = pos + this_replace.length();
727 vector<string> taxnames;
729 for (vector<string>::const_iterator name = taxnames.begin(); name != taxnames.end(); ++name) {
732 string temp_taxname(*name);
745 for ( ; b_iter ; ++b_iter ) {
750 names.insert(tax_name);
765 &&
str[field_name.length()] ==
' ') {
776 if (found != k_state_abbrev.end())
777 state = found->second;
784 bool modified =
false;
838 bool whole_word =
true;
868 bool capitalize =
true;
869 for (
unsigned int i=0;
i<
result.size();
i++)
914 if (rslt[0] != start)
916 string tmp =
input.substr(rslt[0], rslt[1] - rslt[0]);
929 if (
result.empty())
return;
953 if (rslt[0] != start)
954 temp +=
result.substr(start,rslt[0]-start);
955 string tmp =
result.substr(rslt[0], rslt[1] - rslt[0]);
980 if (rslt[0] != start)
981 temp +=
result.substr(start,rslt[0]-start);
982 string tmp =
result.substr(rslt[0], rslt[1] - rslt[0]);
997 if (
result.empty())
return;
1018 if (
result.empty())
return;
1020 bool was_digit =
false;
bool FixupMouseStrain(string &strain)
This function does not check whether the taxname starts with "Mus musculus", it only corrects the mou...
void FindOrgNames(CSeq_entry_Handle seh, vector< string > &taxnames)
void FixShortWordsInElement(string &result)
void FixAbbreviationsInElement(string &result, bool fix_end_of_sentence)
void RemoveFieldNameFromString(const string &field_name, string &str)
bool FixStateAbbreviationsInAffil(CAffil &affil)
void InsertMissingSpacesAfterNo(string &result)
static const SStaticPair< const char *, const char * > set_country_fixes[]
void FixAffiliationShortWordsInElement(string &result)
void FixCountryCapitalization(string &result)
static const SStaticPair< const char *, const char * > set_AffiliationShortWordList[]
void GetStateAbbreviation(string &state)
void FindReplaceString_CountryFixes(string &result)
static const string mouse_strain_fixes[]
void CapitalizeAfterApostrophe(string &input)
bool FixUSAAbbreviationInAffil(CAffil &affil)
void FixKnownAbbreviationsInElement(string &result)
CStaticPairArrayMap< const char *, const char *, PCase_CStr > TCStringPairsMap
void InsertMissingSpacesAfterCommas(string &result)
static const SStaticPair< const char *, const char * > set_abbreviation_list_end_of_sentence[]
static const SStaticPair< const char *, const char * > set_KnownAbbreviationList[]
const string & GetValidCountryCode(unsigned int i)
void FixCapitalizationInString(CSeq_entry_Handle seh, string &str, ECapChange capchange_opt)
void CapitalizeSAfterNumber(string &result)
bool FixStateAbbreviationsInCitSub(CCit_sub &sub)
void FixCapitalizationInElement(string &result)
static const SStaticPair< const char *, const char * > set_abbreviation_list[]
static const SStaticPair< const char *, const char * > set_short_words[]
DEFINE_STATIC_ARRAY_MAP(TCStringPairsMap, k_state_abbrev, map_state_to_abbrev)
static const SStaticPair< const char *, const char * > map_state_to_abbrev[]
void FixOrgNames(CSeq_entry_Handle seh, string &result)
void FixOrdinalNumbers(string &result)
static const char * set_ordinal_endings[]
void ResetCapitalization(string &result, bool first_is_upper)
static bool s_ReplaceInPlaceWholeWordNoCase(string &str, const string &search, const string &replace)
@Affil.hpp User-defined methods of the data storage class.
const string & GetTaxname(void) const
bool IsSetTaxname(void) const
class CStaticArrayMap<> is an array adaptor that provides an STLish interface to statically-defined a...
TBase::const_iterator const_iterator
@ eCapChange_firstlower_restnochange
first letter is lower case, the rest is not changed
@ eCapChange_capword_afterspacepunc
capitalize the first letter and letters after spaces or punctuation
@ eCapChange_capword_afterspace
capitalize the first letter and letters after spaces
@ eCapChange_firstcap_restnochange
capitalize the first letter, the rest is not changed
@ eCapChange_firstcap_restlower
capitalize the first letter, the rest is lower case
@ eCapChange_tolower
change each letter to lower case
@ eCapChange_toupper
change each letter to upper case
Include a standard set of the NCBI C++ Toolkit most basic headers.
static const struct name_t names[]
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
int32_t Int4
4-byte (32-bit) signed integer
const TOffset * GetResults(size_t idx) const
Get location of pattern/subpattern for the last GetMatch().
CTempString GetMatch(CTempString str, size_t offset=0, size_t idx=0, TMatch flags=fMatch_default, bool noreturn=false)
Get matching pattern and subpatterns.
size_t Replace(CTempStringEx search, CTempString replace, CRegexp::TCompile compile_flags=CRegexp::fCompile_default, CRegexp::TMatch match_flags=CRegexp::fMatch_default, size_t max_replace=0)
Replace occurrences of a substring within a string by pattern.
int NumFound() const
Get number of patterns + subpatterns.
string GetResult(void)
Get result string.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
NCBI_NS_STD::string::size_type SIZE_TYPE
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
static string Join(const TContainer &arr, const CTempString &delim)
Join strings using the specified delimiter.
static bool EqualCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive equality of a substring with another string.
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
static string & ToUpper(string &str)
Convert string to upper case – string& version.
static string & ToLower(string &str)
Convert string to lower case – string& version.
@ eNocase
Case insensitive compare.
bool IsSetAffil(void) const
Author affiliation. Check if a value has been assigned to Affil data member.
void SetCountry(const TCountry &value)
Assign a value to Country data member.
void SetSub(const TSub &value)
Assign a value to Sub data member.
const TAffil & GetAffil(void) const
Get the Affil member data.
const TAuthors & GetAuthors(void) const
Get the Authors member data.
bool IsSetAuthors(void) const
Not necessarily authors of the paper. Check if a value has been assigned to Authors data member.
void SetAuthors(TAuthors &value)
Assign a value to Authors data member.
const TSub & GetSub(void) const
Get the Sub member data.
const TCountry & GetCountry(void) const
Get the Country member data.
const TStd & GetStd(void) const
Get the variant data.
bool IsStd(void) const
Check if variant Std is selected.
bool IsSetCountry(void) const
Author Affiliation, Country. Check if a value has been assigned to Country data member.
TStd & SetStd(void)
Select the variant.
bool IsSetSub(void) const
Author Affiliation, County Sub. Check if a value has been assigned to Sub data member.
const TSource & GetSource(void) const
Get the variant data.
@ e_Source
source of materials, includes Org-ref
@ eMol_na
just a nucleic acid
static const char * str(char *buf, int n)
Template structure SStaticPair is a simplified replacement of STL pair<>. Main reason of introducing this...
C++ wrappers for the Perl-compatible regular expression (PCRE) library.