67 replace(name.begin(), name.end(),
'_',
'-');
68 replace(name.begin(), name.end(),
' ',
'-');
75 if (name ==
"host" || name ==
"specific-host") {
77 }
else if (name ==
"sub-strain") {
91 replace(name.begin(), name.end(),
'_',
'-');
92 replace(name.begin(), name.end(),
' ',
'-');
95 name ==
"orgmod-note" ||
96 name ==
"note-orgmod") {
99 if (name ==
"host" || name ==
"sub-strain") {
154 default:
return true;
197 if (pos == string::npos) {
201 inst =
str.substr(0, pos);
202 id =
str.substr(pos + 1);
204 if (pos != string::npos) {
205 coll =
id.substr(0, pos);
206 id =
id.substr(pos + 1);
227 #include "institution_codes.inc"
235 vector<string> tokens;
237 if (tokens.size() < 3) {
244 string& vouch_types = tokens[1];
245 for (
size_t i = 0;
i < vouch_types.size();
i++) {
246 switch (vouch_types[
i]) {
267 vector<string> synonyms;
285 CTime builtin_timestamp
286 (
static_cast<time_t
>(kInstitutionCollectionCodeList_Timestamp));
291 }
NCBI_CATCH(
"s_InitializeInstitutionCollectionCodeMaps")
295 if (getenv(
"NCBI_DEBUG")) {
296 LOG_POST(
"Falling back on built-in data for institution code list.");
298 size_t num_codes =
sizeof (kInstitutionCollectionCodeList) /
sizeof (
char *);
299 for (
size_t i = 0;
i < num_codes;
i++) {
300 const char *p = kInstitutionCollectionCodeList[
i];
304 if (getenv(
"NCBI_DEBUG")) {
305 LOG_POST(
"Reading from " +
file +
" for instition code list.");
309 }
while ( !lr->
AtEOF() );
317 bool& is_miscapitalized,
string& correct_cap,
bool& needs_country,
bool& erroneous_country)
320 if (it != code_map.
end()) {
323 is_miscapitalized =
true;
325 correct_cap = it->first;
329 if (pos == string::npos) {
330 string check = inst_coll +
"<";
331 it = code_map.
begin();
332 while (it != code_map.
end()) {
334 needs_country =
true;
336 is_miscapitalized =
true;
338 correct_cap = it->first.substr(0, inst_coll.length());
344 string inst_sub = inst_coll.substr(0, pos);
345 it = code_map.
find(inst_sub);
346 if (it != code_map.
end()) {
347 erroneous_country =
true;
352 return code_map.
end();
356 bool COrgMod::IsInstitutionCodeValid(
const string& inst_coll,
string &voucher_type,
bool& is_miscapitalized,
string& correct_cap,
bool& needs_country,
bool& erroneous_country)
358 is_miscapitalized =
false;
359 needs_country =
false;
360 erroneous_country =
false;
369 bool syn_is_miscapitalized =
false;
370 string syn_correct_cap =
"";
371 bool syn_needs_country =
false;
372 bool syn_erroneous_country =
false;
375 syn_needs_country, syn_erroneous_country);
379 is_miscapitalized = syn_is_miscapitalized;
380 correct_cap = syn_correct_cap;
381 needs_country = syn_needs_country;
382 erroneous_country = syn_erroneous_country;
383 voucher_type = is->second;
387 }
else if (erroneous_country) {
389 bool syn_is_miscapitalized =
false;
390 string syn_correct_cap =
"";
391 bool syn_needs_country =
false;
392 bool syn_erroneous_country =
false;
395 syn_needs_country, syn_erroneous_country);
399 is_miscapitalized = syn_is_miscapitalized;
400 correct_cap = syn_correct_cap;
401 needs_country = syn_needs_country;
402 erroneous_country = syn_erroneous_country;
403 voucher_type = is->second;
408 voucher_type = ic->second;
415 voucher_type = it->second;
426 if (
NStr::Find(culture_collection,
":") == string::npos) {
427 return "Culture_collection should be structured, but is not";
437 if (
NStr::Find(specimen_voucher,
":") == string::npos) {
448 if (
NStr::Find(biomaterial,
":") == string::npos) {
457 const string kMissingId =
"Voucher is missing specific identifier";
480 inst_coll = inst_code;
482 inst_coll = inst_code +
":" + coll_code;
487 bool is_miscapitalized;
489 bool erroneous_country;
493 return "Institution code " + inst_coll +
" needs to be qualified with a <COUNTRY> designation";
494 }
else if (erroneous_country) {
495 return "Institution code " + inst_coll +
" should not be qualified with a <COUNTRY> designation";
496 }
else if (is_miscapitalized) {
497 return "Institution code " + inst_coll +
" exists, but correct capitalization is " + correct_cap;
501 return "Institution code " + inst_coll +
" should be bio_material";
503 return "Institution code " + inst_coll +
" should be culture_collection";
505 return "Institution code " + inst_coll +
" should be specimen_voucher";
512 return "Personal collection does not have name of collector";
516 return "Institution code " + inst_coll +
" is not in list";
517 }
else if (
IsInstitutionCodeValid(inst_code, voucher_type, is_miscapitalized, correct_cap, needs_country, erroneous_country)) {
519 return "Institution code in " + inst_coll +
" needs to be qualified with a <COUNTRY> designation";
520 }
else if (erroneous_country) {
521 return "Institution code " + inst_code +
" should not be qualified with a <COUNTRY> designation";
522 }
else if (is_miscapitalized) {
523 return "Institution code " + inst_code +
" exists, but correct capitalization is " + correct_cap;
527 return "DNA should be bio_material";
530 return "Institution code " + inst_code +
" exists, but collection "
531 + inst_coll +
" is not in list";
534 return "Institution code " + inst_coll +
" is not in list";
548 rval = inst +
":" + id;
550 rval = inst +
":" + coll +
":" + id;
570 string::iterator sit =
val.begin();
579 string inst_code =
val.substr(0,
len);
580 string remainder =
val.substr(
len);
587 sit = remainder.begin();
588 while (sit != remainder.end()) {
597 if (it != code_map.
end()) {
598 val = inst_code +
":" + remainder;
634 if (colon_pos != 0 && colon_pos != string::npos) {
638 if (pos == string::npos) {
641 string inst =
val.substr(pos + 1,
val.length() - pos - 2);
642 bool miscap =
false, needs_country =
false, wrong_country =
false;
645 string v_type = voucher_type;
647 if (colon_pos == 0) {
648 val = inst +
val.substr(0, pos);
650 val = inst +
":" +
val.substr(0, pos);
682 string new_inst_code = inst_code;
684 if (
NStr::Find(it->second, v_type) != string::npos) {
687 new_inst_code = it->first;
692 && inst_code.c_str()[it->first.length()] ==
'<') {
738 ITERATE(vector<string>, it, vouchers) {
739 string inst1, coll1, id1;
744 vector<string>::const_iterator it_next = it;
745 for (++it_next; it_next != vouchers.end(); ++it_next) {
746 string inst2, coll2, id2;
751 return NStr::EqualNocase(coll1, coll2) && !
NStr::IsBlank(coll1) ?
"Multiple vouchers with same institution:collection" :
"Multiple vouchers with same institution";
760 return (
str.find_first_not_of(
"0123456789") ==
NPOS);
769 string tmp = strain.substr(
prefix.length());
786 string new_val = strain;
801 string new_val = strain;
802 vector<string> words;
803 vector<string> results;
810 results.push_back (
str);
812 results.push_back (fixed);
829 for (
size_t i = 0;
i <
max;
i++) {
961 for (
size_t i = 0;
i <
max;
i++) {
975 {
"none",
"missing" },
976 {
"NA",
"not available" },
977 {
"N/A",
"not available" },
978 {
"n/a",
"not available" },
979 {
"free-living",
"natural / free-living" },
980 {
"natural",
"natural / free-living" },
981 {
"not available",
"not available" },
982 {
"not collected",
"not collected" },
983 {
"not applicable",
"not applicable" },
984 {
"NR",
"not applicable" },
985 {
"not known",
"unknown" },
986 {
"other",
"missing" },
987 {
"misc",
"missing" },
988 {
"not determined",
"unknown" },
989 {
"unknown",
"unknown" },
990 {
"not available: to be reported later",
"not available" },
991 {
"obscured",
"obscured" },
992 {
"human",
"Homo sapiens" },
993 {
"homo sapiens",
"Homo sapiens" }
1005 fix = possible_fix->second;
1014 string new_val =
value;
1086 string s1 = strain1;
1087 string s2 = strain2;
1097 bool any_change =
false;
1156 "reference material",
1162 "culture from reference material",
1163 "culture from type material",
1165 "culture from hapantotype",
static COrgMod::TInstitutionCodeMap s_CultureCollectionInstitutionCodeMap
bool FindInstCodeAndSpecID(COrgMod::TInstitutionCodeMap &code_map, string &val)
static const size_t sNumUnexpectedViralOrgModQualifiers
static COrgMod::TInstitutionCodeMap s_CompleteInstitutionFullNameMap
void s_HarmonizeString(string &s)
const char * sm_KnownHostWords[]
const char * sm_BadStrainValues[]
static const string sValidTypeMaterialPrefixes[]
static const int sNumValidTypeMaterialPrefixes
const string kMissingInst
static const COrgMod::TSubtype sUnexpectedViralOrgModQualifiers[]
static const string sValidCultureTypeMaterialPrefixes[]
static constexpr auto s_hostFixupMap
static bool s_InstitutionCollectionCodeMapInitialized
static COrgMod::TInstitutionCodeMap s_BiomaterialInstitutionCodeMap
bool s_FixStrainForPrefix(const string &prefix, string &strain)
static COrgMod::TInstitutionCodeMap s_SpecimenVoucherInstitutionCodeMap
bool s_IsAllDigits(string str)
static COrgMod::TInstitutionCodeMap s_InstitutionCodeTypeMap
DEFINE_STATIC_FAST_MUTEX(s_InstitutionCollectionCodeMutex)
static COrgMod::TInstitutionCodeMap s_CompleteInstitutionCodeMap
string s_FixOneStrain(const string &strain)
static COrgMod::TInstitutionCodeMap s_InstitutionCodeSynonymsMap
static const int sNumValidCultureTypeMaterialPrefixes
static void s_InitializeInstitutionCollectionCodeMaps(void)
static void s_ProcessInstitutionCollectionCodeLine(const CTempString &line)
static bool FixStructuredVoucher(string &val, const string &voucher_type)
static bool FuzzyStrainMatch(const string &strain1, const string &strain2)
static string IsCultureCollectionValid(const string &culture_collection)
static bool IsValidSubtypeName(const string &str, EVocabulary vocabulary=eVocabulary_raw)
static const string & GetInstitutionShortName(const string &full_name)
static bool IsINSDCValidTypeMaterial(const string &type_material)
static bool IsInstitutionCodeValid(const string &inst_coll, string &voucher_type, bool &is_miscapitalized, string &correct_cap, bool &needs_country, bool &erroneous_country)
static bool AddStructureToVoucher(string &val, const string &voucher_type)
static bool IsStrainValid(const string &strain)
static bool IsMultipleValuesAllowed(TSubtype)
static bool IsDiscouraged(const TSubtype stype, bool indexer=false)
static const string & GetInstitutionFullName(const string &short_name)
static string IsStructuredVoucherValid(const string &val, const string &voucher_type)
static string FixHost(const string &value)
bool RemoveAbbreviation()
static bool IsValidTypeMaterial(const string &type_material)
static string IsBiomaterialValid(const string &biomaterial)
static string GetSubtypeName(TSubtype stype, EVocabulary vocabulary=eVocabulary_raw)
static string CheckMultipleVouchers(const vector< string > &)
static string FixHostCapitalization(const string &value)
static bool HoldsInstitutionCode(const TSubtype stype)
This indicates if the given Org-mod subtype is supposed to hold an institution code (Example: "ATCC:2...
static TSubtype GetSubtypeValue(const string &str, EVocabulary vocabulary=eVocabulary_raw)
bool IsUnexpectedViralOrgModQualifier() const
static bool RescueInstFromParentheses(string &val, const string &voucher_type)
static TInstitutionCodeMap::iterator FindInstitutionCode(const string &inst_coll, TInstitutionCodeMap &code_map, bool &is_miscapitalized, string &correct_cap, bool &needs_country, bool &erroneous_country)
static string MakeStructuredVoucher(const string &inst, const string &coll, const string &id)
static string IsSpecimenVoucherValid(const string &specimen_voucher)
static string FixStrain(const string &strain)
static bool ParseStructuredVoucher(const string &str, string &inst, string &coll, string &id)
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
static constexpr auto construct(typename _Enabled::type const (&init)[N])
container_type::const_iterator const_iterator
container_type::iterator iterator
const_iterator begin() const
const_iterator end() const
const_iterator find(const key_type &key) const
Utility macros and typedefs for exploring NCBI objects from general.asn.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
#define NCBI_CATCH(message)
Catch CExceptions as well. This macro is deprecated - use *_X or *_XX variant instead of it.
#define ENUM_METHOD_NAME(EnumName)
static CRef< ILineReader > New(const string &filename)
Return a new ILineReader object corresponding to the given filename, taking "-" (but not "....
virtual bool AtEOF(void) const =0
Indicates (negatively) whether there is any more input.
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
static string Join(const TContainer &arr, const CTempString &delim)
Join strings using the specified delimiter.
static bool EqualCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive equality of a substring with another string.
static string & Replace(const string &src, const string &search, const string &replace, string &dst, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
static string & ToLower(string &str)
Convert string to lower case – string& version.
@ eReverseSearch
Search in a backward direction.
@ eNocase
Case insensitive compare.
@ eCase
Case sensitive compare.
TSubname & SetSubname(void)
Assign a value to Subname data member.
TSubtype GetSubtype(void) const
Get the Subtype member data.
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
const TSubname & GetSubname(void) const
Get the Subname member data.
bool IsSetSubname(void) const
Check if a value has been assigned to Subname data member.
@ eSubtype_gb_acronym
used by taxonomy database
@ eSubtype_gb_synonym
used by taxonomy database
@ eSubtype_other
ASN5: old-name (254) will be added to next spec.
@ eSubtype_dosage
chromosome dosage of hybrid
@ eSubtype_nat_host
natural host of this specimen
@ eSubtype_metagenome_source
@ eSubtype_specimen_voucher
@ eSubtype_nomenclature
code of nomenclature in subname (B,P,V,Z or combination)
@ eSubtype_gb_anamorph
used by taxonomy database
@ eSubtype_culture_collection
@ eSubtype_forma_specialis
Lightweight interface for getting lines of data with minimal memory copying.
static const char * prefix[]
#define FOR_EACH_STRING_IN_VECTOR(Itr, Var)
FOR_EACH_STRING_IN_VECTOR EDIT_EACH_STRING_IN_VECTOR.
static const char * str(char *buf, int n)
string g_FindDataFile(const CTempString &name, CDirEntry::EType type=CDirEntry::eFile)
Look for an NCBI application data file or directory of the given name and type; in general,...
bool g_IsDataFileOld(const CTempString &path, const CTempString &id_line)
Check whether the given file (a full path, as returned by g_FindDataFile) is older than a built-in ve...