67 replace(name.begin(), name.end(),
'_',
'-');
68 replace(name.begin(), name.end(),
' ',
'-');
75 if (name ==
"host" || name ==
"specific-host") {
77 }
else if (name ==
"sub-strain") {
91 replace(name.begin(), name.end(),
'_',
'-');
92 replace(name.begin(), name.end(),
' ',
'-');
95 name ==
"orgmod-note" ||
96 name ==
"note-orgmod") {
99 if (name ==
"host" || name ==
"sub-strain") {
155 default:
return true;
198 if (pos == string::npos) {
202 inst =
str.substr(0, pos);
203 id =
str.substr(pos + 1);
205 if (pos != string::npos) {
206 coll =
id.substr(0, pos);
207 id =
id.substr(pos + 1);
228 #include "institution_codes.inc"
236 vector<string> tokens;
238 if (tokens.size() < 3) {
245 string& vouch_types = tokens[1];
246 for (
size_t i = 0;
i < vouch_types.size();
i++) {
247 switch (vouch_types[
i]) {
268 vector<string> synonyms;
286 CTime builtin_timestamp
287 (
static_cast<time_t
>(kInstitutionCollectionCodeList_Timestamp));
292 }
NCBI_CATCH(
"s_InitializeInstitutionCollectionCodeMaps")
296 if (getenv(
"NCBI_DEBUG")) {
297 LOG_POST(
"Falling back on built-in data for institution code list.");
299 size_t num_codes =
sizeof (kInstitutionCollectionCodeList) /
sizeof (
char *);
300 for (
size_t i = 0;
i < num_codes;
i++) {
301 const char *p = kInstitutionCollectionCodeList[
i];
305 if (getenv(
"NCBI_DEBUG")) {
306 LOG_POST(
"Reading from " +
file +
" for instition code list.");
310 }
while ( !
lr->AtEOF() );
318 bool& is_miscapitalized,
string& correct_cap,
bool& needs_country,
bool& erroneous_country)
321 if (it != code_map.
end()) {
324 is_miscapitalized =
true;
326 correct_cap = it->first;
330 if (pos == string::npos) {
331 string check = inst_coll +
"<";
332 it = code_map.
begin();
333 while (it != code_map.
end()) {
335 needs_country =
true;
337 is_miscapitalized =
true;
339 correct_cap = it->first.substr(0, inst_coll.length());
345 string inst_sub = inst_coll.substr(0, pos);
346 it = code_map.
find(inst_sub);
347 if (it != code_map.
end()) {
348 erroneous_country =
true;
353 return code_map.
end();
357 bool COrgMod::IsInstitutionCodeValid(
const string& inst_coll,
string &voucher_type,
bool& is_miscapitalized,
string& correct_cap,
bool& needs_country,
bool& erroneous_country)
359 is_miscapitalized =
false;
360 needs_country =
false;
361 erroneous_country =
false;
370 bool syn_is_miscapitalized =
false;
371 string syn_correct_cap =
"";
372 bool syn_needs_country =
false;
373 bool syn_erroneous_country =
false;
376 syn_needs_country, syn_erroneous_country);
380 is_miscapitalized = syn_is_miscapitalized;
381 correct_cap = syn_correct_cap;
382 needs_country = syn_needs_country;
383 erroneous_country = syn_erroneous_country;
384 voucher_type = is->second;
388 }
else if (erroneous_country) {
390 bool syn_is_miscapitalized =
false;
391 string syn_correct_cap =
"";
392 bool syn_needs_country =
false;
393 bool syn_erroneous_country =
false;
396 syn_needs_country, syn_erroneous_country);
400 is_miscapitalized = syn_is_miscapitalized;
401 correct_cap = syn_correct_cap;
402 needs_country = syn_needs_country;
403 erroneous_country = syn_erroneous_country;
404 voucher_type = is->second;
409 voucher_type = ic->second;
416 voucher_type = it->second;
427 if (
NStr::Find(culture_collection,
":") == string::npos) {
428 return "Culture_collection should be structured, but is not";
438 if (
NStr::Find(specimen_voucher,
":") == string::npos) {
449 if (
NStr::Find(biomaterial,
":") == string::npos) {
458 const string kMissingId =
"Voucher is missing specific identifier";
481 inst_coll = inst_code;
483 inst_coll = inst_code +
":" + coll_code;
488 bool is_miscapitalized;
490 bool erroneous_country;
494 return "Institution code " + inst_coll +
" needs to be qualified with a <COUNTRY> designation";
495 }
else if (erroneous_country) {
496 return "Institution code " + inst_coll +
" should not be qualified with a <COUNTRY> designation";
497 }
else if (is_miscapitalized) {
498 return "Institution code " + inst_coll +
" exists, but correct capitalization is " + correct_cap;
502 return "Institution code " + inst_coll +
" should be bio_material";
504 return "Institution code " + inst_coll +
" should be culture_collection";
506 return "Institution code " + inst_coll +
" should be specimen_voucher";
513 return "Personal collection does not have name of collector";
517 return "Institution code " + inst_coll +
" is not in list";
518 }
else if (
IsInstitutionCodeValid(inst_code, voucher_type, is_miscapitalized, correct_cap, needs_country, erroneous_country)) {
520 return "Institution code in " + inst_coll +
" needs to be qualified with a <COUNTRY> designation";
521 }
else if (erroneous_country) {
522 return "Institution code " + inst_code +
" should not be qualified with a <COUNTRY> designation";
523 }
else if (is_miscapitalized) {
524 return "Institution code " + inst_code +
" exists, but correct capitalization is " + correct_cap;
528 return "DNA should be bio_material";
531 return "Institution code " + inst_code +
" exists, but collection "
532 + inst_coll +
" is not in list";
535 return "Institution code " + inst_coll +
" is not in list";
549 rval = inst +
":" + id;
551 rval = inst +
":" + coll +
":" + id;
571 string::iterator sit =
val.begin();
580 string inst_code =
val.substr(0,
len);
581 string remainder =
val.substr(
len);
588 sit = remainder.begin();
589 while (sit != remainder.end()) {
598 if (it != code_map.
end()) {
599 val = inst_code +
":" + remainder;
635 if (colon_pos != 0 && colon_pos != string::npos) {
639 if (pos == string::npos) {
642 string inst =
val.substr(pos + 1,
val.length() - pos - 2);
643 bool miscap =
false, needs_country =
false, wrong_country =
false;
646 string v_type = voucher_type;
648 if (colon_pos == 0) {
649 val = inst +
val.substr(0, pos);
651 val = inst +
":" +
val.substr(0, pos);
683 string new_inst_code = inst_code;
685 if (
NStr::Find(it->second, v_type) != string::npos) {
688 new_inst_code = it->first;
693 && inst_code.c_str()[it->first.length()] ==
'<') {
739 ITERATE(vector<string>, it, vouchers) {
740 string inst1, coll1, id1;
745 vector<string>::const_iterator it_next = it;
746 for (++it_next; it_next != vouchers.end(); ++it_next) {
747 string inst2, coll2, id2;
752 return NStr::EqualNocase(coll1, coll2) && !
NStr::IsBlank(coll1) ?
"Multiple vouchers with same institution:collection" :
"Multiple vouchers with same institution";
761 return (
str.find_first_not_of(
"0123456789") ==
NPOS);
770 string tmp = strain.substr(prefix.length());
777 strain = prefix +
" " +
tmp;
787 string new_val = strain;
802 string new_val = strain;
803 vector<string> words;
839 for (
size_t i = 0;
i <
max;
i++) {
971 for (
size_t i = 0;
i <
max;
i++) {
985 {
"none",
"missing" },
986 {
"NA",
"not available" },
987 {
"N/A",
"not available" },
988 {
"n/a",
"not available" },
989 {
"free-living",
"natural / free-living" },
990 {
"natural",
"natural / free-living" },
991 {
"not available",
"not available" },
992 {
"not collected",
"not collected" },
993 {
"not applicable",
"not applicable" },
994 {
"NR",
"not applicable" },
995 {
"not known",
"unknown" },
996 {
"other",
"missing" },
997 {
"misc",
"missing" },
998 {
"not determined",
"unknown" },
999 {
"unknown",
"unknown" },
1000 {
"not available: to be reported later",
"not available" },
1001 {
"obscured",
"obscured" },
1002 {
"human",
"Homo sapiens" },
1003 {
"homo sapiens",
"Homo sapiens" }
1015 fix = possible_fix->second;
1024 string new_val =
value;
1096 string s1 = strain1;
1097 string s2 = strain2;
1107 bool any_change =
false;
1166 "reference material",
1172 "culture from reference material",
1173 "culture from type material",
1175 "culture from hapantotype",
1241 string fromEnv =
env.Get(
"NCBI_VALIDATE_FOR_MULTIPLE_ISOLATES");
1243 if (fromEnv ==
"true") {
1245 }
else if (fromEnv ==
"false") {
1250 string fromConfig = reg.
GetString(
"OrgMod",
"ValidateForMultipleIsolates",
"off");
1252 if (fromConfig ==
"1" || fromConfig ==
"on" || fromConfig ==
"true" || fromConfig ==
"yes") {
static COrgMod::TInstitutionCodeMap s_CultureCollectionInstitutionCodeMap
bool FindInstCodeAndSpecID(COrgMod::TInstitutionCodeMap &code_map, string &val)
static const size_t sNumUnexpectedViralOrgModQualifiers
static COrgMod::TInstitutionCodeMap s_CompleteInstitutionFullNameMap
void s_HarmonizeString(string &s)
const char * sm_KnownHostWords[]
const char * sm_BadStrainValues[]
static const string sValidTypeMaterialPrefixes[]
static const int sNumValidTypeMaterialPrefixes
const string kMissingInst
static const COrgMod::TSubtype sUnexpectedViralOrgModQualifiers[]
static const string sValidCultureTypeMaterialPrefixes[]
static constexpr auto s_hostFixupMap
static bool s_InstitutionCollectionCodeMapInitialized
static COrgMod::TInstitutionCodeMap s_BiomaterialInstitutionCodeMap
bool s_FixStrainForPrefix(const string &prefix, string &strain)
static COrgMod::TInstitutionCodeMap s_SpecimenVoucherInstitutionCodeMap
bool s_IsAllDigits(string str)
static COrgMod::TInstitutionCodeMap s_InstitutionCodeTypeMap
DEFINE_STATIC_FAST_MUTEX(s_InstitutionCollectionCodeMutex)
static COrgMod::TInstitutionCodeMap s_CompleteInstitutionCodeMap
string s_FixOneStrain(const string &strain)
static COrgMod::TInstitutionCodeMap s_InstitutionCodeSynonymsMap
static const int sNumValidCultureTypeMaterialPrefixes
static void s_InitializeInstitutionCollectionCodeMaps(void)
static bool s_init_LookForMultipleIsolates(void)
static void s_ProcessInstitutionCollectionCodeLine(const CTempString &line)
static CNcbiApplication * Instance(void)
Singleton method.
static bool FixStructuredVoucher(string &val, const string &voucher_type)
static bool NCBI_ValidateForMultipleIsolates(void)
static bool FuzzyStrainMatch(const string &strain1, const string &strain2)
static string IsCultureCollectionValid(const string &culture_collection)
static bool IsValidSubtypeName(const string &str, EVocabulary vocabulary=eVocabulary_raw)
static const string & GetInstitutionShortName(const string &full_name)
static bool IsINSDCValidTypeMaterial(const string &type_material)
static bool IsInstitutionCodeValid(const string &inst_coll, string &voucher_type, bool &is_miscapitalized, string &correct_cap, bool &needs_country, bool &erroneous_country)
static bool AddStructureToVoucher(string &val, const string &voucher_type)
static bool IsStrainValid(const string &strain)
static bool IsMultipleValuesAllowed(TSubtype)
static bool IsDiscouraged(const TSubtype stype, bool indexer=false)
static const string & GetInstitutionFullName(const string &short_name)
static string IsStructuredVoucherValid(const string &val, const string &voucher_type)
static string FixHost(const string &value)
bool RemoveAbbreviation()
static bool IsValidTypeMaterial(const string &type_material)
static string IsBiomaterialValid(const string &biomaterial)
static string GetSubtypeName(TSubtype stype, EVocabulary vocabulary=eVocabulary_raw)
static string CheckMultipleVouchers(const vector< string > &)
static string FixHostCapitalization(const string &value)
static bool HoldsInstitutionCode(const TSubtype stype)
This indicates if the given Org-mod subtype is supposed to hold an institution code (Example: "ATCC:2...
static TSubtype GetSubtypeValue(const string &str, EVocabulary vocabulary=eVocabulary_raw)
bool IsUnexpectedViralOrgModQualifier() const
static bool RescueInstFromParentheses(string &val, const string &voucher_type)
static TInstitutionCodeMap::iterator FindInstitutionCode(const string &inst_coll, TInstitutionCodeMap &code_map, bool &is_miscapitalized, string &correct_cap, bool &needs_country, bool &erroneous_country)
static string MakeStructuredVoucher(const string &inst, const string &coll, const string &id)
static string IsSpecimenVoucherValid(const string &specimen_voucher)
static string FixStrain(const string &strain)
static bool ParseStructuredVoucher(const string &str, string &inst, string &coll, string &id)
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
static constexpr auto construct(typename _Enabled::type const (&init)[N])
container_type::const_iterator const_iterator
container_type::iterator iterator
const_iterator begin() const
const_iterator end() const
const_iterator find(const key_type &key) const
static const char * str(char *buf, int n)
Utility macros and typedefs for exploring NCBI objects from general.asn.
const CNcbiEnvironment & GetEnvironment(void) const
Get the application's cached environment.
const CNcbiRegistry & GetConfig(void) const
Get the application's cached configuration parameters (read-only).
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
#define NCBI_CATCH(message)
Catch CExceptions as well. This macro is deprecated - use *_X or *_XX variant instead of it.
#define ENUM_METHOD_NAME(EnumName)
static CRef< ILineReader > New(const string &filename)
Return a new ILineReader object corresponding to the given filename, taking "-" (but not "....
virtual string GetString(const string &section, const string &name, const string &default_value, TFlags flags=0) const
Get the parameter string value.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string (in-place)
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
static string Join(const TContainer &arr, const CTempString &delim)
Join strings using the specified delimiter.
static bool EqualCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive equality of a substring with another string.
static string & Replace(const string &src, const string &search, const string &replace, string &dst, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string.
static string & ToLower(string &str)
Convert string to lower case – string& version.
@ eReverseSearch
Search in a backward direction.
@ eNocase
Case insensitive compare.
@ eCase
Case sensitive compare.
TSubname & SetSubname(void)
Assign a value to Subname data member.
TSubtype GetSubtype(void) const
Get the Subtype member data.
bool IsSetSubtype(void) const
Check if a value has been assigned to Subtype data member.
const TSubname & GetSubname(void) const
Get the Subname member data.
bool IsSetSubname(void) const
Check if a value has been assigned to Subname data member.
@ eSubtype_gb_acronym
used by taxonomy database
@ eSubtype_gb_synonym
used by taxonomy database
@ eSubtype_other
ASN5: old-name (254) will be added to next spec.
@ eSubtype_dosage
chromosome dosage of hybrid
@ eSubtype_nat_host
natural host of this specimen
@ eSubtype_metagenome_source
@ eSubtype_specimen_voucher
@ eSubtype_nomenclature
code of nomenclature in subname (B,P,V,Z or combination)
@ eSubtype_gb_anamorph
used by taxonomy database
@ eSubtype_culture_collection
@ eSubtype_forma_specialis
Lightweight interface for getting lines of data with minimal memory copying.
const GenericPointer< typename T::ValueType > T2 value
#define FOR_EACH_STRING_IN_VECTOR(Itr, Var)
FOR_EACH_STRING_IN_VECTOR EDIT_EACH_STRING_IN_VECTOR.
static SLJIT_INLINE sljit_ins lr(sljit_gpr dst, sljit_gpr src)
string g_FindDataFile(const CTempString &name, CDirEntry::EType type=CDirEntry::eFile)
Look for an NCBI application data file or directory of the given name and type; in general,...
bool g_IsDataFileOld(const CTempString &path, const CTempString &id_line)
Check whether the given file (a full path, as returned by g_FindDataFile) is older than a built-in ve...