NCBI C++ ToolKit
mod_to_enum.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: mod_to_enum.cpp 101909 2024-03-01 12:11:21Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Justin Foley
27  *
28  */
29 
30 #include <ncbi_pch.hpp>
31 #include "mod_to_enum.hpp"
32 #include <serial/enumvalues.hpp>
33 #include <functional>
34 #include <unordered_set>
35 
38 
39 using TModNameSet = unordered_set<string>;
40 
41 
42 string g_GetNormalizedModVal(const string& unnormalized)
43 {
44  string normalized = unnormalized;
45  static const string irrelevant_prefix_suffix_chars = " \t\"\'-_";
46 
47  size_t pos = normalized.find_first_not_of(irrelevant_prefix_suffix_chars);
48  if (pos != NPOS) {
49  normalized.erase(0,pos);
50  }
51 
52  pos = normalized.find_last_not_of(irrelevant_prefix_suffix_chars);
53  if (pos != NPOS) {
54  normalized.erase(pos+1);
55  }
56 
57  normalized.erase(remove_if(normalized.begin(),
58  normalized.end(),
59  [](char c) {
60  return (c=='-' ||
61  c=='_' ||
62  c==':' ||
63  isspace(c)); }),
64  normalized.end());
65 
66  NStr::ToLower(normalized);
67  return normalized;
68 }
69 
70 
/// Identity normalizer: hands the modifier string back unchanged.
///
/// Serves as the default value of the fNormalizeString parameter of
/// s_InitModStringToEnumMap, for maps whose keys need no normalization.
///
/// @param mod_string
///   The modifier string.
/// @return
///   A copy of mod_string.
static string s_NoNormalization(const string& mod_string)
{
    return mod_string;
}
73 
74 
75 template<typename TEnum>
77  const CEnumeratedTypeValues& etv,
78  const TModNameSet& skip_mod_strings,
79  const TStringToEnumMap<TEnum>& extra_mod_strings_to_enums,
80  function<string(const string&)> fNormalizeString = s_NoNormalization
81  )
82 
83 {
84  TModNameSet normalized_skip_set;
85  transform(skip_mod_strings.begin(),
86  skip_mod_strings.end(),
87  inserter(normalized_skip_set, normalized_skip_set.end()),
88  fNormalizeString);
89 
90  TStringToEnumMap<TEnum> smod_enum_map;
91  for (const auto& name_val : etv.GetValues()) {
92  const auto& mod_string = fNormalizeString(name_val.first);
93  if (normalized_skip_set.find(mod_string) == normalized_skip_set.end())
94  {
95  const TEnum& enum_val = static_cast<TEnum>(name_val.second);
96  smod_enum_map.emplace(mod_string, enum_val);
97  }
98  }
99 
100  for (auto extra_smod_to_enum : extra_mod_strings_to_enums) {
101  smod_enum_map.emplace(
102  fNormalizeString(extra_smod_to_enum.first),
103  extra_smod_to_enum.second);
104  }
105  return smod_enum_map;
106 }
107 
108 
// Function body that builds a name-to-enum map for COrgMod subtypes.
// NOTE(review): presumably this is g_InitModNameOrgSubtypeMap(void), which the
// symbol index of this listing declares as returning
// TStringToEnumMap<COrgMod::ESubtype> -- confirm against the real file.
// The listing omits source lines 109-110, 114, 117 and 119, so the signature,
// the declaration in front of extra_smod_to_enum_names, one line inside its
// initializer and the start of the final call are not visible here.
111 {
// Names handed to the final call as its second argument.
112  static const TModNameSet kDeprecatedOrgSubtypes{
113  "dosage", "old-lineage", "old-name"};
// Extra spellings paired with existing COrgMod subtype values.
115  extra_smod_to_enum_names
116  {{ "subspecies", COrgMod::eSubtype_sub_species},
118  {"specific-host", COrgMod::eSubtype_nat_host}};
// Arguments of a call whose callee is on the omitted line 119; presumably
// s_InitModStringToEnumMap, whose first three parameters they match.
120  *COrgMod::ENUM_METHOD_NAME(ESubtype)(),
121  kDeprecatedOrgSubtypes,
122  extra_smod_to_enum_names
123  );
124 }
125 
126 
// Function body that builds a name-to-enum map for CSubSource subtypes.
// NOTE(review): presumably this is g_InitModNameSubSrcSubtypeMap(void), which
// the symbol index of this listing declares as returning
// TStringToEnumMap<CSubSource::ESubtype> -- confirm against the real file.
// The listing omits source lines 127-128, 141 and 148, so the signature, the
// declaration in front of extra_smod_to_enum_names and the start of the final
// call are not visible here.
129 {
130  // some are skipped because they're handled specially and some are
131  // skipped because they're deprecated
132  static const TModNameSet skip_enum_names {
133  // skip because handled specially elsewhere
134  "fwd-primer-seq", "rev-primer-seq",
135  "fwd-primer-name", "rev-primer-name",
136  // skip because deprecated
137  "transposon-name",
138  "plastid-name",
139  "insertion-seq-name",
140  };
// Extra spellings paired with existing CSubSource subtype values; note that
// two spellings map to eSubtype_lat_lon and two to eSubtype_other.
142  extra_smod_to_enum_names
143  {{ "sub-clone", CSubSource::eSubtype_subclone },
144  { "lat-long", CSubSource::eSubtype_lat_lon },
145  { "latitude-longitude", CSubSource::eSubtype_lat_lon },
146  { "note", CSubSource::eSubtype_other },
147  { "notes", CSubSource::eSubtype_other }};
// Arguments of a call whose callee is on the omitted line 148; presumably
// s_InitModStringToEnumMap, whose first three parameters they match.
149  *CSubSource::ENUM_METHOD_NAME(ESubtype)(),
150  skip_enum_names,
151  extra_smod_to_enum_names);
152 }
153 
154 
// Function body that builds a name-to-enum map for CBioSource genome values.
// NOTE(review): presumably this is g_InitModNameGenomeMap(void), which the
// symbol index of this listing declares as returning
// TStringToEnumMap<CBioSource::EGenome> -- confirm against the real file.
// The listing omits source lines 155-156, 159, 166 and 170, so the signature,
// the declaration in front of extra_smod_to_enum_names, the start of the final
// call and its last argument are not visible here.
157 {
// Default-constructed, i.e. an empty set of names.
158  static const TModNameSet skip_enum_names;
// Extra spellings paired with existing CBioSource genome values.
160  extra_smod_to_enum_names
161  {{ "mitochondrial", CBioSource::eGenome_mitochondrion },
162  { "provirus", CBioSource::eGenome_proviral},
163  { "extrachromosomal", CBioSource::eGenome_extrachrom},
164  { "insertion sequence", CBioSource::eGenome_insertion_seq}};
165 
// Arguments of a call whose callee is on the omitted line 166. The trailing
// comma after extra_smod_to_enum_names shows that a further argument follows
// on the omitted line 170 -- presumably a key normalizer; TODO confirm.
167  *CBioSource::ENUM_METHOD_NAME(EGenome)(),
168  skip_enum_names,
169  extra_smod_to_enum_names,
171 }
172 
173 
// Function body that builds a name-to-enum map for CBioSource origin values.
// NOTE(review): presumably this is g_InitModNameOriginMap(void), which the
// symbol index of this listing declares as returning
// TStringToEnumMap<CBioSource::EOrigin> -- confirm against the real file.
// The listing omits source lines 174-175, 178, 183 and 187, so the signature,
// the declaration in front of extra_smod_to_enum_names, the start of the final
// call and its last argument are not visible here.
176 {
// Default-constructed, i.e. an empty set of names.
177  static const TModNameSet skip_enum_names;
// Extra spellings paired with existing CBioSource origin values.
179  extra_smod_to_enum_names
180  {{ "natural mutant", CBioSource::eOrigin_natmut},
181  { "mutant", CBioSource::eOrigin_mut}};
182 
// Arguments of a call whose callee is on the omitted line 183. The trailing
// comma after extra_smod_to_enum_names shows that a further argument follows
// on the omitted line 187 -- presumably a key normalizer; TODO confirm.
184  *CBioSource::ENUM_METHOD_NAME(EOrigin)(),
185  skip_enum_names,
186  extra_smod_to_enum_names,
188 }
189 
190 
// Constant table pairing biomol spellings with CMolInfo biomol values.
// NOTE(review): presumably this is g_BiomolStringToEnum, which the symbol
// index of this listing declares as const TStringToEnumMap<CMolInfo::TBiomol>;
// the listing omits source lines 192-193 that would carry the type and name.
// Every key is lower-case and free of spaces, hyphens and underscores, which
// is the form g_GetNormalizedModVal produces -- presumably lookups normalize
// the value first; verify at the call sites.
// Several spellings share one value (e.g. "rrna" and "ribosomalrna").
191 const
194 = { {"crna", CMolInfo::eBiomol_cRNA },
195  {"dna", CMolInfo::eBiomol_genomic},
196  {"genomic", CMolInfo::eBiomol_genomic},
197  {"genomicdna", CMolInfo::eBiomol_genomic},
198  {"genomicrna", CMolInfo::eBiomol_genomic},
199  {"mrna", CMolInfo::eBiomol_mRNA},
200  {"ncrna", CMolInfo::eBiomol_ncRNA},
201  {"noncodingrna", CMolInfo::eBiomol_ncRNA},
202  {"othergenetic", CMolInfo::eBiomol_other_genetic},
203  {"precursorrna", CMolInfo::eBiomol_pre_RNA},
204  {"ribosomalrna", CMolInfo::eBiomol_rRNA},
205  {"rrna", CMolInfo::eBiomol_rRNA},
206  {"transcribedrna", CMolInfo::eBiomol_transcribed_RNA},
207  {"transfermessengerrna", CMolInfo::eBiomol_tmRNA},
208  {"tmrna", CMolInfo::eBiomol_tmRNA},
209  {"transferrna", CMolInfo::eBiomol_tRNA},
210  {"trna", CMolInfo::eBiomol_tRNA},
211 };
212 
213 
// Constant map from CMolInfo::TBiomol to CSeq_inst::EMol.
// NOTE(review): presumably this is g_BiomolEnumToMolEnum, which the symbol
// index of this listing declares with exactly this type. The listing omits
// source lines 216-232, so the variable name and the whole initializer are
// not visible here.
214 const
215 unordered_map<CMolInfo::TBiomol, CSeq_inst::EMol>
233 };
234 
237 
void remove_if(Container &c, Predicate *__pred)
Definition: chainer.hpp:69
void transform(Container &c, UnaryFunction *op)
Definition: chainer.hpp:86
const TValues & GetValues(void) const
Get the list of name-value pairs.
Definition: enumvalues.hpp:98
#define ENUM_METHOD_NAME(EnumName)
Definition: serialbase.hpp:994
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define NPOS
Definition: ncbistr.hpp:133
static string & ToLower(string &str)
Convert string to lower case – string& version.
Definition: ncbistr.cpp:405
@ eSubtype_lat_lon
+/- decimal degrees
Definition: SubSource_.hpp:113
@ eOrigin_mut
artificially mutagenized
Definition: BioSource_.hpp:132
@ eOrigin_natmut
naturally occurring mutant
Definition: BioSource_.hpp:131
@ eSubtype_sub_species
Definition: OrgMod_.hpp:105
@ eSubtype_nat_host
natural host of this specimen
Definition: OrgMod_.hpp:104
@ eBiomol_pre_RNA
precursor RNA of any sort really
Definition: MolInfo_.hpp:102
@ eBiomol_cRNA
viral RNA genome copy intermediate
Definition: MolInfo_.hpp:111
@ eBiomol_snoRNA
small nucleolar RNA
Definition: MolInfo_.hpp:112
@ eBiomol_genomic_mRNA
reported a mix of genomic and cdna sequence
Definition: MolInfo_.hpp:110
@ eBiomol_transcribed_RNA
transcribed RNA other than existing classes
Definition: MolInfo_.hpp:113
@ eBiomol_other_genetic
other genetic material
Definition: MolInfo_.hpp:109
const unordered_map< CMolInfo::TBiomol, CSeq_inst::EMol > g_BiomolEnumToMolEnum
unordered_set< string > TModNameSet
Definition: mod_to_enum.cpp:39
const TStringToEnumMap< CMolInfo::TBiomol > g_BiomolStringToEnum
static string s_NoNormalization(const string &mod_string)
Definition: mod_to_enum.cpp:71
static TStringToEnumMap< TEnum > s_InitModStringToEnumMap(const CEnumeratedTypeValues &etv, const TModNameSet &skip_mod_strings, const TStringToEnumMap< TEnum > &extra_mod_strings_to_enums, function< string(const string &)> fNormalizeString=s_NoNormalization)
Definition: mod_to_enum.cpp:76
TStringToEnumMap< CSubSource::ESubtype > g_InitModNameSubSrcSubtypeMap(void)
TStringToEnumMap< COrgMod::ESubtype > g_InitModNameOrgSubtypeMap(void)
string g_GetNormalizedModVal(const string &unnormalized)
Definition: mod_to_enum.cpp:42
TStringToEnumMap< CBioSource::EGenome > g_InitModNameGenomeMap(void)
TStringToEnumMap< CBioSource::EOrigin > g_InitModNameOriginMap(void)
unordered_map< string, TEnum > TStringToEnumMap
Definition: mod_to_enum.hpp:43
#define const
Definition: zconf.h:232
Modified on Sun Jun 16 04:33:40 2024 by modify_doxy.py rev. 669887