NCBI C++ ToolKit
local_taxon.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: local_taxon.cpp 99097 2023-02-13 20:02:01Z mozese2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Eyal Mozes
27  *
28  * File Description:
29  * Class for getting Taxonomy data from local SQLite file
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 
36 
38 
40 
41 #include <serial/serial.hpp>
42 #include <serial/objistr.hpp>
43 
45 
48 
51 
53 {
54  arg_desc.AddOptionalKey("taxon-db", "TaxonDBFile",
55  "SQLite file containing taxon database, to use "
56  "instead of CTaxon1 service",
58 
59  arg_desc.AddFlag("fallback-to-taxon-service",
60  "If organism not found in SQLIlte database, fall back to "
61  "CTaxon1 service");
62  arg_desc.SetDependency("fallback-to-taxon-service",
63  CArgDescriptions::eRequires, "taxon-db");
64 }
65 
66 CLocalTaxon::CLocalTaxon() : m_db_supports_synonym(false)
67 {
68  /// Initializing without command-line arguments; use Taxon server
69  m_TaxonConn.reset(new CTaxon1);
70  m_TaxonConn->Init();
71 }
72 
73 CLocalTaxon::CLocalTaxon(const CArgs &args) : m_db_supports_synonym(false)
74 {
75  if (args["taxon-db"]) {
76  m_SqliteConn.reset(new CSQLITE_Connection(args["taxon-db"].AsString(),
83  m_fallback = args["fallback-to-taxon-service"];
84  } else {
85  m_TaxonConn.reset(new CTaxon1);
86  m_TaxonConn->Init();
87  }
88 }
89 
91 {
92 }
93 
95  : taxid(INVALID_TAX_ID)
96  , is_valid(false)
97  , parent(s_InvalidNode)
98  , genetic_code(-1)
99 {
100 }
101 
103 {
104 }
105 
107 {
108  if (m_SqliteConn.get()) {
109  x_Cache(taxid);
110  return m_Nodes.find(taxid)->second.is_valid;
111  } else {
112  return m_TaxonConn->GetTreeIterator(taxid);
113  }
114 }
115 
117 {
118  if (m_SqliteConn.get()) {
119  x_Cache(taxid);
120  TNodeRef parent = m_Nodes.find(taxid)->second.parent;
121  return parent == s_InvalidNode ? ZERO_TAX_ID : parent->first;
122  } else {
123  return m_TaxonConn->GetParent(taxid);
124  }
125 }
126 
128 {
129  if (m_SqliteConn.get()) {
130  x_Cache(taxid);
131  return m_Nodes.find(taxid)->second.rank;
132  } else {
133  TTaxRank rank_id = m_TaxonConn->GetTreeIterator(taxid)->GetNode()->GetRank();
134  string rank_name;
135  m_TaxonConn->GetRankName(rank_id, rank_name);
136  return rank_name;
137  }
138 }
139 
141 {
142  if (m_SqliteConn.get()) {
143  x_Cache(taxid);
144  return m_Nodes.find(taxid)->second.scientific_name;
145  } else {
146  string scientific_name;
147  m_TaxonConn->GetScientificName(taxid, scientific_name);
148  return scientific_name;
149  }
150 }
151 
153 {
154  if (m_SqliteConn.get()) {
155  x_Cache(taxid);
156  return m_Nodes.find(taxid)->second.genetic_code;
157  } else {
158  return m_TaxonConn->GetTreeIterator(taxid)->GetNode()->GetGC();
159  }
160 }
161 
162 
163 namespace {
164 
165 // s_CopyDbTags(org, *new_org);
166 void s_CopyDbTags(const COrg_ref& org, COrg_ref& new_org)
167 {
168  if( ! org.IsSetDb() ) {
169  return;
170  }
171  new_org.SetDb().insert(
172  new_org.SetDb().end(),
173  const_cast<COrg_ref&>(org).SetDb().begin(),
174  const_cast<COrg_ref&>(org).SetDb().end()
175  );
176 
177  for (vector<CRef<CDbtag> >::iterator it1 = new_org.SetDb().begin();
178  it1 != new_org.SetDb().end(); ++it1) {
179 
180  vector<CRef<CDbtag> >::iterator it2 = it1;
181  for (++it2; it2 != new_org.SetDb().end(); ) {
182  if ((*it1)->Equals(**it2)) {
183  it2 = new_org.SetDb().erase(it2);
184  }
185  else {
186  ++it2;
187  }
188  }
189  }
190 }
191 
192 void s_RemoveTaxon(COrg_ref& org)
193 {
194  if( ! org.IsSetDb() ) {
195  return;
196  }
197  vector<CRef<CDbtag> >& dbs = org.SetDb();
198  ERASE_ITERATE(vector<CRef<CDbtag> >, it, dbs ) {
199  if ( (*it)->GetDb() == "taxon" ) {
200  VECTOR_ERASE(it, dbs);
201  }
202  }
203 }
204 
205 }
206 
207 
208 
209 
210 void CLocalTaxon::LookupMerge(objects::COrg_ref& org)
211 {
212  if (m_SqliteConn.get()) {
213  TTaxId taxid = ZERO_TAX_ID;
214  if( ! org.IsSetDb() ) {
215  taxid = GetTaxIdByOrgRef(org);
216  } else {
217  taxid = org.GetTaxId();
218  }
219  if ( taxid <= ZERO_TAX_ID ) {
220  NCBI_THROW(CException, eUnknown, "s_UpdateOrgRef: organism does not contain tax id or has unequivocal registered taxonomy name");
221  }
222 
223  CConstRef<COrg_ref> public_org = GetOrgRef(taxid);
224  CRef<COrg_ref> new_org(new COrg_ref);
225  new_org->Assign(*public_org);
226  if (org.IsSetOrgname() && org.GetOrgname().IsSetMod()) {
227  new_org->SetOrgname().SetMod() =
228  org.GetOrgname().GetMod();
229  }
230  if ( !new_org->Equals(org) ) {
231  s_RemoveTaxon(org);
232  s_CopyDbTags(org, *new_org);
233  org.Assign(*new_org);
234  }
235  }
236  else {
237  m_TaxonConn->LookupMerge(org);
238  }
239 }
240 
241 
243 {
244  if (m_SqliteConn.get()) {
245  x_Cache(taxid, true);
246  return m_Nodes.find(taxid)->second.org_ref;
247  } else {
248  bool is_species, is_uncultured;
249  string blast_name;
250  return m_TaxonConn->GetOrgRef(taxid, is_species, is_uncultured, blast_name);
251  }
252 }
253 
255 {
256  if (m_SqliteConn.get()) {
257  TInternalLineage lineage;
258  x_GetLineage(taxid, lineage);
259  for (TNodeRef ancestor : lineage) {
260  if (ancestor->second.rank == rank) {
261  return ancestor->first;
262  }
263  }
264  return ZERO_TAX_ID;
265  } else {
266  return m_TaxonConn->GetAncestorByRank(taxid, rank.c_str());
267  }
268 }
269 
271 {
272  if (inp_orgRef.IsSetDb()) {
273  return inp_orgRef.GetTaxId();
274  }
275  if (m_fallback && !m_TaxonConn.get()) {
276  m_TaxonConn.reset(new CTaxon1);
277  m_TaxonConn->Init();
278  }
279  if (m_TaxonConn.get()) {
280  return m_TaxonConn->GetTaxIdByOrgRef(inp_orgRef);
281  } else {
283  "GetTaxIdByOrgRef not supported for local execution");
284  }
285 }
286 
288 {
289  TLineage lineage;
290  if (m_SqliteConn.get()) {
291  TInternalLineage internal_lineage;
292  x_GetLineage(taxid, internal_lineage);
293  for (TNodeRef ancestor : internal_lineage) {
294  lineage.push_back(ancestor->first);
295  }
296  } else {
297  for (TTaxid ancestor = taxid; ancestor > ZERO_TAX_ID;
298  ancestor = m_TaxonConn->GetParent(ancestor))
299  {
300  lineage.push_back(ancestor);
301  }
302  reverse(lineage.begin(), lineage.end());
303  }
304  return lineage;
305 }
306 
308 {
309  if (m_SqliteConn.get()) {
310  TLineage lineage1 = GetLineage(taxid1),
311  lineage2 = GetLineage(taxid2);
312  TLineage::const_iterator it1 = lineage1.begin(),
313  it2 = lineage2.begin();
314  for (; it1 != lineage1.end() && it2 != lineage2.end() && *it1 == *it2;
315  ++it1, ++it2);
316  return *--it1;
317  } else {
318  return m_TaxonConn->Join(taxid1, taxid2);
319  }
320 }
321 
323 {
324  TTaxid taxid = INVALID_TAX_ID;
325  if (m_SqliteConn.get()) {
326  x_Cache(orgname);
327  auto& taxnode = m_ScientificNameIndex.find(orgname)->second;
328  taxid = taxnode.is_valid ? taxnode.taxid : INVALID_TAX_ID;
329  } else {
330  taxid = m_TaxonConn->GetTaxIdByName(orgname);
331  }
332  return taxid;
333 
334 }
335 
336 list<string> CLocalTaxon::GetSynonyms(TTaxId taxid)
337 {
338  if (m_SqliteConn.get()) {
339  x_Cache(taxid);
340  return m_Nodes.find(taxid)->second.synonyms;
341  } else {
342  list<string> lNames; // TNameList - second parameter to GetAllNames is currently list<string>
343  // we are using false because currently all
344  // other usages of this API in gpipe code is with this value:
345  m_TaxonConn->GetAllNames(taxid, lNames, false);
346  return lNames;
347  }
348 }
349 
350 //
351 // Implementation
352 //
353 
355 {
356  NCBI_ASSERT(m_SqliteConn.get(), "x_Cache called with server execution");
357 
359  if (it == m_ScientificNameIndex.end() ) {
360  //
361  // do the case-insensitive comparison
362  //
363  string sql = "SELECT taxid FROM TaxidInfo WHERE scientific_name = ?1 COLLATE NOCASE ";
365  sql += " UNION "
366  "SELECT taxid FROM Synonym WHERE scientific_name = ?1 COLLATE NOCASE ";
367  }
368 
370  stmt.Bind(1, orgname);
371  stmt.Execute();
372  TTaxId taxid = ZERO_TAX_ID;
373  if (stmt.Step()) {
374  taxid = TAX_ID_FROM(int, stmt.GetInt(0));
375  } else if (m_fallback) {
376  if (!m_TaxonConn.get()) {
377  m_TaxonConn.reset(new CTaxon1);
378  m_TaxonConn->Init();
379  }
380  taxid = m_TaxonConn->GetTaxIdByName(orgname);
381  }
382  if (taxid > ZERO_TAX_ID) {
383  CLocalTaxon::TNodeRef it2 = x_Cache(taxid);
384  it = m_ScientificNameIndex.insert(TScientificNameIndex::value_type(orgname, it2->second )).first;
385  } else {
386  //
387  // return invalid node.
388  //
390  }
391  }
392  return it;
393 }
394 
395 
396 CLocalTaxon::TNodeRef CLocalTaxon::x_Cache(TTaxid taxid, bool including_org_ref)
397 {
398  NCBI_ASSERT(m_SqliteConn.get(), "x_Cache called with server execution");
399 
400  TNodes::iterator it = m_Nodes.find(taxid);
401  if (it != m_Nodes.end() && (!including_org_ref || it->second.org_ref))
402  {
403  return it;
404  }
405 
406  if (it == m_Nodes.end()) {
407  TTaxId parent = INVALID_TAX_ID;
408  //
409  // Note that we are unconditionally recording (so far) unknown input taxid here
410  // thereby caching all successful and unsuccessful queries
411  //
412  it = m_Nodes.insert(TNodes::value_type(taxid, STaxidNode())).first;
413  it->second.taxid = taxid;
414  {{
416  (m_SqliteConn.get(),
417  "SELECT scientific_name, rank, parent, genetic_code "
418  "FROM TaxidInfo "
419  "WHERE taxid = ? ");
420  stmt.Bind(1, TAX_ID_TO(TIntId, taxid));
421  stmt.Execute();
422  if (stmt.Step()) {
423  it->second.is_valid = true;
424  it->second.scientific_name = stmt.GetString(0);
425  it->second.rank = stmt.GetString(1);
426  if (it->second.rank.empty()) {
427  it->second.rank = "no rank";
428  }
429  parent = TAX_ID_FROM(int, stmt.GetInt(2));
430  it->second.genetic_code = stmt.GetInt(3);
431  CSQLITE_Statement syn_stmt
432  (m_SqliteConn.get(),
433  "SELECT scientific_name "
434  "FROM Synonym "
435  "WHERE taxid = ? ");
436  syn_stmt.Bind(1, TAX_ID_TO(TIntId, taxid));
437  syn_stmt.Execute();
438  while (syn_stmt.Step()) {
439  it->second.synonyms.push_back( syn_stmt.GetString(0));
440  }
441  } else if (m_fallback) {
442  if (!m_TaxonConn.get()) {
443  m_TaxonConn.reset(new CTaxon1);
444  m_TaxonConn->Init();
445  }
446  if (m_TaxonConn->GetScientificName(taxid,
447  it->second.scientific_name))
448  {
449  it->second.is_valid = true;
450  TTaxRank rank_id = m_TaxonConn->GetTreeIterator(taxid)
451  ->GetNode()->GetRank();
452  m_TaxonConn->GetRankName(rank_id, it->second.rank);
453  it->second.genetic_code =
454  m_TaxonConn->GetTreeIterator(taxid)->GetNode()->GetGC();
455  m_TaxonConn->GetAllNames(taxid, it->second.synonyms, true);
456  parent = m_TaxonConn->GetParent(taxid);
457  }
458  }
459  }}
460 
461  if (parent > TAX_ID_CONST(1)) {
462  // Recursively get information for parent; no need for Org_ref, even
463  // / if it was requested for child node
464  it->second.parent = x_Cache(parent);
465  }
466  }
467 
468  if (it->second.is_valid && including_org_ref) {
470  (m_SqliteConn.get(),
471  "SELECT org_ref_asn "
472  "FROM TaxidInfo "
473  "WHERE taxid = ? ");
474  stmt.Bind(1, TAX_ID_TO(TIntId, taxid));
475  stmt.Execute();
476  stmt.Step();
477  string org_ref_asn = stmt.GetString(0);
478  if (!org_ref_asn.empty()) {
479  CNcbiIstrstream istr(org_ref_asn);
480  CRef<COrg_ref> org_ref(new COrg_ref);
481  istr >> MSerial_AsnText >> *org_ref;
482  it->second.org_ref = org_ref;
483  } else if (m_fallback) {
484  if (!m_TaxonConn.get()) {
485  m_TaxonConn.reset(new CTaxon1);
486  m_TaxonConn->Init();
487  }
488  bool is_species, is_uncultured;
489  string blast_name;
490  it->second.org_ref = m_TaxonConn->GetOrgRef(taxid, is_species,
491  is_uncultured, blast_name);
492  }
493  }
494 
495  return it;
496 }
497 
499 {
500  TNodeRef it = x_Cache(taxid);
501  if (!it->second.is_valid) {
502  return;
503  }
504  lineage.push_front(it);
505  while(lineage.front()->second.parent != s_InvalidNode) {
506  lineage.push_front(lineage.front()->second.parent);
507  }
508 }
509 
511 {
513  "SELECT count(*) FROM sqlite_master WHERE type='table' AND name='Synonym'");
514  stmt.Execute();
515  stmt.Step();
516  return stmt.GetInt(0) > 0;
517 }
518 
#define false
Definition: bool.h:36
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
CConstRef –.
Definition: ncbiobj.hpp:1266
string GetRank(TTaxid taxid)
unique_ptr< objects::CTaxon1 > m_TaxonConn
short int GetGeneticCode(TTaxid taxid)
static TNodeRef s_InvalidNode
TTaxid Join(TTaxid taxid1, TTaxid taxid2)
void LookupMerge(objects::COrg_ref &org)
list< TNodeRef > TInternalLineage
Definition: local_taxon.hpp:96
TLineage GetLineage(TTaxid taxid)
string GetScientificName(TTaxid taxid)
static void AddArguments(CArgDescriptions &arg_desc)
Definition: local_taxon.cpp:52
list< string > GetSynonyms(TTaxId taxid)
TNodeRef x_Cache(TTaxid taxid, bool including_org_ref=false)
CConstRef< objects::COrg_ref > GetOrgRef(TTaxid taxid)
TNodes::const_iterator TNodeRef
Definition: local_taxon.hpp:94
bool m_db_supports_synonym
vector< TTaxid > TLineage
Definition: local_taxon.hpp:53
TTaxid GetTaxIdByOrgRef(const objects::COrg_ref &inp_orgRef)
void x_GetLineage(TTaxid taxid, TInternalLineage &lineage)
TTaxid GetAncestorByRank(TTaxid taxid, const string &rank)
TNodes m_Nodes
TTaxId TTaxid
Definition: local_taxon.hpp:52
unique_ptr< CSQLITE_Connection > m_SqliteConn
TTaxid GetParent(TTaxid taxid)
TScientificNameIndex m_ScientificNameIndex
TScientificNameIndex::const_iterator TScientificNameRef
Definition: local_taxon.hpp:95
static TNodes s_DummyNodes
bool IsValidTaxid(TTaxid taxid)
bool x_SupportsSynonym()
TTaxid GetTaxIdByName(const string &orgname)
TTaxId GetTaxId() const
Definition: Org_ref.cpp:72
Connection to SQLite database.
@ fExternalMT
Object and all statements and blobs created on top of it will not be used from different threads simu...
@ fJournalOff
Journaling is completely off (not recommended - transactions cannot be rollbacked unless they consist...
@ fSyncOff
Synchronization is off, database can be corrupted on OS crash or power outage.
@ fVacuumOff
Vacuuming is off, database file can only grow.
@ fTempToMemory
Mode of storing temporary data.
SQL statement executing on SQLite database.
void Bind(int index, int val)
Bind integer value to parameter index.
bool Step(void)
Step through results of the statement.
void Execute(void)
Execute statement without returning any result.
string GetString(int col_ind) const
Get text value from column col_ind in current row.
const_iterator end() const
Definition: map.hpp:152
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
const_iterator find(const key_type &key) const
Definition: map.hpp:153
constexpr auto end(const ct_const_array< T, N > &in) noexcept
#define TAX_ID_CONST(id)
Definition: ncbimisc.hpp:1112
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
#define ERASE_ITERATE(Type, Var, Cont)
Non-constant version with ability to erase current element, if container permits.
Definition: ncbimisc.hpp:843
#define INVALID_TAX_ID
Definition: ncbimisc.hpp:1116
#define TAX_ID_TO(T, tax_id)
Definition: ncbimisc.hpp:1110
Int8 TIntId
Definition: ncbimisc.hpp:999
#define VECTOR_ERASE(Var, Cont)
Use this macro inside body of ERASE_ITERATE cycle to erase from vector-like container.
Definition: ncbimisc.hpp:852
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
#define TAX_ID_FROM(T, value)
Definition: ncbimisc.hpp:1111
void AddFlag(const string &name, const string &comment, CBoolEnum< EFlagValue > set_value=eFlagHasValueIfSet, TFlags flags=0)
Add description for flag argument.
Definition: ncbiargs.cpp:2459
void SetDependency(const string &arg1, EDependency dep, const string &arg2)
Define a dependency.
Definition: ncbiargs.cpp:2618
void AddOptionalKey(const string &name, const string &synopsis, const string &comment, EType type, TFlags flags=0)
Add description for optional key without default value.
Definition: ncbiargs.cpp:2427
@ eRequires
One argument requires another.
Definition: ncbiargs.hpp:956
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
#define NCBI_ASSERT(expr, mess)
Definition: ncbidbg.hpp:130
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
virtual bool Equals(const CSerialObject &object, ESerialRecursionMode how=eRecursive) const
Check if both objects contain the same values.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
bool IsSetDb(void) const
ids in taxonomic or culture dbases Check if a value has been assigned to Db data member.
Definition: Org_ref_.hpp:479
TDb & SetDb(void)
Assign a value to Db data member.
Definition: Org_ref_.hpp:497
void SetOrgname(TOrgname &value)
Assign a value to Orgname data member.
Definition: Org_ref_.cpp:87
USING_SCOPE(objects)
static char sql[1024]
Definition: putdata.c:19
static HSTMT stmt
Definition: rebindpar.c:12
short int TTaxRank
Primitive types for some taxon1 object fields.
Definition: taxon1.hpp:52
Modified on Fri Dec 08 08:21:01 2023 by modify_doxy.py rev. 669887