NCBI C++ ToolKit
local_taxon.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: local_taxon.cpp 101527 2023-12-27 14:41:53Z dicuccio $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Eyal Mozes
27  *
28  * File Description:
29  * Class for getting Taxonomy data from local SQLite file
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 
36 
38 
40 
41 #include <serial/serial.hpp>
42 #include <serial/objistr.hpp>
43 
45 
48 
51 
53 {
54  arg_desc.AddOptionalKey("taxon-db", "TaxonDBFile",
55  "SQLite file containing taxon database, to use "
56  "instead of CTaxon1 service",
58 
59  arg_desc.AddFlag("fallback-to-taxon-service",
60  "If organism not found in SQLIlte database, fall back to "
61  "CTaxon1 service");
62  arg_desc.SetDependency("fallback-to-taxon-service",
63  CArgDescriptions::eRequires, "taxon-db");
64 }
65 
66 CLocalTaxon::CLocalTaxon() : m_db_supports_synonym(false)
67 {
68  /// Initializing without command-line arguments; use Taxon server
69  m_TaxonConn.reset(new CTaxon1);
70  m_TaxonConn->Init();
71 }
72 
73 CLocalTaxon::CLocalTaxon(const CArgs &args) : m_db_supports_synonym(false)
74 {
75  if (args["taxon-db"]) {
76  m_SqliteConn.reset(new CSQLITE_Connection(args["taxon-db"].AsString(),
83  m_fallback = args["fallback-to-taxon-service"];
84  } else {
85  m_TaxonConn.reset(new CTaxon1);
86  m_TaxonConn->Init();
87  }
88 }
89 
91 {
92 }
93 
95  : taxid(INVALID_TAX_ID)
96  , is_valid(false)
97  , parent(s_InvalidNode)
98  , genetic_code(-1)
99 {
100 }
101 
103 {
104 }
105 
107 {
108  if (m_SqliteConn.get()) {
109  x_Cache(taxid);
110  return m_Nodes.find(taxid)->second.is_valid;
111  } else {
112  return m_TaxonConn->GetTreeIterator(taxid);
113  }
114 }
115 
117 {
118  if (m_SqliteConn.get()) {
119  x_Cache(taxid);
120  TNodeRef parent = m_Nodes.find(taxid)->second.parent;
121  return parent == s_InvalidNode ? ZERO_TAX_ID : parent->first;
122  } else {
123  return m_TaxonConn->GetParent(taxid);
124  }
125 }
126 
128 {
129  if (m_SqliteConn.get()) {
130  x_Cache(taxid);
131  return m_Nodes.find(taxid)->second.rank;
132  } else {
133  string rank_name;
134  auto node = m_TaxonConn->GetTreeIterator(taxid)->GetNode();
135  if (node) {
136  TTaxRank rank_id = node->GetRank();
137  m_TaxonConn->GetRankName(rank_id, rank_name);
138  }
139  return rank_name;
140  }
141 }
142 
144 {
145  if (m_SqliteConn.get()) {
146  x_Cache(taxid);
147  return m_Nodes.find(taxid)->second.scientific_name;
148  } else {
149  string scientific_name;
150  m_TaxonConn->GetScientificName(taxid, scientific_name);
151  return scientific_name;
152  }
153 }
154 
156 {
157  if (m_SqliteConn.get()) {
158  x_Cache(taxid);
159  return m_Nodes.find(taxid)->second.genetic_code;
160  } else {
161  return m_TaxonConn->GetTreeIterator(taxid)->GetNode()->GetGC();
162  }
163 }
164 
165 
166 namespace {
167 
168 // s_CopyDbTags(org, *new_org);
169 void s_CopyDbTags(const COrg_ref& org, COrg_ref& new_org)
170 {
171  if( ! org.IsSetDb() ) {
172  return;
173  }
174  new_org.SetDb().insert(
175  new_org.SetDb().end(),
176  const_cast<COrg_ref&>(org).SetDb().begin(),
177  const_cast<COrg_ref&>(org).SetDb().end()
178  );
179 
180  for (vector<CRef<CDbtag> >::iterator it1 = new_org.SetDb().begin();
181  it1 != new_org.SetDb().end(); ++it1) {
182 
183  vector<CRef<CDbtag> >::iterator it2 = it1;
184  for (++it2; it2 != new_org.SetDb().end(); ) {
185  if ((*it1)->Equals(**it2)) {
186  it2 = new_org.SetDb().erase(it2);
187  }
188  else {
189  ++it2;
190  }
191  }
192  }
193 }
194 
195 void s_RemoveTaxon(COrg_ref& org)
196 {
197  if( ! org.IsSetDb() ) {
198  return;
199  }
200  vector<CRef<CDbtag> >& dbs = org.SetDb();
201  ERASE_ITERATE(vector<CRef<CDbtag> >, it, dbs ) {
202  if ( (*it)->GetDb() == "taxon" ) {
203  VECTOR_ERASE(it, dbs);
204  }
205  }
206 }
207 
208 }
209 
210 
211 
212 
213 void CLocalTaxon::LookupMerge(objects::COrg_ref& org)
214 {
215  if (m_SqliteConn.get()) {
216  TTaxId taxid = ZERO_TAX_ID;
217  if( ! org.IsSetDb() ) {
218  taxid = GetTaxIdByOrgRef(org);
219  } else {
220  taxid = org.GetTaxId();
221  }
222  if ( taxid <= ZERO_TAX_ID ) {
223  NCBI_THROW(CException, eUnknown, "s_UpdateOrgRef: organism does not contain tax id or has unequivocal registered taxonomy name");
224  }
225 
226  CConstRef<COrg_ref> public_org = GetOrgRef(taxid);
227  CRef<COrg_ref> new_org(new COrg_ref);
228  new_org->Assign(*public_org);
229  if (org.IsSetOrgname() && org.GetOrgname().IsSetMod()) {
230  new_org->SetOrgname().SetMod() =
231  org.GetOrgname().GetMod();
232  }
233  if ( !new_org->Equals(org) ) {
234  s_RemoveTaxon(org);
235  s_CopyDbTags(org, *new_org);
236  org.Assign(*new_org);
237  }
238  }
239  else {
240  m_TaxonConn->LookupMerge(org);
241  }
242 }
243 
244 
246 {
247  if (m_SqliteConn.get()) {
248  x_Cache(taxid, true);
249  return m_Nodes.find(taxid)->second.org_ref;
250  } else {
251  bool is_species, is_uncultured;
252  string blast_name;
253  return m_TaxonConn->GetOrgRef(taxid, is_species, is_uncultured, blast_name);
254  }
255 }
256 
258 {
259  if (m_SqliteConn.get()) {
260  TInternalLineage lineage;
261  x_GetLineage(taxid, lineage);
262  for (TNodeRef ancestor : lineage) {
263  if (ancestor->second.rank == rank) {
264  return ancestor->first;
265  }
266  }
267  return ZERO_TAX_ID;
268  } else {
269  return m_TaxonConn->GetAncestorByRank(taxid, rank.c_str());
270  }
271 }
272 
274 {
275  if (inp_orgRef.IsSetDb()) {
276  return inp_orgRef.GetTaxId();
277  }
278  if (m_fallback && !m_TaxonConn.get()) {
279  m_TaxonConn.reset(new CTaxon1);
280  m_TaxonConn->Init();
281  }
282  if (m_TaxonConn.get()) {
283  return m_TaxonConn->GetTaxIdByOrgRef(inp_orgRef);
284  } else {
286  "GetTaxIdByOrgRef not supported for local execution");
287  }
288 }
289 
291 {
292  TLineage lineage;
293  if (m_SqliteConn.get()) {
294  TInternalLineage internal_lineage;
295  x_GetLineage(taxid, internal_lineage);
296  for (TNodeRef ancestor : internal_lineage) {
297  lineage.push_back(ancestor->first);
298  }
299  } else {
300  for (TTaxid ancestor = taxid; ancestor > ZERO_TAX_ID;
301  ancestor = m_TaxonConn->GetParent(ancestor))
302  {
303  lineage.push_back(ancestor);
304  }
305  reverse(lineage.begin(), lineage.end());
306  }
307  return lineage;
308 }
309 
311 {
312  if (m_SqliteConn.get()) {
313  TLineage lineage1 = GetLineage(taxid1),
314  lineage2 = GetLineage(taxid2);
315  TLineage::const_iterator it1 = lineage1.begin(),
316  it2 = lineage2.begin();
317  for (; it1 != lineage1.end() && it2 != lineage2.end() && *it1 == *it2;
318  ++it1, ++it2);
319  return *--it1;
320  } else {
321  return m_TaxonConn->Join(taxid1, taxid2);
322  }
323 }
324 
326 {
327  TTaxid taxid = INVALID_TAX_ID;
328  if (m_SqliteConn.get()) {
329  x_Cache(orgname);
330  auto& taxnode = m_ScientificNameIndex.find(orgname)->second;
331  taxid = taxnode.is_valid ? taxnode.taxid : INVALID_TAX_ID;
332  } else {
333  taxid = m_TaxonConn->GetTaxIdByName(orgname);
334  }
335  return taxid;
336 
337 }
338 
339 list<string> CLocalTaxon::GetSynonyms(TTaxId taxid)
340 {
341  if (m_SqliteConn.get()) {
342  x_Cache(taxid);
343  return m_Nodes.find(taxid)->second.synonyms;
344  } else {
345  list<string> lNames; // TNameList - second parameter to GetAllNames is currently list<string>
346  // we are using false because currently all
347  // other usages of this API in gpipe code is with this value:
348  m_TaxonConn->GetAllNames(taxid, lNames, false);
349  return lNames;
350  }
351 }
352 
353 //
354 // Implementation
355 //
356 
358 {
359  NCBI_ASSERT(m_SqliteConn.get(), "x_Cache called with server execution");
360 
362  if (it == m_ScientificNameIndex.end() ) {
363  //
364  // do the case-insensitive comparison
365  //
366  string sql = "SELECT taxid FROM TaxidInfo WHERE scientific_name = ?1 COLLATE NOCASE ";
368  sql += " UNION "
369  "SELECT taxid FROM Synonym WHERE scientific_name = ?1 COLLATE NOCASE ";
370  }
371 
373  stmt.Bind(1, orgname);
374  stmt.Execute();
375  TTaxId taxid = ZERO_TAX_ID;
376  if (stmt.Step()) {
377  taxid = TAX_ID_FROM(int, stmt.GetInt(0));
378  } else if (m_fallback) {
379  if (!m_TaxonConn.get()) {
380  m_TaxonConn.reset(new CTaxon1);
381  m_TaxonConn->Init();
382  }
383  taxid = m_TaxonConn->GetTaxIdByName(orgname);
384  }
385  if (taxid > ZERO_TAX_ID) {
386  CLocalTaxon::TNodeRef it2 = x_Cache(taxid);
387  it = m_ScientificNameIndex.insert(TScientificNameIndex::value_type(orgname, it2->second )).first;
388  } else {
389  //
390  // return invalid node.
391  //
393  }
394  }
395  return it;
396 }
397 
398 
399 CLocalTaxon::TNodeRef CLocalTaxon::x_Cache(TTaxid taxid, bool including_org_ref)
400 {
401  NCBI_ASSERT(m_SqliteConn.get(), "x_Cache called with server execution");
402 
403  TNodes::iterator it = m_Nodes.find(taxid);
404  if (it != m_Nodes.end() && (!including_org_ref || it->second.org_ref))
405  {
406  return it;
407  }
408 
409  if (it == m_Nodes.end()) {
410  TTaxId parent = INVALID_TAX_ID;
411  //
412  // Note that we are unconditionally recording (so far) unknown input taxid here
413  // thereby caching all successful and unsuccessful queries
414  //
415  it = m_Nodes.insert(TNodes::value_type(taxid, STaxidNode())).first;
416  it->second.taxid = taxid;
417  {{
419  (m_SqliteConn.get(),
420  "SELECT scientific_name, rank, parent, genetic_code "
421  "FROM TaxidInfo "
422  "WHERE taxid = ? ");
423  stmt.Bind(1, TAX_ID_TO(TIntId, taxid));
424  stmt.Execute();
425  if (stmt.Step()) {
426  it->second.is_valid = true;
427  it->second.scientific_name = stmt.GetString(0);
428  it->second.rank = stmt.GetString(1);
429  if (it->second.rank.empty()) {
430  it->second.rank = "no rank";
431  }
432  parent = TAX_ID_FROM(int, stmt.GetInt(2));
433  it->second.genetic_code = stmt.GetInt(3);
434  CSQLITE_Statement syn_stmt
435  (m_SqliteConn.get(),
436  "SELECT scientific_name "
437  "FROM Synonym "
438  "WHERE taxid = ? ");
439  syn_stmt.Bind(1, TAX_ID_TO(TIntId, taxid));
440  syn_stmt.Execute();
441  while (syn_stmt.Step()) {
442  it->second.synonyms.push_back( syn_stmt.GetString(0));
443  }
444  } else if (m_fallback) {
445  if (!m_TaxonConn.get()) {
446  m_TaxonConn.reset(new CTaxon1);
447  m_TaxonConn->Init();
448  }
449  if (m_TaxonConn->GetScientificName(taxid,
450  it->second.scientific_name))
451  {
452  it->second.is_valid = true;
453  TTaxRank rank_id = m_TaxonConn->GetTreeIterator(taxid)
454  ->GetNode()->GetRank();
455  m_TaxonConn->GetRankName(rank_id, it->second.rank);
456  it->second.genetic_code =
457  m_TaxonConn->GetTreeIterator(taxid)->GetNode()->GetGC();
458  m_TaxonConn->GetAllNames(taxid, it->second.synonyms, true);
459  parent = m_TaxonConn->GetParent(taxid);
460  }
461  }
462  }}
463 
464  if (parent > TAX_ID_CONST(1)) {
465  // Recursively get information for parent; no need for Org_ref, even
466  // / if it was requested for child node
467  it->second.parent = x_Cache(parent);
468  }
469  }
470 
471  if (it->second.is_valid && including_org_ref) {
473  (m_SqliteConn.get(),
474  "SELECT org_ref_asn "
475  "FROM TaxidInfo "
476  "WHERE taxid = ? ");
477  stmt.Bind(1, TAX_ID_TO(TIntId, taxid));
478  stmt.Execute();
479  stmt.Step();
480  string org_ref_asn = stmt.GetString(0);
481  if (!org_ref_asn.empty()) {
482  CNcbiIstrstream istr(org_ref_asn);
483  CRef<COrg_ref> org_ref(new COrg_ref);
484  istr >> MSerial_AsnText >> *org_ref;
485  it->second.org_ref = org_ref;
486  } else if (m_fallback) {
487  if (!m_TaxonConn.get()) {
488  m_TaxonConn.reset(new CTaxon1);
489  m_TaxonConn->Init();
490  }
491  bool is_species, is_uncultured;
492  string blast_name;
493  it->second.org_ref = m_TaxonConn->GetOrgRef(taxid, is_species,
494  is_uncultured, blast_name);
495  }
496  }
497 
498  return it;
499 }
500 
502 {
503  TNodeRef it = x_Cache(taxid);
504  if (!it->second.is_valid) {
505  return;
506  }
507  lineage.push_front(it);
508  while(lineage.front()->second.parent != s_InvalidNode) {
509  lineage.push_front(lineage.front()->second.parent);
510  }
511 }
512 
514 {
516  "SELECT count(*) FROM sqlite_master WHERE type='table' AND name='Synonym'");
517  stmt.Execute();
518  stmt.Step();
519  return stmt.GetInt(0) > 0;
520 }
521 
#define false
Definition: bool.h:36
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
CConstRef –.
Definition: ncbiobj.hpp:1266
string GetRank(TTaxid taxid)
unique_ptr< objects::CTaxon1 > m_TaxonConn
short int GetGeneticCode(TTaxid taxid)
static TNodeRef s_InvalidNode
TTaxid Join(TTaxid taxid1, TTaxid taxid2)
void LookupMerge(objects::COrg_ref &org)
list< TNodeRef > TInternalLineage
Definition: local_taxon.hpp:96
TLineage GetLineage(TTaxid taxid)
string GetScientificName(TTaxid taxid)
static void AddArguments(CArgDescriptions &arg_desc)
Definition: local_taxon.cpp:52
list< string > GetSynonyms(TTaxId taxid)
TNodeRef x_Cache(TTaxid taxid, bool including_org_ref=false)
CConstRef< objects::COrg_ref > GetOrgRef(TTaxid taxid)
TNodes::const_iterator TNodeRef
Definition: local_taxon.hpp:94
bool m_db_supports_synonym
vector< TTaxid > TLineage
Definition: local_taxon.hpp:53
TTaxid GetTaxIdByOrgRef(const objects::COrg_ref &inp_orgRef)
void x_GetLineage(TTaxid taxid, TInternalLineage &lineage)
TTaxid GetAncestorByRank(TTaxid taxid, const string &rank)
TNodes m_Nodes
TTaxId TTaxid
Definition: local_taxon.hpp:52
unique_ptr< CSQLITE_Connection > m_SqliteConn
TTaxid GetParent(TTaxid taxid)
TScientificNameIndex m_ScientificNameIndex
TScientificNameIndex::const_iterator TScientificNameRef
Definition: local_taxon.hpp:95
static TNodes s_DummyNodes
bool IsValidTaxid(TTaxid taxid)
bool x_SupportsSynonym()
TTaxid GetTaxIdByName(const string &orgname)
TTaxId GetTaxId() const
Definition: Org_ref.cpp:72
Connection to SQLite database.
@ fExternalMT
Object and all statements and blobs created on top of it will not be used from different threads simu...
@ fJournalOff
Journaling is completely off (not recommended - transactions cannot be rollbacked unless they consist...
@ fSyncOff
Synchronization is off, database can be corrupted on OS crash or power outage.
@ fVacuumOff
Vacuuming is off, database file can only grow.
@ fTempToMemory
Mode of storing temporary data.
SQL statement executing on SQLite database.
void Bind(int index, int val)
Bind integer value to parameter index.
bool Step(void)
Step through results of the statement.
void Execute(void)
Execute statement without returning any result.
string GetString(int col_ind) const
Get text value from column col_ind in current row.
const_iterator end() const
Definition: map.hpp:152
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
const_iterator find(const key_type &key) const
Definition: map.hpp:153
constexpr auto end(const ct_const_array< T, N > &in) noexcept
#define TAX_ID_CONST(id)
Definition: ncbimisc.hpp:1112
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
#define ERASE_ITERATE(Type, Var, Cont)
Non-constant version with ability to erase current element, if container permits.
Definition: ncbimisc.hpp:843
#define INVALID_TAX_ID
Definition: ncbimisc.hpp:1116
#define TAX_ID_TO(T, tax_id)
Definition: ncbimisc.hpp:1110
Int8 TIntId
Definition: ncbimisc.hpp:999
#define VECTOR_ERASE(Var, Cont)
Use this macro inside body of ERASE_ITERATE cycle to erase from vector-like container.
Definition: ncbimisc.hpp:852
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
#define TAX_ID_FROM(T, value)
Definition: ncbimisc.hpp:1111
void AddFlag(const string &name, const string &comment, CBoolEnum< EFlagValue > set_value=eFlagHasValueIfSet, TFlags flags=0)
Add description for flag argument.
Definition: ncbiargs.cpp:2459
void SetDependency(const string &arg1, EDependency dep, const string &arg2)
Define a dependency.
Definition: ncbiargs.cpp:2618
void AddOptionalKey(const string &name, const string &synopsis, const string &comment, EType type, TFlags flags=0)
Add description for optional key without default value.
Definition: ncbiargs.cpp:2427
@ eRequires
One argument requires another.
Definition: ncbiargs.hpp:956
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
#define NCBI_ASSERT(expr, mess)
Definition: ncbidbg.hpp:130
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
virtual bool Equals(const CSerialObject &object, ESerialRecursionMode how=eRecursive) const
Check if both objects contain the same values.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
bool IsSetDb(void) const
ids in taxonomic or culture dbases Check if a value has been assigned to Db data member.
Definition: Org_ref_.hpp:479
TDb & SetDb(void)
Assign a value to Db data member.
Definition: Org_ref_.hpp:497
void SetOrgname(TOrgname &value)
Assign a value to Orgname data member.
Definition: Org_ref_.cpp:87
USING_SCOPE(objects)
static char sql[1024]
Definition: putdata.c:19
static HSTMT stmt
Definition: rebindpar.c:12
short int TTaxRank
Primitive types for some taxon1 object fields.
Definition: taxon1.hpp:52
Modified on Wed Mar 27 11:21:20 2024 by modify_doxy.py rev. 669887