NCBI C++ ToolKit
local_taxon.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: local_taxon.cpp 102514 2024-05-20 13:53:26Z dicuccio $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Eyal Mozes
27  *
28  * File Description:
29  * Class for getting Taxonomy data from local SQLite file
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 
36 
38 
40 
41 #include <serial/serial.hpp>
42 #include <serial/objistr.hpp>
43 
45 
48 
51 
// NOTE(review): the signature line of this body is missing from this listing. The
// cross-reference index at the end of the listing places
// "static void AddArguments(CArgDescriptions &arg_desc)" at local_taxon.cpp:52, so
// this is presumably CLocalTaxon::AddArguments -- confirm against the real file.
//
// Registers the command-line arguments read by the CArgs constructor: the optional
// key "taxon-db" and the flag "fallback-to-taxon-service".
53 {
54  arg_desc.AddOptionalKey("taxon-db", "TaxonDBFile",
55  "SQLite file containing taxon database, to use "
56  "instead of CTaxon1 service",
// NOTE(review): listing line 57 (the last AddOptionalKey argument and the closing
// parenthesis) is missing from this listing.
58 
// NOTE(review): the help text below spells "SQLIlte"; this looks like a typo for
// "SQLite" in a user-visible string.
59  arg_desc.AddFlag("fallback-to-taxon-service",
60  "If organism not found in SQLIlte database, fall back to "
61  "CTaxon1 service");
// The fallback flag is only accepted together with "taxon-db".
62  arg_desc.SetDependency("fallback-to-taxon-service",
63  CArgDescriptions::eRequires, "taxon-db");
64 }
65 
66 CLocalTaxon::CLocalTaxon() : m_db_supports_synonym(false)
67 {
68  /// Initializing without command-line arguments; use Taxon server
69  m_TaxonConn.reset(new CTaxon1);
70  m_TaxonConn->Init();
71 }
72 
/// Construct from parsed command-line arguments.
///
/// When "taxon-db" is given, a SQLite connection to that file is opened and the
/// "fallback-to-taxon-service" flag is stored in m_fallback; otherwise a CTaxon1
/// service connection is created and initialized.
73 CLocalTaxon::CLocalTaxon(const CArgs &args) : m_db_supports_synonym(false)
74 {
75  if (args["taxon-db"]) {
76  m_SqliteConn.reset(new CSQLITE_Connection(args["taxon-db"].AsString(),
// NOTE(review): listing lines 77-82 (the remaining CSQLITE_Connection arguments and
// whatever follows them) are missing from this listing.
83  m_fallback = args["fallback-to-taxon-service"];
84  } else {
// NOTE(review): m_fallback is not assigned on this branch; confirm the header gives
// it a default value.
85  m_TaxonConn.reset(new CTaxon1);
86  m_TaxonConn->Init();
87  }
88 }
89 
// NOTE(review): the signature line (listing line 90) is missing from this listing;
// this empty body presumably belongs to the CLocalTaxon destructor -- confirm.
91 {
92 }
93 
// NOTE(review): the signature line (listing line 94) is missing from this listing.
// The members initialized here are the ones x_Cache() fills in on an STaxidNode,
// so this is presumably the STaxidNode default constructor -- confirm.
//
// A fresh node starts out invalid: no taxid, no parent link, genetic code -1.
95  : taxid(INVALID_TAX_ID)
96  , is_valid(false)
97  , parent(s_InvalidNode)
98  , genetic_code(-1)
99 {
100 }
101 
// NOTE(review): the signature line (listing line 102) is missing from this listing;
// this empty body presumably belongs to the STaxidNode destructor -- confirm.
103 {
104 }
105 
// NOTE(review): the signature line (listing line 106) is missing from this listing;
// the index at the end declares "bool IsValidTaxid(TTaxid taxid)", which matches
// this body -- confirm.
//
// Reports whether taxid is known to the active data source.
107 {
108  if (m_SqliteConn.get()) {
// Local mode: x_Cache() inserts a node for every taxid it is asked about, so the
// find() below always yields an entry; is_valid records whether the lookup succeeded.
109  x_Cache(taxid);
110  return m_Nodes.find(taxid)->second.is_valid;
111  } else {
// Service mode: the result of GetTreeIterator() is converted to the return type.
112  return m_TaxonConn->GetTreeIterator(taxid);
113  }
114 }
115 
// NOTE(review): the signature line (listing line 116) is missing from this listing;
// the index at the end declares "TTaxid GetParent(TTaxid taxid)" -- confirm.
//
// Returns the parent taxid of taxid.
117 {
118  if (m_SqliteConn.get()) {
119  x_Cache(taxid);
120  TNodeRef parent = m_Nodes.find(taxid)->second.parent;
// A node without a parent link yields ZERO_TAX_ID. x_Cache() only links a parent
// whose taxid is greater than 1, so invalid nodes and nodes directly below taxid 1
// both end up here.
121  return parent == s_InvalidNode ? ZERO_TAX_ID : parent->first;
122  } else {
123  return m_TaxonConn->GetParent(taxid);
124  }
125 }
126 
// NOTE(review): the signature line (listing line 127) is missing from this listing;
// the index at the end declares "string GetRank(TTaxid taxid)" -- confirm.
//
// Returns the rank name of taxid.
128 {
129  if (m_SqliteConn.get()) {
130  x_Cache(taxid);
131  return m_Nodes.find(taxid)->second.rank;
132  } else {
// Service mode: rank_name stays empty when the tree iterator has no node.
133  string rank_name;
// NOTE(review): the result of GetTreeIterator() is dereferenced without a check;
// only the node is tested below.
134  auto node = m_TaxonConn->GetTreeIterator(taxid)->GetNode();
135  if (node) {
136  TTaxRank rank_id = node->GetRank();
137  m_TaxonConn->GetRankName(rank_id, rank_name);
138  }
139  return rank_name;
140  }
141 }
142 
// NOTE(review): the signature line (listing line 143) is missing from this listing;
// the index at the end declares "string GetScientificName(TTaxid taxid)" -- confirm.
//
// Returns the scientific name recorded for taxid.
144 {
145  if (m_SqliteConn.get()) {
146  x_Cache(taxid);
147  return m_Nodes.find(taxid)->second.scientific_name;
148  } else {
// Service mode: the return value of CTaxon1::GetScientificName is ignored, so the
// string is returned as the service left it.
149  string scientific_name;
150  m_TaxonConn->GetScientificName(taxid, scientific_name);
151  return scientific_name;
152  }
153 }
154 
// NOTE(review): the signature line (listing line 155) is missing from this listing;
// the index at the end declares "short int GetGeneticCode(TTaxid taxid)" -- confirm.
//
// Returns the genetic code recorded for taxid.
156 {
157  if (m_SqliteConn.get()) {
// Local mode: a node that x_Cache() could not resolve keeps the value set by the
// node initializer list (-1).
158  x_Cache(taxid);
159  return m_Nodes.find(taxid)->second.genetic_code;
160  } else {
// NOTE(review): neither the tree iterator nor the node is checked before being
// dereferenced here, unlike the node check in the rank lookup above.
161  return m_TaxonConn->GetTreeIterator(taxid)->GetNode()->GetGC();
162  }
163 }
164 
165 
166 namespace {
167 
168 // s_CopyDbTags(org, *new_org);
169 void s_CopyDbTags(const COrg_ref& org, COrg_ref& new_org)
170 {
171  if( ! org.IsSetDb() ) {
172  return;
173  }
174  new_org.SetDb().insert(
175  new_org.SetDb().end(),
176  const_cast<COrg_ref&>(org).SetDb().begin(),
177  const_cast<COrg_ref&>(org).SetDb().end()
178  );
179 
180  for (vector<CRef<CDbtag> >::iterator it1 = new_org.SetDb().begin();
181  it1 != new_org.SetDb().end(); ++it1) {
182 
183  vector<CRef<CDbtag> >::iterator it2 = it1;
184  for (++it2; it2 != new_org.SetDb().end(); ) {
185  if ((*it1)->Equals(**it2)) {
186  it2 = new_org.SetDb().erase(it2);
187  }
188  else {
189  ++it2;
190  }
191  }
192  }
193 }
194 
195 void s_RemoveTaxon(COrg_ref& org)
196 {
197  if( ! org.IsSetDb() ) {
198  return;
199  }
200  vector<CRef<CDbtag> >& dbs = org.SetDb();
201  ERASE_ITERATE(vector<CRef<CDbtag> >, it, dbs ) {
202  if ( (*it)->GetDb() == "taxon" ) {
203  VECTOR_ERASE(it, dbs);
204  }
205  }
206 }
207 
208 }
209 
210 
211 
212 
213 void CLocalTaxon::LookupMerge(objects::COrg_ref& org)
214 {
215  if (m_SqliteConn.get()) {
216  TTaxId taxid = ZERO_TAX_ID;
217  if( ! org.IsSetDb() ) {
218  taxid = GetTaxIdByOrgRef(org);
219  } else {
220  taxid = org.GetTaxId();
221  }
222  if ( taxid <= ZERO_TAX_ID ) {
223  NCBI_THROW(CException, eUnknown, "s_UpdateOrgRef: organism does not contain tax id or has unequivocal registered taxonomy name");
224  }
225 
226  CConstRef<COrg_ref> public_org = GetOrgRef(taxid);
227  CRef<COrg_ref> new_org(new COrg_ref);
228  new_org->Assign(*public_org);
229  if (org.IsSetOrgname() && org.GetOrgname().IsSetMod()) {
230  new_org->SetOrgname().SetMod() =
231  org.GetOrgname().GetMod();
232  }
233  if ( !new_org->Equals(org) ) {
234  s_RemoveTaxon(org);
235  s_CopyDbTags(org, *new_org);
236  org.Assign(*new_org);
237  }
238  }
239  else {
240  m_TaxonConn->LookupMerge(org);
241  }
242 }
243 
244 
// NOTE(review): the signature line (listing line 245) is missing from this listing;
// the index at the end declares "CConstRef< objects::COrg_ref > GetOrgRef(TTaxid taxid)"
// -- confirm.
//
// Returns the Org-ref recorded for taxid.
246 {
247  if (m_SqliteConn.get()) {
// Local mode: the second argument asks x_Cache() to load the Org-ref as well. The
// cached org_ref is only assigned there when ASN.1 text was found or the fallback
// service is enabled, so the returned reference may be empty.
248  x_Cache(taxid, true);
249  return m_Nodes.find(taxid)->second.org_ref;
250  } else {
// Service mode: the three output arguments are required by the call and discarded.
251  bool is_species, is_uncultured;
252  string blast_name;
253  return m_TaxonConn->GetOrgRef(taxid, is_species, is_uncultured, blast_name);
254  }
255 }
256 
258 {
259  if (m_SqliteConn.get()) {
260  TInternalLineage lineage;
261  x_GetLineage(taxid, lineage);
262  for (TNodeRef ancestor : lineage) {
263  if (ancestor->second.rank == rank) {
264  return ancestor->first;
265  }
266  }
267  return ZERO_TAX_ID;
268  } else {
269  return m_TaxonConn->GetAncestorByRank(taxid, rank.c_str());
270  }
271 }
272 
// NOTE(review): the signature line (listing line 273) is missing from this listing;
// the index at the end declares
// "TTaxid GetTaxIdByOrgRef(const objects::COrg_ref &inp_orgRef)" -- confirm.
//
// Determines the taxid for an Org-ref.
274 {
// An Org-ref that carries a Db field is answered from its own content.
275  if (inp_orgRef.IsSetDb()) {
276  return inp_orgRef.GetTaxId();
277  }
// Otherwise the CTaxon1 service is needed; with fallback enabled it is connected
// on first use.
278  if (m_fallback && !m_TaxonConn.get()) {
279  m_TaxonConn.reset(new CTaxon1);
280  m_TaxonConn->Init();
281  }
282  if (m_TaxonConn.get()) {
283  return m_TaxonConn->GetTaxIdByOrgRef(inp_orgRef);
284  } else {
// NOTE(review): listing line 285 (the start of the statement that uses the message
// below, presumably a throw) is missing from this listing.
286  "GetTaxIdByOrgRef not supported for local execution");
287  }
288 }
289 
// NOTE(review): the signature line (listing line 290) is missing from this listing;
// the index at the end declares "TLineage GetLineage(TTaxid taxid)" -- confirm.
//
// Returns the taxids on the path to taxid, ordered from the topmost ancestor down
// to taxid itself.
291 {
292  TLineage lineage;
293  if (m_SqliteConn.get()) {
// Local mode: copy the taxids out of the cached-node lineage, which stays empty
// when x_GetLineage() finds the starting node invalid.
294  TInternalLineage internal_lineage;
295  x_GetLineage(taxid, internal_lineage);
296  for (TNodeRef ancestor : internal_lineage) {
297  lineage.push_back(ancestor->first);
298  }
299  } else {
// Service mode: walk up through GetParent() while the taxid is positive, then
// reverse so the order matches the local-mode result.
300  for (TTaxid ancestor = taxid; ancestor > ZERO_TAX_ID;
301  ancestor = m_TaxonConn->GetParent(ancestor))
302  {
303  lineage.push_back(ancestor);
304  }
305  reverse(lineage.begin(), lineage.end());
306  }
307  return lineage;
308 }
309 
// NOTE(review): the signature line (listing line 310) is missing from this listing;
// the index at the end declares "TTaxid Join(TTaxid taxid1, TTaxid taxid2)" -- confirm.
//
// Finds the deepest taxid shared by the lineages of taxid1 and taxid2.
311 {
312  if (m_SqliteConn.get()) {
313  TLineage lineage1 = GetLineage(taxid1),
314  lineage2 = GetLineage(taxid2);
315  TLineage::const_iterator it1 = lineage1.begin(),
316  it2 = lineage2.begin();
// Advance both iterators over the common prefix of the two lineages.
317  for (; it1 != lineage1.end() && it2 != lineage2.end() && *it1 == *it2;
318  ++it1, ++it2) {
319  }
// Returns 0 when lineage1 is exhausted, i.e. it is empty or is entirely a prefix
// of lineage2.
320  if (it1 == lineage1.end()) {
321  return 0;
322  }
// NOTE(review): when the lineages share no leading element (or lineage2 is empty
// while lineage1 is not), it1 is still lineage1.begin() here and "--it1" steps
// before the beginning of the vector -- confirm this cannot happen or add a guard.
323  return *--it1;
324  } else {
325  return m_TaxonConn->Join(taxid1, taxid2);
326  }
327 }
328 
// NOTE(review): the signature line (listing line 329) is missing from this listing;
// the index at the end declares "TTaxid GetTaxIdByName(const string &orgname)" -- confirm.
//
// Resolves an organism name to a taxid.
330 {
331  TTaxid taxid = INVALID_TAX_ID;
332  if (m_SqliteConn.get()) {
// Local mode: populate the name index for orgname, then read the indexed node.
// A node that is not valid maps to INVALID_TAX_ID.
333  x_Cache(orgname);
334  auto& taxnode = m_ScientificNameIndex.find(orgname)->second;
335  taxid = taxnode.is_valid ? taxnode.taxid : INVALID_TAX_ID;
336  } else {
337  taxid = m_TaxonConn->GetTaxIdByName(orgname);
338  }
339  return taxid;
340 
341 }
342 
343 list<string> CLocalTaxon::GetSynonyms(TTaxId taxid)
344 {
345  if (m_SqliteConn.get()) {
346  x_Cache(taxid);
347  return m_Nodes.find(taxid)->second.synonyms;
348  } else {
349  list<string> lNames; // TNameList - second parameter to GetAllNames is currently list<string>
350  // we are using false because currently all
351  // other usages of this API in gpipe code is with this value:
352  m_TaxonConn->GetAllNames(taxid, lNames, false);
353  return lNames;
354  }
355 }
356 
357 //
358 // Implementation
359 //
360 
// NOTE(review): the signature line (listing line 361) is missing from this listing.
// The body takes an organism name (orgname) and returns an iterator into
// m_ScientificNameIndex, so this is presumably the name-keyed overload of x_Cache
// returning TScientificNameRef -- confirm.
//
// Ensures m_ScientificNameIndex has an entry for orgname and returns it.
362 {
363  NCBI_ASSERT(m_SqliteConn.get(), "x_Cache called with server execution");
364 
// NOTE(review): listing line 365 (the declaration of 'it', presumably a find() on
// m_ScientificNameIndex) is missing from this listing.
366  if (it == m_ScientificNameIndex.end() ) {
367  //
368  // do the case-insensitive comparison
369  //
370  string sql = "SELECT taxid FROM TaxidInfo WHERE scientific_name = ?1 COLLATE NOCASE ";
// NOTE(review): listing line 371 (the condition guarding the UNION with the Synonym
// table, presumably a check of m_db_supports_synonym) is missing from this listing.
372  sql += " UNION "
373  "SELECT taxid FROM Synonym WHERE scientific_name = ?1 COLLATE NOCASE ";
374  }
375 
// NOTE(review): listing line 376 (the declaration of 'stmt' built from 'sql') is
// missing from this listing.
377  stmt.Bind(1, orgname);
378  stmt.Execute();
379  TTaxId taxid = ZERO_TAX_ID;
// Only the first result row is used.
380  if (stmt.Step()) {
381  taxid = TAX_ID_FROM(int, stmt.GetInt(0));
382  } else if (m_fallback) {
// Name not in the SQLite file: ask the CTaxon1 service, connecting on first use.
383  if (!m_TaxonConn.get()) {
384  m_TaxonConn.reset(new CTaxon1);
385  m_TaxonConn->Init();
386  }
387  taxid = m_TaxonConn->GetTaxIdByName(orgname);
388  }
389  if (taxid > ZERO_TAX_ID) {
// Cache the node by taxid and store a copy of it in the name index under orgname.
390  CLocalTaxon::TNodeRef it2 = x_Cache(taxid);
391  it = m_ScientificNameIndex.insert(TScientificNameIndex::value_type(orgname, it2->second )).first;
392  } else {
393  //
394  // return invalid node.
395  //
// NOTE(review): listing line 396 (the statement that produces the invalid entry) is
// missing from this listing.
397  }
398  }
399  return it;
400 }
401 
402 
/// Ensure m_Nodes holds an entry for @a taxid and return an iterator to it.
///
/// On first sight of a taxid an entry is inserted and filled from the TaxidInfo
/// and Synonym tables, or, when the taxid is not in TaxidInfo and m_fallback is
/// set, from the CTaxon1 service. Its parent is then cached recursively. When
/// @a including_org_ref is true and the node is valid, the Org-ref is loaded too.
///
/// @param taxid              Taxonomy id to look up.
/// @param including_org_ref  Also load the Org-ref for this node.
/// @return Iterator to the entry; its is_valid member is false when the lookup
///         did not find the taxid.
403 CLocalTaxon::TNodeRef CLocalTaxon::x_Cache(TTaxid taxid, bool including_org_ref)
404 {
405  NCBI_ASSERT(m_SqliteConn.get(), "x_Cache called with server execution");
406 
// Fast path: the node is cached and either no Org-ref is wanted or it is loaded.
407  TNodes::iterator it = m_Nodes.find(taxid);
408  if (it != m_Nodes.end() && (!including_org_ref || it->second.org_ref))
409  {
410  return it;
411  }
412 
413  if (it == m_Nodes.end()) {
414  TTaxId parent = INVALID_TAX_ID;
415  //
416  // Note that we are unconditionally recording (so far) unknown input taxid here
417  // thereby caching all successful and unsuccessful queries
418  //
419  it = m_Nodes.insert(TNodes::value_type(taxid, STaxidNode())).first;
420  it->second.taxid = taxid;
421  {{
// NOTE(review): listing line 422 (the declaration that the argument list below
// belongs to, presumably "CSQLITE_Statement stmt") is missing from this listing.
423  (m_SqliteConn.get(),
424  "SELECT scientific_name, rank, parent, genetic_code "
425  "FROM TaxidInfo "
426  "WHERE taxid = ? ");
427  stmt.Bind(1, TAX_ID_TO(TIntId, taxid));
428  stmt.Execute();
429  if (stmt.Step()) {
430  it->second.is_valid = true;
431  it->second.scientific_name = stmt.GetString(0);
432  it->second.rank = stmt.GetString(1);
// An empty rank column is normalized to the literal "no rank".
433  if (it->second.rank.empty()) {
434  it->second.rank = "no rank";
435  }
436  parent = TAX_ID_FROM(int, stmt.GetInt(2));
437  it->second.genetic_code = stmt.GetInt(3);
// Collect every Synonym row for this taxid.
// NOTE(review): this query runs regardless of m_db_supports_synonym -- confirm the
// Synonym table is guaranteed to exist here.
438  CSQLITE_Statement syn_stmt
439  (m_SqliteConn.get(),
440  "SELECT scientific_name "
441  "FROM Synonym "
442  "WHERE taxid = ? ");
443  syn_stmt.Bind(1, TAX_ID_TO(TIntId, taxid));
444  syn_stmt.Execute();
445  while (syn_stmt.Step()) {
446  it->second.synonyms.push_back( syn_stmt.GetString(0));
447  }
448  } else if (m_fallback) {
// Not in the SQLite file: fill the node from the CTaxon1 service instead,
// connecting on first use.
449  if (!m_TaxonConn.get()) {
450  m_TaxonConn.reset(new CTaxon1);
451  m_TaxonConn->Init();
452  }
453  if (m_TaxonConn->GetScientificName(taxid,
454  it->second.scientific_name))
455  {
456  it->second.is_valid = true;
// NOTE(review): the tree iterator and its node are dereferenced without checks.
457  TTaxRank rank_id = m_TaxonConn->GetTreeIterator(taxid)
458  ->GetNode()->GetRank();
459  m_TaxonConn->GetRankName(rank_id, it->second.rank);
460  it->second.genetic_code =
461  m_TaxonConn->GetTreeIterator(taxid)->GetNode()->GetGC();
462  m_TaxonConn->GetAllNames(taxid, it->second.synonyms, true);
463  parent = m_TaxonConn->GetParent(taxid);
464  }
465  }
466  }}
467 
// Only parents with a taxid greater than 1 are linked; other nodes keep the
// parent link they were constructed with.
468  if (parent > TAX_ID_CONST(1)) {
469  // Recursively get information for parent; no need for Org_ref, even
470  // / if it was requested for child node
471  it->second.parent = x_Cache(parent);
472  }
473  }
474 
475  if (it->second.is_valid && including_org_ref) {
// NOTE(review): listing line 476 (the declaration that the argument list below
// belongs to, presumably "CSQLITE_Statement stmt") is missing from this listing.
477  (m_SqliteConn.get(),
478  "SELECT org_ref_asn "
479  "FROM TaxidInfo "
480  "WHERE taxid = ? ");
481  stmt.Bind(1, TAX_ID_TO(TIntId, taxid));
482  stmt.Execute();
// NOTE(review): the result of Step() is not checked. A node validated through the
// fallback service has no TaxidInfo row, so GetString(0) below is then called
// without a current row -- confirm that this yields an empty string.
483  stmt.Step();
484  string org_ref_asn = stmt.GetString(0);
485  if (!org_ref_asn.empty()) {
// The column text is parsed with the ASN.1 text reader into a new COrg_ref.
486  CNcbiIstrstream istr(org_ref_asn);
487  CRef<COrg_ref> org_ref(new COrg_ref);
488  istr >> MSerial_AsnText >> *org_ref;
489  it->second.org_ref = org_ref;
490  } else if (m_fallback) {
491  if (!m_TaxonConn.get()) {
492  m_TaxonConn.reset(new CTaxon1);
493  m_TaxonConn->Init();
494  }
// The three output arguments are required by the call and discarded.
495  bool is_species, is_uncultured;
496  string blast_name;
497  it->second.org_ref = m_TaxonConn->GetOrgRef(taxid, is_species,
498  is_uncultured, blast_name);
499  }
500  }
501 
502  return it;
503 }
504 
// NOTE(review): the signature line (listing line 505) is missing from this listing;
// the index at the end declares
// "void x_GetLineage(TTaxid taxid, TInternalLineage &lineage)" -- confirm.
//
// Prepends the cached node for taxid and then each of its ancestors to lineage, so
// the topmost linked ancestor ends up at the front. Leaves lineage untouched when
// the node for taxid is not valid.
506 {
507  TNodeRef it = x_Cache(taxid);
508  if (!it->second.is_valid) {
509  return;
510  }
511  lineage.push_front(it);
// Follow the parent links that x_Cache() set up, until a node has none.
512  while(lineage.front()->second.parent != s_InvalidNode) {
513  lineage.push_front(lineage.front()->second.parent);
514  }
515 }
516 
// NOTE(review): the signature line (listing line 517) is missing from this listing;
// the index at the end declares "bool x_SupportsSynonym()" -- confirm.
//
// Reports whether sqlite_master lists a table named 'Synonym'.
518 {
// NOTE(review): listing line 519 (the declaration of 'stmt' that takes the SQL text
// below) is missing from this listing.
520  "SELECT count(*) FROM sqlite_master WHERE type='table' AND name='Synonym'");
521  stmt.Execute();
// count(*) always produces one row, so the Step() result is not inspected.
522  stmt.Step();
523  return stmt.GetInt(0) > 0;
524 }
525 
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
CConstRef –.
Definition: ncbiobj.hpp:1266
string GetRank(TTaxid taxid)
unique_ptr< objects::CTaxon1 > m_TaxonConn
short int GetGeneticCode(TTaxid taxid)
static TNodeRef s_InvalidNode
TTaxid Join(TTaxid taxid1, TTaxid taxid2)
void LookupMerge(objects::COrg_ref &org)
list< TNodeRef > TInternalLineage
Definition: local_taxon.hpp:96
TLineage GetLineage(TTaxid taxid)
string GetScientificName(TTaxid taxid)
static void AddArguments(CArgDescriptions &arg_desc)
Definition: local_taxon.cpp:52
list< string > GetSynonyms(TTaxId taxid)
TNodeRef x_Cache(TTaxid taxid, bool including_org_ref=false)
CConstRef< objects::COrg_ref > GetOrgRef(TTaxid taxid)
TNodes::const_iterator TNodeRef
Definition: local_taxon.hpp:94
bool m_db_supports_synonym
vector< TTaxid > TLineage
Definition: local_taxon.hpp:53
TTaxid GetTaxIdByOrgRef(const objects::COrg_ref &inp_orgRef)
void x_GetLineage(TTaxid taxid, TInternalLineage &lineage)
TTaxid GetAncestorByRank(TTaxid taxid, const string &rank)
TNodes m_Nodes
TTaxId TTaxid
Definition: local_taxon.hpp:52
unique_ptr< CSQLITE_Connection > m_SqliteConn
TTaxid GetParent(TTaxid taxid)
TScientificNameIndex m_ScientificNameIndex
TScientificNameIndex::const_iterator TScientificNameRef
Definition: local_taxon.hpp:95
static TNodes s_DummyNodes
bool IsValidTaxid(TTaxid taxid)
bool x_SupportsSynonym()
TTaxid GetTaxIdByName(const string &orgname)
TTaxId GetTaxId() const
Definition: Org_ref.cpp:72
Connection to SQLite database.
@ fExternalMT
Object and all statements and blobs created on top of it will not be used from different threads simu...
@ fJournalOff
Journaling is completely off (not recommended - transactions cannot be rolled back unless they consist...
@ fSyncOff
Synchronization is off, database can be corrupted on OS crash or power outage.
@ fVacuumOff
Vacuuming is off, database file can only grow.
@ fTempToMemory
Mode of storing temporary data.
SQL statement executing on SQLite database.
void Bind(int index, int val)
Bind integer value to parameter index.
bool Step(void)
Step through results of the statement.
void Execute(void)
Execute statement without returning any result.
string GetString(int col_ind) const
Get text value from column col_ind in current row.
const_iterator end() const
Definition: map.hpp:152
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
const_iterator find(const key_type &key) const
Definition: map.hpp:153
static bool is_valid(const char *num, int type, CONV_RESULT *cr)
constexpr auto end(const ct_const_array< T, N > &in) noexcept
#define false
Definition: bool.h:36
static char sql[1024]
Definition: putdata.c:19
static HSTMT stmt
Definition: rebindpar.c:12
#define TAX_ID_CONST(id)
Definition: ncbimisc.hpp:1112
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
#define ERASE_ITERATE(Type, Var, Cont)
Non-constant version with ability to erase current element, if container permits.
Definition: ncbimisc.hpp:843
#define INVALID_TAX_ID
Definition: ncbimisc.hpp:1116
#define TAX_ID_TO(T, tax_id)
Definition: ncbimisc.hpp:1110
Int8 TIntId
Definition: ncbimisc.hpp:999
#define VECTOR_ERASE(Var, Cont)
Use this macro inside body of ERASE_ITERATE cycle to erase from vector-like container.
Definition: ncbimisc.hpp:852
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
#define TAX_ID_FROM(T, value)
Definition: ncbimisc.hpp:1111
void AddFlag(const string &name, const string &comment, CBoolEnum< EFlagValue > set_value=eFlagHasValueIfSet, TFlags flags=0)
Add description for flag argument.
Definition: ncbiargs.cpp:2459
void SetDependency(const string &arg1, EDependency dep, const string &arg2)
Define a dependency.
Definition: ncbiargs.cpp:2618
void AddOptionalKey(const string &name, const string &synopsis, const string &comment, EType type, TFlags flags=0)
Add description for optional key without default value.
Definition: ncbiargs.cpp:2427
@ eRequires
One argument requires another.
Definition: ncbiargs.hpp:956
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
#define NCBI_ASSERT(expr, mess)
Definition: ncbidbg.hpp:130
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
virtual bool Equals(const CSerialObject &object, ESerialRecursionMode how=eRecursive) const
Check if both objects contain the same values.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
bool IsSetDb(void) const
ids in taxonomic or culture dbases Check if a value has been assigned to Db data member.
Definition: Org_ref_.hpp:479
TDb & SetDb(void)
Assign a value to Db data member.
Definition: Org_ref_.hpp:497
void SetOrgname(TOrgname &value)
Assign a value to Orgname data member.
Definition: Org_ref_.cpp:87
USING_SCOPE(objects)
short int TTaxRank
Primitive types for some taxon1 object fields.
Definition: taxon1.hpp:52
Modified on Tue May 21 10:56:59 2024 by modify_doxy.py rev. 669887