NCBI C++ ToolKit
local_taxon.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: local_taxon.cpp 102870 2024-08-02 12:24:56Z ivanov $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Eyal Mozes
27  *
28  * File Description:
29  * Class for getting Taxonomy data from local SQLite file
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 
38 
39 #include <serial/serial.hpp>
40 #include <serial/objistr.hpp>
41 
44 
47 
48 
50 {
51  arg_desc.AddOptionalKey("taxon-db", "TaxonDBFile",
52  "SQLite file containing taxon database, to use "
53  "instead of CTaxon1 service",
55 
56  arg_desc.AddFlag("fallback-to-taxon-service",
57  "If organism not found in SQLIlte database, fall back to "
58  "CTaxon1 service");
59  arg_desc.SetDependency("fallback-to-taxon-service",
60  CArgDescriptions::eRequires, "taxon-db");
61 }
62 
63 CLocalTaxon::CLocalTaxon() : m_db_supports_synonym(false)
64 {
65  /// Initializing without command-line arguments; use Taxon server
66  m_TaxonConn.reset(new CTaxon1);
67  m_TaxonConn->Init();
68 }
69 
70 CLocalTaxon::CLocalTaxon(const CArgs &args) : m_db_supports_synonym(false)
71 {
72  if (args["taxon-db"]) {
73  m_SqliteConn.reset(new CSQLITE_Connection(args["taxon-db"].AsString(),
80  m_fallback = args["fallback-to-taxon-service"];
81  } else {
82  m_TaxonConn.reset(new CTaxon1);
83  m_TaxonConn->Init();
84  }
85 }
86 
88 {
89 }
90 
92 {
93  static CSafeStatic<CLocalTaxon::TNodes> s_DummyNodes;
94  static CLocalTaxon::TNodeRef s_InvalidNode = s_DummyNodes->end();
95  return s_InvalidNode;
96 }
97 
99  : taxid(INVALID_TAX_ID)
100  , is_valid(false)
101  , parent(GetInvalidNode())
102  , genetic_code(-1)
103 {
104 }
105 
107 {
108 }
109 
111 {
112  if (m_SqliteConn.get()) {
113  x_Cache(taxid);
114  return m_Nodes.find(taxid)->second.is_valid;
115  } else {
116  return m_TaxonConn->GetTreeIterator(taxid);
117  }
118 }
119 
121 {
122  if (m_SqliteConn.get()) {
123  x_Cache(taxid);
124  TNodeRef parent = m_Nodes.find(taxid)->second.parent;
125  return parent == GetInvalidNode() ? ZERO_TAX_ID : parent->first;
126  } else {
127  return m_TaxonConn->GetParent(taxid);
128  }
129 }
130 
132 {
133  if (m_SqliteConn.get()) {
134  x_Cache(taxid);
135  return m_Nodes.find(taxid)->second.rank;
136  } else {
137  string rank_name;
138  auto node = m_TaxonConn->GetTreeIterator(taxid)->GetNode();
139  if (node) {
140  TTaxRank rank_id = node->GetRank();
141  m_TaxonConn->GetRankName(rank_id, rank_name);
142  }
143  return rank_name;
144  }
145 }
146 
148 {
149  if (m_SqliteConn.get()) {
150  x_Cache(taxid);
151  return m_Nodes.find(taxid)->second.scientific_name;
152  } else {
153  string scientific_name;
154  m_TaxonConn->GetScientificName(taxid, scientific_name);
155  return scientific_name;
156  }
157 }
158 
160 {
161  if (m_SqliteConn.get()) {
162  x_Cache(taxid);
163  return m_Nodes.find(taxid)->second.genetic_code;
164  } else {
165  return m_TaxonConn->GetTreeIterator(taxid)->GetNode()->GetGC();
166  }
167 }
168 
169 
170 namespace {
171 
172 // s_CopyDbTags(org, *new_org);
173 void s_CopyDbTags(const COrg_ref& org, COrg_ref& new_org)
174 {
175  if( ! org.IsSetDb() ) {
176  return;
177  }
178  new_org.SetDb().insert(
179  new_org.SetDb().end(),
180  const_cast<COrg_ref&>(org).SetDb().begin(),
181  const_cast<COrg_ref&>(org).SetDb().end()
182  );
183 
184  for (vector<CRef<CDbtag> >::iterator it1 = new_org.SetDb().begin();
185  it1 != new_org.SetDb().end(); ++it1) {
186 
187  vector<CRef<CDbtag> >::iterator it2 = it1;
188  for (++it2; it2 != new_org.SetDb().end(); ) {
189  if ((*it1)->Equals(**it2)) {
190  it2 = new_org.SetDb().erase(it2);
191  }
192  else {
193  ++it2;
194  }
195  }
196  }
197 }
198 
199 void s_RemoveTaxon(COrg_ref& org)
200 {
201  if( ! org.IsSetDb() ) {
202  return;
203  }
204  vector<CRef<CDbtag> >& dbs = org.SetDb();
205  ERASE_ITERATE(vector<CRef<CDbtag> >, it, dbs ) {
206  if ( (*it)->GetDb() == "taxon" ) {
207  VECTOR_ERASE(it, dbs);
208  }
209  }
210 }
211 
212 }
213 
214 
215 
216 
217 void CLocalTaxon::LookupMerge(objects::COrg_ref& org)
218 {
219  if (m_SqliteConn.get()) {
220  TTaxId taxid = ZERO_TAX_ID;
221  if( ! org.IsSetDb() ) {
222  taxid = GetTaxIdByOrgRef(org);
223  } else {
224  taxid = org.GetTaxId();
225  }
226  if ( taxid <= ZERO_TAX_ID ) {
227  NCBI_THROW(CException, eUnknown, "s_UpdateOrgRef: organism does not contain tax id or has unequivocal registered taxonomy name");
228  }
229 
230  CConstRef<COrg_ref> public_org = GetOrgRef(taxid);
231  CRef<COrg_ref> new_org(new COrg_ref);
232  new_org->Assign(*public_org);
233  if (org.IsSetOrgname() && org.GetOrgname().IsSetMod()) {
234  new_org->SetOrgname().SetMod() =
235  org.GetOrgname().GetMod();
236  }
237  if ( !new_org->Equals(org) ) {
238  s_RemoveTaxon(org);
239  s_CopyDbTags(org, *new_org);
240  org.Assign(*new_org);
241  }
242  }
243  else {
244  m_TaxonConn->LookupMerge(org);
245  }
246 }
247 
248 
250 {
251  if (m_SqliteConn.get()) {
252  x_Cache(taxid, true);
253  return m_Nodes.find(taxid)->second.org_ref;
254  } else {
255  bool is_species, is_uncultured;
256  string blast_name;
257  return m_TaxonConn->GetOrgRef(taxid, is_species, is_uncultured, blast_name);
258  }
259 }
260 
262 {
263  if (m_SqliteConn.get()) {
264  TInternalLineage lineage;
265  x_GetLineage(taxid, lineage);
266  for (TNodeRef ancestor : lineage) {
267  if (ancestor->second.rank == rank) {
268  return ancestor->first;
269  }
270  }
271  return ZERO_TAX_ID;
272  } else {
273  if (m_fallback && !m_TaxonConn.get()) {
274  m_TaxonConn.reset(new CTaxon1);
275  m_TaxonConn->Init();
276  }
277  if (m_TaxonConn.get()) {
278  return m_TaxonConn->GetAncestorByRank(taxid, rank.c_str());
279  }
280  }
281 NCBI_THROW(CException, eUnknown, "CLocalTaxon: neither local nor remote connections available");
282 }
283 
285 {
286  if (inp_orgRef.IsSetDb()) {
287  return inp_orgRef.GetTaxId();
288  }
289  if (m_fallback && !m_TaxonConn.get()) {
290  m_TaxonConn.reset(new CTaxon1);
291  m_TaxonConn->Init();
292  }
293  if (m_TaxonConn.get()) {
294  return m_TaxonConn->GetTaxIdByOrgRef(inp_orgRef);
295  } else {
297  "GetTaxIdByOrgRef not supported for local execution");
298  }
299 }
300 
302 {
303  TLineage lineage;
304  if (m_SqliteConn.get()) {
305  TInternalLineage internal_lineage;
306  x_GetLineage(taxid, internal_lineage);
307  for (TNodeRef ancestor : internal_lineage) {
308  lineage.push_back(ancestor->first);
309  }
310  } else {
311  for (TTaxid ancestor = taxid; ancestor > ZERO_TAX_ID;
312  ancestor = m_TaxonConn->GetParent(ancestor))
313  {
314  lineage.push_back(ancestor);
315  }
316  reverse(lineage.begin(), lineage.end());
317  }
318  return lineage;
319 }
320 
322 {
323  if (m_SqliteConn.get()) {
324  TLineage lineage1 = GetLineage(taxid1);
325  TLineage lineage2 = GetLineage(taxid2);
326  TLineage::const_iterator it1 = lineage1.begin();
327  TLineage::const_iterator it2 = lineage2.begin();
328  CLocalTaxon::TTaxid join_taxid = 0;
329  for ( ; it1 != lineage1.end() && it2 != lineage2.end() &&
330  *it1 == *it2;
331  ++it1, ++it2) {
332  join_taxid = *it1;
333  }
334  return join_taxid;
335  } else {
336  return m_TaxonConn->Join(taxid1, taxid2);
337  }
338 }
339 
341 {
342  TTaxid taxid = INVALID_TAX_ID;
343  if (m_SqliteConn.get()) {
344  x_Cache(orgname);
345  auto& taxnode = m_ScientificNameIndex.find(orgname)->second;
346  taxid = taxnode.is_valid ? taxnode.taxid : INVALID_TAX_ID;
347  } else {
348  taxid = m_TaxonConn->GetTaxIdByName(orgname);
349  }
350  return taxid;
351 
352 }
353 
354 list<string> CLocalTaxon::GetSynonyms(TTaxId taxid)
355 {
356  if (m_SqliteConn.get()) {
357  x_Cache(taxid);
358  return m_Nodes.find(taxid)->second.synonyms;
359  } else {
360  list<string> lNames; // TNameList - second parameter to GetAllNames is currently list<string>
361  // we are using false because currently all
362  // other usages of this API in gpipe code is with this value:
363  m_TaxonConn->GetAllNames(taxid, lNames, false);
364  return lNames;
365  }
366 }
367 
368 //
369 // Implementation
370 //
371 
373 {
374  NCBI_ASSERT(m_SqliteConn.get(), "x_Cache called with server execution");
375 
377  if (it == m_ScientificNameIndex.end() ) {
378  //
379  // do the case-insensitive comparison
380  //
381  string sql = "SELECT taxid FROM TaxidInfo WHERE scientific_name = ?1 COLLATE NOCASE ";
383  sql += " UNION "
384  "SELECT taxid FROM Synonym WHERE scientific_name = ?1 COLLATE NOCASE ";
385  }
386 
388  stmt.Bind(1, orgname);
389  stmt.Execute();
390  TTaxId taxid = ZERO_TAX_ID;
391  if (stmt.Step()) {
392  taxid = TAX_ID_FROM(int, stmt.GetInt(0));
393  } else if (m_fallback) {
394  if (!m_TaxonConn.get()) {
395  m_TaxonConn.reset(new CTaxon1);
396  m_TaxonConn->Init();
397  }
398  taxid = m_TaxonConn->GetTaxIdByName(orgname);
399  }
400  if (taxid > ZERO_TAX_ID) {
401  CLocalTaxon::TNodeRef it2 = x_Cache(taxid);
402  it = m_ScientificNameIndex.insert(TScientificNameIndex::value_type(orgname, it2->second )).first;
403  } else {
404  //
405  // return invalid node.
406  //
408  }
409  }
410  return it;
411 }
412 
413 
414 CLocalTaxon::TNodeRef CLocalTaxon::x_Cache(TTaxid taxid, bool including_org_ref)
415 {
416  NCBI_ASSERT(m_SqliteConn.get(), "x_Cache called with server execution");
417 
418  TNodes::iterator it = m_Nodes.find(taxid);
419  if (it != m_Nodes.end() && (!including_org_ref || it->second.org_ref))
420  {
421  return it;
422  }
423 
424  if (it == m_Nodes.end()) {
425  TTaxId parent = INVALID_TAX_ID;
426  //
427  // Note that we are unconditionally recording (so far) unknown input taxid here
428  // thereby caching all successful and unsuccessful queries
429  //
430  it = m_Nodes.insert(TNodes::value_type(taxid, STaxidNode())).first;
431  it->second.taxid = taxid;
432  {{
434  (m_SqliteConn.get(),
435  "SELECT scientific_name, rank, parent, genetic_code "
436  "FROM TaxidInfo "
437  "WHERE taxid = ? ");
438  stmt.Bind(1, TAX_ID_TO(TIntId, taxid));
439  stmt.Execute();
440  if (stmt.Step()) {
441  it->second.is_valid = true;
442  it->second.scientific_name = stmt.GetString(0);
443  it->second.rank = stmt.GetString(1);
444  if (it->second.rank.empty()) {
445  it->second.rank = "no rank";
446  }
447  parent = TAX_ID_FROM(int, stmt.GetInt(2));
448  it->second.genetic_code = stmt.GetInt(3);
449  CSQLITE_Statement syn_stmt
450  (m_SqliteConn.get(),
451  "SELECT scientific_name "
452  "FROM Synonym "
453  "WHERE taxid = ? ");
454  syn_stmt.Bind(1, TAX_ID_TO(TIntId, taxid));
455  syn_stmt.Execute();
456  while (syn_stmt.Step()) {
457  it->second.synonyms.push_back( syn_stmt.GetString(0));
458  }
459  } else if (m_fallback) {
460  if (!m_TaxonConn.get()) {
461  m_TaxonConn.reset(new CTaxon1);
462  m_TaxonConn->Init();
463  }
464  if (m_TaxonConn->GetScientificName(taxid,
465  it->second.scientific_name))
466  {
467  it->second.is_valid = true;
468  TTaxRank rank_id = m_TaxonConn->GetTreeIterator(taxid)
469  ->GetNode()->GetRank();
470  m_TaxonConn->GetRankName(rank_id, it->second.rank);
471  it->second.genetic_code =
472  m_TaxonConn->GetTreeIterator(taxid)->GetNode()->GetGC();
473  m_TaxonConn->GetAllNames(taxid, it->second.synonyms, true);
474  parent = m_TaxonConn->GetParent(taxid);
475  }
476  }
477  }}
478 
479  if (parent > TAX_ID_CONST(1)) {
480  // Recursively get information for parent; no need for Org_ref, even
481  // / if it was requested for child node
482  it->second.parent = x_Cache(parent);
483  }
484  }
485 
486  if (it->second.is_valid && including_org_ref) {
488  (m_SqliteConn.get(),
489  "SELECT org_ref_asn "
490  "FROM TaxidInfo "
491  "WHERE taxid = ? ");
492  stmt.Bind(1, TAX_ID_TO(TIntId, taxid));
493  stmt.Execute();
494  stmt.Step();
495  string org_ref_asn = stmt.GetString(0);
496  if (!org_ref_asn.empty()) {
497  CNcbiIstrstream istr(org_ref_asn);
498  CRef<COrg_ref> org_ref(new COrg_ref);
499  istr >> MSerial_AsnText >> *org_ref;
500  it->second.org_ref = org_ref;
501  } else if (m_fallback) {
502  if (!m_TaxonConn.get()) {
503  m_TaxonConn.reset(new CTaxon1);
504  m_TaxonConn->Init();
505  }
506  bool is_species, is_uncultured;
507  string blast_name;
508  it->second.org_ref = m_TaxonConn->GetOrgRef(taxid, is_species,
509  is_uncultured, blast_name);
510  }
511  }
512 
513  return it;
514 }
515 
517 {
518  TNodeRef it = x_Cache(taxid);
519  if (!it->second.is_valid) {
520  return;
521  }
522  lineage.push_front(it);
523  while(lineage.front()->second.parent != GetInvalidNode()) {
524  lineage.push_front(lineage.front()->second.parent);
525  }
526 }
527 
529 {
531  "SELECT count(*) FROM sqlite_master WHERE type='table' AND name='Synonym'");
532  stmt.Execute();
533  stmt.Step();
534  return stmt.GetInt(0) > 0;
535 }
536 
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
CConstRef –.
Definition: ncbiobj.hpp:1266
string GetRank(TTaxid taxid)
unique_ptr< objects::CTaxon1 > m_TaxonConn
short int GetGeneticCode(TTaxid taxid)
TTaxid Join(TTaxid taxid1, TTaxid taxid2)
static TNodeRef GetInvalidNode()
Definition: local_taxon.cpp:91
void LookupMerge(objects::COrg_ref &org)
list< TNodeRef > TInternalLineage
Definition: local_taxon.hpp:96
TLineage GetLineage(TTaxid taxid)
string GetScientificName(TTaxid taxid)
static void AddArguments(CArgDescriptions &arg_desc)
Definition: local_taxon.cpp:49
list< string > GetSynonyms(TTaxId taxid)
TNodeRef x_Cache(TTaxid taxid, bool including_org_ref=false)
CConstRef< objects::COrg_ref > GetOrgRef(TTaxid taxid)
TNodes::const_iterator TNodeRef
Definition: local_taxon.hpp:94
bool m_db_supports_synonym
vector< TTaxid > TLineage
Definition: local_taxon.hpp:53
TTaxid GetTaxIdByOrgRef(const objects::COrg_ref &inp_orgRef)
void x_GetLineage(TTaxid taxid, TInternalLineage &lineage)
TTaxid GetAncestorByRank(TTaxid taxid, const string &rank)
TNodes m_Nodes
TTaxId TTaxid
Definition: local_taxon.hpp:52
unique_ptr< CSQLITE_Connection > m_SqliteConn
TTaxid GetParent(TTaxid taxid)
TScientificNameIndex m_ScientificNameIndex
TScientificNameIndex::const_iterator TScientificNameRef
Definition: local_taxon.hpp:95
bool IsValidTaxid(TTaxid taxid)
bool x_SupportsSynonym()
TTaxid GetTaxIdByName(const string &orgname)
TTaxId GetTaxId() const
Definition: Org_ref.cpp:72
Connection to SQLite database.
@ fExternalMT
Object and all statements and blobs created on top of it will not be used from different threads simu...
@ fJournalOff
Journaling is completely off (not recommended - transactions cannot be rolled back unless they consist...
@ fSyncOff
Synchronization is off, database can be corrupted on OS crash or power outage.
@ fVacuumOff
Vacuuming is off, database file can only grow.
@ fTempToMemory
Mode of storing temporary data.
SQL statement executing on SQLite database.
void Bind(int index, int val)
Bind integer value to parameter index.
bool Step(void)
Step through results of the statement.
void Execute(void)
Execute statement without returning any result.
string GetString(int col_ind) const
Get text value from column col_ind in current row.
CSafeStatic<>::
const_iterator end() const
Definition: map.hpp:152
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
const_iterator find(const key_type &key) const
Definition: map.hpp:153
static bool is_valid(const char *num, int type, CONV_RESULT *cr)
#define false
Definition: bool.h:36
static char sql[1024]
Definition: putdata.c:19
static HSTMT stmt
Definition: rebindpar.c:12
#define TAX_ID_CONST(id)
Definition: ncbimisc.hpp:1112
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
#define ERASE_ITERATE(Type, Var, Cont)
Non-constant version with ability to erase current element, if container permits.
Definition: ncbimisc.hpp:843
#define INVALID_TAX_ID
Definition: ncbimisc.hpp:1116
#define TAX_ID_TO(T, tax_id)
Definition: ncbimisc.hpp:1110
Int8 TIntId
Definition: ncbimisc.hpp:999
#define VECTOR_ERASE(Var, Cont)
Use this macro inside body of ERASE_ITERATE cycle to erase from vector-like container.
Definition: ncbimisc.hpp:852
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
#define TAX_ID_FROM(T, value)
Definition: ncbimisc.hpp:1111
void AddFlag(const string &name, const string &comment, CBoolEnum< EFlagValue > set_value=eFlagHasValueIfSet, TFlags flags=0)
Add description for flag argument.
Definition: ncbiargs.cpp:2459
void SetDependency(const string &arg1, EDependency dep, const string &arg2)
Define a dependency.
Definition: ncbiargs.cpp:2618
void AddOptionalKey(const string &name, const string &synopsis, const string &comment, EType type, TFlags flags=0)
Add description for optional key without default value.
Definition: ncbiargs.cpp:2427
@ eRequires
One argument requires another.
Definition: ncbiargs.hpp:956
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
#define NCBI_ASSERT(expr, mess)
Definition: ncbidbg.hpp:130
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
virtual bool Equals(const CSerialObject &object, ESerialRecursionMode how=eRecursive) const
Check if both objects contain the same values.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
bool IsSetDb(void) const
ids in taxonomic or culture dbases Check if a value has been assigned to Db data member.
Definition: Org_ref_.hpp:479
TDb & SetDb(void)
Assign a value to Db data member.
Definition: Org_ref_.hpp:497
void SetOrgname(TOrgname &value)
Assign a value to Orgname data member.
Definition: Org_ref_.cpp:87
USING_SCOPE(objects)
Static variables safety - create on demand, destroy on application termination.
short int TTaxRank
Primitive types for some taxon1 object fields.
Definition: taxon1.hpp:52
Modified on Fri Sep 20 14:57:40 2024 by modify_doxy.py rev. 669887