NCBI C++ ToolKit
taxtree_tool_job.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: taxtree_tool_job.cpp 45164 2020-06-11 15:47:43Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Roman Katargin, Anatoliy Kuznetsov
27 */
28 
29 
30 #include <ncbi_pch.hpp>
31 
34 
36 
38 
41 
48 #include <objects/biotree/Node.hpp>
49 
50 #include <objmgr/util/sequence.hpp>
51 
52 #include <gui/objutils/label.hpp>
54 
57 
58 ///////////////////////////////////////////////////////////////////////////////
59 /// CTaxTreeToolJob constructor: copies `params` into m_Params and sets the job description string (m_Descr).
60 CTaxTreeToolJob::CTaxTreeToolJob(const CTaxTreeToolParams& params) : m_Params(params)
61 {
62  m_Descr = "Common TaxTree Job";
63 }
64 
65 /// Sequence information record
66 ///
67 /// @internal
69 {
70  STaxSeqInfo() : tax_id(0) {}
71  STaxSeqInfo(int tax, string l) : tax_id(tax), label(l) {}
72 
73  int tax_id; ///< taxonomy id
74  string label; ///< sequence label
75 };
76 
78 {
80  if (seqs.size() == 0) {
81  LOG_POST("CTaxTreeToolJob: no input!");
82  return;
83  }
84 
85  CTaxon1 taxon1;
86  bool tinit_ok = taxon1.Init();
87  if (!tinit_ok) {
88  return; // TODO: throw an exception
89  }
90 
92  CTaxon1::TTaxIdList tax_id_list;
93 
94 
95  // sequence-id to tax-id map
96  typedef map<CSeq_id_Handle, STaxSeqInfo> TSeqId2Tax;
97  TSeqId2Tax seq2tax;
98 
99 
100  CScope* scope = seqs[0].scope.GetPointer();
101 
102  // Get Tax-ids using bulk-interface
103  //
104  map<CSeq_id_Handle, int> bio2tax;
105  try {
106  CScope::TSeq_id_Handles handles;
107  ITERATE(TConstScopedObjects, it, seqs) {
108  const CObject* object = it->object.GetPointer();
109 
110  const CSeq_id* seq_id = 0;
111  const CSeq_loc* seq_loc = 0;
112 
113  string seqid_label;
114  seq_id = dynamic_cast<const CSeq_id*>(object);
115  if (!seq_id) {
116  seq_loc = dynamic_cast<const CSeq_loc*>(object);
117  if (!seq_loc) {
118  continue; // non sequence id object is ignored at this point
119  }
120  }
121 
122  bool handle_valid = false;
123  CBioseq_Handle bio_handle;
124  if (seq_id) {
125  bio_handle = scope->GetBioseqHandle(*seq_id);
126  handle_valid = true;
127  }
128  else {
129  if (seq_loc && seq_loc->GetId()) {
130  bio_handle = scope->GetBioseqHandle(*seq_loc->GetId());
131  handle_valid = true;
132  }
133  }
134 
135  if (handle_valid) {
136  CSeq_id_Handle canonical_seq_id_h =
137  sequence::GetId(bio_handle,
139  handles.push_back(canonical_seq_id_h);
140  }
141 
142  } // ITERATE
143  CScope::TTaxIds taxid_vector = scope->GetTaxIds(handles);
144  unsigned j = 0;
145  ITERATE(CScope::TTaxIds, it, taxid_vector) {
146  CSeq_id_Handle bh = handles[j];
147  bio2tax[bh] = *it;
148  ++j;
149  }
150 
151  } catch (std::exception&) {
152  // it is safe to ignore exceptions here, because we just can use non-bulk interface
153  }
154 
155 
156 
157  // Main algorithm here
158 
159  ITERATE(TConstScopedObjects, it, seqs) {
160  const CObject* object = it->object.GetPointer();
161 
162  const CSeq_id* seq_id = 0;
163  const CSeq_loc* seq_loc = 0;
164 
165  string seqid_label;
166  seq_id = dynamic_cast<const CSeq_id*>(object);
167  if (!seq_id) {
168  seq_loc = dynamic_cast<const CSeq_loc*>(object);
169  if (!seq_loc) {
170  continue; // non sequence id object is ignored at this point
171  }
172  }
173 
174  CBioseq_Handle bio_handle;
175  if (seq_id) {
176  bio_handle = scope->GetBioseqHandle(*seq_id);
177  }
178  else {
179  if (seq_loc && seq_loc->GetId()) {
180  bio_handle = scope->GetBioseqHandle(*seq_loc->GetId());
181  }
182  }
183 
184  CLabel::GetLabel(*bio_handle.GetSeqId(), &seqid_label,
185  CLabel::eDefault, scope);
186 
187  CSeq_id_Handle canonical_seq_id_h =
188  sequence::GetId(bio_handle,
190 
191 
192  // ------------------------------------------------------------
193  // Get the tax id for the sequence
194  //
195 
196  int tax_id = 0;
197 
198  map<CSeq_id_Handle, int>::const_iterator tax_it = bio2tax.find(canonical_seq_id_h);
199  if (tax_it != bio2tax.end()) {
200  tax_id = (*tax_it).second;
201  } else {
202  tax_id = sequence::GetTaxId(bio_handle);
203  }
204 
205  // if not, try the tax server
206  if ( !tax_id ) {
207  TGi gi = ZERO_GI;
208  if ( seq_id && seq_id->IsGi() ) {
209  gi = seq_id->GetGi();
210  }
211  if (gi == ZERO_GI) {
212  gi = scope->GetGi(canonical_seq_id_h);
213  }
214  if (gi == ZERO_GI) {
215  CConstRef<CSeq_id> seq_id = bio_handle.GetNonLocalIdOrNull();
216  if (seq_id && seq_id->IsGi())
217  gi = seq_id->GetGi();
218  }
219 
220  if (gi != ZERO_GI) {
221  taxon1.GetTaxId4GI(gi, tax_id);
222  if (!tax_id) {
223  if (!taxon1.IsAlive()) {
224  const string& err = taxon1.GetLastError();
225  ERR_POST(err);
226  }
227  }
228  }
229  }
230  if (tax_id) {
231  // check if tax id is present
232  if (taxbv[tax_id]) {
233 
234  } else {
235  tax_id_list.push_back(tax_id);
236  taxbv[tax_id] = true;
237  }
238  seq2tax[canonical_seq_id_h] = STaxSeqInfo(tax_id, seqid_label);
239  }
240  else {
241  ERR_POST("CTaxTreeToolJob: No tax_id for " << seqid_label);
242  }
243  } // ITERATE
244 
245  // ----------------------------------------------------
246  // use Taxon client to get the taxonomic tree
247  //
248 
249  CTaxon1 taxon2;
250  taxon2.Init();
251 
252  if (tax_id_list.size() == 0) {
254  "Can't generate common tree for sequences:\n"
255  "No taxonomy IDs found.");
256  }
257 
258  CTaxon1::TTaxIdList tax_ids_join;
259 
260  if (tax_id_list.size() == 1) {
261  tax_ids_join = tax_id_list;
262  } else {
263  if ( !taxon2.GetPopsetJoin(tax_id_list, tax_ids_join) ) {
265  "Can't generate common tree for sequences");
266  }
267  }
268 
269  if (tax_ids_join.size() == 0) {
271  "Can't generate common tree for sequences:\n"
272  "No taxonomy IDs found.");
273  }
274 
275  // load the common tree to taxon
276  //
277  bool tax_load_ok = false;
278  ITERATE(vector<int>, it, tax_ids_join) {
279  int tax_id = *it;
280  tax_load_ok |= taxon2.LoadNode(tax_id);
281  }
282 
283  if (!tax_load_ok) {
285  "Can't generate common tree for sequences:\n"
286  "Taxonomic load was not successfull.");
287  }
288 
289 
290  // convert taxon-tree to bio-tree
291  //
292  CRef<ITreeIterator> tax_tree_iter(taxon2.GetTreeIterator());
293 
295  typedef
297  CTaxon1,
298  ITaxon1Node,
300  TTaxon1Conv;
301 
302  TTaxon1Conv conv_func;
303  conv_func(*btc, tax_tree_iter);
304  btc->SetTreetype("CommonTaxTree");
305 
306  // add fan of sequences to the appropriate taxonomy leafs
307  //
308  {{
309  int max_tax_id = conv_func.GetMaxNodeId();
310  // here we take max tax id and just keep counting from that...
311  // not sure if this is a good idea to mix-in fake taxids this way?
312  ++max_tax_id;
313 
314  TTaxon1Conv::TTaxon1Visitor::TNodeList& node_list = btc->SetNodes().Set();
315  ITERATE(TSeqId2Tax, mit, seq2tax) {
316  string seqid_str = mit->first.AsString();
317  int tax_id = mit->second.tax_id;
318  string tax_id_str = NStr::IntToString(tax_id);
319  const string& label = mit->second.label;
320 
321  // construct a sequence node and attach it to org
322  {{
323  CRef<TTaxon1Conv::TTaxon1Visitor::TCNode> cnode(new TTaxon1Conv::TTaxon1Visitor::TCNode);
324  cnode->SetId(max_tax_id);
325  cnode->SetParent(tax_id);
326  TTaxon1Conv::TTaxon1Visitor::TCNodeFeatureSet& fset = cnode->SetFeatures();
327  // seq-id
328  {{
330  cfeat(new TTaxon1Conv::TTaxon1Visitor::TCNodeFeature);
331  cfeat->SetFeatureid(eTaxTree_SeqId);
332  cfeat->SetValue(seqid_str);
333 
334  fset.Set().push_back(cfeat);
335  }}
336  // Label
337  {{
339  cfeat(new TTaxon1Conv::TTaxon1Visitor::TCNodeFeature);
340  cfeat->SetFeatureid(eTaxTree_Label);
341  cfeat->SetValue(label);
342 
343  fset.Set().push_back(cfeat);
344  }}
345  // tax-id
346  if (tax_id > 0) {
348  cfeat(new TTaxon1Conv::TTaxon1Visitor::TCNodeFeature);
349  cfeat->SetFeatureid(eTaxTree_TaxId);
350  cfeat->SetValue(tax_id_str);
351 
352  fset.Set().push_back(cfeat);
353  }
354  node_list.push_back(cnode);
355  }}
356 
357  ++max_tax_id;
358  }
359  }}
360 
361  // ----------------------------------------------------
362  // Prepare project item for the output
363 
364  CRef<CProjectItem> item(new CProjectItem());
365  item->SetItem().SetOther().Set(*btc);
366  string project_item_label("Common TaxTree (");
367  project_item_label += NStr::SizetToString(seqs.size());
368  project_item_label += " sequences)";
369  item->SetLabel(project_item_label);
370  AddProjectItem(*item);
371 
372 }
373 
375 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Things for representing and manipulating bio trees.
CBioseq_Handle –.
void AddProjectItem(objects::CProjectItem &item)
CObject –.
Definition: ncbiobj.hpp:180
CScope –.
Definition: scope.hpp:92
const string & GetLastError() const
Definition: taxon1.hpp:471
bool GetTaxId4GI(TGi gi, TTaxId &tax_id_out)
Definition: taxon1.cpp:1371
vector< TTaxId > TTaxIdList
Definition: taxon1.hpp:70
bool GetPopsetJoin(const TTaxIdList &ids_in, TTaxIdList &ids_out)
Definition: taxon1.cpp:1546
bool Init(void)
Definition: taxon1.cpp:101
bool LoadNode(TTaxId tax_id, const ITaxon1Node **ppNode=NULL)
Definition: taxon1.hpp:488
CRef< ITreeIterator > GetTreeIterator(EIteratorMode mode=eIteratorMode_Default)
Definition: taxon1.cpp:1715
bool IsAlive(void)
Definition: taxon1.cpp:1354
const_iterator end() const
Definition: map.hpp:152
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: map.hpp:338
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define ZERO_GI
Definition: ncbimisc.hpp:1088
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
CTaxTreeToolParams m_Params
TConstScopedObjects & SetObjects()
virtual void x_CreateProjectItems()
override this function in derived classes and populate m_Items.
CTaxTreeToolJob(const CTaxTreeToolParams &params)
CTaxTreeToolJob.
static void GetLabel(const CObject &obj, string *label, ELabelType type=eDefault)
Definition: label.cpp:140
string m_Descr
mutex to sync our internals
vector< SConstScopedObject > TConstScopedObjects
Definition: objects.hpp:65
@ eDefault
Definition: label.hpp:73
const CSeq_id * GetId(void) const
Get the id of the location; returns NULL if it has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
TTaxId GetTaxId(const CBioseq_Handle &handle)
return the tax-id associated with a given sequence.
Definition: sequence.cpp:274
@ eGetId_Canonical
Definition: sequence.hpp:114
vector< TTaxId > TTaxIds
Get taxonomy ids of sequences. Return -1 for sequences that aren't found. Return 0 for sequences that d...
Definition: scope.hpp:567
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
TGi GetGi(const CSeq_id_Handle &idh, TGetFlags flags=0)
Get GI of a sequence. Returns ZERO_GI if the sequence is not found or if it doesn't have GI.
Definition: scope.cpp:419
TTaxIds GetTaxIds(const TSeq_id_Handles &idhs, TGetFlags flags=0)
vector< CSeq_id_Handle > TSeq_id_Handles
Bulk retrieval methods. Common argument typedef - vector of requested ids.
Definition: scope.hpp:518
CConstRef< CSeq_id > GetSeqId(void) const
Get id which can be used to access this bioseq handle. Throws an exception if none is available.
CConstRef< CSeq_id > GetNonLocalIdOrNull(void) const
Find a non-local ID if present, consulting assembly details if all IDs for the overall sequence are l...
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static string SizetToString(size_t value, TNumToStringFlags flags=0, int base=10)
Convert size_t to string.
Definition: ncbistr.cpp:2742
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5078
@ eTaxTree_TaxId
@ eTaxTree_SeqId
@ eTaxTree_Label
static const char label[]
@ BM_GAP
GAP compression is ON.
Definition: bmconst.h:148
void SetNodes(TNodes &value)
Assign a value to Nodes data member.
void SetTreetype(const TTreetype &value)
Assign a value to Treetype data member.
void SetLabel(const TLabel &value)
Assign a value to Label data member.
void SetItem(TItem &value)
Assign a value to Item data member.
TGi GetGi(void) const
Get the variant data.
Definition: Seq_id_.hpp:889
bool IsGi(void) const
Check if variant Gi is selected.
Definition: Seq_id_.hpp:883
Compressed bitset (entry point to bm.h)
static SLJIT_INLINE sljit_ins l(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
Sequence information record.
STaxSeqInfo(int tax, string l)
int tax_id
taxonomy id
string label
sequence label
USING_SCOPE(objects)
Modified on Fri Sep 20 14:57:53 2024 by modify_doxy.py rev. 669887