NCBI C++ ToolKit
align_group.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: align_group.cpp 90551 2020-06-26 13:00:42Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Mike DiCuccio
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
40 #include <objmgr/util/sequence.hpp>
41 #include <serial/iterator.hpp>
42 
45 
47 {
48 }
49 
50 
52 {
53 }
54 
55 
57  TAnnotList& align_groups,
58  const string& annot_name_base,
59  CScope& scope)
60 {
61  TTaxAlignMap tax_aligns;
62  x_SeparateByTaxId(aligns, tax_aligns, scope);
63 
64  ///
65  /// now, package as separate annots
66  ///
67  NON_CONST_ITERATE (TTaxAlignMap, iter, tax_aligns) {
68  string tax_id_tag;
69  ITERATE (set<TTaxId>, it, iter->first) {
70  CConstRef<COrg_ref> org_ref = x_GetOrgRef(*it);
71  if ( !tax_id_tag.empty() ) {
72  tax_id_tag += "; ";
73  }
74  if (org_ref) {
75  org_ref->GetLabel(&tax_id_tag);
76  } else {
77  tax_id_tag += "unknown";
78  }
79 
80  tax_id_tag += " [taxid:" + NStr::NumericToString(*it) + "]";
81  }
82 
83  CRef<CSeq_annot> annot(new CSeq_annot);
84 
85  string name(annot_name_base);
86  if ( !name.empty() ) {
87  name += ": ";
88  }
89  name += tax_id_tag;
90  annot->SetName(name);
91  annot->SetData().SetAlign().swap(iter->second);
92  align_groups.push_back(annot);
93  }
94 }
95 
96 
98  TAnnotList& align_groups,
99  const string& annot_name_base,
100  CScope& scope)
101 {
102  TTaxAlignMap tax_aligns;
103  x_SeparateByTaxId(aligns, tax_aligns, scope);
104 
105  ///
106  /// now, package as separate annots
107  ///
108  CRef<CSeq_annot> mixed_annot;
109  NON_CONST_ITERATE (TTaxAlignMap, iter, tax_aligns) {
110  if (iter->first.size() == 1) {
111  string tax_id_tag;
112  TTaxId tax_id = *iter->first.begin();
113  CConstRef<COrg_ref> org_ref = x_GetOrgRef(tax_id);
114 
115  if ( !tax_id_tag.empty() ) {
116  tax_id_tag += "; ";
117  }
118  if (org_ref) {
119  org_ref->GetLabel(&tax_id_tag);
120  } else {
121  tax_id_tag += "unknown";
122  }
123 
124  tax_id_tag += " [taxid:" + NStr::NumericToString(tax_id) + "]";
125 
126  CRef<CSeq_annot> annot(new CSeq_annot);
127 
128  string name(annot_name_base);
129  if ( !name.empty() ) {
130  name += ": ";
131  }
132  name += tax_id_tag;
133  annot->SetName(name);
134  annot->SetData().SetAlign().swap(iter->second);
135  align_groups.push_back(annot);
136  } else {
137  if ( !mixed_annot ) {
138  mixed_annot.Reset(new CSeq_annot);
139  string name(annot_name_base);
140  if ( !name.empty() ) {
141  name += ": ";
142  }
143  name += "Mixed Taxa";
144  mixed_annot->SetName(name);
145  }
146  mixed_annot->SetData().SetAlign()
147  .insert(mixed_annot->SetData().SetAlign().end(),
148  iter->second.begin(), iter->second.end());
149  }
150  }
151 
152  if (mixed_annot) {
153  align_groups.push_back(mixed_annot);
154  }
155 }
156 
157 
158 void CAlignGroup::GroupBySeqIds(const TAlignList& alignments,
159  TAnnotList& align_groups,
160  const string& annot_name_base,
161  objects::CScope& scope,
163 {
164  /// typedefs for dealing with separations by sets of seq-ids
165  typedef set<CSeq_id_Handle> TSeqIds;
166  typedef map<TSeqIds, list< CRef<objects::CSeq_align> > > TSequenceAlignMap;
167 
168  ///
169  /// first, categorize these types
170  ///
171  TSequenceAlignMap seq_aligns;
172  ITERATE (TAlignList, iter, alignments) {
173  CRef<CSeq_align> align = *iter;
174 
175  TSeqIds ids;
176  CTypeConstIterator<CSeq_id> id_iter(*align);
177  for ( ; id_iter; ++id_iter) {
179  if (flags & fResolveToGi) {
180  CSeq_id_Handle gi_idh =
182  if (gi_idh) {
183  idh = gi_idh;
184  }
185  }
186  ids.insert(idh);
187  }
188 
189  seq_aligns[ids].push_back(align);
190  }
191 
192  ///
193  /// now, create annotations for these as needed
194  /// order here is important, as the binning is destructive
195  ///
196 
197  ITERATE (TSequenceAlignMap, iter, seq_aligns) {
198  if ( !iter->second.size() ) {
199  continue;
200  }
201 
202  string tag;
203  /// scan to see if the IDs contains something appropriate
204  ITERATE (TSeqIds, it, iter->first) {
205  if ( !tag.empty() ) {
206  tag += "x";
207  }
208  it->GetSeqId()->GetLabel(&tag,
211  }
212 
213  CRef<CSeq_annot> annot(new CSeq_annot);
214  annot->SetData().SetAlign()
215  .insert(annot->SetData().SetAlign().begin(),
216  iter->second.begin(), iter->second.end());
217 
218  string name(annot_name_base);
219  if ( !name.empty() ) {
220  name += ": ";
221  }
222  name += tag;
223  annot->SetName(name);
224  align_groups.push_back(annot);
225  }
226 }
227 
228 
/// Group alignments into bins keyed by the set of strands of their rows.
///
/// For every alignment, the strand of each row (as returned by
/// CSeq_align::GetSeqStrand()) is collected into a set; alignments whose
/// strand sets compare equal land in the same bin.  Each non-empty bin is
/// appended to align_groups as a new CSeq_annot named
/// "<annot_name_base>: <tag>" (or just "<tag>" when the base is empty),
/// where <tag> lists the strands of the bin joined by "/": "-" for
/// eNa_strand_minus and "+" for every other strand value.
///
/// @param alignments      alignments to categorize; the list is only read,
///                        and its CRef handles are copied into the new annots
/// @param align_groups    receives one annotation per strand set (appended)
/// @param annot_name_base optional prefix for the generated annotation names
/// @param scope           not referenced by this implementation
229 void CAlignGroup::GroupByStrand(const TAlignList& alignments,
230  TAnnotList& align_groups,
231  const string& annot_name_base,
232  objects::CScope& scope)
233 {
234  /// typedefs for dealing with separations by strand
235  typedef set<ENa_strand> TStrands;
236  typedef map<TStrands, list< CRef<objects::CSeq_align> > > TSequenceAlignMap;
237 
238  ///
239  /// first, bin each alignment by the set of strands of its rows
240  ///
241  TSequenceAlignMap seq_aligns;
242  ITERATE (TAlignList, iter, alignments) {
243  CRef<CSeq_align> align = *iter;
244 
245  TStrands strands;
246  CSeq_align::TDim rows = align->CheckNumRows();
247  for (CSeq_align::TDim i = 0; i < rows; ++i) {
248  strands.insert(align->GetSeqStrand(i));
249  }
250  seq_aligns[strands].push_back(align);
251  }
252 
253  ///
254  /// now, create one annotation per non-empty bin
255  /// (each bin is copied into its annotation; seq_aligns is left untouched)
256  ///
257 
258  ITERATE (TSequenceAlignMap, iter, seq_aligns) {
259  if ( !iter->second.size() ) {
260  continue;
261  }
262 
263  string tag;
264  /// build the tag: "-" for minus, "+" for any other strand, "/"-separated
265  ITERATE (TStrands, it, iter->first) {
266  if ( !tag.empty() ) {
267  tag += "/";
268  }
269  switch (*it) {
270  case eNa_strand_minus:
271  tag += "-";
272  break;
273  default:
274  tag += "+";
275  break;
276  }
277  }
278 
279  CRef<CSeq_annot> annot(new CSeq_annot);
280  annot->SetData().SetAlign()
281  .insert(annot->SetData().SetAlign().begin(),
282  iter->second.begin(), iter->second.end());
283 
284  string name(annot_name_base);
285  if ( !name.empty() ) {
286  name += ": ";
287  }
288  name += tag;
289  annot->SetName(name);
290  align_groups.push_back(annot);
291  }
292 }
293 
294 
296  TAnnotList& align_groups,
297  const string& annot_name_base,
298  objects::CScope& scope,
300 {
301  /// typedefs for dealing with separations by sequence types
302  typedef set<int> TSequenceTypes;
303  typedef map<TSequenceTypes, list< CRef<objects::CSeq_align> > > TSequenceAlignMap;
304 
305  ///
306  /// first, categorize these types
307  ///
308  TSequenceAlignMap seq_aligns;
309  ITERATE (TAlignList, iter, alignments) {
310  CRef<CSeq_align> align = *iter;
311 
312  TSequenceTypes types;
313  CTypeConstIterator<CSeq_id> id_iter(*align);
314  for ( ; id_iter; ++id_iter) {
316  idh = sequence::GetId(idh, scope, sequence::eGetId_Best);
317 
318  string id_str;
319  idh.GetSeqId()->GetLabel(&id_str);
321  TSequenceFlags this_flags = 0;
322 
323  /// EST alignments: in EST division
324  if (flags & fEST && !this_flags ) {
326  types.insert(fEST);
327  this_flags |= fEST;
328  }
329  }
330 
331  /// WGS alignments: in WGS division
332  if (flags & fWGS && !this_flags ) {
334  types.insert(fWGS);
335  this_flags |= fWGS;
336  }
337  }
338 
339  /// HTGS alignments: in HTGS division
340  if (flags & fHTGS && !this_flags ) {
342  types.insert(fHTGS);
343  this_flags |= fHTGS;
344  }
345  }
346 
347  /// Patent alignments: in Patent division
348  if (flags & fPatent && !this_flags ) {
350  types.insert(fPatent);
351  this_flags |= fPatent;
352  }
353  }
354 
355  /// RefSeq predicted alignments:
356  /// accession type = other and predicted flag set
357  /// this must precede regular refseq!
358  if (flags & fRefSeqPredicted && !this_flags ) {
361  types.insert(fRefSeqPredicted);
362  this_flags |= fRefSeqPredicted;
363  }
364  }
365 
366  /// RefSeq alignments:
367  /// accession type = other
368  if (flags & fRefSeq && !this_flags ) {
370  types.insert(fRefSeq);
371  this_flags |= fRefSeq;
372  }
373  }
374 
375  /// GenBank/EMBL/DDBJ alignments:
376  /// accession type = GenBank, EMBL, or DDBJ
377  if (flags & fGB_EMBL_DDBJ && !this_flags ) {
378  bool is_gb = false;
382  if (is_gb) {
383  types.insert(fGB_EMBL_DDBJ);
384  this_flags |= fGB_EMBL_DDBJ;
385  }
386  }
387 
388  if (this_flags) {
389  types.insert(this_flags);
390  }
391  }
392 
393  seq_aligns[types].push_back(align);
394  }
395 
396  ///
397  /// now, create annotations for these as needed
398  /// order here is important, as the binning is destructive
399  ///
400 
401  ITERATE (TSequenceAlignMap, iter, seq_aligns) {
402  if ( !iter->second.size() ) {
403  continue;
404  }
405 
406  string tag;
407  /// build a tag naming each sequence type present in this bin
408  ITERATE (TSequenceTypes, it, iter->first) {
409  if ( !tag.empty() ) {
410  tag += "/";
411  }
412  switch (*it) {
413  case fEST:
414  tag += "EST";
415  break;
416  case fWGS:
417  tag += "WGS";
418  break;
419  case fHTGS:
420  tag += "HTGS";
421  break;
422  case fPatent:
423  tag += "Patent";
424  break;
425  case fRefSeq:
426  tag += "RefSeq";
427  break;
428  case fRefSeqPredicted:
429  tag += "Predicted RefSeq";
430  break;
431  case fGB_EMBL_DDBJ:
432  tag += "GenBank-EMBL-DDBJ";
433  break;
434 
435  default:
436  tag += "Other";
437  break;
438  }
439  }
440 
441  CRef<CSeq_annot> annot(new CSeq_annot);
442  annot->SetData().SetAlign()
443  .insert(annot->SetData().SetAlign().begin(),
444  iter->second.begin(), iter->second.end());
445 
446  string name(annot_name_base);
447  if ( !name.empty() ) {
448  name += ": ";
449  }
450  name += tag;
451  annot->SetName(name);
452  align_groups.push_back(annot);
453  }
454 }
455 
456 
458  TTaxAlignMap& tax_aligns,
459  CScope& scope)
460 {
461  ITERATE (TAlignList, iter, alignments) {
462  CRef<CSeq_align> align = *iter;
463 
464  TTaxIds ids;
465  CTypeConstIterator<CSeq_id> id_iter(*align);
466  for ( ; id_iter; ++id_iter) {
468  TTaxId tax_id = x_GetTaxId(idh, scope);
469  ids.insert(tax_id);
470  }
471 
472  tax_aligns[ids].push_back(align);
473  }
474 }
475 
476 
478 {
479  TTaxId tax_id = ZERO_TAX_ID;
480  try {
481  CBioseq_Handle bsh = scope.GetBioseqHandle(id);
482  tax_id = sequence::GetTaxId(bsh);
483  if ( tax_id == ZERO_TAX_ID ) {
484  if ( !m_Taxon1.get() ) {
485  m_Taxon1.reset(new CTaxon1);
486  m_Taxon1->Init();
487  }
488  CSeq_id_Handle gi_idh =
490  m_Taxon1->GetTaxId4GI(gi_idh.GetGi(), tax_id);
491  }
493  }
494  catch (CException&) {
495  }
496  return tax_id;
497 }
498 
499 
501 {
502  CConstRef<COrg_ref> org_ref;
503  TTaxInfoMap::iterator tax_iter = m_TaxInfo.find(tax_id);
504  if (tax_iter == m_TaxInfo.end()) {
505  if (tax_id != ZERO_TAX_ID) {
506  if ( !m_Taxon1.get() ) {
507  m_Taxon1.reset(new CTaxon1);
508  m_Taxon1->Init();
509  }
510  bool is_species;
511  bool is_uncultured;
512  string blast_name;
513  org_ref = m_Taxon1->GetOrgRef(tax_id, is_species,
514  is_uncultured, blast_name);
515  }
516 
517  if (org_ref) {
518  m_TaxInfo[tax_id] = org_ref;
519  }
520  } else {
521  org_ref = tax_iter->second;
522  }
523  return org_ref;
524 }
525 
526 
528  CScope& scope)
529 {
530  TTaxId tax_id = x_GetTaxId(id, scope);
531  return x_GetOrgRef(tax_id);
532 }
533 
534 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
USING_SCOPE(objects)
void GroupBySequenceType(const TAlignList &aligns, TAnnotList &align_groups, const string &annot_base_name, objects::CScope &scope, TSequenceFlags flags=fSequenceDefaults)
Group alignments into sequence-related categories.
void GroupByLikeTaxIds(const TAlignList &aligns, TAnnotList &align_groups, const string &annot_base_name, objects::CScope &scope)
Separate a set of alignments into groups that describe how the alignments relate taxonomically.
Definition: align_group.cpp:97
TTaxId x_GetTaxId(const objects::CSeq_id_Handle &id, objects::CScope &scope)
void GroupByTaxIds(const TAlignList &aligns, TAnnotList &align_groups, const string &annot_base_name, objects::CScope &scope)
Separate a set of alignments into groups that describe how the alignments relate taxonomically.
Definition: align_group.cpp:56
TTaxInfoMap m_TaxInfo
TTaxIdMap m_TaxIds
CConstRef< objects::COrg_ref > x_GetOrgRef(TTaxId tax_id)
unique_ptr< objects::CTaxon1 > m_Taxon1
void GroupBySeqIds(const TAlignList &aligns, TAnnotList &align_groups, const string &annot_base_name, objects::CScope &scope, TSeqIdFlags flags=0)
Group alignments into bins for each set of seq-ids.
void x_SeparateByTaxId(const TAlignList &alignments, TTaxAlignMap &tax_aligns, objects::CScope &scope)
void GroupByStrand(const TAlignList &aligns, TAnnotList &align_groups, const string &annot_base_name, objects::CScope &scope)
Group alignments into bins for each set of strands.
list< CRef< objects::CSeq_align > > TAlignList
Definition: align_group.hpp:55
list< CRef< objects::CSeq_annot > > TAnnotList
Definition: align_group.hpp:56
CBioseq_Handle –.
CConstRef –.
Definition: ncbiobj.hpp:1266
CScope –.
Definition: scope.hpp:92
TDim CheckNumRows(void) const
Validators.
Definition: Seq_align.cpp:73
ENa_strand GetSeqStrand(TDim row) const
Get strand (the first one if segments have different strands).
Definition: Seq_align.cpp:294
Template class for iteration on objects of class C (non-modifiable version)
Definition: iterator.hpp:767
const_iterator end() const
Definition: map.hpp:152
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: map.hpp:338
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
static uch flags
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
Definition: Seq_id.cpp:1634
void GetLabel(string *label, ELabelType type=eDefault, TLabelFlags flags=fLabel_Default) const
Append a label for this Seq-id to the supplied string.
Definition: Seq_id.cpp:2039
CConstRef< CSeq_id > GetSeqId(void) const
EAccessionInfo
For IdentifyAccession (below)
Definition: Seq_id.hpp:220
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
TGi GetGi(void) const
@ fLabel_Version
Show the version.
Definition: Seq_id.hpp:583
@ fLabel_GeneralDbIsContent
For type general, use the database name as the tag and the (text or numeric) key as the content.
Definition: Seq_id.hpp:586
@ eAcc_wgs
Definition: Seq_id.hpp:264
@ eAcc_htgs
Definition: Seq_id.hpp:262
@ eAcc_type_mask
Definition: Seq_id.hpp:222
@ fAcc_predicted
Definition: Seq_id.hpp:228
@ eAcc_div_patent
Definition: Seq_id.hpp:241
@ eAcc_est
Definition: Seq_id.hpp:239
@ eAcc_division_mask
Definition: Seq_id.hpp:273
@ eContent
Untagged human-readable accession or the like.
Definition: Seq_id.hpp:573
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
TTaxId GetTaxId(const CBioseq_Handle &handle)
return the tax-id associated with a given sequence.
Definition: sequence.cpp:274
@ eGetId_Best
return the "best" gi (uses FindBestScore(), with CSeq_id::CalculateScore() as the score function)
Definition: sequence.hpp:101
@ eGetId_ForceGi
return only a gi-based seq-id
Definition: sequence.hpp:99
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ e_Other
for historical reasons, 'other' = 'refseq'
Definition: Seq_id_.hpp:104
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
void SetName(const TName &value)
Assign a value to Name data member.
Definition: Seq_annot_.hpp:814
int i
static MDB_envinfo info
Definition: mdb_load.c:37
const char * tag
static const struct type types[]
Definition: type.c:22
Modified on Sat Dec 02 09:22:37 2023 by modify_doxy.py rev. 669887