NCBI C++ ToolKit
seqids_extractor.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef OBJTOOLS_ALNMGR___SEQIDS_EXTRACTOR__HPP
2 #define OBJTOOLS_ALNMGR___SEQIDS_EXTRACTOR__HPP
3 /* $Id: seqids_extractor.hpp 78930 2017-07-31 13:06:45Z dicuccio $
4 * ===========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 *
28 * Author: Kamen Todorov, NCBI
29 *
30 * File Description:
31 * Extract Seq-ids from Seq-align
32 *
33 * ===========================================================================
34 */
35 
36 
37 #include <corelib/ncbistd.hpp>
38 #include <corelib/ncbiobj.hpp>
39 
50 
54 
58 
59 
62 
63 
64 /// Default seq-id converter.
65 /// @sa CAlnSeqIdsExtract
66 template <class TAlnSeqId>
68 {
69 public:
70  TAlnSeqId* operator() (const CSeq_id& id) const {
71  return new TAlnSeqId(id);
72  }
73 };
74 
75 
76 /// Scope-aware seq-id converter. Sets bioseq handle for the id.
77 /// @sa CAlnSeqIdsExtract
78 template <class TAlnSeqId>
80 {
81 public:
83 
84  TAlnSeqId* operator() (const CSeq_id& id) const {
85  CRef<CAlnSeqId> aln_id(new TAlnSeqId(id));
86  if (m_Scope) {
87  aln_id->SetBioseqHandle(m_Scope->GetBioseqHandle(id));
88  }
89  return aln_id.Release();
90  }
91 
92 private:
94 };
95 
96 
97 /// IAlnSeqId extracting functor.
98 template <class TAlnSeqId, class TIdConverter = CAlnSeqIdConverter<TAlnSeqId> >
100 {
101 public:
103  CAlnSeqIdsExtract(const TIdConverter& id_conv) : m_IdConv(id_conv) {}
104 
105  typedef vector<TAlnSeqIdIRef> TIdVec;
106 
107  /// Extract ids for the alignment.
108  /// @param seq_align
109  /// Input seq-align.
110  /// @param id_vec
111  /// Output vector to receive ids for the alignment.
112  void operator()(const CSeq_align& seq_align,
113  TIdVec& id_vec) const
114  {
115  _ASSERT(id_vec.empty());
116 
117  typedef CSeq_align::TSegs TSegs;
118 
119  seq_align.Validate(true);
120 
121  const TSegs& segs = seq_align.GetSegs();
122 
123  switch (segs.Which()) {
124  case TSegs::e_Disc:
125  {
126  bool first_disc = true;
127  ITERATE(CSeq_align_set::Tdata, sa_it, segs.GetDisc().Get()) {
128  if (first_disc) {
129  first_disc = false;
130  this->operator()(**sa_it, id_vec);
131  }
132  else {
133  // Need to make sure ids are identical across all alignments
134  TIdVec next_id_vec;
135  this->operator()(**sa_it, next_id_vec);
136  if ( !IdVecEqual(id_vec, next_id_vec) ) {
137  NCBI_THROW(CAlnException, eInvalidSeqId,
138  "Inconsistent Seq-ids across the disc alignments.");
139  }
140  }
141  }
142  }
143  break;
144  case TSegs::e_Dendiag:
145  {
146  bool first_diag = true;
147  ITERATE(TSegs::TDendiag, diag_it, segs.GetDendiag()) {
148  const CDense_diag::TIds& ids = (*diag_it)->GetIds();
149  if (first_diag) {
150  id_vec.resize(ids.size());
151  }
152  else if (id_vec.size() != ids.size()) {
153  NCBI_THROW(CAlnException, eInvalidSeqId,
154  "Inconsistent Seq-ids.");
155  }
156  size_t row = 0;
157  ITERATE(CDense_diag::TIds, id_it, ids) {
158  if (first_diag) {
159  id_vec[row].Reset(NewAlnSeqId(**id_it));
160  }
161  else if (*id_vec[row] != TAlnSeqId(**id_it)) {
162  NCBI_THROW(CAlnException, eInvalidSeqId,
163  string("Inconsistent Seq-ids: ") +
164  id_vec[row]->AsString() + " != " +
165  TAlnSeqId(**id_it).AsString() + ".");
166  }
167  ++row;
168  }
169  first_diag = false;
170  }
171  }
172  break;
173  case TSegs::e_Denseg:
174  {
175  const CDense_seg::TIds& ids = segs.GetDenseg().GetIds();
176  id_vec.resize(ids.size());
177  for (size_t i = 0; i < ids.size(); ++i) {
178  id_vec[i].Reset(NewAlnSeqId(*ids[i]));
179  }
180  }
181  break;
182  case TSegs::e_Std:
183  {
184  bool first_seg = true;
185 
186  typedef CSeq_loc::TRange::position_type TLen;
187  typedef vector<TLen> TLenVec;
188 
189  ITERATE(TSegs::TStd, std_it, segs.GetStd()) {
190  const CStd_seg& std_seg = **std_it;
191  if (first_seg) {
192  id_vec.resize(std_seg.GetDim());
193  }
194  else if (id_vec.size() != (size_t) std_seg.GetDim()) {
195  NCBI_THROW(CAlnException, eInvalidAlignment,
196  "The Std-seg dim's need to be consistent.");
197  }
198  if (std_seg.GetLoc().size() != id_vec.size()) {
199  NCBI_THROW(CAlnException, eInvalidAlignment,
200  "Number of seq-locs inconsistent with dim.");
201  }
202  size_t i = 0;
203  TAlnSeqIdIRef id;
204 
205  // Will record the seg_lens here:
206  TLenVec seg_lens(std_seg.GetDim());
207 
208  ITERATE (CStd_seg::TLoc, loc_it, std_seg.GetLoc()) {
209  switch ((*loc_it)->Which()) {
210  case CSeq_loc::e_Empty:
211  id.Reset(NewAlnSeqId((*loc_it)->GetEmpty()));
212  break;
213  case CSeq_loc::e_Int:
214  id.Reset(NewAlnSeqId((*loc_it)->GetInt().GetId()));
215  break;
216  case CSeq_loc::e_Pnt:
217  id.Reset(NewAlnSeqId((*loc_it)->GetPnt().GetId()));
218  break;
219  default:
220  string err_str = string("Seq-loc of type ") +
221  (*loc_it)->SelectionName((*loc_it)->Which()) +
222  "is not supported.";
223  NCBI_THROW(CAlnException, eUnsupported, err_str);
224  }
225 
226  // Store the lengths
227  seg_lens[i] = (*loc_it)->GetTotalRange().GetLength();
228 
229  if (first_seg) {
230  id_vec[i].Reset(id);
231  }
232  else if (*id_vec[i] != *id) {
233  string err("Inconsistent Seq-ids found in seg ");
234  err += NStr::NumericToString(i) +
235  ". Excpected " + id_vec[i]->AsString() +
236  ", encountered " + id->AsString() + ".";
237  NCBI_THROW(CAlnException, eInvalidSeqId, err);
238  }
239  ++i;
240  }
241 
242  // Try to figure out the base_widths
243  // by examining the seg_lens
244  TLen min_len = 0;
245  TLen max_len = 0;
246  ITERATE(TLenVec, len_i, seg_lens) {
247  if (*len_i == 0) continue;
248  if (min_len == 0 || min_len > *len_i) {
249  min_len = *len_i;
250  }
251  if (max_len < *len_i) {
252  max_len = *len_i;
253  }
254  }
255  if (min_len < max_len) {
256  bool nuc_prot_diag = (min_len == max_len / 3 ||
257  min_len - 1 == max_len / 3 ?
258  true : false);
259  for (size_t i_lcl=0; i_lcl< seg_lens.size(); ++i_lcl) {
260  if ( nuc_prot_diag ) {
261  id_vec[i_lcl]->SetBaseWidth(
262  seg_lens[i_lcl] == min_len ? 3 : 1);
263  }
264  }
265  }
266 
267  first_seg = false;
268  }
269  }
270  break;
271  case TSegs::e_Packed:
272  {
273  const CPacked_seg::TIds& ids = segs.GetPacked().GetIds();
274  id_vec.resize(ids.size());
275  for (size_t i = 0; i < ids.size(); ++i) {
276  id_vec[i].Reset(NewAlnSeqId(*ids[i]));
277  }
278  }
279  break;
280  case TSegs::e_Sparse:
281  {
282  const CSparse_seg::TRows& rows = segs.GetSparse().GetRows();
283  for (size_t row = 0; row < rows.size(); ++row) {
284  const CSparse_align& sa = *rows[row];
285  TAlnSeqIdIRef first_id(NewAlnSeqId(sa.GetFirst_id()));
286  if (row == 0) {
287  id_vec.resize(segs.GetSparse().GetRows().size() + 1);
288  id_vec[0].Reset(first_id);
289  }
290  else if (*id_vec[0] != *first_id) {
291  string err("Inconsistent Seq-ids found in row ");
292  err += NStr::NumericToString(row) + ".";
293  NCBI_THROW(CAlnException, eInvalidSeqId, err);
294  }
295  id_vec[row + 1].Reset(NewAlnSeqId(sa.GetSecond_id()));
296  }
297  }
298  break;
299  case TSegs::e_Spliced:
300  {
301  const CSpliced_seg& spliced_seg = segs.GetSpliced();
302  id_vec.resize(2);
303 
305 
306  id_vec[0].Reset(NewAlnSeqId(spliced_seg.GetProduct_id()));
307  id_vec[0]->SetBaseWidth(prot ? 3 : 1);
308 
309  id_vec[1].Reset(NewAlnSeqId(spliced_seg.GetGenomic_id()));
310  id_vec[1]->SetBaseWidth(1);
311  }
312  break;
313  case TSegs::e_not_set:
314  NCBI_THROW(CAlnException, eInvalidAlignment,
315  "Seq-align.segs not set.");
316  default:
317  NCBI_THROW(CAlnException, eUnsupported,
318  "This type of alignment is not supported.");
319  }
320  }
321 
322  /// Convert seq-id to TAlnSeqId.
323  TAlnSeqId* NewAlnSeqId(const CSeq_id& id) const
324  {
325  return m_IdConv(id);
326  }
327 
328 private:
329  static bool IdVecEqual(const TIdVec& x, const TIdVec& y)
330  {
331  return x.size() == y.size() &&
332  equal(x.begin(), x.end(), y.begin(), SAlnSeqIdRefEqual());
333  }
334 
335  TIdConverter m_IdConv;
336 };
337 
338 
339 /// Default seq-id extractor implementations.
341 
344 
345 
347 
348 #endif // OBJTOOLS_ALNMGR___SEQIDS_EXTRACTOR__HPP
349 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Default seq-id converter.
TAlnSeqId * operator()(const CSeq_id &id) const
IAlnSeqId extracting functor.
static bool IdVecEqual(const TIdVec &x, const TIdVec &y)
void operator()(const CSeq_align &seq_align, TIdVec &id_vec) const
Extract ids for the alignment.
TAlnSeqId * NewAlnSeqId(const CSeq_id &id) const
Convert seq-id to TAlnSeqId.
vector< TAlnSeqIdIRef > TIdVec
CAlnSeqIdsExtract(const TIdConverter &id_conv)
CRef –.
Definition: ncbiobj.hpp:618
Scope-aware seq-id converter.
TAlnSeqId * operator()(const CSeq_id &id) const
CScopeAlnSeqIdConverter(CScope *scope)
CScope –.
Definition: scope.hpp:92
void Validate(bool full_test=false) const
Definition: Seq_align.cpp:649
Include a standard set of the NCBI C++ Toolkit most basic headers.
CSeq_align::C_Segs::TDendiag TDendiag
Definition: cuAlign.hpp:48
#define true
Definition: bool.h:35
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
string
Definition: cgiapp.hpp:687
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
TObjectType * Release(void)
Release a reference to the object and return a pointer to the object.
Definition: ncbiobj.hpp:846
TParent::position_type position_type
Definition: range.hpp:391
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
vector< CRef< CSeq_id > > TIds
Definition: Packed_seg_.hpp:94
vector< CRef< CSeq_loc > > TLoc
Definition: Std_seg_.hpp:93
vector< CRef< CSparse_align > > TRows
Definition: Sparse_seg_.hpp:99
const TGenomic_id & GetGenomic_id(void) const
Get the Genomic_id member data.
const TLoc & GetLoc(void) const
Get the Loc member data.
Definition: Std_seg_.hpp:357
const TProduct_id & GetProduct_id(void) const
Get the Product_id member data.
const TFirst_id & GetFirst_id(void) const
Get the First_id member data.
TProduct_type GetProduct_type(void) const
Get the Product_type member data.
vector< CRef< CSeq_id > > TIds
Definition: Dense_seg_.hpp:106
vector< CRef< CSeq_id > > TIds
Definition: Dense_diag_.hpp:93
TDim GetDim(void) const
Get the Dim member data.
Definition: Std_seg_.hpp:295
const TSecond_id & GetSecond_id(void) const
Get the Second_id member data.
list< CRef< CSeq_align > > Tdata
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
@ e_Empty
to NULL one Seq-id in a collection
Definition: Seq_loc_.hpp:99
@ e_Int
from to
Definition: Seq_loc_.hpp:101
@ e_not_set
int i
Portable reference counted smart and weak pointers using CWeakRef, CRef, CObject and CObjectEx.
USING_SCOPE(objects)
CAlnSeqIdsExtract< CAlnSeqId, TScopeAlnSeqIdConverter > TScopeIdExtract
CScopeAlnSeqIdConverter< CAlnSeqId > TScopeAlnSeqIdConverter
CAlnSeqIdsExtract< CAlnSeqId > TIdExtract
Default seq-id extractor implementations.
#define row(bind, expected)
Definition: string_bind.c:73
#define _ASSERT
Modified on Sun Apr 14 05:28:44 2024 by modify_doxy.py rev. 669887