NCBI C++ ToolKit
sparse_functions.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: sparse_functions.cpp 45967 2021-01-20 16:32:21Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Andrey Yazhuk
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 
35 
36 #include <corelib/ncbitime.hpp>
40 
41 
44 
45 
46 // Conversion function CSparse_align -> SAlignedSeq
47 SAlignedSeq* CreateAlignRow(const CSparse_align& align, bool master_first)
48 {
49  unique_ptr<SAlignedSeq> aln_seq(new SAlignedSeq());
50  aln_seq->m_SeqId.Reset(master_first ? &align.GetSecond_id()
51  : &align.GetFirst_id());
52  SAlignedSeq::TSignedRange& range = aln_seq->m_SecondRange;
53 
54  // get references to the containers inside CSparse_align
55  const CSparse_align::TFirst_starts& starts_1 = align.GetFirst_starts();
56  const CSparse_align::TSecond_starts& starts_2 = align.GetSecond_starts();
57  const CSparse_align::TLens& lens = align.GetLens();
58  const CSparse_align::TSecond_strands* strands =
59  align.IsSetSecond_strands() ? &align.GetSecond_strands() : 0;
60 
61  // create a new Align Collection
63  range.SetFrom(0).SetLength(0);
64  SAlignedSeq::TPos aln_from = -1, from = -1;
65 
66  // iterate on Sparse-seg elements
67  typedef CSparse_align::TNumseg TNumseg;
68  for( TNumseg i = 0; i < align.GetNumseg(); i++ ) {
69  aln_from = master_first ? starts_1[i] : starts_2[i];
70  from = master_first ? starts_2[i] : starts_1[i];
71  SAlignedSeq::TPos len = lens[i];
72  bool dir = strands ? ((*strands)[i] == eNa_strand_plus) : true;
73 
74  // update range
75  if(coll->empty()) {
76  range.SetFrom(aln_from);
77  range.SetLength(len);
78  } else {
79  range.SetFrom(min(range.GetFrom(), aln_from));
80  range.SetToOpen(max(range.GetToOpen(), aln_from + len));
81  }
82 
83  coll->insert(SAlignedSeq::TAlignRange(aln_from, from, len, dir));
84  }
85  aln_seq->m_AlignColl = coll;
86 
87  int dir = (coll->GetFlags() & SAlignedSeq::TAlignColl::fMixedDir);
89  // incorrect - do not return anything
90  return NULL;
91  } else if(dir == SAlignedSeq::TAlignColl::fReversed) {
92  aln_seq->m_NegativeStrand = true;
93  }
94  return aln_seq.release();
95 }
96 
97 /// Converter
98 bool ConvertToPairwise(const CSeq_align& align,
99  const CSeq_id& master_id,
100  vector<SAlignedSeq*>& aln_seqs)
101 {
102  typedef CSeq_align::TSegs TSegs;
103  const TSegs& segs = align.GetSegs();
104 
105  switch(segs.Which()) {
106  case TSegs::e_Denseg: {
107  const CDense_seg& dense_seg = segs.GetDenseg();
108  // find the row corresponding to master_id
109  const CDense_seg::TIds& ids = dense_seg.GetIds();
110  for( CDense_seg::TDim row = 0; row < dense_seg.GetDim(); row++ ) {
111  if(ids[row]->Equals(master_id)) {
112  return ConvertToPairwise(dense_seg, row, aln_seqs);
113  }
114  }
115  return false;
116  }
117  /// add code to support other types of Segs
118  default:
119  return false;
120  }
121 }
122 
123 
124 /// Converter
/// Runs the single-alignment ConvertToPairwise() on every element of
/// `aligns`, passing the same `master_id` and `aln_seqs` each time, and
/// logs the elapsed time in milliseconds.
/// Returns true if at least one of the per-alignment calls returned true.
/// NOTE(review): the first line of this signature (return type, function
/// name and the `aligns` parameter) is not visible in this listing - confirm
/// it against the repository.
126  const CSeq_id& master_id,
127  vector<SAlignedSeq*>& aln_seqs)
128 {
129  CStopWatch timer;
130  timer.Start();
131 
132  bool ok = false;
133  for( size_t i = 0; i < aligns.size(); i++ ) {
134  const CSeq_align& align = *aligns[i];
135  bool res = ConvertToPairwise(align, master_id, aln_seqs);
// a failure of one alignment does not reset an earlier success
136  ok |= res;
137  }
// Elapsed() is in seconds, hence the factor of 1000
138  LOG_POST("ConvertToPairwise( vector of CSeq_align) " << 1000 * timer.Elapsed() << " ms");
139  return ok;
140 }
141 
142 
143 /// Converter CSparse_seg -> SAlignedSeq-s
144 bool ConvertToPairwise(const CSparse_seg& sparse_seg, vector<SAlignedSeq*>& aln_seqs)
145 {
146  CConstRef<objects::CSeq_id> master_id(&sparse_seg.GetMaster_id());
147 
148  typedef CSparse_seg::TRows TRows;
149  const TRows& rows = sparse_seg.GetRows();
150  TRows::const_iterator it = rows.begin();
151 
152  // convert pairwise alignment to TAlignColl objects
153  for( ; it != rows.end(); ++it ) {
154  const CSparse_align& align = **it;
155 
156  int master_index = -1;
157  if(master_id->Compare(align.GetFirst_id()) == CSeq_id::e_YES) {
158  master_index = 0;
159  } else if(master_id->Compare(align.GetSecond_id()) == CSeq_id::e_YES) {
160  master_index = 1;
161  }
162 
163  if(master_index != -1) { // create an alignment row from this CSparse_align
164  SAlignedSeq* aln_seq = CreateAlignRow(align, master_index == 0);
165  if(aln_seq) {
166  aln_seqs.push_back(aln_seq);
167  }
168  } else {
169  LOG_POST(Error << "CreateAlignRow() - a CSparse_align is"
170  << "invalid, neither of its CSeq_ids match master id");
171  }
172  }
173  return true; // handle errors
174 }
175 
176 
177 bool ConvertToPairwise(const CDense_seg& dense_seg,
178  CDense_seg::TDim anchor_row,
179  vector<SAlignedSeq*>& aln_seqs)
180 {
181  typedef CDense_seg::TDim TDim;
182 
183  for(TDim row = 0; row < dense_seg.GetDim(); row++ ) {
184  if(row != anchor_row) {
185  SAlignedSeq* aln_seq = CreateAlignRow(dense_seg, anchor_row, row);
186  if(aln_seq) {
187  aln_seqs.push_back(aln_seq);
188  }
189  }
190  }
191  return true;
192 }
193 
194 
195 /// Builder function
197  vector<SAlignedSeq*>& aln_seqs,
198  objects::CScope& scope)
199 {
200  if(! aln_seqs.empty()) {
201  CSparseAlignment* aln = new CSparseAlignment();
202  aln->Init(master_id, aln_seqs, scope);
203  return aln;
204  }
205  return NULL;
206 }
207 
208 
209 /// Builder function
211  vector<SAlignedSeq*>& aln_seqs,
212  objects::CScope& scope)
213 {
214  if(! aln_seqs.empty()) {
215  }
216  return NULL;
217 }
218 
219 
220 /// Converter Helper function
221 /// Creates an Align Collection from the two rows of a CDense_seg
223  CDense_seg::TDim row_1,
224  CDense_seg::TDim row_2)
225 {
226  _ASSERT(row_1 >=0 && row_1 < dense_seg.GetDim());
227  _ASSERT(row_2 >=0 && row_2 < dense_seg.GetDim());
228 
229  unique_ptr<SAlignedSeq> aln_seq(new SAlignedSeq());
230  aln_seq->m_SeqId.Reset(dense_seg.GetIds()[row_2]);
231  SAlignedSeq::TSignedRange& range = aln_seq->m_SecondRange;
232 
233  aln_seq->m_AlignColl = new SAlignedSeq::TAlignColl();
234  SAlignedSeq::TAlignColl& coll = *aln_seq->m_AlignColl;
235 
236  typedef CDense_seg::TDim TDim;
237  typedef CDense_seg::TNumseg TNum;
238 
239  const CDense_seg::TStarts& starts = dense_seg.GetStarts();
240  const CDense_seg::TLens& lens = dense_seg.GetLens();
241  const CDense_seg::TStrands* strands =
242  dense_seg.IsSetStrands() ? &dense_seg.GetStrands() : NULL;
243 
244  // iterate by segements and add aligned segments to the collection
245  TDim n_rows = dense_seg.GetDim();
246  TNum n_seg = dense_seg.GetNumseg();
247  for( TNum i = 0; i < n_seg; i++ ) {
248  int offset = i * n_rows;
249  int from_1 = starts[row_1 + offset];
250  int from_2 = starts[row_2 + offset];
251 
252  if(from_1 != -1 && from_2 != -1) { // not a gap
253  int len = lens[i];
254  bool direct = true;
255  if(strands) {
256  bool minus_1 = (*strands)[row_1 + offset] == eNa_strand_minus;
257  bool minus_2 = (*strands)[row_2 + offset] == eNa_strand_minus;
258  direct = (! minus_1 && ! minus_2) || (minus_1 == minus_2);
259  }
260  coll.insert(SAlignTools::TAlignRange(from_1, from_2, len, direct));
261 
262  // update range
263  if(coll.empty()) {
264  range.SetFrom(from_1);
265  range.SetLength(len);
266  } else {
267  range.SetFrom(min(range.GetFrom(), from_1));
268  range.SetToOpen(max(range.GetToOpen(), from_1 + len));
269  }
270  }
271  }
272  //LOG_POST("GetAlignColl() rows [" << row_1 << ", " << row_2 << "]" << ", segments " << coll.size());
273 
275  return aln_seq.release();
276 }
277 
278 
279 /// Creates Align Collection from a CSparse_seg
280 void GetAlignColl(const CSparse_align& sparse_align,
281  const CSeq_id& master_id,
283 {
284  coll.clear();
285 
286  int index = -1;
287  if(master_id.Compare(sparse_align.GetFirst_id()) == CSeq_id::e_YES) {
288  index = 0;
289  } else if(master_id.Compare(sparse_align.GetSecond_id()) == CSeq_id::e_YES) {
290  index = 1;
291  }
292  if(index != -1) {
293  bool first = (index == 0);
294  const CSparse_align::TFirst_starts& starts_1 = sparse_align.GetFirst_starts();
295  const CSparse_align::TFirst_starts& starts_2 = sparse_align.GetSecond_starts();
296  const CSparse_align::TLens& lens = sparse_align.GetLens();
297  const CSparse_align::TSecond_strands* strands =
298  sparse_align.IsSetSecond_strands() ? &sparse_align.GetSecond_strands() : 0;
299 
300  typedef CSparse_align::TNumseg TNumseg;
301  TNumseg n_seg = sparse_align.GetNumseg();
302  for( TNumseg i = 0; i < n_seg; i++ ) {
303  int from_1 = first ? starts_1[i] : starts_2[i];
304  int from_2 = first ? starts_2[i] : starts_1[i];
305  int len = lens[i];
306  bool direct = strands && ((*strands)[i] == eNa_strand_minus);
307 
308  coll.insert(SAlignTools::TAlignRange(from_1, from_2, len, direct));
309  }
310  }
311 }
312 
313 
314 /// Reverse Converter
315 /// Converts Align Collection into a CSparse_align
317  const CSeq_id& id_2,
318  const SAlignTools::TAlignColl& coll)
319 {
320  CRef<CSparse_align> align(new CSparse_align());
321 
322  CRef<CSeq_id> rid_1(new CSeq_id());
323  rid_1->Assign(id_1);
324  align->SetFirst_id(*rid_1);
325 
326  CRef<CSeq_id> rid_2(new CSeq_id());
327  rid_2->Assign(id_2);
328  align->SetSecond_id(*rid_2);
329 
330  // initilize containers
331  typedef CSparse_align::TNumseg TNumseg;
332  TNumseg n_seg = (TNumseg)coll.size();
333  align->SetNumseg(n_seg);
334 
335  CSparse_align::TFirst_starts& starts_1 = align->SetFirst_starts();
336  starts_1.resize(n_seg);
337  CSparse_align::TFirst_starts& starts_2 = align->SetSecond_starts();
338  starts_2.resize(n_seg);
339  CSparse_align::TLens& lens = align->SetLens();
340  lens.resize(n_seg);
341 
344  // there are reversed segments in the collection - need to fill "Strands"
345  strands = &align->SetSecond_strands();
346  strands->resize(n_seg);
347  }
348 
349  // move data to the containers
350  TNumseg i = 0;
351  ITERATE(SAlignTools::TAlignColl, it, coll) {
352  const SAlignTools::TAlignRange& r = *it;
353 
354  starts_1[i] = r.GetFirstFrom();
355  starts_2[i] = r.GetSecondFrom();
356  lens[i] = r.GetLength();
357  if(strands) {
358  (*strands)[i] = r.IsDirect() ? eNa_strand_plus : eNa_strand_minus;
359  }
360  i++;
361  }
362 
363  return align;
364 }
365 
366 
User-defined methods of the data storage class.
CAnchoredAln::TDim TDim
size_type size() const
@ fMixedDir
contains both direct and reversed ranges
@ fInvalid
collection was modified and not validated
@ fReversed
contains at least one reversed range
const_iterator insert(const TAlignRange &r)
CAlignRange Represents an element of pairwise alignment of two sequences.
Definition: align_range.hpp:63
CRef –.
Definition: ncbiobj.hpp:618
CSparseAlignment - an alignment based on CSparse_seg and CAlignRangeCollection classes rather than on...
void Init(const objects::CSeq_id &master_id, vector< SAlignedSeq * > &aln_seqs, objects::CScope &scope)
CStopWatch –.
Definition: ncbitime.hpp:1937
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
int offset
Definition: replacements.h:160
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NULL
Definition: ncbistd.hpp:225
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
E_SIC Compare(const CSeq_id &sid2) const
Compare() - more general.
Definition: Seq_id.cpp:411
@ e_YES
SeqIds compared, and are equivalent.
Definition: Seq_id.hpp:583
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
double Elapsed(void) const
Return time elapsed since first Start() or last Restart() call (in seconds).
Definition: ncbitime.hpp:2775
void Start(void)
Start the timer.
Definition: ncbitime.hpp:2764
vector< CRef< CSparse_align > > TRows
Definition: Sparse_seg_.hpp:99
bool IsSetStrands(void) const
Check if a value has been assigned to Strands data member.
Definition: Dense_seg_.hpp:568
vector< TSeqPos > TLens
Definition: Dense_seg_.hpp:108
const TStarts & GetStarts(void) const
Get the Starts member data.
Definition: Dense_seg_.hpp:530
vector< ENa_strand > TStrands
Definition: Dense_seg_.hpp:109
const TLens & GetLens(void) const
Get the Lens member data.
Definition: Dense_seg_.hpp:555
const TFirst_id & GetFirst_id(void) const
Get the First_id member data.
vector< TSignedSeqPos > TStarts
Definition: Dense_seg_.hpp:107
const TMaster_id & GetMaster_id(void) const
Get the Master_id member data.
vector< CRef< CSeq_id > > TIds
Definition: Dense_seg_.hpp:106
TDim GetDim(void) const
Get the Dim member data.
Definition: Dense_seg_.hpp:421
const TSecond_starts & GetSecond_starts(void) const
Get the Second_starts member data.
const TLens & GetLens(void) const
Get the Lens member data.
bool IsSetSecond_strands(void) const
Check if a value has been assigned to Second_strands data member.
vector< int > TSecond_starts
const TFirst_starts & GetFirst_starts(void) const
Get the First_starts member data.
TNumseg GetNumseg(void) const
Get the Numseg member data.
vector< TSeqPos > TLens
const TIds & GetIds(void) const
Get the Ids member data.
Definition: Dense_seg_.hpp:505
const TSecond_strands & GetSecond_strands(void) const
Get the Second_strands member data.
const TSecond_id & GetSecond_id(void) const
Get the Second_id member data.
vector< int > TFirst_starts
TNumseg GetNumseg(void) const
Get the Numseg member data.
Definition: Dense_seg_.hpp:465
const TRows & GetRows(void) const
Get the Rows member data.
vector< ENa_strand > TSecond_strands
const TStrands & GetStrands(void) const
Get the Strands member data.
Definition: Dense_seg_.hpp:580
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
int i
int len
range(_Ty, _Ty) -> range< _Ty >
static bool Equals(const CVariation::TPlacements &p1, const CVariation::TPlacements &p2)
Defines: CTimeFormat - storage class for time format.
T max(T x_, T y_)
T min(T x_, T y_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
CAlnVec * BuildDenseAlignment(const CSeq_id &master_id, vector< SAlignedSeq * > &aln_seqs, objects::CScope &scope)
Builder function.
bool ConvertToPairwise(const CSeq_align &align, const CSeq_id &master_id, vector< SAlignedSeq * > &aln_seqs)
Converter.
USING_SCOPE(ncbi::objects)
SAlignedSeq * CreateAlignRow(const CSparse_align &align, bool master_first)
CRef< CSparse_align > CreateSparseAlign(const CSeq_id &id_1, const CSeq_id &id_2, const SAlignTools::TAlignColl &coll)
Reverse Converter Converts Align Collection into a CSparse_align.
CSparseAlignment * BuildSparseAlignment(const CSeq_id &master_id, vector< SAlignedSeq * > &aln_seqs, objects::CScope &scope)
Builder function.
void GetAlignColl(const CSparse_align &sparse_align, const CSeq_id &master_id, SAlignTools::TAlignColl &coll)
Creates Align Collection from a CSparse_seg.
#define row(bind, expected)
Definition: string_bind.c:73
This is a building block for a Builder represents a Sequence aligned to an Anchor (pairwise alignment...
SAlignTools::TPos TPos
SAlignTools::TAlignColl TAlignColl
#define _ASSERT
Modified on Wed Sep 04 15:07:00 2024 by modify_doxy.py rev. 669887