NCBI C++ ToolKit
alnsegments.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: alnsegments.cpp 70546 2016-01-04 20:27:43Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Kamen Todorov, NCBI
27 *
28 * File Description:
29 * Alignment segments
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
40 
41 #include <stack>
42 
43 
45 BEGIN_objects_SCOPE // namespace ncbi::objects::
46 
47 
49  TCalcScoreMethod calc_score)
50  : m_AlnMixSequences(aln_mix_sequences),
51  m_Rows(m_AlnMixSequences->m_Rows),
52  m_ExtraRows(m_AlnMixSequences->m_ExtraRows),
53  x_CalculateScore(calc_score)
54 {}
55 
56 
57 void
58 CAlnMixSegments::Build(bool gap_join,
59  bool min_gap,
60  bool remove_leading_and_trailing_gaps)
61 {
64 #if _DEBUG && _ALNMGR_DEBUG
66 #endif
67 
68 
69  TSegmentsContainer gapped_segs;
70 
71  CAlnMixSequences::TSeqs::iterator refseq_it = m_Rows.begin();
72  bool orig_refseq = true;
73  while (true) {
74  CAlnMixSeq * refseq = 0;
75  while (refseq_it != m_Rows.end()) {
76  refseq = *(refseq_it++);
77  if (refseq->GetStarts().current != refseq->GetStarts().end()) {
78  break;
79  } else {
80  refseq = 0;
81  }
82  }
83  if ( !refseq ) {
84  // Done
85 
86  // add the gapped segments if any
87  if (gapped_segs.size()) {
88  if (gap_join) {
89  // request to try to align
90  // gapped segments w/ equal len
91  x_ConsolidateGaps(gapped_segs);
92  } else if (min_gap) {
93  // request to try to align
94  // all gapped segments
95  x_MinimizeGaps(gapped_segs);
96  }
98  seg_i, gapped_segs) {
99  m_Segments.push_back(&**seg_i);
100  }
101  gapped_segs.clear();
102  }
103  break; // from the main loop
104  }
105 #if _DEBUG && _ALNMGR_TRACE
106  cerr << "refseq is on row " << refseq->m_RowIdx
107  << " seq " << refseq->m_SeqIdx << "\n";
108 #endif
109  // for each refseq segment
110  while (refseq->GetStarts().current != refseq->GetStarts().end()) {
111  stack< CRef<CAlnMixSegment> > seg_stack;
112  // To prevent infinite loop, remember all segments already on the stack.
113  set< CRef<CAlnMixSegment> > seg_set;
114  seg_stack.push(refseq->GetStarts().current->second);
115  seg_set.insert(refseq->GetStarts().current->second);
116 #if _DEBUG
117  const TSeqPos& refseq_start = refseq->GetStarts().current->first;
118 #if _DEBUG && _ALNMGR_TRACE
119  cerr << " [row " << refseq->m_RowIdx
120  << " seq " << refseq->m_SeqIdx
121  << " start " << refseq_start
122  << " was pushed into stack\n";
123 #endif
124 #endif
125 
126  while ( !seg_stack.empty() ) {
127 
128  bool pop_seg = true;
129  // check the gapped segments on the left
131  seg_stack.top()->m_StartIts) {
132 
133  CAlnMixSeq * row = start_its_i->first;
134 
135  if (row->GetStarts().current != start_its_i->second) {
136 #if _DEBUG
137  const TSeqPos& curr_row_start = row->GetStarts().current->first;
138  const TSeqPos& row_start = start_its_i->second->first;
139 
140  if (row->m_PositiveStrand ?
141  curr_row_start >
142  row_start :
143  curr_row_start <
144  row_start) {
145  string errstr =
146  string("CAlnMixSegments::Build():")
147  + " Internal error: Integrity broken" +
148  " row=" + NStr::IntToString(row->m_RowIdx) +
149  " seq=" + NStr::IntToString(row->m_SeqIdx)
150  + " curr_row_start="
151  + NStr::IntToString(curr_row_start)
152  + " row_start=" +
153  NStr::IntToString(row_start)
154  + " refseq_start=" +
155  NStr::IntToString(refseq_start)
156  + " strand=" +
157  (row->m_PositiveStrand ? "plus" : "minus");
158  NCBI_THROW(CAlnException, eMergeFailure, errstr);
159  }
160 #endif
161  seg_stack.push(row->GetStarts().current->second);
162  // If the segment is already in the set, we have infinite loop
163  // and must terminate it.
164  if (!seg_set.insert(row->GetStarts().current->second).second) {
165  string errstr =
166  string("CAlnMixSegments::Build():")
167  + " Internal error: Infinite loop detected.";
168  NCBI_THROW(CAlnException, eMergeFailure, errstr);
169  }
170 #if _DEBUG && _ALNMGR_TRACE
171  cerr << " [row " << row->m_RowIdx
172  << " seq " << row->m_SeqIdx
173  << " start " << curr_row_start
174  << " (left of start " << row_start << ") "
175  << "was pushed into stack\n";
176 #endif
177  pop_seg = false;
178  break;
179  }
180  }
181 
182  if (pop_seg) {
183 
184  // inc/dec iterators for each row of the seg
186  seg_stack.top()->m_StartIts) {
187  CAlnMixSeq * row = start_its_i->first;
188 
189 #if _DEBUG
190  const TSeqPos& curr_row_start = row->GetStarts().current->first;
191  const TSeqPos& row_start = start_its_i->second->first;
192 
193  if (row->m_PositiveStrand &&
194  curr_row_start >
195  row_start ||
196  !row->m_PositiveStrand &&
197  curr_row_start <
198  row_start) {
199  string errstr =
200  string("CAlnMixSegments::Build():")
201  + " Internal error: Integrity broken" +
202  " row=" + NStr::IntToString(row->m_RowIdx) +
203  " seq=" + NStr::IntToString(row->m_SeqIdx)
204  + " curr_row_start="
205  + NStr::IntToString(curr_row_start)
206  + " row_start=" +
207  NStr::IntToString(row_start)
208  + " refseq_start=" +
209  NStr::IntToString(refseq_start)
210  + " strand=" +
211  (row->m_PositiveStrand ? "plus" : "minus");
212  NCBI_THROW(CAlnException, eMergeFailure, errstr);
213  }
214 #endif
215 
216  if (row->m_PositiveStrand) {
217  row->SetStarts().current++;
218  } else {
219  if (row->SetStarts().current == row->GetStarts().begin()) {
220  row->SetStarts().current = row->GetStarts().end();
221  } else {
222  row->SetStarts().current--;
223  }
224  }
225  }
226 
227  if (seg_stack.size() > 1) {
228  // add to the gapped segments
229  gapped_segs.push_back(seg_stack.top());
230  seg_stack.pop();
231 #if _DEBUG && _ALNMGR_TRACE
232  cerr << " seg popped].\n";
233 #endif
234  } else {
235  // add the gapped segments if any
236  if (gapped_segs.size()) {
237  if (gap_join) {
238  // request to try to align
239  // gapped segments w/ equal len
240  x_ConsolidateGaps(gapped_segs);
241  } else if (min_gap) {
242  // request to try to align
243  // all gapped segments
244  x_MinimizeGaps(gapped_segs);
245  }
246  if (orig_refseq) {
248  seg_i, gapped_segs) {
249  m_Segments.push_back(&**seg_i);
250  }
251  gapped_segs.clear();
252  }
253  }
254  // add the refseq segment
255  if (orig_refseq) {
256  m_Segments.push_back(seg_stack.top());
257  } else {
258  gapped_segs.push_back(seg_stack.top());
259  }
260  seg_stack.pop();
261 #if _DEBUG && _ALNMGR_TRACE
262  cerr << " refseq seg popped].\n";
263 #endif
264  } // if (seg_stack.size() > 1)
265  } // if (popseg)
266  } // while ( !seg_stack.empty() )
267  } // while (refseq->GetStarts().current != refseq->GetStarts().end())
268  orig_refseq = false;
269  } // while (true)
270 
271 
272  if (remove_leading_and_trailing_gaps) {
273  while (m_Segments.size() && m_Segments.front()->m_StartIts.size() < 2) {
274  m_Segments.pop_front();
275  }
276  while (m_Segments.size() && m_Segments.back()->m_StartIts.size() < 2) {
277  m_Segments.pop_back();
278  }
279  }
280 
281 }
282 
283 
284 
285 void
287 {
288  vector<TSignedSeqPos> starts;
289  vector<TSeqPos> lens;
290  starts.resize(m_Rows.size(), -1);
291  lens.resize(m_Rows.size(), 0);
292 
293  TSeqPos len = 0;
294  CAlnMap::TNumrow rowidx;
295 
296  TSegments::iterator seg_i = m_Segments.begin();
297  while (seg_i != m_Segments.end()) {
298  len = (*seg_i)->m_Len;
300  (*seg_i)->m_StartIts) {
301  CAlnMixSeq * row = start_its_i->first;
302  rowidx = row->m_RowIdx;
303  TSignedSeqPos& prev_start = starts[rowidx];
304  TSeqPos& prev_len = lens[rowidx];
305  TSeqPos start = start_its_i->second->first;
306  const bool plus = row->m_PositiveStrand;
307  const int& width = row->m_Width;
308  TSeqPos prev_start_plus_len = prev_start + prev_len * width;
309  TSeqPos start_plus_len = start + len * width;
310  if (prev_start >= 0) {
311  if (plus && prev_start_plus_len < start ||
312  !plus && start_plus_len < (TSeqPos) prev_start) {
313  // create a new seg
315  TSeqPos new_start;
316  if (row->m_PositiveStrand) {
317  new_start = prev_start + prev_len * width;
318  seg->m_Len = (start - new_start) / width;
319  } else {
320  new_start = start_plus_len;
321  seg->m_Len = (prev_start - new_start) / width;
322  }
323  row->SetStarts()[new_start] = seg;
324  CAlnMixStarts::iterator start_i =
325  start_its_i->second;
326  seg->SetStartIterator(row,
327  row->m_PositiveStrand ?
328  --start_i :
329  ++start_i);
330 
331  seg_i = m_Segments.insert(seg_i, seg);
332  seg_i++;
333  }
334  }
335  prev_start = start;
336  prev_len = len;
337  }
338  seg_i++;
339  }
340 }
341 
342 
343 void
345 {
346  TSegmentsContainer::iterator seg1_i, seg2_i;
347 
348  seg2_i = seg1_i = gapped_segs.begin();
349  if (seg2_i != gapped_segs.end()) {
350  seg2_i++;
351  }
352 
353  bool cache = false;
354  string s1;
355  int score1;
356  CAlnMixSeq * seq1;
357  CAlnMixSeq * seq2;
358 
359  while (seg2_i != gapped_segs.end()) {
360 
361  CAlnMixSegment * seg1 = *seg1_i;
362  CAlnMixSegment * seg2 = *seg2_i;
363 
364  // check if this seg possibly aligns with the previous one
365  bool possible = true;
366 
367  if (seg2->m_Len == seg1->m_Len &&
368  seg2->m_StartIts.size() == 1) {
369 
370  seq2 = seg2->m_StartIts.begin()->first;
371 
372  // check if this seq was already used
374  st_it,
375  (*seg1_i)->m_StartIts) {
376  if (st_it->first == seq2) {
377  possible = false;
378  break;
379  }
380  }
381 
382  // check if score is sufficient
383  if (possible && x_CalculateScore) {
384  if (!cache) {
385 
386  seq1 = seg1->m_StartIts.begin()->first;
387 
388  seq2->GetSeqString(s1,
389  seg1->m_StartIts[seq1]->first,
390  seg1->m_Len * seq1->m_Width,
391  seq1->m_PositiveStrand);
392 
393  score1 = x_CalculateScore(s1,
394  s1,
395  seq1->m_IsAA,
396  seq1->m_IsAA);
397  cache = true;
398  }
399 
400  string s2;
401  seq2->GetSeqString(s2,
402  seg2->m_StartIts[seq2]->first,
403  seg2->m_Len * seq2->m_Width,
404  seq2->m_PositiveStrand);
405 
406  int score2 =
407  x_CalculateScore(s1, s2, seq1->m_IsAA, seq2->m_IsAA);
408 
409  if (score2 < 75 * score1 / 100) {
410  possible = false;
411  }
412  }
413 
414  } else {
415  possible = false;
416  }
417 
418  if (possible) {
419  // consolidate the ones so far
420 
421  // add the new row
422  seg1->SetStartIterator(seq2, seg2->m_StartIts.begin()->second);
423 
424  // point the row's start position to the beginning seg
425  seg2->m_StartIts.begin()->second->second = seg1;
426 
427  seg2_i = gapped_segs.erase(seg2_i);
428  } else {
429  cache = false;
430  seg1_i++;
431  seg2_i++;
432  }
433  }
434 }
435 
436 
437 void
439 {
440  TSegmentsContainer::iterator seg_i, seg_i_end, seg_i_begin;
441  CAlnMixSegment * seg1, * seg2;
443  CAlnMixSeq * seq;
444  TSegmentsContainer new_segs;
445 
446  seg_i_begin = seg_i_end = seg_i = gapped_segs.begin();
447 
448  typedef map<TSeqPos, CRef<CAlnMixSegment> > TLenMap;
449  TLenMap len_map;
450 
451  while (seg_i_end != gapped_segs.end()) {
452 
453  len_map[(*seg_i_end)->m_Len];
454 
455  // check if this seg can possibly be minimized
456  bool possible = true;
457 
458  seg_i = seg_i_begin;
459  while (seg_i != seg_i_end) {
460  seg1 = *seg_i;
461  seg2 = *seg_i_end;
462 
464  st_it,
465  seg2->m_StartIts) {
466  seq = st_it->first;
467  // check if this seq was already used
468  if (seg1->m_StartIts.find(seq) != seg1->m_StartIts.end()) {
469  possible = false;
470  break;
471  }
472  }
473  if ( !possible ) {
474  break;
475  }
476  seg_i++;
477  }
478  seg_i_end++;
479 
480  if ( !possible || seg_i_end == gapped_segs.end()) {
481  // use the accumulated len info to create the new segs
482 
483  // create new segs with appropriate len
484  TSeqPos len_so_far = 0;
485  TLenMap::iterator len_i = len_map.begin();
486  while (len_i != len_map.end()) {
487  len_i->second = new CAlnMixSegment();
488  len_i->second->m_Len = len_i->first - len_so_far;
489  len_so_far += len_i->second->m_Len;
490  len_i++;
491  }
492 
493  // loop through the accumulated orig segs.
494  // copy info from them into the new segs
495  TLenMap::iterator len_i_end;
496  seg_i = seg_i_begin;
497  while (seg_i != seg_i_end) {
498  TSeqPos orig_len = (*seg_i)->m_Len;
499 
500  // determine the span of the current seg
501  len_i_end = len_map.find(orig_len);
502  len_i_end++;
503 
504  // loop through its sequences
506  st_it,
507  (*seg_i)->m_StartIts) {
508 
509  seq = st_it->first;
510  TSeqPos orig_start = st_it->second->first;
511 
512  len_i = len_map.begin();
513  len_so_far = 0;
514  // loop through the new segs
515  while (len_i != len_i_end) {
516  seg = len_i->second;
517 
518  // calc the start
519  TSeqPos this_start = orig_start +
520  (seq->m_PositiveStrand ?
521  len_so_far :
522  orig_len - len_so_far - seg->m_Len) *
523  seq->m_Width;
524 
525  // create the bindings:
526  seq->SetStarts()[this_start] = seg;
527  seg->SetStartIterator(seq, seq->SetStarts().find(this_start));
528  len_i++;
529  len_so_far += seg->m_Len;
530  }
531  }
532  seg_i++;
533  }
534  NON_CONST_ITERATE (TLenMap, len_it, len_map) {
535  new_segs.push_back(len_it->second);
536  }
537  len_map.clear();
538  seg_i_begin = seg_i_end;
539  }
540  }
541  gapped_segs.clear();
542  ITERATE (TSegmentsContainer, new_seg_i, new_segs) {
543  gapped_segs.push_back(*new_seg_i);
544  }
545 }
546 
547 
548 void
550  const TSeqPos& start,
551  size_t match_idx) const
552 {
553  ITERATE(TStartIterators, st_it_i, m_StartIts) {
554  // both should point to the same seg
555  if ((*st_it_i).second->second != this) {
556  string errstr =
557  string("CAlnMixSegment::StartItsConsistencyCheck")
558  + " [match_idx=" + NStr::NumericToString(match_idx) + "]"
559  + " The internal consistency check failed for"
560  + " the segment containing ["
561  + " row=" + NStr::NumericToString((*st_it_i).first->m_RowIdx)
562  + " seq=" + NStr::NumericToString((*st_it_i).first->m_SeqIdx)
563  + " strand=" +
564  ((*st_it_i).first->m_PositiveStrand ? "plus" : "minus")
565  + " start=" + NStr::NumericToString((*st_it_i).second->first)
566  + "] aligned to: ["
567  + " row=" + NStr::NumericToString(seq.m_RowIdx)
568  + " seq=" + NStr::NumericToString(seq.m_SeqIdx)
569  + " strand=" +
570  (seq.m_PositiveStrand ? "plus" : "minus")
571  + " start=" + NStr::NumericToString(start)
572  + "].";
573  NCBI_THROW(CAlnException, eMergeFailure, errstr);
574  }
575  }
576 }
577 
578 
579 END_objects_SCOPE // namespace ncbi::objects::
TDim TNumrow
Definition: alnmap.hpp:69
void StartItsConsistencyCheck(const CAlnMixSeq &seq, const TSeqPos &start, size_t match_idx) const
void SetStartIterator(CAlnMixSeq *seq, CAlnMixStarts::iterator iter)
TStartIterators m_StartIts
TSegments m_Segments
Definition: alnsegments.hpp:80
void Build(bool gap_join=false, bool min_gap=false, bool remove_leading_and_trailing_gaps=false)
Definition: alnsegments.cpp:58
list< CRef< CAlnMixSegment > > TSegmentsContainer
Definition: alnsegments.hpp:66
void x_MinimizeGaps(TSegmentsContainer &gapped_segs)
CAlnMixSegments(CRef< CAlnMixSequences > &aln_mix_sequences, TCalcScoreMethod calc_score=0)
Definition: alnsegments.cpp:48
CRef< CAlnMixSequences > m_AlnMixSequences
Definition: alnsegments.hpp:82
TCalcScoreMethod x_CalculateScore
Definition: alnsegments.hpp:85
void FillUnalignedRegions(void)
void x_ConsolidateGaps(TSegmentsContainer &gapped_segs)
vector< CRef< CAlnMixSeq > > & m_Rows
Definition: alnsegments.hpp:83
CAlnMixStarts & SetStarts()
Definition: alnseq.hpp:168
bool m_PositiveStrand
Definition: alnseq.hpp:156
int m_SeqIdx
Definition: alnseq.hpp:162
const CAlnMixStarts & GetStarts() const
Definition: alnseq.hpp:167
int m_RowIdx
Definition: alnseq.hpp:164
unsigned m_Width
Definition: alnseq.hpp:154
bool m_IsAA
Definition: alnseq.hpp:153
void GetSeqString(string &s, TSeqPos start, TSeqPos len, bool positive_strand=true)
Definition: alnseq.hpp:190
void RowsStartItsContsistencyCheck(size_t match_idx)
Definition: alnseq.cpp:344
void InitRowsStartIts()
Definition: alnseq.cpp:287
void InitExtraRowsStartIts()
Definition: alnseq.cpp:318
CAlnMixStarts::const_iterator current
Definition: alnsegments.hpp:92
size_type size() const
Definition: map.hpp:148
const_iterator begin() const
Definition: map.hpp:151
const_iterator end() const
Definition: map.hpp:152
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: set.hpp:45
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
string
Definition: cgiapp.hpp:690
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5078
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
use n only the LMDB cache</td > n</tr > n< tr > n< td ></td > n< td > use the LMDB cache if at all possible
int len
T plus(T x_)
#define row(bind, expected)
Definition: string_bind.c:73
Modified on Fri Sep 20 14:57:59 2024 by modify_doxy.py rev. 669887