NCBI C++ ToolKit
merge_tree.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: merge_tree.cpp 98115 2022-09-29 15:42:12Z boukn $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Nathan Bouk
27 *
28 * File Description:
29 * Alignment merge by way of depth-first tree search
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
37 
38 #include <objmgr/util/sequence.hpp>
39 
41 #include <objmgr/scope.hpp>
42 
46 
48 
49 //#define MERGE_TREE_VERBOSE_DEBUG
50 
52 
54 
55 
56 //////////////////////
57 
58 
59 set<int>
61  set<int> AlignIds;
62  ITERATE(TEquivList, EquivIter, Equivs) {
63  AlignIds.insert(EquivIter->AlignId);
64  }
65  return AlignIds;
66 }
67 size_t s_ScoreFromEquivList(const TEquivList& Equivs) {
68  size_t Matches = 0;
69  size_t MisMatches = 0;
70  ITERATE(TEquivList, EquivIter, Equivs) {
71  Matches += EquivIter->Matches;
72  MisMatches += EquivIter->MisMatches;
73  }
74  return Matches;
75 }
76 size_t s_LengthFromEquivList(const TEquivList& Equivs) {
77  size_t Matches = 0;
78  size_t MisMatches = 0;
79  ITERATE(TEquivList, EquivIter, Equivs) {
80  Matches += EquivIter->Matches;
81  MisMatches += EquivIter->MisMatches;
82  }
83  return Matches+MisMatches;
84 }
85 
87 {
88  if(A->GetSeqStart(0) != B->GetSeqStart(0))
89  return (A->GetSeqStart(0) < B->GetSeqStart(0));
90  else if(A->GetSeqStop(0) != B->GetSeqStop(0))
91  return (A->GetSeqStop(0) < B->GetSeqStop(0));
92  else if(A->GetSeqStart(1) != B->GetSeqStart(1))
93  return (A->GetSeqStart(1) < B->GetSeqStart(1));
94  else if(A->GetSeqStop(1) != B->GetSeqStop(1))
95  return (A->GetSeqStop(1) < B->GetSeqStop(1));
96  else
97  return A->GetSeqStrand(0) < B->GetSeqStrand(0);
98 }
99 
101 {
102  if(A->GetSeqStart(0) != B->GetSeqStart(0))
103  return (A->GetSeqStart(0) > B->GetSeqStart(0));
104  else if(A->GetSeqStop(0) != B->GetSeqStop(0))
105  return (A->GetSeqStop(0) > B->GetSeqStop(0));
106  else if(A->GetSeqStart(1) != B->GetSeqStart(1))
107  return (A->GetSeqStart(1) < B->GetSeqStart(1));
108  else if(A->GetSeqStop(1) != B->GetSeqStop(1))
109  return (A->GetSeqStop(1) < B->GetSeqStop(1));
110  else
111  return A->GetSeqStrand(0) < B->GetSeqStrand(0);
112 }
113 
115 {
116  TSignedSeqPos InterA;
117 
118  if(A.GetSeqStrand(0) == eNa_strand_plus)
119  InterA = int(A.GetSeqStart(1)) - int(A.GetSeqStart(0));
120  else
121  InterA = A.GetSeqStop(1) + A.GetSeqStart(0);
122 
123  return InterA;
124 }
125 
127 {
128  TSignedSeqPos InterA, InterB;
129  InterA = s_SeqAlignIntercept(*A);
130  InterB = s_SeqAlignIntercept(*B);
131 
132  return (InterA < InterB);
133 }
134 
136 {
137  return (A->GetAlignLength(false) < B->GetAlignLength(false));
138 }
139 
140 void s_EquivDiff(const TEquivList& A, const TEquivList& B)
141 {
142 
143  TEquivList::const_iterator AI, BI;
144  TEquivList::const_iterator AE, BE;
145 
146  AI = A.begin();
147  BI = B.begin();
148  AE = A.end();
149  BE = B.end();
150 
151  while(true) {
152 
153  if(AI == AE || BI == BE) {
154  break;
155  }
156 
157  if(AI->Query == BI->Query && AI->Subjt == BI->Subjt) {
158  ++AI;
159  ++BI;
160  continue;
161  }
162 
163  if(AI->Query.GetFrom() < BI->Query.GetFrom()) {
164 #ifdef MERGE_TREE_VERBOSE_DEBUG
165  cerr << " < " << *AI << endl;
166 #endif
167  ++AI;
168  continue;
169  }
170  else if(BI->Query.GetFrom() < AI->Query.GetFrom()) {
171 #ifdef MERGE_TREE_VERBOSE_DEBUG
172  cerr << " > " << *BI << endl;
173 #endif
174  ++BI;
175  continue;
176  }
177  else {
178  if(AI->Subjt.GetFrom() < BI->Subjt.GetFrom()) {
179 #ifdef MERGE_TREE_VERBOSE_DEBUG
180  cerr << " < " << *AI << endl;
181 #endif
182  ++AI;
183  continue;
184  }
185  else if(BI->Subjt.GetFrom() < AI->Subjt.GetFrom()) {
186 #ifdef MERGE_TREE_VERBOSE_DEBUG
187  cerr << " > " << *BI << endl;
188 #endif
189  ++BI;
190  continue;
191  }
192  else {
193 #ifdef MERGE_TREE_VERBOSE_DEBUG
194  cerr << " < " << *AI << endl;
195  cerr << " > " << *BI << endl;
196 #endif
197  ++AI;
198  ++BI;
199  continue;
200  }
201  }
202  }
203 
204 
205  while(AI != AE) {
206 #ifdef MERGE_TREE_VERBOSE_DEBUG
207  cerr << " > " << *AI << endl;
208 #endif
209  ++AI;
210  }
211  while(BI != BE) {
212 #ifdef MERGE_TREE_VERBOSE_DEBUG
213  cerr << " < " << *BI << endl;
214 #endif
215  ++BI;
216  }
217 }
218 
219 
221 {
222  TSeqRange AQR = A.GetSeqRange(0);
223  TSeqRange ASR = A.GetSeqRange(1);
224  TSeqRange BQR = B.GetSeqRange(0);
225  TSeqRange BSR = B.GetSeqRange(1);
226 
227  TSeqRange QI = AQR.IntersectionWith(BQR);
228  TSeqRange SI = ASR.IntersectionWith(BSR);
229 
230  TSeqPos QD=0, SD=0;
231  if(QI.Empty()) {
232  if( AQR.GetFrom() >= BQR.GetTo() ) {
233  QD = AQR.GetFrom() - BQR.GetTo();
234  } else {
235  QD = BQR.GetFrom() - AQR.GetTo();
236  }
237  }
238  if(SI.Empty()) {
239  if( ASR.GetFrom() >= BSR.GetTo() ) {
240  SD = ASR.GetFrom() - BSR.GetTo();
241  } else {
242  SD = BSR.GetFrom() - ASR.GetTo();
243  }
244  }
245  TSeqPos D = QD + SD;
246 
249  TSeqPos DeltaInt = abs(AINT - BINT);
250 
251  return max(D, DeltaInt);
252 }
253 
254 
255 
256 
257 void s_FindBestSubRange(const CMergeTree& Tree, const TEquivList& Path)
258 {
259 
260  int F = 0, L = 0;
261 #ifdef MERGE_TREE_VERBOSE_DEBUG
263 #endif
264  int BL = -1;
265  int BestScore = numeric_limits<int>::min();
266 
267  TEquivList CurrPath;
268 
269  for(L=0; L < (int)Path.size(); L++) {
270  CurrPath.push_back( Path[L] );
271  int CurrScore = Tree.Score(CurrPath);
272  if(CurrScore >= BestScore) {
273  BL = L;
274  BestScore = CurrScore;
275  }
276  }
277 
278  CurrPath.clear();
279 
280  for(F = BL; F >= 0; F--) {
281  CurrPath.insert(CurrPath.begin(),Path[F]);
282  int CurrScore = Tree.Score(CurrPath);
283  if(CurrScore >= BestScore) {
284 #ifdef MERGE_TREE_VERBOSE_DEBUG
285  BF = F;
286 #endif
287  BestScore = CurrScore;
288  }
289  }
290 
291 
292 #ifdef MERGE_TREE_VERBOSE_DEBUG
293  if(BF > 0 || BL < (int)(Path.size()-1) ) {
294  cerr << "s_FindBestSubRange : " << BF << " : " << BL << " : " << BestScore << " of " << Path.size()-1 << endl;
295  }
296 #endif
297 }
298 
299 
300 void CTreeAlignMerger::Merge(const list< CRef<CSeq_align> >& Input,
301  list< CRef<CSeq_align> >& Output )
302 {
303  Merge_Dist(Input, Output);
304 }
305 
307  list< CRef<CSeq_align> >& Output )
308 {
309  CEquivRangeBuilder EquivRangeBuilder;
310 
311  typedef pair<CSeq_id_Handle, ENa_strand> TSeqIdPair;
312  typedef pair<TSeqIdPair, TSeqIdPair> TMapKey;
313  typedef vector<CRef<CSeq_align> > TAlignVec;
314  typedef map<TMapKey, TAlignVec> TAlignMap;
315  TAlignMap AlignMap;
316 
317  ITERATE(list<CRef<CSeq_align> >, AlignIter, Input) {
318  const CSeq_align& Align = **AlignIter;
319 
320  TMapKey Key;
321  Key.first.first = CSeq_id_Handle::GetHandle(Align.GetSeq_id(0));
322  Key.first.second = Align.GetSeqStrand(0);
323  Key.second.first = CSeq_id_Handle::GetHandle(Align.GetSeq_id(1));
324  Key.second.second = Align.GetSeqStrand(1);
325 
326  AlignMap[Key].push_back(*AlignIter);
327  }
328 
329  NON_CONST_ITERATE(TAlignMap, MapIter, AlignMap) {
330  if(MapIter->first.first.second == eNa_strand_plus)
331  sort(MapIter->second.begin(), MapIter->second.end(), s_SortSeqAlignByQuery_Subjt);
332  else if(MapIter->first.first.second == eNa_strand_minus)
333  sort(MapIter->second.begin(), MapIter->second.end(), s_SortSeqAlignByQueryMinus_Subjt);
334  }
335 
336  ITERATE(TAlignMap, MapIter, AlignMap) {
337  CSeq_id_Handle QueryIDH = MapIter->first.first.first;
338  CSeq_id_Handle SubjtIDH = MapIter->first.second.first;
339 
340  CBioseq_Handle QueryBSH, SubjtBSH;
341  if(m_Scope != NULL) {
342  QueryBSH = m_Scope->GetBioseqHandle(QueryIDH);
343  SubjtBSH = m_Scope->GetBioseqHandle(SubjtIDH);
344  }
345 
346 
347  TEquivList OrigEquivs, SplitEquivs;
348  int AID = 0;
349  ITERATE(TAlignVec, AlignIter, MapIter->second) {
350 #ifdef MERGE_TREE_VERBOSE_DEBUG
351  cerr << AID << "\t" << (*AlignIter)->GetSeqRange(0)
352  << "\t" << (*AlignIter)->GetSeqRange(1) << endl;
353 #endif
354  EquivRangeBuilder.ExtractRangesFromSeqAlign(**AlignIter, OrigEquivs);
355  AID++;
356  }
357 #ifdef MERGE_TREE_VERBOSE_DEBUG
358  cerr << "OrigEquivs; " << OrigEquivs.size() << endl;
359  cerr << "Verify AlignIds: " << s_AlignIdsFromEquivList(OrigEquivs).size() << endl;
360 #endif
361 
362  if(QueryBSH && SubjtBSH) {
363  EquivRangeBuilder.CalcMatches(QueryBSH, SubjtBSH, OrigEquivs);
364  } else {
365  NON_CONST_ITERATE(TEquivList, Iter, OrigEquivs) {
366  Iter->Matches = Iter->Query.GetLength();
367  Iter->MisMatches = 0;
368  }
369  }
370 
371 
372  //cerr << "A: " << MapIter->second.size() << " OS: " << OrigEquivs.size() << " SE: " << SplitEquivs.size() << endl;
373  /*{{
374  /// Mostly Failed effort to pre-screen the alignments
375  CRangeCollection<TSeqPos> QRC, SRC;
376  CRangeCollection<TSeqPos> DQRC, DSRC;
377  ITERATE(TEquivList, Iter, OrigEquivs) {
378  if(QRC.IntersectingWith(Iter->Query)) {
379  CRangeCollection<TSeqPos> T(Iter->Query);
380  T.IntersectWith(QRC);
381  DQRC += T;
382  }
383  if(SRC.IntersectingWith(Iter->Subjt)) {
384  CRangeCollection<TSeqPos> T(Iter->Subjt);
385  T.IntersectWith(SRC);
386  DSRC += T;
387  }
388  QRC += Iter->Query;
389  SRC += Iter->Subjt;
390  }
391  CRangeCollection<TSeqPos> NQRC, NSRC;
392  NQRC += QRC.GetLimits();
393  NSRC += SRC.GetLimits();
394  NQRC -= DQRC;//QRC;
395  NSRC -= DSRC;//SRC;
396 
397  vector<TSeqPos> RQs, RSs;
398  RQs.push_back(QRC.GetFrom());
399  ITERATE(CRangeCollection<TSeqPos>, Iter, NQRC) {
400  RQs.push_back(Iter->GetFrom());
401  RQs.push_back(Iter->GetTo());
402  cerr << "QZ: " << *Iter << endl;
403  }
404  RQs.push_back(QRC.GetTo()+1);
405  RSs.push_back(SRC.GetFrom());
406  ITERATE(CRangeCollection<TSeqPos>, Iter, NSRC) {
407  RSs.push_back(Iter->GetFrom());
408  RSs.push_back(Iter->GetTo());
409  cerr << "SZ: " << *Iter << endl;
410  }
411  RSs.push_back(SRC.GetTo()+1);
412  {{
413  vector<TSeqPos>::iterator NE;
414  sort(RQs.begin(), RQs.end());
415  NE = unique(RQs.begin(), RQs.end());
416  RQs.erase(NE, RQs.end());
417  sort(RSs.begin(), RSs.end());
418  NE = unique(RSs.begin(), RSs.end());
419  RSs.erase(NE, RSs.end());
420  }}
421 
422 
423  CMergeTree::SScoring QuadScoring;
424  QuadScoring.GapExtend = 0;
425  TSeqRange QR, SR;
426  ITERATE(TEquivList, Iter, OrigEquivs) {
427  QR.CombineWith(Iter->Query);
428  SR.CombineWith(Iter->Subjt);
429  }
430  cerr << QR << "\t" << SR << endl;
431  CQuadTree QT(64, QR, SR);
432  QT.SetRecs(RQs, RSs);
433  ITERATE(TEquivList, Iter, OrigEquivs) {
434  QT.Insert(*Iter);
435  }
436  QT.Print();
437 
438  int NodesVisited = 0;
439  TEquivList Path;
440  NON_CONST_ITERATE(CQuadTree, QuadIter, QT) {
441  NodesVisited++;
442 
443  if(!QuadIter->IsLeaf)
444  QuadIter->Merge();
445 
446  if(QuadIter->Storage.empty())
447  continue;
448 
449  set<int> AlignIds = s_AlignIdsFromEquivList(QuadIter->Storage);
450  cerr << QuadIter->XR << "\t" << QuadIter->YR
451  << "\t" << QuadIter->Depth
452  << "\t" << QuadIter->Storage.size()
453  << "\t" << AlignIds.size()
454  << "\t" << s_ScoreFromEquivList(QuadIter->Storage)
455  << endl;
456  ITERATE(set<int>, AlignIdIter, AlignIds) {
457  cerr << "\t" << *AlignIdIter;
458  }
459  cerr << endl;
460  {
461  TEquivList LocalSplits;
462  EquivRangeBuilder.SplitIntersections(QuadIter->Storage, LocalSplits);
463  QuadIter->Storage.clear();
464 
465  CMergeTree Tree;
466  Tree.SetScoring(QuadScoring); //m_Scoring);
467  Tree.AddEquivs(LocalSplits);
468  LocalSplits.clear();
469  Path.clear();
470  Tree.Search(Path);
471 
472 
473  cerr << "\t" << Path.size() << endl;
474 
475  //{{
476  // CRef<CSeq_align> Merged;
477  // Merged = x_MakeSeqAlign(Path, QueryIDH, SubjtIDH);
478  // if(Merged)
479  // Output.push_back(Merged);
480  //}}
481 
482  EquivRangeBuilder.MergeAbuttings(Path, QuadIter->Storage);
483 
484  //QuadIter->Storage.insert(QuadIter->Storage.end(), Path.begin(), Path.end());
485  AlignIds = s_AlignIdsFromEquivList(QuadIter->Storage);
486  cerr << "\t" << QuadIter->Storage.size()
487  << "\t" << AlignIds.size()
488  << "\t" << s_ScoreFromEquivList(QuadIter->Storage)
489  << endl;
490  ITERATE(set<int>, AlignIdIter, AlignIds) {
491  cerr << "\t" << *AlignIdIter;
492  }
493  cerr << endl;
494  }
495  }
496  cerr << "NodesVisited: " << NodesVisited << endl;
497 
498  {{
499  CRef<CSeq_align> Merged;
500  Merged = x_MakeSeqAlign(Path, QueryIDH, SubjtIDH);
501 
502  if(Merged)
503  Output.push_back(Merged);
504  }}
505 
506  set<int> AlignIds;
507  ITERATE(TEquivList, PathIter, Path) {
508  AlignIds.insert(PathIter->AlignId);
509  }
510  cerr << "Aligns: " << AlignIds.size() << endl;
511  TEquivList Filtered;
512  ITERATE(TEquivList, EraseIter, OrigEquivs) {
513  if( AlignIds.find(EraseIter->AlignId) != AlignIds.end()) {
514  Filtered.push_back(*EraseIter);
515  }
516  }
517  cerr << " Orig: " << OrigEquivs.size() << " Filt: " << Filtered.size() << endl;
518  OrigEquivs.swap(Filtered);
519  cerr << "_______________________________" << endl;
520  }}*/
521  EquivRangeBuilder.SplitIntersections(OrigEquivs, SplitEquivs);
522 
523  CMergeTree Tree(*this);
524  if(Callback != NULL)
526  Tree.SetScoring(m_Scoring);
527 
528  Tree.AddEquivs(SplitEquivs);
529 
530  while(!Tree.IsEmpty()) {
532 
533  Tree.Search(Path);
534 
535  if(Tree.GetInterrupted())
536  return;
537 
538  if(Path.empty())
539  break;
540 
541  CRef<CSeq_align> Merged;
542  Merged = x_MakeSeqAlign(Path, QueryIDH, SubjtIDH);
543 
544  if(Merged)
545  Output.push_back(Merged);
546  //break;
547  } // end Tree not empty
548  } // end Id-Strand-Map
549 }
550 
551 
553  list< CRef<CSeq_align> >& Output )
554 {
555 
556  typedef pair<CSeq_id_Handle, ENa_strand> TSeqIdPair;
557  typedef pair<TSeqIdPair, TSeqIdPair> TMapKey;
558  typedef vector<CRef<CSeq_align> > TAlignVec;
559  typedef map<TMapKey, TAlignVec> TAlignMap;
560  TAlignMap AlignMap;
561 
562  CEquivRangeBuilder EquivRangeBuilder;
563 
564  ITERATE(list<CRef<CSeq_align> >, AlignIter, Input) {
565  const CSeq_align& Align = **AlignIter;
566 
567  TMapKey Key;
568  Key.first.first = CSeq_id_Handle::GetHandle(Align.GetSeq_id(0));
569  Key.first.second = Align.GetSeqStrand(0);
570  Key.second.first = CSeq_id_Handle::GetHandle(Align.GetSeq_id(1));
571  Key.second.second = Align.GetSeqStrand(1);
572 
573  AlignMap[Key].push_back(*AlignIter);
574  }
575 
576  NON_CONST_ITERATE(TAlignMap, MapIter, AlignMap) {
577 
578  //sort(MapIter->second.begin(), MapIter->second.end(), s_SortSeqAlignByIntercept);
579  sort(MapIter->second.begin(), MapIter->second.end(), s_SortSeqAlignByLength);
580 
581  //if(MapIter->first.first.second == eNa_strand_plus)
582  // sort(MapIter->second.begin(), MapIter->second.end(), s_SortSeqAlignByQuery_Subjt);
583  //else if(MapIter->first.first.second == eNa_strand_minus)
584  // sort(MapIter->second.begin(), MapIter->second.end(), s_SortSeqAlignByQueryMinus_Subjt);
585 
586  //reverse(MapIter->second.begin(), MapIter->second.end());
587  }
588 
589 
590  ITERATE(TAlignMap, MapIter, AlignMap) {
591  CSeq_id_Handle QueryIDH = MapIter->first.first.first;
592  CSeq_id_Handle SubjtIDH = MapIter->first.second.first;
593 
594  CBioseq_Handle QueryBSH, SubjtBSH;
595  if(m_Scope != NULL) {
596  QueryBSH = m_Scope->GetBioseqHandle(QueryIDH);
597  SubjtBSH = m_Scope->GetBioseqHandle(SubjtIDH);
598  }
599 
600  typedef map<int, TEquivList> TAlignEquivListMap;
601  TAlignEquivListMap AlignEquivs;
602  TEquivList OrigEquivs, SplitEquivs;
603  int AID = 0;
604  ITERATE(TAlignVec, AlignIter, MapIter->second) {
605  TEquivList Temp;
606  //EquivRangeBuilder.ExtractRangesFromSeqAlign(**AlignIter, OrigEquivs);
607  EquivRangeBuilder.ExtractRangesFromSeqAlign(**AlignIter, Temp);
608 
609  if(QueryBSH && SubjtBSH) {
610  EquivRangeBuilder.CalcMatches(QueryBSH, SubjtBSH, Temp);
611  } else {
612  NON_CONST_ITERATE(TEquivList, Iter, Temp) {
613  Iter->Matches = Iter->Query.GetLength();
614  Iter->MisMatches = 0;
615  }
616  }
617 
618 
619  AlignEquivs[AID].insert(AlignEquivs[AID].end(), Temp.begin(), Temp.end());
620  OrigEquivs.insert(OrigEquivs.end(), Temp.begin(), Temp.end());
621 #ifdef MERGE_TREE_VERBOSE_DEBUG
622  cerr << AID << "\t" << (*AlignIter)->GetSeqRange(0)
623  << "\t" << (*AlignIter)->GetSeqRange(1) << endl;
624 #endif
625  AID++;
626  }
627 #ifdef MERGE_TREE_VERBOSE_DEBUG
628  cerr << "OrigEquivs; " << OrigEquivs.size() << endl;
629  cerr << "Verify AlignIds: " << s_AlignIdsFromEquivList(OrigEquivs).size() << endl;
630 #endif
631 
632  if(QueryBSH && SubjtBSH) {
633  EquivRangeBuilder.CalcMatches(QueryBSH, SubjtBSH, OrigEquivs);
634  } else {
635  NON_CONST_ITERATE(TEquivList, Iter, OrigEquivs) {
636  Iter->Matches = Iter->Query.GetLength();
637  Iter->MisMatches = 0;
638  }
639  }
640 
641 #ifdef MERGE_TREE_VERBOSE_DEBUG
642  int AccumScore = 0;
643 #endif
644  TEquivList AccumEquivs;
645  for(int Loop = 0; Loop < 3; Loop++) {
646  ITERATE(TAlignEquivListMap, OrigAlignIter, AlignEquivs) {
647  const TEquivList& CurrAlignEquivs = OrigAlignIter->second;
648 
649  TEquivList CurrEquivs;
650  CurrEquivs.insert(CurrEquivs.end(), AccumEquivs.begin(), AccumEquivs.end());
651  CurrEquivs.insert(CurrEquivs.end(), CurrAlignEquivs.begin(), CurrAlignEquivs.end());
652 
653  //cerr << "A: " << MapIter->second.size() << " OS: " << OrigEquivs.size() << " SE: " << SplitEquivs.size() << endl;
654  SplitEquivs.clear();
655  EquivRangeBuilder.SplitIntersections(CurrEquivs, SplitEquivs);
656 
657 
658  CMergeTree Tree(*this);
659  if(Callback != NULL)
661  Tree.SetScoring(m_Scoring);
662 
663  Tree.AddEquivs(SplitEquivs);
664  //Tree.Print(cout);
665 
666  while(!Tree.IsEmpty()) {
668 
669  Tree.Search(Path);
670 
671  if(Tree.GetInterrupted())
672  return;
673 
674  if(Path.empty())
675  break;
676 
677 #ifdef MERGE_TREE_VERBOSE_DEBUG
678  int NewScore = Tree.Score(Path);
679  if(AccumScore > NewScore)
680  cerr << "BACKWARDS!" << endl;
681  AccumScore = NewScore;
682 #endif
683 
684  //AccumEquivs.clear();
685  TEquivList Abutted;
686  EquivRangeBuilder.MergeAbuttings(Path, Abutted);
687  sort(Abutted.begin(), Abutted.end());
688  s_EquivDiff(AccumEquivs, Abutted);
689  AccumEquivs = Abutted;
690  //AccumEquivs.insert(AccumEquivs.end(), Path.begin(), Path.end());
691 
692 #ifdef MERGE_TREE_VERBOSE_DEBUG
693  cerr << __LINE__ << ":" << AccumEquivs.size()
694  << ":" << CurrAlignEquivs.size()
695  << ":" << SplitEquivs.size()
696  << ":" << Path.size()
697  << ":" << Tree.Size()
698  << ":" << Tree.Links()<< endl;
699 #endif
700  break;
701  } // end Tree not empty
702 
703  } // end orig align loop
704  } // end Loop loop
705 
706  {{
707  CRef<CSeq_align> Merged;
708  Merged = x_MakeSeqAlign(AccumEquivs, QueryIDH, SubjtIDH);
709 
710  if(Merged)
711  Output.push_back(Merged);
712  }}
713 
714  } // end Id-Strand-Map
715 }
716 
717 
719  list< CRef<CSeq_align> >& Output )
720 {
721  TAlignGroupMap AlignGroupMap;
722  x_MakeMergeableGroups(Input, AlignGroupMap);
723 
724  ITERATE(TAlignGroupMap, MapIter, AlignGroupMap) {
725  CSeq_id_Handle QueryIDH = MapIter->first.first.first;
726  CSeq_id_Handle SubjtIDH = MapIter->first.second.first;
727 
728 #ifdef MERGE_TREE_VERBOSE_DEBUG
729  cerr << "Starting " << QueryIDH.AsString() << (MapIter->first.first.second == eNa_strand_plus ? "+":"-")
730  << " x " << SubjtIDH.AsString() << (MapIter->first.second.second == eNa_strand_plus ? "+":"-") <<
731  " : " << MapIter->second.size() << " aligns"<<endl;
732 #endif
733 
734  CBioseq_Handle QueryBSH, SubjtBSH;
735  if(m_Scope != NULL) {
736  QueryBSH = m_Scope->GetBioseqHandle(QueryIDH);
737  SubjtBSH = m_Scope->GetBioseqHandle(SubjtIDH);
738  }
739 
740 
741  TAlignVec Aligns = MapIter->second;
742 
743  // length checks
744  if (QueryBSH && SubjtBSH &&
745  QueryBSH.CanGetInst_Length() &&
746  SubjtBSH.CanGetInst_Length()) {
747  TSeqPos QueryLen = QueryBSH.GetInst_Length();
748  TSeqPos SubjtLen = SubjtBSH.GetInst_Length();
749  TSeqRange QueryRange, SubjtRange;
750  ITERATE(TAlignVec, AlignIter, Aligns) {
751  QueryRange += (*AlignIter)->GetSeqRange(0);
752  SubjtRange += (*AlignIter)->GetSeqRange(1);
753  }
754  if (QueryRange.GetTo() >= QueryLen ||
755  SubjtRange.GetTo() >= SubjtLen) {
756  NCBI_THROW(CAlgoAlignException, eBadParameter, "Alignment ranges are outside of sequence lengths");
757  }
758  }
759 
760  {{
761  //cerr << "Unique IDs: " << QueryIDH.AsString() << " and " << SubjtIDH.AsString() << endl;
762  TAlignVec Unique, Other;
763  x_SplitGlobalUnique(Aligns, Unique, Other);
764 
765 #ifdef MERGE_TREE_VERBOSE_DEBUG
766  cerr << "Unique IDs: " << QueryIDH.AsString() << " and " << SubjtIDH.AsString() << endl;
767  cerr << " Unique Split: " << Aligns.size() << " into " << Unique.size() << " and " << Other.size() << endl;
768 #endif
769  // Merge the Uniques
770  // Then add the result to Others,
771  // Replace the Input , proceed
772  if(!Unique.empty()) {
773  list<CRef<CSeq_align> > Ins, Outs;
774  Ins.insert(Ins.end(), Unique.begin(), Unique.end());
775  Merge_AllAtOnce(Ins, Outs);
776 #ifdef MERGE_TREE_VERBOSE_DEBUG
777  cerr << " Unique Merged " << Ins.size() << " to " << Outs.size() << endl;
778 #endif
779  ITERATE(list<CRef<CSeq_align> >, OutIter, Outs) {
780  Other.push_back(*OutIter);
781  }
782  Aligns = Other;
783  }
784  }}
785 
786  x_Merge_Dist_Impl(Aligns, QueryIDH, SubjtIDH, QueryBSH, SubjtBSH, Output);
787 
788  } // end id-strand-map loop
789 
790 }
791 
792 
793 
796  CSeq_id_Handle QueryIDH,
797  CSeq_id_Handle SubjtIDH)
798 {
799  CRef<CSeq_align> New(new CSeq_align);
801  CDense_seg& Denseg = New->SetSegs().SetDenseg();
802 
803  Denseg.SetDim(2);
804  Denseg.SetNumseg(Equivs.size());
805 
806  CRef<CSeq_id> QueryId(new CSeq_id), SubjtId(new CSeq_id);
807  QueryId->Assign(*QueryIDH.GetSeqId());
808  SubjtId->Assign(*SubjtIDH.GetSeqId());
809  Denseg.SetIds().push_back(QueryId);
810  Denseg.SetIds().push_back(SubjtId);
811 
812 
813  sort(Equivs.begin(), Equivs.end(), s_SortEquivBySubjt);
814 
815  ITERATE(TEquivList, EquivIter, Equivs) {
816 
817  if(EquivIter->Empty())
818  continue;
819 
820  Denseg.SetStarts().push_back(EquivIter->Query.GetFrom());
821  Denseg.SetStarts().push_back(EquivIter->Subjt.GetFrom());
822 
823  Denseg.SetLens().push_back(EquivIter->Query.GetLength());
824 
825  Denseg.SetStrands().push_back(EquivIter->Strand);
826  Denseg.SetStrands().push_back(eNa_strand_plus);
827  }
828 
829  Denseg.Compact();
830  try {
831  CRef<CDense_seg> Temp = Denseg.FillUnaligned();
832  Denseg.Assign(*Temp);
833  } catch(CException& e) {
834  ERR_POST(Error << e.ReportAll() << MSerial_AsnText << Denseg);
835  throw;
836  }
837  Denseg.Compact();
838 
839  try {
840  New->Validate(true);
841  return New;
842  } catch(CException& e) {
843  ERR_POST(Error << e.ReportAll() << MSerial_AsnText << Denseg);
844  throw;
845  }
846 
847  return CRef<CSeq_align>();
848 }
849 
850 
851 void
853  TAlignGroupMap& AlignGroupMap) {
854  ITERATE(list<CRef<CSeq_align> >, AlignIter, Input) {
855  const CSeq_align& Align = **AlignIter;
856 
857  if(Align.GetSegs().IsDisc()) {
858  ITERATE(list<CRef<CSeq_align> >, DiscAlignIter,
859  Align.GetSegs().GetDisc().Get()) {
860  const CSeq_align& DiscAlign = **DiscAlignIter;
861  TMapKey Key;
862  Key.first.first = CSeq_id_Handle::GetHandle(DiscAlign.GetSeq_id(0));
863  Key.first.second = DiscAlign.GetSeqStrand(0);
864  Key.second.first = CSeq_id_Handle::GetHandle(DiscAlign.GetSeq_id(1));
865  Key.second.second = DiscAlign.GetSeqStrand(1);
866  AlignGroupMap[Key].push_back(*DiscAlignIter);
867  }
868  continue;
869  }
870 
871  TMapKey Key;
872  Key.first.first = CSeq_id_Handle::GetHandle(Align.GetSeq_id(0));
873  Key.first.second = Align.GetSeqStrand(0);
874  Key.second.first = CSeq_id_Handle::GetHandle(Align.GetSeq_id(1));
875  Key.second.second = Align.GetSeqStrand(1);
876  AlignGroupMap[Key].push_back(*AlignIter);
877  }
878 }
879 
880 
881 void
883  TAlignVec& Unique,
884  TAlignVec& Other)
885 {
886  ITERATE(TAlignVec, Outer, Input) {
887  bool Intersected = false;
888  TSeqRange OuterRanges[] = { (*Outer)->GetSeqRange(0), (*Outer)->GetSeqRange(1) };
889  ITERATE(TAlignVec, Inner, Input) {
890  if(Outer == Inner)
891  continue;
892  TSeqRange InnerRanges[] = { (*Inner)->GetSeqRange(0), (*Inner)->GetSeqRange(1) };
893 
894  if( OuterRanges[0].IntersectingWith(InnerRanges[0]) ||
895  OuterRanges[1].IntersectingWith(InnerRanges[1]) ) {
896  Intersected = true;
897  break;
898  }
899  }
900  if(Intersected)
901  Other.push_back(*Outer);
902  else
903  Unique.push_back(*Outer);
904  }
905 }
906 
907 
909 {
910 public:
912  CAlignDistGraph(const CAlignDistGraph& Original);
913 
914  size_t Size() const { return AlignEquivMap.size(); }
915  bool Empty() const { return AlignEquivMap.empty(); }
916 
917 
918  void AddAlignment(const TEquivList& NewAlignEquivs);
919  bool GetAndRemoveNearestPair(TEquivList& First, TEquivList& Second);
920  bool GetLastAlignEquivs(TEquivList& LastEquivs);
921  void RemoveEquivs(const TEquivList& RemoveEquivs);
922 
924 
925  void Print();
926 
927 private:
929 
930  size_t m_NextIndex;
931 
935 
939 
941  void x_CalculateOneDistance(size_t CalcIndex);
942 };
943 
945 {
946  {{
947 #ifdef MERGE_TREE_VERBOSE_DEBUG
948  cerr << "NearestMap: " << NearestMap.size() << endl;
949  cerr << "NearestDistMap: " << NearestDistMap.size() << endl;
950  cerr << "AlignEquivMap: " << AlignEquivMap.size() << endl;
952  size_t MinI;
953  ITERATE(TIndexIndexMap, IndexIter, NearestMap) {
954  size_t CI = IndexIter->first;
956  size_t OI = IndexIter->second;
957  cerr << "Near Pair : " << CI << " x " << OI << " => " << CD << endl;
958 
959  if(CD < MinD) {
960  MinD = CD;
961  MinI = CI;
962  }
963  }
964  cerr << "Minimum : " << MinI << " => " << MinD << endl;
965  if(AlignEquivMap.size() != NearestMap.size()) {
967  cerr << "Aligns : " << AlignIter->first << " : " << AlignIter->second.size() << endl;
968  }
969  }
970 #endif
971  }}
972 }
973 
975  : m_EquivRangeBuilder(Original.m_EquivRangeBuilder), m_NextIndex(0)
976 {
977  m_NextIndex = Original.m_NextIndex;
978  ITERATE(TAlignEquivListMap, AlignIter, Original.AlignEquivMap) {
979  ITERATE(TEquivList, EquivIter, AlignIter->second) {
980  AlignEquivMap[AlignIter->first].push_back(*EquivIter);
981  }
982  }
983 
984  NearestMap.insert(Original.NearestMap.begin(), Original.NearestMap.end());
986 
987  //Print();
988 
989 }
990 
991 
992 void CAlignDistGraph::AddAlignment(const TEquivList& NewAlignEquivs)
993 {
994  size_t NewI = m_NextIndex;
995  m_NextIndex++;
996  //NewI =
997 
998  AlignEquivMap[NewI].insert(AlignEquivMap[NewI].end(),
999  NewAlignEquivs.begin(), NewAlignEquivs.end());
1000 
1001  //x_CalculateAllDistances();
1002  x_CalculateOneDistance(NewI);
1003  return;
1004 
1005  const TEquivList& NewEquivs = AlignEquivMap[NewI];
1007  size_t BestI = NewI;
1008  ITERATE(TIndexIndexMap, IndexIter, NearestMap) {
1009  size_t CI = IndexIter->first;
1010  const TEquivList& CurrEquivs = AlignEquivMap[CI];
1011 
1012  TSeqPos CurrDist = CEquivRange::Distance(NewEquivs, CurrEquivs);
1013 
1014  if(CurrDist < BestD) {
1015  BestD = CurrDist;
1016  BestI = CI;
1017  }
1018  }
1019 
1020  NearestMap[NewI] = BestI;
1021  NearestDistMap[NewI] = BestD;
1022 
1023  if(BestD != numeric_limits<TSeqPos>::max()) {
1024  NearestMap[BestI] = NewI;
1025  NearestDistMap[BestI] = BestD;
1026  }
1027 }
1028 
1029 
1031 {
1032  if(AlignEquivMap.empty() || AlignEquivMap.size() == 1) {
1033  return false;
1034  }
1035 
1037  size_t MinI = numeric_limits<size_t>::max();
1038  ITERATE(TIndexIndexMap, IndexIter, NearestMap) {
1039  size_t CI = IndexIter->first;
1041  if(CD < MinD) {
1042  MinD = CD;
1043  MinI = CI;
1044  }
1045  }
1046 
1047  size_t OtherI = NearestMap[MinI];
1048 #ifdef MERGE_TREE_VERBOSE_DEBUG
1049  //cerr << "Merging " << MinI << " x " << OtherI
1050  // << " of " << /*Aligns.size() <<*/ " (" << AlignEquivMap.size() << ")" << endl;
1051 #endif
1052 
1053  const TEquivList& MinEs = AlignEquivMap[MinI];
1054  const TEquivList& OtherEs = AlignEquivMap[OtherI];
1055 
1056  First.insert(First.end(), MinEs.begin(), MinEs.end());
1057  Second.insert(Second.end(), OtherEs.begin(), OtherEs.end());
1058 
1059 
1060  NearestMap.erase(MinI);
1061  NearestMap.erase(OtherI);
1062  NearestDistMap.erase(MinI);
1063  NearestDistMap.erase(OtherI);
1064  AlignEquivMap.erase(MinI);
1065  AlignEquivMap.erase(OtherI);
1066 
1067  ITERATE(TIndexIndexMap, NearestIter, NearestMap) {
1068  if (NearestIter->second == MinI ||
1069  NearestIter->second == OtherI) {
1070  //x_CalculateAllDistances();
1071  NearestDistMap[NearestIter->first] = numeric_limits<TSeqPos>::max();
1072  x_CalculateOneDistance(NearestIter->first);
1073  //break;
1074  }
1075  }
1076 
1077 
1078  return true;
1079 }
1080 
1081 
1083 {
1084  if(AlignEquivMap.empty() || AlignEquivMap.size() != 1) {
1085  return false;
1086  }
1087 
1089  const TEquivList& MinEs = AlignIter->second;
1090  LastEquivs.insert(LastEquivs.end(), MinEs.begin(), MinEs.end());
1091  break;
1092  }
1093 
1094  return true;
1095 }
1096 
1097 
1098 void CAlignDistGraph::RemoveEquivs(const TEquivList& RemoveEquivs)
1099 {
1100  TAlignEquivListMap RemoveGroups;
1101 
1102  ITERATE(TEquivList, RemoveIter, RemoveEquivs) {
1103  RemoveGroups[RemoveIter->AlignId].push_back(*RemoveIter);
1104  }
1105 
1107  size_t CurrIndex = AlignIter->first;
1108  TEquivList& CurrEquivs = AlignIter->second;
1109 
1110  if(RemoveGroups.find(CurrEquivs.front().AlignId) == RemoveGroups.end()) {
1111  continue;
1112  }
1113 
1114  const TEquivList& RemoveGroupEquivs = RemoveGroups[CurrEquivs.front().AlignId];
1115 
1116  TEquivList Origs, Splits;
1117  Origs.insert(Origs.end(), CurrEquivs.begin(), CurrEquivs.end());
1118  Origs.insert(Origs.end(), RemoveGroupEquivs.begin(), RemoveGroupEquivs.end());
1119  m_EquivRangeBuilder.SplitIntersections(Origs, Splits);
1120  Origs.clear();
1121 
1122  TEquivList Remaining;
1123  ITERATE(TEquivList, SplitIter, Splits) {
1124  bool Keep = true;
1125  ITERATE(TEquivList, RemoveIter, RemoveGroupEquivs) {
1126  if (SplitIter->AlignId == RemoveIter->AlignId &&
1127  SplitIter->SegmtId == RemoveIter->SegmtId) {
1128  if(SplitIter->IntersectingWith(*RemoveIter)) {
1129  Keep = false;
1130  break;
1131  }
1132  }
1133  }
1134  if(Keep) {
1135  Remaining.push_back(*SplitIter);
1136  }
1137  }
1138  CurrEquivs.clear();
1139  m_EquivRangeBuilder.MergeAbuttings(Remaining, CurrEquivs);
1140 
1141 
1142  if(CurrEquivs.empty()) {
1143  NearestMap.erase(CurrIndex);
1144  NearestDistMap.erase(CurrIndex);
1145  AlignEquivMap.erase(AlignIter);
1146  }
1147  }
1148 
1149 
1150  ITERATE(TAlignEquivListMap, RemoveGroupIter, RemoveGroups) {
1151  ITERATE(TIndexIndexMap, NearestIter, NearestMap) {
1152  if (NearestIter->second == RemoveGroupIter->first) {
1153  //x_CalculateAllDistances();
1154  NearestDistMap[NearestIter->first] = numeric_limits<TSeqPos>::max();
1155  x_CalculateOneDistance(NearestIter->first);
1156  //break;
1157  }
1158  }
1159  }
1160 }
1161 
1162 
1164 {
1165  NearestMap.clear();
1167 
1168  vector<size_t> Indexes;
1170  Indexes.push_back(AlignIter->first);
1171  }
1172 
1173  size_t OII, III;
1174  size_t OI, II;
1175  for( OII = 0; OII < Indexes.size(); OII++) {
1176  OI = Indexes[OII];
1178  size_t BestI = 0;
1179  if(NearestMap.find(OI) != NearestMap.end()) {
1180  BestI = NearestMap[OI];
1181  BestDist = NearestDistMap[OI];
1182  }
1183  bool Found = false;
1184  for( III = OII+1; III < Indexes.size(); III++) {
1185  //for( III = 0; III < Indexes.size(); III++) {
1186  // if(OII == III)
1187  // continue;
1188  II = Indexes[III];
1190  if(CurrDist < BestDist) {
1191  BestI = II;
1192  BestDist = CurrDist;
1193  Found = true;
1194  }
1195  }
1196  if(Found) {
1197  NearestMap[OI] = BestI;
1198  NearestDistMap[OI] = BestDist;
1199  if (NearestMap.find(BestI) == NearestMap.end() ||
1200  NearestDistMap[BestI] > BestDist) {
1201  NearestMap[BestI] = OI;
1202  NearestDistMap[BestI] = BestDist;
1203  }
1204  }
1205  }
1206 }
1207 
1209 {
1211  size_t BestI = 0;
1212 
1213  bool Found = false;
1215 
1216  size_t CurrIndex = AlignIter->first;
1217  if(CurrIndex == CalcIndex)
1218  continue;
1219 
1220  TSeqPos CurrDist = CEquivRange::Distance(AlignEquivMap[CalcIndex], AlignEquivMap[CurrIndex]);
1221  if(CurrDist < BestDist) {
1222  BestI = CurrIndex;
1223  BestDist = CurrDist;
1224  Found = true;
1225  }
1226  }
1227 
1228  if(Found) {
1229  NearestMap[CalcIndex] = BestI;
1230  NearestDistMap[CalcIndex] = BestDist;
1231  if (NearestMap.find(BestI) == NearestMap.end() ||
1232  NearestDistMap[BestI] > BestDist) {
1233  NearestMap[BestI] = CalcIndex;
1234  NearestDistMap[BestI] = BestDist;
1235  }
1236  }
1237 }
1238 
1239 int s_UniformAlignId(const TEquivList& Equivs)
1240 {
1241  if (Equivs.empty())
1242  return -1;
1243 
1244  bool Uniform = true;
1245  ITERATE(TEquivList, EquivIter, Equivs) {
1246  if(EquivIter->AlignId != Equivs.front().AlignId) {
1247  Uniform = false;
1248  break;
1249  }
1250  }
1251 
1252  if (Uniform)
1253  return Equivs.front().AlignId;
1254 
1255  return -1;
1256 }
1257 
1258 void
1260  CSeq_id_Handle QueryIDH, CSeq_id_Handle SubjtIDH,
1261  CBioseq_Handle QueryBSH, CBioseq_Handle SubjtBSH,
1262  list< CRef<objects::CSeq_align> >& Output)
1263 {
1264  const int PASSES_BEFORE_EARLY_REMOVE = 5;
1265  list<CRef<CSeq_align> > EarlyRemoves;
1266 
1267  CEquivRangeBuilder EquivRangeBuilder;
1268 
1269  CAlignDistGraph OriginalGraph(EquivRangeBuilder);
1270 
1271 #ifdef MERGE_TREE_VERBOSE_DEBUG
1272  cerr << " Starting : " << Aligns.size() << endl;
1273 #endif
1274 
1275  // Sanity checking, same bases in as out
1276  TSeqPos OrigMatches = 0, AllMergeMatches = 0;
1277 
1278  ITERATE(TAlignVec, AlignIter, Aligns) {
1279  TEquivList Temp;
1280  EquivRangeBuilder.ExtractRangesFromSeqAlign(**AlignIter, Temp);
1281  if(QueryBSH && SubjtBSH) {
1282  EquivRangeBuilder.CalcMatches(QueryBSH, SubjtBSH, Temp);
1283  } else {
1284  NON_CONST_ITERATE(TEquivList, Iter, Temp) {
1285  Iter->Matches = Iter->Query.GetLength();
1286  Iter->MisMatches = 0;
1287  }
1288  }
1289  ITERATE(TEquivList, EquivIter, Temp) {
1290  OrigMatches += EquivIter->Matches;
1291  }
1292 #ifdef MERGE_TREE_VERBOSE_DEBUG
1293  cerr << " Inserting align_id: " << s_UniformAlignId(Temp) << endl;
1294 #endif
1295  OriginalGraph.AddAlignment(Temp);
1296  }
1297 
1298  OriginalGraph.CalculateAllDistances();
1299 
1300  typedef pair<int, size_t> TAlignIdScorePair;
1301  typedef pair<TAlignIdScorePair,TAlignIdScorePair> TAlignIdScoreKey;
1302  typedef map<TAlignIdScoreKey, int> TMergeResultMap;
1303  TMergeResultMap MergeResultMap;
1304  const int ALL_MIN = 1;
1305  const int ALL_OTHER = 2;
1306  const int FULL_MERGE = 3;
1307  const int MIXED = 4;
1308 
1309  int PassCounter = 0;
1310  while(!OriginalGraph.Empty()) {
1311  CAlignDistGraph CurrGraph(OriginalGraph);
1312 
1313 #ifdef MERGE_TREE_VERBOSE_DEBUG
1314  cerr << " --Starting : " << CurrGraph.Size() << endl;
1315 #endif
1316 
1317  // Sort dists. Pick smallest Dist.
1318  // TreeMerge the two aligns with the smallest Dist.
1319  // Block out those 2, Insert the Merged result.
1320  // Repeat until the list only contains 1.
1321 
1322  while(CurrGraph.Size() > 1) {
1323 #ifdef MERGE_TREE_VERBOSE_DEBUG
1324  cerr << " --++Starting : " << CurrGraph.Size() << " :::: " << PassCounter << endl;
1325 #endif
1326 
1327  TEquivList MinEs, OtherEs;
1328  CurrGraph.GetAndRemoveNearestPair(MinEs, OtherEs);
1329 
1330  if(MinEs.empty() || OtherEs.empty()) {
1331  if(!MinEs.empty())
1332  CurrGraph.AddAlignment(MinEs);
1333  else if(!OtherEs.empty())
1334  CurrGraph.AddAlignment(OtherEs);
1335  continue;
1336  }
1337 
1338  TEquivList CachedPath;
1339  TAlignIdScoreKey key;
1340  key.first.first = s_UniformAlignId(MinEs);
1341  key.first.second = s_LengthFromEquivList(MinEs);
1342  key.second.first = s_UniformAlignId(OtherEs);
1343  key.second.second = s_LengthFromEquivList(OtherEs);
1344 #ifdef MERGE_TREE_VERBOSE_DEBUG
1345  cerr << " --++MinEs id: " << key.first.first << " score: " << key.first.second << " len: " << MinEs.size()<< endl;
1346  cerr << " --++OtherEs id: " << key.second.first << " score: " << key.second.second << " len: " << OtherEs.size()<< endl;
1347 #endif
1348 
1349  if(key.first.first != -1 && key.second.first != -1) {{
1350  if (MergeResultMap.find(key) != MergeResultMap.end()) {
1351  int MergeResult = MergeResultMap[key];
1352 #ifdef MERGE_TREE_VERBOSE_DEBUG
1353  cerr << " --++ Had Cached Value: " << MergeResult << endl;
1354 #endif
1355  if(MergeResult == FULL_MERGE) {
1356  CachedPath.insert(CachedPath.end(), MinEs.begin(), MinEs.end());
1357  CachedPath.insert(CachedPath.end(), OtherEs.begin(), OtherEs.end());
1358  sort(CachedPath.begin(), CachedPath.end());
1359  } else if(MergeResult == ALL_MIN) {
1360  CachedPath = MinEs;
1361  } else if(MergeResult == ALL_OTHER) {
1362  CachedPath = OtherEs;
1363  }
1364  }
1365  }}
1366 
1367  TEquivList Path, Abutted;
1368  CMergeTree Tree(*this);
1369  if (CachedPath.empty()) {
1370  TEquivList CurrEquivs;
1371  CurrEquivs.insert(CurrEquivs.end(), MinEs.begin(), MinEs.end());
1372  CurrEquivs.insert(CurrEquivs.end(), OtherEs.begin(), OtherEs.end());
1373  sort(CurrEquivs.begin(), CurrEquivs.end());
1374  TEquivList SplitEquivs;
1375  EquivRangeBuilder.SplitIntersections(CurrEquivs, SplitEquivs);
1376 #ifdef MERGE_TREE_VERBOSE_DEBUG
1377  cerr << " --++ mid: " <<s_UniformAlignId(MinEs) << " oid: " << s_UniformAlignId(OtherEs)<<endl;
1378  cerr << " --++SplitEquivs : " << CurrEquivs.size() << " to " << SplitEquivs.size() << endl;
1379 #endif
1380 
1381 
1382  if(Callback != NULL)
1384  Tree.SetScoring(m_Scoring);
1385  Tree.AddEquivs(SplitEquivs);
1386  //Tree.Print(cout);
1387 
1388  Tree.Search(Path, true);
1389 
1390  // Only happens if the Tree's Interrupt is triggered
1391  if(Path.empty()) {
1392  Output.insert(Output.end(), EarlyRemoves.begin(), EarlyRemoves.end());
1393  return;
1394  }
1395 
1396  EquivRangeBuilder.MergeAbuttings(Path, Abutted);
1397  sort(Abutted.begin(), Abutted.end());
1398 
1399  if(key.first.first != -1 && key.second.first != -1) {{
1400  int ResultId = s_UniformAlignId(Abutted);
1401  int ResultLength = s_LengthFromEquivList(Abutted);
1402 #ifdef MERGE_TREE_VERBOSE_DEBUG
1403  cerr << " --++Result id: " << ResultId << " score: " << ResultLength << " len: " << Abutted.size()<< endl;
1404 #endif
1405  int MergeResult = 0;
1406  if (ResultId == -1 && ResultLength == (key.first.second+key.second.second)) {
1407  MergeResult = FULL_MERGE;
1408  } else if (ResultId == key.first.first && ResultLength == key.first.second) {
1409  MergeResult = ALL_MIN;
1410  } else if (ResultId == key.second.first && ResultLength == key.second.second) {
1411  MergeResult = ALL_OTHER;
1412  } else {
1413  MergeResult = MIXED;
1414  }
1415  MergeResultMap[key] = MergeResult;
1416 #ifdef MERGE_TREE_VERBOSE_DEBUG
1417  cerr << " --++New MergeResult: " << MergeResult << endl;
1418 #endif
1419  }}
1420  } else {
1421  Path = CachedPath;
1422  EquivRangeBuilder.MergeAbuttings(Path, Abutted);
1423  sort(Abutted.begin(), Abutted.end());
1424  }
1425 
1426 
1427  if(!Abutted.empty()) {
1428 #ifdef MERGE_TREE_VERBOSE_DEBUG
1429  cerr << " --++Merged from : " << MinEs.size() << " and " << OtherEs.size() << endl;
1430  cerr << " --++Re-inserting Merged Alignment: " << Abutted.size() << endl;
1431 #endif
1432  CurrGraph.AddAlignment(Abutted);
1433  }
1434 
1435  // Early Removes. In large enough data sets, going through the N-th
1436  // passes to find every single base a home is slow, and mostly
1437  // pointless. Most of this grinding is repeatedly trying to merge
1438  // low-quality whole alignment with others like it, but never
1439  // creating a new alignment, just picking the better-scoring of the two.
1440  // After the first few passes, the lower scoring alignments of such pairings
1441  // can just be removed from consideration early, because they are not
1442  // contributing to any merges.
1443  if(!Abutted.empty() && PassCounter > PASSES_BEFORE_EARLY_REMOVE) {{
1444  int ResultId = s_UniformAlignId(Abutted);
1445  if(ResultId != -1) {
1446  int MinEId = s_UniformAlignId(MinEs);
1447  int OtherEId = s_UniformAlignId(OtherEs);
1448  if ( (Abutted.size() == MinEs.size() &&
1449  ResultId == MinEId) ||
1450  (Abutted.size() == OtherEs.size() &&
1451  ResultId == OtherEId) ) {
1452 
1453  // Merges that go entirely one-way,
1454  // we can just remove the un-used whole alignment
1455  // from future consideration
1456  TEquivList* LoserEs = NULL;
1457 
1458  if(ResultId == MinEId) {
1459  LoserEs = &OtherEs;
1460  } else if(ResultId == OtherEId) {
1461  LoserEs = &MinEs;
1462  }
1463  if(LoserEs != NULL) {
1464  ITERATE(TEquivList, EquivIter, *LoserEs) {
1465  AllMergeMatches += EquivIter->Matches;
1466  }
1467  CRef<CSeq_align> Merged;
1468  Merged = x_MakeSeqAlign(*LoserEs, QueryIDH, SubjtIDH);
1469  if(Merged)
1470  EarlyRemoves.push_back(Merged);
1471  OriginalGraph.RemoveEquivs(*LoserEs);
1472  }
1473  }
1474  }
1475  }}
1476 
1477  {{
1478 #ifdef MERGE_TREE_VERBOSE_DEBUG
1479  //s_EquivDiff(CurrEquivs, Abutted);
1480  int MinScore = Tree.Score(MinEs);
1481  int OtherScore = Tree.Score(OtherEs);
1482  int NewScore = Tree.Score(Abutted);
1483  //cerr << " Scores: " << MinScore << " x " << OtherScore << " => " << NewScore << endl;
1484  if(NewScore < MinScore || NewScore < OtherScore) {
1485  cerr << "BACKWARDS!" << endl;
1486  }
1487  //s_FindBestSubRange(Tree, Abutted);
1488 #endif
1489  }}
1490 
1491  } // end while CurrGraph
1492 
1493  if(!CurrGraph.Empty()) {
1494  TEquivList LastEquivs;
1495  CurrGraph.GetLastAlignEquivs(LastEquivs);
1496  ITERATE(TEquivList, EquivIter, LastEquivs) {
1497  AllMergeMatches += EquivIter->Matches;
1498  }
1499  if(!LastEquivs.empty()) {
1500  CRef<CSeq_align> Merged;
1501  Merged = x_MakeSeqAlign(LastEquivs, QueryIDH, SubjtIDH);
1502  if(Merged)
1503  Output.push_back(Merged);
1504  }
1505  OriginalGraph.RemoveEquivs(LastEquivs);
1506  }
1507 
1508  PassCounter++;
1509  } // end while OriginalGraph
1510 
1511  Output.insert(Output.end(), EarlyRemoves.begin(), EarlyRemoves.end());
1512 
1513 #ifdef MERGE_TREE_VERBOSE_DEBUG
1514  cerr << "Orig : " << OrigMatches << " vs " << AllMergeMatches << endl;
1515  if(OrigMatches != AllMergeMatches)
1516  cerr << "MATCHES PROBLEM!" << endl;
1517 #endif
1518 }
1519 
1520 
1522 
1523 
void x_CalculateAllDistances()
CAlignDistGraph(CEquivRangeBuilder &Builder)
Definition: merge_tree.cpp:911
void AddAlignment(const TEquivList &NewAlignEquivs)
Definition: merge_tree.cpp:992
bool GetAndRemoveNearestPair(TEquivList &First, TEquivList &Second)
CEquivRangeBuilder & m_EquivRangeBuilder
Definition: merge_tree.cpp:928
void x_CalculateOneDistance(size_t CalcIndex)
void RemoveEquivs(const TEquivList &RemoveEquivs)
bool Empty() const
Definition: merge_tree.cpp:915
map< size_t, TSeqPos > TIndexDistMap
Definition: merge_tree.cpp:934
void CalculateAllDistances()
Definition: merge_tree.cpp:923
size_t Size() const
Definition: merge_tree.cpp:914
map< size_t, size_t > TIndexIndexMap
Definition: merge_tree.cpp:933
TIndexDistMap NearestDistMap
Definition: merge_tree.cpp:938
map< size_t, TEquivList > TAlignEquivListMap
Definition: merge_tree.cpp:932
TAlignEquivListMap AlignEquivMap
Definition: merge_tree.cpp:936
bool GetLastAlignEquivs(TEquivList &LastEquivs)
TIndexIndexMap NearestMap
Definition: merge_tree.cpp:937
CBioseq_Handle –.
CRef< CDense_seg > FillUnaligned() const
Create a new dense-seg with added all unaligned pieces (implicit inserts), if any,...
Definition: Dense_seg.cpp:1108
void Compact()
Join adjacent mergeable segments to create a more compact alignment.
Definition: Dense_seg.cpp:432
void Assign(const CSerialObject &obj, ESerialRecursionMode how=eRecursive)
overloaded Assign()
Definition: Dense_seg.cpp:62
void CalcMatches(objects::CBioseq_Handle QueryHandle, objects::CBioseq_Handle SubjtHandle, TEquivList &Equivs)
bool MergeAbuttings(const TEquivList &Originals, TEquivList &Merges)
void ExtractRangesFromSeqAlign(const objects::CSeq_align &Alignment, TEquivList &Equivs)
bool SplitIntersections(const TEquivList &Originals, TEquivList &Splits)
static TSeqPos Distance(const CEquivRange &A, const CEquivRange &B)
CI –.
Definition: I.hpp:64
int Score(const TEquivList &Path) const
void SetScoring(SScoring Scoring)
bool GetInterrupted()
TInterruptFnPtr SetInterruptCallback(TInterruptFnPtr Callback, void *CallbackData)
void Search(TEquivList &Path, bool Once=false)
size_t Links() const
size_t Size() const
void AddEquivs(const TEquivList &Equivs)
bool IsEmpty() const
CRange< TSeqPos > GetSeqRange(TDim row) const
GetSeqRange NB: On a Spliced-seg, in case the product-type is protein, these only return the amin par...
Definition: Seq_align.cpp:153
TSeqPos GetSeqStop(TDim row) const
Definition: Seq_align.cpp:273
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
TSeqPos GetSeqStart(TDim row) const
Definition: Seq_align.cpp:252
ENa_strand GetSeqStrand(TDim row) const
Get strand (the first one if segments have different strands).
Definition: Seq_align.cpp:294
TSeqPos GetAlignLength(bool include_gaps=true) const
Get the length of this alignment.
Definition: Seq_align.cpp:1993
void Validate(bool full_test=false) const
Definition: Seq_align.cpp:649
pair< TSeqIdPair, TSeqIdPair > TMapKey
Definition: merge_tree.hpp:124
void x_SplitGlobalUnique(const TAlignVec &Input, TAlignVec &Unique, TAlignVec &Other)
Definition: merge_tree.cpp:882
void Merge(const list< CRef< objects::CSeq_align > > &Input, list< CRef< objects::CSeq_align > > &Output)
Definition: merge_tree.cpp:300
void x_MakeMergeableGroups(list< CRef< objects::CSeq_align > > Input, TAlignGroupMap &AlignGroupMap)
Definition: merge_tree.cpp:852
vector< CRef< objects::CSeq_align > > TAlignVec
Definition: merge_tree.hpp:125
void Merge_Dist(const list< CRef< objects::CSeq_align > > &Input, list< CRef< objects::CSeq_align > > &Output)
Definition: merge_tree.cpp:718
void Merge_Pairwise(const list< CRef< objects::CSeq_align > > &Input, list< CRef< objects::CSeq_align > > &Output)
Definition: merge_tree.cpp:552
CRef< objects::CSeq_align > x_MakeSeqAlign(TEquivList &Equivs, objects::CSeq_id_Handle QueryIDH, objects::CSeq_id_Handle SubjtIDH)
Definition: merge_tree.cpp:795
void Merge_AllAtOnce(const list< CRef< objects::CSeq_align > > &Input, list< CRef< objects::CSeq_align > > &Output)
Definition: merge_tree.cpp:306
CMergeTree::TInterruptFnPtr Callback
Definition: merge_tree.hpp:110
objects::CScope * m_Scope
Definition: merge_tree.hpp:107
pair< objects::CSeq_id_Handle, objects::ENa_strand > TSeqIdPair
Definition: merge_tree.hpp:123
CMergeTree::SScoring m_Scoring
Definition: merge_tree.hpp:108
void x_Merge_Dist_Impl(TAlignVec &Aligns, objects::CSeq_id_Handle QueryIDH, objects::CSeq_id_Handle SubjtIDH, objects::CBioseq_Handle QueryBSH, objects::CBioseq_Handle SubjtBSH, list< CRef< objects::CSeq_align > > &Output)
void erase(iterator pos)
Definition: map.hpp:167
size_type size() const
Definition: map.hpp:148
const_iterator begin() const
Definition: map.hpp:151
const_iterator end() const
Definition: map.hpp:152
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
bool empty() const
Definition: map.hpp:149
void clear()
Definition: map.hpp:169
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: map.hpp:338
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
size_type size() const
Definition: set.hpp:132
#define A(i)
Definition: ecp_curves.c:936
bool s_SortEquivBySubjt(const CEquivRange &A, const CEquivRange &B)
vector< CEquivRange > TEquivList
Definition: equiv_range.hpp:48
string Path(const string &dir, const string &file)
Definition: fileutil.cpp:243
#define CD
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define ERASE_ITERATE(Type, Var, Cont)
Non-constant version with ability to erase current element, if container permits.
Definition: ncbimisc.hpp:843
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define NULL
Definition: ncbistd.hpp:225
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
string ReportAll(TDiagPostFlags flags=eDPF_Exception) const
Report all exceptions.
Definition: ncbiexpt.cpp:370
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
CConstRef< CSeq_id > GetSeqId(void) const
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
string AsString(void) const
bool CanGetInst_Length(void) const
TInst_Length GetInst_Length(void) const
TThisType IntersectionWith(const TThisType &r) const
Definition: range.hpp:312
bool Empty(void) const
Definition: range.hpp:148
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
TLens & SetLens(void)
Assign a value to Lens data member.
Definition: Dense_seg_.hpp:561
void SetSegs(TSegs &value)
Assign a value to Segs data member.
Definition: Seq_align_.cpp:310
void SetDim(TDim value)
Assign a value to Dim data member.
Definition: Dense_seg_.hpp:427
void SetType(TType value)
Assign a value to Type data member.
Definition: Seq_align_.hpp:818
TStarts & SetStarts(void)
Assign a value to Starts data member.
Definition: Dense_seg_.hpp:536
TStrands & SetStrands(void)
Assign a value to Strands data member.
Definition: Dense_seg_.hpp:586
bool IsDisc(void) const
Check if variant Disc is selected.
Definition: Seq_align_.hpp:772
void SetNumseg(TNumseg value)
Assign a value to Numseg data member.
Definition: Dense_seg_.hpp:474
TIds & SetIds(void)
Assign a value to Ids data member.
Definition: Dense_seg_.hpp:511
const TDisc & GetDisc(void) const
Get the variant data.
Definition: Seq_align_.cpp:197
const Tdata & Get(void) const
Get the member data.
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
@ eType_partial
mapping pieces together
Definition: Seq_align_.hpp:103
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
USING_SCOPE(objects)
bool s_SortSeqAlignByQuery_Subjt(CRef< CSeq_align > A, CRef< CSeq_align > B)
Definition: merge_tree.cpp:86
void s_EquivDiff(const TEquivList &A, const TEquivList &B)
Definition: merge_tree.cpp:140
bool s_SortSeqAlignByIntercept(CRef< CSeq_align > A, CRef< CSeq_align > B)
Definition: merge_tree.cpp:126
bool s_SortSeqAlignByLength(CRef< CSeq_align > A, CRef< CSeq_align > B)
Definition: merge_tree.cpp:135
TSeqPos s_AlignDist(const CSeq_align &A, const CSeq_align &B)
Definition: merge_tree.cpp:220
set< int > s_AlignIdsFromEquivList(const TEquivList &Equivs)
Definition: merge_tree.cpp:60
TSignedSeqPos s_SeqAlignIntercept(const CSeq_align &A)
Definition: merge_tree.cpp:114
bool s_SortSeqAlignByQueryMinus_Subjt(CRef< CSeq_align > A, CRef< CSeq_align > B)
Definition: merge_tree.cpp:100
size_t s_LengthFromEquivList(const TEquivList &Equivs)
Definition: merge_tree.cpp:76
size_t s_ScoreFromEquivList(const TEquivList &Equivs)
Definition: merge_tree.cpp:67
int s_UniformAlignId(const TEquivList &Equivs)
void s_FindBestSubRange(const CMergeTree &Tree, const TEquivList &Path)
Definition: merge_tree.cpp:257
constexpr auto sort(_Init &&init)
const struct ncbi::grid::netcache::search::fields::KEY key
#define BF(A, B, I)
#define abs(a)
Definition: ncbi_heapmgr.c:130
#define F(x)
Make a parametrized function appear to have only one variable.
Definition: ncbi_math.c:342
T max(T x_, T y_)
T min(T x_, T y_)
The Object manager core.
Modified on Tue Apr 23 07:40:26 2024 by modify_doxy.py rev. 669887