NCBI C++ ToolKit
equiv_range.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: equiv_range.cpp 81442 2018-03-01 19:18:38Z boukn $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Nathan Bouk
27  *
28  * File Description:
29  *
30  */
31 
32 
33 #include <ncbi_pch.hpp>
35 #include <cmath>
42 #include <objmgr/seq_vector.hpp>
43 #include <objmgr/scope.hpp>
44 
45 
46 
49 
50 
52 
53 
54 ///
55 template<typename T>
56 T s_CalcMean(const list<T>& List) {
57  typedef list<T> TypeList;
58  size_t Count = List.size();
59  T Mean = 0;
60 
61  ITERATE(typename TypeList, Iter, List) {
62  Mean += *Iter/Count;
63  }
64 
65  return Mean;
66 }
67 
68 template<typename T>
69 double s_CalcStdDev(const list<T>& List, T Mean) {
70  typedef list<T> TypeList;
71  size_t Count = List.size();
72 
73  double RunningTotal = 0;
74  ITERATE(typename TypeList, Iter, List) {
75  RunningTotal += ((*Iter - Mean) * (*Iter - Mean))/double(Count);
76  }
77 
78  return sqrt(RunningTotal);
79 }
80 
81 template<typename T>
82 void s_CalcDevs(const list<T>& Values, T Mean, double StdDev, list<double>& Devs) {
83  typedef list<T> TypeList;
84 
85  ITERATE(typename TypeList, Iter, Values) {
86  double Dist = fabs( double(*Iter) - double(Mean));
87  double Dev = (Dist/StdDev);
88  Devs.push_back(Dev);
89  }
90 }
91 
92 
93 ///
94 
95 
97 {
98  return out << range.Query
99  << (range.Strand == eNa_strand_minus ? "-" : "+")
100  << " to " << range.Subjt << "+"
101  << " (" << range.AlignId
102  << "," << range.SegmtId
103  << "," << range.SplitId << ")";
104 }
105 
107  if(rel == CEquivRange::eWtf)
108  return "WT";
109 
110  string Ret;
111  if(rel & CEquivRange::eBefore)
112  Ret = "B";
113  else if(rel & CEquivRange::eAfter)
114  Ret = "F";
115  else if(rel & CEquivRange::eAbove)
116  Ret = "A";
117  else if(rel & CEquivRange::eUnder)
118  Ret = "U";
119 
120  if(rel & CEquivRange::eIntersects)
121  Ret += "I";
122  else if(rel & CEquivRange::eInterQuery)
123  Ret += "Q";
124  else if(rel & CEquivRange::eInterSubjt)
125  Ret += "S";
126 
127  return Ret;
128 }
129 
130 
131 bool operator==(const CEquivRange& A, const CEquivRange& B) {
132  return (A.Query == B.Query && A.Subjt == B.Subjt);
133 }
134 
135 bool operator<(const CEquivRange& A, const CEquivRange& B) {
136  if(A.Query.GetFrom() != B.Query.GetFrom())
137  return (A.Query.GetFrom() < B.Query.GetFrom());
138  else if(A.Query.GetTo() != B.Query.GetTo())
139  return (A.Query.GetTo() < B.Query.GetTo());
140  else if(A.Subjt.GetFrom() != B.Subjt.GetFrom())
141  return (A.Subjt.GetFrom() < B.Subjt.GetFrom());
142  else if(A.Subjt.GetTo() != B.Subjt.GetTo())
143  return (A.Subjt.GetTo() < B.Subjt.GetTo());
144  else
145  return A.Strand < B.Strand;
146 }
147 
148 
149 
151  if(A.Subjt.GetFrom() != B.Subjt.GetFrom())
152  return (A.Subjt.GetFrom() < B.Subjt.GetFrom());
153  else if(A.Subjt.GetTo() != B.Subjt.GetTo())
154  return (A.Subjt.GetTo() < B.Subjt.GetTo());
155  else if(A.Query.GetFrom() != B.Query.GetFrom())
156  return (A.Query.GetFrom() < B.Query.GetFrom());
157  else if(A.Query.GetTo() != B.Query.GetTo())
158  return (A.Query.GetTo() < B.Query.GetTo());
159  else
160  return A.Strand < B.Strand;
161 }
162 
163 
164 
166 
167 
168 
171 {
172 /*
173  To make a valid Denseg, all equivs must be before/after
174  each other, in the proper order.
175 
176  |
177  | after
178  above |----
179  /
180  /
181  / under
182  -------|
183  before |
184  |
185 */
186 
187  ERelative Result = eWtf;
188 
189  if(Empty() || Check.Empty())
190  return eWtf;
191 
192  if(Strand != Check.Strand)
193  Result = eWtf;
194 
196  Result = eIntersects;
197 
198  if(Result != eWtf)
199  goto end;
200 
201  if(Strand == eNa_strand_plus) {
202  if (Check.Query.GetFrom() > Query.GetTo() &&
203  Check.Subjt.GetFrom() > Subjt.GetTo())
204  Result = eAfter;
205  else if(Check.Query.GetTo() < Query.GetFrom() &&
206  Check.Subjt.GetTo() < Subjt.GetTo())
207  Result = eBefore;
208  else if(Check.Query.GetFrom() > Query.GetFrom() &&
209  Check.Subjt.GetTo() < Subjt.GetTo())
210  Result = eAbove;
211  else if(Check.Query.GetTo() < Query.GetTo() &&
212  Check.Subjt.GetFrom() > Subjt.GetFrom())
213  Result = eUnder;
214  }
215  else if(Strand == eNa_strand_minus) {
216  if (Check.Query.GetFrom() < Query.GetTo() &&
217  Check.Subjt.GetTo() > Subjt.GetTo())
218  Result = eBefore;
219  else if(Check.Query.GetFrom() > Query.GetTo() &&
220  Check.Subjt.GetTo() < Subjt.GetFrom())
221  Result = eAfter;
222  else if(Check.Query.GetFrom() > Query.GetFrom() &&
223  Check.Subjt.GetFrom() > Subjt.GetFrom())
224  Result = eAbove;
225  else if(Check.Query.GetTo() < Query.GetTo() &&
226  Check.Subjt.GetTo() < Subjt.GetTo())
227  Result = eUnder;
228  }
229 
230 end:
231  if(Result == eWtf) {
232  ERR_POST(Info << "CEquivRange::CalcRelative:: Got a eWTF ("
233  << *this << ") vs (" << Check << ")" );
234  }
235  else {
236  // ERR_POST(Info << *this << " vs " << Check << " result: " << Result);
237  }
238  return Result;
239 }
240 
241 
244 {
245 /*
246  To make a valid Denseg, all equivs must be before/after
247  each other, in the proper order.
248 
249  | |
250  above| IQ| after
251  --------------------
252  |I /|
253  IS | / | IS
254  |/ I|
255  -------|------------
256  before | IQ| under
257  | |
258 
259  Only behaves reasonably if parts are split
260 */
261 
262  ERelative Result = eWtf;
263 
264  if(Strand != Check.Strand) {
265  Result = eWtf;
266  return Result;
267  }
268 
269  bool IQ, IS;
270 
271  IQ = Query.IntersectingWith(Check.Query);
272  IS = Subjt.IntersectingWith(Check.Subjt);
273 
274  //if(IntersectingWith(Check))
275  // Result = eIntersects;
276 
277  //if(Result != eWtf)
278  // goto end;
279 
280  if(Strand == eNa_strand_plus) {
281  if (Check.Query.GetFrom() > Query.GetTo() &&
282  Check.Subjt.GetFrom() > Subjt.GetTo())
283  Result = eAfter;
284  else if(Check.Query.GetTo() < Query.GetFrom() &&
285  Check.Subjt.GetTo() < Subjt.GetTo())
286  Result = eBefore;
287  else if(Check.Query.GetFrom() > Query.GetFrom() &&
288  Check.Subjt.GetTo() < Subjt.GetTo())
289  Result = eAbove;
290  else if(Check.Query.GetTo() < Query.GetTo() &&
291  Check.Subjt.GetFrom() > Subjt.GetFrom())
292  Result = eUnder;
293  }
294  else if(Strand == eNa_strand_minus) {
295  if (Check.Query.GetFrom() < Query.GetTo() &&
296  Check.Subjt.GetTo() > Subjt.GetTo())
297  Result = eBefore;
298  else if(Check.Query.GetFrom() > Query.GetTo() &&
299  Check.Subjt.GetTo() < Subjt.GetFrom())
300  Result = eAfter;
301  else if(Check.Query.GetFrom() > Query.GetFrom() &&
302  Check.Subjt.GetFrom() > Subjt.GetFrom())
303  Result = eAbove;
304  else if(Check.Query.GetTo() < Query.GetTo() &&
305  Check.Subjt.GetTo() < Subjt.GetTo())
306  Result = eUnder;
307  }
308 
309  if (IQ || IS)
310  Result = (ERelative)(Result | eIntersects);
311  else if (IQ)
312  Result = (ERelative)(Result | eInterQuery);
313  else if (IS)
314  Result = (ERelative)(Result | eInterSubjt);
315 
316  //end:
317  //if(Result == eWtf) {
318  // ERR_POST(Info << "CEquivRange::CalcRelative:: Got a eWTF ("
319  // << *this << ") vs (" << Check << ")" );
320  //}
321  //else {
322  // ERR_POST(Info << *this << " vs " << Check << " result: " << Result);
323  //}
324  //
325 
326  return Result;
327 }
328 
329 
331 {
332  TSeqRange QI = A.Query.IntersectionWith(B.Query);
333  TSeqRange SI = A.Subjt.IntersectionWith(B.Subjt);
334 
335  TSeqPos QD=0, SD=0;
336  if(QI.Empty()) {
337  if( A.Query.GetFrom() >= B.Query.GetTo() ) {
338  QD = A.Query.GetFrom() - B.Query.GetTo();
339  } else {
340  QD = B.Query.GetFrom() - A.Query.GetTo();
341  }
342  }
343  if(SI.Empty()) {
344  if( A.Subjt.GetFrom() >= B.Subjt.GetTo() ) {
345  SD = A.Subjt.GetFrom() - B.Subjt.GetTo();
346  } else {
347  SD = B.Subjt.GetFrom() - A.Subjt.GetTo();
348  }
349  }
350  TSeqPos D = QD + SD;
351 
352  TSignedSeqPos AINT = A.Intercept; //s_SeqAlignIntercept(A);
353  TSignedSeqPos BINT = B.Intercept; //s_SeqAlignIntercept(B);
354  TSeqPos DeltaInt = abs(AINT - BINT);
355 
356  return max(D, DeltaInt);
357 }
358 
360 {
361  TSeqRange AQR, ASR, BQR, BSR;
362  ITERATE(TEquivList, EquivIter, A) {
363  AQR += EquivIter->Query;
364  ASR += EquivIter->Subjt;
365  }
366  ITERATE(TEquivList, EquivIter, B) {
367  BQR += EquivIter->Query;
368  BSR += EquivIter->Subjt;
369  }
370 
371  TSeqRange QI = AQR.IntersectionWith(BQR);
372  TSeqRange SI = ASR.IntersectionWith(BSR);
373 
374  TSeqPos QD=0, SD=0;
375  if(QI.Empty()) {
376  if( AQR.GetFrom() >= BQR.GetTo() ) {
377  QD = AQR.GetFrom() - BQR.GetTo();
378  } else {
379  QD = BQR.GetFrom() - AQR.GetTo();
380  }
381  }
382  if(SI.Empty()) {
383  if( ASR.GetFrom() >= BSR.GetTo() ) {
384  SD = ASR.GetFrom() - BSR.GetTo();
385  } else {
386  SD = BSR.GetFrom() - ASR.GetTo();
387  }
388  }
389  TSeqPos D = QD + SD;
390 
391  TSignedSeqPos AINT = A.front().Intercept;
392  TSignedSeqPos BINT = B.front().Intercept;
393  TSeqPos DeltaInt = abs(AINT - BINT);
394 
395  return max(D, DeltaInt);
396 }
397 
398 
399 
401  : x_GAlignId(0), x_GSegmtId(0), x_GSplitId(0)
402 {
403  ;
404 }
405 
406 bool
408  TEquivList& Splits)
409 {
410  TEquivList Sources, Equivs;
411 
412  bool MadeCuts = false;
413  do {
414  MadeCuts = false;
415 
416  if(Equivs.empty()) {
417  ITERATE(TEquivList, OrigIter, Originals) {
418  //Sources.insert(*OrigIter);
419  Sources.push_back(*OrigIter);
420  }
421  TEquivList::iterator NE;
422  sort(Sources.begin(), Sources.end());
423  NE = unique(Sources.begin(), Sources.end());
424  Sources.erase(NE, Sources.end());
425  }
426  else {
427  Sources.clear();
428  Equivs.swap(Sources);
429  Equivs.clear();
430  }
431 
432  //cerr << "IS: " << Equivs.size() << " : " << Sources.size() << endl;
433  typedef vector<TSeqPos> TPointContainer;
434  //typedef set<TSeqPos> TPointContainer;
435  TPointContainer QueryPoints, SubjtPoints;
436  QueryPoints.reserve(Sources.size()*2);
437  SubjtPoints.reserve(Sources.size()*2);
438 
439  ITERATE(vector<CEquivRange>, SourceIter, Sources) {
440  QueryPoints.push_back(SourceIter->Query.GetFrom());
441  QueryPoints.push_back(SourceIter->Query.GetTo()+1);
442  SubjtPoints.push_back(SourceIter->Subjt.GetFrom());
443  SubjtPoints.push_back(SourceIter->Subjt.GetTo()+1);
444  }
445 
446  {{
447  vector<TSeqPos>::iterator NE;
448  sort(QueryPoints.begin(), QueryPoints.end());
449  NE = unique(QueryPoints.begin(), QueryPoints.end());
450  QueryPoints.erase(NE, QueryPoints.end());
451  sort(SubjtPoints.begin(), SubjtPoints.end());
452  NE = unique(SubjtPoints.begin(), SubjtPoints.end());
453  SubjtPoints.erase(NE, SubjtPoints.end());
454  }}
455 
456 
457  ITERATE(TEquivList, SourceIter, Sources) {
458  const CEquivRange& Equiv = *SourceIter;
459 
460  size_t JumpTo = 0;
461  {{
462  size_t F = 0, T = QueryPoints.size()-1;
463  size_t I = 0;
464  while(true) {
465  if(F+1 >= T) {
466  I = F;
467  break;
468  }
469  I = (F+((T-F)/2));
470  TSeqPos IV = QueryPoints[I];
471  //cerr << " F " << F << " T " << T << " I " << I << " IV " << IV << endl;
472  if( IV == Equiv.Query.GetFrom()) {
473  break;
474  } else if( Equiv.Query.GetFrom() < IV ) {
475  T = I;
476  } else {
477  F = I;
478  }
479  }
480  JumpTo = I;
481  }}
482  //cerr << "Q JumpTo: " << JumpTo << "\t" << Equiv << endl;
483 
484  //ITERATE(TPointContainer, StartIter, QueryPoints) {
485  for(TPointContainer::const_iterator StartIter = QueryPoints.begin()+JumpTo;
486  StartIter != QueryPoints.end(); ++StartIter) {
487 
488  TPointContainer::const_iterator Next = StartIter;
489  ++Next;
490  if(Next == QueryPoints.end())
491  continue;
492 
493  //CRange<TSeqPos> Range(*StartIter, *Next-1);
494  CRange<TSeqPos> Range(*StartIter, *Next-1);
495  Range.SetFrom(*StartIter);
496  Range.SetTo(*Next-1);
497 
498  if(Range.Empty())
499  continue;
500 
501  if(Range == Equiv.Query) {
502  Equivs.push_back(Equiv);
503  } else if(Range.IntersectingWith(Equiv.Query)) {
504  CEquivRange Temp = SliceOnQuery(Equiv, Range);
505  //if(Temp.NotEmpty() && Temp.Matches > 0) {
506  if(Temp.NotEmpty()) {
507  Equivs.push_back(Temp);
508  MadeCuts = true;
509  }
510  }
511 
512  if(*StartIter > Equiv.Query.GetTo()) {
513  break;
514  }
515  }
516 
517  {{
518  size_t F = 0, T = SubjtPoints.size()-1;
519  size_t I = 0;
520  while(true) {
521  if(F+1 >= T) {
522  I = F;
523  break;
524  }
525  I = (F+((T-F)/2));
526  TSeqPos IV = SubjtPoints[I];
527  if( IV == Equiv.Subjt.GetFrom()) {
528  break;
529  } else if( Equiv.Subjt.GetFrom() < IV ) {
530  T = I;
531  } else {
532  F = I;
533  }
534  }
535  JumpTo = I;
536  }}
537 
538  //ITERATE(TPointContainer, StartIter, SubjtPoints) {
539  for(TPointContainer::const_iterator StartIter = SubjtPoints.begin()+JumpTo;
540  StartIter != SubjtPoints.end(); ++StartIter) {
541 
542  TPointContainer::const_iterator Next = StartIter;
543  ++Next;
544  if(Next == SubjtPoints.end())
545  continue;
546 
547  //CRange<TSeqPos> Range(*StartIter, *Next-1);
548  CRange<TSeqPos> Range(*StartIter, *Next-1);
549  Range.SetFrom(*StartIter);
550  Range.SetTo(*Next-1);
551 
552 
553  if(Range.Empty())
554  continue;
555 
556  if(Range == Equiv.Subjt) {
557  Equivs.push_back(Equiv);
558  } else if(Range.IntersectingWith(Equiv.Subjt)) {
559  CEquivRange Temp = SliceOnSubjt(Equiv, Range);
560  //if(Temp.NotEmpty() && Temp.Matches > 0) {
561  if(Temp.NotEmpty()) {
562  Equivs.push_back(Temp);
563  MadeCuts = true;
564  }
565  }
566  if(*StartIter > Equiv.Subjt.GetTo()) {
567  break;
568  }
569  }
570  }
571 
572  /*
573  cerr << "Q" << endl;
574  cerr << CTime(CTime::eCurrent).AsString("m:s.l") << endl;
575  TEquivList::iterator LastSourceStart;
576  LastSourceStart = Sources.end();
577  int LCI = 0;
578  LCI = 0;
579  ITERATE(TPointContainer, StartIter, QueryPoints) {
580 
581  TPointContainer::const_iterator Next = StartIter;
582  ++Next;
583  if(Next == QueryPoints.end())
584  continue;
585 
586  CRange<TSeqPos> Range(*StartIter, *Next-1);
587 
588  if(Range.Empty())
589  continue;
590 
591 
592  TEquivList::iterator SourceIter = LastSourceStart;
593  LastSourceStart = Sources.end();
594  if(SourceIter == Sources.end())
595  SourceIter = Sources.begin();
596  size_t CI = LCI;
597  LCI = 0;
598  //ITERATE(vector<CEquivRange>, SourceIter, Sources) {
599  for( ; SourceIter != Sources.end(); ++SourceIter) {
600 
601  if(*Next < SourceIter->Query.GetFrom()) {
602  continue;
603  }
604 
605  if (LastSourceStart == Sources.end()) {
606  LastSourceStart = SourceIter;
607  LCI = CI;
608  }
609 
610  if(Range == SourceIter->Query) {
611  cerr << "QE : CI : " << CI << "\t" << Range << "\t" << SourceIter->Query << endl;
612  Equivs.push_back(*SourceIter);
613  } else if(Range.IntersectingWith(SourceIter->Query)) {
614  cerr << "QI : CI : " << CI << "\t" << Range << "\t" << SourceIter->Query << endl;
615  CEquivRange Temp = SourceIter->SliceOnQuery(Range);
616  if(Temp.NotEmpty()) {
617  //Equivs.insert(Temp);
618  Equivs.push_back(Temp);
619  }
620  } else if( *StartIter > SourceIter->Query.GetFrom() &&
621  *StartIter > LastSourceStart->Query.GetFrom() ) {
622  LastSourceStart = SourceIter;
623  ++LastSourceStart;
624  LCI = CI + 1;
625  break;
626  }
627  CI++;
628  }
629  }
630 
631  sort(Sources.begin(), Sources.end(), s_SortEquivBySubjt);
632 
633  cerr << "S" << endl;
634  cerr << CTime(CTime::eCurrent).AsString("m:s.l") << endl;
635  LastSourceStart = Sources.end();
636  ITERATE(TPointContainer, StartIter, SubjtPoints) {
637 
638  TPointContainer::const_iterator Next = StartIter;
639  ++Next;
640  if(Next == SubjtPoints.end())
641  continue;
642 
643  CRange<TSeqPos> Range(*StartIter, *Next-1);
644 
645  if(Range.Empty())
646  continue;
647 
648  TEquivList::iterator SourceIter = LastSourceStart;
649  LastSourceStart = Sources.end();
650  if(SourceIter == Sources.end())
651  SourceIter = Sources.begin();
652 
653  //ITERATE(vector<CEquivRange>, SourceIter, Sources) {
654  for( ; SourceIter != Sources.end(); ++SourceIter) {
655 
656  if(*Next < SourceIter->Subjt.GetFrom()) {
657  continue;
658  }
659 
660  if (LastSourceStart == Sources.end()) {
661  LastSourceStart = SourceIter;
662  }
663 
664  if(Range == SourceIter->Subjt) {
665  Equivs.push_back(*SourceIter);
666  } else if(Range.IntersectingWith(SourceIter->Subjt)) {
667  CEquivRange Temp = SourceIter->SliceOnSubjt(Range);
668  if(Temp.NotEmpty()) {
669  //Equivs.insert(Temp);
670  Equivs.push_back(Temp);
671  }
672  } else if( *StartIter > SourceIter->Query.GetFrom() &&
673  *StartIter > LastSourceStart->Query.GetFrom() ) {
674  LastSourceStart = SourceIter;
675  ++LastSourceStart;
676  continue;
677  }
678  }
679  }
680  */
681 
682 
683  {{
684  TEquivList::iterator NE;
685  sort(Equivs.begin(), Equivs.end());
686  NE = unique(Equivs.begin(), Equivs.end());
687  Equivs.erase(NE, Equivs.end());
688  }}
689  //} while(Equivs.size() > Sources.size());
690  } while(MadeCuts);
691 
692  ITERATE(vector<CEquivRange>, EquivIter, Equivs) {
693  Splits.push_back(*EquivIter);
694  }
695 
696  return !Splits.empty();
697 }
698 
699 bool
701  TEquivList& Merges)
702 {
703  bool MadeChanges = false;
704  do {
705 
706  TEquivList Sources;
707  Sources.clear();
708  if(Merges.empty())
709  Sources.insert(Sources.end(), Originals.begin(), Originals.end());
710  else
711  Sources.insert(Sources.end(), Merges.begin(), Merges.end());
712  {{
713  TEquivList::iterator NE;
714  sort(Sources.begin(), Sources.end(), s_SortEquivBySubjt);
715  NE = unique(Sources.begin(), Sources.end());
716  Sources.erase(NE, Sources.end());
717  }}
718 
719  MadeChanges = false;
720  Merges.clear();
721 
722  CEquivRange Curr;
723  ITERATE(TEquivList, SourceIter, Sources) {
724  if(SourceIter->Empty()) {
725  continue;
726  }
727 
728  if(Curr.Empty()) {
729  Curr = *SourceIter;
730  continue;
731  }
732 
733  if (Curr.AbuttingWith(*SourceIter) &&
734  Curr.AlignId == SourceIter->AlignId &&
735  Curr.SegmtId == SourceIter->SegmtId) {
736  Curr = Merge(Curr, *SourceIter);
737  MadeChanges = true;
738  continue;
739  }
740 
741  else {
742  Merges.push_back(Curr);
743  Curr = *SourceIter;
744  }
745 
746  }
747  if(!Curr.Empty())
748  Merges.push_back(Curr);
749 
750  } while(MadeChanges);
751 
752  return true;
753 }
754 
755 
757  TEquivList& Equivs)
758 {
759  // FIXME: Add support for more than Denseg and Disc-of-Denseg
760  if(Align.GetSegs().IsDisc()) {
761  ITERATE(CSeq_align_set::Tdata, AlignIter,
762  Align.GetSegs().GetDisc().Get()) {
763  ExtractRangesFromSeqAlign(**AlignIter, Equivs);
764  }
765  return;
766  }
767 
768  const CDense_seg& Denseg = Align.GetSegs().GetDenseg();
769 
770  int SegCount = Denseg.GetNumseg();
771 
772  //cout << MSerial_AsnText << Denseg;
773  for(int Loop = 0; Loop < SegCount; Loop++) {
774  size_t StartIndex = (Loop*Denseg.GetDim());
775 
776  if (Denseg.GetStarts()[StartIndex] == -1 ||
777  Denseg.GetStarts()[StartIndex+1] == -1)
778  continue;
779 
780  CEquivRange Curr;
781  Curr.Query.SetFrom(Denseg.GetStarts()[StartIndex]);
782  Curr.Query.SetLength(Denseg.GetLens()[Loop]);
783 
784  Curr.Subjt.SetFrom(Denseg.GetStarts()[StartIndex+1]);
785  Curr.Subjt.SetLength(Denseg.GetLens()[Loop]);
786 
787  if(Denseg.CanGetStrands() && Denseg.GetStrands().size() > StartIndex)
788  Curr.Strand = Denseg.GetStrands()[StartIndex];
789  else
790  Curr.Strand = eNa_strand_plus;
791 
792  {{
793  // Enforce Query Strand can vary, but SubjtStrand is Plus
794  ENa_strand SubjtStrand = eNa_strand_plus;
795  if(Denseg.CanGetStrands() && Denseg.GetStrands().size() > StartIndex+1)
796  SubjtStrand = Denseg.GetStrands()[StartIndex+1];
797  if(SubjtStrand == eNa_strand_minus) {
799  }
800  }}
801 
802  if(Curr.Strand == eNa_strand_plus)
803  Curr.Intercept = int(Curr.Subjt.GetFrom()) - int(Curr.Query.GetFrom());
804  else
805  Curr.Intercept = Curr.Subjt.GetTo() + Curr.Query.GetFrom();
806 
807  Curr.Matches = 0;
808  Curr.MisMatches = 0;
809 
810  Curr.AlignId = x_GAlignId;
811  Curr.SegmtId = x_GSegmtId; x_GSegmtId++;
812  Curr.SplitId = x_GSplitId; x_GSplitId++;
813 
814  Equivs.push_back(Curr);
815  //cout << Curr.Query << " to " << Curr.Subjt << endl;
816  }
817 
818  x_GAlignId++;
819 }
820 
821 
822 void CEquivRangeBuilder::CalcMatches(objects::CBioseq_Handle QueryHandle,
823  objects::CBioseq_Handle SubjtHandle,
824  TEquivList& Equivs)
825 {
826  if(Equivs.empty())
827  return;
828 
829  CSeqVector QueryVec(QueryHandle, CBioseq_Handle::eCoding_Iupac, Equivs.front().Strand);
830  //CSeqVector QueryVec(QueryHandle, CBioseq_Handle::eCoding_Iupac, eNa_strand_plus);
832 
833 
834  TSeqRange QRange, SRange;
835  NON_CONST_ITERATE(TEquivList, EquivIter, Equivs) {
836  CEquivRange& CurrEquiv = *EquivIter;
837  QRange += CurrEquiv.Query;
838  SRange += CurrEquiv.Subjt;
839  }
840 
841  string QueryStr, SubjtStr;
842  if(Equivs.front().Strand == eNa_strand_plus)
843  QueryVec.GetSeqData(QRange.GetFrom(), QRange.GetTo()+1, QueryStr);
844  else
845  QueryVec.GetSeqData(QueryVec.size()-QRange.GetTo()-1,
846  QueryVec.size()-QRange.GetFrom(), QueryStr);
847  SubjtVec.GetSeqData(SRange.GetFrom(), SRange.GetTo()+1, SubjtStr);
848 
849  NON_CONST_ITERATE(TEquivList, EquivIter, Equivs) {
850  CEquivRange& CurrEquiv = *EquivIter;
851 
852  //cerr << "CR: " << CurrEquiv.Query << "\t" << CurrEquiv.Subjt << endl;
853 
854  CurrEquiv.Matches = 0;
855  CurrEquiv.MisMatches = 0;
856 
857  size_t QPOffset = CurrEquiv.Query.GetFrom()-QRange.GetFrom();
858  size_t QMOffset = QRange.GetTo()-CurrEquiv.Query.GetTo();
859 
860  size_t QOffset = (CurrEquiv.Strand == eNa_strand_plus ? QPOffset : QMOffset);
861  size_t SOffset = CurrEquiv.Subjt.GetFrom()-SRange.GetFrom();
862 
863  for(size_t Loop = 0; Loop < CurrEquiv.Subjt.GetLength(); Loop++) {
864  size_t QLoop = QOffset+Loop;
865  size_t SLoop = SOffset+Loop;
866  //if(CurrEquiv.Strand == eNa_strand_minus)
867  // QLoop = (QRange.GetTo()-CurrEquiv.Query.GetTo()) + Loop;
868  //QLoop = QueryStr.length() - QLoop - 1;
869  // cerr << "CI: " << QLoop << "\t" << SLoop << endl;
870  if(QueryStr[QLoop] == SubjtStr[SLoop]) {
871  CurrEquiv.Matches++;
872  } else {
873  CurrEquiv.MisMatches++;
874  CurrEquiv.MisMatchSubjtPoints.push_back(CurrEquiv.Subjt.GetFrom()+Loop);
875  }
876  }
877 
878  // Windowing experiment
879  /*
880  const int WIN_LEN = 100;
881  list<size_t> CutPoints;
882  list<double> PctIdents;
883  for(size_t LenLoop = 0; LenLoop < CurrEquiv.Subjt.GetLength(); LenLoop++) {
884  if(LenLoop + WIN_LEN >= CurrEquiv.Subjt.GetLength())
885  break;
886 
887  size_t SLenPoint = CurrEquiv.Subjt.GetFrom()+LenLoop;
888 
889  size_t CountMM = 0;
890  for(size_t WinLoop = 0; WinLoop < WIN_LEN; WinLoop++) {
891  if( find(CurrEquiv.MisMatchSubjtPoints.begin(),
892  CurrEquiv.MisMatchSubjtPoints.end(),
893  SLenPoint+WinLoop) != CurrEquiv.MisMatchSubjtPoints.end() ) {
894  CountMM++;
895  }
896  }
897  PctIdents.push_back( float(WIN_LEN-CountMM)/float(WIN_LEN) );
898  }
899  double Mean = s_CalcMean(PctIdents);
900  double StdDev = s_CalcStdDev(PctIdents, Mean);
901  list<double> Devs;
902  s_CalcDevs(PctIdents, Mean, StdDev, Devs);
903  cout << "Mean: " << Mean << " StdDev: " << StdDev << endl;
904  list<double>::iterator DevsIter = Devs.begin();
905  double Prev = *DevsIter;
906  CutPoints.push_back(CurrEquiv.Subjt.GetFrom());
907  for(size_t LenLoop = 0; LenLoop < CurrEquiv.Subjt.GetLength(); LenLoop++) {
908  if(LenLoop + WIN_LEN >= CurrEquiv.Subjt.GetLength())
909  break;
910 
911  size_t SLenPoint = CurrEquiv.Subjt.GetFrom()+LenLoop;
912  cout << "WINDOW THINGY: " << SLenPoint << "\t" << *DevsIter;
913  if( (Prev >= 1.0 && *DevsIter <= 1.0) ||
914  (Prev <= 1.0 && *DevsIter >= 1.0) ) {
915  if(Prev > *DevsIter) {
916  cout << "\t" << "TRANSITION_HL";
917  CutPoints.push_back(SLenPoint);
918  } else {
919  cout << "\t" << "TRANSITION_LH";
920  CutPoints.push_back(SLenPoint+WIN_LEN-1);
921  }
922  }
923  cout << endl;
924  Prev = *DevsIter;
925  ++DevsIter;
926  }
927  CutPoints.push_back(CurrEquiv.Subjt.GetTo()+1);
928  ITERATE(list<size_t>, CutIter, CutPoints) {
929  cout << *CutIter << endl;
930  }*/
931 
932 
933  //cerr << "MM: " << CurrEquiv.Matches << "\t" << CurrEquiv.MisMatches << endl;
934  //if(Counter % 1000 == 0)
935  // cerr << "CalcMatches: " << Counter << endl;
936  //Counter++;
937  }
938 
939 }
940 
941 
944 {
945  CEquivRange Slice;
946  CRange<TSeqPos> Inter = Original.Query.IntersectionWith(pQuery);
947 
948  Slice.Strand = Original.Strand;
949  Slice.Intercept = Original.Intercept;
950  Slice.Matches = 0;
951  Slice.MisMatches = 0;
952  if(Inter.Empty()) {
953  Slice.Query = Inter;
954  return Slice;
955  } else if(Original.Query == Inter) {
956  return Original;
957  }
958 
959  Slice.Query = Inter;
960  TSeqPos SOffset = Inter.GetFrom() - Original.Query.GetFrom();
961 
962  if(Original.Strand == eNa_strand_plus) {
963  Slice.Subjt.SetFrom(Original.Subjt.GetFrom() + SOffset);
964  Slice.Subjt.SetLength(Inter.GetLength());
965  } else {
966  Slice.Subjt.SetTo(Original.Subjt.GetTo() - SOffset);
967  Slice.Subjt.SetLengthDown(Inter.GetLength());
968  }
969 
970  ITERATE(vector<TSeqPos>, MisPointIter, Original.MisMatchSubjtPoints) {
971  if (Slice.Subjt.GetFrom() <= *MisPointIter &&
972  Slice.Subjt.GetTo() >= *MisPointIter) {
973  Slice.MisMatchSubjtPoints.push_back(*MisPointIter);
974  }
975  }
976  Slice.MisMatches = Slice.MisMatchSubjtPoints.size();
977  Slice.Matches = Slice.Subjt.GetLength() - Slice.MisMatches;
978 
979  Slice.AlignId = Original.AlignId;
980  Slice.SegmtId = Original.SegmtId;
981  Slice.SplitId = x_GSplitId; x_GSplitId++;
982 
983  return Slice;
984 }
985 
986 
989 {
990  CEquivRange Slice;
991  CRange<TSeqPos> Inter = Original.Subjt.IntersectionWith(pSubjt);
992 
993  Slice.Strand = Original.Strand;
994  Slice.Intercept = Original.Intercept;
995  Slice.Matches = 0;
996  Slice.MisMatches = 0;
997  if(Inter.Empty()) {
998  Slice.Subjt = Inter;
999  return Slice;
1000  } else if(Original.Subjt == Inter) {
1001  return Original;
1002  }
1003 
1004  Slice.Subjt = Inter;
1005  TSeqPos QOffset = Inter.GetFrom() - Original.Subjt.GetFrom();
1006 
1007  if(Original.Strand == eNa_strand_plus) {
1008  Slice.Query.SetFrom(Original.Query.GetFrom() + QOffset);
1009  Slice.Query.SetLength(Inter.GetLength());
1010  } else {
1011  Slice.Query.SetTo(Original.Query.GetTo() - QOffset);
1012  Slice.Query.SetLengthDown(Inter.GetLength());
1013  }
1014 
1015  ITERATE(vector<TSeqPos>, MisPointIter, Original.MisMatchSubjtPoints) {
1016  if (Slice.Subjt.GetFrom() <= *MisPointIter &&
1017  Slice.Subjt.GetTo() >= *MisPointIter) {
1018  Slice.MisMatchSubjtPoints.push_back(*MisPointIter);
1019  }
1020  }
1021  Slice.MisMatches = Slice.MisMatchSubjtPoints.size();
1022  Slice.Matches = Slice.Subjt.GetLength() - Slice.MisMatches;
1023 
1024  Slice.AlignId = Original.AlignId;
1025  Slice.SegmtId = Original.SegmtId;
1026  Slice.SplitId = x_GSplitId; x_GSplitId++;
1027 
1028  return Slice;
1029 }
1030 
1031 
1032 CEquivRange
1034 {
1035  CEquivRange Merged;
1036 
1037  Merged.Query = First.Query + Second.Query;
1038  Merged.Subjt = First.Subjt + Second.Subjt;
1039 
1040  Merged.Strand = First.Strand;
1041  Merged.Intercept = First.Intercept;
1042  Merged.Matches = First.Matches + Second.Matches;
1043  Merged.MisMatches = First.MisMatches + Second.MisMatches;
1044 
1045  Merged.MisMatchSubjtPoints.insert(Merged.MisMatchSubjtPoints.end(),
1046  First.MisMatchSubjtPoints.begin(), First.MisMatchSubjtPoints.end());
1047  Merged.MisMatchSubjtPoints.insert(Merged.MisMatchSubjtPoints.end(),
1048  Second.MisMatchSubjtPoints.begin(), Second.MisMatchSubjtPoints.end());
1049 
1050  Merged.AlignId = First.AlignId;
1051  Merged.SegmtId = First.SegmtId;
1052  Merged.SplitId = x_GSplitId; x_GSplitId++;
1053 
1054 
1055  if(Merged.Query.GetLength() != Merged.Subjt.GetLength())
1056  cerr << __LINE__ << " ERROR" << endl;
1057  if(Merged.MisMatchSubjtPoints.size() != (size_t)Merged.MisMatches)
1058  cerr << __LINE__ << " ERROR" << endl;
1059  if(Merged.Query.GetLength() != (size_t)(Merged.Matches + Merged.MisMatches))
1060  cerr << __LINE__ << " ERROR" << endl;
1061 
1062  return Merged;
1063 }
1064 
1065 
1066 
void CalcMatches(objects::CBioseq_Handle QueryHandle, objects::CBioseq_Handle SubjtHandle, TEquivList &Equivs)
CEquivRange SliceOnSubjt(const CEquivRange &Original, const CRange< TSeqPos > &pSubjt)
bool MergeAbuttings(const TEquivList &Originals, TEquivList &Merges)
void ExtractRangesFromSeqAlign(const objects::CSeq_align &Alignment, TEquivList &Equivs)
CEquivRange Merge(const CEquivRange &First, const CEquivRange &Second)
CEquivRange SliceOnQuery(const CEquivRange &Original, const CRange< TSeqPos > &pQuery)
bool SplitIntersections(const TEquivList &Originals, TEquivList &Splits)
ERelative CalcRelative(const CEquivRange &Check) const
bool AbuttingWith(const CEquivRange &Other) const
Definition: equiv_range.hpp:81
ERelative CalcRelativeDuo(const CEquivRange &Check) const
bool Empty() const
Definition: equiv_range.hpp:68
static TSeqPos Distance(const CEquivRange &A, const CEquivRange &B)
bool NotEmpty() const
Definition: equiv_range.hpp:72
objects::ENa_strand Strand
Definition: equiv_range.hpp:58
vector< TSeqPos > MisMatchSubjtPoints
Definition: equiv_range.hpp:62
CRange< TSeqPos > Subjt
Definition: equiv_range.hpp:57
CRange< TSeqPos > Query
Definition: equiv_range.hpp:56
bool IntersectingWith(const CEquivRange &Other) const
Definition: equiv_range.hpp:76
CSeqVector –.
Definition: seq_vector.hpp:65
#define T(s)
Definition: common.h:230
void s_CalcDevs(const list< T > &Values, T Mean, double StdDev, list< double > &Devs)
Definition: equiv_range.cpp:82
bool s_SortEquivBySubjt(const CEquivRange &A, const CEquivRange &B)
bool operator<(const CEquivRange &A, const CEquivRange &B)
double s_CalcStdDev(const list< T > &List, T Mean)
Definition: equiv_range.cpp:69
T s_CalcMean(const list< T > &List)
Definition: equiv_range.cpp:56
string s_RelToStr(CEquivRange::ERelative rel)
USING_SCOPE(ncbi)
bool operator==(const CEquivRange &A, const CEquivRange &B)
CNcbiOstream & operator<<(CNcbiOstream &out, const CEquivRange &range)
Definition: equiv_range.cpp:96
vector< CEquivRange > TEquivList
Definition: equiv_range.hpp:48
std::ofstream out("events_result.xml")
main entry point for tests
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
TSeqPos size(void) const
Definition: seq_vector.hpp:291
position_type GetLength(void) const
Definition: range.hpp:158
TThisType & SetLengthDown(position_type length)
Definition: range.hpp:204
bool IntersectingWith(const TThisType &r) const
Definition: range.hpp:331
TThisType IntersectionWith(const TThisType &r) const
Definition: range.hpp:312
TThisType & SetLength(position_type length)
Definition: range.hpp:194
bool Empty(void) const
Definition: range.hpp:148
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
void SetFrom(TFrom value)
Assign a value to From data member.
Definition: Range_.hpp:231
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
void SetTo(TTo value)
Assign a value to To data member.
Definition: Range_.hpp:278
const TDenseg & GetDenseg(void) const
Get the variant data.
Definition: Seq_align_.cpp:153
const TStarts & GetStarts(void) const
Get the Starts member data.
Definition: Dense_seg_.hpp:530
const TLens & GetLens(void) const
Get the Lens member data.
Definition: Dense_seg_.hpp:555
TDim GetDim(void) const
Get the Dim member data.
Definition: Dense_seg_.hpp:421
bool IsDisc(void) const
Check if variant Disc is selected.
Definition: Seq_align_.hpp:772
bool CanGetStrands(void) const
Check if it is safe to call GetStrands method.
Definition: Dense_seg_.hpp:574
TNumseg GetNumseg(void) const
Get the Numseg member data.
Definition: Dense_seg_.hpp:465
list< CRef< CSeq_align > > Tdata
const TDisc & GetDisc(void) const
Get the variant data.
Definition: Seq_align_.cpp:197
const TStrands & GetStrands(void) const
Get the Strands member data.
Definition: Dense_seg_.hpp:580
const Tdata & Get(void) const
Get the member data.
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
range(_Ty, _Ty) -> range< _Ty >
constexpr auto sort(_Init &&init)
Magic spell ;-) needed for some weird compilers... very empiric.
#define fabs(v)
Definition: ncbi_dispd.c:46
#define abs(a)
Definition: ncbi_heapmgr.c:130
#define F(x)
Make a parametrized function appear to have only one variable.
Definition: ncbi_math.c:342
T max(T x_, T y_)
#define A
#define BSR(x)
#define B
#define D(d)
#define SD
Modified on Fri Sep 20 14:57:31 2024 by modify_doxy.py rev. 669887