NCBI C++ ToolKit
genomic_compart.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: genomic_compart.cpp 95294 2021-11-03 13:26:14Z dicuccio $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Mike DiCuccio
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistd.hpp>
34 #include <util/range.hpp>
35 
39 
42 
43 //#define _VERBOSE_DEBUG
44 
45 
47  const pair<TSeqRange, TSeqRange>& r2)
48 {
49  bool is_intersecting =
50  r1.first.IntersectingWith(r2.first);
51 
52 #ifdef _VERBOSE_DEBUG
53  cerr << r1.first
54  << " x "
55  << r2.first
56  << ": is_intersecting = "
57  << (is_intersecting ? "true" : "false")
58  << endl;
59 #endif
60 
61  return is_intersecting;
62 }
63 
64 bool IsIntersectingSubject(const pair<TSeqRange, TSeqRange>& r1,
65  const pair<TSeqRange, TSeqRange>& r2)
66 {
67  bool is_intersecting =
68  r1.second.IntersectingWith(r2.second);
69 
70 #ifdef _VERBOSE_DEBUG
71  cerr << r1.second
72  << " x "
73  << r2.second
74  << ": is_intersecting = "
75  << (is_intersecting ? "true" : "false")
76  << endl;
77 #endif
78 
79  return is_intersecting;
80 }
81 
82 bool IsConsistent(const pair<TSeqRange, TSeqRange>& r1,
83  const pair<TSeqRange, TSeqRange>& r2,
84  ENa_strand s1, ENa_strand s2)
85 {
86  bool is_consistent = false;
87  if (SameOrientation(s1, s2)) {
88  is_consistent = (r1.first <= r2.first && r1.second <= r2.second) ||
89  (r2.first <= r1.first && r2.second <= r1.second);
90  }
91  else {
92  is_consistent = (r1.first <= r2.first && r2.second <= r1.second) ||
93  (r2.first <= r1.first && r1.second <= r2.second);
94  }
95 
96 #ifdef _VERBOSE_DEBUG
97  cerr << "[("
98  << r1.first << ", "
99  << r1.second
100  << ", " << (s1 == eNa_strand_minus ? '-' : '+') << ")"
101  << ":" << s1
102  << " x ("
103  << r2.first << ", "
104  << r2.second
105  << ", " << (s2 == eNa_strand_minus ? '-' : '+') << ")"
106  << ":" << s2
107  << ": is_consistent = "
108  << (is_consistent ? "true" : "false")
109  << " r1.first <= r2.first: "
110  << (r1.first <= r2.first ? "true" : "false")
111  << " r1.second <= r2.second: "
112  << (r1.second <= r2.second ? "true" : "false")
113  << ']';
114 #endif
115 
116  return is_consistent;
117 }
118 
119 
120 TSeqPos Difference(const pair<TSeqRange, TSeqRange>& r1,
121  const pair<TSeqRange, TSeqRange>& r2,
122  ENa_strand s1, ENa_strand s2)
123 {
124  TSeqPos diff = 0;
125 
126  if (r1.first.GetTo() < r2.first.GetFrom()) {
127  diff += r2.first.GetFrom() - r1.first.GetTo();
128  }
129  else if (r2.first.GetTo() < r1.first.GetFrom()) {
130  diff += r1.first.GetFrom() - r2.first.GetTo();
131  }
132 
133  if (r1.second.GetTo() < r2.second.GetFrom()) {
134  diff += r2.second.GetFrom() - r1.second.GetTo();
135  }
136  else if (r2.second.GetTo() < r1.second.GetFrom()) {
137  diff += r1.second.GetFrom() - r2.second.GetTo();
138  }
139 
140  /**
141  if (s1 == eNa_strand_minus) {
142  if (r2.first.GetTo() < r1.first.GetFrom()) {
143  diff += r1.first.GetFrom() - r2.first.GetTo();
144  }
145  } else {
146  if (r1.first.GetTo() < r2.first.GetFrom()) {
147  diff += r2.first.GetFrom() - r1.first.GetTo();
148  }
149  }
150  if (s2 == eNa_strand_minus) {
151  if (r2.second.GetTo() < r1.second.GetFrom()) {
152  diff += r1.second.GetFrom() - r2.second.GetTo();
153  }
154  } else {
155  if (r1.second.GetTo() < r2.second.GetFrom()) {
156  diff += r2.second.GetFrom() - r1.second.GetTo();
157  }
158  }
159  **/
160 
161 #ifdef _VERBOSE_DEBUG
162  cerr << "("
163  << r1.first << ", "
164  << r1.second << ")"
165  << " x ("
166  << r2.first << ", "
167  << r2.second << ")"
168  << ": diff = " << diff
169  << endl;
170 #endif
171 
172  return diff;
173 }
174 
175 
176 typedef pair<TSeqRange, TSeqRange> TRange;
177 typedef pair<TRange, CRef<CSeq_align> > TAlignRange;
178 
180 {
181  bool operator() (const TAlignRange& r1,
182  const TAlignRange& r2) const
183  {
184  // First the simple sorts, by query start,
185  // and subject start
186  if(r1.first.first != r2.first.first) {
187  return (r1.first.first < r2.first.first);
188  }
189  if(r1.first.second != r2.first.second) {
190  return (r1.first.second < r2.first.second);
191  }
192 
193  // after these what else to sort by? The goal is simply to stable,
194  // and never sort by alignment pointer.
195  // Return false ought to always be maintain original order,
196  // so as long as the input is stable, this sort won't break it.
197  return false;
198  }
199 };
201 
203 {
204  bool operator() (const TAlignRange& r1,
205  const TAlignRange& r2) const
206  {
207  TSeqPos len1 = max(r1.first.first.GetLength(),
208  r1.first.second.GetLength());
209  TSeqPos len2 = max(r2.first.first.GetLength(),
210  r2.first.second.GetLength());
211  if (len1 > len2) {
212  return true;
213  }
214  if (len2 > len1) {
215  return false;
216  }
217 
218  TSeqRange r1_0 = r1.second->GetSeqRange(0);
219  TSeqRange r2_0 = r2.second->GetSeqRange(0);
220  if (r1_0 < r2_0) {
221  return true;
222  }
223  if (r2_0 < r1_0) {
224  return false;
225  }
226 
227  return r1.second->GetSeqRange(1) < r2.second->GetSeqRange(1);
228  }
229 };
230 
232 {
233  bool operator() (const TAlignRange& r1,
234  const TAlignRange& r2) const
235  {
236  int scores[2] = {0, 0};
237  r1.second->GetNamedScore(CSeq_align::eScore_Score, scores[0]);
238  r2.second->GetNamedScore(CSeq_align::eScore_Score, scores[1]);
239 
240  if (scores[0] > scores[1]) {
241  return true;
242  }
243  if (scores[1] > scores[0]) {
244  return false;
245  }
246 
247  TSeqRange r1_0 = r1.second->GetSeqRange(0);
248  TSeqRange r2_0 = r2.second->GetSeqRange(0);
249  if (r1_0 < r2_0) {
250  return true;
251  }
252  if (r2_0 < r1_0) {
253  return false;
254  }
255 
256  return r1.second->GetSeqRange(1) < r2.second->GetSeqRange(1);
257  }
258 };
259 
261 {
262  bool operator() (const TAlignRange& r1,
263  const TAlignRange& r2) const
264  {
265  double scores[2] = {0.0, 0.0};
266  r1.second->GetNamedScore(CSeq_align::eScore_PercentIdentity_Ungapped, scores[0]);
267  r2.second->GetNamedScore(CSeq_align::eScore_PercentIdentity_Ungapped, scores[1]);
268  if(scores[0] == scores[1]) {
269  TSeqPos len1 = max(r1.first.first.GetLength(),
270  r1.first.second.GetLength());
271  TSeqPos len2 = max(r2.first.first.GetLength(),
272  r2.first.second.GetLength());
273  return len1 > len2;
274  }
275 
276  if (scores[0] > scores[1]) {
277  return true;
278  }
279  if (scores[1] > scores[0]) {
280  return false;
281  }
282 
283  TSeqRange r1_0 = r1.second->GetSeqRange(0);
284  TSeqRange r2_0 = r2.second->GetSeqRange(0);
285  if (r1_0 < r2_0) {
286  return true;
287  }
288  if (r2_0 < r1_0) {
289  return false;
290  }
291 
292  return r1.second->GetSeqRange(1) < r2.second->GetSeqRange(1);
293  }
294 };
295 
297 {
298  bool operator()(const CRef<CSeq_align>& al_ref1,
299  const CRef<CSeq_align>& al_ref2) const
300  {
301  TSeqPos al1_len = al_ref1->GetAlignLength();
302  TSeqPos al2_len = al_ref2->GetAlignLength();
303  if (al1_len > al2_len) {
304  return true;
305  }
306  if (al2_len > al1_len) {
307  return false;
308  }
309 
310  TSeqRange r1_0 = al_ref1->GetSeqRange(0);
311  TSeqRange r2_0 = al_ref2->GetSeqRange(0);
312  if (r1_0 < r2_0) {
313  return true;
314  }
315  if (r2_0 < r1_0) {
316  return false;
317  }
318 
319  return al_ref1->GetSeqRange(1) < al_ref2->GetSeqRange(1);
320  };
321 };
322 
324 {
325  bool operator()(const CRef<CSeq_align>& al_ref1,
326  const CRef<CSeq_align>& al_ref2) const
327  {
328  int scores[2] = {0, 0};
329  al_ref1->GetNamedScore(CSeq_align::eScore_Score, scores[0]);
330  al_ref2->GetNamedScore(CSeq_align::eScore_Score, scores[1]);
331  return scores[0] > scores[1];
332  };
333 };
334 
336 {
337  bool operator()(const CRef<CSeq_align>& al_ref1,
338  const CRef<CSeq_align>& al_ref2) const
339  {
340  double scores[2] = {0.0, 0.0};
343  if(scores[0] == scores[1]) {
344  return al_ref1->GetAlignLength() > al_ref2->GetAlignLength();
345  }
346  return scores[0] > scores[1];
347  };
348 };
349 
351 {
354  {
355  return &*it1 < &*it2;
356  };
357 };
358 
359 
363 
365 
366  bool operator<(const SCompartScore &o) const
367  {
368  if (total_size > o.total_size) {
369  return true;
370  }
371  if (total_size < o.total_size) {
372  return false;
373  }
374  return total_range < o.total_range;
375  }
376 
377  void Reset()
378  {
379  total_size = 0;
380  total_range = TRange();
381  }
382 };
383 
384 
385 void FindCompartments(const list< CRef<CSeq_align> >& aligns,
386  list< CRef<CSeq_align_set> >& align_sets,
387  TCompartOptions options,
388  float diff_len_filter)
389 {
390  //
391  // sort by sequence pair + strand
392  //
393  typedef pair<CSeq_id_Handle, ENa_strand> TIdStrand;
394  typedef pair<TIdStrand, TIdStrand> TIdPair;
395  typedef map<TIdPair, vector< CRef<CSeq_align> > > TAlignments;
396  TAlignments alignments;
397  ITERATE (list< CRef<CSeq_align> >, it, aligns) {
398  /**
399  CRef<CSeq_align> align = *it;
400  if (align->GetSegs().IsDenseg()) {
401  if (align->GetSeqStrand(0) == eNa_strand_minus) {
402  cerr << " before flip: ("
403  << align->GetSeqRange(0) << ", "
404  << (align->GetSeqStrand(0) == eNa_strand_minus ? '-' : '+')
405  << ") - ("
406  << align->GetSeqRange(1) << ", "
407  << (align->GetSeqStrand(1) == eNa_strand_minus ? '-' : '+')
408  << ")"
409  << endl;
410 
411  align->SetSegs().SetDenseg().Reverse();
412 
413  cerr << " after flip: ("
414  << align->GetSeqRange(0) << ", "
415  << (align->GetSeqStrand(0) == eNa_strand_minus ? '-' : '+')
416  << ") - ("
417  << align->GetSeqRange(1) << ", "
418  << (align->GetSeqStrand(1) == eNa_strand_minus ? '-' : '+')
419  << ")"
420  << endl;
421 
422  }
423  }
424  **/
425 
426 
427  CSeq_id_Handle qid = CSeq_id_Handle::GetHandle((*it)->GetSeq_id(0));
428  ENa_strand q_strand = (*it)->GetSeqStrand(0);
429  CSeq_id_Handle sid = CSeq_id_Handle::GetHandle((*it)->GetSeq_id(1));
430  ENa_strand s_strand = (*it)->GetSeqStrand(1);
431 
432  // strand normalization
433  if (q_strand != s_strand && q_strand == eNa_strand_minus) {
434  std::swap(q_strand, s_strand);
435  }
436  else if (q_strand == eNa_strand_minus) {
437  q_strand = eNa_strand_plus;
438  s_strand = eNa_strand_plus;
439  }
440 
441 
442  TIdPair p(TIdStrand(qid, q_strand), TIdStrand(sid, s_strand));
443  alignments[p].push_back(*it);
444  }
445 
446  typedef pair<SCompartScore, CRef<CSeq_align_set> > TCompartScore;
447  vector<TCompartScore> scored_compartments;
448 
449 
450  // we only compartmentalize within each sequence id / strand pair
451  NON_CONST_ITERATE (TAlignments, align_it, alignments) {
452  const TIdPair& id_pair = align_it->first;
453  ENa_strand q_strand = id_pair.first.second;
454  ENa_strand s_strand = id_pair.second.second;
455 
456  vector< CRef<CSeq_align> >& aligns = align_it->second;
457  if(options & fCompart_SortByScore) {
458  std::stable_sort(aligns.begin(), aligns.end(), SSeqAlignsByScore());
459  }
460  else if(options & fCompart_SortByPctIdent) {
461  std::sort(aligns.begin(), aligns.end(), SSeqAlignsByPctIdent());
462  }
463  else {
464  std::sort(aligns.begin(), aligns.end(), SSeqAlignsBySize());
465  }
466 
467 #ifdef _VERBOSE_DEBUG
468  {{
469  cerr << "ids: " << id_pair.first.first << " x "
470  << id_pair.second.first << endl;
471  if(options & fCompart_SortByScore) {
472  cerr << " sort by score" << endl;
473  }
474  else if(options & fCompart_SortByPctIdent) {
475  cerr << " sort by percent identity" << endl;
476  }
477  else {
478  cerr << " sort by size" << endl;
479  }
480 
481  ITERATE (vector< CRef<CSeq_align> >, it, aligns) {
482  cerr << " ("
483  << (*it)->GetSeqRange(0) << ", "
484  << ((*it)->GetSeqStrand(0) == eNa_strand_minus ? '-' : '+')
485  << ") - ("
486  << (*it)->GetSeqRange(1) << ", "
487  << ((*it)->GetSeqStrand(1) == eNa_strand_minus ? '-' : '+')
488  << ")"
489  << endl;
490  }
491  }}
492 #endif
493 
494  //
495  // reduce the list to a set of overall ranges
496  //
497 
498  vector<TAlignRange> align_ranges;
499 
500  ITERATE (vector< CRef<CSeq_align> >, iter, aligns) {
501  TSeqRange q_range = (*iter)->GetSeqRange(0);
502  TSeqRange s_range = (*iter)->GetSeqRange(1);
503  TRange r(q_range, s_range);
504  align_ranges.push_back(TAlignRange(r, *iter));
505  }
506 
507 #ifdef _VERBOSE_DEBUG
508  {{
509  cerr << "ranges: "
510  << id_pair.first.first << "/" << q_strand
511  << " x "
512  << id_pair.second.first << "/" << s_strand
513  << ":"<< endl;
514  vector<TAlignRange>::const_iterator prev_it = align_ranges.end();
515  ITERATE (vector<TAlignRange>, it, align_ranges) {
516  cerr << " ("
517  << it->first.first << ", "
518  << it->first.second << ")"
519  << " [" << it->first.first.GetLength()
520  << ", " << it->first.second.GetLength() << "]";
521  if (prev_it != align_ranges.end()) {
522  cerr << " consistent="
523  << (IsConsistent(prev_it->first, it->first,
524  q_strand, s_strand) ? "true" : "false");
525  cerr << " diff="
526  << Difference(prev_it->first, it->first,
527  q_strand, s_strand);
528  }
529 
530  cerr << endl;
531  prev_it = it;
532  }
533  }}
534 #endif
535 
536  //
537  // sort by descending hit size
538  // fit each new hit into its best compartment compartment
539  //
540  if (options & fCompart_SortByScore) {
541  std::sort(align_ranges.begin(), align_ranges.end(),
542  SRangesByScore());
543  }
544  else if (options & fCompart_SortByPctIdent) {
545  std::sort(align_ranges.begin(), align_ranges.end(),
547  }
548  else {
549  std::sort(align_ranges.begin(), align_ranges.end(),
550  SRangesBySize());
551  }
552 
553  list< TAlignRangeMultiSet > compartments;
554 
555  // iteration through this list now gives us a natural assortment by
556  // largest alignment. we iterate through this list and inspect the
557  // nascent compartments. we must evaluate for possible fit within each
558  // compartment
559  NON_CONST_ITERATE (vector<TAlignRange>, it, align_ranges) {
560 
561  bool found = false;
562  list<TAlignRangeMultiSet>::iterator best_compart =
563  compartments.end();
564  TSeqPos best_diff = kMax_Int;
565  size_t comp_id = 0;
566  NON_CONST_ITERATE (list<TAlignRangeMultiSet>,
567  compart_it, compartments) {
568  ++comp_id;
570  compart_it->lower_bound(*it);
571 
572  TSeqPos diff = 0;
573  bool is_consistent = false;
574  bool is_intersecting_query = false;
575  bool is_intersecting_subject = false;
576  if (place == compart_it->end()) {
577  // best place is the end; we therefore evaluate whether we
578  // can be appended to this compartment
579  --place;
580  is_intersecting_query =
581  IsIntersectingQuery(place->first, it->first);
582  is_intersecting_subject =
583  IsIntersectingSubject(place->first, it->first);
584  is_consistent = IsConsistent(place->first,
585  it->first,
586  q_strand, s_strand);
587  diff = Difference(place->first,
588  it->first,
589  q_strand, s_strand);
590  }
591  else {
592  if (place == compart_it->begin()) {
593  // best place is the beginning; we therefore evaluate
594  // whether we can be prepended to this compartment
595  is_intersecting_query =
596  IsIntersectingQuery(it->first, place->first);
597  is_intersecting_subject =
598  IsIntersectingSubject(it->first, place->first);
599  is_consistent = IsConsistent(it->first,
600  place->first,
601  q_strand, s_strand);
602  diff = Difference(it->first,
603  place->first,
604  q_strand, s_strand);
605  }
606  else {
607  // best place is in the middle; we must evaluate two
608  // positions
609  is_intersecting_query =
610  IsIntersectingQuery(it->first, place->first);
611  is_intersecting_subject =
612  IsIntersectingSubject(it->first, place->first);
613  is_consistent =
614  IsConsistent(it->first,
615  place->first,
616  q_strand, s_strand);
617  diff = Difference(it->first,
618  place->first,
619  q_strand, s_strand);
620 
621  --place;
622  is_intersecting_query |=
623  IsIntersectingQuery(place->first, it->first);
624  is_intersecting_subject |=
625  IsIntersectingSubject(place->first, it->first);
626  is_consistent &=
627  IsConsistent(place->first,
628  it->first,
629  q_strand, s_strand);
630  diff = min(diff,
631  Difference(place->first,
632  it->first,
633  q_strand, s_strand));
634  }
635  }
636 
637 #ifdef _VERBOSE_DEBUG
638  float diff_len_ratio = double(diff) / it->second->GetAlignLength(false);
639  cerr << " comp_id=" << comp_id
640  << " is_consistent=" << (is_consistent ? "true" : "false")
641  << " is_intersecting_query=" << (is_intersecting_query ? "true" : "false")
642  << " is_intersecting_subject=" << (is_intersecting_subject ? "true" : "false")
643  << " allow intersect_query="
644  << ((options & fCompart_AllowIntersectionsQuery) ? "true" : "false")
645  << " allow intersect_subject="
646  << ((options & fCompart_AllowIntersectionsSubject) ? "true" : "false")
647  << " allow intersect_both="
648  << ((options & fCompart_AllowIntersectionsBoth) ? "true" : "false")
649  << " diff=" << diff
650  << " best_diff=" << best_diff
651  << " align_len=" << it->second->GetAlignLength(false)
652  << " diff_len_ratio=" << diff_len_ratio
653  << " filter=" << (diff_len_ratio <= diff_len_filter ? "pass" : "fail" )
654  << endl;
655 #endif
656 
657  if ( ((is_consistent && !is_intersecting_query && !is_intersecting_subject) ||
658  (((options & fCompart_AllowInconsistentIntersection) || is_consistent) &&
659  (((options & fCompart_AllowIntersectionsQuery) &&
660  is_intersecting_query ) ||
661  ( (options & fCompart_AllowIntersectionsSubject) &&
662  is_intersecting_subject ) ||
663  ( (options & fCompart_AllowIntersectionsBoth) &&
664  is_intersecting_query && is_intersecting_subject )))) &&
665  diff < best_diff) {
666  best_compart = compart_it;
667  best_diff = diff;
668  found = true;
669  }
670  }
671 
672  if ( !found || best_compart == compartments.end() ) {
673  compartments.push_back(TAlignRangeMultiSet());
674  compartments.back().insert(*it);
675  }
676  else {
677  best_compart->insert(*it);
678  }
679 
680 #ifdef _VERBOSE_DEBUG
681  {{
682  cerr << "compartments: found " << compartments.size() << endl;
683  size_t count = 0;
684  ITERATE (list<TAlignRangeMultiSet>, it, compartments) {
685  ++count;
686  cerr << " compartment " << count << endl;
687  ITERATE (TAlignRangeMultiSet, i, *it) {
688  cerr << " ("
689  << i->first.first << ", "
690  << i->first.second << ")"
691  << " [" << i->first.first.GetLength()
692  << ", " << i->first.second.GetLength() << "]"
693  << " (0x" << std::hex << ((intptr_t)i->second.GetPointer()) << std::dec << ")"
694  << endl;
695  }
696  }
697  }}
698 #endif
699  }
700 
701 #ifdef DEBUG_VERBOSE_OUTPUT
702  {{
703  cerr << "found " << compartments.size() << endl;
704  size_t count = 0;
705  ITERATE (list<TAlignRangeMultiSet>, it, compartments) {
706  ++count;
707  cerr << " compartment " << count << endl;
708  ITERATE (TAlignRangeMultiSet, i, *it) {
709  cerr << " ("
710  << i->first.first << ", "
711  << i->first.second << ")"
712  << " [" << i->first.first.GetLength()
713  << ", " << i->first.second.GetLength() << "]"
714  << endl;
715  }
716  }
717  }}
718 #endif
719 
720  // pack into seq-align-sets
721  size_t compart_count = 0;
722  ITERATE (list<TAlignRangeMultiSet>, it, compartments) {
723  ++compart_count;
725  SCompartScore score;
727  if ((options & fCompart_FilterByDiffLen) && it->size() > 1) {
728  /// Find large gaps that require breaking the compartments.
729  /// Go over the compartment forward and then backwards, and mark
730  /// positions at which the gap/alignment-lengt ratio is over the
731  /// threahols; break positions are those that we fing in both
732  /// passes. We need to make the check here, not
733  /// when forming the compartments, because you can have two
734  /// alignments that are far away later joined through other
735  /// alignments in the middle
736  TAlignRangeMultiSet::const_iterator second_subject_range
737  = it->begin();
738  for (; second_subject_range != it->end()
739  && second_subject_range->first.second
740  == it->begin()->first.second; ++second_subject_range);
741  bool reverse_subject = second_subject_range != it->end() &&
742  second_subject_range->first.second
743  < it->begin()->first.second;
744  TSeqPos subject_end = (reverse_subject
745  ? it->begin()->first
746  : it->rbegin()->first) . second.GetTo();
748  forward_breaks, backward_breaks;
749  TSignedSeqPos current_end_point = 0;
750  TSignedSeqPos current_potential_break = 0;
751 #ifdef _VERBOSE_DEBUG
752  TSeqPos count = 1;
753  TSeqPos last_break = 0;
754 #endif
755  ITERATE (TAlignRangeMultiSet, i, *it) {
756 #ifdef _VERBOSE_DEBUG
757  cerr << " ("
758  << i->first.first << ", "
759  << i->first.second << ")"
760  << " [" << i->first.first.GetLength()
761  << ", " << i->first.second.GetLength() << "]"
762  << endl;
763 #endif
764  TSignedSeqPos start_point = i->first.first.GetFrom() +
765  (reverse_subject ? subject_end - i->first.second.GetTo()
766  : i->first.second.GetFrom());
767  TSignedSeqPos end_point = i->first.first.GetTo() +
768  (reverse_subject ? subject_end - i->first.second.GetFrom()
769  : i->first.second.GetTo());
770 #ifdef _VERBOSE_DEBUG
771  if (i != it->begin()) {
772  cerr << "At " << ++count << " Gap " << int(start_point - current_end_point) << " on length " << (current_potential_break - current_end_point) << endl;
773  }
774 #endif
775  if (i != it->begin() &&
776  start_point > current_potential_break)
777  {
778  /// gap since last alignment is over threshold; mark
779  /// this is a breakpoint
780  forward_breaks.insert(i);
781 #ifdef _VERBOSE_DEBUG
782  cerr << "Break! " << (count - last_break) << " members start_point " << start_point << " current break " << current_potential_break << endl;
783  last_break = count;
784 #endif
785  }
786  current_end_point = max(current_potential_break, end_point);
787  current_potential_break = current_end_point +
788  diff_len_filter * i->second->GetAlignLength(false);
789  }
790  current_potential_break = current_end_point = INT_MAX;
791 #ifdef _VERBOSE_DEBUG
792  count = 1;
793  last_break = 0;
794  TSeqPos last_double_break = 0;
795 #endif
797 #ifdef _VERBOSE_DEBUG
798  cerr << " ("
799  << i->first.first << ", "
800  << i->first.second << ")"
801  << " [" << i->first.first.GetLength()
802  << ", " << i->first.second.GetLength() << "]"
803  << endl;
804 #endif
805  TSignedSeqPos start_point = i->first.first.GetFrom() +
806  (reverse_subject ? subject_end - i->first.second.GetTo()
807  : i->first.second.GetFrom());
808  TSignedSeqPos end_point = i->first.first.GetTo() +
809  (reverse_subject ? subject_end - i->first.second.GetFrom()
810  : i->first.second.GetTo());
811 #ifdef _VERBOSE_DEBUG
812  if (i != it->rbegin()) {
813  cerr << "At " << ++count << " Gap " << int(current_end_point - end_point) << " on length " << (current_end_point - current_potential_break) << endl;
814  }
815 #endif
816  if (i != it->rbegin() &&
817  end_point < current_potential_break)
818  {
819  /// gap since last alignment is over threshold; mark
820  /// this is a breakpoint
821  TAlignRangeMultiSet::const_iterator breakpoint = i.base();
822  backward_breaks.insert(breakpoint);
823 #ifdef _VERBOSE_DEBUG
824  if (forward_breaks.count(breakpoint)) {
825  cerr << "Double after " << (count - last_double_break) << ' ';
826  last_double_break = count;
827  }
828  cerr << "Break! " << (count - last_break) << " members end_point " << end_point << " current break " << current_potential_break << endl;
829  last_break = count;
830 #endif
831  }
832  current_end_point = min(current_potential_break, start_point);
833  current_potential_break = current_end_point -
834  diff_len_filter * i->second->GetAlignLength(false);
835  }
836  set_intersection(forward_breaks.begin(), forward_breaks.end(),
837  backward_breaks.begin(), backward_breaks.end(),
838  inserter(break_positions, break_positions.end()),
840  }
841  ITERATE (TAlignRangeMultiSet, i, *it) {
842  if (break_positions.count(i)) {
843 #ifdef DEBUG_VERBOSE_OUTPUT
844  cerr << "compartment " << compart_count << " break at " << i->first.first.GetFrom() << ".." << i->first.first.GetTo() << endl;
845 #endif
846  TCompartScore sc(score, sas);
847  scored_compartments.push_back(sc);
848  score.Reset();
849  sas.Reset(new CSeq_align_set);
850  }
851  sas->Set().push_back(i->second);
852  score.total_size += i->second->GetAlignLength();
853  score.total_range.first += i->first.first;
854  score.total_range.second += i->first.second;
855  }
856 
857  TCompartScore sc(score, sas);
858  scored_compartments.push_back(sc);
859  }
860  }
861 
862  //
863  // sort our compartments by size descending
864  //
865  std::sort(scored_compartments.begin(), scored_compartments.end());
866  ITERATE (vector<TCompartScore>, it, scored_compartments) {
867  align_sets.push_back(it->second);
868  }
869 }
870 
871 
872 void JoinCompartment(const CRef<CSeq_align_set>& compartment,
873  float gap_ratio,
874  list< CRef<CSeq_align> >& aligns)
875 {
876  CRef<CSeq_align_set> disc_align_set;
877  typedef const list <CRef<CSeq_align> > TConstSeqAlignList;
878  TConstSeqAlignList& alignments = compartment->Get();
879  TSeqPos len = 0;
880  ITERATE(TConstSeqAlignList, al_it, alignments) {
881  len += (*al_it)->GetAlignLength(false);
882  }
883  TSeqPos max_gap_len = len * gap_ratio;
884  ITERATE(TConstSeqAlignList, al_it, alignments) {
885  TConstSeqAlignList::const_iterator al_it_next = al_it;
886  al_it_next++;
887  if (!disc_align_set) disc_align_set.Reset(new CSeq_align_set);
888  disc_align_set->Set().push_back(*al_it);
889  if (al_it_next == alignments.end() ||
890  (*al_it)->GetSeqStop(0) + max_gap_len < (*al_it_next)->GetSeqStart(0) ||
891  (*al_it)->GetSeqStop(1) + max_gap_len < (*al_it_next)->GetSeqStart(1))
892  {
893  // Pack and ship
894  CRef<CSeq_align> comp_align(new CSeq_align);
895  comp_align->SetType(CSeq_align::eType_disc);
896  comp_align->SetSegs().SetDisc(*disc_align_set);
897  aligns.push_back(comp_align);
898  disc_align_set.Reset(0);
899  }
900  }
901 }
902 
903 
906 
bool SameOrientation(ENa_strand a, ENa_strand b)
Definition: Na_strand.hpp:83
@ eScore_PercentIdentity_Ungapped
Definition: Seq_align.hpp:164
CRange< TSeqPos > GetSeqRange(TDim row) const
GetSeqRange NB: On a Spliced-seg, in case the product-type is protein, these only return the amin par...
Definition: Seq_align.cpp:153
bool GetNamedScore(const string &id, int &score) const
Get score.
Definition: Seq_align.cpp:563
TSeqPos GetAlignLength(bool include_gaps=true) const
Get the length of this alignment.
Definition: Seq_align.cpp:1993
Definition: map.hpp:338
parent_type::iterator iterator
Definition: set.hpp:198
parent_type::const_iterator const_iterator
Definition: set.hpp:197
Definition: set.hpp:45
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator begin() const
Definition: set.hpp:135
const_iterator end() const
Definition: set.hpp:136
Include a standard set of the NCBI C++ Toolkit most basic headers.
#define bool
Definition: bool.h:34
pair< TSeqRange, TSeqRange > TRange
bool IsConsistent(const pair< TSeqRange, TSeqRange > &r1, const pair< TSeqRange, TSeqRange > &r2, ENa_strand s1, ENa_strand s2)
void JoinCompartment(const CRef< CSeq_align_set > &compartment, float gap_ratio, list< CRef< CSeq_align > > &aligns)
bool IsIntersectingQuery(const pair< TSeqRange, TSeqRange > &r1, const pair< TSeqRange, TSeqRange > &r2)
multiset< TAlignRange, SRangesByStart > TAlignRangeMultiSet
pair< TRange, CRef< CSeq_align > > TAlignRange
TSeqPos Difference(const pair< TSeqRange, TSeqRange > &r1, const pair< TSeqRange, TSeqRange > &r2, ENa_strand s1, ENa_strand s2)
void FindCompartments(const list< CRef< CSeq_align > > &aligns, list< CRef< CSeq_align_set > > &align_sets, TCompartOptions options, float diff_len_filter)
bool IsIntersectingSubject(const pair< TSeqRange, TSeqRange > &r1, const pair< TSeqRange, TSeqRange > &r2)
int TCompartOptions
@ fCompart_SortByScore
@ fCompart_AllowIntersectionsSubject
@ fCompart_SortByPctIdent
@ fCompart_FilterByDiffLen
@ fCompart_AllowIntersectionsBoth
@ fCompart_AllowIntersectionsQuery
@ fCompart_AllowInconsistentIntersection
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
#define REVERSE_ITERATE(Type, Var, Cont)
ITERATE macro to reverse sequence through container elements.
Definition: ncbimisc.hpp:827
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
int intptr_t
Definition: ncbitype.h:185
#define kMax_Int
Definition: ncbi_limits.h:184
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
Tdata & Set(void)
Assign a value to data member.
void SetSegs(TSegs &value)
Assign a value to Segs data member.
Definition: Seq_align_.cpp:310
void SetType(TType value)
Assign a value to Type data member.
Definition: Seq_align_.hpp:818
const Tdata & Get(void) const
Get the member data.
@ eType_disc
discontinuous alignment
Definition: Seq_align_.hpp:104
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
int len
static void hex(unsigned char c)
Definition: mdb_dump.c:56
constexpr auto sort(_Init &&init)
T max(T x_, T y_)
T min(T x_, T y_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
bool operator<(const SCompartScore &o) const
bool operator()(TAlignRangeMultiSet::const_iterator it1, TAlignRangeMultiSet::const_iterator it2) const
bool operator()(const TAlignRange &r1, const TAlignRange &r2) const
bool operator()(const TAlignRange &r1, const TAlignRange &r2) const
bool operator()(const TAlignRange &r1, const TAlignRange &r2) const
bool operator()(const TAlignRange &r1, const TAlignRange &r2) const
bool operator()(const CRef< CSeq_align > &al_ref1, const CRef< CSeq_align > &al_ref2) const
bool operator()(const CRef< CSeq_align > &al_ref1, const CRef< CSeq_align > &al_ref2) const
bool operator()(const CRef< CSeq_align > &al_ref1, const CRef< CSeq_align > &al_ref2) const
#define const
Definition: zconf.h:232
Modified on Tue May 28 05:50:38 2024 by modify_doxy.py rev. 669887