NCBI C++ ToolKit
compartment_finder.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef ALGO_ALIGN_UTIL_COMPARTMENT_FINDER__HPP
2 #define ALGO_ALIGN_UTIL_COMPARTMENT_FINDER__HPP
3 
4 /* $Id: compartment_finder.hpp 91306 2020-10-08 11:57:15Z gouriano $
5 * ===========================================================================
6 *
7 * PUBLIC DOMAIN NOTICE
8 * National Center for Biotechnology Information
9 *
10 * This software/database is a "United States Government Work" under the
11 * terms of the United States Copyright Act. It was written as part of
12 * the author's official duties as a United States Government employee and
13 * thus cannot be copyrighted. This software/database is freely available
14 * to the public for use. The National Library of Medicine and the U.S.
15 * Government have not placed any restriction on its use or reproduction.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Author: Yuri Kapustin, Alexander Souvorov, Boris Kiryutin
30 *
31 * File Description:
32 *
33 * cDNA-to-genomic compartmentization algortihms.
34 * The task of compartmentization is to identify alternative placements
35 * of a transcript on a genomic sequence.
36 *
37 * CCompartmentFinder Identification of genomic compartments.
38 * CCompartmentAccessor An aggregator class to CCompartmentFinder.
39 *
40 * ===========================================================================
41 */
42 
43 
44 #include <corelib/ncbi_limits.hpp>
45 
48 
56 
57 #include <objmgr/seq_map.hpp>
58 #include <objmgr/scope.hpp>
59 #include <objmgr/seq_map_ci.hpp>
60 
61 #include <algorithm>
62 #include <numeric>
63 #include <vector>
64 
65 
67 
69 
70 template<class THit>
71 class CGapInfo {
72 public:
73  typedef typename THit::TCoord TCoord;
74 
75  CGapInfo(const CSeq_id& id, CScope *scope) {
76  m_scope = scope;
77  m_flipped = false;
78  if(scope) {
79  m_seq_map = & (scope->GetBioseqHandle(id).GetSeqMap() );
80  }
81  }
82 
83  void Flip(TCoord subj_min, TCoord subj_max) {
84  m_flipped = true;
85  m_subj_min = subj_min;
86  m_subj_max = subj_max;
87  }
88 
90  if(m_flipped) {
91  TCoord new_from = m_subj_min + m_subj_max - to;
92  TCoord new_to = m_subj_min + m_subj_max - from;
93  from = new_from;
94  to = new_to;
95  }
96  bool res = false;
97  if(m_scope && m_seq_map) {
98  CSeqMap_CI it = m_seq_map->ResolvedRangeIterator(m_scope, from, to - from + 1, eNa_strand_plus, size_t(-1), CSeqMap::fFindGap);
99  for( ; it; ++it) {
101  if(lit && lit->GetBridgeability() == CSeq_literal::e_NotBridgeable) {
102  res = true;
103  break;
104  }
105  }
106  }
107  return res;
108  }
109 
110 private:
112  bool m_flipped;
116 };
117 
118 
119 template<class THit>
121 public:
122  typedef typename THit::TCoord TCoord;
123 
124  CPrecalcGapInfo(const vector<pair<TCoord, TCoord> >* gaps) : m_gaps(gaps) {
125  m_flipped = false;
126  }
127 
128  void Flip(TCoord subj_min, TCoord subj_max) {
129  m_flipped = true;
130  m_subj_min = subj_min;
131  m_subj_max = subj_max;
132  }
133 
135  if(m_gaps == NULL) {
136  return false;
137  }
138  if(m_flipped) {
139  TCoord new_from = m_subj_min + m_subj_max - to;
140  TCoord new_to = m_subj_min + m_subj_max - from;
141  from = new_from;
142  to = new_to;
143  }
144  bool res = false;
145  for(size_t i = 0; i < m_gaps->size(); ++i) {
146  TCoord gfrom = (*m_gaps)[i].first;
147  TCoord gto = (*m_gaps)[i].second;
148  if( gfrom < to && from < gto ) {
149  res = true;
150  break;
151  }
152  }
153  return res;
154  }
155 
156 private:
157  bool m_flipped;
160  const vector<pair<TCoord, TCoord> > *m_gaps;
161 };
162 
163 
164 template<class THit>
166 
167 public:
168 
170  typedef vector<THitRef> THitRefs;
171  typedef typename THit::TCoord TCoord;
172 
173  /// Create the object from the complete set of local alignments (hits)
174  /// between the transcript and the genomic sequences.
175  /// The hits are assumed to share the same query and subject and
176  /// to have plus strand on both sequences.
177  ///
178  /// @param start
179  /// Start of the input set of alignments.
180  /// @param finish
181  /// End of the input set of alignments.
182 
183  CCompartmentFinder(typename THitRefs::const_iterator start,
184  typename THitRefs::const_iterator finish,
185  CGapInfo<THit> *gap_info = NULL, CPrecalcGapInfo<THit> *prec_gap_info = NULL);
186 
187  /// Set compartment overlap behaviour. Allow compartments to overlap
188  /// on subject.
189  ///
190  /// @param max_overlap
191  /// Maximum length of overlap on subject.
192  /// 0 - do not allow overlap.
193 
194  void SetMaxOverlap(TCoord max_overlap) {
195  m_max_overlap = max_overlap;
196  }
197 
198  /// Retrieve the default compartment overlap behaviour (no overlap).
199 
201  return 0;
202  }
203 
204  /// Set the maximum length of an intron
205  ///
206  /// @param intr_max
207  /// Maximum length of an intron, in base pairs.
208 
209  void SetMaxIntron(TCoord intr_max) {
210  m_intron_max = intr_max;
211  }
212 
213  /// Retrieve the default maximum length of an intron
214 
216  // Some examples:
217  // NM_147181.3 vs NC_000004.10 (~1.1M)
218  return 1200000; // NM_001128929.1 vs NC_000003.10 (~1.2M)
219  }
220 
221 
222  /// Set the penalty to open a compartment
223  ///
224  /// @param penalty
225  /// Compartment opening penalty, in base pairs.
226 
227  void SetPenalty(TCoord penalty) {
228  m_penalty = penalty;
229  }
230 
231  /// Set the minimum matching residues per compartment.
232  ///
233  /// @param min_matches
234  /// The min number of matching residues in base pairs
235 
236  void SetMinMatches(TCoord min_matches) {
237  m_MinSingletonMatches = m_MinMatches = min_matches;
238  }
239 
240  /// Set the minimum matching residues per singleton compartment.
241  ///
242  /// @param min_matches
243  /// The min number of matching residues in base pairs
244 
245  void SetMinSingletonMatches(TCoord min_matches) {
246  m_MinSingletonMatches = min_matches;
247  }
248 
249  /// Retrieve the default compartment penalty, in base pairs
250 
252  return 500;
253  }
254 
255  /// Retrieve the default minimum coverage, in base pairs
256 
258  return 500;
259  }
260 
261  /// Identify compartments.
262  ///
263  /// @param cross_filter
264  /// When activated, cross filtering will ensure that only
265  /// the alignments that provided non-ambguous mapping between
266  /// the sequences will be reported in the output.
267 
268  size_t Run(bool cross_filter = false); // do the compartment search
269 
270  /// Order compartments by lower subj coordinate
271 
272  void OrderCompartments(void);
273 
274  /// Individual compartment representation.
275  /// Compartment members are the hits comprising the compartment.
276 
277  class CCompartment {
278 
279  public:
280 
281  /// Create an empty compartment
282 
283  CCompartment(void) {
285  m_box[1] = m_box[3] = 0;
286  }
287 
288  /// Retrieve compartment's members
289 
290  const THitRefs& GetMembers(void) {
291  return m_members;
292  }
293 
294  /// Add a new member into the compartment
295 
296  void AddMember(const THitRef& hitref) {
297  m_members.push_back(hitref);
298  }
299 
300  /// Assign all members of the compartment
301 
303  return m_members;
304  }
305 
306  /// Make sure the compartment's box is up-to-date
307 
308  void UpdateMinMax(void);
309 
310  /// Retrieve the compartment's strand (true == plus)
311 
312  bool GetStrand(void) const;
313 
314  /// Retrieve the first member.
315  ///
316  /// @return
317  /// A reference on the first member of the compartment,
318  /// or a null reference if the compartment is empty.
319 
320  const THitRef GetFirst(void) const {
321  m_iter = 0;
322  return GetNext();
323  }
324 
325 
326  /// Retrieve the next member.
327  ///
328  /// @return
329  /// A reference on the next member of the compartment,
330  /// or a null reference if there are no more members.
331 
332  const THitRef GetNext(void) const {
333  if(m_iter < m_members.size()) {
334  return m_members[m_iter++];
335  }
336  return THitRef(NULL);
337  }
338 
339  /// Retrieve the compartment's box.
340  ///
341  /// @return
342  /// A const pointer at the compartment's box.
343 
344  const TCoord* GetBox(void) const {
345  return m_box;
346  }
347 
348  // helper predicate
349  static bool s_PLowerSubj(const CCompartment& c1, const CCompartment& c2) {
350 
351  return c1.m_box[2] < c2.m_box[2];
352  }
353 
354  protected:
355 
358  mutable size_t m_iter;
359  };
360 
361  // iterate through compartments
362 
363  CCompartment* GetFirst(void);
364  CCompartment* GetNext(void);
365 
366 private:
367 
368  TCoord m_max_overlap; //max overlap on subject for different compartments
369  TCoord m_intron_max; // max intron size
370  TCoord m_penalty; // penalty per compartment
371  TCoord m_MinMatches; // min approx matches to report
372  TCoord m_MinSingletonMatches; // min matches for singleton comps
373 
374  THitRefs m_hitrefs; // input hits
375  vector<CCompartment> m_compartments; // final compartments
376  int m_iter; // GetFirst/Next index
377  CGapInfo<THit> * m_gap_info; // holds information about sequence gaps
378  CPrecalcGapInfo<THit> * m_prec_gap_info; // holds precalculated information about sequence gaps
379 
380  struct SHitStatus {
381 
382  enum EType {
385  eOpening
386  };
387 
389  int m_prev;
390  double m_score;
391 
393  {}
394  SHitStatus(EType type, int prev, double score):
395  m_type(type), m_prev(prev), m_score(score)
396  {}
397  };
398 
399 
400  static size_t sx_XFilter(THitRefs& hitrefs,
401  typename THitRefs::iterator ihr,
402  typename THitRefs::iterator ihr_e,
403  Uint1 w,
404  size_t min_compartment_hit_len);
405 
406  // helper predicate
407 
408  static bool s_PNullRef(const THitRef& hr) {
409  return hr.IsNull();
410  }
411 };
412 
413 
414 // Facilitate access to compartments over a hit set
415 template<class THit>
417 {
418 public:
419 
423 
424  /// Construct the object and assign the parameters of the algorithm.
425  ///
426  /// @param comp_penalty_bps
427  /// Penalty to open a compartment, in base pairs.
428  /// @param min_matches
429  /// The minimum number of matching residues in a compartment, in base pairs.
430  /// @param min_singleton_matches
431  /// The minimum number of matching residues in a singleton, in base pairs.
432  /// @param cross_filter
433  /// Perform cross-filtering when true.
434 
435  CCompartmentAccessor(TCoord comp_penalty_bps,
436  TCoord min_matches,
437  TCoord min_singleton_matches = numeric_limits<TCoord>::max(),
438  bool cross_filter = false);
439 
440  /// Construct the object; assign the parameters of the algorithm; execute.
441  /// The input range of alignments is assumed to contain the complete set of
442  /// alignments between the same pair of sequences.
443  /// The alignments can be on one or both genomic strands.
444  ///
445  /// @param start
446  /// Start of the input range of input alignments.
447  /// @param finish
448  /// Stop of the input range of input alignments.
449  /// @param comp_penalty_bps
450  /// Penalty to open a compartment, in base pairs.
451  /// @param min_matches
452  /// The minimum number of matching residues in a compartment, in base pairs.
453  /// @param min_singleton_matches
454  /// The minimum number of matching residues in a singleton, in base pairs.
455  /// @param cross_filter
456  /// Perform cross-filtering when true.
457 
458  CCompartmentAccessor(typename THitRefs::iterator start,
459  typename THitRefs::iterator finish,
460  TCoord comp_penalty_bps,
461  TCoord min_matches,
462  TCoord min_singleton_matches = numeric_limits<TCoord>::max(),
463  bool cross_filter = false, CScope *scope = NULL,
464  const vector<pair<TCoord, TCoord> > *gaps = NULL);
465 
466  /// Execute: identify compartments. The alignments can be on one
467  /// or both genomic strands.
468  ///
469  /// @param start
470  /// Start of the input range of input alignments.
471  /// @param finish
472  /// Stop of the input range of input alignments.
473 
474  void Run(typename THitRefs::iterator start,
475  typename THitRefs::iterator finish, CScope *scope = NULL,
476  const vector<pair<TCoord, TCoord> > *gaps = NULL);
477 
478  /// Assign the maximum intron length, in base pairs
479 
480  void SetMaxIntron(TCoord mi) { m_MaxIntron = mi; }
481 
482  /// Retrieve the maximum intron length, in base pairs
483 
484  TCoord GetMaxIntron(void) const { return m_MaxIntron; }
485 
486  /// Assign the maximum length for compartments to overlap on the subject.
487 
488  void SetMaxOverlap(TCoord max_overlap) { m_MaxOverlap = max_overlap; }
489 
490  /// Initialize iteration over the results.
491  /// Results are sorted by strand (minus first) and subj position.
492  ///
493  /// @param compartment
494  /// The first identified compartment
495  /// @return
496  /// true if more compartments are available
497 
498  bool GetFirst(THitRefs& compartment);
499 
500  /// Proceed with iteration over the results.
501  ///
502  /// @param compartment
503  /// The next identified compartment
504  /// @return
505  /// true if more compartments are available
506 
507  bool GetNext(THitRefs& compartment);
508 
509  /// Retrieve the compartment counts.
510  ///
511  /// @return
512  /// The first number in the pair is the total count;
513  /// the second number is the number of compartments with the number of matches
514  /// abive the minimum.
515  ///
516 
517  pair<size_t,size_t> GetCounts(void) const {
518 
519  // std::count() not supported on some platforms
520  size_t valid_count (0);
521  for(size_t i(0), n(m_status.size()); i != n; ++i) {
522  if(m_status[i]) ++valid_count;
523  }
524 
525  pair<size_t, size_t> rv (m_pending.size(), valid_count);
526 
527  return rv;
528  }
529 
530  /// Retrieve a compartment by index.
531  ///
532  /// @param idx
533  /// The compartment's index
534  /// @param compartment
535  /// The reference to receive the compartment requested.
536 
537  void Get(size_t idx, THitRefs& compartment) const {
538  compartment = m_pending[idx];
539  }
540 
541  const TCoord* GetBox(size_t i) const {
542  return &m_ranges.front() + i*4;
543  }
544 
545  bool GetStrand(size_t i) const {
546  return m_strands[i];
547  }
548 
549  bool GetStatus(size_t i) const {
550  return m_status[i];
551  }
552 
553  /// Retrieve all valid compartments in a seq-align-set.
554  ///
555  /// @return
556  /// A seq-align-set object with this hierarchy.
557  /// 1. Compartment-level seq-align with bounds.
558  /// 2. Seq-align-set keeping individual hits of the compartment.
559  /// 3. Hit-level seq-align of the dense-seg type.
561 
562 private:
563 
564  void x_Init(TCoord comp_penalty, TCoord min_matches,
565  TCoord min_singleton_matches, bool cross_filter);
566 
567  // compartmentization parameters
574 
575  // ancillary members
576  vector<THitRefs> m_pending;
577  vector<TCoord> m_ranges;
578  vector<bool> m_strands;
579  vector<bool> m_status;
580  size_t m_iter;
581 
582  void x_Copy2Pending(TCompartmentFinder& finder);
583 };
584 
585 
586 
587 /// IMPLEMENTATION
588 /////////////////////////////////////////////////////////////////////////////
589 
590 
591 const double kPenaltyPerIntronBase = -2e-5; // a small penalty to prefer
592  // more compact models among equal
593 
594 const double kPenaltyPerIntronPos = -1e-9; // a small penalty to favor uniform
595  // selection among equivalent chains
596 
597 template<class THit>
599 {
600  typedef CHitFilter<THit> THitFilter;
601  THitFilter::s_GetSpan(m_members, m_box);
602 }
603 
604 
605 template<class THit>
607 
608  if(m_members.size()) {
609  return m_members.front()->GetSubjStrand();
610  }
611  else {
613  eInternal,
614  "Strand requested on an empty compartment" );
615  }
616 }
617 
618 
619 template<class THit>
621  typename THitRefs::const_iterator start,
622  typename THitRefs::const_iterator finish,
623  CGapInfo<THit> *gap_info,
624  CPrecalcGapInfo<THit> *prec_gap_info ):
625 
629  m_MinMatches(1),
631  m_iter(-1),
632  m_gap_info(gap_info),
633  m_prec_gap_info(prec_gap_info)
634 {
635  m_hitrefs.resize(finish - start);
636  copy(start, finish, m_hitrefs.begin());
637 }
638 
639 // Query match accumulator
640 template<class THit>
642 {
643 public:
644 
646  {}
647 
648  double operator() (double acm, CRef<THit> ph)
649  {
650  const typename THit::TCoord qmin (ph->GetQueryMin()),
651  qmax (ph->GetQueryMax());
652  if(qmin > m_Finish)
653  return acm + ph->GetIdentity() * ((m_Finish = qmax) - qmin + 1);
654  else {
655  if(qmax > m_Finish) {
656  const double finish0 (m_Finish);
657  return acm + ph->GetIdentity() * ((m_Finish = qmax) - finish0);
658  }
659  else {
660  return acm;
661  }
662  }
663  }
664 
665 private:
666 
667  double m_Finish;
668 };
669 
670 
671 template<class THit>
673  const typename CCompartmentFinder<THit>::THitRefs& hitrefs0)
674 {
675  typename CCompartmentFinder<THit>::THitRefs hitrefs (hitrefs0);
676 
677  typedef CHitComparator<THit> THitComparator;
678  THitComparator sorter (THitComparator::eQueryMin);
679  stable_sort(hitrefs.begin(), hitrefs.end(), sorter);
680 
681  const double rv (accumulate(hitrefs.begin(), hitrefs.end(), 0.0,
683  return rv;
684 }
685 
686 
687 template<class THit>
688 size_t CCompartmentFinder<THit>::Run(bool cross_filter)
689 {
690  const double kMinusInf (-1e12);
691 
692  m_compartments.clear();
693  const int dimhits = m_hitrefs.size();
694  if(dimhits == 0) {
695  return 0;
696  }
697 
698  // sort the hits to make sure that each hit is placed after:
699  // - hits from which to continue a compartment
700  // - hits from which to open a new compartment
701 
702  typedef CHitComparator<THit> THitComparator;
703  stable_sort(m_hitrefs.begin(), m_hitrefs.end(),
704  THitComparator(THitComparator::eSubjMaxQueryMax));
705 
706  // For every hit:
707  // - evaluate its best extension potential
708  // - evaluate its best potential to start a new compartment
709  // - select the best variant
710  // - update the hit status array
711 
712  typedef vector<SHitStatus> THitData;
713  THitData hitstatus (dimhits);
714 
715  const TCoord subj_min_global (m_hitrefs.front()->GetSubjMin());
716 
717  int i_bestsofar (0);
718  for(int i (0); i < dimhits; ++i) {
719 
720  const THitRef& h (m_hitrefs[i]);
721  const double identity (h->GetIdentity());
722  const typename THit::TCoord hbox [4] = {
723  h->GetQueryMin(), h->GetQueryMax(),
724  h->GetSubjMin(), h->GetSubjMax()
725  };
726 
727 //#define CF_DBG_TRACE
728 #ifdef CF_DBG_TRACE
729  cerr << endl << *h << endl;
730 #endif
731 
732  double best_ext_score (kMinusInf);
733  int i_best_ext (-1);
734 
735  double best_open_score (identity*(hbox[1] - hbox[0] + 1) - m_penalty);
736  int i_best_open (-1); // each can be a start of the very first compartment
737 
738  if(hbox[2] + m_max_overlap > m_hitrefs[i_bestsofar]->GetSubjMax()) {
739 
740  const double score_open (identity*(hbox[1] - hbox[0] + 1)
741  + hitstatus[i_bestsofar].m_score
742  - m_penalty);
743 
744  if(score_open > best_open_score) {
745  best_open_score = score_open;
746  i_best_open = i_bestsofar;
747  }
748  }
749  else {
750  // try every prior hit
751  for(int j (i - 1); j >= 0; --j) {
752 
753  const double score_open (identity * (hbox[1] - hbox[0] + 1)
754  + hitstatus[j].m_score
755  - m_penalty);
756 
757  if(score_open > best_open_score
758  && hbox[2] + m_max_overlap > m_hitrefs[j]->GetSubjMax())
759  {
760  best_open_score = score_open;
761  i_best_open = j;
762  }
763  }
764  }
765 
766  for(int j = i; j >= 0; --j) {
767 
768  THitRef phc;
769  typename THit::TCoord phcbox[4];
770  if(j != i) {
771 
772  phc = m_hitrefs[j];
773  phcbox[0] = phc->GetQueryMin();
774  phcbox[1] = phc->GetQueryMax();
775  phcbox[2] = phc->GetSubjMin();
776  phcbox[3] = phc->GetSubjMax();
777 
778 #ifdef CF_DBG_TRACE
779  cerr << '\t' << *phc << endl;
780 #endif
781  }
782 #ifdef CF_DBG_TRACE
783  else {
784  cerr << "\t[Dummy]" << endl;
785  }
786 #endif
787 
788  // check if good for extension
789  {{
790  typename THit::TCoord q0, s0; // possible continuation
791  bool good (false);
792  int subj_space;
793  TCoord intron_start (0);
794 
795  if(i != j) {
796 
797  if( ( phcbox[1] < hbox[1] && phcbox[0] < hbox[0] )
798  && phcbox[2] <= hbox[2] //prohibit to extend if previous hit is inside the current in genomic coordinates
799  ) {
800 
801  if(hbox[0] <= phcbox[1] &&
802  hbox[2] + phcbox[1] - hbox[0] >= phcbox[3]) {
803 
804  q0 = phcbox[1] + 1;
805  s0 = hbox[2] + phcbox[1] - hbox[0];
806  }
807  else if(phcbox[3] >= hbox[2]) {
808 
809  s0 = phcbox[3] + 1;
810  q0 = hbox[1] - (hbox[3] - s0);
811  }
812  else {
813 
814  q0 = hbox[0];
815  s0 = hbox[2];
816  }
817  subj_space = s0 - phcbox[3] - 1;
818 
819 
820  // max run of spaces inside an exon - NOW DISABLED
821  // example: NM_021645.4 vs NC_000013.9: 51.4 - 51.51M
822  //const TCoord max_gap (100);
823 
824  good = (subj_space <= int(m_intron_max))
825  && (s0 < hbox[3] && q0 < hbox[1]);
826 
827  //prohibit to go over non-bridgable gaps in subject
828  if(good) {
829  if( phcbox[3] <= hbox[2] ) {
830  if( ( m_prec_gap_info && m_prec_gap_info->IntersectsNonBridgeableGap(phcbox[3], hbox[2]) ) ||
831  ( m_gap_info && m_gap_info->IntersectsNonBridgeableGap(phcbox[3], hbox[2]) ) ) {
832  good = false;
833  }
834  }
835  }
836 
837  if(good) {
838  intron_start = phcbox[3];
839  }
840  }
841  }
842 
843  if(good) {
844 
845  const double jscore (hitstatus[j].m_score);
846  const double intron_penalty ((subj_space > 0)?
847  (kPenaltyPerIntronPos * (intron_start - subj_min_global)
848  + subj_space * kPenaltyPerIntronBase): 0.0);
849 
850  const double ext_score (jscore +
851  identity * (hbox[1] - q0 + 1) + intron_penalty);
852 
853  if(ext_score > best_ext_score) {
854  best_ext_score = ext_score;
855  i_best_ext = j;
856  }
857 #ifdef CF_DBG_TRACE
858  cerr << "\tGood for extension with score = " << ext_score << endl;
859 #endif
860  }
861  }}
862  }
863 
864  typename SHitStatus::EType hit_type;
865  int prev_hit;
866  double best_score;
867  if(best_ext_score > best_open_score) {
868 
869  hit_type = SHitStatus::eExtension;
870  prev_hit = i_best_ext;
871  best_score = best_ext_score;
872  }
873  else {
874 
875  hit_type = SHitStatus::eOpening;
876  prev_hit = i_best_open;
877  best_score = best_open_score;
878  }
879 
880  hitstatus[i].m_type = hit_type;
881  hitstatus[i].m_prev = prev_hit;
882  hitstatus[i].m_score = best_score;
883 
884  if(best_score > hitstatus[i_bestsofar].m_score) {
885  i_bestsofar = i;
886  }
887 
888 #ifdef CF_DBG_TRACE
889  cerr << "Status = " << ((hit_type == SHitStatus::eOpening)? "Open": "Extend")
890  << '\t' << best_score << endl;
891  if(prev_hit == -1) {
892  cerr << "[Dummy]" << endl;
893  }
894  else {
895  cerr << *(m_hitrefs[prev_hit]) << endl;
896  }
897 #endif
898  }
899 
900 #ifdef CF_DBG_TRACE
901  cerr << "\n\n--- BACKTRACE ---\n";
902 #endif
903 
904  // *** backtrace ***
905  // - trace back the chain with the highest score
906  const double score_best (hitstatus[i_bestsofar].m_score);
907  const double min_matches (m_MinSingletonMatches < m_MinMatches?
908  m_MinSingletonMatches: m_MinMatches);
909 
910  vector<bool> comp_status;
911  if(score_best + m_penalty >= min_matches) {
912 
913  int i (i_bestsofar);
914  bool new_compartment (true);
915  THitRefs hitrefs;
916  while(i != -1) {
917 
918  if(new_compartment) {
919 
920  const double mp (GetTotalMatches<THit>(hitrefs));
921  if(mp > 0) {
922  // save the current compartment
923  m_compartments.push_back(CCompartment());
924  m_compartments.back().SetMembers() = hitrefs;
925  comp_status.push_back(mp >= m_MinMatches);
926  }
927 
928  hitrefs.resize(0);
929  new_compartment = false;
930  }
931  hitrefs.push_back(m_hitrefs[i]);
932 
933 #ifdef CF_DBG_TRACE
934  cerr << *(m_hitrefs[i]) << endl;
935 #endif
936 
937  if(hitstatus[i].m_type == SHitStatus::eOpening) {
938  new_compartment = true;
939  }
940  i = hitstatus[i].m_prev;
941  }
942 
943  const double mp (GetTotalMatches<THit>(hitrefs));
944  if(mp > 0) {
945  bool status ( ( m_compartments.size() == 0 && mp >= m_MinSingletonMatches )
946  || mp >= m_MinMatches );
947  m_compartments.push_back(CCompartment());
948  m_compartments.back().SetMembers() = hitrefs;
949  comp_status.push_back(status);
950  }
951  }
952 
953  if(cross_filter && m_compartments.size()) {
954 
955  const TCoord kMinCompartmentHitLength (8);
956 
957  // partial x-filtering using compartment hits only
958  for(size_t icn (m_compartments.size()), ic (0); ic < icn; ++ic) {
959 
960  CCompartment & comp (m_compartments[ic]);
961  THitRefs& hitrefs (comp.SetMembers());
962  size_t nullified (0);
963  for(int in (hitrefs.size()), i (in - 1); i > 0; --i) {
964 
965  int j1 (i);
966  while(j1 < in && hitrefs[j1].IsNull()) ++j1;
967  if(j1 == in) continue;
968 
969  THitRef& h1 (hitrefs[j1]);
970  THitRef& h2 (hitrefs[i-1]);
971 
972  if(h1.IsNull()) continue;
973 
974  const TCoord * box1o (h1->GetBox());
975  TCoord box1 [4] = {box1o[0], box1o[1], box1o[2], box1o[3]};
976  const TCoord * box2o (h2->GetBox());
977  TCoord box2 [4] = {box2o[0], box2o[1], box2o[2], box2o[3]};
978 
979  const int qd (box1[1] - box2[0] + 1);
980  const int sd (box1[3] - box2[2] + 1);
981  if(qd > sd && qd > 0) {
982 
983  if(box2[0] <= box1[0] + kMinCompartmentHitLength) {
984 
985 #ifdef ALGOALIGNUTIL_COMPARTMENT_FINDER_KEEP_TERMINII
986  if(i + 1 == in) {
987  TCoord new_coord (box1[0] + (box1[1] - box1[0])/2);
988  if(box1[0] + 1 <= new_coord) {
989  --new_coord;
990  }
991  else {
992  new_coord = box1[0];
993  }
994  h1->Modify(1, new_coord);
995  }
996  else
997 #endif
998  {
999  h1.Reset(0);
1000  ++nullified;
1001  }
1002  }
1003  else {
1004  h1->Modify(1, box2[0] - 1);
1005  }
1006 
1007  if(box2[1] <= box1[1] + kMinCompartmentHitLength) {
1008 
1009 #ifdef ALGOALIGNUTIL_COMPARTMENT_FINDER_KEEP_TERMINII
1010  if(i == 1) {
1011  TCoord new_coord (box2[0] + (box2[1] - box2[0])/2);
1012  if(box2[1] >= new_coord + 1) {
1013  ++new_coord;
1014  }
1015  else {
1016  new_coord = box2[1];
1017  }
1018  h2->Modify(0, new_coord);
1019  }
1020  else
1021 #endif
1022  {
1023  h2.Reset(0);
1024  ++nullified;
1025  }
1026  }
1027  else {
1028  h2->Modify(0, box1[1] + 1);
1029  }
1030  }
1031  else if (sd > 0) {
1032 
1033  if(box2[2] <= box1[2] + kMinCompartmentHitLength) {
1034 
1035  if(i + 1 == in) {
1036  TCoord new_coord (box1[2] + (box1[3] - box1[2])/2);
1037  if(box1[2] + 1 <= new_coord) {
1038  --new_coord;
1039  }
1040  else {
1041  new_coord = box1[2];
1042  }
1043  h1->Modify(3, new_coord);
1044  }
1045  else {
1046  h1.Reset(0);
1047  ++nullified;
1048  }
1049  }
1050  else {
1051  h1->Modify(3, box2[2] - 1);
1052  }
1053 
1054  if(box2[3] <= box1[3] + kMinCompartmentHitLength) {
1055 
1056  if(i == 1) {
1057  TCoord new_coord (box2[2] + (box2[3] - box2[2])/2);
1058  if(box2[3] >= new_coord + 1) {
1059  ++new_coord;
1060  }
1061  else {
1062  new_coord = box2[3];
1063  }
1064 
1065  h2->Modify(2, new_coord);
1066  }
1067  else {
1068  h2.Reset(0);
1069  ++nullified;
1070  }
1071  }
1072  else {
1073  h2->Modify(2, box1[3] + 1);
1074  }
1075  }
1076  }
1077 
1078  if(nullified > 0) {
1079  hitrefs.erase(remove_if(hitrefs.begin(), hitrefs.end(),
1081  hitrefs.end());
1082  }
1083  }
1084 
1085 #define ALGO_ALIGN_COMPARTMENT_FINDER_USE_FULL_XFILTERING
1086 #ifdef ALGO_ALIGN_COMPARTMENT_FINDER_USE_FULL_XFILTERING
1087 
1088  typename THitRefs::iterator ihr_b (m_hitrefs.begin()),
1089  ihr_e(m_hitrefs.end()), ihr (ihr_b);
1090 
1091  stable_sort(ihr_b, ihr_e, THitComparator(THitComparator::eSubjMinSubjMax));
1092 
1093  // complete x-filtering using the full set of hits
1094  for(int icn (m_compartments.size()), ic (icn - 1); ic >= 0; --ic) {
1095 
1096  CCompartment & comp (m_compartments[ic]);
1097  THitRefs& hitrefs (comp.SetMembers());
1098 
1099  if(hitrefs.size() < 3) {
1100  continue;
1101  }
1102  else {
1103  NON_CONST_ITERATE(typename THitRefs, ii, hitrefs) {
1104  (*ii)->SetScore(- (*ii)->GetScore());
1105  }
1106  hitrefs.front()->SetEValue(-1);
1107  hitrefs.back()->SetEValue(-1);
1108  }
1109 
1110  const TCoord comp_subj_min (hitrefs.back()->GetSubjStart());
1111  const TCoord comp_subj_max (hitrefs.front()->GetSubjStop());
1112  while(ihr != ihr_e && ((*ihr)->GetSubjStart() < comp_subj_min
1113  || (*ihr)->GetScore() < 0))
1114  {
1115  ++ihr;
1116  }
1117 
1118  if(ihr == ihr_e) break;
1119  if((*ihr)->GetSubjStart() > comp_subj_max) continue;
1120  typename THitRefs::iterator ihrc_b (ihr);
1121 
1122  typename THitRefs::iterator ihrc_e (ihr);
1123  while(ihrc_e != ihr_e && ((*ihrc_e)->GetSubjStart() < comp_subj_max
1124  || (*ihrc_e)->GetScore() < 0))
1125  {
1126  ++ihrc_e;
1127  }
1128 
1129  size_t nullified (sx_XFilter(hitrefs, ihrc_b, ihrc_e, 1,
1130  kMinCompartmentHitLength));
1131  stable_sort(ihrc_b, ihrc_e, THitComparator (THitComparator::eQueryMinQueryMax));
1132 
1133  nullified += sx_XFilter(hitrefs, ihrc_b, ihrc_e, 0,
1134  kMinCompartmentHitLength);
1135 
1136  ihr = ihrc_e;
1137 
1138  if(nullified > 0) {
1139  hitrefs.erase(remove_if(hitrefs.begin(), hitrefs.end(),
1141  hitrefs.end());
1142  }
1143  }
1144 
1145  for(int icn (m_compartments.size()), ic (icn - 1); ic >= 0; --ic) {
1146 
1147  CCompartment & comp (m_compartments[ic]);
1148  THitRefs& hitrefs (comp.SetMembers());
1149  NON_CONST_ITERATE(typename THitRefs, ii, hitrefs) {
1150  const double score ((*ii)->GetScore());
1151  if(score < 0) {
1152  (*ii)->SetScore(-score);
1153  }
1154  const double eval ((*ii)->GetEValue());
1155  if(eval < 0) {
1156  (*ii)->SetEValue(0);
1157  }
1158  }
1159  }
1160 #endif
1161  }
1162 
1163  // mask out compartments with low identity
1164  for(size_t i (0), in (m_compartments.size()); i < in; ++i) {
1165  if(comp_status[i] == false) {
1166 
1167  THitRefs & hitrefs (m_compartments[i].SetMembers());
1168  NON_CONST_ITERATE(typename THitRefs, ii, hitrefs) {
1169  THitRef & hr (*ii);
1170  hr->SetScore(-hr->GetScore());
1171  }
1172  }
1173  }
1174 
1175  return m_compartments.size();
1176 }
1177 
1178 
1179 template<class THit>
1181  THitRefs& hitrefs,
1182  typename THitRefs::iterator ihr,
1183  typename THitRefs::iterator ihr_e,
1184  Uint1 w,
1185  size_t min_compartment_hit_len)
1186 {
1187  size_t nullified (0);
1188  for(int in (hitrefs.size()), i (in - 2); i > 0 && ihr != ihr_e; --i) {
1189 
1190  THitRef& h1 (hitrefs[i]);
1191  if(h1.IsNull()) continue;
1192 
1193  const TCoord * box1o (h1->GetBox());
1194  TCoord box1 [4] = {box1o[0], box1o[1], box1o[2], box1o[3]};
1195 
1196  // locate the first overlap
1197  for(; ihr != ihr_e; ++ihr) {
1198 
1199  THitRef hr (*ihr);
1200  if(hr.IsNull()) continue;
1201  if(hr->GetScore() > 0 && hr->GetStop(w) >= box1[2*w]) {
1202  break;
1203  }
1204  }
1205 
1206  if(ihr == ihr_e) {
1207  break;
1208  }
1209 
1210  // find the smallest not overlapped hit coord and its interval
1211  TCoord ls0 (box1[2*w+1] + 1), cursegmax (0);
1212  TCoord segmax_start(box1[2*w]), segmax_stop(box1[2*w+1]);
1213 
1214  if(box1[2*w] < (*ihr)->GetStart(w)) {
1215 
1216  segmax_start = ls0 = box1[2*w];
1217  segmax_stop = (*ihr)->GetStart(w) - 1;
1218  cursegmax = segmax_stop - segmax_start + 1;
1219  }
1220  else {
1221 
1222  TCoord shrsmax ((*ihr)->GetStop(w));
1223  for(++ihr; ihr != ihr_e; ++ihr) {
1224 
1225  THitRef hr (*ihr);
1226  if(hr.IsNull() || hr->GetScore() < 0) {
1227  continue;
1228  }
1229 
1230  const TCoord hrs0 (hr->GetStart(w));
1231  if(hrs0 > box1[2*w+1]) {
1232  segmax_stop = box1[2*w+1];
1233  break;
1234  }
1235 
1236  if(hrs0 > shrsmax) {
1237  segmax_stop = hrs0 - 1;
1238  break;
1239  }
1240 
1241  const TCoord hrs1 (hr->GetStop(w));
1242  if(hrs1 > shrsmax) {
1243  shrsmax = hrs1;
1244  }
1245  }
1246 
1247  if(shrsmax < box1[2*w+1]) {
1248  segmax_start = ls0 = shrsmax + 1;
1249  cursegmax = segmax_stop - segmax_start + 1;
1250  }
1251  }
1252 
1253  if(ls0 > box1[2*w+1]) {
1254  h1.Reset(0);
1255  ++nullified;
1256  continue;
1257  }
1258 
1259  // find the longest surviving segment
1260  for(; ihr != ihr_e; ++ihr) {
1261 
1262  THitRef hr (*ihr);
1263  if(hr.IsNull() || hr->GetScore() < 0) {
1264  continue;
1265  }
1266 
1267  const TCoord hrs0 (hr->GetStart(w));
1268  if(hrs0 > box1[2*w+1]) {
1269  if(ls0 <= box1[2*w+1] && box1[2*w+1] + 1 - ls0 > cursegmax) {
1270  segmax_start = ls0;
1271  segmax_stop = box1[2*w+1];
1272  cursegmax = segmax_stop - segmax_start + 1;
1273  }
1274  break;
1275  }
1276 
1277  if(hrs0 > ls0) {
1278  if(hrs0 - ls0 > cursegmax) {
1279  segmax_start = ls0;
1280  segmax_stop = hrs0 - 1;
1281  cursegmax = segmax_stop - segmax_start + 1;
1282  }
1283 
1284  ls0 = hr->GetStop(w) + 1;
1285  }
1286  else if(hr->GetStop(w) + 1 > ls0) {
1287  ls0 = hr->GetStop(w) + 1;
1288  }
1289  }
1290 
1291  if(box1[2*w+1] > ls0 + cursegmax) {
1292  segmax_start = ls0;
1293  segmax_stop = box1[2*w+1];
1294  cursegmax = segmax_stop - segmax_start + 1;
1295  }
1296 
1297  if(cursegmax < min_compartment_hit_len) {
1298  h1.Reset(0);
1299  ++nullified;
1300  continue;
1301  }
1302  else {
1303 
1304  if(box1[2*w] < segmax_start) {
1305  h1->Modify(2 * w, segmax_start);
1306  }
1307 
1308  if(segmax_stop < box1[2*w+1]) {
1309  h1->Modify(2 * w + 1, segmax_stop);
1310  }
1311  }
1312 
1313  if(ihr == ihr_e) break;
1314  }
1315 
1316  return nullified;
1317 }
1318 
1319 
1320 template<class THit>
1323 {
1324  if(m_compartments.size()) {
1325  m_iter = 0;
1326  return &m_compartments[m_iter++];
1327  }
1328  else {
1329  m_iter = -1;
1330  return 0;
1331  }
1332 }
1333 
1334 template<class THit>
1336 
1337  for(size_t i = 0, dim = m_compartments.size(); i < dim; ++i) {
1338  m_compartments[i].UpdateMinMax();
1339  }
1340 
1341  stable_sort(m_compartments.begin(), m_compartments.end(),
1343 }
1344 
1345 template<class THit>
1348 {
1349  const size_t dim = m_compartments.size();
1350  if(m_iter == -1 || m_iter >= int(dim)) {
1351  m_iter = -1;
1352  return 0;
1353  }
1354  else {
1355  return &m_compartments[m_iter++];
1356  }
1357 }
1358 
1359 
1360 template<class THit>
1362  TCoord min_matches,
1363  TCoord min_singleton_matches,
1364  bool cross_filter)
1365 {
1366  x_Init(comp_penalty, min_matches, min_singleton_matches, cross_filter);
1367 }
1368 
1369 
1370 template<class THit>
1372  typename THitRefs::iterator istart,
1373  typename THitRefs::iterator ifinish,
1374  TCoord comp_penalty,
1375  TCoord min_matches,
1376  TCoord min_singleton_matches,
1377  bool cross_filter,
1378  CScope *scope,
1379  const vector<pair<TCoord, TCoord> > *gaps)
1380 {
1381  x_Init(comp_penalty, min_matches, min_singleton_matches, cross_filter);
1382  Run(istart, ifinish, scope, gaps);
1383 }
1384 
1385 
1386 template<class THit>
1388  TCoord min_matches,
1389  TCoord min_singleton_matches,
1390  bool cross_filter)
1391 {
1392  m_Penalty = comp_penalty;
1393  m_MinMatches = min_matches;
1394  m_MinSingletonMatches = min_singleton_matches;
1395  m_CrossFiltering = cross_filter;
1398 }
1399 
1400 
1401 template<class THit>
1402 void CCompartmentAccessor<THit>::Run(typename THitRefs::iterator istart,
1403  typename THitRefs::iterator ifinish,
1404  CScope *scope,
1405  const vector<pair<TCoord, TCoord> > *gaps)
1406 {
1407 
1408  if(istart == ifinish) return;
1409 
1410  const TCoord kMax_TCoord (numeric_limits<TCoord>::max());
1411 
1412  // separate strands for CompartmentFinder
1413  typename THitRefs::iterator ib = istart, ie = ifinish, ii = ib,
1414  iplus_beg = ie;
1415  typedef CHitComparator<THit> THitComparator;
1416  THitComparator sorter (THitComparator::eSubjStrand);
1417  stable_sort(ib, ie, sorter);
1418 
1419  TCoord minus_subj_min = kMax_TCoord, minus_subj_max = 0;
1420  for(ii = ib; ii != ie; ++ii) {
1421  if((*ii)->GetSubjStrand()) {
1422  iplus_beg = ii;
1423  break;
1424  }
1425  else {
1426  if((*ii)->GetSubjMin() < minus_subj_min) {
1427  minus_subj_min = (*ii)->GetSubjMin();
1428  }
1429  if((*ii)->GetSubjMax() > minus_subj_max) {
1430  minus_subj_max = (*ii)->GetSubjMax();
1431  }
1432  }
1433  }
1434 
1435 
1436  // minus
1437  {{
1438  CGapInfo<THit> gapi( *(*istart)->GetSubjId(), scope);
1439  CPrecalcGapInfo<THit> prec_gapi( gaps );
1440  // flip
1441  gapi.Flip(minus_subj_min, minus_subj_max);
1442  prec_gapi.Flip(minus_subj_min, minus_subj_max);
1443  for(ii = ib; ii != iplus_beg; ++ii) {
1444 
1445  const typename THit::TCoord s0 = minus_subj_min + minus_subj_max
1446  - (*ii)->GetSubjMax();
1447  const typename THit::TCoord s1 = minus_subj_min + minus_subj_max
1448  - (*ii)->GetSubjMin();
1449  (*ii)->SetSubjStart(s0);
1450  (*ii)->SetSubjStop(s1);
1451  }
1452 
1453  CCompartmentFinder<THit> finder (ib, iplus_beg, &gapi, &prec_gapi);
1454  finder.SetPenalty(m_Penalty);
1455  finder.SetMinMatches(m_MinMatches);
1456  finder.SetMinSingletonMatches(m_MinSingletonMatches);
1457  finder.SetMaxIntron(m_MaxIntron);
1458  finder.SetMaxOverlap(m_MaxOverlap);
1459  finder.Run(m_CrossFiltering);
1460 
1461  // un-flip
1462  for(ii = ib; ii != iplus_beg; ++ii) {
1463 
1464  const typename THit::TCoord s0 = minus_subj_min + minus_subj_max
1465  - (*ii)->GetSubjMax();
1466  const typename THit::TCoord s1 = minus_subj_min + minus_subj_max
1467  - (*ii)->GetSubjMin();
1468  (*ii)->SetSubjStart(s1);
1469  (*ii)->SetSubjStop(s0);
1470  }
1471 
1472  x_Copy2Pending(finder);
1473  }}
1474 
1475  // plus
1476  {{
1477  CGapInfo<THit> gapi( *(*istart)->GetSubjId(), scope);
1478  CPrecalcGapInfo<THit> prec_gapi( gaps );
1479  CCompartmentFinder<THit> finder (iplus_beg, ie, &gapi, &prec_gapi);
1480  finder.SetPenalty(m_Penalty);
1481  finder.SetMinMatches(m_MinMatches);
1482  finder.SetMinSingletonMatches(m_MinSingletonMatches);
1483  finder.SetMaxIntron(m_MaxIntron);
1484  finder.SetMaxOverlap(m_MaxOverlap);
1485  finder.Run(m_CrossFiltering);
1486  x_Copy2Pending(finder);
1487  }}
1488 }
1489 
1490 
1491 template<class THit>
1493  CCompartmentFinder<THit>& finder)
1494 {
1495  finder.OrderCompartments();
1496 
1497  typedef typename CCompartmentFinder<THit>::THitRef THitRef;
1498 
1499  // copy to pending
1500  for(typename CCompartmentFinder<THit>::CCompartment* compartment =
1501  finder.GetFirst(); compartment; compartment = finder.GetNext()) {
1502 
1503  if(compartment->GetMembers().size() > 0) {
1504 
1505  m_pending.push_back(THitRefs(0));
1506  THitRefs& vh = m_pending.back();
1507 
1508  for(THitRef ph (compartment->GetFirst()); ph; ph = compartment->GetNext())
1509  {
1510  vh.push_back(ph);
1511  }
1512 
1513  const TCoord* box = compartment->GetBox();
1514  m_ranges.push_back(box[0]);
1515  m_ranges.push_back(box[1]);
1516  m_ranges.push_back(box[2]);
1517  m_ranges.push_back(box[3]);
1518 
1519  m_strands.push_back(compartment->GetStrand());
1520  m_status.push_back(compartment->GetFirst()->GetScore() > 0);
1521  }
1522  }
1523 }
1524 
1525 
1526 template<class THit>
1528 
1529  m_iter = 0;
1530  return GetNext(compartment);
1531 }
1532 
1533 
1534 template<class THit>
1536 
1537  compartment.clear();
1538  if(m_iter < m_pending.size()) {
1539  compartment = m_pending[m_iter++];
1540  return true;
1541  }
1542  else {
1543  return false;
1544  }
1545 }
1546 
1547 
1548 template<class THit>
1550 {
1552 
1554 
1555  // retrieve the ids
1556  CRef<CSeq_id> seqid_query (new CSeq_id), seqid_subj (new CSeq_id);
1557  if(m_pending.size()) {
1558  if(m_pending.front().size()) {
1559  const THit & h (*m_pending.front().front());
1560  seqid_query->Assign(*(h.GetQueryId()));
1561  seqid_subj->Assign(*(h.GetSubjId()));
1562  }
1563  }
1564 
1565  CSeq_align_set::Tdata& sas1_data (rv->Set());
1566 
1567  for(size_t i(0), idim(m_pending.size()); i < idim; ++i) {
1568 
1569  if(m_status[i]) {
1570 
1571  // note the range
1572  TCoord range_min (i > 0 && m_strands[i-1] == m_strands[i]
1573  ? m_ranges[4*i - 1]
1574  : 0);
1575 
1576  TCoord range_max (i + 1 < idim && m_strands[i+1] == m_strands[i]
1577  ? m_ranges[4*i + 6]
1579 
1580  CRef<CSeq_align> sa2 (new CSeq_align);
1582 
1584  const ENa_strand query_strand (eNa_strand_plus);
1585 
1586  const ENa_strand subj_strand (m_strands[i]? eNa_strand_plus:
1588  CRef<CSeq_loc> seq_loc (new CSeq_loc (*seqid_subj, range_min,
1589  range_max, subj_strand));
1590  bounds.push_back(seq_loc);
1591 
1592  // add the hits
1593  CSeq_align_set::Tdata& sas2_data (sa2->SetSegs().SetDisc().Set());
1594  ITERATE(typename THitRefs, ii, m_pending[i]) {
1595 
1596  const THit& h (**ii);
1597  CRef<CSeq_align> sa3 (new CSeq_align);
1599  const string& transcript(h.GetTranscript());
1600  if (transcript.empty()) {
1601  CRef<CStd_seg> sg(new CStd_seg);
1602  sa3->SetSegs().SetStd().push_back(sg);
1603  sg->SetDim(2);
1604  CRef<CSeq_loc> query_loc(new CSeq_loc(
1605  *seqid_query, h.GetQueryMin(), h.GetQueryMax(), query_strand));
1606  CRef<CSeq_loc> subj_loc(new CSeq_loc(
1607  *seqid_subj, h.GetSubjMin(), h.GetSubjMax(), subj_strand));
1608  sg->SetLoc().push_back(query_loc);
1609  sg->SetLoc().push_back(subj_loc);
1610  } else {
1611  CDense_seg & ds (sa3->SetSegs().SetDenseg());
1612  ds.FromTranscript(h.GetQueryStart(), query_strand,
1613  h.GetSubjStart(), subj_strand,
1614  transcript);
1615  ds.SetIds().push_back(seqid_query);
1616  ds.SetIds().push_back(seqid_subj);
1617  }
1618  sas2_data.push_back(sa3);
1619  }
1620 
1621  // save into the seq-align-set
1622  sas1_data.push_back(sa2);
1623  }
1624  }
1625 
1626  return rv;
1627 }
1628 
1629 
1630 
1632 
1633 
1634 #endif
User-defined methods of the data storage class.
void remove_if(Container &c, Predicate *__pred)
Definition: chainer.hpp:69
TCoord GetQueryStart(void) const
const TId & GetSubjId(void) const
TCoord GetQueryMin(void) const
TCoord GetSubjStart(void) const
const TId & GetQueryId(void) const
TCoord GetSubjMin(void) const
TCoord GetQueryMax(void) const
TCoord GetSubjMax(void) const
const TTranscript & GetTranscript(void) const
TParent::TCoord TCoord
void Run(typename THitRefs::iterator start, typename THitRefs::iterator finish, CScope *scope=NULL, const vector< pair< TCoord, TCoord > > *gaps=NULL)
Execute: identify compartments.
bool GetStrand(size_t i) const
void SetMaxIntron(TCoord mi)
Assign the maximum intron length, in base pairs.
bool GetFirst(THitRefs &compartment)
Initialize iteration over the results.
CRef< objects::CSeq_align_set > AsSeqAlignSet(void) const
Retrieve all valid compartments in a seq-align-set.
void x_Init(TCoord comp_penalty, TCoord min_matches, TCoord min_singleton_matches, bool cross_filter)
void Get(size_t idx, THitRefs &compartment) const
Retrieve a compartment by index.
const TCoord * GetBox(size_t i) const
TCompartmentFinder::THitRefs THitRefs
CCompartmentAccessor(TCoord comp_penalty_bps, TCoord min_matches, TCoord min_singleton_matches=numeric_limits< TCoord >::max(), bool cross_filter=false)
Construct the object and assign the parameters of the algorithm.
TCompartmentFinder::TCoord TCoord
void SetMaxOverlap(TCoord max_overlap)
Assign the maximum length for compartments to overlap on the subject.
vector< THitRefs > m_pending
bool GetNext(THitRefs &compartment)
Proceed with iteration over the results.
bool GetStatus(size_t i) const
TCoord GetMaxIntron(void) const
Retrieve the maximum intron length, in base pairs.
pair< size_t, size_t > GetCounts(void) const
Retrieve the compartment counts.
CCompartmentFinder< THit > TCompartmentFinder
void x_Copy2Pending(TCompartmentFinder &finder)
Individual compartment representation.
const THitRef GetNext(void) const
Retrieve the next member.
void AddMember(const THitRef &hitref)
Add a new member into the compartment.
const TCoord * GetBox(void) const
Retrieve the compartment's box.
const THitRefs & GetMembers(void)
Retrieve compartment's members.
static bool s_PLowerSubj(const CCompartment &c1, const CCompartment &c2)
bool GetStrand(void) const
Retrieve the compartment's strand (true == plus)
void UpdateMinMax(void)
Make sure the compartment's box is up-to-date.
CCompartment(void)
Create an empty compartment.
THitRefs & SetMembers(void)
Assign all members of the compartment.
const THitRef GetFirst(void) const
Retrieve the first member.
void SetMaxIntron(TCoord intr_max)
Set the maximum length of an intron.
static TCoord s_GetDefaultMaxOverlap(void)
Retrieve the default compartment overlap behaviour (no overlap).
static TCoord s_GetDefaultMinCoverage(void)
Retrieve the default minimum coverage, in base pairs.
void SetMinMatches(TCoord min_matches)
Set the minimum matching residues per compartment.
void SetPenalty(TCoord penalty)
Set the penalty to open a compartment.
vector< CCompartment > m_compartments
static TCoord s_GetDefaultPenalty(void)
Retrieve the default compartment penalty, in base pairs.
size_t Run(bool cross_filter=false)
Identify compartments.
vector< THitRef > THitRefs
CCompartment * GetNext(void)
CPrecalcGapInfo< THit > * m_prec_gap_info
static TCoord s_GetDefaultMaxIntron(void)
Retrieve the default maximum length of an intron.
void SetMinSingletonMatches(TCoord min_matches)
Set the minimum matching residues per singleton compartment.
CCompartmentFinder(typename THitRefs::const_iterator start, typename THitRefs::const_iterator finish, CGapInfo< THit > *gap_info=NULL, CPrecalcGapInfo< THit > *prec_gap_info=NULL)
Create the object from the complete set of local alignments (hits) between the transcript and the gen...
static bool s_PNullRef(const THitRef &hr)
CGapInfo< THit > * m_gap_info
void OrderCompartments(void)
Order compartments by lower subj coordinate.
void SetMaxOverlap(TCoord max_overlap)
Set compartment overlap behaviour.
static size_t sx_XFilter(THitRefs &hitrefs, typename THitRefs::iterator ihr, typename THitRefs::iterator ihr_e, Uint1 w, size_t min_compartment_hit_len)
CCompartment * GetFirst(void)
CConstRef –.
Definition: ncbiobj.hpp:1266
void FromTranscript(TSeqPos query_start, ENa_strand query_strand, TSeqPos subj_start, ENa_strand subj_strand, const string &transcript)
Initialize from pairwise alignment transcript (a string representation produced by CNWAligner)
Definition: Dense_seg.cpp:1273
void Flip(TCoord subj_min, TCoord subj_max)
CGapInfo(const CSeq_id &id, CScope *scope)
THit::TCoord TCoord
CConstRef< CSeqMap > m_seq_map
bool IntersectsNonBridgeableGap(TCoord from, TCoord to)
CPrecalcGapInfo(const vector< pair< TCoord, TCoord > > *gaps)
const vector< pair< TCoord, TCoord > > * m_gaps
bool IntersectsNonBridgeableGap(TCoord from, TCoord to)
void Flip(TCoord subj_min, TCoord subj_max)
double operator()(double acm, CRef< THit > ph)
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
Iterator over CSeqMap.
Definition: seq_map_ci.hpp:252
USING_SCOPE(objects)
const double kPenaltyPerIntronPos
double GetTotalMatches(const typename CCompartmentFinder< THit >::THitRefs &hitrefs0)
const double kPenaltyPerIntronBase
IMPLEMENTATION.
CSplign::THitRef THitRef
CSplign::THitRefs THitRefs
static const char * bounds[]
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
const CSeqMap & GetSeqMap(void) const
Get sequence map.
CConstRef< CSeq_literal > GetRefGapLiteral(void) const
return CSeq_literal with gap data, or null if either the segment is not a gap, or an unspecified gap
Definition: seq_map_ci.cpp:292
CSeqMap_CI ResolvedRangeIterator(CScope *scope, TSeqPos from, TSeqPos length, ENa_strand strand=eNa_strand_plus, size_t maxResolve=size_t(-1), TFlags flags=fDefaultFlags) const
Iterate segments in the range with specified strand coordinates.
Definition: seq_map.cpp:868
@ fFindGap
Definition: seq_map.hpp:130
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool IsNull(void) const THROWS_NONE
Check if pointer is null – same effect as Empty().
Definition: ncbiobj.hpp:735
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
void Run(void)
Enter the main loop.
Tdata & Set(void)
Assign a value to data member.
void SetSegs(TSegs &value)
Assign a value to Segs data member.
Definition: Seq_align_.cpp:310
void SetType(TType value)
Assign a value to Type data member.
Definition: Seq_align_.hpp:818
list< CRef< CSeq_loc > > TBounds
Definition: Seq_align_.hpp:400
list< CRef< CSeq_align > > Tdata
TIds & SetIds(void)
Assign a value to Ids data member.
Definition: Dense_seg_.hpp:511
TBounds & SetBounds(void)
Assign a value to Bounds data member.
Definition: Seq_align_.hpp:957
@ eType_disc
discontinuous alignment
Definition: Seq_align_.hpp:104
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
yy_size_t n
T max(T x_, T y_)
std::istream & in(std::istream &in_, double &x_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
SHitStatus(EType type, int prev, double score)
Definition: type.c:6
Modified on Wed Apr 24 14:15:11 2024 by modify_doxy.py rev. 669887