NCBI C++ ToolKit
chainer.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: chainer.cpp 101798 2024-02-13 17:18:22Z souvorov $
2  ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Alexandre Souvorov
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbiapp.hpp>
34 #include <corelib/ncbienv.hpp>
35 #include <corelib/ncbiargs.hpp>
36 
37 #include <algo/gnomon/chainer.hpp>
41 
43 
45 #include <algo/gnomon/gnomon.hpp>
46 #include <algo/gnomon/annot.hpp>
47 
48 #include <map>
49 #include <sstream>
50 #include <tuple>
51 #include <unordered_set>
52 #include <unordered_map>
53 
56 #include <objmgr/feat_ci.hpp>
57 #include <objmgr/util/sequence.hpp>
58 
59 #include "gnomon_seq.hpp"
60 
62 BEGIN_SCOPE(gnomon)
63 
64 bool BelongToExon(const CGeneModel::TExons& exons, int pos) {
65  ITERATE(CGeneModel::TExons, i, exons) {
66  if(Include(i->Limits(),pos))
67  return true;
68  }
69  return false;
70 }
71 
72 class CChain;
73 typedef list<CChain> TChainList;
74 typedef list<CChain*> TChainPointerList;
75 
76 
77 struct SChainMember;
78 typedef vector<SChainMember*> TContained;
79 
80 typedef map<Int8,CAlignModel*> TOrigAligns;
81 typedef map<Int8,CGeneModel> TUnmodAligns;
82 struct SFShiftsCluster;
83 class CChainMembers;
84 
85 class CGene;
86 
88 
89 private:
90  CChainerImpl(CRef<CHMMParameters>& hmm_params, unique_ptr<CGnomonEngine>& gnomon, const CAlignMap& edited_contig_map, const TSignedSeqRange& limits, const string& m_contig_acc);
91  void SetGenomicRange(const TAlignModelList& alignments);
93 
94  void FilterOutChimeras(TGeneModelList& clust);
95 
98  void CombineCompatibleChains(TChainList& chains);
99  void SetFlagsForChains(TChainList& chains);
100  SChainMember* FindOptimalChainForProtein(TContained& pointers_all, vector<CGeneModel*>& parts, CGeneModel& palign);
101  void CreateChainsForPartialProteins(TChainList& chains, TContained& pointers, TGeneModelList& unma_aligns, CChainMembers& unma_members);
102  void CutParts(TGeneModelList& clust);
103  bool CanIncludeJinI(const SChainMember& mi, const SChainMember& mj);
104  void IncludeInContained(SChainMember& big, SChainMember& small);
105  void FindContainedAlignments(TContained& pointers);
106  void DuplicateNotOriented(CChainMembers& pointers, TGeneModelList& clust);
108  void ReplicatePStops(CChainMembers& pointers);
109  void ScoreCdnas(CChainMembers& pointers);
110  void DuplicateUTRs(CChainMembers& pointers);
111  void CalculateSpliceWeights(CChainMembers& pointers);
112  bool LRCanChainItoJ(int& delta_cds, double& delta_num, double& delta_splice_num, SChainMember& mi, SChainMember& mj, TContained& contained, bool& not_sorted);
113  void LRIinit(SChainMember& mi, const TContained& micontained);
114  void LeftRight(TContained& pointers);
115  void RightLeft(TContained& pointers);
116  double GoodCDNAScore(const CGeneModel& algn, bool simple = false);
117  void RemovePoorCds(CGeneModel& algn, double minscor);
118  void SkipReason(CGeneModel* orig_align, const string& comment);
119  bool AddIfCompatible(set<SFShiftsCluster>& fshift_clusters, const CGeneModel& algn);
120  bool FsTouch(const TSignedSeqRange& lim, const CInDelInfo& fs);
121  void SplitAlignmentsByStrand(const TGeneModelList& clust, TGeneModelList& clust_plus, TGeneModelList& clust_minus);
122 
123  void FindGeneSeeds(list<CGene>& alts, TChainPointerList& not_placed_yet);
124  void ReplacePseudoGeneSeeds(list<CGene>& alts, TChainPointerList& not_placed_yet);
125  void FindAltsForGeneSeeds(list<CGene>& alts, TChainPointerList& not_placed_yet);
126  void PlaceAllYouCan(list<CGene>& alts, TChainPointerList& not_placed_yet, TChainPointerList& rejected);
128  ECompat CheckCompatibility(const CGene& gene, const CChain& algn);
129  list<CGene> FindGenes(TChainList& cls);
131  void FilterOutTandemOverlap(TChainPointerList& not_placed_yet, TChainPointerList& rejected, double fraction);
132  void TrimAlignmentsIncludedInDifferentGenes(list<CGene>& genes);
133 
134 
136  unique_ptr<CGnomonEngine>& m_gnomon;
139  const string& m_contig_acc;
140 
141 
144  int trim;
145  map<string,TSignedSeqRange> mrnaCDS;
146  map<string, pair<bool,bool> > prot_complet;
149 
154  int max_dist;
159 
160  int minpolya;
162  TIntMap confirmed_ends; // [splice], end
163 
166 
167  map<TSignedSeqRange,int> mrna_count;
168  map<TSignedSeqRange,int> est_count;
169  map<TSignedSeqRange,int> rnaseq_count;
171  set<TSignedSeqRange> oriented_introns_plus;
172  set<TSignedSeqRange> oriented_introns_minus;
173 
174  double altfrac;
180 
181  int m_idnext;
182  int m_idinc;
183 
185 
186  int flex_len;
187 
188  friend class CChainer;
189  friend class CChainerArgUtil;
190 };
191 
193 
195 
197 {
198  m_masking = true;
199 }
200 
202 {
204 }
205 
207 {
208 }
209 
210 CChainer::CChainerImpl::CChainerImpl(CRef<CHMMParameters>& hmm_params, unique_ptr<CGnomonEngine>& gnomon, const CAlignMap& edited_contig_map, const TSignedSeqRange& limits, const string& contig_acc)
211  :m_hmm_params(hmm_params), m_gnomon(gnomon), m_edited_contig_map(edited_contig_map), m_limits(limits), m_contig_acc(contig_acc), m_idnext(1), m_idinc(1)
212 {
213 }
214 
216 {
217  return m_data->MakeChains(models);
218 }
219 
220 enum {
223  eRightUTR
224 };
225 
226 typedef set<SChainMember*> TMemberPtrSet;
227 
229 {
231  m_align(0), m_cds_info(0), m_align_map(0), m_left_member(0), m_right_member(0), m_sink_for_contained(0),
232  m_copy(0), m_contained(0), m_identical_count(0),
233  m_left_num(0), m_right_num(0), m_num(0),
234  m_splice_weight(0), m_left_splice_num(0), m_right_splice_num(0), m_splice_num(0),
235  m_type(eCDS), m_left_cds(0), m_right_cds(0), m_cds(0), m_included(false), m_postponed(false), m_internal(false),
236  m_marked_for_deletion(false), m_marked_for_retention(false), m_restricted_to_start(false),
237  m_gapped_connection(false), m_fully_connected_to_part(-1), m_not_for_chaining(false),
238  m_rlimb(numeric_limits<int>::max()), m_llimb(numeric_limits<int>::max()), m_orig_align(0), m_unmd_align(0), m_mem_id(0) {}
239 
240  TContained CollectContainedForChain();
241  TContained CollectCodingContainedForChain();
242  void MarkIncluded();
243  void MarkIncludedForChain();
244  void MarkPostponed();
245  void MarkPostponedForChain();
246  void MarkUnwantedCopiesForChain(const TSignedSeqRange& cds);
247  TContained CollectContainedForMemeber();
248  TContained CollectCodingContainedForMemeber();
249  void AddToContained(TContained& contained, TMemberPtrSet& included_in_list);
250  void AddCodingToContained(TContained& contained, TMemberPtrSet& included_in_list);
251 
258  TContained* m_copy; // is used to make sure that the copy of already incuded duplicated alignment is not included in contained and doesn't trigger a new chain genereation
261  double m_left_num, m_right_num, m_num;
263  double m_left_splice_num, m_right_splice_num, m_splice_num;
264  double m_accumulated_splice_num, m_accumulated_num;
265  int m_type, m_left_cds, m_right_cds, m_cds;
272  bool m_gapped_connection; // used for gapped proteins
273  int m_fully_connected_to_part; // used for gapped proteins
274  bool m_not_for_chaining; // included in other alignmnet(s) or supressed and can't trigger a different chain
275  int m_rlimb; // leftmost compatible rexon
276  int m_llimb; // leftmost not compatible lexon
279  int m_mem_id;
280 };
281 
282 class CChain : public CGeneModel
283 {
284 private:
286  tuple<TIDMap, TSignedSeqRange> PeaksAndLimits(EStatus determinant, int min_blob_weight, int max_empty_dist, int min_splice_dist);
287  tuple<TIVec, TSignedSeqRange> MainPeaks(TIDMap& peak_weights, double secondary_peak, double tertiary_peak, double tertiary_peak_coverage, bool right_end);
288 public:
289  CChain(SChainMember& mbr, CGeneModel* gapped_helper = 0, bool keep_all_evidence = false, bool addallsupport = true);
290  CChain(SChainMember& mbr, bool full_support);
291  void AddAllMembersAndCoverage(SChainMember& mbr);
292 
293  void RestoreTrimmedEnds(int trim);
294  void RemoveFshiftsFromUTRs();
295  bool RestoreReasonableConfirmedStart(const CGnomonEngine& gnomon, TOrigAligns& orig_aligns);
296  void SetOpenForPartialyAlignedProteins(map<string, pair<bool,bool> >& prot_complet);
297  pair<bool,bool> ValidPolyA(int pos, const CResidueVec& contig);
298  void ClipToCap(int min_cap_blob, int max_dist, int min_flank_exon, double secondary_peak, bool recalulate_support = true );
299  void ClipToPolyA(const CResidueVec& contig, int min_polya_blob, int max_dist, int min_flank_exon, double secondary_peak, double tertiary_peak, double tertiary_peak_coverage, bool recalulate_support = true);
300  void CheckSecondaryCapPolyAEnds();
301  void ClipLowCoverageUTR(double utr_clip_threshold, bool recalulate_support = true);
302  void CalculateDropLimits();
303  void CalculateSupportAndWeightFromMembers(bool keep_all_evidence = false);
304  void ClipChain(TSignedSeqRange limits, bool recalulate_support = true);
305  bool SetConfirmedEnds(const CGnomonEngine& gnomon, CGnomonAnnotator_Base::TIntMap& confirmed_ends);
306 
307  void SetConfirmedStartStopForCompleteProteins(map<string, pair<bool,bool> >& prot_complet, const SMinScor& minscor);
308  void CollectTrustedmRNAsProts(TOrigAligns& orig_aligns, const SMinScor& minscor, CScope& scope, SMatrix& matrix, const CResidueVec& contig);
309  void SetBestPlacement(TOrigAligns& orig_aligns);
310  void SetConsistentCoverage();
311 
312  bool HarborsNested(const CChain& other_chain, bool check_in_holes) const;
313  bool HarborsNested(const CGene& other_gene, bool check_in_holes) const;
314 
315  bool HasTrustedEvidence() const;
316 
325  vector<double> m_coverage;
332 };
333 
334 
335 class CGene : public TChainPointerList
336 {
337 public:
338  CGene() : m_maxscore(BadScore()) {}
339  typedef list<CGeneModel>::iterator TIt;
340  typedef list<CGeneModel>::const_iterator TConstIt;
341  TSignedSeqRange Limits() const { return m_limits; }
342  TSignedSeqRange RealCdsLimits() const { return m_real_cds_limits; }
343  bool IsAlternative(const CChain& a) const;
344  bool IsAllowedAlternative(const ncbi::gnomon::CGeneModel&, int maxcomposite) const;
345  void Insert(CChain& a);
346  double MaxScore() const { return m_maxscore; }
347  bool Nested() const { return !m_nested_in_genes.empty(); }
348  bool LargeCdsOverlap(const CGeneModel& a) const;
349  bool HarborsNested(const CChain& other_chain, bool check_in_holes) const;
350  bool HarborsNested(const CGene& other_gene, bool check_in_holes) const;
351 
352  void AddToHarbored(CGene* p) { m_harbors_genes.insert(p); }
353  void AddToNestedIn(CGene* p) {m_nested_in_genes.insert(p); };
354  set<CGene*> RemoveGeneFromOtherGenesSets();
355 
356 
357 private:
358  bool HarborsRange(TSignedSeqRange range, bool check_in_holes) const;
359  void RemoveFromHarbored(CGene* p) { m_harbors_genes.erase(p); }
360  void RemoveFromNestedIn(CGene* p) {m_nested_in_genes.erase(p); };
361 
363  double m_maxscore;
364  set<CGene*> m_nested_in_genes;
365  set<CGene*> m_harbors_genes;
366 };
367 
369  NON_CONST_ITERATE(set<CGene*>, i, m_nested_in_genes)
370  (*i)->RemoveFromHarbored(this);
371  NON_CONST_ITERATE(set<CGene*>, i,m_harbors_genes)
372  (*i)->RemoveFromNestedIn(this);
373 
374  return m_harbors_genes;
375 }
376 
377 // if external model is 'open' all 5' introns can harbor
378 // gene with 'double' CDS can harbor in the interval between CDSes (intron or not)
379 // non coding models in external coding genes have no effect
380 bool CGene::HarborsRange(TSignedSeqRange range, bool check_in_holes) const {
381  TSignedSeqRange gene_lim_for_nested = Limits();
382  if(RealCdsLimits().NotEmpty())
383  gene_lim_for_nested = front()->OpenCds() ? front()->MaxCdsLimits() : RealCdsLimits(); // 'open' could be only a single variant gene
384  if(!Include(gene_lim_for_nested,range))
385  return false;
386 
387  bool nested = true;
388  ITERATE(CGene, it, *this) {
389  if(RealCdsLimits().NotEmpty() && (*it)->ReadingFrame().Empty()) // non coding model in coding gene
390  continue;
391  TSignedSeqRange model_lim_for_nested = (*it)->Limits();
392  if((*it)->ReadingFrame().NotEmpty())
393  model_lim_for_nested = (*it)->OpenCds() ? (*it)->MaxCdsLimits() : (*it)->RealCdsLimits(); // 'open' could be only a single variant gene
394  if(range.IntersectingWith(model_lim_for_nested) && !CModelCompare::RangeNestedInIntron(range, **it, check_in_holes)) {
395  nested = false;
396  break;
397  }
398  }
399 
400  return nested;
401 }
402 
403 // if external model is 'open' all 5' introns can harbor
404 // gene with 'double' CDS can harbor in the interval between CDSes (intron or not)
405 // for nested model 'open' is ignored
406 // non coding models in external coding genes have no effect
407 bool CGene::HarborsNested(const CChain& other_chain, bool check_in_holes) const {
408  TSignedSeqRange other_lim_for_nested = other_chain.Limits();
409  if(!other_chain.ReadingFrame().Empty())
410  other_lim_for_nested = other_chain.RealCdsLimits();
411 
412  return HarborsRange(other_lim_for_nested, check_in_holes);
413 }
414 
415 // if external model is 'open' all 5' introns can harbor
416 // gene with 'double' CDS can harbor in the interval between CDSes (intron or not)
417 // for nested model 'open' is ignored
418 // non coding models in external coding genes have no effect
419 bool CGene::HarborsNested(const CGene& other_gene, bool check_in_holes) const {
420  TSignedSeqRange other_lim_for_nested = other_gene.Limits();
421  if(!other_gene.RealCdsLimits().Empty())
422  other_lim_for_nested = other_gene.RealCdsLimits();
423 
424  return HarborsRange(other_lim_for_nested, check_in_holes);
425 }
426 
427 
428 bool CGene::LargeCdsOverlap(const CGeneModel& a) const {
429 
430  ITERATE(CGene, it, *this) {
431  const CGeneModel& b = **it;
432  int common_cds = 0;
433  ITERATE(CGeneModel::TExons, ib, b.Exons()) {
434  ITERATE(CGeneModel::TExons, ia, a.Exons()) {
435  common_cds += (ib->Limits()&b.RealCdsLimits()&ia->Limits()&a.RealCdsLimits()).GetLength();
436  }
437  }
438  if(common_cds > 50)
439  return true;
440  }
441 
442  return false;
443 }
444 
446 {
447  push_back(&a);
448  m_limits += a.Limits();
449  m_real_cds_limits += a.RealCdsLimits();
450  m_maxscore = max(m_maxscore,a.Score());
451 }
452 
453 bool CGene::IsAllowedAlternative(const CGeneModel& a, int maxcomposite) const
454 {
455  if(a.Exons().size() > 1 && (a.Status()&CGeneModel::ecDNAIntrons) == 0 && a.TrustedmRNA().empty() && a.TrustedProt().empty()) {
456  return false;
457  }
458 
459  if (a.Support().empty()) {
460  return false;
461  }
462 
463  int composite = 0;
464  ITERATE(CSupportInfoSet, s, a.Support()) {
465  if(s->IsCore() && ++composite > maxcomposite) return false;
466  }
467 
468  if(a.PStop(false) || !a.FrameShifts().empty())
469  return false;
470  if(front()->PStop(false) || !front()->FrameShifts().empty())
471  return false;
472 
473  // check for gapfillers
474 
475  vector<TSignedSeqRange> gene_gapfill_exons;
476  ITERATE(CGeneModel::TExons, e, front()->Exons()) {
477  if(e->m_fsplice_sig == "XX" || e->m_ssplice_sig == "XX")
478  gene_gapfill_exons.push_back(e->Limits());
479  }
480  vector<TSignedSeqRange> a_gapfill_exons;
481  ITERATE(CGeneModel::TExons, e, a.Exons()) {
482  if(e->m_fsplice_sig == "XX" || e->m_ssplice_sig == "XX")
483  a_gapfill_exons.push_back(e->Limits());
484  }
485  if(gene_gapfill_exons != a_gapfill_exons)
486  return false;
487 
488  bool a_share_intron = false;
489  ITERATE(CGene, it, *this) {
490  const CGeneModel& b = **it;
491  set<TSignedSeqRange> b_introns;
492  for(int i = 1; i < (int)b.Exons().size(); ++i) {
493  if(b.Exons()[i-1].m_ssplice && b.Exons()[i].m_fsplice) {
494  TSignedSeqRange intron(b.Exons()[i-1].GetTo()+1,b.Exons()[i].GetFrom()-1);
495  b_introns.insert(intron);
496  }
497  }
498 
499  bool a_has_new_intron = false;
500  for(int i = 1; i < (int)a.Exons().size(); ++i) {
501  if(a.Exons()[i-1].m_ssplice && a.Exons()[i].m_fsplice && a.Exons()[i-1].m_ssplice_sig != "XX" && a.Exons()[i].m_fsplice_sig != "XX") {
502  TSignedSeqRange intron(a.Exons()[i-1].GetTo()+1,a.Exons()[i].GetFrom()-1);
503  if(b_introns.insert(intron).second)
504  a_has_new_intron = true;
505  else
506  a_share_intron = true;
507  }
508  }
509 
510  if(a_has_new_intron) {
511  continue;
512  } else if(!gene_gapfill_exons.empty()) {
513  return false;
514  } else if(a.RealCdsLimits().NotEmpty() && b.RealCdsLimits().NotEmpty() && !a.RealCdsLimits().IntersectingWith(b.RealCdsLimits()) && (!a.TrustedmRNA().empty() || !a.TrustedProt().empty())) {
515 #ifdef _DEBUG
516  const_cast<CGeneModel&>(a).AddComment("Secondary CDS");
517 #endif
518  continue;
519  } else if(a.RealCdsLen() <= b.RealCdsLen()){
520  return false;
521  }
522  }
523 
524  return (a_share_intron || gene_gapfill_exons.empty());
525 }
526 
527 bool CGene::IsAlternative(const CChain& a) const
528 {
529  _ASSERT( size()>0 );
530 
531  if (a.Strand() != front()->Strand())
532  return false;
533 
534  bool gene_has_trusted = false;
535  ITERATE(CGene, it, *this) {
536  if((*it)->HasTrustedEvidence()) {
537  gene_has_trusted = true;
538  break;
539  }
540  }
541 
542  bool has_common_splice = false;
543 
544  ITERATE(CGene, it, *this) {
545  if(CModelCompare::CountCommonSplices(**it, a) > 0) { // has common splice
546  has_common_splice = true;
547  break;
548  }
549  }
550 
551  if(has_common_splice && (!gene_has_trusted || !a.HasTrustedEvidence())) // separate trusted genes with similar splices if they don't have common cds
552  return true;
553 
554  if(a.ReadingFrame().NotEmpty() && RealCdsLimits().NotEmpty()) {
555  CAlignMap amap(a.Exons(), a.FrameShifts(), a.Strand(), a.GetCdsInfo().Cds());
556  TIVec acds_map(amap.FShiftedLen(a.GetCdsInfo().Cds()),0);
557  for(unsigned int j = 0; j < a.Exons().size(); ++j) {
558  for(TSignedSeqPos k = max(a.Exons()[j].GetFrom(),a.GetCdsInfo().Cds().GetFrom()); k <= min(a.Exons()[j].GetTo(),a.GetCdsInfo().Cds().GetTo()); ++k) {
559  TSignedSeqPos p = amap.MapOrigToEdited(k);
560  _ASSERT(p < (int)acds_map.size());
561  if(p >= 0)
562  acds_map[p] = k;
563  }
564  }
565 
566 
567  bool has_common_cds = false;
568 
569  ITERATE(CGene, it, *this) {
570  if(!a.GetCdsInfo().Cds().IntersectingWith((*it)->GetCdsInfo().Cds()))
571  continue;
572 
573  CAlignMap gmap((*it)->Exons(), (*it)->FrameShifts(), (*it)->Strand(), (*it)->GetCdsInfo().Cds());
574  TIVec cds_map(gmap.FShiftedLen((*it)->GetCdsInfo().Cds()),0);
575  for(unsigned int j = 0; j < (*it)->Exons().size(); ++j) {
576  for(TSignedSeqPos k = max((*it)->Exons()[j].GetFrom(),(*it)->GetCdsInfo().Cds().GetFrom()); k <= min((*it)->Exons()[j].GetTo(),(*it)->GetCdsInfo().Cds().GetTo()); ++k) {
577  TSignedSeqPos p = gmap.MapOrigToEdited(k);
578  _ASSERT(p < (int)cds_map.size());
579  if(p >= 0)
580  cds_map[p] = k;
581  }
582  }
583 
584  for(unsigned int i = 0; i < acds_map.size(); ) {
585  unsigned int j = 0;
586  for( ; j < cds_map.size() && (acds_map[i] != cds_map[j] || i%3 != j%3); ++j);
587  if(j == cds_map.size()) {
588  ++i;
589  continue;
590  }
591 
592  int count = 0;
593  for( ; j < cds_map.size() && i < acds_map.size() && acds_map[i] == cds_map[j]; ++j, ++i, ++count);
594 
595  if(count > 30) { // has common cds
596  has_common_cds = true;
597  break;
598  }
599  }
600 
601  if(has_common_cds)
602  break;
603  }
604 
605  return has_common_cds;
606  }
607 
608  return has_common_splice;
609 }
610 
611 static bool DescendingModelOrder(const CChain& a, const CChain& b)
612 {
613  if (!a.Support().empty() && b.Support().empty())
614  return true;
615  else if (a.Support().empty() && !b.Support().empty())
616  return false;
617 
618 
619  bool atrusted = !a.TrustedmRNA().empty() || !a.TrustedProt().empty();
620  bool btrusted = !b.TrustedmRNA().empty() || !b.TrustedProt().empty();
621  if(atrusted && !btrusted) { // trusted gene is always better
622  return true;
623  } else if(btrusted && !atrusted) {
624  return false;
625  } else if(a.ReadingFrame().NotEmpty() && b.ReadingFrame().Empty()) { // coding is always better
626  return true;
627  } else if(b.ReadingFrame().NotEmpty() && a.ReadingFrame().Empty()) {
628  return false;
629  } else if(a.ReadingFrame().NotEmpty()) { // both coding
630 
631  double ds = 0.05*fabs(a.Score());
632  double as = a.Score();
633  if((a.Status()&CGeneModel::ecDNAIntrons) != 0)
634  as += 2*ds;
635  if((a.Status()&CGeneModel::ePolyA) != 0)
636  as += ds;
637  if((a.Status()&CGeneModel::eCap) != 0)
638  as += ds;
639  if(a.isNMD())
640  as -= ds;
641 
642  ds = 0.05*fabs(b.Score());
643  double bs = b.Score();
644  if((b.Status()&CGeneModel::ecDNAIntrons) != 0)
645  bs += 2*ds;
646  if((b.Status()&CGeneModel::ePolyA) != 0)
647  bs += ds;
648  if((b.Status()&CGeneModel::eCap) != 0)
649  bs += ds;
650  if(b.isNMD())
651  bs -= ds;
652 
653  if(as > bs) // better score
654  return true;
655  else if(bs > as)
656  return false;
657  else if(a.m_splice_weight > b.m_splice_weight) // more splice support
658  return true;
659  else if(a.m_splice_weight < b.m_splice_weight)
660  return false;
661  else if(a.Weight() > b.Weight()) // more alignments is better
662  return true;
663  else if(a.Weight() < b.Weight())
664  return false;
665  else if(a.Limits().GetLength() != b.Limits().GetLength())
666  return (a.Limits().GetLength() < b.Limits().GetLength()); // everything else equal prefer compact model
667  else
668  return a.ID() < b.ID();
669  } else { // both noncoding
670  double asize = a.m_splice_weight;
671  double bsize = b.m_splice_weight;
672  double ds = 0.025*(asize+bsize);
673 
674  if((a.Status()&CGeneModel::ePolyA) != 0)
675  asize += ds;
676  if((a.Status()&CGeneModel::eCap) != 0)
677  asize += ds;
678  if(a.isNMD())
679  asize -= ds;
680 
681  if((b.Status()&CGeneModel::ePolyA) != 0)
682  bsize += ds;
683  if((b.Status()&CGeneModel::eCap) != 0)
684  bsize += ds;
685  if(b.isNMD())
686  bsize -= ds;
687 
688  if(asize > bsize)
689  return true;
690  else if(bsize > asize)
691  return false;
692  else if(a.Limits().GetLength() != b.Limits().GetLength())
693  return (a.Limits().GetLength() < b.Limits().GetLength()); // everything else equal prefer compact model
694  else
695  return a.ID() < b.ID();
696  }
697 }
698 
699 typedef CChain* TChainPtr;
700 static bool DescendingModelOrderP(const TChainPtr& a, const TChainPtr& b)
701 {
702  return DescendingModelOrder(*a, *b);
703 }
705 {
707  return (a->Status()&CGeneModel::eConsistentCoverage) > (b->Status()&CGeneModel::eConsistentCoverage);
708  else
709  return DescendingModelOrder(*a, *b);
710 }
711 
713 {
714  bool gene_good_enough_to_be_annotation = allow_partialalts || gene.front()->GoodEnoughToBeAnnotation();
715  bool algn_good_enough_to_be_annotation = allow_partialalts || algn.GoodEnoughToBeAnnotation();
716 
717  TSignedSeqRange gene_cds = (gene.size() > 1 || gene.front()->CompleteCds() || algn_good_enough_to_be_annotation) ? gene.RealCdsLimits() : gene.front()->MaxCdsLimits();
718  TSignedSeqRange algn_cds = (algn.CompleteCds() || gene_good_enough_to_be_annotation) ? algn.RealCdsLimits() : algn.MaxCdsLimits();
719 
720  if(!gene_good_enough_to_be_annotation && !algn_good_enough_to_be_annotation) { // both need ab initio
721  const CGeneModel& b = *gene.front();
722  for(int i = 1; i < (int)b.Exons().size(); ++i) {
723  if(b.Exons()[i].m_ssplice_sig == "XX" && b.Exons()[i].m_fsplice_sig == "XX" && b.Exons()[i].Limits().IntersectingWith(gene_cds)) { // if gap cds extend range to left exon
724  gene_cds.SetFrom(min(gene_cds.GetFrom(), b.Exons()[i-1].GetTo()));
725  }
726  }
727 
728  for(int i = 1; i < (int)algn.Exons().size(); ++i) {
729  if(algn.Exons()[i].m_ssplice_sig == "XX" && algn.Exons()[i].m_fsplice_sig == "XX" && algn.Exons()[i].Limits().IntersectingWith(algn_cds)) { // if gap cds extend range to left exon
730  algn_cds.SetFrom(min(algn_cds.GetFrom(), algn.Exons()[i-1].GetTo()));
731  }
732  }
733  }
734 
735  if(!gene.Limits().IntersectingWith(algn.Limits())) // don't overlap
736  return eOtherGene;
737 
738  if(gene.IsAlternative(algn)) { // has common splice or common CDS
739 
740  if(gene.IsAllowedAlternative(algn, composite) && algn_good_enough_to_be_annotation) {
741  if(!algn.TrustedmRNA().empty() || !algn.TrustedProt().empty()) { // trusted gene
742  return eAlternative;
743  } else if(algn.ReadingFrame().Empty() || gene.front()->ReadingFrame().Empty()) { // one noncoding
744  if(algn.m_splice_weight > altfrac/100*gene.front()->m_splice_weight) // long enough
745  return eAlternative;
746  else
747  return eNotCompatible;
748  } else if(algn.RealCdsLen() > altfrac/100*gene.front()->RealCdsLen() || algn.Score() > altfrac/100*gene.front()->Score()) { // good score or long enough cds
749  return eAlternative;
750  }
751  }
752 
753  return eNotCompatible;
754  }
755 
756  // don't include overlapping gapfil 'introns' in different genes
757  set<TSignedSeqRange> gene_gapfill_introns;
758  set<TSignedSeqRange> align_gapfill_introns;
759  ITERATE(CGene, it, gene) {
760  const CGeneModel& b = **it;
761  for(int i = 1; i < (int)b.Exons().size(); ++i) {
762  if(b.Exons()[i-1].m_ssplice_sig == "XX" || b.Exons()[i].m_fsplice_sig == "XX") {
763  TSignedSeqRange intron(b.Exons()[i-1].GetTo(),b.Exons()[i].GetFrom());
764  gene_gapfill_introns.insert(intron);
765  }
766  }
767  }
768  for(int i = 1; i < (int)algn.Exons().size(); ++i) {
769  if(algn.Exons()[i-1].m_ssplice_sig == "XX" || algn.Exons()[i].m_fsplice_sig == "XX") {
770  TSignedSeqRange intron(algn.Exons()[i-1].GetTo(),algn.Exons()[i].GetFrom());
771  align_gapfill_introns.insert(intron);
772  }
773  }
774  ITERATE(set<TSignedSeqRange>, ig, gene_gapfill_introns) {
775  ITERATE(set<TSignedSeqRange>, ia, align_gapfill_introns) {
776  if(ig->IntersectingWith(*ia))
777  return eNotCompatible;
778  }
779  }
780 
781  if(algn.HarborsNested(gene, gene_good_enough_to_be_annotation)) // gene is nested in align's intron (could be partial)
782  return eExternal;
783 
784  if(gene.HarborsNested(algn, algn_good_enough_to_be_annotation)) // algn is nested in gene (could be partial)
785  return eNested;
786 
787  if(!algn_cds.Empty() && !gene_cds.Empty()) { // both coding
788  if (!gene_cds.IntersectingWith(algn_cds)) { // don't overlap
789 #ifdef _DEBUG
790  if((gene_cds+algn_cds).GetLength() < gene_cds.GetLength()+algn_cds.GetLength()+20)
791  const_cast<CChain&>(algn).AddComment("Close proximity");
792 #endif
793  return eOtherGene;
794  } else if(gene.LargeCdsOverlap(algn)) {
795  return eNotCompatible;
796  }
797  }
798 
799  if(gene_good_enough_to_be_annotation && algn_good_enough_to_be_annotation) {
800  if(gene.front()->Strand() != algn.Strand() && allow_opposite_strand &&
801  ((algn.Status()&CGeneModel::eBestPlacement) || (algn.Exons().size() > 1 && gene.front()->Exons().size() > 1)))
802  return eOtherGene;
803  else if(algn.Status() & CGeneModel::eBestPlacement && (algn.Exons().size() == 1 || (algn.Status()&CGeneModel::ecDNAIntrons))) {
804 #ifdef _DEBUG
805  const_cast<CChain&>(algn).AddComment("Best placement overlap");
806 #endif
807  return eOtherGene;
808  }
809  }
810 
811  return eNotCompatible;
812 }
813 
814 void CChainer::CChainerImpl::FindGeneSeeds(list<CGene>& alts, TChainPointerList& not_placed_yet) {
815 
816  not_placed_yet.sort(DescendingModelOrderP);
817 
818  for(TChainPointerList::iterator itloop = not_placed_yet.begin(); itloop != not_placed_yet.end(); ) {
819  TChainPointerList::iterator it = itloop++;
820  CChain& algn(**it);
821 
822  if(algn.Score() == BadScore()) // postpone noncoding models
823  continue;
824  else if(!(algn.Score() >= 2*minscor.m_min || algn.GetCdsInfo().ProtReadingFrame().NotEmpty() || (algn.Score() >= minscor.m_min && (algn.Status()&(CGeneModel::eCap|CGeneModel::ePolyA)))))
825  continue;
826 
827  list<CGene*> possibly_nested;
828 
829  bool good_model = true;
830  for(list<CGene>::iterator itl = alts.begin(); good_model && itl != alts.end(); ++itl) {
831  ECompat cmp = CheckCompatibility(*itl, algn);
832 
833  switch(cmp) {
834  case eExternal:
835  possibly_nested.push_back(&(*itl)); // already created gene is nested in this model
836  case eOtherGene:
837  break;
838  default:
839  good_model = false;
840  break;
841  }
842  }
843 
844  if(good_model) {
845  alts.push_back(CGene());
846 #ifdef _DEBUG
847  algn.AddComment("Pass1");
848 #endif
849  alts.back().Insert(algn);
850  not_placed_yet.erase(it);
851  }
852 
853  ITERATE(list<CGene*>, itl, possibly_nested) {
854  (*itl)->AddToNestedIn(&alts.back());
855  alts.back().AddToHarbored(*itl);
856  }
857  }
858 }
859 
860 void CChainer::CChainerImpl::ReplacePseudoGeneSeeds(list<CGene>& alts, TChainPointerList& not_placed_yet) {
861 
862  not_placed_yet.sort(DescendingModelOrderP);
863 
864  for(TChainPointerList::iterator itloop = not_placed_yet.begin(); itloop != not_placed_yet.end(); ) {
865  TChainPointerList::iterator it = itloop++;
866  CChain& algn(**it);
867 
868  list<list<CGene>::iterator> included_in;
869  list<CGene*> possibly_nested; // genes which 'could' become nested
870  list<CGene*> nested_in;
871 
872  bool good_model = true;
873  for(list<CGene>::iterator itl = alts.begin(); good_model && itl != alts.end(); ++itl) {
874  ECompat cmp = CheckCompatibility(*itl, algn);
875 
876  switch(cmp) {
877  case eNested:
878  nested_in.push_back(&(*itl));
879  break;
880  case eExternal:
881  possibly_nested.push_back(&(*itl)); // already created gene is nested in this model
882  break;
883  case eOtherGene:
884  break;
885  case eAlternative:
886  included_in.push_back(itl);
887  break;
888  case eNotCompatible:
889  case eNotCompatibleNested:
890  if(itl->IsAlternative(algn))
891  included_in.push_back(itl);
892  else
893  good_model = false;
894  break;
895  default:
896  good_model = false;
897  break;
898  }
899  }
900 
901  if(!good_model || included_in.size() != 1 || (!(algn.Status()&CGeneModel::ecDNAIntrons) && algn.TrustedmRNA().empty() && algn.TrustedProt().empty()))
902  continue;
903 
904  CGene& gene = *included_in.front();
905  CChain& model = *gene.front();
906  // if((!model.PStop(false) && model.FrameShifts().empty()) || algn.PStop(false) || !algn.FrameShifts().empty())
907  if(!model.PStop(false) || algn.PStop(false) || !algn.FrameShifts().empty()) // use only for pstops
908  continue;
909 
910  int algn_cds_len = algn.FShiftedLen(algn.GetCdsInfo().Cds(),false);
911  int model_cds_len = model.FShiftedLen(model.GetCdsInfo().Cds(),false);
912  if(algn_cds_len < 0.8*model_cds_len)
913  continue;
914 
915 #ifdef _DEBUG
916  algn.AddComment("Replacing pseudo "+NStr::NumericToString(model.ID()));
917 #endif
918  not_placed_yet.push_back(gene.front()); // position doesn't matter - will go to 'bad' models
920  gene = CGene();
921  gene.Insert(algn);
922  ITERATE(list<CGene*>, itl, nested_in) {
923  gene.AddToNestedIn(*itl);
924  (*itl)->AddToHarbored(&gene);
925  }
926  ITERATE(list<CGene*>, itl, possibly_nested) {
927  (*itl)->AddToNestedIn(&gene);
928  gene.AddToHarbored(*itl);
929  }
930 
931  not_placed_yet.erase(it);
932  }
933 }
934 
// Tries to attach the still-unplaced chains to the genes already present in 'alts'.
//  alts           - genes built so far; a gene may receive new variants, and genes
//                   that get connected by a trusted chain are merged and erased.
//  not_placed_yet - chains waiting for placement; placed chains are erased from it,
//                   chains released from merged genes may be added back.
// A chain is used only if every gene it touches is reported by CheckCompatibility()
// as eExternal, eOtherGene or eAlternative, and at least one gene is eAlternative.
935 void CChainer::CChainerImpl::FindAltsForGeneSeeds(list<CGene>& alts, TChainPointerList& not_placed_yet) {
936 
937  not_placed_yet.sort(DescendingModelOrderPConsistentCoverage);
938 
 // itloop is advanced before the body runs, so erasing 'it' below leaves the loop iterator valid
939  for(TChainPointerList::iterator itloop = not_placed_yet.begin(); itloop != not_placed_yet.end(); ) {
940  TChainPointerList::iterator it = itloop++;
941  CChain& algn(**it);
942 
943  list<list<CGene>::iterator> included_in;
944  list<CGene*> possibly_nested; // genes which 'could' become nested
945 
946  bool good_model = true;
947  for(list<CGene>::iterator itl = alts.begin(); good_model && itl != alts.end(); ++itl) {
948  ECompat cmp = CheckCompatibility(*itl, algn);
949 
950  switch(cmp) {
951  case eExternal:
952  possibly_nested.push_back(&(*itl)); // already created gene is nested in this model
 // no break here: control falls through to the eOtherGene 'break'
953  case eOtherGene:
954  break;
955  case eAlternative:
956  included_in.push_back(itl);
957  break;
958  default:
 // any other relation disqualifies the chain for this pass
959  good_model = false;
960  break;
961  }
962  }
963 
 // unless allow_partialalts is set, the first chain of the first matching gene must be good enough to be an annotation
964  if(good_model && !included_in.empty() && (allow_partialalts || included_in.front()->front()->GoodEnoughToBeAnnotation())) {
965  if(included_in.size() == 1) { // alternative to only one seed
966 #ifdef _DEBUG
967  algn.AddComment("Pass2a");
968 #endif
969 
970  CGene& gene = *included_in.front();
971  gene.Insert(algn);
972  not_placed_yet.erase(it);
973 
 // register nesting only for genes that the enlarged gene really harbors
974  ITERATE(list<CGene*>, itl, possibly_nested) {
975  if(gene.HarborsNested(**itl, true)) {
976  (*itl)->AddToNestedIn(&gene);
977  gene.AddToHarbored(*itl);
978  }
979  }
980  } else { // connects seeds
981 
982  bool allow_connection = false;
983 
984  if(!algn.TrustedmRNA().empty() || !algn.TrustedProt().empty() || (algn.Status()&CGeneModel::eConsistentCoverage)) { // connects seeds but trusted
985  bool cds_overlap = true;
986  if(algn.ReadingFrame().Empty()) {
987  cds_overlap = false;
988  } else {
 // test a copy clipped to its real CDS limits against every connected gene
989  CChain a = algn;
990  a.Clip(a.RealCdsLimits(), CAlignModel::eRemoveExons);
991  ITERATE(list<list<CGene>::iterator>, k, included_in) {
992  if(!(*k)->IsAlternative(a)) {
993  cds_overlap = false;
994  break;
995  }
996  }
997  }
998 
999  if(cds_overlap || (algn.Status()&CGeneModel::eConsistentCoverage)) {
1000 #ifdef _DEBUG
1001  algn.AddComment("Gene overlap override");
1002 #endif
1003  allow_connection = true;
1004  }
1005  }
1006 
1007  if(allow_connection) {
 // the first connected gene survives and absorbs the others
1008  CGene& gene = *included_in.front();
1009  gene.Insert(algn);
1010 
1011  ITERATE(list<list<CGene>::iterator>, k, included_in) {
1012  if(k != included_in.begin()) {
 // redistribute every chain of the gene that is about to be erased
1013  ITERATE(CGene, l, **k) {
1014  if(itloop == not_placed_yet.end() || !DescendingModelOrder(**itloop, **l)) { // next is not better
1015  if(CheckCompatibility(*included_in.front(), **l) == eAlternative) { // check that the thresholds are met
1016 #ifdef _DEBUG
1017  (*l)->AddComment("Pass2b");
1018 #endif
1019  included_in.front()->Insert(**l);
1020  } else {
1021  not_placed_yet.push_back(*l); // position doesn't matter - will go to 'bad' models
1022  }
1023  } else {
 // put the chain back after itloop, past every pending chain that DescendingModelOrder ranks ahead of it
1024  TChainPointerList::iterator idest = itloop;
1025  for( ;idest != not_placed_yet.end() && DescendingModelOrder(**idest, **l); ++idest);
1026  not_placed_yet.insert(idest, *l);
1027  }
1028  }
 // genes returned by RemoveGeneFromOtherGenesSets() are re-tested against the merged gene below
1029  set<CGene*> nested_genes = (*k)->RemoveGeneFromOtherGenesSets();
1030  ITERATE(set<CGene*>, i, nested_genes)
1031  possibly_nested.push_back(*i);
1032  alts.erase(*k);
1033  }
1034  }
1035  not_placed_yet.erase(it);
1036 
1037  ITERATE(list<CGene*>, itl, possibly_nested) {
1038  if(gene.HarborsNested(**itl, true)) {
1039  (*itl)->AddToNestedIn(&gene);
1040  gene.AddToHarbored(*itl);
1041  }
1042  }
1043  }
1044  }
1045  }
1046  }
1047 }
1048 
1049 void CChainer::CChainerImpl::PlaceAllYouCan(list<CGene>& alts, TChainPointerList& not_placed_yet, TChainPointerList& rejected) {
1050 
1051  not_placed_yet.sort(DescendingModelOrderP);
1052 
1053  ITERATE(TChainPointerList, it, not_placed_yet) {
1054  CChain& algn(**it);
1055  list<CGene>::iterator included_in(alts.end());
1056  list<CGene*> possibly_nested;
1057  list<CGene*> nested_in;
1058 
1059  bool good_model = true;
1060  for(list<CGene>::iterator itl = alts.begin(); good_model && itl != alts.end(); ++itl) {
1061  ECompat cmp = CheckCompatibility(*itl, algn);
1062  CNcbiOstrstream ost;
1063  switch(cmp) {
1064  case eNotCompatibleNested:
1065  case eNotCompatible:
1066  rejected.push_back(&algn);
1067  rejected.back()->Status() |= CGeneModel::eSkipped;
1068  ost << "Trumped by another model " << itl->front()->ID();
1069  rejected.back()->AddComment(CNcbiOstrstreamToString(ost));
1070  if(cmp == eNotCompatibleNested)
1071  rejected.back()->SetType(rejected.back()->Type()|CGeneModel::eNested);
1072  good_model = false;
1073  break;
1074  case eAlternative:
1075  if(!allow_partialalts && !itl->front()->GoodEnoughToBeAnnotation()) {
1076  rejected.push_back(&algn);
1077  rejected.back()->Status() |= CGeneModel::eSkipped;
1078  ost << "Trumped by another model " << itl->front()->ID();
1079  rejected.back()->AddComment(CNcbiOstrstreamToString(ost));
1080  good_model = false;
1081  } else if(included_in == alts.end()) {
1082  included_in = itl;
1083  } else { // tries to connect two different genes
1084  good_model = false;
1085  rejected.push_back(&algn);
1086  rejected.back()->Status() |= CGeneModel::eSkipped;
1087  ost << "Connects two genes " << itl->front()->ID() << " " << included_in->front()->ID();
1088  rejected.back()->AddComment(CNcbiOstrstreamToString(ost));
1089  }
1090  break;
1091  case eNested:
1092  nested_in.push_back(&(*itl));
1093  break;
1094  case eExternal:
1095  possibly_nested.push_back(&(*itl)); // already created gene is nested in this model
1096  break;
1097  case eOtherGene:
1098  break;
1099  }
1100  }
1101  if(good_model) {
1102  CGene* genep;
1103  if(included_in != alts.end()) {
1104 #ifdef _DEBUG
1105  algn.AddComment("Pass3a");
1106 #endif
1107  included_in->Insert(algn);
1108  genep = &(*included_in);
1109  } else {
1110  alts.push_back(CGene());
1111  genep = &alts.back();
1112 #ifdef _DEBUG
1113  algn.AddComment("Pass3b");
1114 #endif
1115  alts.back().Insert(algn);
1116  }
1117  ITERATE(list<CGene*>, itl, nested_in) {
1118  if((*itl)->HarborsNested(*genep, true)) {
1119  genep->AddToNestedIn(*itl);
1120  (*itl)->AddToHarbored(genep);
1121  }
1122  }
1123  ITERATE(list<CGene*>, itl, possibly_nested) {
1124  if(genep->HarborsNested(**itl, true)) {
1125  (*itl)->AddToNestedIn(genep);
1126  genep->AddToHarbored(*itl);
1127  }
1128  }
1129  }
1130  }
1131 }
1132 
// NOTE(review): the signature line is missing from this listing; the body uses
// 'not_placed_yet', 'rejected' and 'tolerance', presumably its parameters.
// After sorting with DescendingModelOrderP, every chain is compared with the chains
// that follow it; a later chain that CModelCompare::AreSimilar() reports as similar
// (within 'tolerance') is moved from not_placed_yet to rejected.
1134 {
1135  not_placed_yet.sort(DescendingModelOrderP);
1136 
1137  NON_CONST_ITERATE(TChainPointerList, it, not_placed_yet) {
1138  CChain& ai(**it);
1139  TChainPointerList::iterator jt_loop = it;
 // jt_loop is advanced before jt may be erased, so the inner iteration stays valid
1140  for(++jt_loop; jt_loop != not_placed_yet.end();) {
1141  TChainPointerList::iterator jt = jt_loop++;
1142  CChain& aj(**jt);
1143  if (CModelCompare::AreSimilar(ai,aj,tolerance)) {
1144  CNcbiOstrstream ost;
1145  ost << "Trumped by similar chain " << ai.ID();
 // NOTE(review): one listing line is missing here; presumably it attaches the text in 'ost' to aj - confirm against the real file
1147  rejected.push_back(&aj);
1148  not_placed_yet.erase(jt);
1149  }
1150  }
1151  }
1152 }
1153 
// NOTE(review): the signature line is missing from this listing; the body uses
// 'not_placed_yet', 'rejected' and 'fraction' (used as a percentage: fraction/100).
// A chain with no trusted mRNA/protein and a non-empty reading frame is moved to
// 'rejected' when two chains from the list that each
//   - have both start and stop,
//   - reach fraction% of its score and of its real CDS length, and
//   - share an exon or intron with it
// have limits that do not intersect each other.
1155 {
 // it_loop is advanced first so that erasing 'it' below is safe
1156  for(TChainPointerList::iterator it_loop = not_placed_yet.begin(); it_loop != not_placed_yet.end();) {
1157  TChainPointerList::iterator it = it_loop++;
1158  CChain& ai(**it);
1159 
1160  if(!ai.TrustedmRNA().empty() || !ai.TrustedProt().empty() || ai.ReadingFrame().Empty())
1161  continue;
1162  int cds_len = ai.RealCdsLen();
1163 
 // the scan covers the whole list, so ai itself can qualify as a candidate
1164  vector<const CChain*> candidates;
1165  ITERATE(TChainPointerList, jt, not_placed_yet) {
1166  const CChain& aj(**jt);
1167  if(!aj.HasStart() || !aj.HasStop() || aj.Score() < fraction/100*ai.Score() || aj.RealCdsLen() < fraction/100*cds_len || !CModelCompare::HaveCommonExonOrIntron(ai,aj))
1168  continue;
1169  candidates.push_back(&aj);
1170  }
1171 
 // 'alive' stops both loops right after the chain has been rejected
1172  bool alive = true;
1173  for (size_t i = 0; alive && i < candidates.size(); ++i) {
1174  for (size_t j = i+1; alive && j < candidates.size(); ++j) {
1175  if(!candidates[i]->Limits().IntersectingWith(candidates[j]->Limits())) {
1176  CNcbiOstrstream ost;
 // the IDs are written relative to ai.ID()
1177  ost << "Overlapping tandem " << candidates[i]->ID() - ai.ID() << " " << candidates[j]->ID() - ai.ID();
 // NOTE(review): one listing line is missing here; presumably it attaches the text in 'ost' to ai - confirm against the real file
1179  rejected.push_back(*it);
1180  not_placed_yet.erase(it);
1181  alive = false;
1182  }
1183  }
1184  }
1185  }
1186 }
1187 
// NOTE(review): the signature line is missing from this listing; the body reads the
// chain list 'cls' and returns the list of genes 'alts'.
// Groups the non-skipped chains of 'cls' into genes:
//   1. reset per-chain gene bookkeeping and collect the chains,
//   2. filter out similar chains and tandem overlaps,
//   3. run the placement passes (seeds, pseudo-seed replacement, alternatives, the rest),
//   4. stamp gene ID / rank / nested flag on the placed chains,
//   5. flag everything collected in bad_aligns as skipped.
1189 {
1190  TChainPointerList not_placed_yet;
1191  NON_CONST_ITERATE(TChainList, it, cls) {
1192  if((it->Status()&CGeneModel::eSkipped) == 0) {
 // the XOR clears eNested because the bit is known to be set at this point
1193  if(it->Type()&CGeneModel::eNested)
1194  it->SetType(it->Type()^CGeneModel::eNested);
1195  it->SetGeneID(it->ID());
1196  it->SetRankInGene(0);
1197  not_placed_yet.push_back(&(*it));
1198  }
1199  }
1200 
1201  list<CGene> alts;
1202  TChainPointerList bad_aligns;
1203 
1204  FilterOutSimilarsWithLowerScore(not_placed_yet, bad_aligns);
1205  FilterOutTandemOverlap(not_placed_yet, bad_aligns, 80);
1206 
1207  FindGeneSeeds(alts, not_placed_yet);
1208  ReplacePseudoGeneSeeds(alts, not_placed_yet);
1209  FindAltsForGeneSeeds(alts, not_placed_yet);
1210  PlaceAllYouCan(alts, not_placed_yet, bad_aligns);
1211 
 // the gene ID is the ID of the gene's first chain; ranks start at 1 in gene order
1212  NON_CONST_ITERATE(list<CGene>, k, alts) {
1213  int rank = 0;
1214  NON_CONST_ITERATE(CGene, l, *k) {
1215  (*l)->SetGeneID(k->front()->ID());
1216  (*l)->SetRankInGene(++rank);
1217  if(k->Nested())
1218  (*l)->SetType((*l)->Type()|CGeneModel::eNested);
1219  }
1220  }
1221 
1222  NON_CONST_ITERATE(TChainPointerList, l, bad_aligns)
1223  (*l)->Status() |= CGeneModel::eSkipped;
1224 
1225  return alts;
1226 }
1227 
1228 
// Ordering functor for chain members by the limits of their alignments.
// NOTE(review): the struct header line and the conditions guarding the four
// SetFrom/SetTo adjustments are missing from this listing - restore them from the
// real file before relying on this text.
1230 {
1231  bool operator()(const SChainMember* ap, const SChainMember* bp) // left end increasing, long first if left end equal
1232  {
1233  TSignedSeqRange alimits = ap->m_align->Limits();
1234  //ignore flexible ends for sorting
1236  alimits.SetFrom(alimits.GetTo());
1238  alimits.SetTo(alimits.GetFrom());
1239  TSignedSeqRange blimits = bp->m_align->Limits();
1240  //ignore flexible ends for sorting
1242  blimits.SetFrom(blimits.GetTo());
1244  blimits.SetTo(blimits.GetFrom());
 // equal ranges fall back to m_mem_id so the order does not depend on the sort implementation
1245  if(alimits == blimits)
1246  return ap->m_mem_id < bp->m_mem_id; // to make sort deterministic
1247  else if(alimits.GetFrom() == blimits.GetFrom())
1248  return (alimits.GetTo() > blimits.GetTo());
1249  else
1250  return (alimits.GetFrom() < blimits.GetFrom());
1251  }
1252 };
1253 
1254 
// Flat list of (chain member, gene) pairs
1255 typedef vector< pair<SChainMember*,CGene*> > TMemeberGeneVec;
1256 
1257 typedef tuple<Int8, TSignedSeqRange> TIdLim;
1259  return make_tuple(mp->m_align->ID(), mp->m_align->Limits());
1260 }
// Orders (member, gene) pairs by the AlignIdLimits() value of the member.
// NOTE(review): the struct header and the operator() declaration line are missing from this listing.
1262 {
1264  {
1265  return AlignIdLimits(a.first) < AlignIdLimits(b.first);
1266  }
1267 };
1268 
1269 
1271  TMemeberGeneVec members_genes;
1272  NON_CONST_ITERATE(list<CGene>, ig, genes) {
1273  CGene& gene = *ig;
1274  TMemberPtrSet gmembers;
1275  ITERATE(CGene, ic, gene) {
1276  CChain& chain = **ic;
1277  ITERATE(TContained, im, chain.m_members) {
1278  SChainMember& m = **im;
1279  _ASSERT(m.m_orig_align);
1281  continue;
1282  if(m.m_orig_align->Continuous())
1283  gmembers.insert(&m);
1284  }
1285  }
1286  ITERATE(TMemberPtrSet, im, gmembers) {
1287  SChainMember& m = **im;
1288  members_genes.push_back(TMemeberGeneVec::value_type(&m,&gene));
1289  }
1290  }
1291 
1292  if(members_genes.empty())
1293  return;
1294 
1295  sort(members_genes.begin(),members_genes.end(),AlignIdOrder());
1296 
1297  typedef map<CGene*,list<SChainMember*> > TGeneToMembers;
1298  typedef map<TIdLim, TGeneToMembers> TMembersInDiffGenes;
1299  TMembersInDiffGenes members_in_different_genes;
1300  {
1301  SChainMember* mp = members_genes.front().first;
1302  TIdLim idlim = AlignIdLimits(mp);
1303  CGene* genep = members_genes.front().second;
1304  members_in_different_genes[idlim][genep].push_back(mp);
1305  }
1306  for(int i = 1; i < (int)members_genes.size(); ++i) {
1307  TIdLim idlim_prev = AlignIdLimits(members_genes[i-1].first);
1308  SChainMember* mp = members_genes[i].first;
1309  TIdLim idlim = AlignIdLimits(mp);
1310  CGene* genep = members_genes[i].second;
1311  if(idlim_prev != idlim) {
1312  TMembersInDiffGenes::iterator it = members_in_different_genes.find(idlim_prev);
1313  if(it->second.size() < 2) // alignment in only one gene
1314  members_in_different_genes.erase(it);
1315  }
1316  members_in_different_genes[idlim][genep].push_back(mp);
1317  }
1318  {
1319  SChainMember* mp = members_genes.back().first;
1320  TIdLim idlim = AlignIdLimits(mp);
1321  TMembersInDiffGenes::iterator it = members_in_different_genes.find(idlim);
1322  if(it->second.size() < 2) // alignment in only one gene
1323  members_in_different_genes.erase(it);
1324  }
1325 
1326  ITERATE(TMembersInDiffGenes, imdg, members_in_different_genes) {
1327  ITERATE(TGeneToMembers, ig1, imdg->second) {
1328  CGene& gene1 = *ig1->first;
1329  ITERATE(CGene, ic1, gene1) {
1330  CChain& chain1 = **ic1;
1331  sort(chain1.m_members.begin(), chain1.m_members.end(), std::less<SChainMember*>());
1332  }
1333  }
1334  }
1335 
1336  typedef map<CChain*,TMemberPtrSet> TConflictMemebersInChains;
1337  TConflictMemebersInChains conflict_members_in_chains;
1338 
1339  ITERATE(TMembersInDiffGenes, imdg, members_in_different_genes) {
1340  ITERATE(TGeneToMembers, ig1, imdg->second) {
1341  CGene& gene1 = *ig1->first;
1342  ITERATE(CGene, ic1, gene1) {
1343  CChain* chain1p_orig = *ic1;
1344  SChainMember* mbr1p_orig = 0;
1345  for(list<SChainMember*>::const_iterator im = ig1->second.begin(); im != ig1->second.end() && mbr1p_orig == 0; ++im) {
1346  if(binary_search(chain1p_orig->m_members.begin(),chain1p_orig->m_members.end(),*im, std::less<SChainMember*>()))
1347  mbr1p_orig = *im;
1348  }
1349  for(TGeneToMembers::const_iterator ig2 = imdg->second.begin(); mbr1p_orig != 0 && ig2 != ig1; ++ig2) {
1350  CGene& gene2 = *ig2->first;
1351  ITERATE(CGene, ic2, gene2) {
1352  CChain* chain1p = chain1p_orig;
1353  SChainMember* mbr1p = mbr1p_orig;
1354  CChain* chain2p = *ic2;
1355  SChainMember* mbr2p = 0;
1356  for(list<SChainMember*>::const_iterator im = ig2->second.begin(); im != ig2->second.end() && mbr2p == 0; ++im) {
1357  if(binary_search(chain2p->m_members.begin(),chain2p->m_members.end(),*im, std::less<SChainMember*>()))
1358  mbr2p = *im;
1359  }
1360 
1361  if(mbr2p != 0) { // both chains have alignment
1362 
1363  TSignedSeqRange core1 = chain1p->RealCdsLimits();
1364  if(chain1p->Exons().size() > 1)
1365  core1 += TSignedSeqRange(chain1p->Exons().front().Limits().GetTo(),chain1p->Exons().back().Limits().GetFrom());
1366  TSignedSeqRange core2 = chain2p->RealCdsLimits();
1367  if(chain2p->Exons().size() > 1)
1368  core2 += TSignedSeqRange(chain2p->Exons().front().Limits().GetTo(),chain2p->Exons().back().Limits().GetFrom());
1369  _ASSERT(core1.NotEmpty() && core2.NotEmpty());
1370 
1371  if(Precede(core2,core1)) { // chain2 is on the left change them over to simplify coding below
1372  swap(chain1p,chain2p);
1373  swap(mbr1p,mbr2p);
1374  swap(core1,core2);
1375  }
1376 
1377  CChain& chain1 = *chain1p;
1378  CChain& chain2 = *chain2p;
1379  TSignedSeqRange align_lim = mbr1p->m_align->Limits();
1380 
1381  if(CModelCompare::RangeNestedInIntron(core2, chain1)) { // chain2 is nested
1382  conflict_members_in_chains[&chain2].insert(mbr2p);
1383  } else if(CModelCompare::RangeNestedInIntron(core1, chain2)) { // chain1 is nested
1384  conflict_members_in_chains[&chain1].insert(mbr1p);
1385  }else if(Precede(core1,core2)) { // chain1 on the left
1386  if(Precede(align_lim,core1)) // alignment on the left of chain1
1387  conflict_members_in_chains[&chain2].insert(mbr2p);
1388  else if(Precede(core2,align_lim)) // alignment on the right of chain2
1389  conflict_members_in_chains[&chain1].insert(mbr1p);
1390  else { // alignmnet in between
1391  if(chain1.m_coverage_drop_right > 0 && chain2.m_coverage_drop_left > chain1.m_coverage_drop_right) { // non overlapping drop limits
1392  if(align_lim.GetTo() > chain1.m_coverage_drop_right)
1393  conflict_members_in_chains[&chain1].insert(mbr1p);
1394  if(align_lim.GetFrom() < chain2.m_coverage_drop_left)
1395  conflict_members_in_chains[&chain2].insert(mbr2p);
1396  } else if(chain1.m_coverage_drop_right > 0 && chain2.m_coverage_drop_left < 0 && chain1.m_core_coverage > 2*chain2.m_core_coverage) { // only chain1 has drop limit and is more expressed
1397  if(align_lim.GetTo() > chain1.m_coverage_drop_right)
1398  conflict_members_in_chains[&chain1].insert(mbr1p);
1399  if(align_lim.GetFrom() < max(chain2.m_coverage_bump_left,chain1.m_coverage_drop_right+50))
1400  conflict_members_in_chains[&chain2].insert(mbr2p);
1401  } else if(chain1.m_coverage_drop_right < 0 && chain2.m_coverage_drop_left > 0 && chain2.m_core_coverage > 2*chain1.m_core_coverage) { // only chain2 has drop limit and is more expressed
1402  if(align_lim.GetFrom() < chain2.m_coverage_drop_left)
1403  conflict_members_in_chains[&chain2].insert(mbr2p);
1404  if(align_lim.GetTo() > chain2.m_coverage_drop_left-50 || (chain1.m_coverage_bump_right > 0 && align_lim.GetTo() > chain1.m_coverage_bump_right))
1405  conflict_members_in_chains[&chain1].insert(mbr1p);
1406  } else {
1407  conflict_members_in_chains[&chain1].insert(mbr1p);
1408  conflict_members_in_chains[&chain2].insert(mbr2p);
1409  }
1410  }
1411  } else {
1412  conflict_members_in_chains[&chain1].insert(mbr1p);
1413  conflict_members_in_chains[&chain2].insert(mbr2p);
1414  }
1415  }
1416  }
1417  }
1418  }
1419  }
1420  }
1421 
1422  for(CGene& gene : genes) {
1423  for(CChain* chainp : gene)
1424  sort(chainp->m_members.begin(),chainp->m_members.end(),GenomeOrderD());
1425  }
1426 
1427  /*
1428  ITERATE(TMembersInDiffGenes, imdg, members_in_different_genes) {
1429  ITERATE(TGeneToMembers, ig1, imdg->second) {
1430  CGene& gene1 = *ig1->first;
1431  ITERATE(CGene, ic1, gene1) {
1432  CChain& chain1 = **ic1;
1433  sort(chain1.m_members.begin(),chain1.m_members.end(),GenomeOrderD());
1434  }
1435  }
1436  }
1437  */
1438 
1439  ITERATE(TConflictMemebersInChains, it, conflict_members_in_chains) {
1440  CChain& chain = *it->first;
1441  const TMemberPtrSet& conflict_members = it->second;
1442 
1443  CAlignMap amap = chain.GetAlignMap();
1444 
1445  TSignedSeqRange hard_limits(chain.Exons().front().Limits().GetTo()-15,chain.Exons().back().Limits().GetFrom()+15);
1446  hard_limits = (hard_limits & chain.Limits());
1447  if(chain.ReadingFrame().NotEmpty())
1448  hard_limits = (chain.OpenCds() ? chain.MaxCdsLimits() : chain.RealCdsLimits());
1449 
1450  TSignedSeqRange noclip_limits = hard_limits;
1451 
1452  /*
1453  int hard_limits_len = amap.FShiftedLen(hard_limits);
1454  ITERATE(TContained, i, chain.m_members) {
1455  const CGeneModel& a = *(*i)->m_align;
1456  if(a.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
1457  continue;
1458  TSignedSeqRange alim(amap.ShrinkToRealPoints(a.Limits()&chain.Limits(),false));
1459  if(Include(alim,hard_limits.GetFrom()) ) {
1460  TSignedSeqRange l(hard_limits.GetFrom(),alim.GetTo());
1461  l = amap.ShrinkToRealPoints(l,false);
1462  int len = 0;
1463  if(l.NotEmpty())
1464  len = amap.FShiftedLen(l);
1465  if(len > 0.75*a.AlignLen() || len > 0.75*hard_limits_len)
1466  noclip_limits.SetFrom(min(noclip_limits.GetFrom(),alim.GetFrom()));
1467  }
1468  if(Include(alim,hard_limits.GetTo())) {
1469  TSignedSeqRange l(alim.GetFrom(),hard_limits.GetTo());
1470  l = amap.ShrinkToRealPoints(l,false);
1471  int len = 0;
1472  if(l.NotEmpty())
1473  len = amap.FShiftedLen(l);
1474  if(len > 0.75*a.AlignLen() || len > 0.75*hard_limits_len)
1475  noclip_limits.SetTo(max(noclip_limits.GetTo(),alim.GetTo()));
1476  }
1477  }
1478 
1479  noclip_limits = (noclip_limits & chain.Limits());
1480  */
1481 
1482  if(chain.Status()&CGeneModel::ePolyA) {
1483  if(chain.Strand() == ePlus) {
1484  if(chain.m_coverage_drop_right < 0)
1485  noclip_limits.SetTo(max(noclip_limits.GetTo(),chain.m_polya_cap_right_soft_limit));
1486  else
1487  noclip_limits.SetTo(max(noclip_limits.GetTo(),chain.m_coverage_drop_right));
1488  } else {
1489  if(chain.m_coverage_drop_left < 0)
1490  noclip_limits.SetFrom(min(noclip_limits.GetFrom(),chain.m_polya_cap_left_soft_limit));
1491  else
1492  noclip_limits.SetFrom(min(noclip_limits.GetFrom(),chain.m_coverage_drop_left));
1493  }
1494  }
1495  if(chain.Status()&CGeneModel::eCap) {
1496  if(chain.Strand() == ePlus) {
1497  if(chain.m_coverage_drop_left < 0)
1498  noclip_limits.SetFrom(min(noclip_limits.GetFrom(),chain.m_polya_cap_left_soft_limit));
1499  else
1500  noclip_limits.SetFrom(min(noclip_limits.GetFrom(),chain.m_coverage_drop_left));
1501  } else {
1502  if(chain.m_coverage_drop_right < 0)
1503  noclip_limits.SetTo(max(noclip_limits.GetTo(),chain.m_polya_cap_right_soft_limit));
1504  else
1505  noclip_limits.SetTo(max(noclip_limits.GetTo(),chain.m_coverage_drop_right));
1506  }
1507  }
1508 
1509  TSignedSeqRange new_limits = chain.Limits();
1510  ITERATE(TMemberPtrSet, im, conflict_members) {
1511  TSignedSeqRange alim = (*im)->m_align->Limits()&chain.Limits();
1512  if(alim.Empty())
1513  continue;
1514  alim = amap.ShrinkToRealPoints(alim);
1515  if(alim.Empty())
1516  continue;
1517  if(alim.GetFrom() < noclip_limits.GetFrom()) {
1518  int to = min(noclip_limits.GetFrom(),alim.GetTo());
1519  if(chain.m_coverage_drop_left > 0 && Include(alim,chain.m_coverage_drop_left)) {
1520  to = min(noclip_limits.GetFrom(),chain.m_coverage_drop_left);
1521  }
1522  new_limits.SetFrom(max(new_limits.GetFrom(),to));
1523  } else if(alim.GetTo() > noclip_limits.GetTo()) {
1524  int from = max(noclip_limits.GetTo(),alim.GetFrom());
1525  if(chain.m_coverage_drop_right > 0 && Include(alim,chain.m_coverage_drop_right)) {
1526  from = max(noclip_limits.GetTo(),chain.m_coverage_drop_right);
1527  }
1528  new_limits.SetTo(min(new_limits.GetTo(),from));
1529  }
1530  }
1531 
1532  int left_splice = -1;
1533  int right_splice = -1;
1534  for(int e = 1; e < (int)chain.Exons().size(); ++e) {
1535  if(left_splice < 0 && chain.Exons()[e-1].m_ssplice && Include(new_limits,chain.Exons()[e-1].GetTo()))
1536  left_splice = chain.Exons()[e-1].GetTo();
1537  if(chain.Exons()[e].m_fsplice && Include(new_limits,chain.Exons()[e].GetFrom()))
1538  right_splice = chain.Exons()[e].GetFrom();
1539  }
1540  map<int,double> left_weights;
1541  double left_weights_total = 0.;
1542  map<int,double> right_weights;
1543  double right_weights_total = 0.;
1544  ITERATE(TContained, i, chain.m_members) {
1545  const CGeneModel& a = *(*i)->m_align;
1547  continue;
1548  TSignedSeqRange alim(amap.ShrinkToRealPoints(a.Limits()&chain.Limits(),false));
1549  for(int e = 1; e < (int)a.Exons().size(); ++e) {
1550  if(a.Exons()[e-1].m_ssplice && a.Exons()[e-1].GetTo() == left_splice) {
1551  left_weights[alim.GetFrom()] += a.Weight();
1552  left_weights_total += a.Weight();
1553  }
1554  if(a.Exons()[e].m_fsplice && a.Exons()[e].GetFrom() == right_splice) {
1555  right_weights[alim.GetTo()] += a.Weight();
1556  right_weights_total += a.Weight();
1557  }
1558  }
1559  }
1560  if(left_weights_total > 0.) {
1561  int left = numeric_limits<int>::max();
1562  double t = 0;
1563  for(map<int,double>::reverse_iterator it = left_weights.rbegin(); it != left_weights.rend(); ++it) {
1564  if(t < 0.9*left_weights_total)
1565  left = it->first;
1566  t += it->second;
1567  }
1568  if(left < new_limits.GetFrom())
1569  new_limits.SetFrom(left);
1570  }
1571  if(right_weights_total > 0.) {
1572  int right = 0;
1573  double t = 0;
1574  for(map<int,double>::iterator it = right_weights.begin(); it != right_weights.end(); ++it) {
1575  if(t < 0.9*right_weights_total)
1576  right = it->first;
1577  t += it->second;
1578  }
1579  if(right > new_limits.GetTo())
1580  new_limits.SetTo(right);
1581  }
1582 
1583  //if has to clip, clip to next cap/polya
1584  if(new_limits.GetFrom() != chain.Limits().GetFrom() && chain.m_polya_cap_left_soft_limit < chain.Limits().GetTo())
1585  new_limits.SetFrom(chain.m_polya_cap_left_soft_limit);
1586  if(new_limits.GetTo() != chain.Limits().GetTo() && chain.m_polya_cap_right_soft_limit > chain.Limits().GetFrom())
1587  new_limits.SetTo(chain.m_polya_cap_right_soft_limit);
1588 
1589  //don't clip confirmed ends
1591  new_limits.SetFrom(chain.Limits().GetFrom());
1593  new_limits.SetTo(chain.Limits().GetTo());
1594 
1595  if(new_limits != chain.Limits()) {
1596  string note;
1597  if(new_limits.GetFrom() != chain.Limits().GetFrom())
1598  note += "Left";
1599  if(new_limits.GetTo() != chain.Limits().GetTo())
1600  note += "Right";
1601  note += " overlap UTR clip";
1602  chain.AddComment(note);
1603  _ASSERT(new_limits.NotEmpty());
1604 
1605  bool wasopen = chain.OpenCds();
1606  chain.ClipChain(new_limits);
1607  if(chain.Type()&CGeneModel::eNested)
1608  chain.ClipLowCoverageUTR(0.1);
1609  _ASSERT(chain.Limits().NotEmpty());
1610  if(chain.ReadingFrame().NotEmpty()) {
1611  m_gnomon->GetScore(chain, !no5pextension);
1612  CCDSInfo cds = chain.GetCdsInfo();
1613  if(wasopen != chain.OpenCds() && (wasopen == false || cds.HasStart())) {
1614  cds.SetScore(cds.Score(),wasopen);
1615  chain.SetCdsInfo(cds);
1616  }
1617  }
1618  chain.CalculateDropLimits();
1619  }
1620  }
1621 }
1622 
1623 void SChainMember::AddCodingToContained(TContained& contained, TMemberPtrSet& included_in_list) {
1624  if(m_type != eCDS)
1625  return;
1626 
1627  // list<const SChainMember*> not_visited(1,this);
1628  deque<const SChainMember*> not_visited(1,this);
1629  while(!not_visited.empty()) {
1630  const SChainMember* mbr = not_visited.front();
1631  for(int c = 0; c < (int)mbr->m_contained->size(); ++c) {
1632  SChainMember* mi = (*mbr->m_contained)[c];
1633  if(mi->m_type != eCDS)
1634  continue;
1635  if(c < mbr->m_identical_count) {
1636  if(included_in_list.insert(mi).second) {
1637  contained.push_back(mi); //action
1638  if(mi->m_copy != 0)
1639  included_in_list.insert(mi->m_copy->begin(),mi->m_copy->end());
1640  }
1641  } else if(included_in_list.find(mi) == included_in_list.end()) {
1642  not_visited.push_back(mi); //store for future
1643  }
1644  }
1645  not_visited.pop_front();
1646  }
1647 }
1648 
1650 
1651  TContained contained;
1652  TMemberPtrSet included_in_list;
1653  AddCodingToContained(contained, included_in_list);
1654 
1655  return contained;
1656 }
1657 
// NOTE(review): the signature line is missing from this listing.
// Collects the coding members for this member and for every member linked to it
// through m_left_member and m_right_member; one shared set keeps the result unique.
1659 {
1660  TContained contained;
1661  TMemberPtrSet included_in_list;
1662 
1663  AddCodingToContained(contained, included_in_list);
1664 
 // follow the left links until they run out
1665  for (SChainMember* left = m_left_member; left != 0; left = left->m_left_member) {
1666  left->AddCodingToContained(contained, included_in_list);
1667  }
1668 
 // then the right links
1669  for (SChainMember* right = m_right_member; right != 0; right = right->m_right_member) {
1670  right->AddCodingToContained(contained, included_in_list);
1671  }
1672 
1673  return contained;
1674 }
1675 //visits all levels of nested and adds uniquely to contained
1676 void SChainMember::AddToContained(TContained& contained, TMemberPtrSet& included_in_list) {
1677 
1678  // list<const SChainMember*> not_visited(1,this);
1679  deque<const SChainMember*> not_visited(1,this);
1680  while(!not_visited.empty()) {
1681  const SChainMember* mbr = not_visited.front();
1682  for(int c = 0; c < (int)mbr->m_contained->size(); ++c) {
1683  SChainMember* mi = (*mbr->m_contained)[c];
1684  if(c < mbr->m_identical_count) {
1685  if(included_in_list.insert(mi).second) {
1686  contained.push_back(mi); //action
1687  if(mi->m_copy != 0)
1688  included_in_list.insert(mi->m_copy->begin(),mi->m_copy->end());
1689  }
1690  } else if(included_in_list.find(mi) == included_in_list.end()) {
1691  not_visited.push_back(mi); //store for future
1692  }
1693  }
1694  not_visited.pop_front();
1695  }
1696 }
1697 
1699 
1700  TContained contained;
1701  TMemberPtrSet included_in_list;
1702  AddToContained(contained, included_in_list);
1703 
1704  return contained;
1705 }
1706 
// NOTE(review): the signature line is missing from this listing.
// Collects the contained members for this member and for every member linked to it
// through m_left_member and m_right_member; one shared set keeps the result unique.
1708 {
1709  TContained contained;
1710  TMemberPtrSet included_in_list;
1711 
1712  AddToContained(contained, included_in_list);
1713 
 // follow the left links until they run out
1714  for (SChainMember* left = m_left_member; left != 0; left = left->m_left_member) {
1715  left->AddToContained(contained, included_in_list);
1716  }
1717 
 // then the right links
1718  for (SChainMember* right = m_right_member; right != 0; right = right->m_right_member) {
1719  right->AddToContained(contained, included_in_list);
1720  }
1721 
1722  return contained;
1723 }
1724 
// NOTE(review): the meaning of this value is not stated in this part of the file; it is
// compared (as START_BONUS+25) with SChainMember::m_cds when copies of a member are marked.
1725 #define START_BONUS 600
1726 
1728  m_included = true;
1729  if (m_copy != 0) {
1730  ITERATE(TContained, j, *m_copy) {
1731  SChainMember* mj = *j;
1732  if(mj->m_type != eCDS || mj->m_cds < START_BONUS+25 ||
1733  (m_align->Strand() == mj->m_align->Strand() &&
1734  (m_cds_info->ReadingFrame().GetFrom() == mj->m_cds_info->ReadingFrame().GetFrom() || // same copy or supressed start
1735  m_cds_info->ReadingFrame().GetTo() == mj->m_cds_info->ReadingFrame().GetTo()))) // same copy or supressed start
1736  mj->m_included = true;
1737  }
1738  }
1739 }
1740 
// NOTE(review): the signature line is missing from this listing.
// Calls MarkIncluded() on every member returned by CollectContainedForChain().
1742 {
1743  TContained contained = CollectContainedForChain();
1744  NON_CONST_ITERATE (TContained, i, contained) {
1745  SChainMember* mi = *i;
1746  mi->MarkIncluded();
1747  }
1748 }
1749 
// NOTE(review): the signature line is missing from this listing.
// Sets m_postponed on this member and on those of its copies (m_copy) which are
//   - not eCDS, or
//   - have m_cds below START_BONUS+25, or
//   - lie on the same strand and share the left or the right end of the reading frame.
1751 {
1752  m_postponed = true;
1753  if (m_copy != 0) {
1754  ITERATE(TContained, j, *m_copy) {
1755  SChainMember* mj = *j;
1756  if(mj->m_type != eCDS || mj->m_cds < START_BONUS+25 ||
1757  (m_align->Strand() == mj->m_align->Strand() &&
1758  (m_cds_info->ReadingFrame().GetFrom() == mj->m_cds_info->ReadingFrame().GetFrom() || // same copy or suppressed start
1759  m_cds_info->ReadingFrame().GetTo() == mj->m_cds_info->ReadingFrame().GetTo()))) // same copy or suppressed start
1760  mj->m_postponed = true;
1761  }
1762  }
1763 }
1764 
// NOTE(review): the signature line is missing from this listing.
// Calls MarkPostponed() on every member returned by CollectContainedForChain().
1766 {
1767  TContained contained = CollectContainedForChain();
1768  NON_CONST_ITERATE (TContained, i, contained) {
1769  SChainMember* mi = *i;
1770  mi->MarkPostponed();
1771  }
1772 }
1773 
// NOTE(review): the signature line is missing from this listing; 'cds' is presumably a
// range parameter - confirm against the real file.
// For every member of the chain whose reading frame is included in 'cds':
//   - the member is marked for retention (and its deletion mark is cleared);
//   - each of its copies is marked for deletion, except copies already marked for
//     retention and copies covered by the start-related exception below.
1775 {
1776  TContained contained = CollectContainedForChain();
1777  NON_CONST_ITERATE (TContained, i, contained) {
1778  SChainMember* mi = *i;
1779  CGeneModel& algni = *mi->m_align;
1780  const CCDSInfo& cinfoi = *mi->m_cds_info;
1781  if(Include(cds, cinfoi.ReadingFrame())) {
1782  mi->m_marked_for_retention = true;
1783  mi->m_marked_for_deletion = false;
1784  if (mi->m_copy != 0) {
1785  ITERATE(TContained, j, *mi->m_copy) {
1786  SChainMember* mj = *j;
1787  const CCDSInfo& cinfoj = *mj->m_cds_info;
1788  if(mj->m_marked_for_retention) // already included in cds
1789  continue;
1790  else if(cinfoi.HasStart() || cinfoj.HasStart()) { // don't delete copy which overrides the start or has the start
 // the exception applies only when both frames end at the same point: GetTo() on the plus strand, GetFrom() on the minus strand
1791  if((algni.Strand() == ePlus && cinfoi.ReadingFrame().GetTo() == cinfoj.ReadingFrame().GetTo()) ||
1792  (algni.Strand() == eMinus && cinfoi.ReadingFrame().GetFrom() == cinfoj.ReadingFrame().GetFrom()))
1793  continue;
1794  }
1795  mj->m_marked_for_deletion = true;
1796  }
1797  }
1798  }
1799  }
1800 }
1801 
1802 
// Ordering functor for chain members by the right end of their alignment limits.
// Unlike the variant with the m_mem_id tie-break, members with equal limits compare equal here.
// NOTE(review): the struct header line and the conditions guarding the four
// SetFrom/SetTo adjustments are missing from this listing.
1804 {
1805  bool operator()(const SChainMember* ap, const SChainMember* bp) // right end increasing, short first if right end equal
1806  {
1807  TSignedSeqRange alimits = ap->m_align->Limits();
1808  //ignore flexible ends for sorting
1810  alimits.SetFrom(alimits.GetTo());
1812  alimits.SetTo(alimits.GetFrom());
1813  TSignedSeqRange blimits = bp->m_align->Limits();
1814  //ignore flexible ends for sorting
1816  blimits.SetFrom(blimits.GetTo());
1818  blimits.SetTo(blimits.GetFrom());
1819 
1820  if(alimits.GetTo() == blimits.GetTo())
1821  return (alimits.GetFrom() > blimits.GetFrom());
1822  else
1823  return (alimits.GetTo() < blimits.GetTo());
1824  }
1825 };
1826 
// Ordering functor for chain members by the right end of their alignment limits, with
// m_mem_id as the tie-break for equal limits.
// NOTE(review): the conditions guarding the four SetFrom/SetTo adjustments are missing
// from this listing.
1827 struct LeftOrderD // use for sorting not for finding
1828 {
1829  bool operator()(const SChainMember* ap, const SChainMember* bp) // right end increasing, short first if right end equal
1830  {
1831  TSignedSeqRange alimits = ap->m_align->Limits();
1832  //ignore flexible ends for sorting
1834  alimits.SetFrom(alimits.GetTo());
1836  alimits.SetTo(alimits.GetFrom());
1837  TSignedSeqRange blimits = bp->m_align->Limits();
1838  //ignore flexible ends for sorting
1840  blimits.SetFrom(blimits.GetTo());
1842  blimits.SetTo(blimits.GetFrom());
1843 
 // equal ranges fall back to m_mem_id so the order does not depend on the sort implementation
1844  if(alimits == blimits)
1845  return ap->m_mem_id < bp->m_mem_id; // to make sort deterministic
1846  else if(alimits.GetTo() == blimits.GetTo())
1847  return (alimits.GetFrom() > blimits.GetFrom());
1848  else
1849  return (alimits.GetTo() < blimits.GetTo());
1850  }
1851 };
1852 
1853 
// Comparator over SChainMember pointers: orders by the left end of the alignment
// limits descending; when left ends tie, the member with the smaller right end comes first.
// NOTE(review): the struct header (listing line 1854) is absent from this listing;
// presumably this is the non-deterministic counterpart of RightOrderD - confirm the name.
1855 {
1856  bool operator()(const SChainMember* ap, const SChainMember* bp) // left end decreasing, short first if left end equal
1857  {
1858  TSignedSeqRange alimits = ap->m_align->Limits();
1859  //ignore flexible ends for sorting
// NOTE(review): listing lines 1860 and 1862 are absent; presumably they are the
// flexible-end conditions guarding the next two assignments - confirm against the repository.
1861  alimits.SetFrom(alimits.GetTo());
1863  alimits.SetTo(alimits.GetFrom());
1864  TSignedSeqRange blimits = bp->m_align->Limits();
1865  //ignore flexible ends for sorting
// NOTE(review): listing lines 1866 and 1868 are absent (same guards for bp, presumably).
1867  blimits.SetFrom(blimits.GetTo());
1869  blimits.SetTo(blimits.GetFrom());
1870 
1871  if(alimits.GetFrom() == blimits.GetFrom())
1872  return (alimits.GetTo() < blimits.GetTo());
1873  else
1874  return (alimits.GetFrom() > blimits.GetFrom());
1875  }
1876 };
1877 
// Deterministic variant of the left-end ordering: left end descending, then right end
// ascending; members whose adjusted limits are equal are ordered by ascending m_mem_id.
// NOTE(review): the struct header (listing line 1878) is absent from this listing;
// a functor named RightOrderD is used for sorting elsewhere in the file - confirm the name.
1879 {
1880  bool operator()(const SChainMember* ap, const SChainMember* bp) // left end decreasing, short first if left end equal
1881  {
1882  TSignedSeqRange alimits = ap->m_align->Limits();
1883  //ignore flexible ends for sorting
// NOTE(review): listing lines 1884 and 1886 are absent; presumably they are the
// flexible-end conditions guarding the next two assignments - confirm against the repository.
1885  alimits.SetFrom(alimits.GetTo());
1887  alimits.SetTo(alimits.GetFrom());
1888  TSignedSeqRange blimits = bp->m_align->Limits();
1889  //ignore flexible ends for sorting
// NOTE(review): listing lines 1890 and 1892 are absent (same guards for bp, presumably).
1891  blimits.SetFrom(blimits.GetTo());
1893  blimits.SetTo(blimits.GetFrom());
1894 
1895  if(alimits == blimits)
1896  return ap->m_mem_id < bp->m_mem_id; // to make sort deterministic
1897  else if(alimits.GetFrom() == blimits.GetFrom())
1898  return (alimits.GetTo() < blimits.GetTo());
1899  else
1900  return (alimits.GetFrom() > blimits.GetFrom());
1901  }
1902 };
1903 
1904 
// Comparator ranking chain members from "best" to "worst" using, in order:
//   1. m_cds, larger first - only when the two values differ and the larger one is >= 300;
//   2. m_splice_num, larger first - when the values differ by more than 0.001;
//   3. m_num, larger first - when the values differ by more than 0.001;
//   4. m_mem_id, smaller first.
// NOTE(review): the struct header (listing line 1905) is absent from this listing.
1906 {
1907  bool operator()(const SChainMember* ap, const SChainMember* bp)
1908  {
1909  if(max(ap->m_cds,bp->m_cds) >= 300 && ap->m_cds != bp->m_cds) // only long cdses count
1910  return (ap->m_cds > bp->m_cds);
1911  else if(fabs(ap->m_splice_num - bp->m_splice_num) > 0.001)
1912  return (ap->m_splice_num > bp->m_splice_num);
1913  else if(fabs(ap->m_num - bp->m_num) > 0.001)
1914  return (ap->m_num > bp->m_num);
1915  else
1916  return ap->m_mem_id < bp->m_mem_id; // to make sort deterministic
1917  }
1918 };
1919 
// Comparator ordering chain members by the score of their CDS info, highest first;
// members with exactly equal scores are ordered by ascending m_mem_id.
// NOTE(review): the struct header (listing line 1920) is absent from this listing.
1921 {
1922  bool operator()(const SChainMember* ap, const SChainMember* bp)
1923  {
1924  if (ap->m_cds_info->Score() == bp->m_cds_info->Score())
1925  return ap->m_mem_id < bp->m_mem_id; // to make sort deterministic
1926  else
1927  return (ap->m_cds_info->Score() > bp->m_cds_info->Score());
1928  }
1929 };
1930 
// Sorts `container` in ascending order (operator<) and removes duplicate elements
// (operator==), leaving each distinct value exactly once.
template <class C>
void uniq(C& container)
{
    std::sort(container.begin(), container.end());
    // After sorting, equal elements are adjacent, so unique() can compact them.
    auto new_end = std::unique(container.begin(), container.end());
    container.erase(new_end, container.end());
}
1937 
1938 class CChainMembers : public vector<SChainMember*> {
1939 public:
1940  CChainMembers() { m_extra_cds.push_back(CCDSInfo()); } // empty cds for utrs; first in the list
1941  CChainMembers(TGeneModelList& clust, TOrigAligns& orig_aligns, TUnmodAligns& unmodified_aligns);
1942  void InsertMember(CGeneModel& algn, SChainMember* copy_ofp = 0);
1943  void InsertMemberCopyWithCds(const CCDSInfo& cds, SChainMember* copy_ofp);
1944  void InsertMemberCopyAndStoreCds(const CCDSInfo& cds, SChainMember* copy_ofp);
1945  void InsertMemberCopyWithoutCds(SChainMember* copy_ofp);
1946  void InsertMember(SChainMember& m, SChainMember* copy_ofp = 0);
1947  void DuplicateUTR(SChainMember* copy_ofp);
1948  void SpliceFromOther(CChainMembers& other);
1949 private:
1950  CChainMembers(const CChainMembers& object) = delete;
1951  CChainMembers& operator=(const CChainMembers& object) = delete;
1952  list<SChainMember> m_members;
1953  list<TContained> m_copylist;
1954  list<CAlignMap> m_align_maps;
1955  list<TContained> m_containedlist;
1956  list<CCDSInfo> m_extra_cds;
1957 };
1958 
// Body of a member function that takes over everything stored in `other`.
// NOTE(review): the signature (listing line 1959) is absent from this listing; the class
// declares SpliceFromOther(CChainMembers& other), which matches this body.
// Each storage list of `other` is moved to the end of the corresponding list of this
// object with list::splice (which relinks nodes rather than copying elements, so existing
// pointers to them stay valid), and then other's member pointers are appended to this vector.
// The pointer vector of `other` itself is not cleared here.
1960  m_members.splice(m_members.end(),other.m_members);
1961  m_copylist.splice(m_copylist.end(),other.m_copylist);
1962  m_align_maps.splice(m_align_maps.end(),other.m_align_maps);
1963  m_containedlist.splice(m_containedlist.end(),other.m_containedlist);
1964  m_extra_cds.splice(m_extra_cds.end(),other.m_extra_cds);
1965  insert(end(),other.begin(),other.end());
1966 }
1967 
1969 
// Inserts a copy of *copy_ofp that is typed eCDS and uses `cds` as its CDS info.
// NOTE(review): the signature (listing line 1968) is absent from this listing; the class
// declares InsertMemberCopyWithCds(const CCDSInfo& cds, SChainMember* copy_ofp).
// Only the address of `cds` is stored, so the referenced object must outlive the member.
1970  SChainMember mbr = *copy_ofp;
1971  mbr.m_cds_info = &cds;
1972  mbr.m_type = eCDS;
1973  InsertMember(mbr, copy_ofp);
1974 }
1975 
1977 
// Same as InsertMemberCopyWithCds, but first stores a copy of `cds` in m_extra_cds so
// that the new member points to a CCDSInfo held by this collection.
// NOTE(review): the signature (listing line 1976) is absent from this listing.
1978  m_extra_cds.push_back(cds);
1979  InsertMemberCopyWithCds(m_extra_cds.back(), copy_ofp);
1980 }
1981 
1983 
// Inserts a copy of *copy_ofp typed eLeftUTR whose CDS info is the empty placeholder
// kept as the first element of m_extra_cds.
// NOTE(review): the signature (listing line 1982) is absent from this listing.
1984  SChainMember mbr = *copy_ofp;
1985  mbr.m_cds_info = &m_extra_cds.front(); // empty cds
1986  mbr.m_type = eLeftUTR;
1987  InsertMember(mbr, copy_ofp);
1988 }
1989 
1990 
// Creates a member for alignment `algn` and inserts it.
// NOTE(review): the signature (listing line 1991) is absent from this listing; the class
// declares InsertMember(CGeneModel& algn, SChainMember* copy_ofp = 0).
// The member uses the alignment's own CDS info and is typed eCDS, or eLeftUTR when the
// alignment score equals BadScore(). When copy_ofp is given, the original/unmodified
// alignment pointers are inherited from it.
1992 {
1993  SChainMember mbr;
1994  mbr.m_align = &algn;
1995  mbr.m_cds_info = &algn.GetCdsInfo();
1996  mbr.m_type = eCDS;
1997  if(algn.Score() == BadScore())
1998  mbr.m_type = eLeftUTR;
1999  if(copy_ofp) {
2000  mbr.m_orig_align = copy_ofp->m_orig_align;
2001  mbr.m_unmd_align = copy_ofp->m_unmd_align;
2002  }
2003  InsertMember(mbr, copy_ofp);
2004 }
2005 
// Stores a copy of member `m` in this collection and registers a pointer to the stored copy.
// NOTE(review): the signature (listing line 2006) is absent from this listing; the class
// declares InsertMember(SChainMember& m, SChainMember* copy_ofp = 0).
// Side effects visible here: m.m_mem_id of the caller's object is overwritten before copying.
2007 {
// ids are 1-based and follow insertion order
2008  m.m_mem_id = (int)size()+1;
2009  m_members.push_back(m);
2010  push_back(&m_members.back());
2011 
// every member gets its own (initially empty) contained list
2012  m_containedlist.push_back(TContained());
2013  m_members.back().m_contained = &m_containedlist.back();
2014 
2015  _ASSERT(copy_ofp == 0 || (m.m_align->Exons()==copy_ofp->m_align->Exons() && m.m_align->FrameShifts()==copy_ofp->m_align->FrameShifts()));
2016 
// a new alignment map is built unless this is a same-strand copy, which shares the map of the source member
2017  if(copy_ofp == 0 || m.m_align->Strand() != copy_ofp->m_align->Strand()) { // first time or reversed copy
2018  m_align_maps.push_back(CAlignMap(m.m_align->Exons(), m.m_align->FrameShifts(), m.m_align->Strand()));
2019  m_members.back().m_align_map = &m_align_maps.back();
2020  } else {
2021  m_members.back().m_align_map = copy_ofp->m_align_map;
2022  }
2023 
// all copies of one member share a single list; it is created on the first copy with the
// source member as its first element
2024  if(copy_ofp != 0) { // we are making a copy of member
2025  if(copy_ofp->m_copy == 0) {
2026  m_copylist.push_back(TContained(1,copy_ofp));
2027  copy_ofp->m_copy = &m_copylist.back();
2028  }
2029  m_members.back().m_copy = copy_ofp->m_copy;
2030  copy_ofp->m_copy->push_back(&m_members.back());
2031  }
2032 }
2033 
// Inserts a copy of an eLeftUTR member with the type changed to eRightUTR.
// NOTE(review): the signature (listing line 2034) is absent from this listing; the class
// declares DuplicateUTR(SChainMember* copy_ofp).
// Precondition (checked by _ASSERT only): copy_ofp->m_type == eLeftUTR.
2035 {
2036  _ASSERT(copy_ofp->m_type == eLeftUTR);
2037  SChainMember new_mbr = *copy_ofp;
2038  new_mbr.m_type = eRightUTR;
2039  InsertMember(new_mbr, copy_ofp);
2040 }
2041 
2042 
2043 CChainMembers::CChainMembers(TGeneModelList& clust, TOrigAligns& orig_aligns, TUnmodAligns& unmodified_aligns)
2044 {
2045  m_extra_cds.push_back(CCDSInfo()); // empty cds for utrs; first in the list
2046  NON_CONST_ITERATE(TGeneModelList, itcl, clust) {
2047  InsertMember(*itcl);
2048  m_members.back().m_orig_align = orig_aligns[itcl->ID()];
2049  if(unmodified_aligns.count(itcl->ID()))
2050  m_members.back().m_unmd_align = &unmodified_aligns[itcl->ID()];
2051  }
2052 }
2053 
2054 
// Returns the intersection of cds.MaxCdsLimits() with the limits of `a` widened by one
// position on each side.
// NOTE(review): the signature (listing line 2055) is absent from this listing; callers in
// this file invoke it as ExtendedMaxCdsLimits(alignment, cds_info).
2056 {
2057  TSignedSeqRange limits(a.Limits().GetFrom()-1,a.Limits().GetTo()+1);
2058 
2059  return limits & cds.MaxCdsLimits();
2060 }
2061 
2062 
// Registers `small` in the contained list of `big` unless it is already reachable through
// big.m_sink_for_contained.
// NOTE(review): the signature (listing line 2063) is absent from this listing; callers in
// this file invoke it as IncludeInContained(mi, mj).
// Outcomes:
//   - identical adjusted limits: small is added and big.m_identical_count is incremented;
//   - small ends no further right than the current sink and the sink can include it:
//     nothing is added (it is found through the sink);
//   - otherwise small is added, and it becomes the new sink when it is not flexible and
//     either there is no sink yet or it ends further right than the current sink.
2064 {
2065  //all identical members are contained in each other; only one of them (with smaller m_mem_id) is contained in other members
2066  TSignedSeqRange big_limits = big.m_align->Limits();
// NOTE(review): listing lines 2067 and 2069 are absent; presumably the flexible-end
// conditions guarding the next two assignments - confirm against the repository.
2068  big_limits.SetFrom(big_limits.GetTo());
2070  big_limits.SetTo(big_limits.GetFrom());
2071  TSignedSeqRange small_limits = small.m_align->Limits();
2072  bool small_flex = false;
// NOTE(review): listing lines 2073 and 2077 are absent; they open the two blocks closed
// on lines 2076 and 2080 (presumably the flexible-end conditions for `small`).
2074  small_limits.SetFrom(small_limits.GetTo());
2075  small_flex = true;
2076  }
2078  small_limits.SetTo(small_limits.GetFrom());
2079  small_flex = true;
2080  }
2081 
2082  if(big_limits == small_limits) { // identical
2083  ++big.m_identical_count;
2084  big.m_contained->push_back(&small);
2085  return;
2086  } else if(big.m_sink_for_contained != nullptr &&
2087  small_limits.GetTo() <= big.m_sink_for_contained->m_align->Limits().GetTo() &&
2088  CanIncludeJinI(*big.m_sink_for_contained, small)) {
2089  return; // contained in next level
2090  } else {
2091  big.m_contained->push_back(&small);
2092  if(!small_flex && (big.m_sink_for_contained == nullptr || small_limits.GetTo() > big.m_sink_for_contained->m_align->Limits().GetTo()))
2093  big.m_sink_for_contained = &small;
2094  }
2095 }
2096 
2097 
// Forwards to the implementation object: m_data->CutParts(models).
// NOTE(review): the signature (listing line 2098) is absent from this listing.
2099 {
2100  m_data->CutParts(models);
2101 }
2102 
// Replaces every model for which GetAlignParts(model, true) returns a non-empty list by
// those parts: the parts are spliced in at the front of `models` and the model is erased.
// NOTE(review): the signature (listing line 2103) is absent from this listing.
// NOTE(review): erasing the current element relies on ERASE_ITERATE tolerating it -
// confirm against the NCBI toolkit macro documentation.
2104  ERASE_ITERATE(TGeneModelList, im, models) {
2105  TGeneModelList parts = GetAlignParts(*im, true);
2106  if(!parts.empty()) {
// parts go to the front, i.e. before the current position, so they are not visited again by this loop
2107  models.splice(models.begin(),parts);
2108  models.erase(im);
2109  }
2110  }
2111 }
2112 
// For every member whose alignment has the eUnknownOrientation status bit, appends a
// reverse-complemented copy of the alignment to `clust` and inserts a member for it as a
// copy of the original member.
// NOTE(review): the signature (listing line 2113) is absent from this listing; the body
// uses `pointers` (member collection) and `clust` (model list).
2114 {
// the size is captured up front so that members inserted by this loop are not processed again
2115  size_t initial_size = pointers.size();
2116  for(size_t i = 0; i < initial_size; ++i) {
2117  SChainMember& mbr = *pointers[i];
2118  CGeneModel& algn = *mbr.m_align;
2119  if((algn.Status()&CGeneModel::eUnknownOrientation) != 0) {
2120  CGeneModel new_algn = algn;
2121  new_algn.ReverseComplementModel();
// the eReversed bit is cleared on the copy after reverse-complementing
2122  new_algn.Status() &= ~CGeneModel::eReversed;
2123  clust.push_back(new_algn);
2124  pointers.InsertMember(clust.back(), &mbr); //reversed copy
2125  }
2126  }
2127 }
2128 
// Assigns UTR types to flexible members and duplicates non-coding members so that each
// exists both as a left and as a right UTR.
// NOTE(review): the signature (listing line 2129) is absent from this listing.
2130 {
// the size is captured up front so that members inserted by DuplicateUTR are not processed again
2131  size_t initial_size = pointers.size();
2132  for(size_t i = 0; i < initial_size; ++i) {
2133  SChainMember& mbr = *pointers[i];
// NOTE(review): listing line 2134 is absent; it is the `if` that the `else if` below
// continues (presumably a test of the left-flexible status bit) - confirm.
2135  mbr.m_type = eRightUTR;
2136  else if(mbr.m_align->Status()&CGeneModel::eRightFlexible)
2137  mbr.m_type = eLeftUTR;
2138  else if(mbr.m_cds_info->Score() == BadScore())
2139  pointers.DuplicateUTR(&mbr);
2140  }
2141 }
2142 
// Accumulates m_splice_weight for members: for every exon of a member's alignment, the
// alignment weight is added once per oriented splice position (same strand) that lies
// within the exon, ends included.
// NOTE(review): the signature (listing line 2143) is absent from this listing.
2144 {
// splice positions (both ends of every oriented intron), keyed by strand
2145  map<int, set<int> > oriented_splices;
2146  ITERATE(set<TSignedSeqRange>, i, oriented_introns_plus) {
2147  oriented_splices[ePlus].insert(i->GetFrom());
2148  oriented_splices[ePlus].insert(i->GetTo());
2149  }
2150  ITERATE(set<TSignedSeqRange>, i, oriented_introns_minus) {
2151  oriented_splices[eMinus].insert(i->GetFrom());
2152  oriented_splices[eMinus].insert(i->GetTo());
2153  }
2154 
2155  NON_CONST_ITERATE(CChainMembers, i, pointers) {
2156  SChainMember& mbr = **i;
2157  CGeneModel& algn = *mbr.m_align;
// NOTE(review): listing line 2158 is absent; it holds the condition under which a member
// is skipped by the `continue` below.
2159  continue;
2160  set<int>& ospl = oriented_splices[algn.Strand()];
2161  ITERATE(CGeneModel::TExons, ie, algn.Exons()) {
2162  TSignedSeqRange exon = *ie;
// walk the splice positions in [exon.GetFrom(), exon.GetTo()]
2163  for(set<int>::iterator spli = ospl.lower_bound(exon.GetFrom()); spli != ospl.end() && *spli <= exon.GetTo(); ++spli)
2164  mbr.m_splice_weight += algn.Weight();
2165  }
2166  }
2167 }
2168 
// Propagates premature stops marked eSelenocysteine or eGenomeNotCorrect from the
// alignments that carry them to other alignments on the same strand.
// NOTE(review): the signature (listing line 2169) is absent from this listing.
// Pass 1 collects the marked stops per strand, each paired with an intron range
// ((0,0) when the stop has length 3, otherwise one entry per intron of its alignment).
// Pass 2 updates the CDS info of alignments overlapping the collected region:
//   - protein alignments get matching stops replaced by the collected ones;
//   - alignments with an empty reading frame get a new CDS info holding the stops that fit
//     into their exons.
2170 {
// [left, right] is the genomic span covered by all collected stops
2171  int left = numeric_limits<int>::max();
2172  int right = 0;
2173  typedef vector<pair<CCDSInfo::SPStop,TSignedSeqRange> > TPstopIntron;
2174  TPstopIntron pstops_with_intron_plus;
2175  TPstopIntron pstops_with_intron_minus;
2176  ITERATE(CChainMembers, i, pointers) {
2177  SChainMember& mbr = **i;
2178  CGeneModel& algn = *mbr.m_align;
2179  TPstopIntron& pstops_with_intron = (algn.Strand() == ePlus) ? pstops_with_intron_plus : pstops_with_intron_minus;
2180  ITERATE(CCDSInfo::TPStops, s, algn.GetCdsInfo().PStops()) {
2181  if(s->m_status == CCDSInfo::eSelenocysteine || s->m_status == CCDSInfo::eGenomeNotCorrect) {
2182  left = min(left,s->GetFrom());
2183  right = max(right,s->GetTo());
2184  if(s->GetLength() == 3) {
// length 3: the stop is recorded with the (0,0) placeholder instead of an intron
2185  pstops_with_intron.push_back(make_pair(*s,TSignedSeqRange(0,0)));
2186  } else {
// otherwise the stop is paired with every intron of this alignment;
// note that this `i` shadows the member iterator of the enclosing loop
2187  for(int i = 1; i < (int)algn.Exons().size(); ++i) {
2188  TSignedSeqRange intron(algn.Exons()[i-1].GetTo(),algn.Exons()[i].GetFrom());
2189  pstops_with_intron.push_back(make_pair(*s,intron));
2190  }
2191  }
2192  }
2193  }
2194  }
// sort and drop duplicate (stop, intron) pairs
2195  uniq(pstops_with_intron_plus);
2196  uniq(pstops_with_intron_minus);
2197 
2198  ITERATE(CChainMembers, i, pointers) {
2199  SChainMember& mbr = **i;
2200  CGeneModel& algn = *mbr.m_align;
// alignments entirely outside the span of collected stops cannot be affected
2201  if(algn.Limits().GetFrom() > right || algn.Limits().GetTo() < left)
2202  continue;
2203  if((algn.Type()&CGeneModel::eProt) && !algn.PStop())
2204  continue;
// NOTE(review): listing line 2205 is absent; it holds one more skip condition for the
// `continue` below.
2206  continue;
2207 
2208  TPstopIntron& pstops_with_intron = (algn.Strand() == ePlus) ? pstops_with_intron_plus : pstops_with_intron_minus;
2209  if(pstops_with_intron.empty())
2210  continue;
2211 
2212  if(algn.Type()&CGeneModel::eProt) {
// protein alignment: work on copies, then rebuild the stop list of the CDS info
2213  CCDSInfo cds = algn.GetCdsInfo();
2214  CCDSInfo::TPStops pstops = cds.PStops();
2215  NON_CONST_ITERATE(CCDSInfo::TPStops, s, pstops) {
2216  ITERATE(TPstopIntron, si, pstops_with_intron) {
2217  if(si->second.GetLength() == 1) { // no split
2218  if(si->first == *s)
2219  *s = si->first; // assigns status
2220  } else {
// a split stop is accepted only if this alignment has the same intron
2221  for(int i = 1; i < (int)algn.Exons().size(); ++i) {
2222  TSignedSeqRange intron(algn.Exons()[i-1].GetTo(),algn.Exons()[i].GetFrom());
2223  if(si->second == intron && si->first == *s)
2224  *s = si->first; // assigns status
2225  }
2226  }
2227  }
2228  }
2229  cds.ClearPStops();
2230  ITERATE(CCDSInfo::TPStops, s, pstops)
2231  cds.AddPStop(*s);
2232  algn.SetCdsInfo(cds);
2233  } else if(algn.ReadingFrame().Empty()) {
// alignment without a reading frame: collect the stops that fit into its exons
2234  CCDSInfo cds;
2235  const CGeneModel::TExons& exons = algn.Exons();
2236  ITERATE(TPstopIntron, si, pstops_with_intron) {
2237  if(si->first.GetTo() < algn.Limits().GetFrom())
2238  continue;
// NOTE(review): the early exit presumably relies on the list being ordered by stop
// position after uniq() - confirm the ordering defined for CCDSInfo::SPStop.
2239  if(si->first.GetFrom() > algn.Limits().GetTo())
2240  break;
2241  for(int i = 0; i < (int)exons.size(); ++i) {
2242  if(Include(exons[i].Limits(),si->first.GetFrom())) {
2243  if(si->second.GetLength() == 1) { // no split
// unsplit stop: it must end inside the same exon
2244  if(si->first.GetTo() <= exons[i].GetTo())
2245  cds.AddPStop(si->first);
2246  } else {
// split stop: the following intron must match and the stop must end inside the next exon
2247  if(i < (int)exons.size()-1) {
2248  TSignedSeqRange intron(exons[i].GetTo(),exons[i+1].GetFrom());
2249  if(intron == si->second && si->first.GetTo() <= exons[i+1].GetTo())
2250  cds.AddPStop(si->first);
2251  }
2252  }
2253  }
2254  }
2255  }
// the alignment's CDS info is replaced only if at least one stop was added
2256  if(cds.PStop())
2257  algn.SetCdsInfo(cds);
2258  }
2259  }
2260 }
2261 
// Scores non-protein alignments without a confirmed start and promotes the ones that end
// up with a score other than BadScore() to type eCDS.
// NOTE(review): the signature (listing line 2262) is absent from this listing.
2263 {
2264  NON_CONST_ITERATE(CChainMembers, i, pointers) {
2265  SChainMember& mbr = **i;
2266  CGeneModel& algn = *mbr.m_align;
2267 
// NOTE(review): listing line 2268 is absent; it holds the first skip condition for the
// `continue` below.
2269  continue;
2270  if((algn.Type() & CGeneModel::eProt)!=0 || algn.ConfirmedStart())
2271  continue;
2272 
// score the alignment, then let RemovePoorCds act on it using the GoodCDNAScore value
2273  m_gnomon->GetScore(algn);
2274  double ms = GoodCDNAScore(algn);
2275  RemovePoorCds(algn,ms);
2276 
2277  if(algn.Score() != BadScore())
2278  mbr.m_type = eCDS;
2279  }
2280 }
2281 
// Adds alternative copies of members in two passes.
// NOTE(review): the signature (listing line 2282) is absent from this listing.
// Pass 1: for members without a protein reading frame and with a score below
//   5*minscor.m_min, a copy is inserted for every edge reading frame that differs from the
//   alignment's reading frame, plus a CDS-less copy when the alignment has a score.
// Pass 2: for members whose CDS has a start and whose maximal CDS limits are unbounded on
//   the 5' side, the member is restricted to its start and a copy with the reading frame
//   extended to the alignment end is inserted, unless that frame already exists.
2283 {
// sizes are captured up front so that members inserted by a pass are not processed by it
2284  size_t initial_size = pointers.size();
2285  for(size_t i = 0; i < initial_size; ++i) {
2286  SChainMember& mbr = *pointers[i];
2287  CGeneModel& algn = *mbr.m_align;
2288 
// NOTE(review): listing line 2289 is absent; it holds the skip condition for the
// `continue` below.
2290  continue;
2291 
2292  if(mbr.m_type == eRightUTR) // avoid copying UTR copies
2293  continue;
2294 
2295  if(algn.GetCdsInfo().ProtReadingFrame().Empty() && algn.Score() < 5*minscor.m_min) {
// note: this `i` shadows the member index of the enclosing loop
2296  for(int i = 0; i < (int)algn.GetEdgeReadingFrames()->size(); ++i) {
2297  const CCDSInfo& cds_info = (*algn.GetEdgeReadingFrames())[i];
2298  if(cds_info.ReadingFrame() != algn.ReadingFrame()) {
2299  pointers.InsertMemberCopyWithCds(cds_info, &mbr); //copy with CDS
2300  }
2301  }
2302 
2303  if(algn.Score() != BadScore()) {
2304  pointers.InsertMemberCopyWithoutCds(&mbr); //UTR copy
2305  }
2306  }
2307  }
2308 
2309 
2310  initial_size = pointers.size();
2311  for(unsigned int i = 0; i < initial_size; ++i) {
2312  SChainMember& mbr = *pointers[i];
2313  CGeneModel& algn = *mbr.m_align;
// the member's CDS info is modified in place below, hence the const_cast
2314  CCDSInfo& acdsinfo = const_cast<CCDSInfo&>(*mbr.m_cds_info);
2315 
2316  if(acdsinfo.HasStart()) {
// inf_5prime: the maximal CDS limits are open on the 5' side (left for plus, right for minus)
2317  bool inf_5prime;
2318  if (algn.Strand()==ePlus) {
2319  inf_5prime = acdsinfo.MaxCdsLimits().GetFrom()==TSignedSeqRange::GetWholeFrom();
2320  } else {
2321  inf_5prime = acdsinfo.MaxCdsLimits().GetTo()==TSignedSeqRange::GetWholeTo();
2322  }
2323  if (inf_5prime) {
// keep an unrestricted copy before limiting the member's own CDS to its start
2324  CCDSInfo cdsinfo = acdsinfo;
2325 
2326  TSignedSeqPos start = (algn.Strand() == ePlus) ? acdsinfo.Start().GetFrom() : acdsinfo.Start().GetTo();
2327  acdsinfo.Set5PrimeCdsLimit(start);
2328  mbr.m_restricted_to_start = true;
2329 
2330  if(algn.Strand() == ePlus) {
// leftmost in-frame position of the alignment relative to the start
2331  int full_rf_left = algn.FShiftedMove(algn.Limits().GetFrom(),(algn.FShiftedLen(algn.Limits().GetFrom(), cdsinfo.Start().GetFrom(), false)-1)%3);
// NOTE(review): listing line 2332 is absent from this listing.
2333  cdsinfo.SetScore(cdsinfo.Score(),false);
2334  cdsinfo.SetReadingFrame(TSignedSeqRange(full_rf_left,cdsinfo.ReadingFrame().GetTo()));
2335  } else {
// rightmost in-frame position of the alignment relative to the start
2336  int full_rf_right = algn.FShiftedMove(algn.Limits().GetTo(),-(algn.FShiftedLen(cdsinfo.Start().GetTo(),algn.Limits().GetTo(),false)-1)%3);
// NOTE(review): listing line 2337 is absent from this listing.
2338  cdsinfo.SetScore(cdsinfo.Score(),false);
2339  cdsinfo.SetReadingFrame(TSignedSeqRange(cdsinfo.ReadingFrame().GetFrom(),full_rf_right));
2340  }
2341 
// do not insert the copy if the same-strand original already has this reading frame
2342  if(mbr.m_copy != 0) {
2343  if(mbr.m_copy->front()->m_align->Strand() == algn.Strand()) { // first copy is original alignment; for not oriented the second copy is reverse
2344  if(mbr.m_copy->front()->m_cds_info->ReadingFrame() == cdsinfo.ReadingFrame())
2345  continue;
2346  } else if((*mbr.m_copy)[1]->m_cds_info->ReadingFrame() == cdsinfo.ReadingFrame()) {
2347  continue;
2348  }
2349  }
2350 
// cdsinfo is a local, so the collection must store its own copy
2351  pointers.InsertMemberCopyAndStoreCds(cdsinfo, &mbr);
2352  }
2353  }
2354 
2355  }
2356 }
2357 
// Returns, in their original order, the indels from `indels` for which
// InDelEnd() > lim.GetFrom() and Loc() <= lim.GetTo().
// NOTE(review): the signature (listing line 2358) is absent from this listing; callers in
// this file invoke it as StrictlyContainedInDels(frameshifts, range).
2359  TInDels fs;
2360  ITERATE(TInDels, indl, indels) {
2361  if(indl->InDelEnd() > lim.GetFrom() && indl->Loc() <= lim.GetTo())
2362  fs.push_back(*indl);
2363  }
2364  return fs;
2365 }
2366 
// Decides whether member mj can be treated as contained in member mi. Returns false as
// soon as one of the checks below fails; otherwise true (for a spliced mi, the result of
// ai.HasCompatibleOverlap(aj, 1)).
// NOTE(review): the signature (listing line 2367) is absent from this listing; callers in
// this file invoke it as CanIncludeJinI(mi, mj).
2368  const CGeneModel& ai = *mi.m_align;
2369  const CGeneModel& aj = *mj.m_align;
2370 
// NOTE(review): listing line 2371 is absent; it holds the condition for this first rejection.
2372  return false;
2373 
2374  bool jflex = false;
2375  TSignedSeqRange jlimits = aj.Limits();
// NOTE(review): listing lines 2376 and 2380 are absent; they open the two blocks closed
// on lines 2379 and 2383 (presumably the flexible-end conditions for aj).
2377  jlimits.SetFrom(jlimits.GetTo());
2378  jflex = true;
2379  }
2381  jlimits.SetTo(jlimits.GetFrom());
2382  jflex = true;
2383  }
2384 
// same strand, and the (adjusted) limits of j must lie inside i
2385  if(aj.Strand() != ai.Strand() || !Include(ai.Limits(),jlimits))
2386  return false;
2387 
2388  if(mi.m_type != eCDS && mj.m_type != mi.m_type)
2389  return false; // avoid including UTR copy and avoid including CDS into UTR because that will change m_type
2390 
// *_rf spans start + reading frame + stop; ai_max_cds is the maximal CDS clipped to the alignment
2391  const CCDSInfo& ai_cds_info = *mi.m_cds_info;
2392  TSignedSeqRange ai_rf = ai_cds_info.Start()+ai_cds_info.ReadingFrame()+ai_cds_info.Stop();
2393  TSignedSeqRange ai_max_cds = ai_cds_info.MaxCdsLimits()&ai.Limits();
2394 
2395  const CCDSInfo& aj_cds_info = *mj.m_cds_info;
2396  TSignedSeqRange aj_rf = aj_cds_info.Start()+aj_cds_info.ReadingFrame()+aj_cds_info.Stop();
2397 
2398  // UTR in CDS
2399  if(mi.m_type == eCDS && mj.m_type == eLeftUTR) {
2400  if(!jflex && jlimits.GetTo()-ai_max_cds.GetFrom() >= 5) // normal UTR don't go into CDS > 5bp
2401  return false;
2402  else if(jflex && (aj.Status()&CGeneModel::ePolyA) && (!ai_cds_info.HasStop() || jlimits.GetTo()-ai_max_cds.GetFrom() >= 5)) // flex polyA needs stop and don't go into CDS > 5bp
2403  return false;
2404  else if(jflex && (aj.Status()&CGeneModel::eCap) && ai_cds_info.HasStop() && ai_max_cds.GetTo()-jlimits.GetTo() <= 5) // flex cap is allowed almost up to 3' UTR to be available if start moves
2405  return false;
2406  }
// mirror image of the checks above for a UTR on the right side
2407  if(mi.m_type == eCDS && mj.m_type == eRightUTR) {
2408  if(!jflex && ai_max_cds.GetTo()-jlimits.GetFrom() >= 5)
2409  return false;
2410  else if(jflex && (aj.Status()&CGeneModel::ePolyA) && (!ai_cds_info.HasStop() || ai_max_cds.GetTo()-jlimits.GetFrom() >= 5))
2411  return false;
2412  else if(jflex && (aj.Status()&CGeneModel::eCap) && ai_cds_info.HasStop() && jlimits.GetFrom()-ai_max_cds.GetFrom() <= 5)
2413  return false;
2414  }
2415 
// j must carry exactly the frameshifts that i has within j's limits
2416  if(aj.FrameShifts() != StrictlyContainedInDels(ai.FrameShifts(), aj.Limits())) // not compatible frameshifts
2417  return false;
2418 
2419  if(mi.m_type == eCDS && mj.m_type == eCDS) { // CDS in CDS
2420  TSignedSeqRange max_cds_limits = ai_cds_info.MaxCdsLimits() & aj_cds_info.MaxCdsLimits();
2421  if (!Include(max_cds_limits, ExtendedMaxCdsLimits(ai, ai_cds_info) + ExtendedMaxCdsLimits(aj, aj_cds_info)))
2422  return false;;
2423  if(!Include(ai_rf,aj_rf))
2424  return false;
2425 
// when the coding regions start at different positions, their distance measured in
// edited coordinates of mi must be a multiple of 3 (same frame)
2426  if(ai_rf.GetFrom() != aj_rf.GetFrom()) {
2427  TSignedSeqPos j_from = mi.m_align_map->MapOrigToEdited(aj_rf.GetFrom());
2428  if(j_from < 0)
2429  return false;
2430  TSignedSeqPos i_from = mi.m_align_map->MapOrigToEdited(ai_rf.GetFrom());
2431  if(abs(j_from-i_from)%3 != 0)
2432  return false;
2433  }
2434  }
2435 
// the contained alignment cannot have more exons than the containing one
2436  int iex = (int)ai.Exons().size();
2437  int jex = (int)aj.Exons().size();
2438  if(jex > iex)
2439  return false;
2440  if(iex > 1) // big alignment is spliced
2441  return ai.HasCompatibleOverlap(aj, 1); // overlap 1 for flexible
2442 
2443  return true;
2444 }
2445 
2447 
// Fills the contained lists of all members in `pointers` and marks members that should
// not start chains (m_not_for_chaining) or that are hidden inside bigger ones (m_internal).
// NOTE(review): the signature (listing line 2446) is absent from this listing.
// Steps: (1) collect exon ends adjacent to introns spliced on both sides and derive
// m_rlimb / m_llimb for every member; (2) sort with GenomeOrderD; (3) for every member i,
// scan the members j that start no later than i ends, register the ones accepted by
// CanIncludeJinI, and mark j internal when none of the listed exceptions applies.
// `pointers` is reordered by this function.
2448  set<int> left_exon_ends, right_exon_ends;
2449  ITERATE(TContained, ip, pointers) {
2450  const CGeneModel& algn = *(*ip)->m_align;
2451  for(int i = 1; i < (int)algn.Exons().size(); ++i) {
2452  if(algn.Exons()[i-1].m_ssplice && algn.Exons()[i].m_fsplice) {
2453  left_exon_ends.insert(algn.Exons()[i].GetFrom());
2454  right_exon_ends.insert(algn.Exons()[i-1].GetTo());
2455  }
2456  }
2457  }
2458  NON_CONST_ITERATE(TContained, ip, pointers) {
2459  SChainMember& mi = **ip;
2460  CGeneModel& ai = *mi.m_align;
2461 
2462  set<int>::iterator ri = right_exon_ends.lower_bound(ai.Limits().GetTo()); // leftmost compatible rexon
// NOTE(review): listing line 2463 is absent from this listing.
2464  if(ri != right_exon_ends.end())
2465  mi.m_rlimb = *ri;
2466  set<int>::iterator li = left_exon_ends.upper_bound(ai.Limits().GetFrom()); // leftmost not compatible lexon
// NOTE(review): listing line 2467 is absent from this listing.
2468  if(li != left_exon_ends.end())
2469  mi.m_llimb = *li;
2470  }
2471 
2472 // finding contained subalignments (alignment is contained in itself) and selecting longer alignments for chaining
2473 
2474  sort(pointers.begin(),pointers.end(),GenomeOrderD());
2475 
// jfirst: index of the first member whose limits are identical to those of the current one
2476  int jfirst = 0;
2477  for(int i = 0; i < (int)pointers.size(); ++i) {
2478  SChainMember& mi = *pointers[i];
2479  CGeneModel& ai = *mi.m_align;
2480  const CCDSInfo& ai_cds_info = *mi.m_cds_info;
2481 
2482  // knockdown spliced notconsensus UTRs in reads
2483  if(mi.m_type != eCDS && ai.Exons().size() > 1) {
// NOTE(review): listing line 2484 is absent; it is the `if(...) {` whose `else` follows
// on line 2486.
2485  mi.m_not_for_chaining = true;
2486  } else {
// note: this `i` shadows the member index of the enclosing loop
2487  for(int i = 1; i < (int)ai.Exons().size() && !mi.m_not_for_chaining; ++i) {
2488 
// accepted signals: GT-AG or GC-AG on plus strand, AG-GT or AG-GC on minus strand as
// stored; "XX" on either side is skipped
2489  string ssplice = ai.Exons()[i-1].m_ssplice_sig;
2490  string fsplice = ai.Exons()[i].m_fsplice_sig;
2491  if(ssplice == "XX" || fsplice == "XX")
2492  continue;
2493  else if(ai.Strand() == ePlus && ((ssplice != "GT" && ssplice != "GC") || fsplice != "AG"))
2494  mi.m_not_for_chaining = true;
2495  else if(ai.Strand() == eMinus && (ssplice != "AG" || (fsplice != "GT" && fsplice != "GC")))
2496  mi.m_not_for_chaining = true;
2497 
2498  /*
2499  if(ai.Exons()[i-1].m_ssplice_sig == "XX" || ai.Exons()[i].m_fsplice_sig == "XX")
2500  continue;
2501  else if(ai.Strand() == ePlus && (ai.Exons()[i-1].m_ssplice_sig != "GT" || ai.Exons()[i].m_fsplice_sig != "AG"))
2502  mi.m_not_for_chaining = true;
2503  else if(ai.Strand() == eMinus && (ai.Exons()[i-1].m_ssplice_sig != "AG" || ai.Exons()[i].m_fsplice_sig != "GT"))
2504  mi.m_not_for_chaining = true;
2505  */
2506  }
2507  }
2508  }
2509 
2510  //don't use alignments intersection with frameshifts for hiding smaller alignments
// intersect_with_fs: combined range of the frameshifts from all_frameshifts that intersect exons of ai
2511  TSignedSeqRange intersect_with_fs;
2512  ITERATE(TInDels, indl, all_frameshifts) {
2513  if(indl->InDelEnd() < ai.Limits().GetFrom())
2514  continue;
2515  else if(indl->Loc() > ai.Limits().GetTo()+1)
2516  break;
2517  else {
2518  ITERATE(CGeneModel::TExons, e, ai.Exons()) {
2519  if(indl->IntersectingWith(e->GetFrom(), e->GetTo()))
2520  intersect_with_fs += TSignedSeqRange(indl->Loc(), indl->InDelEnd());
2521  }
2522  }
2523  }
2524 
2525  if(pointers[jfirst]->m_align->Limits() != ai.Limits())
2526  jfirst = i;
2527  for(int j = jfirst; j < (int)pointers.size() && pointers[j]->m_align->Limits().GetFrom() <= ai.Limits().GetTo(); ++j) {
2528  if(i == j) {
2529  IncludeInContained(mi, mi); // include self
2530  continue;
2531  }
2532 
2533  SChainMember& mj = *pointers[j];
2534  CGeneModel& aj = *mj.m_align;
2535  const CCDSInfo& aj_cds_info = *mj.m_cds_info;
2536 
2537  if(CanIncludeJinI(mi, mj))
2538  IncludeInContained(mi, mj);
2539  else
2540  continue;
2541 
// everything below only decides whether mj is marked internal
2542  if(mi.m_not_for_chaining
2543  || mj.m_not_for_chaining
2544  || mi.m_internal
2545  || mj.m_internal
2546  || mj.m_type != mi.m_type
2547  || ai.Limits() == aj.Limits()) {
2548  continue;
2549  }
2550 
2551  if(intersect_with_fs.NotEmpty() && !Include(aj.Limits(), intersect_with_fs))
2552  continue;
2553  if((aj.Status()&CGeneModel::ePolyA) != 0 || (aj.Status()&CGeneModel::eCap) != 0)
2554  continue;
2555  if((aj.Type()&CGeneModel::eProt) != 0) // proteins (actually only gapped) should be directly available
2556  continue;
2557  if(mj.m_rlimb < ai.Limits().GetTo() || mj.m_llimb != mi.m_llimb) // bigger alignment may interfere with splices
2558  continue;
2559  if(mi.m_type == eCDS && mj.m_type == eCDS && !Include(ai_cds_info.MaxCdsLimits(),aj_cds_info.MaxCdsLimits())) // bigger alignment restricts the cds
2560  continue;
2561 
2562  // mj.m_not_for_chaining = true;
2563  mj.m_internal = true;
2564  }
2565  }
2566 }
2567 
// Amount subtracted from the CDS overlap in LRCanChainItoJ for every doubly-spliced intron
// inside both coding regions that has zero mRNA, EST and RNA-seq counts.
2568 #define NON_CDNA_INTRON_PENALTY 20
2569 
// Decides whether member mi can be chained to member mj and, if so, reports what mi adds.
// Parameters:
//   delta_cds        [out] mi.m_cds minus the CDS overlap with mj (set only on the success path);
//   delta_num        [out] accumulated alignment weight of the members of `contained`
//                          starting at index `first` (0 if delta_cds < 0);
//   delta_splice_num [out] accumulated splice weight of the same members (0 if delta_cds < 0);
//   mi, mj           the two members; mi supplies m_cds and m_align_map;
//   contained        list whose elements are sorted in place (LeftOrderD) and whose
//                    m_accumulated_* fields are filled on first use;
//   not_sorted       [in/out] true until `contained` has been sorted and accumulated by this
//                    function; reset to false here.
// Returns false when the members are incompatible (strand, member types, exon structure,
// frameshifts, or coding frame).
2570 bool CChainer::CChainerImpl::LRCanChainItoJ(int& delta_cds, double& delta_num, double& delta_splice_num, SChainMember& mi, SChainMember& mj, TContained& contained, bool& not_sorted) {
2571 
2572  const CGeneModel& ai = *mi.m_align;
2573  const CGeneModel& aj = *mj.m_align;
2574 
2575 
2576  if(aj.Strand() != ai.Strand())
2577  return false;
2578 
// *_rf spans start + reading frame + stop; "left/right complete" depends on the strand
2579  const CCDSInfo& ai_cds_info = *mi.m_cds_info;
2580  TSignedSeqRange ai_rf = ai_cds_info.Start()+ai_cds_info.ReadingFrame()+ai_cds_info.Stop();
2581  bool ai_left_complete = ai.Strand() == ePlus ? ai_cds_info.HasStart() : ai_cds_info.HasStop();
2582 
2583  const CCDSInfo& aj_cds_info = *mj.m_cds_info;
2584  TSignedSeqRange aj_rf = aj_cds_info.Start()+aj_cds_info.ReadingFrame()+aj_cds_info.Stop();
2585  bool aj_right_complete = aj.Strand() == ePlus ? aj_cds_info.HasStop() : aj_cds_info.HasStart();
2586 
2587  bool j_rflexible = aj.Status()&CGeneModel::eRightFlexible;
2588  bool i_lflexible = ai.Status()&CGeneModel::eLeftFlexible;
// allowed type combinations (mj on the left, mi on the right):
//   mi eCDS      - mj eCDS, or eLeftUTR if mi is complete on the left and the UTR does not
//                  overlap mi's coding region by more than 5 (unless mj is right-flexible);
//   mi eLeftUTR  - mj eLeftUTR only;
//   mi eRightUTR - mj eRightUTR, or eCDS if mj is complete on the right and mi does not
//                  overlap mj's coding region by more than 5 (unless mi is left-flexible).
2589  switch(mi.m_type) {
2590  case eCDS:
2591  if(mj.m_type == eRightUTR)
2592  return false;
2593  else if(mj.m_type == eLeftUTR && (!ai_left_complete || (!j_rflexible && (aj.Limits()&ai_rf).GetLength() > 5)))
2594  return false;
2595  else
2596  break;
2597  case eLeftUTR:
2598  if(mj.m_type != eLeftUTR)
2599  return false;
2600  else
2601  break;
2602  case eRightUTR:
2603  if(mj.m_type == eLeftUTR)
2604  return false;
2605  else if(mj.m_type == eCDS && (!aj_right_complete || (!i_lflexible && (ai.Limits()&aj_rf).GetLength() > 5)))
2606  return false;
2607  else
2608  break;
2609  default:
2610  return false;
2611  }
2612 
2613  switch(ai.MutualExtension(aj)) {
2614  case 0: // not compatible
2615  return false;
2616  case 1: // no introns in intersection
2617  if(mi.m_type == eCDS && mj.m_type == eCDS) // no intersecting limit for coding
2618  break;
2619  if(j_rflexible || i_lflexible) // no intersecting limit for flexible
2620  break;
2621  if((ai.Limits() & aj.Limits()).GetLength() < intersect_limit)
2622  return false;
2623  break;
2624  default: // one or more introns in intersection
2625  break;
2626  }
2627 
// both alignments must have the same frameshifts within their common range
2628  TSignedSeqRange overlap = (ai.Limits() & aj.Limits());
2629  if(StrictlyContainedInDels(ai.FrameShifts(), overlap) != StrictlyContainedInDels(aj.FrameShifts(), overlap)) // incompatible frameshifts
2630  return false;
2631 
2632  int cds_overlap = 0;
2633 
2634  if(mi.m_type == eCDS && mj.m_type == eCDS) {
// negative value: the two coding regions are separated by a gap
2635  int genome_overlap = ai_rf.GetLength()+aj_rf.GetLength()-(ai_rf+aj_rf).GetLength();
2636  if(genome_overlap < 0)
2637  return false;
2638 
2639  TSignedSeqRange max_cds_limits = ai_cds_info.MaxCdsLimits() & aj_cds_info.MaxCdsLimits();
2640 
2641  if (!Include(max_cds_limits, ExtendedMaxCdsLimits(ai, ai_cds_info) + ExtendedMaxCdsLimits(aj, aj_cds_info)))
2642  return false;
2643 
// one coding region strictly inside the other (no shared end) is rejected
2644  if((Include(ai_rf,aj_rf) || Include(aj_rf,ai_rf)) && ai_rf.GetFrom() != aj_rf.GetFrom() && ai_rf.GetTo() != aj_rf.GetTo())
2645  return false;
2646 
// the shared coding part must be a whole number of codons
2647  cds_overlap = mi.m_align_map->FShiftedLen(ai_rf&aj_rf,false);
2648  if(cds_overlap%3 != 0)
2649  return false;
2650 
// both sides carry the start bonus, so it is part of the overlap
2651  if(ai_cds_info.HasStart() && aj_cds_info.HasStart())
2652  cds_overlap += START_BONUS;
2653 
2654  if(has_rnaseq) {
2655  for(int i = 1; i < (int)ai.Exons().size(); ++i) {
2656  if(ai.Exons()[i-1].m_ssplice && ai.Exons()[i].m_fsplice) {
2657  TSignedSeqRange intron(ai.Exons()[i-1].Limits().GetTo(),ai.Exons()[i].Limits().GetFrom());
2658  if(Include(ai_rf,intron) && Include(aj_rf,intron) && mrna_count[intron]+est_count[intron]+rnaseq_count[intron] == 0) {
2659  cds_overlap -= NON_CDNA_INTRON_PENALTY;
2660  }
2661  }
2662  }
2663  }
2664  }
2665 
2666  delta_cds = mi.m_cds-cds_overlap;
2667 
2668  delta_num = 0;
2669  delta_splice_num = 0;
2670  if(delta_cds >= 0) {
2671  if(not_sorted)
2672  sort(contained.begin(),contained.end(),LeftOrderD());
2673  int first = 0;
2674  if(!j_rflexible && !i_lflexible)
2675  first = upper_bound(contained.begin(), contained.end(), &mj, LeftOrder())-contained.begin(); // first alignment contained in ai and outside aj
// the running sums are built once, from the back of the list down to `first`;
// NOTE(review): later calls with the flag already cleared read whatever is stored at
// their own `first` - confirm that callers never need an index below the one used here.
2676  if(not_sorted) {
2677  not_sorted = false;
2678  contained.back()->m_accumulated_num = contained.back()->m_align->Weight();
2679  contained.back()->m_accumulated_splice_num = contained.back()->m_splice_weight;
2680  for(int i = (int)contained.size()-2; i >= first; --i) {
2681  contained[i]->m_accumulated_num = contained[i]->m_align->Weight()+contained[i+1]->m_accumulated_num;
2682  contained[i]->m_accumulated_splice_num = contained[i]->m_splice_weight+contained[i+1]->m_accumulated_splice_num;
2683  }
2684  }
2685 
2686  delta_num = contained[first]->m_accumulated_num;
2687  delta_splice_num = contained[first]->m_accumulated_splice_num;
2688  }
2689 
2690  return true;
2691 }
2692 
2693 
// Initializes the per-member values used by the left-to-right chaining pass for member mi:
// m_num / m_splice_num (sums over `micontained`), m_cds (frameshift-corrected length of
// the coding region, plus START_BONUS when there is a start), and the "left" fields that
// describe the best connection found so far (none yet).
// NOTE(review): the signature (listing line 2694) is absent from this listing; the caller
// in this file invokes it as LRIinit(mi, micontained).
2695  const CCDSInfo& ai_cds_info = *mi.m_cds_info;
2696  TSignedSeqRange ai_rf = ai_cds_info.Start()+ai_cds_info.ReadingFrame()+ai_cds_info.Stop();
2697 
2698  // TContained micontained = mi.CollectContainedForMemeber();
2699  mi.m_num = 0;
2700  mi.m_splice_num = 0;
2701  for(auto p : micontained) {
2702  mi.m_num += p->m_align->Weight();
2703  mi.m_splice_num += p->m_splice_weight;
2704  }
2705 
2706  const CGeneModel& ai = *mi.m_align;
2707  mi.m_cds = mi.m_align_map->FShiftedLen(ai_rf,false);
2708  if(ai_cds_info.HasStart()) {
2709  mi.m_cds += START_BONUS;
// a start is expected to coincide with the 5' end of the maximal CDS limits
2710  _ASSERT((ai.Strand() == ePlus && ai_cds_info.Start().GetFrom() == ai_cds_info.MaxCdsLimits().GetFrom()) ||
2711  (ai.Strand() == eMinus && ai_cds_info.Start().GetTo() == ai_cds_info.MaxCdsLimits().GetTo()));
2712  }
2713 
2714  if(has_rnaseq) {
2715  for(int i = 1; i < (int)ai.Exons().size(); ++i) {
2716  if(ai.Exons()[i-1].m_ssplice && ai.Exons()[i].m_fsplice) {
2717  TSignedSeqRange intron(ai.Exons()[i-1].Limits().GetTo(),ai.Exons()[i].Limits().GetFrom());
2718  if(Include(ai_rf,intron) && mrna_count[intron]+est_count[intron]+rnaseq_count[intron] == 0) {
// NOTE(review): listing line 2719 (the body of this `if`) is absent; presumably it applies
// NON_CDNA_INTRON_PENALTY to mi.m_cds - confirm against the repository.
2720  }
2721  }
2722  }
2723  }
2724 
2725  mi.m_left_member = 0;
2726  mi.m_left_num = mi.m_num;
// NOTE(review): listing line 2727 is absent; presumably it initializes
// mi.m_left_splice_num, which the chaining pass reads - confirm.
2728  mi.m_left_cds = mi.m_cds;
2729 
2730  mi.m_gapped_connection = false;
2731  mi.m_fully_connected_to_part = -1;
2732 }
2733 
// Left-to-right chaining pass: for every member (processed in LeftOrderD order) finds the
// best member to its left to connect to and records it in m_left_member together with the
// accumulated m_left_cds / m_left_splice_num / m_left_num.
// NOTE(review): the signature (listing line 2734) is absent from this listing.
// `pointers` is reordered by this function.
2735 {
2736  sort(pointers.begin(),pointers.end(),LeftOrderD());
// right_ends[k]: right end of member k; for right-flexible alignments the left end is used
2737  TIVec right_ends(pointers.size());
2738  for(int k = 0; k < (int)pointers.size(); ++k) {
2739  auto& kalign = *pointers[k]->m_align;
2740  int rend = kalign.Limits().GetTo();
2741  if(kalign.Status()&CGeneModel::eRightFlexible)
2742  rend = kalign.Limits().GetFrom();
2743  right_ends[k] = rend;
2744  }
2745  NON_CONST_ITERATE(TContained, i, pointers) {
2746  SChainMember& mi = **i;
2747  CGeneModel& ai = *mi.m_align;
2748  TContained micontained = mi.CollectContainedForMemeber();
2749  LRIinit(mi, micontained);
// micontained is sorted lazily, inside LRCanChainItoJ, the first time it is needed
2750  bool not_sorted = true;
2751  // sort(micontained.begin(),micontained.end(),LeftOrderD());
2752 
2753  TIVec::iterator lb = lower_bound(right_ends.begin(),right_ends.end(),ai.Limits().GetFrom()-2*flex_len); // give some extra for flexible
// if no right end reaches the threshold, jfirst stays at begin() and the overlap test
// below does the filtering
2754  TContained::iterator jfirst = pointers.begin();
2755  if(lb != right_ends.end())
2756  jfirst = pointers.begin()+(lb-right_ends.begin()); // skip all on the left side
2757  for(TContained::iterator j = jfirst; j < i; ++j) {
2758  SChainMember& mj = **j;
2759  CGeneModel& aj = *mj.m_align;
2760 
2761  if(aj.Exons().back().m_fsplice_sig == "XX" || ai.Exons().front().m_ssplice_sig == "XX") // don't extend first/last exon if gapfill
2762  continue;
2763 
2764  if(aj.Limits().GetTo() < ai.Limits().GetFrom()) // skip not overlapping (may exist because of flex_len)
2765  continue;
2766 
2767  int delta_cds;
2768  double delta_num;
2769  double delta_splice_num;
2770  if(LRCanChainItoJ(delta_cds, delta_num, delta_splice_num, mi, mj, micontained, not_sorted)) {
// totals of the chain ending in mj, extended by what mi adds
2771  int newcds = mj.m_left_cds+delta_cds;
2772  double newnum = mj.m_left_num+delta_num;
2773  double newsplicenum = mj.m_left_splice_num+delta_splice_num;
2774 
// preference order: longer cds, then larger splice number (tolerance 0.001), then larger number
2775  bool better_connection = false;
2776  if(newcds != mi.m_left_cds) {
2777  better_connection = (newcds > mi.m_left_cds);
2778  } else if(fabs(newsplicenum - mi.m_left_splice_num) > 0.001) {
2779  better_connection = (newsplicenum > mi.m_left_splice_num);
2780  } else if(newnum > mi.m_left_num) {
2781  better_connection = true;
2782  }
2783 
2784  if(better_connection) {
2785  mi.m_left_cds = newcds;
2786  mi.m_left_splice_num = newsplicenum;
2787  mi.m_left_num = newnum;
2788  mi.m_left_member = &mj;
2789  _ASSERT(((ai.Status()&CGeneModel::eLeftFlexible) || aj.Limits().GetFrom() < ai.Limits().GetFrom())
2790  && ((aj.Status()&CGeneModel::eRightFlexible) || aj.Limits().GetTo() < ai.Limits().GetTo()));
2791  }
2792  }
2793  }
2794  }
2795 }
2796 
// Body of the right-to-left chaining pass (presumably RightLeft(TContained& pointers)).
// NOTE(review): the signature line (listing line 2797) is absent from this extract.
// For every member it looks for the best member on its right to connect to and stores the result
// in m_right_member / m_right_cds / m_right_splice_num / m_right_num. Unlike the left-to-right pass,
// the compatibility checks are written inline here.
2798 {
2799  sort(pointers.begin(),pointers.end(),RightOrderD());
      // left_ends[k] is the left limit of member k, replaced by the right limit when the status test below holds.
      // NOTE(review): the test uses eRightFlexible, the same flag as the left-to-right pass; a mirrored
      // computation would be expected to test eLeftFlexible - confirm this is intended.
2800  TIVec left_ends(pointers.size());
2801  for(int k = 0; k < (int)pointers.size(); ++k) {
2802  auto& kalign = *pointers[k]->m_align;
2803  int lend = kalign.Limits().GetFrom();
2804  if(kalign.Status()&CGeneModel::eRightFlexible)
2805  lend = kalign.Limits().GetTo();
2806  left_ends[k] = lend;
2807  }
2808  NON_CONST_ITERATE(TContained, i, pointers) {
2809  SChainMember& mi = **i;
2810  CGeneModel& ai = *mi.m_align;
2811  const CCDSInfo& ai_cds_info = *mi.m_cds_info;
      // Span covered by start codon + reading frame + stop codon of ai.
2812  TSignedSeqRange ai_rf = ai_cds_info.Start()+ai_cds_info.ReadingFrame()+ai_cds_info.Stop();
2813  TSignedSeqRange ai_limits = ai.Limits();
      // The right end of the CDS is the stop on the plus strand and the start on the minus strand.
2814  bool ai_right_complete = ai.Strand() == ePlus ? ai_cds_info.HasStop() : ai_cds_info.HasStart();
      // Start from the stand-alone values of mi (no right neighbour yet).
      // NOTE(review): listing line 2817 is absent here; m_right_splice_num is not initialised in the visible lines.
2815  mi.m_right_member = 0;
2816  mi.m_right_num = mi.m_num;
2818  mi.m_right_cds = mi.m_cds;
2819  TContained micontained = mi.CollectContainedForMemeber();
      // micontained is sorted lazily, the first time a candidate passes all the checks below.
2820  bool not_sorted = true;
2821  // sort(micontained.begin(),micontained.end(),RightOrderD());
2822 
      // NOTE(review): lower_bound with greater<int> needs left_ends to be ordered accordingly;
      // whether RightOrderD guarantees that is not visible here - confirm.
2823  TIVec::iterator lb = lower_bound(left_ends.begin(),left_ends.end(),ai.Limits().GetTo()+2*flex_len,greater<int>()); // first potentially intersecting
2824  TContained::iterator jfirst = pointers.begin();
2825  if(lb != left_ends.end())
2826  jfirst = pointers.begin()+(lb-left_ends.begin()); // skip all on the right side
2827  for(TContained::iterator j = jfirst; j < i; ++j) {
2828  SChainMember& mj = **j;
2829  CGeneModel& aj = *mj.m_align;
2830 
2831  if(aj.Exons().front().m_ssplice_sig == "XX" || ai.Exons().back().m_fsplice_sig == "XX") // don't extend first/last exon if gapfill
2832  continue;
2833 
2834  if(aj.Strand() != ai.Strand())
2835  continue;
2836  if(aj.Limits().GetFrom() > ai.Limits().GetTo()) // skip not overlapping (may exist because of flex_len)
2837  continue;
2838 
2839  const CCDSInfo& aj_cds_info = *mj.m_cds_info;
2840  TSignedSeqRange aj_rf = aj_cds_info.Start()+aj_cds_info.ReadingFrame()+aj_cds_info.Stop();
2841  bool aj_left_complete = aj.Strand() == ePlus ? aj_cds_info.HasStart() : aj_cds_info.HasStop();
2842 
2843  bool j_lflexible = aj.Status()&CGeneModel::eLeftFlexible;
2844  bool i_rflexible = ai.Status()&CGeneModel::eRightFlexible;
      // Allowed type combinations for (mi on the left, mj on the right); every 'continue' rejects the pair.
2845  switch(mi.m_type)
2846  {
2847  case eCDS:
2848  if(mj.m_type == eLeftUTR)
2849  continue;
      // A right UTR may follow a CDS only if the CDS is complete on the right and, unless mj is
      // left-flexible, mj overlaps the coding span of ai by at most 5 positions.
2850  if(mj.m_type == eRightUTR && (!ai_right_complete || (!j_lflexible && (aj.Limits()&ai_rf).GetLength() > 5)))
2851  continue;
2852  else
2853  break;
2854  case eRightUTR:
2855  if(mj.m_type != eRightUTR)
2856  continue;
2857  else
2858  break;
2859  case eLeftUTR:
2860  if(mj.m_type == eRightUTR)
2861  continue;
      // Mirror of the eCDS/eRightUTR rule: a CDS may follow a left UTR only if it is complete on the left
      // and, unless ai is right-flexible, ai overlaps its coding span by at most 5 positions.
2862  if(mj.m_type == eCDS && (!aj_left_complete || (!i_rflexible && (ai.Limits()&aj_rf).GetLength() > 5)))
2863  continue;
2864  else
2865  break;
2866  default:
2867  continue;
2868  }
2869 
2870  switch(ai.MutualExtension(aj))
2871  {
2872  case 0: // not compatible
2873  continue;
2874  case 1: // no introns in intersection
2875  {
2876  if(mi.m_type == eCDS && mj.m_type == eCDS) // no intersecting limit for coding
2877  break;
2878  if(j_lflexible || i_rflexible) // no intersecting limit for flexible
2879  break;
2880 
      // Otherwise the intron-less overlap must be at least intersect_limit long.
2881  int intersect = (ai_limits & aj.Limits()).GetLength();
2882  if(intersect < intersect_limit) continue;
2883  break;
2884  }
2885  default: // one or more introns in intersection
2886  break;
2887  }
2888 
2889  TSignedSeqRange overlap = (ai.Limits() & aj.Limits());
2890  if(StrictlyContainedInDels(ai.FrameShifts(), overlap) != StrictlyContainedInDels(aj.FrameShifts(), overlap)) // incompatible frameshifts
2891  continue;
2892 
      // CDS amount shared by ai and aj; subtracted below so that it is not counted twice.
2893  int cds_overlap = 0;
2894 
2895  if(mi.m_type == eCDS && mj.m_type == eCDS) {
      // Negative when the two coding spans are disjoint (their combined range is longer than the sum).
2896  int genome_overlap = ai_rf.GetLength()+aj_rf.GetLength()-(ai_rf+aj_rf).GetLength();
2897  if(genome_overlap < 0)
2898  continue;
2899 
2900  TSignedSeqRange max_cds_limits = ai_cds_info.MaxCdsLimits() & aj_cds_info.MaxCdsLimits();
2901 
2902  if (!Include(max_cds_limits, ExtendedMaxCdsLimits(ai, ai_cds_info) + ExtendedMaxCdsLimits(aj, aj_cds_info)))
2903  continue;
2904 
      // Reject strict nesting of one coding span inside the other (neither end shared).
2905  if((Include(ai_rf,aj_rf) || Include(aj_rf,ai_rf)) && ai_rf.GetFrom() != aj_rf.GetFrom() && ai_rf.GetTo() != aj_rf.GetTo())
2906  continue;
2907 
      // The shared coding part must be a whole number of codons, i.e. both members are in the same frame.
2908  cds_overlap = mi.m_align_map->FShiftedLen(ai_rf&aj_rf,false);
2909  if(cds_overlap%3 != 0)
2910  continue;
2911 
2912  if(ai_cds_info.HasStart() && aj_cds_info.HasStart())
2913  cds_overlap += START_BONUS;
2914 
2915  if(has_rnaseq) {
      // NOTE: this 'i' is an exon index that shadows the outer member iterator 'i' inside the loop.
2916  for(int i = 1; i < (int)ai.Exons().size(); ++i) {
2917  if(ai.Exons()[i-1].m_ssplice && ai.Exons()[i].m_fsplice) {
2918  TSignedSeqRange intron(ai.Exons()[i-1].Limits().GetTo(),ai.Exons()[i].Limits().GetFrom());
      // Shared coding introns with zero mRNA/EST/RNA-seq support adjust the overlap by the penalty.
      // operator[] inserts a zero entry for introns not yet present in the count maps.
2919  if(Include(ai_rf,intron) && Include(aj_rf,intron) && mrna_count[intron]+est_count[intron]+rnaseq_count[intron] == 0) {
2920  cds_overlap -= NON_CDNA_INTRON_PENALTY;
2921  }
2922  }
2923  }
2924  }
2925  }
2926 
2927 
      // CDS contributed by mi beyond what mj already has; skip connections that cannot beat the current best.
2928  int delta_cds = mi.m_cds-cds_overlap;
2929  int newcds = mj.m_right_cds+delta_cds;
2930  if(newcds < mi.m_right_cds)
2931  continue;
2932 
2933  if(not_sorted)
2934  sort(micontained.begin(),micontained.end(),RightOrderD());
      // Index of the first contained alignment that counts for this connection; 0 when either end is flexible.
2935  int first = 0;
2936  if(!j_lflexible && !i_rflexible)
2937  first = upper_bound(micontained.begin(),micontained.end(),&mj,RightOrder())-micontained.begin(); // first alignment contained in ai and outside aj
2938  if(not_sorted) {
2939  not_sorted = false;
      // Suffix sums of weight and splice weight over micontained, filled from the back down to 'first'.
      // NOTE(review): micontained.back() assumes the vector is non-empty, and the sums are filled only
      // once, down to the 'first' of the first candidate reaching this point - confirm later candidates
      // never need a smaller index.
2940  micontained.back()->m_accumulated_num = micontained.back()->m_align->Weight();
2941  micontained.back()->m_accumulated_splice_num = micontained.back()->m_splice_weight;
2942  for(int i = (int)micontained.size()-2; i >= first; --i) {
2943  micontained[i]->m_accumulated_num = micontained[i]->m_align->Weight()+micontained[i+1]->m_accumulated_num;
2944  micontained[i]->m_accumulated_splice_num = micontained[i]->m_splice_weight+micontained[i+1]->m_accumulated_splice_num;
2945  }
2946  }
2947 
2948  double delta_num = micontained[first]->m_accumulated_num;
2949  double delta_splice_num = micontained[first]->m_accumulated_splice_num;
2950 
2951  double newnum = mj.m_right_num+delta_num;
2952  double newsplicenum = mj.m_right_splice_num+delta_splice_num;
2953 
      // Ranking: CDS first, then splice weight (differences up to 0.001 count as a tie), then alignment count.
2954  bool better_connection = false;
2955  if(newcds != mi.m_right_cds) {
2956  better_connection = (newcds > mi.m_right_cds);
2957  } else if(fabs(newsplicenum - mi.m_right_splice_num) > 0.001) {
2958  better_connection = (newsplicenum > mi.m_right_splice_num);
2959  } else if(newnum > mi.m_right_num) {
2960  better_connection = true;
2961  }
2962 
2963  if(better_connection) {
2964  mi.m_right_cds = newcds;
2965  mi.m_right_splice_num = newsplicenum;
2966  mi.m_right_num = newnum;
2967  mi.m_right_member = &mj;
      // Unless the relevant end is flexible, the chosen right member must start and end strictly to the right of ai.
2968  _ASSERT(((aj.Status()&CGeneModel::eLeftFlexible) || aj.Limits().GetFrom() > ai.Limits().GetFrom())
2969  && ((ai.Status()&CGeneModel::eRightFlexible) || aj.Limits().GetTo() > ai.Limits().GetTo()));
2970  }
2971  }
2972  }
2973 }
2974 
2975 bool MemberIsCoding(const SChainMember* mp) {
2976  return (mp->m_cds_info->Score() != BadScore());
2977 }
2978 
      // Predicate body: reports the member's m_marked_for_deletion flag. It is used elsewhere in this file
      // with std::remove_if as MemberIsMarkedForDeletion.
      // NOTE(review): the signature line (listing line 2979) is absent from this extract.
2980  return mp->m_marked_for_deletion;
2981 }
2982 
2983 // returns essential members of the chain for debugging
      // NOTE(review): the signature line (listing line 2984) is absent from this extract; callers in this
      // file use it as GetLinkedIdsForMember(mi) and concatenate the result with a string.
      // Builds a space-separated list of alignment IDs: the ID of mi first, then the IDs of mi and of every
      // member reachable through the m_left_member and m_right_member links, in GenomeOrderD order.
2985  vector<const SChainMember*> mal;
2986  mal.push_back(&mi);
      // Follow the chain of best left connections...
2987  for (SChainMember* left = mi.m_left_member; left != 0; left = left->m_left_member) {
2988  mal.push_back(left);
2989  }
      // ...and the chain of best right connections.
2990  for (SChainMember* right = mi.m_right_member; right != 0; right = right->m_right_member) {
2991  mal.push_back(right);
2992  }
2993  sort(mal.begin(),mal.end(),GenomeOrderD());
      // mi itself is also in mal, so its ID appears once as the prefix and once inside the sorted list.
2994  string note = to_string(mi.m_align->ID()); //+":"+to_string(mi.m_mem_id);;
2995  ITERATE(vector<const SChainMember*>, imal, mal) {
2996  note = note+" "+to_string((*imal)->m_align->ID()); //+":"+to_string((*imal)->m_mem_id);
2997  }
2998  return note;
2999 }
3000 
3001 bool GoodSupportForIntrons(const CGeneModel& chain, const SMinScor& minscor,
3002  map<TSignedSeqRange,int>& mrna_count, map<TSignedSeqRange,int>& est_count, map<TSignedSeqRange,int>& rnaseq_count) {
3003  bool good = true;
3004  for(int i = 1; i < (int)chain.Exons().size() && good; ++i) {
3005  if(chain.Exons()[i-1].m_ssplice && chain.Exons()[i].m_fsplice) {
3006  TSignedSeqRange intron(chain.Exons()[i-1].Limits().GetTo(),chain.Exons()[i].Limits().GetFrom());
3007  if(mrna_count[intron] < minscor.m_minsupport_mrna && mrna_count[intron]+est_count[intron] < minscor.m_minsupport && rnaseq_count[intron] < minscor.m_minsupport_rnaseq)
3008  good = false;
3009  }
3010  }
3011 
3012  return good;
3013 }
3014 
3015 void MarkUnwantedLowSupportIntrons(TContained& pointers, const SMinScor& minscor,
3016  map<TSignedSeqRange,int>& mrna_count, map<TSignedSeqRange,int>& est_count, map<TSignedSeqRange,int>& rnaseq_count) {
3017 
3018  NON_CONST_ITERATE(TContained, i, pointers)
3019  (*i)->m_marked_for_deletion = !GoodSupportForIntrons(*(*i)->m_align, minscor, mrna_count, est_count, rnaseq_count);
3020 }
3021 
// Comparator used to sort the alignment cluster (it is constructed as GModelOrder(orig_aligns) for
// list sorting elsewhere in this file).
// NOTE(review): the struct header (listing line 3022) and the declaration of the orig_aligns member
// (listing line 3026) are absent from this extract.
3023 {
3024  GModelOrder(TOrigAligns& oa) : orig_aligns(oa) {}
3025 
3027 
      // Orders by Limits() first, then by the left/right flexible status bits, and finally by the target id
      // of the original alignment looked up through orig_aligns.
3028  bool operator()(const CGeneModel& a, const CGeneModel& b)
3029  {
3030  if(a.Limits() != b.Limits())
3031  return a.Limits() < b.Limits();
3032  // same limits
3033  unsigned aflex = a.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible);
3034  unsigned bflex = b.Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible);
3035  if(aflex != bflex)
3036  return aflex < bflex;
3037  else
3038  return *orig_aligns[a.ID()]->GetTargetId() < *orig_aligns[ b.ID()]->GetTargetId(); // to make sort deterministic
3039  }
3040 };
3041 
// Body of the main chain-building routine: turns the alignment cluster `clust` into a list of chains
// returned as a TGeneModelList.
// NOTE(review): the signature line (listing line 3042) and a number of interior lines (for example
// listing lines 3053, 3057, 3065, 3181, 3184, 3251, 3298, 3393, 3455, 3520) are absent from this extract,
// so some statements below appear without their declaration or guarding condition.
// Visible outline:
//   1. sort the cluster and build/merge the special flexible cap/polyA alignments;
//   2. collect confirmed ends, frameshifts and per-intron support counts; orient alignments;
//   3. build chain members and run LeftRight/RightLeft over the coding members to record coding splices;
//   4. drop members conflicting with those splices, run LeftRight/RightLeft over all members and create coding chains;
//   5. keep only noncoding members, run LeftRight/RightLeft again and create noncoding chains;
//   6. assign IDs, combine chains, find genes, restore trimmed ends and report cap/polyA peaks to cerr.
3043 {
3044  if(clust.empty()) return TGeneModelList();
3045 
3046  clust.sort(GModelOrder(orig_aligns));
3047 
3048  {
      // Maps (status bits, anchor position) to the cluster entry holding the special alignment for it.
3049  map<tuple<int, int>, TGeneModelList::iterator> special_aligns; // [left/right flex|cap/polya, position]
3050  //all known flexible
      // NOTE(review): the declarations of `status` used below (listing lines 3053 and 3057) are absent from this extract.
3051  for(TGeneModelList::iterator it = clust.begin(); it != clust.end(); ++it) {
3052  if(it->Status()&CGeneModel::eLeftFlexible) {
3054  special_aligns.emplace(make_tuple(status, it->Limits().GetTo()), it);
3055  }
3056  if(it->Status()&CGeneModel::eRightFlexible) {
3058  special_aligns.emplace(make_tuple(status, it->Limits().GetFrom()), it);
3059  }
3060  }
3061  //make flexible from normal cap/polya
3062  TSignedSeqPos contig_len = (TSignedSeqPos)m_gnomon->GetSeq().size();
3063  int spec_extend = SPECIAL_ALIGN_LEN-1;
3064  for(TGeneModelList::iterator it = clust.begin(); it != clust.end(); ++it) {
      // NOTE(review): the condition guarding this 'continue' (listing line 3065) is absent from this extract.
3066  continue;
3067 
3068  if(it->Status()&CGeneModel::eCap) {
      // Move the cap flag from the alignment to a separate single-exon eSR alignment of
      // SPECIAL_ALIGN_LEN positions carrying the same ID, strand and weight.
3069  it->Status() &= ~CGeneModel::eCap;
3070  CGeneModel galign(it->Strand(), it->ID(), CGeneModel::eSR);
3071  galign.SetWeight(it->Weight());
3072 
3073  int pos;
3074  int status = CGeneModel::eCap;
      // Plus strand: anchored at the left limit and right-flexible; minus strand: anchored at the right limit and left-flexible.
3075  if(it->Strand() == ePlus) {
3076  pos = it->Limits().GetFrom();
3077  galign.AddExon(TSignedSeqRange(pos, pos+spec_extend));
3078  status |= CGeneModel::eRightFlexible;
3079  } else {
3080  pos = it->Limits().GetTo();
3081  galign.AddExon(TSignedSeqRange(pos-spec_extend, pos));
3082  status |= CGeneModel::eLeftFlexible;
3083  }
      // Only keep special alignments that lie completely inside the contig.
3084  if(galign.Limits().GetFrom() >= 0 && galign.Limits().GetTo() < contig_len) {
3085  galign.Status() |= status;
3086  clust.push_front(galign);
3087  auto rslt = special_aligns.emplace(make_tuple(status, pos), clust.begin());
      // If a special alignment already exists for this key, add the weight to it and discard the new copy.
3088  if(!rslt.second) { //this position already exists
3089  auto ialign = rslt.first->second;
3090  ialign->SetWeight(ialign->Weight()+galign.Weight());
3091  clust.pop_front();
3092  }
3093  }
3094  }
3095  if(it->Status()&CGeneModel::ePolyA) {
      // Same treatment for polyA, with the strand roles swapped relative to the cap case.
3096  it->Status() &= ~CGeneModel::ePolyA;
3097  CGeneModel galign(it->Strand(), it->ID(), CGeneModel::eSR);
3098  galign.SetWeight(it->Weight());
3099 
3100  int pos;
3101  int status = CGeneModel::ePolyA;
3102  if(it->Strand() == eMinus) {
3103  pos = it->Limits().GetFrom();
3104  galign.AddExon(TSignedSeqRange(pos, pos+spec_extend));
3105  status |= CGeneModel::eRightFlexible;
3106  } else {
3107  pos = it->Limits().GetTo();
3108  galign.AddExon(TSignedSeqRange(pos-spec_extend, pos));
3109  status |= CGeneModel::eLeftFlexible;
3110  }
3111  if(galign.Limits().GetFrom() >= 0 && galign.Limits().GetTo() < contig_len) {
3112  galign.Status() |= status;
3113  clust.push_front(galign);
3114  auto rslt = special_aligns.emplace(make_tuple(status, pos), clust.begin());
3115  if(!rslt.second) { //this position already exists
3116  auto ialign = rslt.first->second;
3117  ialign->SetWeight(ialign->Weight()+galign.Weight());
3118  clust.pop_front();
3119  }
3120  }
3121  }
3122  }
3123 
3124  //remove below threshold and crossing contig boundaries
3125  for(auto& sa : special_aligns) {
3126  auto ialign = sa.second;
      // Cap alignments are compared with min_cap_weight, all others with min_polya_weight.
3127  double min_pos_weight = ((ialign->Status()&CGeneModel::eCap) ? min_cap_weight : min_polya_weight);
3128  if(ialign->Limits().GetFrom() < 0 || ialign->Limits().GetTo() >= contig_len || ialign->Weight() < min_pos_weight)
3129  clust.erase(ialign);
3130  }
3131 
      // Entries were pushed to the front above, so the cluster is sorted again.
3132  clust.sort(GModelOrder(orig_aligns));
3133  }
3134 
      // Gather per-cluster evidence: confirmed end exons, all frameshifts, oriented introns and intron support counts.
3135  confirmed_ends.clear();
3136  all_frameshifts.clear();
3137  ITERATE (TGeneModelList, it, clust) {
3138  const CGeneModel& align = *it;
3139  if(use_confirmed_ends) {
      // Keyed by the inner boundary of the terminal exon; the stored value is the outermost end seen for that key
      // (minimum for left ends, maximum for right ends).
3140  if(align.Status()&CGeneModel::eLeftConfirmed) {
3141  auto rslt = confirmed_ends.emplace(align.Exons().front().GetTo(), align.Exons().front().GetFrom());
3142  if(!rslt.second)
3143  rslt.first->second = min(rslt.first->second, align.Exons().front().GetFrom());
3144  }
3145  if(align.Status()&CGeneModel::eRightConfirmed) {
3146  auto rslt = confirmed_ends.emplace(align.Exons().back().GetFrom(), align.Exons().back().GetTo());
3147  if(!rslt.second)
3148  rslt.first->second = max(rslt.first->second, align.Exons().back().GetTo());
3149  }
3150  }
3151  all_frameshifts.insert(all_frameshifts.end(), align.FrameShifts().begin(), align.FrameShifts().end());
3152  for(int i = 1; i < (int)align.Exons().size(); ++i) {
3153  if(align.Exons()[i-1].m_ssplice && align.Exons()[i].m_fsplice) {
3154  TSignedSeqRange intron(align.Exons()[i-1].Limits().GetTo(),align.Exons()[i].Limits().GetFrom());
3155 
      // Introns of alignments with known orientation are remembered per strand.
3156  if((align.Status()&CGeneModel::eUnknownOrientation) == 0) {
3157  if(align.Strand() == ePlus)
3158  oriented_introns_plus.insert(intron);
3159  else
3160  oriented_introns_minus.insert(intron);
3161  }
3162 
      // Support is accumulated by alignment weight, separately for mRNA, EST and eSR alignments.
3163  if(align.Type() == CGeneModel::emRNA)
3164  mrna_count[intron] += align.Weight();
3165  else if(align.Type() == CGeneModel::eEST)
3166  est_count[intron] += align.Weight();
3167  else if(align.Type() == CGeneModel::eSR)
3168  rnaseq_count[intron] += align.Weight();
3169  }
3170  }
3171  }
3172 
3173  has_rnaseq = !rnaseq_count.empty();
3174  sort(all_frameshifts.begin(),all_frameshifts.end());
3175  if(!all_frameshifts.empty())
3176  uniq(all_frameshifts);
3177 
      // NOTE(review): listing lines 3181 and 3184 (presumably the conditions guarding the flex_len update and
      // the orientation block) are absent from this extract, which is why the braces below look unbalanced.
3178  flex_len = 0;
3179  NON_CONST_ITERATE (TGeneModelList, it, clust) {
3180  CGeneModel& align = *it;
3182  flex_len = max(flex_len, align.Limits().GetLength());
3183 
      // Count how many introns of this alignment are known from oriented alignments on each strand.
3185  int pluses = 0;
3186  int minuses = 0;
3187  for(int i = 1; i < (int)align.Exons().size(); ++i) {
3188  if(align.Exons()[i-1].m_ssplice && align.Exons()[i].m_fsplice) {
3189  TSignedSeqRange intron(align.Exons()[i-1].Limits().GetTo(),align.Exons()[i].Limits().GetFrom());
3190  if(oriented_introns_plus.find(intron) != oriented_introns_plus.end())
3191  ++pluses;
3192  if(oriented_introns_minus.find(intron) != oriented_introns_minus.end())
3193  ++minuses;
3194  }
3195  }
      // Unambiguous support for one strand: flip the alignment if it is on the other strand.
3196  if(pluses > 0 && minuses == 0) {
3198  if(align.Strand() == eMinus)
3199  align.ReverseComplementModel();
3200  } else if(minuses > 0 && pluses == 0) {
3202  if(align.Strand() == ePlus)
3203  align.ReverseComplementModel();
3204  }
3205  align.Status() &= ~CGeneModel::eReversed;
3206  }
3207  }
3208 
      // Build the chain members for the cluster and run the preprocessing steps on them.
3209  CChainMembers allpointers(clust, orig_aligns, unmodified_aligns);
3210 
3211  DuplicateNotOriented(allpointers, clust);
3212  ReplicatePStops(allpointers);
3213  ScoreCdnas(allpointers);
3214  Duplicate5pendsAndShortCDSes(allpointers);
3215  DuplicateUTRs(allpointers);
3216  CalculateSpliceWeights(allpointers);
3217  FindContainedAlignments(allpointers);
3218 
      // Working set: every member not flagged m_not_for_chaining.
3219  TContained pointers;
3220  ITERATE(TContained, ip, allpointers) {
3221  _ASSERT((*ip)->m_orig_align);
3222  if(!(*ip)->m_not_for_chaining)
3223  pointers.push_back(*ip);
3224  }
3225 
3226  TContained coding_pointers;
3227  ITERATE(CChainMembers, i, pointers) {
3228  if(MemberIsCoding(*i))
3229  coding_pointers.push_back(*i);
3230  }
3231 
      // Preliminary pass over coding members only.
3232  LeftRight(coding_pointers);
3233  RightLeft(coding_pointers);
3234 
3235  TChainList tmp_chains;
3236 
      // Indexed by strand: coding donor/acceptor positions mapped to the combined CDS limits of the chains
      // using them, and the set of fully coding introns.
3237  array<map<TSignedSeqPos,TSignedSeqRange>, 2> coding_right_splices;
3238  array<map<TSignedSeqPos,TSignedSeqRange>, 2> coding_left_splices;
3239  array<set<TSignedSeqRange>, 2> coding_introns;
3240 
3241  NON_CONST_ITERATE(TContained, i, coding_pointers) {
3242  SChainMember& mi = **i;
      // Total for the best chain through mi: left total + right total minus the member's own value,
      // which both totals start from.
3243  mi.m_cds = mi.m_left_cds+mi.m_right_cds-mi.m_cds;
3245  mi.m_num = mi.m_left_num+mi.m_right_num-mi.m_num;
3246  }
3247  sort(coding_pointers.begin(),coding_pointers.end(),CdsNumOrder());
3248  NON_CONST_ITERATE(TContained, i, coding_pointers) {
3249  SChainMember& mi = **i;
3250 
      // NOTE(review): the condition guarding this 'continue' (listing line 3251) is absent from this extract.
3252  continue;
3253 
3254  if(mi.m_included || mi.m_postponed || mi.m_internal)
3255  continue;
3256 
3257  CChain chain(mi, false);
      // Reading frame before rescoring; compared with the rescored frame further down.
3258  TSignedSeqRange i_rf = chain.ReadingFrame();
3259 
3260  m_gnomon->GetScore(chain, false, false, true); // max_cds extended
3261 
3262  if(chain.Score() == BadScore() || chain.PStop(false))
3263  continue;
3264 
      // Chains without a protein reading frame must pass length/score thresholds from minscor.
3265  int cdslen = chain.FShiftedLen(chain.GetCdsInfo().Cds(),true);
3266  if(chain.GetCdsInfo().ProtReadingFrame().Empty() &&
3267  (cdslen < minscor.m_minlen || (chain.Score() < 2*minscor.m_min && cdslen < 2*minscor.m_cds_len)))
3268  continue;
3269 
      // Record splice sites and introns that fall inside the chain's MaxCdsLimits.
3270  TSignedSeqRange real_cds = chain.MaxCdsLimits(); //chain.RealCdsLimits();
3271  for(int i = 1; i < (int)chain.Exons().size(); ++ i) {
3272  TSignedSeqPos donor = chain.Exons()[i-1].GetTo();
3273  bool coding_donor = Include(real_cds, donor);
3274  TSignedSeqPos acceptor = chain.Exons()[i].GetFrom();
3275  bool coding_acceptor = Include(real_cds, acceptor);
3276  if(coding_donor)
3277  coding_right_splices[chain.Strand()][donor].CombineWith(real_cds);
3278  if(coding_acceptor)
3279  coding_left_splices[chain.Strand()][acceptor].CombineWith(real_cds);
3280  if(coding_donor && coding_acceptor)
3281  coding_introns[chain.Strand()].emplace(donor, acceptor);
3282  }
3283 
      // The rescored frame must intersect the original one, and the span from the leftmost start to the
      // end of the other frame must be a whole number of codons.
3284  TSignedSeqRange n_rf = chain.ReadingFrame();
3285  if(!i_rf.IntersectingWith(n_rf))
3286  continue;
3287  int a,b;
3288  if(n_rf.GetFrom() <= i_rf.GetFrom()) {
3289  a = n_rf.GetFrom();
3290  b = i_rf.GetTo();
3291  } else {
3292  a = i_rf.GetFrom();
3293  b = n_rf.GetTo();
3294  }
3295  if(chain.FShiftedLen(a,b,true)%3 != 0)
3296  continue;
3297 
      // NOTE(review): the statement at listing line 3298 (the action taken for chains passing all checks) is absent from this extract.
3299  }
3300 
      // Mark members (other than those with the eSR type bit) that conflict with the coding structure found above.
3301  for(auto ip : pointers) {
3302  if(ip->m_align->Type()&CGeneModel::eSR || ip->m_marked_for_deletion)
3303  continue;
3304 
3305  TSignedSeqRange cds = ip->m_cds_info->Cds();
3306  int strand = ip->m_align->Strand();
3307  auto& crs = coding_right_splices[strand];
3308  auto& cls = coding_left_splices[strand];
      // A member using a known coding splice site whose own CDS does not intersect the CDS range
      // recorded for that site is marked.
3309  for(int i = 1; i < (int)ip->m_align->Exons().size(); ++ i) {
3310  TSignedSeqPos rsplice = ip->m_align->Exons()[i-1].GetTo();
3311  auto rslt = crs.find(rsplice);
3312  if(rslt != crs.end() && !cds.IntersectingWith(rslt->second)) {
3313  ip->m_marked_for_deletion = true;
3314  break;
3315  }
3316  TSignedSeqPos lsplice = ip->m_align->Exons()[i].GetFrom();
3317  rslt = cls.find(lsplice);
3318  if(rslt != cls.end() && !cds.IntersectingWith(rslt->second)) {
3319  ip->m_marked_for_deletion = true;
3320  break;
3321  }
3322  }
3323 
      // A member is also marked when one of its exons not fully inside its CDS contains a coding intron
      // that lies outside its CDS.
3324  if(!ip->m_marked_for_deletion) {
3325  auto& cdi = coding_introns[strand];
3326  for(auto& exon :ip->m_align->Exons()) {
3327  if(Include(cds, exon.Limits())) // coding exon
3328  continue;
3329  for(auto intronp = cdi.upper_bound(exon.Limits()); intronp != cdi.end() && intronp->GetFrom() < exon.GetTo(); ++intronp) {
3330  if(Include(exon.Limits(), *intronp) && !Include(cds, *intronp)) {
3331  ip->m_marked_for_deletion = true;
3332  break;
3333  }
3334  }
3335  if(ip->m_marked_for_deletion)
3336  break;
3337  }
3338  }
3339  }
3340 
3341  pointers.erase(std::remove_if(pointers.begin(),pointers.end(),MemberIsMarkedForDeletion),pointers.end()); // wrong orientaition/UTR/frames are removed
3342 
      // Compare the number of distinct introns seen in EST alignments with the number of all distinct introns.
3343  set<TSignedSeqRange> introns;
3344  set<TSignedSeqRange> est_introns;
3345  for(auto p : pointers) {
3346  auto& exons = p->m_align->Exons();
3347  bool est = p->m_align->Type() == CGeneModel::eEST;
3348  for(unsigned i = 1; i < exons.size(); ++i) {
3349  if(!exons[i-1].m_ssplice || !exons[i].m_fsplice)
3350  continue;
3351  introns.emplace(exons[i-1].GetTo(), exons[i].GetFrom());
3352  if(est)
3353  est_introns.emplace(exons[i-1].GetTo(), exons[i].GetFrom());
3354  }
3355  }
      // NOTE(review): longreadsthreshold is divided by 100 here; if it is an integral type the division
      // truncates - its declaration is not visible in this extract.
3356  bool enough_est = !introns.empty() && est_introns.size() > longreadsthreshold/100*introns.size();
3357 
      // Diagnostic output: total introns, EST introns and the resulting flag.
3358  cerr << "Introns: " << introns.size() << " " << est_introns.size() << " " << enough_est << endl;
3359 
      // intersect_limit is saved here and restored from old_oep after the chains are combined below.
3360  int old_oep = intersect_limit;
3361  if(enough_est) {
3362  intersect_limit = 10000;
      // Remove single-exon, non-CDS, non-flexible members unless one of their copies has unrestricted
      // (whole) MaxCdsLimits.
3363  pointers.erase(std::remove_if(pointers.begin(),pointers.end(),[](SChainMember* p){
3364  if(p->m_align->Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
3365  return false;
3366  if(p->m_align->Exons().size() != 1 || p->m_type == eCDS)
3367  return false;
3368  if(p->m_copy != nullptr) {
3369  for(SChainMember* cp : *p->m_copy) {
3370  if(cp->m_cds_info->MaxCdsLimits() == TSignedSeqRange::GetWhole()) // through CDS
3371  // if(Include(cp->m_cds_info->MaxCdsLimits(), cp->m_align->Limits())) // through CDS
3372  return false;
3373  }
3374  }
3375  return true; }), pointers.end());
3376  }
3377 
      // Main pass over all remaining members (coding and noncoding).
3378  LeftRight(pointers);
3379  RightLeft(pointers);
3380  NON_CONST_ITERATE(TContained, i, pointers) {
3381  SChainMember& mi = **i;
3382  mi.m_included = false;
3383  mi.m_postponed = false;
3384  mi.m_cds = mi.m_left_cds+mi.m_right_cds-mi.m_cds;
3386  mi.m_num = mi.m_left_num+mi.m_right_num-mi.m_num;
3387  }
3388 
3389  sort(pointers.begin(),pointers.end(),CdsNumOrder());
3390 
3391  NON_CONST_ITERATE(TContained, i, pointers) {
3392  SChainMember& mi = **i;
      // NOTE(review): the condition guarding this 'continue' (listing line 3393) is absent from this extract.
3394  continue;
3395  if(mi.m_included || mi.m_postponed || mi.m_internal)
3396  continue;
3397 
3398  // simple chain and rough check
3399  CChain chain(mi, false);
3400  bool has_trusted = chain.HasTrustedEvidence();
3401  {
3402  if(!chain.RestoreReasonableConfirmedStart(*m_gnomon, orig_aligns))
3403  m_gnomon->GetScore(chain);
3404 
      // Chains without trusted evidence must additionally pass the CDS quality and minimal length checks;
      // failing chains are postponed rather than included.
3405  if(!has_trusted)
3406  RemovePoorCds(chain, GoodCDNAScore(chain, true));
3407  if(chain.Score() == BadScore() || (!has_trusted && chain.RealCdsLen() < minscor.m_minlen)) {
3408  mi.MarkPostponedForChain();
3409  continue;
3410  }
3411  }
3412  chain.AddAllMembersAndCoverage(mi);
3413  if(!chain.SetConfirmedEnds(*m_gnomon, confirmed_ends))
3414  continue;
3415 
3416  chain.RemoveFshiftsFromUTRs();
3417  const CResidueVec& contig = m_gnomon->GetSeq();
3418  // alignments clipped below might not be in any chain; clipping may produce redundant chains
3419  chain.ClipToCap(min_cap_blob, max_dist, min_flank_exon, secondary_peak, false);
3420  chain.ClipToPolyA(contig, min_polya_blob, max_dist, min_flank_exon, secondary_peak, tertiary_peak, tertiary_peak_coverage, false);
3421  chain.ClipLowCoverageUTR(minscor.m_utr_clip_threshold, false);
      // Confirmed ends are applied again after clipping.
3422  if(!chain.SetConfirmedEnds(*m_gnomon, confirmed_ends))
3423  continue;
3424 
3425  m_gnomon->GetScore(chain, !no5pextension); // this will return CDS to best/longest depending on no5pextension
3427  if(!has_trusted)
3428  RemovePoorCds(chain, GoodCDNAScore(chain));
      // Keep the chain only if it still has a valid score and, without trusted evidence, a long enough CDS.
3429  if(chain.Score() != BadScore() && (has_trusted || chain.RealCdsLen() >= minscor.m_minlen)) {
3430  mi.MarkIncludedForChain();
3431 
3432 #ifdef _DEBUG
3433  chain.AddComment("Link1 "+GetLinkedIdsForMember(mi));
3434 #endif
3435 
3437  chain.CalculateDropLimits();
3438  tmp_chains.push_back(chain);
      // The span start + reading frame + stop must be a whole number of codons.
3439  _ASSERT( chain.FShiftedLen(chain.GetCdsInfo().Start()+chain.ReadingFrame()+chain.GetCdsInfo().Stop(), false)%3==0 );
3440  }
3441  }
3442 
3443  TGeneModelList unma_aligns;
3444  CChainMembers unma_members;
3445  CreateChainsForPartialProteins(tmp_chains, pointers, unma_aligns, unma_members);
3446 
3447  pointers.erase(std::remove_if(pointers.begin(),pointers.end(),MemberIsCoding),pointers.end()); // only noncoding left
3448 
3449  MarkUnwantedLowSupportIntrons(pointers, minscor, mrna_count, est_count, rnaseq_count);
3450  pointers.erase(std::remove_if(pointers.begin(),pointers.end(),MemberIsMarkedForDeletion),pointers.end()); // low support introns removed
3451 
3452  // convert all flexible to left UTRs; copy contained flexible from right UTRs to left UTRs; remove right UTRs
3453  for(auto i : allpointers) {
3454  SChainMember& mi = *i;
      // NOTE(review): the opening 'if' of this branch (listing line 3455) is absent from this extract.
3456  mi.m_type = eLeftUTR;
3457  } else if(mi.m_type == eLeftUTR) {
3458  if(mi.m_copy != nullptr) {
3459  for(auto j : *mi.m_copy) {
3460  if(j->m_type == eRightUTR && j->m_align->Strand() == mi.m_align->Strand()) {
3461  for(auto jc : *j->m_contained) {
3462  if(jc->m_align->Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
3463  mi.m_contained->push_back(jc);
3464  }
3465  }
3466  }
3467  }
3468  }
3469  }
3470  pointers.erase(std::remove_if(pointers.begin(),pointers.end(),[](SChainMember* p){ return p->m_type == eRightUTR; }), pointers.end());
3471 
3472  if(enough_est) {
3473  // remove notspliced alignments
3474  pointers.erase(std::remove_if(pointers.begin(),pointers.end(),[](SChainMember* p){
3475  return p->m_align->Exons().size() == 1 && !(p->m_align->Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible)); }), pointers.end());
3476 
3477  // remove alignments with notspliced inrons
      // Collect, per strand, every gap between consecutive exons of the remaining members
      // (no splice flags are checked here).
3478  array<set<TSignedSeqRange>, 2> non_coding_introns;
3479  for(auto p : pointers) {
3480  auto& exons = p->m_align->Exons();
3481  for(unsigned i = 1; i < exons.size(); ++i) {
3482  TSignedSeqRange intron(exons[i-1].GetTo(), exons[i].GetFrom());
3483  non_coding_introns[p->m_align->Strand()].insert(intron);
3484  }
3485  }
      // Mark members having an exon that fully contains one of those introns on the same strand.
3486  for(auto p : pointers) {
3487  int strand = p->m_align->Strand();
3488  auto& ncdi = non_coding_introns[strand];
3489  for(auto& exon : p->m_align->Exons()) {
3490  for(auto intronp = ncdi.upper_bound(exon.Limits()); intronp != ncdi.end() && intronp->GetFrom() < exon.GetTo(); ++intronp) {
3491  if(Include(exon.Limits(), *intronp)) {
3492  p->m_marked_for_deletion = true;
3493  break;
3494  }
3495  }
3496  if(p->m_marked_for_deletion)
3497  break;
3498  }
3499  }
3500  pointers.erase(std::remove_if(pointers.begin(),pointers.end(),MemberIsMarkedForDeletion),pointers.end());
3501  }
3502 
      // Final pass over the noncoding members.
3503  LeftRight(pointers);
3504  RightLeft(pointers);
3505 
3506  ITERATE(TContained, i, pointers) {
3507  SChainMember& mi = **i;
3509  mi.m_num = mi.m_left_num+mi.m_right_num-mi.m_num;
      // Only noncoding members are left, so no CDS is expected.
3510  _ASSERT(mi.m_cds == 0);
3511  }
3512 
3513  sort(pointers.begin(),pointers.end(),CdsNumOrder());
3514 
3515  NON_CONST_ITERATE(TContained, i, pointers) {
3516  SChainMember& mi = **i;
3517  if(mi.m_included || mi.m_internal)
3518  continue;
3519 
      // NOTE(review): the condition guarding this 'continue' (listing line 3520) is absent from this extract.
3521  continue;
3522 
3523  CChain chain(mi, false);
3524  chain.AddAllMembersAndCoverage(mi);
3525  if(!chain.SetConfirmedEnds(*m_gnomon, confirmed_ends))
3526  continue;
3527 
3528  chain.RemoveFshiftsFromUTRs();
      // The member is marked as included before clipping, so it stays marked even if the chain is dropped below.
3529  mi.MarkIncludedForChain();
3530  const CResidueVec& contig = m_gnomon->GetSeq();
3531  chain.ClipToCap(min_cap_blob, max_dist, min_flank_exon, secondary_peak, false);
3532  chain.ClipToPolyA(contig, min_polya_blob, max_dist, min_flank_exon, secondary_peak, tertiary_peak, tertiary_peak_coverage, false);
3533  chain.ClipLowCoverageUTR(minscor.m_utr_clip_threshold, false);
3534  if(!chain.SetConfirmedEnds(*m_gnomon, confirmed_ends))
3535  continue;
      // Noncoding chains are kept only when continuous and with more than one exon.
3536  if(chain.Continuous() && chain.Exons().size() > 1) {
3537 #ifdef _DEBUG
3538  chain.AddComment("Link2 "+GetLinkedIdsForMember(mi));
3539 #endif
3541  chain.CalculateDropLimits();
3542  tmp_chains.push_back(chain);
3543  }
3544  }
3545 
      // Give every chain a fresh ID, initially also used as its gene ID.
3546  NON_CONST_ITERATE(TChainList, it, tmp_chains) {
3547  CChain& chain = *it;
3548  chain.SetID(m_idnext);
3549  chain.SetGeneID(m_idnext);
3550  m_idnext += m_idinc;
3551  }
3552 
3553  CombineCompatibleChains(tmp_chains);
3554  SetFlagsForChains(tmp_chains);
3555 
      // Undo the temporary intersect_limit override made when enough_est was set.
3556  intersect_limit = old_oep;
3557 
3558  list<CGene> genes = FindGenes(tmp_chains); // assigns geneid, rank, skip, nested
3559 
3560  if(genes.size() > 1) {
3561  TrimAlignmentsIncludedInDifferentGenes(genes);
3562  CombineCompatibleChains(tmp_chains);
3563  SetFlagsForChains(tmp_chains);
3564  }
3565 
3566  if(genes.size() > 1)
3567  FindGenes(tmp_chains); // redo genes after trim
3568 
      // Copy the chains into the returned list after restoring trimmed ends.
3569  TGeneModelList chains;
3570  NON_CONST_ITERATE(TChainList, it, tmp_chains) {
3571  it->RestoreTrimmedEnds(trim);
3572  chains.push_back(*it);
3573  }
3574 
      // Report cap/polyA peak positions of non-skipped chains to cerr. Bit i of the stored value is set for
      // peak index i (matching eFirstPeak/eSecondPeak/eThirdPeak); eAs is set when ValidPolyA(...).second is true.
3575  enum { eFirstPeak = 1, eSecondPeak = 2, eThirdPeak = 4, eAs = 8};
3576  map<tuple<int, int, int>, int> cap_polya_info; // [cap/polya strand position]
3577  const CResidueVec& contig = m_gnomon->GetSeq();
3578  for(auto& chain : tmp_chains) {
3579  if(chain.Status()&CGeneModel::eSkipped)
3580  continue;
3581  if(chain.Status()&CGeneModel::eCap) {
3582  for(int i = 0; i < (int)chain.m_cap_peaks.size(); ++i) {
3583  int pos = chain.m_cap_peaks[i];
3584  if(pos >= 0)
3585  cap_polya_info[make_tuple(CGeneModel::eCap, chain.Strand(), pos)] |= (1 << i);
3586  }
3587  }
3588  if(chain.Status()&CGeneModel::ePolyA) {
3589  for(int i = 0; i < (int)chain.m_polya_peaks.size(); ++i) {
3590  int pos = chain.m_polya_peaks[i];
3591  if(pos >= 0) {
3592  cap_polya_info[make_tuple(CGeneModel::ePolyA, chain.Strand(), pos)] |= (1 << i);
3593  if(chain.ValidPolyA(pos, contig).second)
3594  cap_polya_info[make_tuple(CGeneModel::ePolyA, chain.Strand(), pos)] |= eAs;
3595  }
3596  }
3597  }
3598  }
3599  for(auto& info : cap_polya_info) {
3600  string determinant = get<0>(info.first) == CGeneModel::eCap ? "Cap" : "PolyA";
3601  char strand = get<1>(info.first) == ePlus ? '+' : '-';
      // Position mapped back from the edited contig and offset by m_limits.GetFrom()+1
      // (presumably a 1-based coordinate on the original sequence - confirm).
3602  int pos = m_edited_contig_map.MapEditedToOrig(get<2>(info.first))+m_limits.GetFrom()+1;
3603  cerr << m_contig_acc << ' ' << determinant << ' ' << strand << ' ' << pos << ' ';
3604  if(info.second&eFirstPeak)
3605  cerr << ":FirstPeak";
3606  if(info.second&eSecondPeak)
3607  cerr << ":SecondPeak";
3608  if(info.second&eThirdPeak)
3609  cerr << ":ThirdPeak";
3610  if(info.second&eAs)
3611  cerr << ":As";
3612  cerr << ":\n";
3613  }
3614 
3615  return chains;
3616 }
3617 
// Comparator over CGeneModel pointers.
// NOTE(review): the struct header (listing line 3618) is absent from this extract, so the type name
// is not visible here.
// Orders by ascending left limit, then by descending right limit (for equal left limits the longer
// alignment comes first), then by ID.
3619 {
3620  bool operator()(const CGeneModel* ap, const CGeneModel* bp)
3621  {
3622  if (ap->Limits().GetFrom() != bp->Limits().GetFrom()) return ap->Limits().GetFrom() < bp->Limits().GetFrom();
3623  if (ap->Limits().GetTo() != bp->Limits().GetTo()) return ap->Limits().GetTo() > bp->Limits().GetTo();
3624  return ap->ID() < bp->ID(); // to make sort deterministic
3625  }
3626 };
3627 
// Body of the optimal-chain search for one gapped protein alignment.
// NOTE(review): the signature line (listing line 3628) is missing from this copy. Judging by the calls
// FindOptimalChainForProtein(pointers, parts, palign) elsewhere in this file, the inputs are the candidate
// members `pointers`, the protein pieces `parts` and the combined protein alignment `palign`, and the
// result is the rightmost member of the selected chain - confirm against the header.
// Outline of the visible code:
//  - every member gets a scratch copy in no_gap_members that remembers its best connection without a gap;
//  - [first_member, last_member] is narrowed to the members around the span of palign;
//  - for each member i the parts it contains/overlaps are checked for compatibility, then earlier members j
//    are scanned for the best left connection (gapped, charged PGAP_PENALTY, or continuous via LRCanChainItoJ);
//  - the best member connected to all parts is returned after left links that point into the scratch
//    vector have been redirected to the real members.
3629  // Int8 id = parts.front()->ID();
3630 
3631  TIVec right_ends(pointers.size());
3632  vector<SChainMember> no_gap_members(pointers.size()); // temporary helper chain members; will be used for gap filling optimisation
3633  for(int k = 0; k < (int)pointers.size(); ++k) {
3634  SChainMember& mi = *pointers[k];
3635  right_ends[k] = mi.m_align->Limits().GetTo();
3636  no_gap_members[k] = mi;
3637  }
3638 
3639  SChainMember* best_right = 0;
3640 
// Walk back from the last member while members still reach leftpos; leftpos widens as members are accepted.
3641  int first_member = (int)pointers.size()-1;
3642  int leftpos = palign.Limits().GetFrom();
3643  for(int i = (int)pointers.size()-1; i >= 0; --i) {
3644  TSignedSeqRange limi = pointers[i]->m_align->Limits();
3645  if(limi.GetTo() >= leftpos) {
3646  first_member = i;
3647  leftpos = min(leftpos,limi.GetFrom());
3648  } else {
3649  break;
3650  }
3651  }
3652 
// Last member whose limits pass Include(limi, rightpos); rightpos widens as members are accepted.
3653  int last_member = 0;
3654  int rightpos = palign.Limits().GetTo();
3655  for(int i = 0; i < (int)pointers.size(); ++i) {
3656  TSignedSeqRange limi = pointers[i]->m_align->Limits();
3657  if(Include(limi,rightpos)) {
3658  last_member = i;
3659  rightpos = max(rightpos,limi.GetTo());
3660  }
3661  }
3662 
3663  int fully_connected_right = 0; // rightmost point already connected to all parts
3664 
3665  for(int i = first_member; i <= last_member; ++i) {
3666  SChainMember& mi = *pointers[i]; // best connection maybe gapped
3667  SChainMember& mi_no_gap = no_gap_members[i]; // best not gapped connection (if any)
3668  CGeneModel& ai = *mi.m_align;
3669  TContained micontained = mi.CollectContainedForMemeber();
3670  LRIinit(mi, micontained);
3671  mi_no_gap = mi;
3672  // LRIinit(mi_no_gap, mi_no_gap.CollectContainedForMemeber());
3673 
3674  if(ai.Strand() != palign.Strand())
3675  continue;
3676 
// part_to_connect ends up as the index of the last part that starts strictly left of ai (-1 if there is none).
3677  int part_to_connect = (int)parts.size()-1;
3678  while(part_to_connect >= 0 && ai.Limits().GetFrom() <= parts[part_to_connect]->Limits().GetFrom())
3679  --part_to_connect;
3680 
3681  if(fully_connected_right > 0 && ai.Limits().GetFrom() > fully_connected_right) // can't possibly be connected
3682  continue;
3683 
3684  // TContained micontained = mi.CollectContainedForMemeber();
3685  // sort(micontained.begin(),micontained.end(),LeftOrderD());
3686  bool not_sorted = true;
3687 
// Check every part to the right of part_to_connect that ai contains or overlaps: a contained part must
// agree in overlap, reading frame, stop and frameshifts; a partially overlapping part must agree in
// overlap and in the frameshifts inside the overlap. The scan stops at the first part ai does not touch.
3688  bool compatible_with_included_parts = true;
3689  int last_included_part = -1;
3690  bool includes_first_part = false;
3691  for(int p = part_to_connect+1; p < (int)parts.size(); ++p) {
3692  if(Include(ai.Limits(),parts[p]->Limits())) {
3693  TSignedSeqRange ai_rf = mi.m_cds_info->ReadingFrame();
3694  TSignedSeqRange aj_rf = parts[p]->GetCdsInfo().ReadingFrame();
3695  TSignedSeqRange ai_cds = mi.m_cds_info->Cds();
3696  TSignedSeqRange aj_cds = parts[p]->GetCdsInfo().Cds();
3697  bool compatible = (parts[p]->HasCompatibleOverlap(ai) && Include(ai_rf,aj_rf) && mi.m_align_map->FShiftedLen(ai_cds.GetFrom(),aj_cds.GetFrom(),false)%3==1);
3698  bool samestop = (parts[p]->GetCdsInfo().HasStop() == mi.m_cds_info->HasStop() && (!parts[p]->GetCdsInfo().HasStop() || parts[p]->GetCdsInfo().Stop() == mi.m_cds_info->Stop()));
3699  bool samefshifts = (parts[p]->FrameShifts() == StrictlyContainedInDels(ai.FrameShifts(), parts[p]->Limits()));
3700  if(compatible && samestop && samefshifts) {
3701  last_included_part = p;
3702  if(p == 0)
3703  includes_first_part = true;
3704  } else {
3705  compatible_with_included_parts = false;
3706  break;
3707  }
3708  } else if(ai.Limits().IntersectingWith(parts[p]->Limits())) {
3709  TSignedSeqRange overlap = (ai.Limits() & parts[p]->Limits());
3710  if(!parts[p]->HasCompatibleOverlap(ai) || StrictlyContainedInDels(ai.FrameShifts(), overlap) != StrictlyContainedInDels(parts[p]->FrameShifts(), overlap)) {
3711  compatible_with_included_parts = false;
3712  break;
3713  }
3714  } else {
3715  break;
3716  }
3717  }
3718 
3719  if(!compatible_with_included_parts)
3720  continue;
3721 
3722  _ASSERT(part_to_connect < 0 || part_to_connect == (int)parts.size()-1 || mi.m_type == eCDS); // coding if between parts
3723 
3724  if(includes_first_part) {
3725  mi.m_fully_connected_to_part = last_included_part;
3726  mi_no_gap.m_fully_connected_to_part = last_included_part;
3727  }
3728 
// jfirst is the first index whose recorded right end is not below the search key; if lower_bound finds
// nothing, jfirst stays 0 and all earlier members are scanned.
3729  TIVec::iterator lb = lower_bound(right_ends.begin(),right_ends.end(),(part_to_connect >= 0 ? parts[part_to_connect]->Limits().GetTo() : ai.Limits().GetFrom()));
3730  int jfirst = 0;
3731  if(lb != right_ends.end())
3732  jfirst = (int)(lb-right_ends.begin()); // skip all on the left side
3733 
3734  for(int j = jfirst; j < i; ++j) {
3735  SChainMember& mj = *pointers[j]; // best connection maybe gapped
3736  if(part_to_connect >= 0 && mj.m_fully_connected_to_part < part_to_connect) // alignment is not connected to all previous parts
3737  continue;
3738  CGeneModel& aj = *mj.m_align;
3739  if( ai.Strand() != aj.Strand())
3740  continue;
3741 
3742  SChainMember& mj_no_gap = no_gap_members[j]; // best not gapped connection (if any)
3743 
// Gapped connection: i starts to the right of j's end, between two parts.
// NOTE(review): listing line 3747, one more term of the condition below, is missing from this copy.
3744  if(ai.Limits().GetFrom() > aj.Limits().GetTo() && part_to_connect >= 0 && part_to_connect < (int)parts.size()-1 && // gap is not closed
3745  mj_no_gap.m_fully_connected_to_part == part_to_connect && // no additional gap
3746  mi.m_type == eCDS && mj.m_type == eCDS &&
3748  mi.m_cds_info->MaxCdsLimits().GetFrom() == TSignedSeqRange::GetWholeFrom()) { // reading frame not interrupted
3749 
3750 #define PGAP_PENALTY 120
3751 
3752  int newcds = mj_no_gap.m_left_cds+mi.m_cds - PGAP_PENALTY;
3753  double newnum = mj_no_gap.m_left_num+mi.m_num;
3754 
3755  if(mi.m_left_member == 0 || newcds > mi.m_left_cds || (newcds == mi.m_left_cds && newnum > mi.m_left_num)) {
3756  mi.m_left_cds = newcds;
3757  mi.m_left_num = newnum;
3758  mi.m_left_member = &mj_no_gap;
3759  mi.m_gapped_connection = true;
3760  mi.m_fully_connected_to_part = part_to_connect;
3761  }
3762  } else if(ai.Limits().IntersectingWith(aj.Limits())) {
3763  int delta_cds;
3764  double delta_num;
3765  double delta_splice_num;
3766  if(LRCanChainItoJ(delta_cds, delta_num, delta_splice_num, mi, mj, micontained, not_sorted)) { // i and j connected continuously
3767  int newcds = mj.m_left_cds+delta_cds;
3768  double newnum = mj.m_left_num+delta_num;
3769  double newsplicenum = mj.m_left_splice_num+delta_splice_num;
3770 
// Candidates are ranked by CDS length, then splice count (differences up to 0.001 count as a tie), then member count.
3771  bool better_connection = false;
3772  if(newcds != mi.m_left_cds) {
3773  better_connection = (newcds > mi.m_left_cds);
3774  } else if(fabs(newsplicenum - mi.m_left_splice_num) > 0.001) {
3775  better_connection = (newsplicenum > mi.m_left_splice_num);
3776  } else if(newnum > mi.m_left_num) {
3777  better_connection = true;
3778  }
3779 
3780  if (mi.m_left_member == 0 || better_connection) {
3781  mi.m_left_cds = newcds;
3782  mi.m_left_splice_num = newsplicenum;
3783  mi.m_left_num = newnum;
// NOTE(review): listing line 3784 is missing from this copy.
3785  mi.m_left_member = &mj;
3786  mi.m_fully_connected_to_part = part_to_connect;
3787  if(!mi.m_gapped_connection)
3788  mi_no_gap = mi;
3789  } else if(mj_no_gap.m_fully_connected_to_part == part_to_connect) {
// The gapped path through mj was not better for mi; try the ungapped path through mj_no_gap for mi_no_gap.
3790  newcds = mj_no_gap.m_left_cds+delta_cds;
3791  newnum = mj_no_gap.m_left_num+delta_num;
3792  newsplicenum = mj_no_gap.m_left_splice_num+delta_splice_num;
3793 
3794  better_connection = false;
3795  if(newcds != mi_no_gap.m_left_cds) {
3796  better_connection = (newcds > mi_no_gap.m_left_cds);
3797  } else if(fabs(newsplicenum - mi_no_gap.m_left_splice_num) > 0.001) {
3798  better_connection = (newsplicenum > mi_no_gap.m_left_splice_num);
3799  } else if(newnum > mi_no_gap.m_left_num) {
3800  better_connection = true;
3801  }
3802 
3803  if (mi_no_gap.m_left_member == 0 || better_connection) {
3804  mi_no_gap.m_left_cds = newcds;
3805  mi_no_gap.m_left_splice_num = newsplicenum;
3806  mi_no_gap.m_left_num = newnum;
3807  mi_no_gap.m_left_member = &mj_no_gap;
3808  mi_no_gap.m_fully_connected_to_part = part_to_connect;
3809  }
3810  }
3811  }
3812  }
3813  }
3814 
// A member that contains parts itself closes any gap up to the last contained part.
3815  if(mi.m_left_member != 0 && last_included_part >= 0) {
3816  mi.m_fully_connected_to_part = last_included_part;
3817  mi.m_gapped_connection = false;
3818  mi_no_gap = mi;
3819  }
3820 
3821  if(mi.m_fully_connected_to_part == (int)parts.size()-1) { // includes all parts
3822  fully_connected_right = max(fully_connected_right,mi.m_align->Limits().GetTo());
3823 
3824  if(best_right == 0 || (mi.m_left_cds > best_right->m_left_cds || (mi.m_left_cds == best_right->m_left_cds && mi.m_left_num > best_right->m_left_num)) )
3825  best_right = &mi;
3826  }
3827  }
3828 
3829  _ASSERT(best_right != 0);
3830 
3831  _ASSERT(std::less<SChainMember*>()(best_right, &no_gap_members.front()) || std::less<SChainMember*>()(&no_gap_members.back(), best_right)); // don't point to temporary vector
3832  // _ASSERT(best_right < &no_gap_members.front() || best_right > &no_gap_members.back()); // don't point to temporary vector
// Replace every left link that points into no_gap_members by the real member at the same index,
// after copying the scratch state into that member.
3833  for (SChainMember* mp = best_right; mp != 0; mp = mp->m_left_member) {
3834  if(!std::less<SChainMember*>()(mp->m_left_member, &no_gap_members.front()) && !std::less<SChainMember*>()(&no_gap_members.back(), mp->m_left_member)) { // points to temporary vector
3835  // if(mp->m_left_member >= &no_gap_members.front() && mp->m_left_member <= &no_gap_members.back()) { // points to temporary vector
3836  SChainMember* p = pointers[mp->m_left_member-&no_gap_members.front()];
3837  *p = *mp->m_left_member;
3838  mp->m_left_member = p;
3839  }
3840  }
3841 
3842  return best_right;
3843 }
3844 
// Comparator for lists of protein parts (vector<CGeneModel*>).
// NOTE(review): the struct declaration line (listing line 3845) and one member line (3848, presumably the
// orig_aligns reference initialized by the constructor) are missing from this copy.
// Lists are ordered by the sum of AlignLen() over their parts, longest first; lists with equal totals
// are ordered by the target id of the original alignment of each list's first part.
3846 {
3847  AlignLenOrder(TOrigAligns& oa) : orig_aligns(oa) {}
3849 
3850  bool operator()(const vector<CGeneModel*>* ap, const vector<CGeneModel*>* bp)
3851  {
3852  const vector<CGeneModel*>& partsa = *ap;
3853  const vector<CGeneModel*>& partsb = *bp;
3854 
3855  int align_lena = 0;
3856  ITERATE(vector<CGeneModel*>, k, partsa)
3857  align_lena += (*k)->AlignLen();
3858 
3859  int align_lenb = 0;
3860  ITERATE(vector<CGeneModel*>, k, partsb)
3861  align_lenb += (*k)->AlignLen();
3862 
3863  if(align_lena != align_lenb) {
3864  return align_lena > align_lenb;
3865  } else {
3866  return *orig_aligns[partsa.front()->ID()]->GetTargetId() < *orig_aligns[partsb.front()->ID()]->GetTargetId(); // to make sort deterministic
3867  }
3868  }
3869 };
3870 
3872 
// Builds chains for protein alignments that are split into several parts.
// NOTE(review): the signature line (listing line 3871) is missing from this copy; chains, pointers_all,
// unma_aligns and unma_members are used below without a visible declaration and are presumably
// parameters - confirm against the header.
// Outline of the visible code:
//  - collect the parts of each protein by alignment ID and keep the proteins with more than one part;
//  - process proteins longest first, skipping a protein that is already a sub-alignment of a continuous chain;
//  - select candidate members and find the best chain with FindOptimalChainForProtein;
//  - if holes remain that a single piece of the unmodified alignment spans, retry with parts cut from
//    the unmodified alignment;
//  - finish the chain (confirmed ends, scoring, cap/polyA/UTR clipping) and append it to chains.
3873  sort(pointers_all.begin(),pointers_all.end(),LeftOrderD());
3874 
3875  typedef map<Int8, vector<CGeneModel*> > TIdChainMembermap;
3876  TIdChainMembermap protein_parts;
3877  for(int k = 0; k < (int)pointers_all.size(); ++k) {
3878  SChainMember& mi = *pointers_all[k];
3879 
3880  if((mi.m_align->Type() & CGeneModel::eProt) && (mi.m_copy == 0 || mi.m_cds_info->HasStart())) { // only prots with start can have copies
3881  protein_parts[mi.m_align->ID()].push_back(mi.m_align);
3882  }
3883  }
3884 
3885  vector<vector<CGeneModel*>*> gapped_sorted_protein_parts;
3886  NON_CONST_ITERATE(TIdChainMembermap, ip, protein_parts) {
3887  vector<CGeneModel*>& parts = ip->second;
3888  if(parts.size() > 1) {
3889  sort(parts.begin(),parts.end(),AlignSeqOrder());
3890  gapped_sorted_protein_parts.push_back(&parts);
3891  }
3892  }
3893  sort(gapped_sorted_protein_parts.begin(),gapped_sorted_protein_parts.end(),AlignLenOrder(orig_aligns));
3894 
3895  NON_CONST_ITERATE(vector<vector<CGeneModel*>*>, ip, gapped_sorted_protein_parts) { // make chains starting from long proteins
3896  vector<CGeneModel*>& parts = **ip;
3897  Int8 id = parts.front()->ID();
3898 
// palign is the union of all parts, each added with its 5' CDS limit cleared.
3899  CGeneModel palign(parts.front()->Strand(), id, CGeneModel::eProt);
3900  ITERATE(vector<CGeneModel*>, k, parts) {
3901  CGeneModel part = **k;
3902  CCDSInfo cds = part.GetCdsInfo();
3903  cds.Clear5PrimeCdsLimit();
3904  part.SetCdsInfo(cds);
3905  palign.Extend(part);
3906  }
3907  m_gnomon->GetScore(palign);
3908 
3909  bool connected = false;
3910  NON_CONST_ITERATE(TChainList, k, chains) {
3911  if(k->Continuous() && palign.Strand() == k->Strand() && palign.IsSubAlignOf(*k)) {
3912  connected = true;
3913 #ifdef _DEBUG
3914  k->AddComment("Was connected "+orig_aligns[palign.ID()]->TargetAccession());
3915 #endif
3916  break;
3917  }
3918  }
3919  if(connected)
3920  continue;
3921 
// Select the members that may take part in the chain for this protein.
3922  TContained pointers;
3923  for(int k = 0; k < (int)pointers_all.size(); ++k) {
3924  SChainMember* mip = pointers_all[k];
// NOTE(review): listing lines 3925 and 3927 are missing from this copy; they presumably declare `limits`
// (used below) and hold the condition that guards the following `continue` - confirm.
3926 
3928  continue;
3929 
3930  if((mip->m_type != eCDS || !Include(mip->m_cds_info->MaxCdsLimits(),mip->m_align->Limits())) && Include(palign.Limits(),mip->m_align->Limits())) // skip all not entirely coding inside protein alignment
3931  continue;
3932 
3933  if(mip->m_align->Exons().front().m_ssplice_sig == "XX" && Include(palign.Limits(),mip->m_align->Exons().front().Limits())) // skip 3'/5' cdna gapfillers inside protein alignment
3934  continue;
3935 
3936  if(mip->m_align->Exons().back().m_fsplice_sig == "XX" && Include(palign.Limits(),mip->m_align->Exons().back().Limits())) // skip 3'/5' cdna gapfillers inside protein alignment
3937  continue;
3938 
3939  if(palign.Limits().IntersectingWith(limits)) { // skip not compatible alignments
3940  bool compatible = true;
3941  for(CGeneModel* partp : parts) {
3942  if((mip->m_align->ID() != id && Include(partp->Limits(), limits)) || (partp->Limits().IntersectingWith(limits) && !partp->HasCompatibleOverlap(*mip->m_align))) {
3943  compatible = false;
3944  break;
3945  }
3946  }
3947  if(!compatible)
3948  continue;
3949  }
3950 
3951  pointers.push_back(mip);
3952  }
3953 
3954  SChainMember* best_right = FindOptimalChainForProtein(pointers, parts, palign);
3955 
3956  best_right->m_right_member = 0;
3957  // CChain chain(*best_right, &palign, false, false);
3958  CChain chain(*best_right, false);
3959  chain.m_gapped_helper_align = palign;
3960 
3961  if(unmodified_aligns.count(id)) { // some unmodified alignments are deleted if they interfere with a gap
3962  CGeneModel unma = unmodified_aligns[id];
// remaining_holes: every unspliced exon junction left in the chain.
// new_holes: those of them that lie strictly inside one spliced piece of the unmodified alignment.
3963  vector<TSignedSeqRange> new_holes;
3964  vector<TSignedSeqRange> remaining_holes;
3965  for(int k = 1; k < (int)chain.Exons().size(); ++k) {
3966  CModelExon exonl = chain.Exons()[k-1];
3967  CModelExon exonr = chain.Exons()[k];
3968  if(!(exonl.m_ssplice && exonr.m_fsplice)) {
3969  TSignedSeqRange h(exonl.GetTo()+1,exonr.GetFrom()-1);
3970  remaining_holes.push_back(h);
3971  for(int piece_begin = 0; piece_begin < (int)unma.Exons().size(); ++piece_begin) {
3972  int piece_end = piece_begin;
3973  for( ; piece_end < (int)unma.Exons().size() && unma.Exons()[piece_end].m_ssplice; ++piece_end);
3974  if(unma.Exons()[piece_begin].GetFrom() < h.GetFrom() && unma.Exons()[piece_end].GetTo() > h.GetTo()) {
3975  new_holes.push_back(h);
3976  break;
3977  }
3978  piece_begin = piece_end;
3979  }
3980  }
3981  }
3982 
3983  if(!new_holes.empty()) { // failed to connect all parts - try unsupported introns
3984  CAlignMap umap = unma.GetAlignMap();
3985  if(unma.Limits() != palign.Limits()) {
3986  TSignedSeqRange lim = umap.ShrinkToRealPoints(palign.Limits(), true);
3987  unma.Clip(lim,CGeneModel::eRemoveExons);
3988  }
3989 
3990  vector<TSignedSeqRange> existed_holes;
3991  for(int k = 1; k < (int)unma.Exons().size(); ++k) {
3992  CModelExon exonl = unma.Exons()[k-1];
3993  CModelExon exonr = unma.Exons()[k];
3994  if(!(exonl.m_ssplice && exonr.m_fsplice))
3995  existed_holes.push_back(TSignedSeqRange(exonl.GetTo()+1,exonr.GetFrom()-1));
3996  }
3997 
3998  for(int k = 1; k < (int)palign.Exons().size(); ++k) { // cut holes which were connected or existed
3999  CModelExon exonl = palign.Exons()[k-1];
4000  CModelExon exonr = palign.Exons()[k];
4001  if(!(exonl.m_ssplice && exonr.m_fsplice)) {
4002  TSignedSeqRange hole(exonl.GetTo()+1,exonr.GetFrom()-1);
4003  bool connected = true;
4004  ITERATE(vector<TSignedSeqRange>, h, remaining_holes) {
4005  _ASSERT(Include(unma.Limits(), *h));
4006  if(Include(hole, *h)) {
4007  connected = false;
4008  break;
4009  }
4010  }
4011 
4012  bool existed = false;
4013  ITERATE(vector<TSignedSeqRange>, h, existed_holes) {
4014  if(Include(hole, *h)) {
4015  existed = true;
4016  break;
4017  }
4018  }
4019 
// The hole is cut out of unma only when both of its edges coincide with real alignment points.
4020  if(connected || existed) {
4021  TSignedSeqRange left = umap.ShrinkToRealPoints(TSignedSeqRange(unma.Limits().GetFrom(),hole.GetFrom()-1), true);
4022  TSignedSeqRange right = umap.ShrinkToRealPoints(TSignedSeqRange(hole.GetTo()+1,unma.Limits().GetTo()), true);
4023  if(left.GetTo()+1 == hole.GetFrom() && right.GetFrom()-1 == hole.GetTo())
4024  unma.CutExons(hole);
4025  }
4026  }
4027  }
4028  m_gnomon->GetScore(unma);
4029 
4030  TGeneModelList unmacl;
4031  unmacl.push_back(unma);
4032  CutParts(unmacl);
4033 
4034  vector<CGeneModel*> unmaparts;
4035  NON_CONST_ITERATE(TGeneModelList, im, unmacl) {
4036  m_gnomon->GetScore(*im);
4037  unmaparts.push_back(&(*im));
4038  }
4039 
4040  CChainMembers unmapointers(unmacl, orig_aligns, unmodified_aligns);
4041  Duplicate5pendsAndShortCDSes(unmapointers);
4042  sort(pointers.begin(),pointers.end(),GenomeOrderD());
4043  ITERATE(TContained, ip, unmapointers) {
4044  SChainMember& mi = **ip;
4045  IncludeInContained(mi, mi); // include self
4046  // ITERATE(TContained, jp, pointers) {
4047  ITERATE(TContained, jp, pointers_all) {
4048  SChainMember& mj = **jp;
4049  if(CanIncludeJinI(mi, mj))
4050  IncludeInContained(mi, mj);
4051  }
4052  }
4053 
4054  ITERATE(TContained, ip, unmapointers) {
4055  _ASSERT((*ip)->m_orig_align);
4056  (*ip)->m_mem_id = -(*ip)->m_mem_id; // unique m_mem_id
4057  pointers.push_back(*ip);
4058  }
4059 
4060  sort(pointers.begin(),pointers.end(),LeftOrderD());
4061  best_right = FindOptimalChainForProtein(pointers, unmaparts, unma);
4062  ITERATE(TContained, jp, unmapointers) { // add parts in case they were 'shadowed' by longer or identical alignment
4063  SChainMember& mj = **jp;
4064  bool present = false;
4065  for(SChainMember* ip = best_right; ip != 0 && !present; ip = ip->m_left_member)
4066  present = ip == &mj;
4067  for(SChainMember* ip = best_right; ip != 0 && !present; ip = ip->m_left_member) {
4068  SChainMember& mi = *ip;
4069  if(CanIncludeJinI(mi, mj)) {
4070  IncludeInContained(mi, mj);
4071  break;
4072  }
4073  }
4074  }
4075  // chain = CChain(*best_right, &unma, false, false);
4076  chain = CChain(*best_right, false);
4077  chain.m_gapped_helper_align = unma;
// Ownership of the cut alignments and their members moves to the lists used outside this loop.
4078  unma_aligns.splice(unma_aligns.end(), unmacl);
4079  unma_members.SpliceFromOther(unmapointers);
4080  }
4081  }
4082 
4083  if(!chain.SetConfirmedEnds(*m_gnomon, confirmed_ends))
4084  continue;
4085 
4086  if(!chain.RestoreReasonableConfirmedStart(*m_gnomon, orig_aligns))
4087  m_gnomon->GetScore(chain);
4088  if(chain.Score() == BadScore())
4089  continue;
4090  chain.AddAllMembersAndCoverage(*best_right);
4091  chain.RemoveFshiftsFromUTRs();
4092  /*
4093  m_gnomon->GetScore(chain);
4094  if(chain.Score() == BadScore())
4095  continue;
4096  chain.RemoveFshiftsFromUTRs();
4097  chain.RestoreReasonableConfirmedStart(*m_gnomon, orig_aligns);
4098  */
4099  const CResidueVec& contig = m_gnomon->GetSeq();
4100  chain.ClipToCap(min_cap_blob, max_dist, min_flank_exon, secondary_peak, false);
4101  chain.ClipToPolyA(contig, min_polya_blob, max_dist, min_flank_exon, secondary_peak, tertiary_peak, tertiary_peak_coverage, false);
4102  chain.ClipLowCoverageUTR(minscor.m_utr_clip_threshold, false);
4103  if(!chain.SetConfirmedEnds(*m_gnomon, confirmed_ends))
4104  continue;
4105  m_gnomon->GetScore(chain, !no5pextension); // this will return CDS to best/longest depending on no5pextension
// NOTE(review): listing lines 4106-4107 are missing from this copy.
4108  chain.CalculateDropLimits();
4109  _ASSERT( chain.FShiftedLen(chain.GetCdsInfo().Start()+chain.ReadingFrame()+chain.GetCdsInfo().Stop(), false)%3==0 );
4110 
4111 #ifdef _DEBUG
4112  chain.AddComment("Connected "+orig_aligns[palign.ID()]->TargetAccession());
4113  chain.AddComment("LinkForGapped "+GetLinkedIdsForMember(*best_right));
4114 #endif
4115  chains.push_back(chain);
4116  }
4117 }
4118 
4120 
// Post-processing pass over all chains.
// NOTE(review): the signature line (listing line 4119) is missing from this copy; `chains` is presumably
// the parameter and the function is presumably SetFlagsForChains - confirm against the header.
// Visible steps:
//  1. build per-strand, per-frame protein coverage (prot_cov) over the span of all original alignments;
//  2. for every chain refresh start/stop, trusted-evidence, placement and coverage information and set
//     the ecDNAIntrons / eFullSupCDS status bits;
//  3. for coding chains without a protein reading frame, use prot_cov to mark the in-frame
//     protein-covered part of the reading frame and add eProt to the chain type.
4121  int left = numeric_limits<int>::max();
4122  int right = 0;
4123  ITERATE(TOrigAligns, it, orig_aligns) {
4124  const CAlignModel& align = *it->second;
4125  left = min(left,align.Limits().GetFrom());
4126  right = max(right,align.Limits().GetTo());
4127  }
4128 
4129  int len = right-left+1;
4130 
// prot_cov[strand][frame][position-left]: number of protein alignments covering the position in that frame.
4131  vector<int> prot_cov[2][3];
4132  prot_cov[0][0].resize(len,0);
4133  prot_cov[0][1].resize(len,0);
4134  prot_cov[0][2].resize(len,0);
4135  prot_cov[1][0].resize(len,0);
4136  prot_cov[1][1].resize(len,0);
4137  prot_cov[1][2].resize(len,0);
4138  ITERATE(TOrigAligns, it, orig_aligns) {
4139  const CAlignModel& align = *it->second;
4140  if(align.GetCdsInfo().ProtReadingFrame().NotEmpty()) {
4141  CAlignMap amap = align.GetAlignMap();
4142  int cdstr = amap.MapOrigToEdited(align.GetCdsInfo().Cds().GetFrom());
4143  for(int i = 0; i < (int)align.Exons().size(); ++i) {
4144  TSignedSeqRange rf = (align.Exons()[i].Limits() & align.ReadingFrame());
4145  if(rf.NotEmpty()) {
4146  for(int j = rf.GetFrom(); j <= rf.GetTo(); ++j) {
4147  int jtr = amap.MapOrigToEdited(j);
4148  if(jtr >= 0)
4149  ++prot_cov[align.Strand()][abs(cdstr-jtr)%3][j-left];
4150  }
4151  }
4152  }
4153  }
4154  }
4155 
// NOTE(review): listing line 4156, which presumably declares `scope`, is missing from this copy.
4157  scope.AddDefaults();
4158 
4159  SMatrix matrix;
4160 
4161  const CResidueVec& contig = m_gnomon->GetSeq();
4162 
4163  NON_CONST_ITERATE(TChainList, it, chains) {
4164  CChain& chain = *it;
4165  // chain.RestoreReasonableConfirmedStart(*m_gnomon, orig_aligns);
4166  chain.SetOpenForPartialyAlignedProteins(prot_complet);
4167  chain.SetConfirmedStartStopForCompleteProteins(prot_complet, minscor);
4168  chain.CollectTrustedmRNAsProts(orig_aligns, minscor, scope, matrix, contig);
4169  chain.SetBestPlacement(orig_aligns);
4170  chain.SetConsistentCoverage();
// ecDNAIntrons: set when the chain has at least one junction without an "XX" splice signature and every
// such junction is found in mrna_count, est_count or rnaseq_count.
4171  if(chain.Continuous() && chain.Exons().size() > 1) {
4172  bool allcdnaintrons = true;
4173  int num = 0;
4174  for(int i = 1; i < (int)chain.Exons().size() && allcdnaintrons; ++i) {
4175  if(chain.Exons()[i-1].m_ssplice_sig != "XX" && chain.Exons()[i].m_fsplice_sig != "XX") {
4176  TSignedSeqRange intron(TSignedSeqRange(chain.Exons()[i-1].GetTo(),chain.Exons()[i].GetFrom()));
4177  allcdnaintrons = (mrna_count[intron]+est_count[intron]+rnaseq_count[intron] > 0);
4178  ++num;
4179  }
4180  }
4181  if(allcdnaintrons && num >0)
4182  chain.Status() |= CGeneModel::ecDNAIntrons;
4183  }
4184  if (chain.FullCds()) {
4185  chain.Status() |= CGeneModel::eFullSupCDS;
4186  }
4187 
4188  if(chain.GetCdsInfo().ProtReadingFrame().Empty() && chain.ReadingFrame().NotEmpty()) { // coding chain without protein support
4189  int protcds = 0;
4190  int lrf_from_proteins = numeric_limits<int>::max();
4191  int rrf_from_proteins = 0;
4192  CAlignMap amap = chain.GetAlignMap();
4193  int cdstr = amap.MapOrigToEdited(chain.GetCdsInfo().Cds().GetFrom());
4194  for(int i = 0; i < (int)chain.Exons().size(); ++i) {
4195  TSignedSeqRange rf = (chain.Exons()[i].Limits() & chain.ReadingFrame());
4196  if(rf.NotEmpty()) {
4197  for(int j = rf.GetFrom(); j <= rf.GetTo(); ++j) {
4198  if(j < left || j > right)
4199  continue;
4200 
4201  int jtr = amap.MapOrigToEdited(j);
4202  int frame = abs(cdstr-jtr)%3;
4203  if(jtr >= 0 && prot_cov[chain.Strand()][frame][j-left] > 0) {
4204  if(frame == 0)
4205  lrf_from_proteins = min(lrf_from_proteins,j);
4206  if(frame == 2)
4207  rrf_from_proteins = max(rrf_from_proteins,j);
4208  ++protcds;
4209  }
4210  }
4211  }
4212  }
// Protein support is added only if more than 20% of the CDS length is covered in frame.
4213  if(protcds > 0.2*amap.FShiftedLen(chain.GetCdsInfo().Cds()) && rrf_from_proteins > lrf_from_proteins) {
4214  CCDSInfo cds = chain.GetCdsInfo();
4215  TSignedSeqRange reading_frame = cds.ReadingFrame();
// NOTE(review): the first call presumably records the protein-supported sub-range (second argument true)
// and the second call restores the full reading frame - confirm against CCDSInfo::SetReadingFrame.
4216  cds.SetReadingFrame(reading_frame&TSignedSeqRange(lrf_from_proteins,rrf_from_proteins), true);
4217  cds.SetReadingFrame(reading_frame);
4218  chain.SetCdsInfo(cds);
4219  chain.SetType(chain.Type()|CGeneModel::eProt);
4220 
4221 #ifdef _DEBUG
4222  chain.AddComment("Added protsupport");
4223 #endif
4224  }
4225  }
4226  }
4227 }
4228 
4229 
// Merges chains that are sub-alignments of other chains.
// NOTE(review): the signature line (listing line 4230) is missing from this copy; `chains` is presumably
// the parameter and the function is presumably CombineCompatibleChains - confirm against the header.
// For each pair of different, non-skipped chains on the same strand where jtt is a sub-alignment of itt
// and both are coding or both are non-coding, the members of jtt are offered to itt and jtt is marked
// eSkipped. Coding pairs must also pass the MaxCdsLimits inclusion test, agree in frameshifts inside
// jtt's limits, be in the same frame, and every premature stop of itt inside jtt's limits must also
// be present in jtt.
4231  for(TChainList::iterator itt = chains.begin(); itt != chains.end(); ++itt) {
4232  if(itt->Status()&CGeneModel::eSkipped)
4233  continue;
4234  CCDSInfo::TPStops istops = itt->GetCdsInfo().PStops();
4235  for(TChainList::iterator jt = chains.begin(); jt != chains.end();) {
4236  TChainList::iterator jtt = jt++;
4237  if(jtt->Status()&CGeneModel::eSkipped)
4238  continue;
4239 
4240  if(itt != jtt && itt->Strand() == jtt->Strand() && jtt->IsSubAlignOf(*itt) && itt->ReadingFrame().Empty() == jtt->ReadingFrame().Empty()) {
4241  if(itt->ReadingFrame().NotEmpty()) {
4242  TSignedSeqRange icds = itt->GetCdsInfo().Cds();
4243  TSignedSeqRange jcds = jtt->GetCdsInfo().Cds();
4244  TSignedSeqPos a = min(icds.GetFrom(),jcds.GetFrom());
4245  TSignedSeqPos b = max(icds.GetFrom(),jcds.GetFrom());
4246  bool same_frame = (itt->FShiftedLen(a, b, false)-1)%3 == 0;
4247  /*
4248  if(itt->Score() > jtt->Score() && (icds&jcds).Empty() && same_frame
4249  && ((itt->Strand() == ePlus && Precede(icds, jcds)) || (itt->Strand() == eMinus && Precede(jcds, icds)))) {
4250  jtt->AddComment("Secondary CDS for "+to_string(itt->ID()));
4251  }
4252  */
4253 
4254  if(!Include(jtt->MaxCdsLimits(), itt->MaxCdsLimits()))
4255  continue;
4256 
4257  if(jtt->FrameShifts() != StrictlyContainedInDels(itt->FrameShifts(), jtt->Limits()))
4258  continue;
4259 
4260  if(!same_frame)
4261  continue;
4262 
4263  CCDSInfo::TPStops jstops = jtt->GetCdsInfo().PStops();
4264  bool same_stops = true;
4265  ITERATE(CCDSInfo::TPStops, istp, istops) {
4266  if(Include(jtt->Limits(),*istp) && find(jstops.begin(), jstops.end(), *istp) == jstops.end()) {
4267  same_stops = false;
4268  break;
4269  }
4270  }
4271  if(!same_stops)
4272  continue;
4273  /* current algorithm prefers shorter CDS with less pstops (still includes longer protein as evidence)
4274  ITERATE(CCDSInfo::TPStops, jstp, jstops) {
4275  if(find(istops.begin(), istops.end(), *jstp) == istops.end()) {
4276  same_stops = false;
4277  break;
4278  }
4279  }
4280  if(!same_stops)
4281  continue;
4282  */
4283  }
4284 
// support: everything itt already contains (members and their copies), used to avoid adding duplicates.
4285  TMemberPtrSet support;
4286  ITERATE(TContained, i, itt->m_members) {
4287  support.insert(*i);
4288  if((*i)->m_copy != 0)
4289  support.insert((*i)->m_copy->begin(),(*i)->m_copy->end());
4290  }
4291  TSignedSeqRange jlimits = jtt->m_supported_range;
4292  ITERATE(TContained, i, jtt->m_members) {
4293  TSignedSeqRange il = (*i)->m_align->Limits();
4294  if(!jlimits.IntersectingWith(il))
4295  continue;
4296  if(((*i)->m_align->Status()&CGeneModel::eLeftFlexible) && il.GetTo() > jlimits.GetTo())
4297  continue;
4298  if(((*i)->m_align->Status()&CGeneModel::eRightFlexible) && il.GetFrom() < jlimits.GetFrom())
4299  continue;
4300 
// Note that support.insert() runs even when the second test fails, so a rejected member is still remembered as seen.
4301  if(support.insert(*i).second && (Include(jlimits, il) || itt->HasCompatibleOverlap(*(*i)->m_align, 1))) {
4302  itt->m_was_combined = true;
4303  itt->m_members.push_back(*i);
4304  if((*i)->m_copy != 0)
4305  support.insert((*i)->m_copy->begin(),(*i)->m_copy->end());
4306  }
4307  }
4308  if(itt->m_was_combined) {
4309  sort(itt->m_members.begin(),itt->m_members.end(),GenomeOrderD());
4310  itt->CalculateSupportAndWeightFromMembers();
4311  }
4312 
4313  jtt->Status() |= CGeneModel::eSkipped;
4314  }
4315  }
4316  }
4317 }
4318 
4319 double CChainer::CChainerImpl::GoodCDNAScore(const CGeneModel& algn, bool simple)
4320 {
4321  if(algn.FShiftedLen(algn.GetCdsInfo().Cds(),true) > minscor.m_cds_len)
4322  return 0.99*BadScore();
4323 
4324  if(((algn.Type()&CGeneModel::eProt)!=0 || algn.ConfirmedStart()) && algn.FShiftedLen(algn.GetCdsInfo().ProtReadingFrame(),true) > minscor.m_prot_cds_len)
4325  return 0.99*BadScore();
4326 
4327  return minscor.m_min;
4328 
4329  /*
4330  int cdslen = algn.RealCdsLen();
4331  int len = algn.AlignLen();
4332 
4333  if(simple)
4334  return max(0.,minscor.m_min-minscor.m_cds_bonus*cdslen);
4335 
4336  int intron_left = 0, intron_internal = 0, intron_total =0;
4337  for(int i = 1; i < (int)algn.Exons().size(); ++i) {
4338  if(!algn.Exons()[i-1].m_ssplice || !algn.Exons()[i].m_fsplice) continue;
4339 
4340  ++intron_total;
4341  if(algn.Exons()[i].GetFrom()-1 < algn.RealCdsLimits().GetFrom()) ++intron_left;
4342  if(algn.Exons()[i-1].GetTo()+1 > algn.RealCdsLimits().GetFrom() && algn.Exons()[i].GetFrom()-1 < algn.RealCdsLimits().GetTo()) ++intron_internal;
4343  }
4344 
4345  int intron_3p, intron_5p;
4346  if(algn.Strand() == ePlus) {
4347  intron_5p = intron_left;
4348  intron_3p = intron_total -intron_5p - intron_internal;
4349  } else {
4350  intron_3p = intron_left;
4351  intron_5p = intron_total -intron_3p - intron_internal;
4352  }
4353 
4354  return max(0.,minscor.m_min+minscor.m_i5p_penalty*intron_5p+minscor.m_i3p_penalty*intron_3p-minscor.m_cds_bonus*cdslen+minscor.m_length_penalty*len);
4355  */
4356 }
4357 
4358 
// NOTE(review): the signature line (listing line 4359) is missing from this copy; `algn` and `minscor`
// are presumably the parameters of RemovePoorCds - confirm against the header.
// Replaces the CDS information of algn with an empty CCDSInfo when its score is below minscor.
4360 {
4361  if (algn.Score() < minscor)
4362  algn.SetCdsInfo(CCDSInfo());
4363 }
4364 
// Width, in mRNA positions, of the averaging window used to smooth raw coverage;
// SCAN_WINDOW/2 is used as the half-width offset when the raw values are stored.
4365 #define SCAN_WINDOW 49 // odd number!!!
4366 
// Collects the members of the chain and computes its smoothed coverage profile.
// NOTE(review): the signature line (listing line 4367) is missing from this copy; `mbr` is presumably
// the parameter of CChain::AddAllMembersAndCoverage - confirm against the header.
// m_members is filled from mbr.CollectContainedForChain() and sorted, and every member is marked
// postponed. m_coverage then gets one value per mRNA position: the mean over a SCAN_WINDOW-wide
// window of the summed weights of the eSR members covering each position.
4368  m_members = mbr.CollectContainedForChain();
4369  _ASSERT(m_members.size()>0);
4370  sort(m_members.begin(),m_members.end(),GenomeOrderD());
4371 
4372  for(SChainMember* mi : m_members)
4373  mi->MarkPostponed();
4374 
4375  // CalculateSupportAndWeightFromMembers(false);
4376 
4377  CAlignMap amap = GetAlignMap();
4378  int mrna_len = amap.FShiftedLen(Limits());
// Raw coverage is stored shifted by SCAN_WINDOW/2, leaving zero padding on both sides of the mRNA.
4379  vector<double> coverage_raw(mrna_len+SCAN_WINDOW);
4380  ITERATE (TContained, it, m_members) {
4381  const CGeneModel& align = *(*it)->m_align;
// NOTE(review): listing line 4382, the condition guarding the following `continue`, is missing from this copy.
4383  continue;
4384 
4385  TSignedSeqRange overlap = Limits()&align.Limits(); // theoretically some ends could be outside (partially trimmed from other chain and combined)
4386  if(align.Type() == CGeneModel::eSR && overlap.NotEmpty()) {
4387  TSignedSeqRange on_mrna = amap.MapRangeOrigToEdited(overlap); // for align partially in a hole will give the hole boundary
4388  for(int i = on_mrna.GetFrom(); i <= on_mrna.GetTo(); ++i)
4389  coverage_raw[i+SCAN_WINDOW/2] += align.Weight();
4390  }
4391  }
4392 
4393  m_coverage.clear();
4394  m_coverage.resize(mrna_len);
// Sliding-window mean: start with the first window, then drop the leftmost and add the next value at each step.
4395  double cov = 0;
4396  for(int i = 0; i < SCAN_WINDOW; ++i)
4397  cov += coverage_raw[i]/SCAN_WINDOW;
4398  for(int i = 0; i < mrna_len; ++i) { // will decrease coverage in SCAN_WINDOW/2 end intervals
4399  m_coverage[i] = cov;
4400  cov -= coverage_raw[i]/SCAN_WINDOW;
4401  cov += coverage_raw[i+SCAN_WINDOW]/SCAN_WINDOW;
4402  }
4403 }
4404 
// Builds a chain from the seed member `mbr` plus every member reachable through
// its m_right_member and m_left_member links, merging their exons into one list.
//   mbr          - seed member; its alignment supplies the strand, the initial
//                  frameshifts, the initial type bits and the initial exon list.
//   full_support - when true, one extra statement runs at the very end of the
//                  constructor (that statement is not visible in this listing).
// NOTE(review): this listing is incomplete. The embedded line numbers skip 4417,
// 4438, 4484, 4495-4496 and 4503, so the `if(...) {` openers of two branches, one
// statement of the members loop, two statements after m_supported_range, and the
// body of `if(full_support)` are missing here. Confirm against the repository.
4405 CChain::CChain(SChainMember& mbr, bool full_support) : m_coverage_drop_left(-1), m_coverage_drop_right(-1), m_coverage_bump_left(-1), m_coverage_bump_right(-1), m_core_coverage(0), m_splice_weight(0), m_cap_peaks(3, -1), m_polya_peaks(3, -1), m_was_combined(false) {
4406  m_strand = mbr.m_align->Strand();
4407  m_fshifts = mbr.m_align->m_fshifts;
      // Mask of the alignment type bits that are carried over into the chain type.
4408  int atype = eSR|eEST|emRNA|eProt|eNotForChaining;
4409  m_type = eChain|(mbr.m_align->m_type&atype);
4410  m_weight = mbr.m_num;
4411  mbr.MarkPostponed();
4412 
      // Working exon list: a deque because exons are appended on the right walk
      // and prepended on the left walk below.
4413  deque<CModelExon> exons(mbr.m_align->Exons().begin(), mbr.m_align->Exons().end());
      // Walk to the right of the seed, folding each member's alignment into `exons`.
4414  for(SChainMember* p = mbr.m_right_member; p != nullptr; p = p->m_right_member) {
4415  p->MarkPostponed();
4416  m_type |= (p->m_align->m_type&atype);
      // NOTE(review): the `if(...) {` that opens this branch (listing line 4417) is missing.
      // In this branch the last exon is adjusted by the distance from its current
      // right end to the right limit of p's alignment.
4418  exons.back().AddTo(p->m_align->Limits().GetTo()-exons.back().GetTo());
4419  } else {
4420  m_fshifts.insert(m_fshifts.end(), p->m_align->m_fshifts.begin(), p->m_align->m_fshifts.end());
4421  auto& other_exons = p->m_align->Exons();
4422  int num = other_exons.size();
      // `first` = index of the exon of p that is merged into the current last exon.
4423  int first;
4424  if(num == 1)
4425  first = 0;
4426  else if(num == 2)
4427  first = other_exons.front().GetTo() >= exons.back().GetTo() ? 0 : 1;
4428  else
      // First exon of p whose GetTo() is not less than the current right end
      // (presumably other_exons is ordered by position - TODO confirm).
4429  first = std::lower_bound(other_exons.begin(), other_exons.end(), exons.back().GetTo(), [](const CModelExon& e, TSignedSeqPos a) { return e.GetTo() < a; })-other_exons.begin();
4430  exons.back().Extend(other_exons[first]);
      // Everything after the merged exon is appended unchanged.
4431  exons.insert(exons.end(), other_exons.begin()+first+1, other_exons.end());
4432  }
4433  }
      // Walk to the left of the seed. `prev` always points to the member processed
      // just before `p`, i.e. its neighbour on the right side.
4434  SChainMember* prev = &mbr;
4435  for(SChainMember* p = mbr.m_left_member; p != nullptr; prev = p, p = p->m_left_member) {
4436  p->MarkPostponed();
4437  m_type |= (p->m_align->m_type&atype);
      // NOTE(review): the `if(...) {` that opens this branch (listing line 4438) is missing.
4439  exons.front().AddFrom(p->m_align->Limits().GetFrom()-exons.front().GetFrom()); // must be negative
4440  } else {
4441  m_fshifts.insert(m_fshifts.end(), p->m_align->m_fshifts.begin(), p->m_align->m_fshifts.end());
4442  auto& other_exons = p->m_align->Exons();
4443  int num = other_exons.size();
4444 
4445  if(other_exons.back().GetTo() < exons.front().GetFrom()) { // hole from proteins
      // p ends before the current first exon starts: prepend all of p's exons.
      // Afterwards exons[num-1] is p's last exon and exons[num] is the former first exon.
4446  exons.insert(exons.begin(), other_exons.begin(), other_exons.end());
4447  // set splices
      // The two exons bordering the hole get their facing splice flags and signals cleared.
4448  exons[num].m_fsplice = false;
4449  exons[num].m_fsplice_sig.clear();
4450  exons[num-1].m_ssplice = false;
4451  exons[num-1].m_ssplice_sig.clear();
4452  // clip hole to codons
      // Right side of the hole: start no earlier than the CDS start of `prev`.
4453  TSignedSeqRange cds = prev->m_cds_info->Cds();
4454  _ASSERT(cds.NotEmpty() && cds.IntersectingWith(exons[num].Limits()));
4455  if(cds.GetFrom() > exons[num].GetFrom())
4456  exons[num].Limits().SetFrom(cds.GetFrom());
      // Left side of the hole: end no later than the CDS end of `p`.
4457  TSignedSeqRange other_cds = p->m_cds_info->Cds();
4458  _ASSERT(other_cds.NotEmpty() && other_cds.IntersectingWith(exons[num-1].Limits()));
4459  if(other_cds.GetTo() < exons[num-1].GetTo())
4460  exons[num-1].Limits().SetTo(other_cds.GetTo());
4461  } else {
      // `first` = index of the exon of p that is merged into the current first exon.
4462  int first;
4463  if(num == 1)
4464  first = 0;
4465  else if(num == 2)
4466  first = other_exons.back().GetFrom() <= exons.front().GetFrom() ? 1 : 0;
4467  else
      // First exon of p whose GetTo() is not less than the current left end
      // (presumably other_exons is ordered by position - TODO confirm).
4468  first = std::lower_bound(other_exons.begin(), other_exons.end(), exons.front().GetFrom(), [](const CModelExon& e, TSignedSeqPos a) { return e.GetTo() < a; })-other_exons.begin();
      // The current first exon takes over the left end of the merged exon and,
      // if that exon has one, its left splice flag and signal.
4469  exons.front().Limits().SetFrom(other_exons[first].GetFrom());
4470  if(other_exons[first].m_fsplice) {
4471  exons.front().m_fsplice = true;
4472  exons.front().m_fsplice_sig = other_exons[first].m_fsplice_sig;
4473  }
      // Exons of p that precede the merged one are prepended unchanged.
4474  exons.insert(exons.begin(), other_exons.begin(), other_exons.begin()+first);
4475  }
4476  }
4477  }
4478 
4479  m_members = mbr.CollectCodingContainedForChain(); // only coding
4480  sort(m_members.begin(),m_members.end(),GenomeOrderD());
      // Every collected member is marked and contributes its type bits to the chain.
4481  for(SChainMember* mi : m_members) {
4482  mi->MarkPostponed();
4483  m_type |= (mi->m_align->m_type&atype);
      // NOTE(review): listing line 4484 (one more statement of this loop) is missing.
4485  }
4486 
      // Publish the assembled exons; the chain range spans first exon start to last exon end.
4487  m_exons.assign(exons.begin(), exons.end());
4488  m_range.SetFrom(m_exons.front().GetFrom());
4489  m_range.SetTo(m_exons.back().GetTo());
      // The outermost ends of the chain carry no splice flag or signal.
4490  m_exons.front().m_fsplice = false;
4491  m_exons.front().m_fsplice_sig.clear();
4492  m_exons.back().m_ssplice = false;
4493  m_exons.back().m_ssplice_sig.clear();
4494  m_supported_range = m_range; // could become != m_range in SetConfirmedEnds
      // NOTE(review): listing lines 4495-4496 are missing here.
      // Frameshifts were concatenated from several alignments: sort and drop duplicates.
4497  if(!m_fshifts.empty()) {
4498  sort(m_fshifts.begin(),m_fshifts.end());
4499  m_fshifts.erase(unique(m_fshifts.begin(),m_fshifts.end()), m_fshifts.end());
4500  }
4501 
      // NOTE(review): the statement guarded by this `if` (listing line 4503) is missing.
4502  if(full_support)
4504 }
4505 
4506 CChain::CChain(SChainMember& mbr, CGeneModel* gapped_helper, bool keep_all_evid