4 /* $Id: gnomon_model.hpp 101798 2024-02-13 17:18:22Z souvorov $
5  * ===========================================================================
6  *
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Authors: Alexandre Souvorov
30  *
31  * File Description:
32  *
33  */
35 #include <corelib/ncbiobj.hpp>
36 #include <corelib/ncbistd.hpp>
37 #include <corelib/ncbi_limits.hpp>
39 #include <set>
40 #include <vector>
41 #include <algorithm>
42 #include <math.h>
44 #include <objmgr/seq_vector_ci.hpp> // CSeqVectorTypes::TResidue
45 #include <util/range.hpp> // TSignedSeqRange
49 class CSeq_align;
50 class CSeq_id;
51 class CGenetic_code;
54 BEGIN_SCOPE(gnomon)
56 class CGnomonEngine;
58 // Making this a constant declaration (kBadScore) would be preferable,
59 // but backfires on WorkShop, where it is implicitly static and hence
60 // unavailable for use in inline functions.
// Sentinel "no score" value: the negated largest finite double.
// Provided as an inline function so it can be used from other inline code.
inline
double BadScore()
{
    const double largest_finite = numeric_limits<double>::max();
    return -largest_finite;
}
// Strand designation used throughout the model classes.
enum EStrand {
    ePlus,
    eMinus
};

// Returns eMinus for ePlus and ePlus for any other value.
inline EStrand OtherStrand(EStrand s)
{
    if (s == ePlus)
        return eMinus;
    return ePlus;
}
69 typedef vector<TResidue> CResidueVec;
70 typedef vector<int> TIVec;
71 typedef vector<double> TDVec;
74 inline bool Precede(TSignedSeqRange l, TSignedSeqRange r) { return l.GetTo() < r.GetFrom(); }
75 inline bool Include(TSignedSeqRange big, TSignedSeqRange small) { return (big.GetFrom()<=small.GetFrom() && small.GetTo()<=big.GetTo()); }
76 inline bool Include(TSignedSeqRange r, TSignedSeqPos p) { return (r.GetFrom()<=p && p<=r.GetTo()); }
77 inline bool Enclosed(TSignedSeqRange big, TSignedSeqRange small) { return (big != small && Include(big, small)); }
80 {
81 public:
82  CSupportInfo(Int8 model_id, bool core=false);
85  Int8 GetId() const;
86  void SetCore(bool core);
87  bool IsCore() const;
88  bool operator==(const CSupportInfo& s) const;
89  bool operator<(const CSupportInfo& s) const;
91 private:
94 };
98 class CAlignModel;
101 public:
103  struct SSource {
104  SSource() : m_strand(ePlus) {}
105  string m_acc;
108  };
110  enum EType {eDel, eIns, eMism};
111  enum EStatus {eGenomeNotCorrect, eGenomeCorrect, eUnknown};
113  CInDelInfo(TSignedSeqPos l, int len, EType type, const string& v = kEmptyStr, const SSource& s = SSource()) { Init(l, len, type, v, s); }
115  TSignedSeqPos Loc() const { return m_loc; }
116  int Len() const { return m_len; }
117  int InDelEnd() const { return ((IsInsertion() || IsMismatch()) ? Loc()+Len() : Loc()); } // first base "after" correction
118  bool IsInsertion() const { return m_type == eIns; }
119  bool IsDeletion() const { return m_type == eDel; }
120  bool IsMismatch() const { return m_type == eMism; }
121  bool IntersectingWith(TSignedSeqPos a, TSignedSeqPos b) const // insertion/mismatch at least partially inside, deletion inside or flanking
122  {
123  return (IsDeletion() && Loc() >= a && Loc() <= b+1) ||
124  ((IsInsertion() || IsMismatch()) && Loc() <= b && a <= Loc()+Len()-1);
125  }
126  bool operator<(const CInDelInfo& fsi) const // source is ignored!!!!!!!!!!!
127  {
128  if(m_loc != fsi.m_loc)
129  return m_loc < fsi.m_loc;
130  else if(m_type != fsi.m_type)
131  return m_type < fsi.m_type; // if location is same deletion first
132  else if(m_len != fsi.m_len)
133  return m_len < fsi.m_len;
134  else
135  return m_indelv < fsi.m_indelv;
136  }
137  bool operator!=(const CInDelInfo& fsi) const { return (*this < fsi || fsi < *this); }
138  bool operator==(const CInDelInfo& fsi) const { return !(*this != fsi); }
139  string GetInDelV() const { return m_indelv; }
140  const SSource& GetSource() const { return m_source; }
141  EType GetType() const { return m_type; };
142  void SetStatus(EStatus s) { m_status = s; }
143  EStatus GetStatus() const { return m_status; }
144  void SetLoc(TSignedSeqPos l) { m_loc = l; }
146 private:
147  void Init(TSignedSeqPos l, int len, EType type, const string& v, const SSource& s) {
148  m_loc = l;
149  m_len = len;
150  m_type = type;
151  m_status = eUnknown;
152  m_indelv = v;
153  m_source = s;
154  _ASSERT(m_indelv.empty() || (int)m_indelv.length() == len);
155  _ASSERT(m_indelv.empty() || m_type != eIns);
156  if((IsDeletion() || IsMismatch()) && GetInDelV().empty())
157  m_indelv.insert( m_indelv.end(), Len(),'N');
158  }
160  TSignedSeqPos m_loc; // left location for insertion, deletion is before m_loc
161  // insertion - when there are extra bases in the genome
162  int m_len;
165  string m_indelv;
167 };
169 typedef vector<CInDelInfo> TInDels;
171 template <class Res>
172 bool IsStartCodon(const Res * seq, int strand = ePlus); // seq points to the first base in biological order
173 template <class Res>
174 bool IsStopCodon(const Res * seq, int strand = ePlus); // seq points to the first base in biological order
178 public:
179  virtual ~CRangeMapper() {}
180  virtual TSignedSeqRange operator()(TSignedSeqRange r, bool withextras = true) const = 0;
181 };
184 public:
185  CModelExon(TSignedSeqPos f = 0, TSignedSeqPos s = 0, bool fs = false, bool ss = false, const string& fsig = "", const string& ssig = "", double ident = 0, const string& seq = "", const CInDelInfo::SSource& src = CInDelInfo::SSource()) :
186  m_fsplice(fs), m_ssplice(ss), m_fsplice_sig(fsig), m_ssplice_sig(ssig), m_ident(ident), m_seq(seq), m_source(src), m_range(f,s)
187  {
188  _ASSERT(m_seq.empty() || m_range.Empty());
189  };
191  bool operator==(const CModelExon& p) const
192  {
193  return (m_range==p.m_range && m_fsplice == p.m_fsplice && m_ssplice == p.m_ssplice);
194  }
195  bool operator!=(const CModelExon& p) const
196  {
197  return !(*this == p);
198  }
199  bool operator<(const CModelExon& p) const { return Precede(Limits(),p.Limits()); }
201  operator TSignedSeqRange() const { return m_range; }
202  const TSignedSeqRange& Limits() const { return m_range; }
203  TSignedSeqRange& Limits() { return m_range; }
204  TSignedSeqPos GetFrom() const { return m_range.GetFrom(); }
205  TSignedSeqPos GetTo() const { return m_range.GetTo(); }
206  void Extend(const CModelExon& e);
207  void AddFrom(int d) { m_range.SetFrom( m_range.GetFrom() +d ); }
208  void AddTo(int d) { m_range.SetTo( m_range.GetTo() +d ); }
210  bool m_fsplice, m_ssplice;
211  string m_fsplice_sig, m_ssplice_sig; // obeys strand
212  double m_ident;
213  string m_seq; // exon sequence if in gap; obeys strand
216  void Remap(const CRangeMapper& mapper) { m_range = mapper(m_range); }
217 private:
219 };
221 class CAlignMap;
223 class CCDSInfo {
224 public:
227  bool operator== (const CCDSInfo& another) const;
 //CDS mapped to transcript should be used only for final models (not alignments)
 //Change in indels or 5' UTR will invalidate the coordinates (in particular conversion from CAlignModel to CGeneModel);
231  bool IsMappedToGenome() const { return m_genomic_coordinates; }
232  CCDSInfo MapFromOrigToEdited(const CAlignMap& amap) const;
233  CCDSInfo MapFromEditedToOrig(const CAlignMap& amap) const; // returns 'empty' CDS if can't map
237  TSignedSeqRange Cds() const { return Start()+ReadingFrame()+Stop(); }
240  TSignedSeqRange Start() const {return m_start; }
241  TSignedSeqRange Stop() const {return m_stop; }
242  bool HasStart() const { return Start().NotEmpty(); }
243  bool HasStop () const { return Stop().NotEmpty(); }
244  bool ConfirmedStart() const { return m_confirmed_start; } // start is confirmed by protein alignment
245  bool ConfirmedStop() const { return m_confirmed_stop; } // stop is confirmed by protein alignment
247  bool OpenCds() const { return m_open; } // "optimal" CDS is not internal
248  double Score() const { return m_score; }
250  void SetReadingFrame(TSignedSeqRange r, bool protein = false);
251  void SetStart(TSignedSeqRange r, bool confirmed = false);
252  void SetStop(TSignedSeqRange r, bool confirmed = false);
254  void Clear5PrimeCdsLimit();
255  void SetScore(double score, bool open=false);
257  void CombineWith(const CCDSInfo& another_cds_info);
258  void Remap(const CRangeMapper& mapper);
260  void Cut(TSignedSeqRange hole);
261  void Clear();
263  int Strand() const; // -1 (minus), 0 (unknown), 1 (plus)
266  struct SPStop : public TSignedSeqRange {
269  //not overloaded == is used for uniquing and finding intervals
270  //overloaded < is used for sorting before uniquing
271  bool operator<(const SPStop& stp) const {
272  if(operator==(stp)) // == is not overloaded
273  return m_status < stp.m_status;
274  else
275  return TSignedSeqRange::operator<(stp);
276  }
279  };
281  typedef vector<SPStop> TPStops;
282  const TPStops& PStops() const { return m_p_stops; }
283  bool PStop(bool includeall = true) const; // has premature stop(s)
284  void AddPStop(SPStop stp) { m_p_stops.push_back(stp); _ASSERT( Invariant() ); }
285  void AddPStop(TSignedSeqRange r, EStatus status);
286  void ClearPStops() { m_p_stops.clear(); }
 // Consistency check of the CDS description.
 // Always returns true; the checks are _ASSERTs that are compiled in only
 // when _DEBUG is defined.
 bool Invariant() const
 {
#ifdef _DEBUG
 // Without a reading frame nothing else may be set and the score must be BadScore().
 if (ReadingFrame().Empty()) {
 _ASSERT( !HasStop() && !HasStart() );
 _ASSERT( !ConfirmedStart() );
 _ASSERT( !ConfirmedStop() );
 // _ASSERT( !PStop() );
 _ASSERT( !OpenCds() );
 _ASSERT( Score()==BadScore() );
 return true;
 }
 // Start and stop must not overlap the reading frame.
 _ASSERT( !Start().IntersectingWith(ReadingFrame()) );
 _ASSERT( !Stop().IntersectingWith(ReadingFrame()) );
 // The whole CDS (start + reading frame + stop) must fit into MaxCdsLimits().
 _ASSERT( Include( MaxCdsLimits(), Cds() ) );
 if (!HasStop() && !HasStart()) {
 } else if (HasStart() && !HasStop()) {
 if (Precede(Start(), ReadingFrame())) {
 } else {
 }
 } else if (HasStart() && HasStop()) {
 }
 // A stop must coincide with the MaxCdsLimits() end on the side of the
 // reading frame where the stop is located.
 if (HasStop()) {
 if (Precede(ReadingFrame(),Stop())) {
 _ASSERT( MaxCdsLimits().GetTo()==Stop().GetTo() );
 } else {
 _ASSERT( MaxCdsLimits().GetFrom()==Stop().GetFrom() );
 }
 }
 // A confirmed start/stop requires the start/stop to be present.
 if (ConfirmedStart()) {
 _ASSERT( HasStart() );
 }
 if (ConfirmedStop()) {
 _ASSERT( HasStop() );
 }
 // ITERATE(TPStops, s, PStops())
 // _ASSERT( Include(MaxCdsLimits(), *s) );
#endif
 return true;
 }
341 private:
350  bool m_open;
351  double m_score;
354 };
358 {
359 public:
360  enum EType {
361  eWall = 1,
362  eNested = 2,
363  eSR = 4,
364  eEST = 8,
365  emRNA = 16,
366  eProt = 32,
367  eNotForChaining = 64,
368  eChain = 128,
369  eGnomon = 256
370  };
371  static string TypeToString(int type);
373  enum EStatus {
374  ecDNAIntrons = 1,
375  eReversed = 2,
376  eSkipped = 4,
377  eLeftTrimmed = 8,
378  eRightTrimmed = 16,
379  eFullSupCDS = 32,
380  ePseudo = 64,
381  ePolyA = 128,
382  eCap = 256,
383  eBestPlacement = 512,
384  eUnknownOrientation = 1024,
385  eConsistentCoverage = 2048,
386  eGapFiller = 4096,
387  eUnmodifiedAlign = 8192,
388  eChangedByFilter = 16384,
389  eTSA = 32768,
390  eLeftConfirmed = 65536,
391  eRightConfirmed = 131072,
392  eLeftFlexible = 262144,
393  eRightFlexible = 524288
394  };
396  CGeneModel(EStrand s = ePlus, Int8 id = 0, int type = 0) :
397  m_type(type), m_id(id), m_status(0), m_ident(0), m_weight(1), m_expecting_hole(false), m_strand(s), m_geneid(0), m_rank_in_gene(0) {}
398  virtual ~CGeneModel() {}
400  void AddExon(TSignedSeqRange exon, const string& fs = "", const string& ss = "", double ident = 0, const string& seq = "", const CInDelInfo::SSource& src = CInDelInfo::SSource());
401  void AddHole(); // between model and next exons
402  void AddGgapExon(double ident, const string& seq, const CInDelInfo::SSource& src, bool infront);
403  void AddNormalExon(TSignedSeqRange exon, const string& fs, const string& ss, double ident, bool infront);
405  typedef vector<CModelExon> TExons;
406  const TExons& Exons() const { return m_exons; }
407  TExons& Exons() { return m_exons; }
408  void ClearExons() {
409  m_exons.clear();
410  m_fshifts.clear();
411  m_range = TSignedSeqRange::GetEmpty();
412  m_cds_info = CCDSInfo();
413  m_edge_reading_frames.clear();
414  }
415  void SetSplices(int i, const string& f_sig, const string& s_sig) { m_exons[i].m_fsplice_sig = f_sig; m_exons[i].m_ssplice_sig = s_sig; }
417  void ReverseComplementModel();
419  void Remap(const CRangeMapper& mapper);
420  enum EClipMode { eRemoveExons, eDontRemoveExons };
421  virtual void Clip(TSignedSeqRange limits, EClipMode mode, bool ensure_cds_invariant = true); // drops the score!!!!!!!!!
422  virtual void CutExons(TSignedSeqRange hole); // clip or remove exons, dangerous, should be completely in or outside the cds, should not cut an exon in two
423  void ExtendLeft(int amount);
424  void ExtendRight(int amount);
425  void Extend(const CGeneModel& a, bool ensure_cds_invariant = true);
426  void RemoveShortHolesAndRescore(const CGnomonEngine& gnomon); // removes holes shorter than min intron (may add frameshifts/stops)
428  TSignedSeqRange TranscriptExon(int i) const;
430  TSignedSeqRange Limits() const { return m_range; }
431  TSignedSeqRange TranscriptLimits() const;
432  int AlignLen() const ;
433  void RecalculateLimits();
435  // ReadingFrame doesn't include start/stop. It's always on codon boundaries
436  TSignedSeqRange ReadingFrame() const { return m_cds_info.ReadingFrame(); }
437  // CdsLimits include start/stop if any, goes to model limit if no start/stop
438  TSignedSeqRange RealCdsLimits() const;
439  int RealCdsLen() const ; // %3!=0 is possible
440  // MaxCdsLimits - longest cds. include start/stop if any,
441  // goes to 5' limit if no upstream stop, goes to 3' limit if no stop
442  TSignedSeqRange MaxCdsLimits() const;
444  const CCDSInfo& GetCdsInfo() const { return m_cds_info; }
445  void SetCdsInfo(const CCDSInfo& cds_info);
446  void SetCdsInfo(const CGeneModel& a);
447  void CombineCdsInfo(const CGeneModel& a, bool ensure_cds_invariant = true);
448  void CombineCdsInfo(const CCDSInfo& cds_info, bool ensure_cds_invariant = true);
450  bool IntersectingWith(const CGeneModel& a) const
451  {
452  return Limits().IntersectingWith(a.Limits());
453  }
455  double Ident() const { return m_ident; }
456  void SetIdent(double i) { m_ident = i; }
458  double Weight() const { return m_weight; }
459  void SetWeight(double w) { m_weight = w; }
461  void SetStrand(EStrand s) { m_strand = s; }
462  EStrand Strand() const { return m_strand; }
464  bool notreversed = (Status()&CGeneModel::eReversed) == 0;
465  bool plusstrand = Strand() == ePlus;
466  return (notreversed == plusstrand) ? ePlus : eMinus;
467  }
469  void SetType(int t) { m_type = t; }
470  int Type() const { return m_type; }
471  Int8 GeneID() const { return m_geneid; }
472  void SetGeneID(Int8 id) { m_geneid = id; }
473  int RankInGene() const { return m_rank_in_gene; }
474  void SetRankInGene(int rank) { m_rank_in_gene = rank; }
475  Int8 ID() const { return m_id; }
476  void SetID(Int8 id) { m_id = id; }
477  const CSupportInfoSet& Support() const { return m_support; }
478  bool AddSupport(const CSupportInfo& support) { return m_support.insert(support).second; }
479  void ReplaceSupport(const CSupportInfoSet& support_set) { m_support = support_set; }
480  const string& ProteinHit() const { return m_protein_hit; }
481  string& ProteinHit() { return m_protein_hit; }
483  unsigned int& Status() { return m_status; }
484  const unsigned int& Status() const { return m_status; }
485  void ClearStatus() { m_status = 0; }
487  const string& GetComment() const { return m_comment; }
488  void SetComment(const string& comment) { m_comment = comment; }
489  void AddComment(const string& comment) { m_comment += " " + comment; }
491  bool operator<(const CGeneModel& a) const { return Precede(Limits(),a.Limits()); }
493  double Score() const { return m_cds_info.Score(); }
495  bool Continuous() const // no "holes" in alignment
496  {
497  for(unsigned int i = 1; i < Exons().size(); ++i)
498  if (!Exons()[i-1].m_ssplice || !Exons()[i].m_fsplice)
499  return false;
500  return true;
501  }
502  bool HasStart() const { return m_cds_info.HasStart(); }
503  bool HasStop () const { return m_cds_info.HasStop (); }
504  bool LeftComplete() const { return Strand() == ePlus ? HasStart() : HasStop(); }
505  bool RightComplete() const { return Strand() == ePlus ? HasStop() : HasStart(); }
506  bool FullCds() const { return HasStart() && HasStop() && Continuous(); }
507  bool CompleteCds() const { return FullCds() && (!Open5primeEnd() || ConfirmedStart()); }
510  {
511  _ASSERT( !(OpenCds()&&ConfirmedStart()) );
512  return (ReadingFrame().Empty() || (!OpenCds() && FullCds()));
513  }
515  bool Open5primeEnd() const
516  {
517  return (Strand() == ePlus ? OpenLeftEnd() : OpenRightEnd());
518  }
519  bool OpenLeftEnd() const
520  {
521  return ReadingFrame().NotEmpty() && GetCdsInfo().MaxCdsLimits().GetFrom()==TSignedSeqRange::GetWholeFrom();
522  }
523  bool OpenRightEnd() const
524  {
525  return ReadingFrame().NotEmpty() && GetCdsInfo().MaxCdsLimits().GetTo()==TSignedSeqRange::GetWholeTo();
526  }
528  bool OpenCds() const { return m_cds_info.OpenCds(); } // "optimal" CDS is not internal
529  bool PStop(bool includeall = true) const { return m_cds_info.PStop(includeall); } // has premature stop(s)
531  bool ConfirmedStart() const { return m_cds_info.ConfirmedStart(); } // start is confirmed by protein alignment
532  bool ConfirmedStop() const { return m_cds_info.ConfirmedStop(); } // stop is confirmed by protein alignment
534  bool isNMD(int limit = 50) const;
536  TInDels& FrameShifts() { return m_fshifts; }
537  const TInDels& FrameShifts() const { return m_fshifts; }
538  TInDels FrameShifts(TSignedSeqPos a, TSignedSeqPos b) const;
539  TInDels GetInDels(bool fs_only) const;
540  TInDels GetInDels(TSignedSeqPos a, TSignedSeqPos b, bool fs_only) const;
542  int FShiftedLen(TSignedSeqRange ab, bool withextras = true) const; // won't work if a/b is insertion
543  int FShiftedLen(TSignedSeqPos a, TSignedSeqPos b, bool withextras = true) const { return FShiftedLen(TSignedSeqRange(a,b),withextras); }
 // move along mrna skipping introns
 TSignedSeqPos FShiftedMove(TSignedSeqPos pos, int len) const; // may return <0 if hits a deletion at the end of move
548  virtual CAlignMap GetAlignMap() const;
550  string GetCdsDnaSequence (const CResidueVec& contig_sequence) const;
551  string GetProtein (const CResidueVec& contig_sequence) const;
552  string GetProtein (const CResidueVec& contig_sequence, const CGenetic_code* gencode) const;
554  // Below comparisons ignore CDS completely, first 3 assume that alignments are the same strand
556  int HasCompatibleOverlap(const CGeneModel& a, int min_overlap = 2) const; // returns 0 for notcompatible or (number of common splices)+1; neither alignment can have holes
557  int isCompatible(const CGeneModel& a) const; // returns 0 for notcompatible or (number of common splices)+1
558  bool IsSubAlignOf(const CGeneModel& a) const;
559  int MutualExtension(const CGeneModel& a) const; // returns 0 for notcompatible or (number of introns) + 1
561  bool IdenticalAlign(const CGeneModel& a) const
562  { return Strand()==a.Strand() && Limits()==a.Limits() && Exons() == a.Exons() && FrameShifts()==a.FrameShifts() &&
563  GetCdsInfo().PStops() == a.GetCdsInfo().PStops() && Type() == a.Type() && Status() == a.Status(); }
564  bool operator==(const CGeneModel& a) const
565  {
566  return IdenticalAlign(a) && Type()==a.Type() && m_id==a.m_id && m_support==a.m_support;
567  }
569  const list< CRef<CSeq_id> >& TrustedmRNA() const { return m_trusted_mrna; }
570  void InsertTrustedmRNA(CRef<CSeq_id> g) { m_trusted_mrna.push_back(g); };
571  void ClearTrustedmRNA() { m_trusted_mrna.clear(); };
573  const list< CRef<CSeq_id> >& TrustedProt() const { return m_trusted_prot; }
574  void InsertTrustedProt(CRef<CSeq_id> g) { m_trusted_prot.push_back(g); };
575  void ClearTrustedProt() { m_trusted_prot.clear(); };
577  const vector<CCDSInfo>* GetEdgeReadingFrames() const { return &m_edge_reading_frames; }
578  vector<CCDSInfo>* SetEdgeReadingFrames() { return &m_edge_reading_frames; }
581 #ifdef _DEBUG
583 #endif
585 private:
586  void RemoveExtraFShifts(int left, int right);
587  void TrimEdgesToFrameInOtherAlignGaps(const TExons& exons_with_gaps);
589  int m_type;
591  unsigned int m_status;
593  double m_ident;
594  double m_weight;
597  TExons& MyExons() { return m_exons; }
605  bool CdsInvariant(bool check_start_stop = true) const;
611  string m_comment;
612  list< CRef<CSeq_id> > m_trusted_prot;
613  list< CRef<CSeq_id> > m_trusted_mrna;
616  vector<CCDSInfo> m_edge_reading_frames;
618  friend class CChain;
619 };
622 class CAlignMap {
623 public:
627  CAlignMap() {};
629  m_orig_ranges.push_back(SMapRange(SMapRangeEdge(orig_a), SMapRangeEdge(orig_b), kEmptyStr));
631  m_target_len = FShiftedLen(orig_a, orig_b);
632  }
 // Builds a plus-oriented map for the interval [orig_a, orig_b] using the
 // corrections in [fsi_begin, fsi_end); edited coordinates start at 0.
 // The left/right edge types default to eBoundary and are switched to eInDel
 // when a correction sits exactly on the corresponding edge (see below).
 CAlignMap(TSignedSeqPos orig_a, TSignedSeqPos orig_b, TInDels::const_iterator fsi_begin, const TInDels::const_iterator fsi_end) : m_orientation(ePlus) {
 EEdgeType atype = eBoundary;
 EEdgeType btype = eBoundary;
 if(fsi_begin != fsi_end) {
 // left edge: first correction located at orig_a and not a mismatch
 if(fsi_begin->Loc() == orig_a && !fsi_begin->IsMismatch()) {
 _ASSERT(!fsi_begin->IsInsertion()); // no reason to have insertion
 atype = eInDel;
 }
 // right edge: last correction is a deletion located right after orig_b
 TInDels::const_iterator fs = fsi_end-1;
 if(fs->Loc() == orig_b+1 && fs->IsDeletion())
 btype = eInDel;
 }
 InsertIndelRangesForInterval(orig_a, orig_b, 0, fsi_begin, fsi_end, atype, btype, "", "");
 // target length is computed from the ranges just inserted
 m_target_len = FShiftedLen(orig_a, orig_b);
 }
648  CAlignMap(const CGeneModel::TExons& exons, const vector<TSignedSeqRange>& transcript_exons, const TInDels& indels, EStrand orientation, int targetlen ); //orientation == strand if not Reversed
649  CAlignMap(const CGeneModel::TExons& exons, const TInDels& frameshifts, EStrand strand, TSignedSeqRange lim = TSignedSeqRange::GetWhole(), int holelen = 0, int polyalen = 0);
651  TSignedSeqPos MapEditedToOrig(TSignedSeqPos edited_pos) const;
653  TSignedSeqRange MapRangeOrigToEdited(TSignedSeqRange orig_range, bool withextras = true) const { return MapRangeOrigToEdited(orig_range, withextras?eLeftEnd:eSinglePoint, withextras?eRightEnd:eSinglePoint); }
654  TSignedSeqRange MapRangeEditedToOrig(TSignedSeqRange edited_range, bool withextras = true) const;
655  template <class In, class Out>
656  void EditedSequence(const In& original_sequence, Out& edited_sequence, bool includeholes = false) const;
657  int FShiftedLen(TSignedSeqRange ab, ERangeEnd lend, ERangeEnd rend) const;
658  int FShiftedLen(TSignedSeqRange ab, bool withextras = true) const;
659  int FShiftedLen(TSignedSeqPos a, TSignedSeqPos b, bool withextras = true) const { return FShiftedLen(TSignedSeqRange(a,b), withextras); }
 //snap to codons works by analyzing transcript coordinates (MUST be a protein or reading frame cutout)
 TSignedSeqRange ShrinkToRealPoints(TSignedSeqRange orig_range, bool snap_to_codons = false) const;
 TSignedSeqPos FShiftedMove(TSignedSeqPos orig_pos, int len) const; // may return < 0 if hits a gap
664  // TInDels GetInDels(bool fs_only) const;
665  // TInDels GetAllCorrections() const;
666  int TargetLen() const { return m_target_len; }
667  EStrand Orientation() const { return m_orientation; }
668  void MoveOrigin(TSignedSeqPos shift) {
669  for(auto& mrange : m_orig_ranges)
670  mrange.MoveOrigin(shift);
671  }
673 // private: // breaks SMapRange on WorkShop. :-/
674  struct SMapRangeEdge {
676  bool operator<(const SMapRangeEdge& mre) const { return m_pos < mre.m_pos; }
677  bool operator==(const SMapRangeEdge& mre) const { return m_pos == mre.m_pos; }
681  string m_extra_seq;
682  };
684  class SMapRange {
685  public:
686  SMapRange(SMapRangeEdge from, SMapRangeEdge to, const string& mism) : m_from(from), m_to(to), m_mism_seq(mism) {}
687  SMapRangeEdge GetEdgeFrom() const { return m_from; }
688  SMapRangeEdge GetEdgeTo() const { return m_to; }
689  void SetEdgeFrom(SMapRangeEdge from) { m_from = from; }
690  void SetEdgeTo(SMapRangeEdge to) { m_to = to; }
691  void MoveOrigin(TSignedSeqPos shift) {
692  m_from.m_pos -= shift;
693  m_to.m_pos -= shift;
694  }
695  TSignedSeqPos GetFrom() const { return m_from.m_pos; }
696  TSignedSeqPos GetTo() const { return m_to.m_pos; }
700  string GetExtraSeqFrom() const { return m_from.m_extra_seq; }
701  TSignedSeqPos GetExtraTo() const { return m_to.m_extra; }
702  string GetExtraSeqTo() const { return m_to.m_extra_seq; }
704  EEdgeType GetTypeTo() const { return m_to.m_edge_type; }
705  const string& GetMismatch() const { return m_mism_seq; }
706  bool operator<(const SMapRange& mr) const {
707  if(m_from.m_pos == mr.m_from.m_pos) return m_to.m_pos < mr.m_to.m_pos;
708  else return m_from.m_pos < mr.m_from.m_pos;
709  }
711  private:
713  string m_mism_seq;
714  };
716  // static TInDels RemoveExtraIndels(const TInDels& indels, const CGeneModel::TExons& exons);
718 private:
719  static TSignedSeqPos MapAtoB(const vector<CAlignMap::SMapRange>& a, const vector<CAlignMap::SMapRange>& b, TSignedSeqPos p, ERangeEnd move_mode);
720  static TSignedSeqRange MapRangeAtoB(const vector<CAlignMap::SMapRange>& a, const vector<CAlignMap::SMapRange>& b, TSignedSeqRange r, ERangeEnd lend, ERangeEnd rend);
721  static TSignedSeqRange MapRangeAtoB(const vector<CAlignMap::SMapRange>& a, const vector<CAlignMap::SMapRange>& b, TSignedSeqRange r, bool withextras ) {
722  return MapRangeAtoB(a, b, r, withextras?eLeftEnd:eSinglePoint, withextras?eRightEnd:eSinglePoint);
723  };
724  static int FindLowerRange(const vector<CAlignMap::SMapRange>& a, TSignedSeqPos p);
726  void InsertOneToOneRange(TSignedSeqPos orig_start, TSignedSeqPos edited_start, TSignedSeqPos len, const string& mism, TSignedSeqPos left_orige, TSignedSeqPos left_edite, TSignedSeqPos right_orige, TSignedSeqPos right_edite,
727  EEdgeType left_type, EEdgeType right_type, const string& left_edit_extra_seq, const string& right_edit_extra_seq);
728  TSignedSeqPos InsertIndelRangesForInterval(TSignedSeqPos orig_a, TSignedSeqPos orig_b, TSignedSeqPos edit_a, TInDels::const_iterator fsi_begin, TInDels::const_iterator fsi_end, EEdgeType type_a, EEdgeType type_b, const string& gseq_a, const string& gseq_b);
730  vector<SMapRange> m_orig_ranges, m_edited_ranges;
733 };
739 public:
741  CAlignModel(const objects::CSeq_align& seq_align);
742  CAlignModel(const CGeneModel& g, const CAlignMap& a);
743  virtual CAlignMap GetAlignMap() const { return m_alignmap; }
744  void ResetAlignMap();
 // Clips via CGeneModel::Clip and then recalculates the alignment map for
 // the [limits.GetFrom(), limits.GetTo()] interval.
 virtual void Clip(TSignedSeqRange limits, EClipMode mode, bool ensure_cds_invariant = true) { // drops the score!!!!!!!!!
 CGeneModel::Clip(limits,mode,ensure_cds_invariant);
 RecalculateAlignMap(limits.GetFrom(), limits.GetTo());
 }
 // Cuts via CGeneModel::CutExons and then recalculates the alignment map.
 // NOTE(review): RecalculateAlignMap is declared as (int left, int right) and
 // receives hole.GetTo()+1 as left and hole.GetFrom()-1 as right here, i.e.
 // left > right for a non-empty hole - confirm this is the intended convention.
 virtual void CutExons(TSignedSeqRange hole) { // clip or remove exons, dangerous, should be completely in or outside the cds, should not cut an exon in two
 CGeneModel::CutExons(hole);
 RecalculateAlignMap(hole.GetTo()+1, hole.GetFrom()-1);
 }
755  string TargetAccession() const;
756  void SetTargetId(const objects::CSeq_id& id) { m_target_id.Reset(&id); }
757  CConstRef<objects::CSeq_id> GetTargetId() const { return m_target_id; }
758  int TargetLen() const { return m_alignmap.TargetLen(); }
759  int PolyALen() const;
760  CRef<objects::CSeq_align> MakeSeqAlign(const string& contig) const; // should be used for alignments only; for chains and models will produce a Splign alignment of mRNA
762 private:
763  void RecalculateAlignMap(int left, int right);
766 };
772  const string& m_contig;
773  explicit setcontig(const string& cntg) : m_contig(cntg) {}
774 };
776  string& m_contig;
777  explicit getcontig(string& cntg) : m_contig(cntg) {}
778 };
787 template<class Model>
788 class NCBI_XALGOGNOMON_EXPORT CModelCluster : public list<Model> {
789 public:
790  typedef Model TModel;
791  CModelCluster(int f = numeric_limits<int>::max(), int s = 0) : m_limits(f,s) {}
793  void Insert(const Model& a) {
794  m_limits.CombineWith(a.Limits());
795  this->push_back(a);
796  }
797  void Splice(CModelCluster& c) { // elements removed from c and inserted into *this
798  m_limits.CombineWith(c.Limits());
799  this->splice(list<Model>::end(),c);
800  }
801  TSignedSeqRange Limits() const { return m_limits; }
802  bool operator<(const CModelCluster& c) const { return Precede(m_limits, c.m_limits); }
804  list<Model>::clear();
805  m_limits.SetFrom( first );
806  m_limits.SetTo( second );
807  }
809 private:
811 };
816 typedef list<CGeneModel> TGeneModelList;
817 typedef list<CAlignModel> TAlignModelList;
820 template<class Cluster>
822  public:
 // Inserts model a, merging it with every existing cluster found in the
 // [first, second) range computed below, so that the set ends up with one
 // cluster holding a and all models of the merged clusters.
 void Insert(const typename Cluster::TModel& a) {
 // temporary cluster that accumulates a and everything it gets merged with
 Cluster clust;
 clust.Insert(a);
 // probe the set with point clusters placed at a's left and right ends
 Titerator first = set<Cluster>::lower_bound(Cluster(a.Limits().GetFrom(),a.Limits().GetFrom()));
 Titerator second = set<Cluster>::upper_bound(Cluster(a.Limits().GetTo(),a.Limits().GetTo()));
 for(Titerator it = first; it != second;) {
 // set elements are const; the cast lets the models be moved out right
 // before the element is erased
 clust.Splice(const_cast<Cluster&>(*it));
 this->erase(it++); // advance before erase so the loop iterator stays valid
 }
 // insert an empty cluster carrying the combined limits (second is the
 // position hint), then move the accumulated models into it
 const_cast<Cluster&>(*this->insert(second,Cluster(clust.Limits()))).Splice(clust);
 }
836 };
844 class EResidue {
845 public :
846  EResidue() : data(enN) {}
849  operator int() const { return int(data); }
851 private:
852  unsigned char data;
853 };
856 {
857  switch(c)
858  {
859  case 'A':
860  return 'T';
861  case 'a':
862  return 't';
863  case 'C':
864  return 'G';
865  case 'c':
866  return 'g';
867  case 'G':
868  return 'C';
869  case 'g':
870  return 'c';
871  case 'T':
872  return 'A';
873  case 't':
874  return 'a';
875  default:
876  return 'N';
877  }
878 }
880 extern const EResidue k_toMinus[5];
881 extern const char *const k_aa_table;
884 {
885  return k_toMinus[c];
886 }
// Reverse-complements the range [first, last) in place: every element is
// replaced by Complement(element), then the range is reversed.
template <class BidirectionalIterator>
void ReverseComplement(const BidirectionalIterator& first, const BidirectionalIterator& last)
{
    BidirectionalIterator cursor(first);
    while (cursor != last) {
        *cursor = Complement(*cursor);
        ++cursor;
    }
    reverse(first, last);
}
896 template<class Model>
897 list<Model> GetAlignParts(const Model& algn, bool settrimflags) { // if no parts result empty
898  list<Model> parts;
899  int left = algn.Limits().GetFrom();
900  for(unsigned int i = 1; i < algn.Exons().size(); ++i) {
901  if (!algn.Exons()[i-1].m_ssplice || !algn.Exons()[i].m_fsplice) {
902  Model m = algn;
903  m.Clip(TSignedSeqRange(left,algn.Exons()[i-1].GetTo()),CGeneModel::eRemoveExons);
904  if(!parts.empty() && settrimflags) {
905  parts.back().Status() &= ~CGeneModel::eRightTrimmed;
906  m.Status() &= ~CGeneModel::eLeftTrimmed;
907  }
908  parts.push_back(m);
909  left = algn.Exons()[i].GetFrom();
910  }
911  }
912  if(!parts.empty()) {
913  Model m = algn;
914  m.Clip(TSignedSeqRange(left,algn.Limits().GetTo()),CGeneModel::eRemoveExons);
915  if(settrimflags) {
916  parts.back().Status() &= ~CGeneModel::eRightTrimmed;
917  m.Status() &= ~CGeneModel::eLeftTrimmed;
918  }
919  parts.push_back(m);
920  }
922  return parts;
923 }
925 /*
926 template<class Model>
927 list<Model> GetAlignParts(const Model& algn, bool settrimflags) {
928  list<Model> parts;
929  int left = algn.Limits().GetFrom();
930  for(unsigned int i = 1; i < algn.Exons().size(); ++i) {
931  if (!algn.Exons()[i-1].m_ssplice || !algn.Exons()[i].m_fsplice) {
932  Model m = algn;
933  m.Status() &= ~CGeneModel::ePolyA;
934  m.Status() &= ~CGeneModel::eCap;
935  m.Clip(TSignedSeqRange(left,algn.Exons()[i-1].GetTo()),CGeneModel::eRemoveExons);
936  if(!parts.empty() && settrimflags) {
937  parts.back().Status() &= ~CGeneModel::eRightTrimmed;
938  m.Status() &= ~CGeneModel::eLeftTrimmed;
939  }
940  parts.push_back(m);
941  left = algn.Exons()[i].GetFrom();
942  }
943  }
944  if(!parts.empty()) {
945  Model m = algn;
946  m.Clip(TSignedSeqRange(left,algn.Limits().GetTo()),CGeneModel::eRemoveExons);
947  m.Status() &= ~CGeneModel::ePolyA;
948  m.Status() &= ~CGeneModel::eCap;
949  if(settrimflags) {
950  parts.back().Status() &= ~CGeneModel::eRightTrimmed;
951  m.Status() &= ~CGeneModel::eLeftTrimmed;
952  }
953  parts.push_back(m);
955  if(algn.Status()&CGeneModel::ePolyA) {
956  if(algn.Strand() == ePlus)
957  parts.back().Status() |= CGeneModel::ePolyA;
958  else
959  parts.front().Status() |= CGeneModel::ePolyA;
960  }
961  if(algn.Status()&CGeneModel::eCap) {
962  if(algn.Strand() == ePlus)
963  parts.front().Status() |= CGeneModel::eCap;
964  else
965  parts.back().Status() |= CGeneModel::eCap;
966  }
967  }
969  return parts;
970 }
971 */
973 void MapAlignsToOrigContig(TAlignModelList& aligns, const TInDels& corrections, int contig_size);
977 END_SCOPE(gnomon)
