NCBI C++ ToolKit
aligncollapser.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef ALGO_GNOMON___ALIGNCOLLAPSER__HPP
2 #define ALGO_GNOMON___ALIGNCOLLAPSER__HPP
3 
4 /* $Id: aligncollapser.hpp 101798 2024-02-13 17:18:22Z souvorov $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Authors: Alexandre Souvorov
30  *
31  * File Description:
32  *
33  */
34 
36 #include <corelib/ncbiargs.hpp>
37 #include <objmgr/seq_vector.hpp>
38 
40 BEGIN_SCOPE(gnomon)
41 
// Default constructor. Only m_weight appears in the initializer list (set to 0);
// no other member is explicitly initialized here.
43  SAlignIndividual() : m_weight(0) {};
// Builds the compact per-alignment record from `align`.
//   align          - source alignment; m_range, m_align_id and m_weight are taken from
//                    align.Limits(), align.ID() and align.Weight().
//   target_id_pool - shared character pool; the target accession of `align` is appended
//                    to it as a 0-terminated string (the pool is modified).
44  SAlignIndividual(const CAlignModel& align, deque<char>& target_id_pool) : m_range(align.Limits()), m_align_id(align.ID()), m_weight(align.Weight()) {
// Remember where this accession will start: the pool size before anything is appended.
45  m_target_id = (TSignedSeqPos)target_id_pool.size();
46  string acc = align.TargetAccession();
47  copy(acc.begin(),acc.end(),back_inserter(target_id_pool));
// Terminator, so the accession can later be read back starting at m_target_id.
48  target_id_pool.push_back(0);
// NOTE(review): listing line 49 is absent from this view. Presumably it holds a condition
// guarding the negation below (the m_align_id member comment says "< 0 used for
// eChangedByFilter") - confirm against the real header before relying on this.
50  m_align_id = -m_align_id;
51  };
52 
// Data members of the per-alignment record.
// NOTE(review): listing line 53 is absent from this view; the constructor above initializes
// an m_range member, so it is presumably declared there - confirm.
54  Int8 m_align_id; // < 0 used for eChangedByFilter
55  float m_weight; // < 0 used for deleting
56  TSignedSeqPos m_target_id; // shift in deque<char> for 0 terminated string; deque is maintained by CAlignCollapser
57 };
58 
59 
// Record describing an intron: a range, a strand, an "oriented" flag and a signature string.
60 struct SIntron {
// Stores (a,b) as m_range and copies strand, oriented and sig into the same-named members.
61  SIntron(int a, int b, int strand, bool oriented, const string& sig) : m_range(a,b), m_strand(strand), m_oriented(oriented), m_sig(sig) {}
// Ordering used when SIntron is compared or used as a key:
//   1. by m_oriented (false sorts before true);
//   2. if both are oriented and the strands differ, by m_strand;
//   3. otherwise by m_range.
// m_strand is ignored when the introns are not oriented, and m_sig never takes part,
// so two introns differing only in those fields compare as equivalent.
62  bool operator<(const SIntron& i) const { // m_sig should not be included
63  if(m_oriented != i.m_oriented)
64  return m_oriented < i.m_oriented;
65  else if(m_oriented && m_strand != i.m_strand)
66  return m_strand < i.m_strand;
67  else
68  return m_range < i.m_range;
69 
70  }
// NOTE(review): listing line 71 is absent from this view; m_range (used above) is
// presumably declared there - confirm.
72  int m_strand;
73  bool m_oriented;
74  string m_sig;
75 };
76 
77 
// Holds an intron list plus a bit-flag word, and provides an ordering over both.
// NOTE(review): presumably this is the part shared by alignments that get collapsed together
// (the page's index lists a map keyed by CAlignCommon) - confirm against the implementation.
78 class CAlignCommon {
79 public:
80  typedef vector<SIntron> Tintrons;
// NOTE(review): listing line 81 is absent from this view.
// Constructs from a gene model; defined outside this header.
82  CAlignCommon(const CGeneModel& align);
// Read-only access to the stored introns.
83  const Tintrons& GetIntrons() const { return m_introns; }
// Produces a CAlignModel from an individual record and its target-id pool; defined outside this header.
84  CAlignModel GetAlignment(const SAlignIndividual& ali, const deque<char>& target_id_pool) const;
// Raw flag word; the individual bits are the private enum values below.
85  int GetFlags() const { return m_flags; }
// Each predicate below tests one bit of m_flags and returns true when that bit is set.
86  bool isSR() const { return (m_flags&esr); }
87  bool isEST() const { return (m_flags&eest); }
88  bool isPolyA() const { return (m_flags&epolya); }
89  bool isCap() const { return (m_flags&ecap); }
90  bool isUnknown() const { return (m_flags&eunknownorientation); }
91  bool isPlus() const { return (m_flags&eplus); }
92  bool isMinus() const { return (m_flags&eminus); }
// Ordering: first by the flag word, then by the number of introns, and finally by
// element-wise (lexicographic) comparison of the intron vectors via SIntron::operator<.
93  bool operator<(const CAlignCommon& cas) const {
94  if(m_flags != cas.m_flags)
95  return m_flags < cas.m_flags;
96  else if(m_introns.size() != cas.m_introns.size())
97  return m_introns.size() < cas.m_introns.size();
98  else
99  return m_introns < cas.m_introns;
100  }
101 
102 private:
// Bit values combined in m_flags.
103  enum {
104  esr = 1,
105  eest = 2,
106  epolya = 4,
107  ecap = 8,
// NOTE(review): listing line 108 is absent from this view; eunknownorientation (used by
// isUnknown above) is presumably defined there, likely as 16 given the gap between 8 and 32 - confirm.
109  eplus = 32,
110  eminus = 64
111  };
112 
// NOTE(review): listing line 113 is absent from this view; m_introns (used above) is
// presumably declared there - confirm.
114  int m_flags;
115 };
116 
118  list<TSignedSeqRange> m_confirmed_intervals; // include all "confirmed" or "corrected" positions
121 };
122 
123 
125 public:
// Every parameter has a default, so this constructor can also be called with no arguments.
// Defined outside this header.
126  CAlignCollapser(string contig = "", CScope* scope = 0, bool nofilteringcollapsing = false);
// Takes the same contig/scope pair as the constructor; defined outside this header.
127  void InitContig(string contig, CScope* scope);
// Takes the alignment by non-const reference, so the callee is allowed to modify it.
128  void AddAlignment(CAlignModel& align);
129  void FilterAlignments();
134 
135  //for compatibility with 'pre-correction' worker node
138 
// Replaces the stored correction data with a copy of correction_data.
140  void SetGenomicCorrections(const SCorrectionData& correction_data) { m_correction_data = correction_data; }
141 
// Static; defined outside this header.
// NOTE(review): presumably registers this class's command-line arguments on arg_desc - confirm in the implementation.
142  static void SetupArgDescriptions(CArgDescriptions* arg_desc);
143 
// Per-intron bookkeeping record. Every visible member has a default member initializer,
// so a default-constructed SIntronData starts with zero numbers and false flags.
// NOTE(review): the page's index lists a map from SIntron to SIntronData (TAlignIntrons),
// so this is presumably the value stored per intron - confirm.
144  struct SIntronData {
145  double m_weight = 0.;
146  double m_ident = 0.;
147  int m_sr_support = 0;
148  int m_est_support = 0;
// NOTE(review): listing line 149 is absent from this view; another member may be declared there.
150  int m_intron_num = 0;
151  bool m_keep_anyway = false;
152  bool m_selfsp_support = false;
153  bool m_not_cross = false;
154  };
156 
157 
// Comparison functor call for two CInDelInfo objects (the enclosing struct header,
// listing line 158, is absent from this view).
// If the two differ under CInDelInfo's own operator!=, their operator< decides;
// otherwise the tie is broken by the source accession string (GetSource().m_acc).
159  bool operator()(const CInDelInfo& a, const CInDelInfo& b) const
160  {
161  if(a != b)
162  return a < b;
163  else
164  return a.GetSource().m_acc < b.GetSource().m_acc;
165  }
166  };
167 
169  public:
// Loads the sequence interval [from, to] (both ends inclusive) from sv into m_string
// and records `from` as the offset used to translate positions into string indices.
170  void Init(const CSeqVector& sv, TSignedSeqPos from, TSignedSeqPos to) {
171  m_string.reserve(to-from+1);
// GetSeqData takes a half-open interval, hence to+1 to include position `to`.
172  sv.GetSeqData(from, to+1, m_string);
173  m_shift = from;
174  }
// Mutable access to the character at position p, where p is expressed in the same
// coordinates as the `from` passed to Init (translated by m_shift). No bounds check is made.
175  char& operator[](TSignedSeqPos p) { return m_string[p-m_shift]; }
// Read-only counterpart of the accessor above: same position translation, no bounds check.
176  const char& operator[](TSignedSeqPos p) const { return m_string[p-m_shift]; }
// Returns a copy of l characters starting at position p (translated by m_shift);
// range handling is whatever string::substr does for the translated index.
178  string substr(TSignedSeqPos p, TSignedSeqPos l) const { return m_string.substr(p-m_shift, l); }
// Replaces every character of the stored string, in place, with the result of toupper on it.
179  void ToUpper() {
180  for(char& c : m_string)
181  c = toupper(c);
182  }
183  private:
184  string m_string;
186  };
187 
188 
189 private:
// Private helpers; all are declared here and defined outside this header.
190  void CollapsIdentical();
// NOTE(review): listing lines 192-193 are absent from this view; the enum presumably has
// further enumerators before efill_middle - confirm.
191  enum {
194  efill_middle = 4
195  };
// The const helpers below take the alignment by non-const reference: they may modify the
// alignment passed in, but not the collapser itself.
196  void CleanSelfTranscript(CAlignModel& align, const string& trans) const;
197  void CleanExonEdge(int ie, CAlignModel& align, const string& transcript, bool right_edge) const;
// NOTE(review): listing lines 198 and 200-201 are absent from this view.
199  bool CheckAndInsert(const CAlignModel& align, TAlignModelClusterSet& clsset) const;
202  bool RemoveNotSupportedIntronsFromTranscript(CAlignModel& align, bool check_introns_on_both_strands) const;
// min_lim defaults to 0 here, whereas ClipESTorSR below requires it explicitly.
203  void ClipNotSupportedFlanks(CAlignModel& align, double clip_threshold, double min_lim = 0);
204  void ClipESTorSR(CAlignModel& align, double clip_threshold, double min_lim);
205 
212  map<tuple<int, int>, CAlignModel> m_special_aligns; // [left/right flex|cap/polya, position]
213 
214  int m_count;
225  double m_minident;
226 
232 
234  vector<double> m_coverage;
235 
237 };
238 
239 #define SPECIAL_ALIGN_LEN 110
240 #define NOT_ALIGNED_PHONY_CAGE 10
241 
242 END_SCOPE(gnomon)
244 
245 
246 #endif // ALGO_GNOMON___ALIGNCOLLAPSER__HPP
char & operator[](TSignedSeqPos p)
TSignedSeqPos FullLength() const
const char & operator[](TSignedSeqPos p) const
void Init(const CSeqVector &sv, TSignedSeqPos from, TSignedSeqPos to)
string substr(TSignedSeqPos p, TSignedSeqPos l) const
SCorrectionData GetGenomicCorrections() const
CAlignModel FillGapsInAlignmentAndAddToGenomicGaps(const CAlignModel &align, int fill)
void ClipProteinToStartStop(CAlignModel &align)
TSignedSeqRange m_range
void CleanExonEdge(int ie, CAlignModel &align, const string &transcript, bool right_edge) const
bool RemoveNotSupportedIntronsFromProt(CAlignModel &align)
CAlignCollapser(string contig="", CScope *scope=0, bool nofilteringcollapsing=false)
void AddAlignment(CAlignModel &align)
void InitContig(string contig, CScope *scope)
TInDels GetGenomicGaps() const
vector< double > m_coverage
map< CAlignCommon, deque< SAlignIndividual > > Tdata
CPartialString m_contig
map< int, int > TIntMap
void ClipNotSupportedFlanks(CAlignModel &align, double clip_threshold, double min_lim=0)
bool RemoveNotSupportedIntronsFromTranscript(CAlignModel &align, bool check_introns_on_both_strands) const
map< CAlignCommon, deque< char > > Tidpool
void GetOnlyOtherAlignments(TAlignModelClusterSet &clsset)
bool CheckAndInsert(const CAlignModel &align, TAlignModelClusterSet &clsset) const
void SetGenomicCorrections(const SCorrectionData &correction_data)
TAlignIntrons m_align_introns
SCorrectionData m_correction_data
void ClipESTorSR(CAlignModel &align, double clip_threshold, double min_lim)
TIntMap m_genomic_gaps_len
void GetCollapsedAlgnments(TAlignModelClusterSet &clsset)
TAlignModelList m_aligns_for_filtering_only
map< tuple< int, int >, CAlignModel > m_special_aligns
map< SIntron, SIntronData > TAlignIntrons
TIntMap GetContigGaps() const
void CleanSelfTranscript(CAlignModel &align, const string &trans) const
static void SetupArgDescriptions(CArgDescriptions *arg_desc)
bool operator<(const CAlignCommon &cas) const
int GetFlags() const
bool isCap() const
bool isSR() const
bool isPlus() const
CAlignModel GetAlignment(const SAlignIndividual &ali, const deque< char > &target_id_pool) const
bool isEST() const
bool isMinus() const
Tintrons m_introns
bool isUnknown() const
bool isPolyA() const
vector< SIntron > Tintrons
const Tintrons & GetIntrons() const
string TargetAccession() const
CArgDescriptions –.
Definition: ncbiargs.hpp:541
unsigned int & Status()
CScope –.
Definition: scope.hpp:92
CSeqVector –.
Definition: seq_vector.hpp:65
list< CAlignModel > TAlignModelList
vector< CInDelInfo > TInDels
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
#define NCBI_DEPRECATED
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
int i
Magic spell ;-) needed for some weird compilers... very empiric.
unsigned int a
Definition: ncbi_localip.c:102
Defines command line argument related classes.
int toupper(Uchar c)
Definition: ncbictype.hpp:73
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
bool operator()(const CInDelInfo &a, const CInDelInfo &b) const
TSignedSeqRange m_range
SAlignIndividual(const CAlignModel &align, deque< char > &target_id_pool)
TSignedSeqPos m_target_id
list< TSignedSeqRange > m_confirmed_intervals
map< int, char > m_replacements
TInDels m_correction_indels
bool m_oriented
string m_sig
TSignedSeqRange m_range
bool operator<(const SIntron &i) const
SIntron(int a, int b, int strand, bool oriented, const string &sig)
Modified on Sat Apr 13 11:46:16 2024 by modify_doxy.py rev. 669887