NCBI C++ ToolKit
chainer.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef ALGO_GNOMON___CHAINER__HPP
2 #define ALGO_GNOMON___CHAINER__HPP
3 
4 /* $Id: chainer.hpp 101632 2024-01-17 15:27:49Z souvorov $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Authors: Alexandre Souvorov
30  *
31  * File Description:
32  *
33  */
34 
37 #include <algo/gnomon/pcsf.hpp>
38 
40 
41 class CArgDescriptions;
42 class CArgs;
43 
44 BEGIN_SCOPE(gnomon)
45 
46 class CHMMParameters;
47 
/// Bundle of numeric scoring settings; CChainer::SetMinScor() hands out a
/// mutable reference to an instance so callers can fill it in field by field.
///
/// This is a plain aggregate with no default member initializers, so every
/// field a caller relies on must be assigned explicitly.
///
/// The members m_length_penalty, m_prot_cds_len, m_utr_clip_threshold,
/// m_minsupport, m_minsupport_mrna and m_minsupport_rnaseq were missing from
/// this copy of the listing (its lines 53, 56 and 58-61); they are restored
/// here in listing order from the file's own cross-reference index.
/// NOTE(review): confirm the restored declarations against the repository.
///
/// NOTE(review): the meaning of each field is not visible in this header;
/// the names suggest score thresholds, penalties and minimum lengths/support
/// counts - confirm in chainer.cpp before relying on that.
struct SMinScor {
    double m_min;
    double m_i5p_penalty;
    double m_i3p_penalty;
    double m_cds_bonus;
    double m_length_penalty;
    double m_minprotfrac;
    double m_endprotfrac;
    int m_prot_cds_len;
    int m_cds_len;
    double m_utr_clip_threshold;
    int m_minsupport;
    int m_minsupport_mrna;
    int m_minsupport_rnaseq;
    int m_minlen;
};
64 
65 
66 // redefine STL algorithms to take Function Object pointer to allow for inheritance
67 
68 template <class Container, class Predicate>
69 void remove_if(Container& c, Predicate* __pred)
70 {
71  typedef typename Container::iterator Iterator;
72  Iterator __first = c.begin();
73  Iterator __last = c.end();
74  while (__first != __last) {
75  if ((*__pred)(*__first)) {
76  (__first)->Status() |= CGeneModel::eSkipped;
77  (__first)->AddComment(__pred->GetComment());
78  __first = c.erase(__first);
79  } else
80  ++__first;
81  }
82  delete __pred;
83 }
84 
/// Apply a heap-allocated function object to every element of @a c, in
/// iteration order, then destroy the function object.
///
/// @param c   Container whose elements are passed, one at a time and by
///            reference, to (*op)(element).
/// @param op  Heap-allocated function object. This function takes ownership
///            and destroys it before returning, including when a call throws.
template <class Container, class UnaryFunction>
void transform(Container& c, UnaryFunction* op)
{
    // Take ownership up front: a trailing `delete` would be skipped (and the
    // function object leaked) if any call to it threw.
    const std::unique_ptr<UnaryFunction> owner(op);

    for (auto&& element : c)
        (*owner)(element);
}
95 
97  virtual ~TransformFunction() {}
100 
101  virtual void transform_model(CGeneModel& /*a*/) {}
103 };
// Polymorphic base for filters over gene models and alignments.
// The remove_if() helper defined earlier in this header calls the predicate
// through operator(); on a true result it stores GetComment() on the element
// and erases it from the container.
// NOTE(review): listing lines 107-108 are absent from this copy. The
// cross-reference index at the end of the listing places two operator()
// overloads there (taking CGeneModel& and CAlignModel&) - confirm against
// the repository.
104 struct Predicate {
105  virtual ~Predicate() {}
 // Text describing why an item was rejected; this base version returns a
 // generic placeholder.
106  virtual string GetComment() { return "reason not given"; }
109 
 // Test applied to a gene model; the base version never matches.
110  virtual bool model_predicate(CGeneModel& /*a*/) { return false; }
 // Test applied to an alignment; the base version forwards to model_predicate().
111  virtual bool align_predicate(CAlignModel& a) { return model_predicate(a); }
112 };
113 
115 public:
117  virtual ~CGnomonAnnotator_Base();
118 
119  void SetHMMParameters(CHMMParameters* params);
120  void EnableSeqMasking();
121  void SetGenomic(const CResidueVec& seq);
122  void SetGenomic(const CSeq_id& seqid, objects::CScope& scope, const string& mask_annots = kEmptyStr, const TGeneModelList* models = 0);
123  void SetGenomic(const CSeq_id& seqid, objects::CScope& scope, const SCorrectionData& correction_data, TSignedSeqRange range = TSignedSeqRange::GetWhole(), const string& mask_annots = kEmptyStr);
124  void SetPCSF(const CPhyloCSFData* pcsf_datap) { m_pcsf_data = pcsf_datap; }
125 
126  CGnomonEngine& GetGnomon();
127  void MapAlignmentsToEditedContig(TAlignModelList& alignments) const;
128  void MapModelsToEditedContig(TGeneModelList& models) const;
129  void MapModelsToOrigContig(TGeneModelList& models) const;
130 
133 
134 protected:
135  CAlignModel MapOneModelToEditedContig(const CGeneModel& align) const;
136  CGeneModel MapOneModelToOrigContig(const CGeneModel& srcmodel) const;
137 
138  bool m_masking;
140  unique_ptr<CGnomonEngine> m_gnomon;
142  TInDels m_editing_indels; // in original coordinates (include corrections, ggaps and Ns)
143  TInDels m_reversed_corrections; // corrections from edited genome back to original (without ggaps or Ns)
144  TIntMap m_confirmed_bases_len; // include all "confirmed" or "corrected" positions in corrected coordinates
145  TIntMap m_confirmed_bases_orig_len; // include all "confirmed" or "corrected" positions in original coordinates
146  map<int,char> m_replacements; // in original coordinates
147  map<int,char> m_replaced_bases; // in original coordinates; just in case
148  TGgapInfo m_inserted_seqs; // edited left coord to indelinfo for ggaps
149  TIntMap m_notbridgeable_gaps_len; // don't allow introns to cross this
150  TSignedSeqRange m_limits; // limits on contig
151  string m_contig_acc;
152  unique_ptr<SPhyloCSFSlice> m_pcsf_slice;
153  const CPhyloCSFData* m_pcsf_data = nullptr;
154  double m_pcsf_factor = 0.;
155 };
156 
157 ////////////////////////////////////////////////////////////////////////
158 class CChainerArgUtil;
159 
161 public:
162  CChainer();
163  ~CChainer();
164 
165  void SetIntersectLimit(int value);
166  void SetTrim(int trim);
167  void SetMinPolyA(int minpolya);
168  SMinScor& SetMinScor();
169  void SetMinInframeFrac(double mininframefrac);
170  map<string, pair<bool,bool> >& SetProtComplet();
171  map<string,TSignedSeqRange>& SetMrnaCDS();
172  void SetGenomicRange(const TAlignModelList& alignments);
173  void SetNumbering(int idnext, int idinc);
174  void SetOnlyBestFs(bool onlybestfs);
175 
180  TransformFunction* ProjectCDS(objects::CScope& scope);
182  void SetConfirmedStartStopForProteinAlignments(TAlignModelList& alignments);
183  void DropAlignmentInfo(TAlignModelList& alignments, TGeneModelList& models);
184  void FilterOutChimeras(TGeneModelList& clust);
185  void ScoreCDSes_FilterOutPoorAlignments(TGeneModelList& clust);
186  void FindSelenoproteinsClipProteinsToStartStop(TGeneModelList& clust);
187  void CutParts(TGeneModelList& models);
188  TGeneModelList MakeChains(TGeneModelList& models);
189 
190 private:
191  // Prohibit copy constructor and assignment operator
193  CChainer& operator= (const CChainer& value);
194 
195  class CChainerImpl;
196  unique_ptr<CChainerImpl> m_data;
197 
198  friend class CChainerArgUtil;
199 };
200 
202  MarkupCappedEst(const set<string>& _caps, int _capgap);
203 
205  int capgap;
206 
207  virtual void transform_align(CAlignModel& align);
208 };
209 
211  MarkupTrustedGenes(const set<string>& _trusted_genes);
213 
214  virtual void transform_align(CAlignModel& align);
215 };
216 
// Predicate carrying two numeric settings, hthresh and hmaxlen; its
// model_predicate() is only declared here and is defined outside this header.
// NOTE(review): listing lines 218 and 220 are absent from this copy. The
// cross-reference index at the end of the listing shows a constructor
// ProteinWithBigHole(double hthresh, double hmaxlen, CGnomonEngine& gnomon)
// and a `CGnomonEngine& gnomon` member at line 220 - confirm against the
// repository.
217 struct ProteinWithBigHole : public Predicate {
219  double hthresh, hmaxlen;
221 
 // Overrides Predicate::model_predicate(); defined out of line.
222  virtual bool model_predicate(CGeneModel& align);
223 };
224 
225 struct CdnaWithHole : public Predicate {
226  virtual bool model_predicate(CGeneModel& align);
227 };
228 
// Predicate subclass whose model_predicate() is declared here and defined
// outside this header.
// NOTE(review): listing lines 230-231 are absent from this copy. The
// cross-reference index at the end of the listing shows a constructor
// HasShortIntron(CGnomonEngine& gnomon) and a `CGnomonEngine& gnomon` member
// at line 231 - confirm against the repository.
229 struct HasShortIntron : public Predicate {
 // Overrides Predicate::model_predicate(); defined out of line.
232  virtual bool model_predicate(CGeneModel& align);
233 };
234 
// Predicate subclass whose model_predicate() is declared here and defined
// outside this header.
// NOTE(review): listing lines 236-237 are absent from this copy. The
// cross-reference index at the end of the listing shows a constructor
// HasLongIntron(CGnomonEngine& gnomon) and a `CGnomonEngine& gnomon` member
// at line 237 - confirm against the repository.
235 struct HasLongIntron : public Predicate {
 // Overrides Predicate::model_predicate(); defined out of line.
238  virtual bool model_predicate(CGeneModel& align);
239 };
240 
243  int minex;
244 
245  virtual void transform_align(CAlignModel& align);
246 };
247 
248 struct HasNoExons : public Predicate {
249  virtual bool model_predicate(CGeneModel& align);
250 };
251 
252 struct SingleExon_AllEst : public Predicate {
253  virtual bool model_predicate(CGeneModel& align);
254 };
255 
257  virtual bool model_predicate(CGeneModel& align);
258 };
259 
261  LowSupport_Noncoding(int _minsupport);
263  virtual bool model_predicate(CGeneModel& align);
264 };
265 
267 public:
268  static void SetupArgDescriptions(CArgDescriptions* arg_desc);
269  static void ArgsToChainer(CChainer* chainer, const CArgs& args, objects::CScope& scope);
270 };
271 
272 END_SCOPE(gnomon)
274 
275 
276 #endif // ALGO_GNOMON___CHAINER__HPP
void remove_if(Container &c, Predicate *__pred)
Definition: chainer.hpp:69
void transform(Container &c, UnaryFunction *op)
Definition: chainer.hpp:86
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
CChainer(const CChainer &value)
void SetOnlyBestFs(bool onlybestfs)
unique_ptr< CChainerImpl > m_data
Definition: chainer.hpp:195
TInDels m_editing_indels
Definition: chainer.hpp:142
unique_ptr< CGnomonEngine > m_gnomon
Definition: chainer.hpp:140
map< int, char > m_replaced_bases
Definition: chainer.hpp:147
map< int, TInDels::const_iterator > TGgapInfo
Definition: chainer.hpp:131
void SetPCSF(const CPhyloCSFData *pcsf_datap)
Definition: chainer.hpp:124
TGgapInfo m_inserted_seqs
Definition: chainer.hpp:148
TSignedSeqRange m_limits
Definition: chainer.hpp:150
TInDels m_reversed_corrections
Definition: chainer.hpp:143
map< int, char > m_replacements
Definition: chainer.hpp:146
unique_ptr< SPhyloCSFSlice > m_pcsf_slice
Definition: chainer.hpp:152
void SetGenomic(const CSeq_id &seqid, objects::CScope &scope, const SCorrectionData &correction_data, TSignedSeqRange range=TSignedSeqRange::GetWhole(), const string &mask_annots=kEmptyStr)
TIntMap m_confirmed_bases_len
Definition: chainer.hpp:144
TIntMap m_confirmed_bases_orig_len
Definition: chainer.hpp:145
CAlignMap m_edited_contig_map
Definition: chainer.hpp:141
map< int, int > TIntMap
Definition: chainer.hpp:132
CRef< CHMMParameters > m_hmm_params
Definition: chainer.hpp:139
void SetGenomic(const CSeq_id &seqid, objects::CScope &scope, const string &mask_annots=kEmptyStr, const TGeneModelList *models=0)
TIntMap m_notbridgeable_gaps_len
Definition: chainer.hpp:149
HMM model parameters just create it and pass to a Gnomon engine.
Definition: gnomon.hpp:55
vector< TResidue > CResidueVec
list< CAlignModel > TAlignModelList
list< CGeneModel > TGeneModelList
vector< CInDelInfo > TInDels
static TThisType GetWhole(void)
Definition: range.hpp:272
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define kEmptyStr
Definition: ncbistr.hpp:123
#define NCBI_XALGOGNOMON_EXPORT
Definition: ncbi_export.h:1001
void AddComment(CSeq_feat &feat, const string &comment)
Definition: utils.cpp:44
range(_Ty, _Ty) -> range< _Ty >
Magic spell ;-) needed for some weird compilers... very empiric.
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
unsigned int a
Definition: ncbi_localip.c:102
virtual bool model_predicate(CGeneModel &align)
Definition: chainer.cpp:8577
CutShortPartialExons(int minex)
Definition: chainer.cpp:8614
virtual void transform_align(CAlignModel &align)
Definition: chainer.cpp:8623
CGnomonEngine & gnomon
Definition: chainer.hpp:237
virtual bool model_predicate(CGeneModel &align)
Definition: chainer.cpp:8602
HasLongIntron(CGnomonEngine &gnomon)
Definition: chainer.cpp:8599
virtual bool model_predicate(CGeneModel &align)
Definition: chainer.cpp:8723
CGnomonEngine & gnomon
Definition: chainer.hpp:231
virtual bool model_predicate(CGeneModel &align)
Definition: chainer.cpp:8587
HasShortIntron(CGnomonEngine &gnomon)
Definition: chainer.cpp:8584
virtual bool model_predicate(CGeneModel &align)
Definition: chainer.cpp:8741
LowSupport_Noncoding(int _minsupport)
Definition: chainer.cpp:8738
MarkupCappedEst(const set< string > &_caps, int _capgap)
Definition: chainer.cpp:8523
virtual void transform_align(CAlignModel &align)
Definition: chainer.cpp:8528
const set< string > & caps
Definition: chainer.hpp:204
MarkupTrustedGenes(const set< string > &_trusted_genes)
Definition: chainer.cpp:8538
virtual void transform_align(CAlignModel &align)
Definition: chainer.cpp:8540
const set< string > & trusted_genes
Definition: chainer.hpp:212
bool operator()(CGeneModel &a)
Definition: chainer.hpp:107
virtual ~Predicate()
Definition: chainer.hpp:105
bool operator()(CAlignModel &a)
Definition: chainer.hpp:108
virtual bool align_predicate(CAlignModel &a)
Definition: chainer.hpp:111
virtual bool model_predicate(CGeneModel &)
Definition: chainer.hpp:110
virtual string GetComment()
Definition: chainer.hpp:106
ProteinWithBigHole(double hthresh, double hmaxlen, CGnomonEngine &gnomon)
Definition: chainer.cpp:8553
virtual bool model_predicate(CGeneModel &align)
Definition: chainer.cpp:8555
CGnomonEngine & gnomon
Definition: chainer.hpp:220
double m_i5p_penalty
Definition: chainer.hpp:50
double m_cds_bonus
Definition: chainer.hpp:52
int m_cds_len
Definition: chainer.hpp:57
double m_minprotfrac
Definition: chainer.hpp:54
int m_prot_cds_len
Definition: chainer.hpp:56
int m_minsupport_rnaseq
Definition: chainer.hpp:61
double m_i3p_penalty
Definition: chainer.hpp:51
int m_minlen
Definition: chainer.hpp:62
double m_utr_clip_threshold
Definition: chainer.hpp:58
int m_minsupport
Definition: chainer.hpp:59
double m_min
Definition: chainer.hpp:49
double m_endprotfrac
Definition: chainer.hpp:55
int m_minsupport_mrna
Definition: chainer.hpp:60
double m_length_penalty
Definition: chainer.hpp:53
virtual bool model_predicate(CGeneModel &align)
Definition: chainer.cpp:8728
virtual bool model_predicate(CGeneModel &align)
Definition: chainer.cpp:8733
void operator()(CGeneModel &a)
Definition: chainer.hpp:98
virtual void transform_model(CGeneModel &)
Definition: chainer.hpp:101
void operator()(CAlignModel &a)
Definition: chainer.hpp:99
virtual ~TransformFunction()
Definition: chainer.hpp:97
virtual void transform_align(CAlignModel &a)
Definition: chainer.hpp:102
Modified on Tue May 21 10:56:40 2024 by modify_doxy.py rev. 669887