NCBI C++ ToolKit
prosplign.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef ALGO_ALIGN_PROSPLIGN__HPP
2 #define ALGO_ALIGN_PROSPLIGN__HPP
3 
4 /* $Id: prosplign.hpp 101425 2023-12-12 18:03:47Z dicuccio $
5 * ===========================================================================
6 *
7 * PUBLIC DOMAIN NOTICE
8 * National Center for Biotechnology Information
9 *
10 * This software/database is a "United States Government Work" under the
11 * terms of the United States Copyright Act. It was written as part of
12 * the author's official duties as a United States Government employee and
13 * thus cannot be copyrighted. This software/database is freely available
14 * to the public for use. The National Library of Medicine and the U.S.
15 * Government have not placed any restriction on its use or reproduction.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Author: Boris Kiryutin (prosplign algorithm and implementation)
30 * Author: Vyacheslav Chetvernin (this adapter)
31 *
32 * File Description:
33 * CProSplign class definition
34 * spliced protein to genomic sequence alignment
35 *
36 */
37 
38 #include <corelib/ncbistd.hpp>
39 #include <corelib/ncbiargs.hpp>
40 #include <corelib/ncbiobj.hpp>
42 #include <objmgr/seq_vector_ci.hpp>
43 
44 #include <list>
45 
47 
49  class CScope;
51 
52 /// Scoring parameters object
54 {
55 public:
56  static void SetupArgDescriptions(CArgDescriptions* argdescr);
57 
58  /// creates scoring parameter object with default values
60  CProSplignOptions_Base(const CArgs& args);
61 
62  CProSplignOptions_Base& SetScoreMatrix(const string& matrix_name);
63  const string& GetScoreMatrix() const;
64 
65  CProSplignOptions_Base& SetAltStarts(bool allow_alt_start);
66  bool GetAltStarts() const;
67 
68  static const char* default_score_matrix_name; // BLOSUM62
69  static const bool default_allow_alt_starts;
70 
71 private:
74 };
75 
77 {
78 public:
79  static void SetupArgDescriptions(CArgDescriptions* argdescr);
80 
81  /// creates scoring parameter object with default values
83 
84  CProSplignScoring(const CArgs& args);
85 
86 
87  CProSplignScoring& SetMinIntronLen(int);
88  int GetMinIntronLen() const;
89 
90 
91  /// In addition to ScoreMatrix, ProSplign uses the following costs (negate to get a score)
92 
93  CProSplignScoring& SetGapOpeningCost(int);
94  int GetGapOpeningCost() const;
95 
96  /// Gap Extension Cost for one aminoacid (three bases)
97  CProSplignScoring& SetGapExtensionCost(int);
98  int GetGapExtensionCost() const;
99 
100  CProSplignScoring& SetFrameshiftOpeningCost(int);
101  int GetFrameshiftOpeningCost() const;
102 
103  /// GT/AG intron opening cost
104  CProSplignScoring& SetGTIntronCost(int);
105  int GetGTIntronCost() const;
106  /// GC/AG intron opening cost
107  CProSplignScoring& SetGCIntronCost(int);
108  int GetGCIntronCost() const;
109  /// AT/AC intron opening cost
110  CProSplignScoring& SetATIntronCost(int);
111  int GetATIntronCost() const;
112 
113  /// Non Consensus Intron Cost
114  /// should not exceed a sum of lowest two intron opening costs,
115  /// i.e. intron_non_consensus cost <= intron_GT cost + intron_GC cost
116  CProSplignScoring& SetNonConsensusIntronCost(int);
117  int GetNonConsensusIntronCost() const;
118 
119  /// Inverted Intron Extension Cost
120  /// intron_extension cost for 1 base = 1/(inverted_intron_extension*3)
121  CProSplignScoring& SetInvertedIntronExtensionCost(int);
122  int GetInvertedIntronExtensionCost() const;
123 
124 public:
125  static const int default_min_intron_len = 30;
126 
127  static const int default_gap_opening = 10;
128  static const int default_gap_extension = 1;
129  static const int default_frameshift_opening = 30;
130 
131  static const int default_intron_GT = 15;
132  static const int default_intron_GC = 20;
133  static const int default_intron_AT = 25;
134  static const int default_intron_non_consensus = 34;
135  static const int default_inverted_intron_extension = 1000;
136 
137 private:
147 };
148 
149 /// Output filtering parameters
150 ///
151 /// ProSplign always makes a global alignment,
152 /// i.e. it aligns the whole protein no matter how bad some parts of this alignment might be.
153 /// Usually we don't want the bad pieces and remove them.
154 /// The following parameters define good parts.
156 {
157 public:
158  enum EMode {
159  /// default filtering parameters
161  /// all zeroes - no filtering
163  };
164 
165  static void SetupArgDescriptions(CArgDescriptions* argdescr);
166 
167  CProSplignOutputOptions(EMode mode = eWithHoles);
168  CProSplignOutputOptions(const CArgs& args);
169 
170  bool IsPassThrough() const;
171 
172  /// cut flanks if drop of positives is more than a dropoff
173  /// in comparison to positives in a window next to a flank
174  CProSplignOutputOptions& SetCutFlanksWithPositDrop(bool);
175  bool GetCutFlanksWithPositDrop() const;
176  CProSplignOutputOptions& SetCutFlanksWithPositDropoff(int);
177  int GetCutFlanksWithPositDropoff() const;
178  ///window size
179  CProSplignOutputOptions& SetCutFlanksWithPositWindow(int);
180  int GetCutFlanksWithPositWindow() const;
181  ///max flank size to cut
182  CProSplignOutputOptions& SetCutFlanksWithPositMaxLen(int);
183  int GetCutFlanksWithPositMaxLen() const;
184  /// count gaps as 1+1/gap_ratio; gap_ratio = 1 is the standard behaviour.
185  CProSplignOutputOptions& SetCutFlanksWithPositGapRatio(int);
186  int GetCutFlanksWithPositGapRatio() const;
187 
188  /// cut partial codons and adjacent at the beginning and at the end of good pieces
189  /// called at the end of post processing
190  CProSplignOutputOptions& SetCutFlankPartialCodons(bool);
191  bool GetCutFlankPartialCodons() const;
192 
193  /// fill back holes between good pieces. Flank regions remain unaligned
194  /// called at the end of post processing
195  CProSplignOutputOptions& SetFillHoles(bool);
196  bool GetFillHoles() const;
197 
198  /// fill back small holes between good pieces
199  /// holes with both unaligned protein and nucleotide portions
200  /// less than min_hole_len will be filled back
201  /// 0 - don't fill.
202  CProSplignOutputOptions& SetMinHoleLen(int);
203  int GetMinHoleLen() const;
204 
205  /// cut trailing Ns at the ends of good pieces.
206  /// called at the end of post processing
207  CProSplignOutputOptions& SetCutNs(bool);
208  bool GetCutNs() const;
209 
210  /// any length flank of a good piece should not be worse than this percentage threshold
211  CProSplignOutputOptions& SetFlankPositives(int);
212  int GetFlankPositives() const;
213  /// good piece total percentage threshold
214  CProSplignOutputOptions& SetTotalPositives(int);
215  int GetTotalPositives() const;
216 
217  /// any part of a good piece longer than max_bad_len should not be worse than min_positives
218  CProSplignOutputOptions& SetMaxBadLen(int);
219  int GetMaxBadLen() const;
220  CProSplignOutputOptions& SetMinPositives(int);
221  int GetMinPositives() const;
222 
223  /// minimum exon identity
224  CProSplignOutputOptions& SetMinExonId(int);
225  int GetMinExonId() const;
226  /// minimum exon positives percentage
227  CProSplignOutputOptions& SetMinExonPos(int);
228  int GetMinExonPos() const;
229 
230  /// minimum number of bases in the first and last exon
231  CProSplignOutputOptions& SetMinFlankingExonLen(int);
232  int GetMinFlankingExonLen() const;
233  /// good piece should not be shorter than that
234  CProSplignOutputOptions& SetMinGoodLen(int);
235  int GetMinGoodLen() const;
236 
237  /// reward (in # of positives?) for start codon match.
238  CProSplignOutputOptions& SetStartBonus(int);
239  int GetStartBonus() const;
240  /// reward for stop codon at the end. Not implemented yet
241  CProSplignOutputOptions& SetStopBonus(int);
242  int GetStopBonus() const;
243 
244 public:
245  static const bool default_cut_flanks_with_posit_drop = true;
246  static const int default_cut_flanks_with_posit_dropoff = 35;
247  static const int default_cut_flanks_with_posit_window = 90;
248  static const int default_cut_flanks_with_posit_max_len = 30;
249  static const int default_cut_flanks_with_posit_gap_ratio = 2;
250 
251  static const bool default_cut_flank_partial_codons = true;
252  static const bool default_fill_holes = false;
253  static const int default_min_hole_len = 200;
254  static const bool default_cut_ns = false;
255 
256  static const int default_flank_positives = 55;
257  static const int default_total_positives = 70;
258 
259  static const int default_max_bad_len = 45;
260  static const int default_min_positives = 15;
261 
262  static const int default_min_exon_id = 30;
263  static const int default_min_exon_pos = 55;
264 
265  static const int default_min_flanking_exon_len = 15;
266  static const int default_min_good_len = 59;
267 
268  static const int default_start_bonus = 8; /// ???
269  static const int default_stop_bonus = 8; /// ???
270 
271 private:
277 
281  bool cut_ns;
292 };
293 
294 class CProSplignText;
295 
296 /// spliced protein to genomic alignment
297 ///
299 {
300 public:
301 
302  /// By default ProSplign looks for introns.
303  /// Set intronless mode for protein to mRNA alignments, many viral genomes, etc.
304  CProSplign( CProSplignScoring scoring = CProSplignScoring(), bool intronless=false );
305  ~CProSplign();
306 
307  void SetTranslationTable(int gcode);
308 
309  /// For MT (multi-threaded) usage:
310  /// sets a signal for the core algorithm to interrupt calculations.
311  /// After this method is called from one thread for a ProSplign object,
312  /// FindAlignment for the same ProSplign object in another thread
313  /// is going to throw CProSplignException of "eUserInterrupt" type
314  void Interrupt(void);
315 
316  ///User interrupt logic for GBENCH. Set up callback function
317  ///when callback function returns true, CProSplignException of "eUserInterrupt" type will be thrown
318  typedef bool(* TInterruptFnPtr) (void *callback_data);
319  void SetInterruptCallback( TInterruptFnPtr prg_callback, void* data);
320 
321  /// Aligns protein to a region on genomic sequence.
322  /// genomic seq_loc should be a continuous region - an interval or a whole sequence
323  ///
324  /// Returns Spliced-seg
326  FindAlignment(objects::CScope& scope,
327  const objects::CSeq_id& protein,
328  const objects::CSeq_loc& genomic,
330  {
331  CRef<objects::CSeq_align> align_ref;
332  align_ref = FindGlobalAlignment(scope, protein, genomic);
333  align_ref = RefineAlignment(scope, *align_ref, output_options);
334  return align_ref;
335  }
336 
337  /// Globally aligns protein to a region on genomic sequence.
338  /// genomic seq_loc should be a continuous region - an interval or a whole sequence
339  ///
340  /// Returns Spliced-seg
342  FindGlobalAlignment(objects::CScope& scope,
343  const objects::CSeq_id& protein,
344  const objects::CSeq_loc& genomic);
345 
346  /// Refines Spliced-seg alignment by removing bad pieces according to output_options.
347  /// This is an irreversible action - more relaxed parameters will not change the alignment back
349  RefineAlignment(objects::CScope& scope,
350  const objects::CSeq_align& seq_align,
352 
353  /// Sets scores expected from execution of ProSplign
354  void SetScores(objects::CScope& scope,
355  objects::CSeq_align& seq_align,
357 
358 
359  //Use this method to set/change genetic code field in genomic ASN
360  //ProSplign tries to get genetic code from ASN. If that fails, standard code (1) is used.
361  //Note that when sequence source is fasta or BLAST db, genetic code is not set in ASN
362  void AssignGeneticCode(objects::CScope& scope, const objects::CSeq_id& gid, int gcode);
363 
364 
365  /// deprecated internals
366  CProSplign( CProSplignScoring scoring, bool intronless, bool one_stage, bool just_second_stage, bool old);
367  const vector<pair<int, int> >& GetExons() const;
368  vector<pair<int, int> >& SetExons();
369  void GetFlanks(bool& lgap, bool& rgap) const;
370  void SetFlanks(bool lgap, bool rgap);
371 
372 public:
373  class CImplementation;
374 private:
375  unique_ptr<CImplementation> m_implementation;
376 
377 private:
378  /// forbidden
381 };
382 
383 /// Text representation of ProSplign alignment
384 // dna : GATGAAACAGCACTAGTGACAGGTAAA----GATCTAAATATCGTTGA<skip>GGAAGACATCCATTGGCAATGGCAATGGCAT
385 // translation: D E T A L V T G K S K Y h hh I H
386 // match : | | + | | | | | | + ++ + | XXXXXbad partXXXXX
387 // protein : D E Q S F --- T G K E Y S K Y y.....intron.....yy L H D T S T E G
388 //
389 // there are no "<skip>", "intron", or "bad part" in actual values
391 public:
392  /// Outputs formatted text
393  static void Output(const objects::CSeq_align& seqalign, objects::CScope& scope, ostream& out, int width, const string& matrix_name = "BLOSUM62");
394 };
395 
397 
398 
399 #endif
void RefineAlignment(objects::CScope &scope, objects::CSeq_align &seq_align, const list< CNPiece > &good_parts)
void SetScores(objects::CSeq_align &seq_align, objects::CScope &scope, const string &matrix_name="BLOSUM62")
Boolean(* TInterruptFnPtr)(SBlastProgress *progress_info)
Prototype for function pointer to determine whether the BLAST search should proceed or be interrupted...
Definition: blast_def.h:354
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
CObject –.
Definition: ncbiobj.hpp:180
Scoring parameters object.
Definition: prosplign.hpp:54
static void SetupArgDescriptions(CArgDescriptions *argdescr)
Definition: prosplign.cpp:60
static const bool default_allow_alt_starts
Definition: prosplign.hpp:69
static const char * default_score_matrix_name
Definition: prosplign.hpp:68
Output filtering parameters.
Definition: prosplign.hpp:156
bool cut_flanks_with_posit_drop
???
Definition: prosplign.hpp:272
@ ePassThrough
all zeroes - no filtering
Definition: prosplign.hpp:162
@ eWithHoles
default filtering parameters
Definition: prosplign.hpp:160
int inverted_intron_extension
Definition: prosplign.hpp:146
Text representation of ProSplign alignment.
Definition: prosplign.hpp:390
spliced protein to genomic alignment
Definition: prosplign.hpp:299
CProSplign & operator=(const CProSplign &)
CRef< objects::CSeq_align > FindAlignment(objects::CScope &scope, const objects::CSeq_id &protein, const objects::CSeq_loc &genomic, CProSplignOutputOptions output_options=CProSplignOutputOptions())
Aligns protein to a region on genomic sequence.
Definition: prosplign.hpp:326
unique_ptr< CImplementation > m_implementation
Definition: prosplign.hpp:373
CProSplign(const CProSplign &)
forbidden
CScope –.
Definition: scope.hpp:92
Include a standard set of the NCBI C++ Toolkit most basic headers.
std::ofstream out("events_result.xml")
main entry point for tests
#define bool
Definition: bool.h:34
char data[12]
Definition: iconv.c:80
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define NCBI_XALGOALIGN_EXPORT
Definition: ncbi_export.h:985
mdb_mode_t mode
Definition: lmdb++.h:38
Defines command line argument related classes.
Portable reference counted smart and weak pointers using CWeakRef, CRef, CObject and CObjectEx.
Modified on Wed Sep 04 15:00:23 2024 by modify_doxy.py rev. 669887