NCBI C++ ToolKit
splign.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 
2 #ifndef ALGO_ALIGN_SPLIGN__HPP
3 #define ALGO_ALIGN_SPLIGN__HPP
4 
5 /* $Id: splign.hpp 100425 2023-07-31 13:44:51Z mozese2 $
6 * ===========================================================================
7 *
8 * PUBLIC DOMAIN NOTICE
9 * National Center for Biotechnology Information
10 *
11 * This software/database is a "United States Government Work" under the
12 * terms of the United States Copyright Act. It was written as part of
13 * the author's official duties as a United States Government employee and
14 * thus cannot be copyrighted. This software/database is freely available
15 * to the public for use. The National Library of Medicine and the U.S.
16 * Government have not placed any restriction on its use or reproduction.
17 *
18 * Although all reasonable efforts have been taken to ensure the accuracy
19 * and reliability of the software and data, the NLM and the U.S.
20 * Government do not and cannot warrant the performance or results that
21 * may be obtained by using this software or data. The NLM and the U.S.
22 * Government disclaim all warranties, express or implied, including
23 * warranties of performance, merchantability or fitness for any particular
24 * purpose.
25 *
26 * Please cite the author in any work or product based on this material.
27 *
28 * ===========================================================================
29 *
30 * Author: Yuri Kapustin
31 * Boris Kiryutin
32 *
33 * File Description:
34 * CSplign class definition
35 *
36 */
37 
38 #include <corelib/ncbistd.hpp>
39 #include <corelib/version_api.hpp>
40 
41 #include <objmgr/scope.hpp>
42 #include <objmgr/bioseq_handle.hpp>
43 
45 #include <util/range_coll.hpp>
46 
49 
51 
52 class CBlastTabular;
53 
// Test-type name strings.
// NOTE(review): presumably the values exchanged through CSplign::SetTestType() /
// GetTestType() - confirm against the implementation file.
54 const string kTestType_20_28_plus = "20_28_plus";//add-on to the 20_28_90_cut20 test type (aka test_plus mode)
55 const string kTestType_20_28 = "20_28_90_cut20"; // aka "test mode"
56 const string kTestType_production_default = "production_default";
57 
58 
60  class CScope;
61  class CSeq_id;
62  class CScore_set;
63  class CSeq_align_set;
64  class CSeqMap;
65  class CSeq_id_Handle;
67 
68 
69 /// CSplign is the central library object for computing spliced
70 /// cDNA-to-genomic alignments.
71 
72 
74 {
75 public:
76 
78 
79  CSplign(void);
80  ~CSplign();
81 
82  /// Retrieve the library's version object
83 
84  static CVersionAPI& s_GetVersion(void);
85 
86  /// Access the spliced aligner core object.
87 
88  CRef<TAligner>& SetAligner(void);
89  CConstRef<TAligner> GetAligner(void) const;
90  //set aligner scores to current settings of CSplign object (m_match_score and so on).
91  //CSplign scores could be changed manually or set to mrna/est presettings, see "basic scores" section below
92  //aligner should be created and CSplign scores should be set before SetAlignerScores call.
93  void SetAlignerScores(void);
94 
95  //just create an aligner
96  static CRef<CSplicedAligner> s_CreateDefaultAligner(void);
97  //create an aligner and set basic scores to mRNA or EST preset
98  static CRef<CSplicedAligner> s_CreateDefaultAligner(bool low_query_quality);
99 
100  /// Access the scope object that the library will use to retrieve the sequences
101 
102  CRef<objects::CScope> GetScope(void) const;
103  CRef<objects::CScope>& SetScope(void);
104 
105  /// Controls whether to clean the scope object's cache on a new sequence.
106  ///
107  /// @param preserve
108  /// When true, the sequences previously loaded into the scope will not
109  /// be deleted, which is feasible when working with a fixed number
110  /// of sequences e.g. in an interactive application. When false,
111  /// transcript sequences will always be cleared from the scope, and
112  /// genomic sequences will be cleared unless the requested sequence
113  /// is the same as the last one.
114 
115  void PreserveScope(bool preserve = true);
116 
117  void SetEndGapDetection(bool on);
118  bool GetEndGapDetection(void) const;
119 
120  void SetPolyaDetection(bool on);
121  bool GetPolyaDetection(void) const;
122 
123  void SetStrand(bool strand);
124  bool GetStrand(void) const;
125 
126  void SetMaxGenomicExtent(size_t mge);
127  static size_t s_GetDefaultMaxGenomicExtent(void);
128  size_t GetMaxGenomicExtent(void) const;
129 
130  void SetMaxIntron(size_t max_intron);
131  size_t GetMaxIntron(void) const;
132 
133  void SetCompartmentPenalty(double penalty);
134  static double s_GetDefaultCompartmentPenalty(void);
135  double GetCompartmentPenalty(void) const;
136 
137  void SetMinCompartmentIdentity(double idty);
138  static double s_GetDefaultMinCompartmentIdty(void);
139  double GetMinCompartmentIdentity(void) const;
140 
141  void SetMinSingletonIdentity(double idty);
142  double GetMinSingletonIdentity(void) const;
143 
144  void SetMinSingletonIdentityBps(size_t idty);
145  size_t GetMinSingletonIdentityBps(void) const;
146 
147  void SetMinExonIdentity(double idty);
148  static double s_GetDefaultMinExonIdty(void);
149  double GetMinExonIdentity(void) const;
150 
151  void SetPolyaExtIdentity(double idty);
152  static double s_GetDefaultPolyaExtIdty(void);
153  double GetPolyaExtIdentity(void) const;
154 
155  void SetMinPolyaLen(size_t len);
156  static size_t s_GetDefaultMinPolyaLen(void);
157  size_t GetMinPolyaLen(void) const;
158 
159  void SetMinHoleLen(size_t len);
160  static size_t s_GetDefaultMinHoleLen(void);
161  size_t GetMinHoleLen(void) const;
162 
163  void SetTrimToCodons(bool);
164  static bool s_GetDefaultTrimToCodons(void);
165  bool GetTrimToCodons(void) const;
166 
167  void SetMaxPartExonIdentDrop(double ident);
168  static double s_GetDefaultMaxPartExonIdentDrop(void);
169  double GetMaxPartExonIdentDrop(void) const;
170 
171  void SetTestType(const string& test_type);
172  string GetTestType(void) const;
173 
174 
175  //BEGIN basic scores
176 
179  eEstScoring
180  };
181 
182  //note: SetScoringType call with mRNA or EST type is going to switch basic scores to preset values
183  void SetScoringType(EScoringType type);
184  static EScoringType s_GetDefaultScoringType(void);
185  EScoringType GetScoringType(void) const;
186 
187  void SetMatchScore(int score);
188  static int s_GetDefaultMatchScore(void);
189  int GetMatchScore(void) const;
190 
191  void SetMismatchScore(int score);
192  static int s_GetDefaultMismatchScore(void);
193  int GetMismatchScore(void) const;
194 
195  void SetGapOpeningScore(int score);
196  static int s_GetDefaultGapOpeningScore(void);
197  int GetGapOpeningScore(void) const;
198 
199  void SetGapExtensionScore(int score);
200  static int s_GetDefaultGapExtensionScore(void);
201  int GetGapExtensionScore(void) const;
202 
203  void SetGtAgSpliceScore(int score);
204  static int s_GetDefaultGtAgSpliceScore(void);
205  int GetGtAgSpliceScore(void) const;
206 
207  void SetGcAgSpliceScore(int score);
208  static int s_GetDefaultGcAgSpliceScore(void);
209  int GetGcAgSpliceScore(void) const;
210 
211  void SetAtAcSpliceScore(int score);
212  static int s_GetDefaultAtAcSpliceScore(void);
213  int GetAtAcSpliceScore(void) const;
214 
215  void SetNonConsensusSpliceScore(int score);
216  static int s_GetDefaultNonConsensusSpliceScore(void);
217  int GetNonConsensusSpliceScore(void) const;
218 
219  //END basic scores
220 
/// Set the id that GetNextModelId() will report next.
///
/// @param model_id
///   Desired next model id. The value stored in m_model_id is model_id - 1,
///   which GetNextModelId() undoes by returning m_model_id + 1.
///   model_id is unsigned (size_t): passing 0 wraps the stored value around
///   to the maximum size_t, and the + 1 in GetNextModelId() wraps it back to 0.
221  void SetStartModelId(size_t model_id) {
222  m_model_id = model_id - 1;
223  }
/// Report the next model id without changing any state.
///
/// @return
///   m_model_id + 1. The method is const, so m_model_id itself is not advanced.
224  size_t GetNextModelId(void) const {
225  return m_model_id + 1;
226  }
227 
228  void SetMaxCompsPerQuery(size_t m);
229  size_t GetMaxCompsPerQuery(void) const;
230 
231 
/// Register externally supplied hard-mask ranges for one sequence.
///
/// @param idh
///   Seq-id handle used as the key in m_MaskMap.
/// @param mask_ranges
///   Range collection to store. It is copied into the map; ranges stored
///   earlier under the same handle are replaced, not merged.
233  void SetHardMaskRanges(objects::CSeq_id_Handle idh, const TSeqRangeColl& mask_ranges) {
234  m_MaskMap[idh] = mask_ranges;
235  }
236 
238  typedef vector<TSegment> TSegments;
239 
240 
241  // aligned compartment representation
243 
244  size_t m_Id;
245 
249  eStatus_Error
250  };
251 
253 
254  string m_Msg;
255  bool m_QueryStrand, m_SubjStrand;
256  size_t m_Cds_start, m_Cds_stop;
257  size_t m_QueryLen;
258  size_t m_PolyA;
259  float m_Score;
261 
263  m_Id(0),
264  m_Status(eStatus_Empty),
265  m_Cds_start(0), m_Cds_stop(0),
266  m_QueryLen (0),
267  m_PolyA(0),
268  m_Score(0)
269  {}
270 
// Construct a compartment carrying an id and a message.
// m_Id and m_Msg are taken from the arguments, m_Status is set to
// eStatus_Empty, and the CDS bounds, query length, poly-A value and score
// are zeroed.
// NOTE(review): m_QueryStrand and m_SubjStrand (plain bools) are not in this
// initializer list, so this constructor leaves them uninitialized - confirm
// that every reader assigns them first.
// NOTE(review): m_Msg is a string built from msg, so a null msg is presumably
// not allowed - confirm callers never pass one.
271  SAlignedCompartment(size_t id, const char* msg):
272  m_Id(id),
273  m_Status(eStatus_Empty),
274  m_Msg(msg),
275  m_Cds_start(0), m_Cds_stop(0),
276  m_QueryLen(0),
277  m_PolyA(0),
278  m_Score(0)
279  {}
280 
281  // return overall identity (including gaps)
282  double GetIdentity(void) const;
283 
284  // get aligned min/max on query and subject
285  void GetBox(Uint4* box) const;
286 
287  // save to / read from NetCache buffer
288  typedef vector<char> TNetCacheBuffer;
289  void ToBuffer (TNetCacheBuffer* buf) const;
290  void FromBuffer (const TNetCacheBuffer& buf);
291  };
292 
295  typedef vector<THitRef> THitRefs;
296 
297  // identify compartments and align each of them
298  void Run(THitRefs* hitrefs);
299  typedef vector<SAlignedCompartment> TResults;
300 
301  // retrieve results computed with Run()
/// @return
///   A const reference to the member m_result (a vector of
///   SAlignedCompartment); no copy is made.
302  const TResults& GetResult(void) const {
303  return m_result;
304  }
305 
306  // align single compartment within given genomic bounds
307  bool AlignSingleCompartment(THitRefs* hitrefs,
308  THit::TCoord range_left, THit::TCoord range_right,
309  SAlignedCompartment* result);
310 
311  // align single ASN.1 compartment
312  bool AlignSingleCompartment(CRef<objects::CSeq_align> compartment,
313  SAlignedCompartment* result);
314 
315 
316  // clear sequence vectors and scope - use with caution
317  void ClearMem(void);
318 
319  typedef pair<size_t,size_t> TOrf;
320  typedef pair<TOrf,TOrf> TOrfPair;
321  TOrfPair GetCds(const THit::TId & id, const vector<char> * seq_data = 0);
322 
323  static size_t s_TestPolyA(const char * seq, size_t dim, size_t cds_stop = 0);
324  bool IsPolyA(const char * seq, size_t polya_start, size_t dim);
325 
326  // alignment statistics
327 
329  eCS_InframeMatches = 20,
330  eCS_InframeIdentity = 22,
331  eCS_CombinationIdentity = 32
332  };
333 
// Selectors for s_ComputeStats(). Each enumerator is a distinct single bit
// (bit 0 and bit 1), matching the "bitwise OR of the eSF_* flags" wording in
// the s_ComputeStats() parameter documentation.
334  enum EStatFlags {
335  eSF_BasicNonCds = 1 << 0,
336  eSF_BasicCds = 1 << 1
337  };
338 
339  typedef list<CRef<objects::CScore_set> > TScoreSets;
340 
341  /// Generate statistics based on splign-generated seq-align-set,
342  /// with each seq-align corresponding to an aligned compartment.
343  ///
344  /// @param sas
345  /// [IN] Seq-align-set describing input alignments.
346  /// @param output_stats
347  /// [OUT] A pointer to the object to be filled in with computed stats.
348  /// @param cds
349  /// [IN] Coding region start and stop to use when computing cds-related stats.
350  /// If both are zero then no cds-related stats will be computed.
351  /// @param flags
352  /// [IN] Bitwise OR of the eSF_* flags specifying types of statistics to include.
353  /// @return
354  /// The number of elements written in output_stats.
355  static size_t s_ComputeStats(
357  TScoreSets * output_stats,
358  TOrf cds = TOrf(0, 0),
359  EStatFlags flags = eSF_BasicNonCds);
360 
361  /// Generate statistics based on splign-generated seq-align corresponding
362  /// to a single aligned compartment.
363  ///
364  /// @param sa
365  /// [IN] Seq-align describing one aligned compartment.
366  /// @param embed_scoreset
367  /// [IN] Decorate the input seq-align with the scores.
368  /// @param cds
369  /// [IN] Coding region start and stop to use when computing cds-related stats.
370  /// If both are zero then no cds-related stats will be computed.
371  /// @param flags
372  /// [IN] Bitwise OR of the eSF_* flags specifying types of statistics to include.
373  /// @return
374  /// A reference to a score-set object with the computed statistics.
375  static CRef<objects::CScore_set> s_ComputeStats(
377  bool embed_scoreset = true,
378  TOrf cds = TOrf(0, 0),
379  EStatFlags flags = eSF_BasicNonCds);
380 
381 protected:
382 
383  // the spliced alignment computing object
385 
386  // access to sequence data
389 
390  // alignment pattern
391  vector<size_t> m_pattern;
392 
393  //basic NW scores
403 
404  // min exon idty - others will be marked as gaps
406 
407  // min idty to extend alignment into polya
409 
410  // min polya length
412 
413  //minimum length of a gap between exons
414  //If a gap between exons is less than min_hole_len (on both query and subject),
415  //stitch them back together. The gap will be represented as regular alignment
416  //gaps inside the joint exon. 0 - don't stitch.
417  size_t m_MinHoleLen;
418 
419  //trim holes to codons
420  //Trim exons around a gap to full codons if CDS can be retrieved along with the query.
422 
423  // compartment penalty as a per cent of the query (mRna) length
425 
426  // min compartment idty - others will be skipped
428 
429  // min single compartment idty (per subject per strand) as a fraction of
430  // the query length and as an absolute value.
431  // The final value for the parameter is computed
432  // as min(m_MinSingletonIdty * query_length, m_MinSingletonIdtyBps)
434 
436 
437  string m_TestType;
438 
439  // external hard-mask data
442 
443 
444  // mandatory end gap detection flag
445  bool m_endgaps;
446 
447  // alignment map
// One element of the alignment map stored in m_alnmap.
448  struct SAlnMapElem {
// Four size_t coordinates.
// NOTE(review): presumably query/subject min/max as in
// SAlignedCompartment::GetBox() - confirm the element order.
449  size_t m_box [4];
// Signed 64-bit start/end values.
// NOTE(review): presumably positions within m_pattern - confirm.
450  Int8 m_pattern_start, m_pattern_end;
451  };
452  vector<SAlnMapElem> m_alnmap;
453 
456 
457  // query sequence
458  objects::CBioseq_Handle m_mrna_bio_handle;
459  vector<char> m_mrna;
460  bool m_strand;
462  bool m_nopolya;
463  vector<char> m_mrna_polya; // unmasked version used only for polya calcs
464 
465  // in antisense, these are computed based on a reverse-
466  // complementary sequence, so start is still less than stop
467  size_t m_cds_start;
468  size_t m_cds_stop;
469 
470  // genomic sequence
471  vector<char> m_genomic;
473 
474  // max space to look beyond end hits
476 
477  // max intron length
478  size_t m_MaxIntron;
479 
480  // max part of exon identity drop
481  //If identity near alignment gap drops more, the low identity part will be trimmed out.
483 
484  // The limiting range as defined by the compartment hits,
485  // if the max compartment hit identity is less than a cut-off.
486  pair<size_t, size_t> m_BoundingRange;
487 
488  // output per compartment
490 
491  // all compartments
492  size_t m_model_id;
494 
497 
498 
499 
500  SAlignedCompartment x_RunOnCompartment( THitRefs* hitrefs,
501  size_t range_left,
502  size_t range_right);
503 
504  float x_Run(const char* seq1, const char* seq2);
505 
506  void x_SplitQualifyingHits(THitRefs* phitrefs);
507  void x_SetPattern(THitRefs* hitrefs);
508  bool x_ProcessTermSegm(TSegment** term_segs, Uint1 side) const;
509  size_t x_GetGenomicExtent(const size_t query_extent, size_t max_ext = 0) const;
510  void x_FinalizeAlignedCompartment(SAlignedCompartment & ac);
511 
512  void x_LoadSequence(vector<char>* seq,
513  const objects::CSeq_id& seqid,
514  THit::TCoord start,
515  THit::TCoord finish,
516  bool retain, bool is_genomic = false, bool genomic_strand = true);
517  void x_MaskSequence(vector<char>* seq,
518  const TSeqRangeColl& mask_ranges,
519  THit::TCoord start,
520  THit::TCoord finish);
521 
522  //checks if position belongs to the genomic gap
523  //gap information comes from ASN (CSeqMap).
524  // FASTA or LDS does not support gap info. If the sequence comes from a FASTA file or LDS, the method will always return 'false'
525  //coordinates are resolved, meaning that 'm_genomic' coordinates should be used.
526  bool x_IsInGap(size_t pos);
527 
528  static THitRef sx_NewHit(THit::TCoord q0, THit::TCoord q,
529  THit::TCoord s0, THit::TCoord s);
530 
531  /// forbidden
532  CSplign(const CSplign&);
534 };
535 
536 
538 
539 
540 #endif
TParent::TCoord TCoord
CConstRef –.
Definition: ncbiobj.hpp:1266
CObject –.
Definition: ncbiobj.hpp:180
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
CScore_set –.
Definition: Score_set.hpp:66
CSeqMap –.
Definition: seq_map.hpp:93
CSplign is the central library object for computing spliced cDNA-to-genomic alignments.
Definition: splign.hpp:74
EStatFlags
Definition: splign.hpp:334
bool m_nopolya
Definition: splign.hpp:462
map< string, TOrfPair > TStrIdToOrfs
Definition: splign.hpp:454
CSplign(const CSplign &)
forbidden
void SetHardMaskRanges(objects::CSeq_id_Handle idh, const TSeqRangeColl &mask_ranges)
Definition: splign.hpp:233
int m_NonConsensusSpliceScore
Definition: splign.hpp:402
list< CRef< objects::CScore_set > > TScoreSets
Definition: splign.hpp:339
int m_GcAgSpliceScore
Definition: splign.hpp:400
int m_MatchScore
Definition: splign.hpp:395
EScoringType
Definition: splign.hpp:177
@ eMrnaScoring
Definition: splign.hpp:178
string m_TestType
Definition: splign.hpp:437
size_t m_cds_stop
Definition: splign.hpp:468
TSegments m_segments
Definition: splign.hpp:489
size_t m_MinSingletonIdtyBps
Definition: splign.hpp:435
TResults m_result
Definition: splign.hpp:493
pair< size_t, size_t > m_BoundingRange
Definition: splign.hpp:486
vector< size_t > m_pattern
Definition: splign.hpp:391
pair< size_t, size_t > TOrf
Definition: splign.hpp:319
bool m_endgaps
Definition: splign.hpp:445
CConstRef< objects::CSeqMap > m_GenomicSeqMap
Definition: splign.hpp:472
EScoringType m_ScoringType
Definition: splign.hpp:394
int m_GapExtensionScore
Definition: splign.hpp:398
CSplign & operator=(const CSplign &)
TSeqPos m_polya_start
Definition: splign.hpp:461
CBlastTabular THit
Definition: splign.hpp:293
vector< char > m_genomic
Definition: splign.hpp:471
int m_AtAcSpliceScore
Definition: splign.hpp:401
int m_GapOpeningScore
Definition: splign.hpp:397
vector< char > m_mrna_polya
Definition: splign.hpp:463
size_t m_MaxCompsPerQuery
Definition: splign.hpp:495
const TResults & GetResult(void) const
Definition: splign.hpp:302
double m_MinExonIdty
Definition: splign.hpp:405
CSplicedAligner TAligner
Definition: splign.hpp:77
CRef< THit > THitRef
Definition: splign.hpp:294
vector< char > m_mrna
Definition: splign.hpp:459
size_t m_MinPolyaLen
Definition: splign.hpp:411
size_t m_max_genomic_ext
Definition: splign.hpp:475
void SetStartModelId(size_t model_id)
Definition: splign.hpp:221
double m_MinSingletonIdty
Definition: splign.hpp:433
CRef< objects::CScope > m_Scope
Definition: splign.hpp:387
int m_GtAgSpliceScore
Definition: splign.hpp:399
bool m_CanResetHistory
Definition: splign.hpp:388
size_t GetNextModelId(void) const
Definition: splign.hpp:224
vector< SAlignedCompartment > TResults
Definition: splign.hpp:299
bool m_strand
Definition: splign.hpp:460
vector< THitRef > THitRefs
Definition: splign.hpp:295
size_t m_model_id
Definition: splign.hpp:492
double m_MinCompartmentIdty
Definition: splign.hpp:427
size_t m_MinHoleLen
Definition: splign.hpp:417
map< objects::CSeq_id_Handle, TSeqRangeColl > TSIHToMaskRanges
Definition: splign.hpp:440
double m_MaxPartExonIdentDrop
Definition: splign.hpp:482
ECDSCompartmentScores
Definition: splign.hpp:328
size_t m_MaxIntron
Definition: splign.hpp:478
size_t m_MinPatternHitLength
Definition: splign.hpp:496
bool m_TrimToCodons
Definition: splign.hpp:421
TSIHToMaskRanges m_MaskMap
Definition: splign.hpp:441
CRangeCollection< TSeqPos > TSeqRangeColl
Definition: splign.hpp:232
TStrIdToOrfs m_OrfMap
Definition: splign.hpp:455
vector< TSegment > TSegments
Definition: splign.hpp:238
size_t m_cds_start
Definition: splign.hpp:467
double m_CompartmentPenalty
Definition: splign.hpp:424
pair< TOrf, TOrf > TOrfPair
Definition: splign.hpp:320
objects::CBioseq_Handle m_mrna_bio_handle
Definition: splign.hpp:458
double m_MinPolyaExtIdty
Definition: splign.hpp:408
CRef< TAligner > m_aligner
Definition: splign.hpp:384
CNWFormatter::SSegment TSegment
Definition: splign.hpp:237
int m_MismatchScore
Definition: splign.hpp:396
vector< SAlnMapElem > m_alnmap
Definition: splign.hpp:452
CSplign::THitRefs THitRefs
Include a standard set of the NCBI C++ Toolkit most basic headers.
static uch flags
static void test_type(TDSSOCKET *tds, TDSCOLUMN *col)
Definition: all_types.c:18
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
ENa_strand GetStrand(const CSeq_loc &loc, CScope *scope=0)
Returns eNa_strand_unknown if multiple Bioseqs in loc Returns eNa_strand_other if multiple strands in...
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
void Run(void)
Enter the main loop.
#define NCBI_XALGOALIGN_EXPORT
Definition: ncbi_export.h:985
char * buf
int len
const string kTestType_20_28
Definition: splign.hpp:55
const string kTestType_20_28_plus
Definition: splign.hpp:54
const string kTestType_production_default
Definition: splign.hpp:56
ECompartmentStatus m_Status
Definition: splign.hpp:252
vector< char > TNetCacheBuffer
Definition: splign.hpp:288
SAlignedCompartment(size_t id, const char *msg)
Definition: splign.hpp:271
Definition: type.c:6
else result
Definition: token2.c:20
CScope & GetScope()
Modified on Sat May 25 14:18:08 2024 by modify_doxy.py rev. 669887