NCBI C++ ToolKit
gene_model.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef ALGO_SEQUENCE___GENE_MODEL__HPP
2 #define ALGO_SEQUENCE___GENE_MODEL__HPP
3 
4 /* $Id: gene_model.hpp 98412 2022-11-09 16:24:43Z mozese2 $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Authors: Mike DiCuccio
30  *
31  * File Description:
32  *
33  */
34 
35 #include <corelib/ncbistd.hpp>
36 #include <corelib/ncbiobj.hpp>
37 #include <corelib/ncbiexpt.hpp>
38 #include <util/range.hpp>
42 
45  class CScope;
46  class CSeq_feat;
47  class CSeq_align;
48  class CSeq_annot;
49  class CBioseq_set;
50  class CBioseq_Handle;
51  struct SAnnotSelector;
53 
55  {
56  public:
57  enum EErrCode {
60  };
61  virtual const char* GetErrCodeString(void) const override {
62  switch ( GetErrCode() ) {
63  case eUnknown:
64  return "Unknown error";
65  case eMicroIntrons:
66  return "MicroIntron generation failure";
67  default:
69  }
70  }
72  };
73 
74 
76 {
77 public:
80 
81  CFeatureGenerator(objects::CScope& scope);
83 
85 
86  // CleanAlignment flags
87  fTrimEnds = 0x1000, // trim ends to codon boundaries (protein or mrna with CDS partially aligned)
88  fMaximizeTranslation = 0x2000, // leave only 1-2 base indels:
89  // minimize product-ins modulo 3,
90  // replace complete genomic-ins triplets with diags
91  // recalculate query positions.
92  // Need to be careful with transcript queries -
93  // cdregion passed to convert should correspond to the modified
94  // query positions
95 
96  // Convert flags
97  fCreateGene = 0x001,
98  fCreateMrna = 0x002,
99  fCreateCdregion = 0x004,
100  fPromoteAllFeatures = 0x008,
101  fPropagateOnly = 0x010,
102  fForceTranslateCds = 0x020,
103  fForceTranscribeMrna = 0x040,
104  fDensegAsExon = 0x080,
105  fGenerateLocalIds = 0x100, // uses current date
106  fGenerateStableLocalIds = 0x200, // reproducible ids
107  fPropagateNcrnaFeats = 0x400,
108  fTrustProteinSeq = 0x800,
109  // already-used: = 0x1000,
110  // already-used: = 0x2000,
111  fDeNovoProducts = 0x4000,
112  fAddTranslatedCDSAssembly = 0x8000, // add translated_cds_bioseq->SetInst().SetHist().SetAssembly().push_back(align)
113  fDropManeMarkup = 0x00010000,
114  fSkipLocationCheck = 0x00020000,
115 
116  fDefaults = fCreateGene | fCreateMrna | fCreateCdregion |
117  fGenerateLocalIds | fPropagateNcrnaFeats
118  };
120 
122  fProduct = 1,
123  fGenomic = 2,
124  fBoth = 3
125  };
126 
127  static const TSeqPos kDefaultMinIntron = 200;
128  static const TSeqPos kDefaultAllowedUnaligned = 10;
129 
130  void SetFlags(TFeatureGeneratorFlags);
131  TFeatureGeneratorFlags GetFlags() const;
132  void SetIntronStitchThresholdFlags(EIntronStitchThresholdFlags);
133  void SetMinIntron(TSeqPos);
134  void SetAllowedUnaligned(TSeqPos);
135 
136  /// Clean an alignment according to our best guess of its biological
137  /// representation. Cleaning involves adjusting segments to satisfy our
138  /// expectations of partial exonic alignments and account for unaligned
139  /// parts. E.g., stitching small gaps (less than min_intron), trimming to codon boundaries.
140  /// May shift product positions.
142  CleanAlignment(const objects::CSeq_align& align);
143 
144  /// Adjust alignment to the specified range
145  /// (cross-the-origin range on circular chromosome is indicated by range.from > range.to)
146  /// Will add necessary 'diags' at ends.
147  /// Throws an exception on attempt to shrink past an indel in CDS
148  /// Works on Spliced-seg alignments only.
149  /// Note: for a protein alignment do not expand it to include stop codon.
150 
153  eTryToPreserveProductPositions
154  };
156  AdjustAlignment(const objects::CSeq_align& align, TSeqRange range, EProductPositionsMode mode = eForceProductFrom0);
157 
158 
159  /// Convert an alignment to an annotation.
160  /// This will optionally promote all features through the alignment
161  /// and create product sequences
162  /// Returns mRNA feature
163  CRef<objects::CSeq_feat> ConvertAlignToAnnot(const objects::CSeq_align& align,
164  objects::CSeq_annot& annot,
165  objects::CBioseq_set& seqs,
166  Int8 gene_id = 0,
167  const objects::CSeq_feat* cdregion_on_mrna = NULL);
168 
169  void ConvertAlignToAnnot(const list< CRef<objects::CSeq_align> > &aligns,
170  objects::CSeq_annot &annot,
171  objects::CBioseq_set &seqs);
172 
173  /// Convert genomic location to an annotation. Populates seqs with mRna
174  /// and protein sequences, and populates annot with gene, mRna
175  /// and cdregion features
176  void ConvertLocToAnnot(
177  const objects::CSeq_loc &loc,
178  objects::CSeq_annot& annot,
179  objects::CBioseq_set& seqs,
180  objects::CCdregion::EFrame frame = objects::CCdregion::eFrame_one,
183 
184  /// Correctly mark exceptions on a feature
185  ///
186  void SetFeatureExceptions(objects::CSeq_feat& feat,
187  const objects::CSeq_align* align = NULL);
188 
189  /// Mark the correct partial states for a set of features
190  ///
191  void SetPartialFlags(CRef<objects::CSeq_feat> gene_feat,
192  CRef<objects::CSeq_feat> mrna_feat,
193  CRef<objects::CSeq_feat> cds_feat);
194 
195  /// Recompute the correct partial states for all features in this annotation
196  void RecomputePartialFlags(objects::CSeq_annot& annot);
197 
198 
199 
200 
201  /// Project RNA, preserving discontinuities in the CDS.
202  ///
203  /// Postcondition: Output is a mix of packed-ints, where each sub-loc in the mix
204  /// is an exon, and each subloc in the exon packed-int is an exon chunk. The chunks may
205  /// have gaps between them or overlap so as to preserve the translation frame of the CDS.
206  ///
207  /// The discontinuities (gaps and overlaps of chunks) that are outside of the CDS are collapsed.
208  ///
209  /// Singleton container locs (comprised of single element) are canonicalized:
210  /// unbroken exons are represented as a single interval
211  /// single-exon locs are represented as a single packed-int (or int, as per above)
212  static CRef<objects::CSeq_loc> s_ProjectRNA(const objects::CSeq_align& spliced_aln,
214  size_t unaligned_ends_partialness_thr = kDefaultAllowedUnaligned);
215  /// Similar to s_ProjectRNA(...)
216  /// Postcondition: seq-vector of the returned loc is of exact same length and has no indels
217  /// relative to the seq-vector of the product_cds_loc truncated to the alignment boundaries.
218  /// 1-2 bp overlaps are converted to gaps, preserving frame, if convert_overlaps = true
219  static CRef<objects::CSeq_loc> s_ProjectCDS(const objects::CSeq_align& spliced_aln,
220  const objects::CSeq_loc& product_cds_loc,
221  bool convert_overlaps = true);
222  // when specified, annot_name limits intron creation to features from the annotation of that name
223  // non-NULL range limits processing to a specific range
224  static void CreateMicroIntrons(
225  objects::CScope& scope,
226  objects::CBioseq_Handle bsh,
227  const string& annot_name = "",
228  TSeqRange* range = NULL,
229  bool ignore_errors = false);
230 
231 private:
232  struct SImplementation;
233  unique_ptr<SImplementation> m_impl;
234 
235  // adjust the selector to use a given annotation if not empty
236  static void x_SetAnnotName(objects::SAnnotSelector& sel, const string& annot_name);
237 };
238 
239 
241 {
242 public:
249 
250  fDefaults = fCreateGene | fCreateMrna | fCreateCdregion
251  };
253 
254  /// Create a gene model from an alignment
255  /// this will optionally promote all features through the alignment
257  static void CreateGeneModelFromAlign(const objects::CSeq_align& align,
258  objects::CScope& scope,
259  objects::CSeq_annot& annot,
260  objects::CBioseq_set& seqs,
261  TGeneModelCreateFlags flags = fDefaults,
262  TSeqPos allowed_unaligned = 10);
263 
265  static void CreateGeneModelsFromAligns(const list< CRef<objects::CSeq_align> > &aligns,
266  objects::CScope& scope,
267  objects::CSeq_annot& annot,
268  objects::CBioseq_set& seqs,
269  TGeneModelCreateFlags flags = fDefaults,
270  TSeqPos allowed_unaligned = 10);
271 
272  /// Correctly mark exceptions on a feature
273  ///
275  static void SetFeatureExceptions(objects::CSeq_feat& feat,
276  objects::CScope& scope,
277  const objects::CSeq_align* align = NULL);
278 
280  static void SetPartialFlags(objects::CScope& scope,
281  CRef<objects::CSeq_feat> gene_feat,
282  CRef<objects::CSeq_feat> mrna_feat,
283  CRef<objects::CSeq_feat> cds_feat);
284 
286  static void RecomputePartialFlags(objects::CScope& scope,
287  objects::CSeq_annot& annot);
288 };
289 
291 
292 #endif // ALGO_SEQUENCE___GENE_MODEL__HPP
User-defined methods of the data storage class.
virtual const char * GetErrCodeString(void) const override
Get error code interpreted as text.
Definition: gene_model.hpp:61
NCBI_EXCEPTION_DEFAULT(CAlgoFeatureGeneratorException, CException)
CBioseq_Handle –.
CRef< objects::CSeq_feat > ConvertAlignToAnnot(const objects::CSeq_align &align, objects::CSeq_annot &annot, objects::CBioseq_set &seqs, Int8 gene_id=0, const objects::CSeq_feat *cdregion_on_mrna=NULL)
Convert an alignment to an annotation.
unique_ptr< SImplementation > m_impl
Definition: gene_model.hpp:232
CFeatureGenerator(objects::CScope &scope)
EProductPositionsMode
Adjust alignment to the specified range (cross-the-origin range on circular chromosome is indicated b...
Definition: gene_model.hpp:151
void ConvertAlignToAnnot(const list< CRef< objects::CSeq_align > > &aligns, objects::CSeq_annot &annot, objects::CBioseq_set &seqs)
CFeatureGenerator(CRef< objects::CScope > scope)
int TGeneModelCreateFlags
Definition: gene_model.hpp:252
CScope –.
Definition: scope.hpp:92
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
Include a standard set of the NCBI C++ Toolkit most basic headers.
static uch flags
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define NULL
Definition: ncbistd.hpp:225
virtual const char * GetErrCodeString(void) const
Get error code interpreted as text.
Definition: ncbiexpt.cpp:444
@ eUnknown
Definition: app_popup.hpp:72
#define NCBI_DEPRECATED
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define NCBI_XALGOSEQ_EXPORT
Definition: ncbi_export.h:1017
range(_Ty, _Ty) -> range< _Ty >
mdb_mode_t mode
Definition: lmdb++.h:38
Defines NCBI C++ exception handling.
Portable reference counted smart and weak pointers using CWeakRef, CRef, CObject and CObjectEx.
SAnnotSelector –.
@ fProduct
Definition: user_agent.cpp:470
Modified on Fri Sep 20 14:57:13 2024 by modify_doxy.py rev. 669887