NCBI C++ ToolKit
internal_stops.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: internal_stops.cpp 99011 2023-02-02 19:17:29Z mozese2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Vyacheslav Chetvernin
27  *
28  * File Description:
29  *
30  */
31 #include <ncbi_pch.hpp>
32 
34 #include "feature_generator.hpp"
35 
42 #include <objmgr/scope.hpp>
43 #include <objmgr/seq_vector.hpp>
44 #include <objmgr/util/sequence.hpp>
45 
47 
// Constructor of CInternalStopFinder: the member-initializer list hands the
// same `a_scope` argument to both the `scope` and `generator` members.
// NOTE(review): this listing has lost the signature line and the two body
// statements (embedded listing numbers jump 47 -> 49 and 50 -> 53), so the
// body appears empty here; confirm against the repository copy before editing.
49  : scope(a_scope), generator(a_scope)
50 {
53 }
54 
56 
58 {
59  pair<TStarts, set<TSeqRange> > start_stop_ranges = FindStartStopRanges(align, padding);
60  set<TSeqPos> starts, stops;
61  ITERATE (TStarts, r, start_stop_ranges.first) {
62  starts.insert(r->first.GetFrom());
63  }
64  ITERATE (set<TSeqRange>, r, start_stop_ranges.second) {
65  stops.insert(r->GetFrom());
66  }
67  return make_pair(starts, stops);
68 }
// Finds start and stop codons in the coding sequence of a spliced alignment
// and reports each one as a range produced by mapping its product position
// through the cleaned alignment (CSeq_loc_Mapper built with row 1 - the row
// whose Bioseq length is used as genomic_length below).
//
// align    alignment to scan.
// padding  when > 0, the row-1 span of the alignment is widened by this many
//          positions on each side before the alignment is cleaned.
// gaps     compared with nullptr below; when set, it receives the mapped runs
//          of 'N' found in the scanned region plus 10-position marker ranges
//          for contig ends that the padded span reached.
//
// Returns pair(starts, stops): `starts` maps every start-codon range to the
// three letters of that codon, `stops` is the set of stop-codon ranges.
// Throws via NCBI_USER_THROW when the CDS length is not a multiple of 3 or
// when a run of Ns cannot be mapped.
//
// NOTE(review): this listing has dropped source lines (embedded listing
// numbers skip 70, 72, 133, 136 and 192), so the declarations of `gaps`,
// `bsh`, `code` and `tbl`, and the end of the CSeqVector constructor call,
// are not visible here.
69 pair<TStarts, set<TSeqRange> > CInternalStopFinder::FindStartStopRanges(const CSeq_align& align, int padding,
71 {
73  int genomic_length = bsh.GetBioseqLength();
74 
75  CConstRef<CSeq_align> clean_align;
 // first: padded start came within 2 of position 0 on a non-circular sequence;
 // second: padded stop came within 3 of the last position.
76  pair<bool, bool> trim_by_contig(false, false);
77  {{
78  CConstRef<CSeq_align> padded_align(&align);
79  if (padding > 0) {
80  CRef<CSeq_loc> loc = align.CreateRowSeq_loc(1);
81  int start = loc->GetStart(eExtreme_Positional);
82  int stop = loc->GetStop(eExtreme_Positional);
83 
84  bool is_circular = (bsh.GetInst_Topology() == CSeq_inst::eTopology_circular);
85 
86  if (is_circular) {
87  //prevent self overlap
 // Cap padding at half of what the expression below leaves after subtracting
 // the aligned span; the length term is only added when stop > start.
88  padding = min(padding, ((stop > start ? genomic_length : 0) - (stop - start +1))/2);
89  }
90 
91  start -= padding;
92  stop += padding;
93 
94  if (start <= 2 && !is_circular) {
95  trim_by_contig.first = true;
96  }
97  if (stop >= genomic_length-3 && !is_circular) {
98  trim_by_contig.second = true;
99  }
100 
 // Out-of-range coordinates wrap around on a circular sequence and are
 // clamped to [0, genomic_length-1] otherwise.
101  if (start < 0) {
102  start = is_circular ? start + genomic_length : 0;
103  }
104  if (stop >= genomic_length) {
105  stop = is_circular ? stop - genomic_length : genomic_length-1;
106  }
107  padded_align = generator.AdjustAlignment(align, TSeqRange(start, stop));
108  //cerr << MSerial_AsnText << *padded_align;
109  }
110 
111  clean_align = generator.CleanAlignment(*padded_align);
112  //cerr << MSerial_AsnText << *clean_align;
113  }}
114 
115  CSeq_loc_Mapper mapper(*clean_align, 1);
116 
 // query_loc starts as the row-0 interval of the first exon; only its
 // From/To are overwritten for each codon in the loop below.
117  CRef<CSeq_loc> query_loc(new CSeq_loc);
118  const CSpliced_seg& spl = clean_align->GetSegs().GetSpliced();
119  query_loc->SetInt(*spl.GetExons().front()->CreateRowSeq_interval(0, spl));
120 
121  const bool is_protein = (spl.GetProduct_type() == CSpliced_seg::eProduct_type_protein);
122 
123  string seq = GetCDSNucleotideSequence(*clean_align);
124  if (seq.size()%3 != 0) {
 // Dumps the original (not the cleaned) alignment to cerr before failing.
125  cerr << MSerial_AsnText << align << endl;
126  _ASSERT(seq.size()%3 == 0);
127  NCBI_USER_THROW("CDSNucleotideSequence not divisible by 3");
128  }
129 
130  int gcode = fg::GetGeneticCode(bsh);
131 
132  CRef<CGenetic_code::C_E> c_e(new CGenetic_code::C_E); c_e->SetId(gcode);
134  code->Set().push_back(c_e);
135 
 // State produced by the codon "NNN"; codons in this state are skipped.
137  const size_t kUnknownState = tbl.SetCodonState('N', 'N', 'N');
138 
139  TStarts starts;
140  set<TSeqRange> stops;
141 
142  size_t state = 0;
143  int k = 0;
144  string codon = "NNN";
145 
 // Feed one nucleotide at a time into the codon state machine; k counts the
 // nucleotides consumed, so a codon is complete whenever k is a multiple of 3.
146  ITERATE(string, s, seq) {
147  state = tbl.NextCodonState(state, *s);
148  codon[k%3] = *s;
149 
150  if (++k%3)
151  continue;
152 
153  if (state == kUnknownState)
154  continue;
155 
156  if (tbl.IsOrfStart(state) || tbl.IsOrfStop(state)) {
 // Protein products are addressed by codon index (one position per codon),
 // other products by the three nucleotide positions of the codon.
157  if (is_protein) {
158  query_loc->SetInt().SetFrom((k-3)/3);
159  query_loc->SetInt().SetTo((k-3)/3);
160  } else {
161  query_loc->SetInt().SetFrom(k-3);
162  query_loc->SetInt().SetTo(k-1);
163  }
 // NOTE(review): Map() is called twice with the same query_loc; the gap scan
 // further down maps once and reuses the result.
164  TSeqPos mapped_pos = mapper.Map(*query_loc)->GetStart(eExtreme_Biological);
165  if (mapped_pos == kInvalidSeqPos)
166  continue;
167  TSeqPos mapped_pos2 = mapper.Map(*query_loc)->GetStop(eExtreme_Biological);
168 
 // A codon can satisfy both tests, in which case it is recorded twice.
169  if (tbl.IsOrfStart(state)) {
170  starts[TSeqRange(mapped_pos, mapped_pos2)] = codon;
171  }
172  if (tbl.IsOrfStop(state)) {
173  stops.insert(TSeqRange(mapped_pos, mapped_pos2));
174  }
175  }
176  }
177 
178  if (gaps != nullptr) {
179 
 // When the padded span reached a contig end, collapse the region to a single
 // range and stretch it to position 0 / genomic_length-1.
180  CRef<CSeq_loc> region_loc = clean_align->CreateRowSeq_loc(1);
181  if (trim_by_contig.first && region_loc->GetStart(eExtreme_Positional) < 3) {
182  region_loc = region_loc->Merge(CSeq_loc::fMerge_SingleRange, nullptr);
183  region_loc->SetInt().SetFrom(0);
184  }
185  if (trim_by_contig.second && int(region_loc->GetStop(eExtreme_Positional)) > genomic_length -3) {
186  region_loc = region_loc->Merge(CSeq_loc::fMerge_SingleRange, nullptr);
187  region_loc->SetInt().SetTo(genomic_length-1);
188  }
189 // cerr << MSerial_AsnText << *region_loc;
190 
191  CSeqVector region_vec(*region_loc, scope,
193  string region_seq;
194  region_vec.GetSeqData(region_vec.begin(), region_vec.end(), region_seq);
195 
196  region_seq += 'X'; // to finish last run of Ns
197 
 // [gap_begin, gap_end) is the run of Ns being accumulated, in offsets into
 // region_seq; gap_end == k means the run ended on the previous character.
198  int gap_begin = -1;
199  int gap_end = -1;
200  int k = 0;
201 
 // These query_loc and mapper shadow the outer ones: they map offsets
 // 0..size-1 of region_seq onto region_loc.
202  CRef<CSeq_id> id(new CSeq_id);
203  id->Assign(*region_loc->GetId());
204  CRef<CSeq_loc> query_loc(new CSeq_loc(*id, 0, region_vec.size()-1));
205  CSeq_loc_Mapper mapper(*query_loc, *region_loc);
206 
207  for (auto s: region_seq) {
208  if (s == 'N') {
209  if (gap_end != k) {
210  gap_begin = k;
211  }
212  gap_end = k+1;
213  } else if (gap_end == k) {
214  query_loc->SetInt().SetFrom(gap_begin);
215  query_loc->SetInt().SetTo(gap_end-1);
216 
217  auto mapped_loc = mapper.Map(*query_loc);
218  TSeqPos mapped_pos = mapped_loc->GetStart(eExtreme_Biological);
219  TSeqPos mapped_pos2 = mapped_loc->GetStop(eExtreme_Biological);
220  if (mapped_pos == kInvalidSeqPos || mapped_pos2 == kInvalidSeqPos) {
221  NCBI_USER_THROW("Cannot map Ns run");
222  }
223  gaps->insert(TSignedSeqRange(mapped_pos, mapped_pos2));
224  }
225  ++k;
226  }
227 
228  auto strand = region_loc->GetStrand();
229 
 // Contig start: a gap touching position 0 is removed and replaced by a
 // 10-position range ending where that gap ended; with no such gap the range
 // ends at -1. On the minus strand the range is stored with from and to swapped.
230  if (trim_by_contig.first) {
231  int gap_stop = -1;
232  if (strand != eNa_strand_minus) {
233  if (!gaps->empty() && gaps->begin()->GetFrom()==0) {
234  gap_stop = gaps->begin()->GetTo();
235  gaps->erase(gaps->begin());
236  }
237  gaps->insert(TSignedSeqRange(gap_stop -9, gap_stop));
238  } else {
239  if (!gaps->empty() && gaps->begin()->GetTo()==0) {
240  gap_stop = gaps->begin()->GetFrom();
241  gaps->erase(gaps->begin());
242  }
243  gaps->insert(TSignedSeqRange(gap_stop, gap_stop -9));
244  }
245  }
 // Contig end: same treatment for a gap touching genomic_length-1; the
 // replacement starts where that gap started, or at genomic_length if none.
246  if (trim_by_contig.second) {
247  int gap_start = genomic_length;
248  if (strand != eNa_strand_minus) {
249  if (!gaps->empty() && gaps->rbegin()->GetTo()==genomic_length-1) {
250  gap_start = gaps->rbegin()->GetFrom();
251  gaps->erase(prev(gaps->end()));
252  }
253  gaps->insert(TSignedSeqRange(gap_start, gap_start +9));
254  } else {
255  if (!gaps->empty() && gaps->rbegin()->GetFrom()==genomic_length-1) {
256  gap_start = gaps->rbegin()->GetTo();
257  gaps->erase(prev(gaps->end()));
258  }
259  gaps->insert(TSignedSeqRange(gap_start +9, gap_start));
260  }
261  }
262  }
263 
264  return make_pair(starts, stops);
265 }
266 
268 {
269  return FindStartsStops(align).second;
270 }
271 
273 {
274  return !FindStops(align).empty();
275 }
276 
// Body of GetCDSNucleotideSequence(const CSeq_align& align): rebuilds the
// product-length nucleotide sequence of a spliced alignment from the row-1
// (subject) sequence of each exon and returns the part covered by the CDS.
//
// Returns the whole rebuilt sequence when the CDS range is "whole", the
// concatenated CDS ranges otherwise, or kEmptyStr when no CDS feature is found
// on the product.
// Throws CException(eUnknown) for non-spliced alignments and for a CDS on the
// minus strand of the product.
//
// NOTE(review): this listing has dropped source lines (embedded listing
// numbers skip 277, 290 and 318): the function signature, the `if` that owns
// the `} else {` at listing line 307, and the end of the CSeqVector
// constructor call are not visible here.
278 {
279  if (!align.GetSegs().IsSpliced()) {
280  NCBI_THROW(CException, eUnknown, "CInternalStopFinder supports Spliced-seg alignments only");
281  }
282 
283  string mRNA;
284 
285  const CSpliced_seg& spliced_seg = align.GetSegs().GetSpliced();
286 
 // Product position up to which mRNA has been filled so far.
287  int next_prod_start = 0;
288  CSeq_loc::TRanges cds_ranges;
289 
 // Branch taken under the condition on the missing line - presumably "product
 // is a transcript"; TODO confirm. It looks up the CDS annotated on the product.
291  const CSeq_id& product_id = spliced_seg.GetProduct_id();
292  CMappedFeat cds_on_rna = GetCdsOnMrna(product_id, scope);
293  if (!cds_on_rna) {
294  /// No CDS
295  return kEmptyStr;
296  }
297  if (cds_on_rna.GetLocation().GetStrand() == eNa_strand_minus) {
298  NCBI_THROW(CException, eUnknown, "minus strand cdregion on mrna is not supported");
299  }
300 
301  ITERATE (CSeq_loc, loc_ci, cds_on_rna.GetLocation()) {
302  cds_ranges.push_back(loc_ci.GetRange());
303  }
 // Unless the CDS is partial at its stop, the last range is shortened by three
 // positions (presumably to leave out the terminal stop codon - confirm).
 // NOTE(review): back() assumes the loop above added at least one range.
304  if (!cds_on_rna.GetLocation().IsPartialStop(eExtreme_Biological)) {
305  cds_ranges.back().SetTo(cds_ranges.back().GetTo()-3);
306  }
307  } else {
308  cds_ranges.push_back(TSeqRange::GetWhole());
309  }
310 
311  ITERATE( CSpliced_seg::TExons, exon, spliced_seg.GetExons() ) {
312 
313  int prod_pos_start = (*exon)->GetProduct_start().AsSeqPos();
314 
315  CRef<CSeq_loc> subject_loc(new CSeq_loc);
316  subject_loc->SetInt(*(*exon)->CreateRowSeq_interval(1, spliced_seg));
317  CSeqVector subject_vec(*subject_loc, scope,
319  string subject_seq;
320  subject_vec.GetSeqData(subject_vec.begin(), subject_vec.end(), subject_seq);
 // Offset into subject_seq of the chunk currently being processed.
321  int subj_pos = 0;
322 
 // Product positions not covered before this exon are filled with 'N'.
323  if (next_prod_start < prod_pos_start) {
324  mRNA.append(prod_pos_start - next_prod_start, 'N');
325  next_prod_start = prod_pos_start;
326  }
327 
328  if ((*exon)->IsSetParts()) {
329  ITERATE (CSpliced_exon::TParts, part_it, (*exon)->GetParts()) {
 // chunk = pair(genomic length, product length); after the increment,
 // prod_pos_start is the product position just past this chunk.
330  pair<int, int> chunk = ChunkSize(**part_it);
331  prod_pos_start += chunk.second;
332  if (chunk.first == 0) {
 // No subject bases for this chunk: pad the product span with 'N'.
333  if (next_prod_start < prod_pos_start) {
334  mRNA.append(prod_pos_start - next_prod_start, 'N');
335  next_prod_start = prod_pos_start;
336  }
337  } else if (chunk.second > 0) {
 // Copy only the trailing (prod_pos_start - next_prod_start) subject bases of
 // the chunk, i.e. the part of it that has not been emitted yet.
338  if (next_prod_start < prod_pos_start) {
339  mRNA.append(subject_seq, subj_pos+chunk.second-(prod_pos_start - next_prod_start), prod_pos_start - next_prod_start);
340  next_prod_start = prod_pos_start;
341  }
342  }
 // A chunk with product length 0 appends nothing; it only advances subj_pos.
343  subj_pos += chunk.first;
344  }
345  } else {
 // No part breakdown: the exon's subject sequence is appended as-is.
346  mRNA.append(subject_seq);
347  next_prod_start += subject_seq.size();
348  }
349  }
350 
 // NOTE(review): front() assumes cds_ranges is not empty.
351  if (cds_ranges.front().IsWhole()) {
352  return mRNA;
353  } else {
354  string cds_seq;
355  for (const TSeqRange &range : cds_ranges) {
 // Stop at the first CDS range that begins past the end of the rebuilt mRNA.
356  if (range.GetFrom() >= mRNA.size()) {
357  break;
358  }
359  cds_seq += mRNA.substr(range.GetFrom(), range.GetLength());
360  }
361  return cds_seq;
362  }
363 }
364 
365 
366 // pair(genomic, product)
367 pair<int, int> ChunkSize(const CSpliced_exon_chunk& chunk)
368 {
369  int len = 0;
370  switch (chunk.Which()) {
372  len = chunk.GetGenomic_ins();
373  return make_pair(len, 0);
375  len = chunk.GetProduct_ins();
376  return make_pair(0, len);
378  len = chunk.GetMatch();
379  break;
381  len = chunk.GetMismatch();
382  break;
384  len = chunk.GetDiag();
385  break;
386  default:
387  NCBI_THROW(CException, eUnknown, "Spliced_exon_chunk type not set");
388  }
389  return make_pair(len, len);
390 }
391 
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CBioseq_Handle –.
CConstRef< objects::CSeq_align > AdjustAlignment(const objects::CSeq_align &align, TSeqRange range, EProductPositionsMode mode=eForceProductFrom0)
void SetFlags(TFeatureGeneratorFlags)
Definition: gene_model.cpp:195
void SetAllowedUnaligned(TSeqPos)
Definition: gene_model.cpp:215
CConstRef< objects::CSeq_align > CleanAlignment(const objects::CSeq_align &align)
Clean an alignment according to our best guess of its biological representation.
Definition: gene_model.cpp:221
static const CTrans_table & GetTransTable(int id)
string GetCDSNucleotideSequence(const CSeq_align &align)
set< TSeqPos > FindStops(const CSeq_align &align)
pair< map< TSeqRange, string >, set< TSeqRange > > FindStartStopRanges(const CSeq_align &align, int padding=0, set< TSignedSeqRange > *gaps=nullptr)
bool HasInternalStops(const CSeq_align &align)
pair< set< TSeqPos >, set< TSeqPos > > FindStartsStops(const CSeq_align &align, int padding=0)
CFeatureGenerator generator
CInternalStopFinder(CScope &scope)
CMappedFeat –.
Definition: mapped_feat.hpp:59
CScope –.
Definition: scope.hpp:92
CSeqVector –.
Definition: seq_vector.hpp:65
CRef< CSeq_loc > CreateRowSeq_loc(TDim row) const
Definition: Seq_align.cpp:2028
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
CSeq_loc_Mapper –.
CSpliced_exon_chunk –.
bool IsOrfStart(int state) const
static int SetCodonState(unsigned char ch1, unsigned char ch2, unsigned char ch3)
static int NextCodonState(int state, unsigned char ch)
bool IsOrfStop(int state) const
Definition: map.hpp:338
Definition: set.hpp:45
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator begin() const
Definition: set.hpp:135
bool empty() const
Definition: set.hpp:133
void erase(iterator pos)
Definition: set.hpp:151
const_iterator end() const
Definition: set.hpp:136
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
CMappedFeat GetCdsOnMrna(const objects::CSeq_id &rna_id, CScope &scope)
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
const TSeqPos kInvalidSeqPos
Define special value for invalid sequence position.
Definition: ncbimisc.hpp:878
#define NCBI_USER_THROW(message)
Throw a quick-and-dirty runtime exception of type 'CException' with the given error message and error...
Definition: ncbiexpt.hpp:715
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
@ eUnknown
Definition: app_popup.hpp:72
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
ENa_strand GetStrand(void) const
Get the location's strand.
Definition: Seq_loc.cpp:882
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
Definition: Seq_loc.cpp:915
CRef< CSeq_loc > Merge(TOpFlags flags, ISynonymMapper *syn_mapper) const
All functions create and return a new seq-loc object.
Definition: Seq_loc.cpp:5037
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
CPacked_seqint::TRanges TRanges
Definition: Seq_loc.hpp:103
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
bool IsPartialStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:3251
TSeqPos GetStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:963
@ fMerge_SingleRange
Definition: Seq_loc.hpp:332
CRef< CSeq_loc > Map(const CSeq_loc &src_loc)
Map seq-loc.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
TSeqPos GetBioseqLength(void) const
TInst_Topology GetInst_Topology(void) const
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
const CSeq_loc & GetLocation(void) const
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
TSeqPos size(void) const
Definition: seq_vector.hpp:291
const_iterator begin(void) const
Definition: seq_vector.hpp:298
const_iterator end(void) const
Definition: seq_vector.hpp:305
CRange< TSeqPos > TSeqRange
typedefs for sequence ranges
Definition: range.hpp:419
CRange< TSignedSeqPos > TSignedSeqRange
Definition: range.hpp:420
static TThisType GetWhole(void)
Definition: range.hpp:272
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define kEmptyStr
Definition: ncbistr.hpp:123
TMatch GetMatch(void) const
Get the variant data.
const TProduct_id & GetProduct_id(void) const
Get the Product_id member data.
TDiag GetDiag(void) const
Get the variant data.
TProduct_type GetProduct_type(void) const
Get the Product_type member data.
TMismatch GetMismatch(void) const
Get the variant data.
const TSpliced & GetSpliced(void) const
Get the variant data.
Definition: Seq_align_.cpp:219
TGenomic_ins GetGenomic_ins(void) const
Get the variant data.
list< CRef< CSpliced_exon > > TExons
const TExons & GetExons(void) const
Get the Exons member data.
list< CRef< CSpliced_exon_chunk > > TParts
bool IsSpliced(void) const
Check if variant Spliced is selected.
Definition: Seq_align_.hpp:778
TProduct_ins GetProduct_ins(void) const
Get the variant data.
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
E_Choice Which(void) const
Which variant is currently selected.
@ e_Product_ins
insertion in product sequence (i.e. gap in the genomic sequence)
@ e_Diag
both sequences are represented, there is sufficient similarity between product and genomic sequences....
@ e_Genomic_ins
insertion in genomic sequence (i.e. gap in the product sequence)
@ e_Match
both sequences represented, product and genomic sequences match
@ e_Mismatch
both sequences represented, product and genomic sequences do not match
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
pair< int, int > ChunkSize(const CSpliced_exon_chunk &chunk)
map< TSeqRange, string > TStarts
n padding
int len
range(_Ty, _Ty) -> range< _Ty >
int GetGeneticCode(const CBioseq_Handle &bsh)
T min(T x_, T y_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
Definition: inftrees.h:24
#define _ASSERT
Modified on Tue Nov 28 02:23:18 2023 by modify_doxy.py rev. 669887