NCBI C++ ToolKit
internal_stops.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: internal_stops.cpp 101345 2023-12-04 15:51:08Z dicuccio $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Vyacheslav Chetvernin
27  *
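28  * File Description:
29  *   Implementation of CInternalStopFinder: finds start and stop codons (and, optionally, runs of N) along a Spliced-seg alignment and maps them through the alignment.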
30  */
31 #include <ncbi_pch.hpp>
32 
34 #include "feature_generator.hpp"
35 
42 #include <objmgr/scope.hpp>
43 #include <objmgr/seq_vector.hpp>
44 #include <objmgr/util/sequence.hpp>
45 
47 
// Constructor of CInternalStopFinder: the member-initializer list hands the same
// a_scope to both the `scope` member and the `generator` member.
// NOTE(review): the signature line (listing 48) and two body statements (listing 51-52)
// are absent from this extract. The declaration index shows `CInternalStopFinder(CScope &scope)`,
// and lists generator methods SetFlags / SetAllowedUnaligned, which are presumably the
// missing body calls - confirm against the repository before relying on this.
49  : scope(a_scope), generator(a_scope)
50 {
53 }
54 
56 
58 {
59  pair<TStarts, set<TSeqRange> > start_stop_ranges = FindStartStopRanges(align, padding);
60  set<TSeqPos> starts, stops;
61  ITERATE (TStarts, r, start_stop_ranges.first) {
62  starts.insert(r->first.GetFrom());
63  }
64  ITERATE (set<TSeqRange>, r, start_stop_ranges.second) {
65  stops.insert(r->GetFrom());
66  }
67  return make_pair(starts, stops);
68 }
// Locate start and stop codons along the coding sequence of `align` and map each codon
// through a CSeq_loc_Mapper built on row 1 of the cleaned alignment.
//
// Returns pair(starts, stops): `starts` maps every mapped start-codon range to the three
// letters of that codon; `stops` is the set of mapped stop-codon ranges.
//
// padding > 0 widens the row-1 span of the alignment by that many bases on each side
// (via generator.AdjustAlignment) before the alignment is cleaned.
// When `gaps` is non-null it additionally receives the mapped range of every run of 'N'
// in the row-1 region, plus synthetic 10-base ranges at a contig end the padded span reached.
//
// Raises via NCBI_USER_THROW when the CDS length is not a multiple of 3 or when a run of
// Ns cannot be mapped.
//
// NOTE(review): this extract is missing listing lines 70, 72, 133, 136 and 193. They
// presumably hold the rest of the parameter list (the declaration index shows a third
// parameter `set<TSignedSeqRange>* gaps = nullptr`), the declarations of `bsh`, `code`
// and `tbl`, and the tail of the CSeqVector constructor call - confirm against the repository.
69 pair<TStarts, set<TSeqRange> > CInternalStopFinder::FindStartStopRanges(const CSeq_align& align, int padding,
71 {
73  int genomic_length = bsh.GetBioseqLength();
74 
75  CConstRef<CSeq_align> clean_align;
// trim_by_contig.first / .second: the padded span came within 2 bases of position 0 /
// within 3 bases of the last position on a non-circular sequence.
76  pair<bool, bool> trim_by_contig(false, false);
77  {{
78  CConstRef<CSeq_align> padded_align(&align);
79  if (padding > 0) {
80  CRef<CSeq_loc> loc = align.CreateRowSeq_loc(1);
81  int start = loc->GetStart(eExtreme_Positional);
82  int stop = loc->GetStop(eExtreme_Positional);
83 
84  bool is_circular = (bsh.GetInst_Topology() == CSeq_inst::eTopology_circular);
85 
86  if (is_circular) {
87  //prevent self overlap
// Caps padding at half of the bases not covered by [start, stop]; when stop < start the
// expression reduces to (start - stop - 1)/2.
88  padding = min(padding, ((stop > start ? genomic_length : 0) - (stop - start +1))/2);
89  }
90 
91  start -= padding;
92  stop += padding;
93 
94  if (start <= 2 && !is_circular) {
95  trim_by_contig.first = true;
96  }
97  if (stop >= genomic_length-3 && !is_circular) {
98  trim_by_contig.second = true;
99  }
100 
// Out-of-range ends wrap around on circular sequences and are clamped otherwise.
101  if (start < 0) {
102  start = is_circular ? start + genomic_length : 0;
103  }
104  if (stop >= genomic_length) {
105  stop = is_circular ? stop - genomic_length : genomic_length-1;
106  }
107  padded_align = generator.AdjustAlignment(align, TSeqRange(start, stop));
108  //cerr << MSerial_AsnText << *padded_align;
109  }
110 
111  clean_align = generator.CleanAlignment(*padded_align);
112  //cerr << MSerial_AsnText << *clean_align;
113  }}
114 
115  CSeq_loc_Mapper mapper(*clean_align, 1);
116 
// query_loc starts as the row-0 interval of the first exon; only its From/To are
// overwritten for each codon below.
117  CRef<CSeq_loc> query_loc(new CSeq_loc);
118  const CSpliced_seg& spl = clean_align->GetSegs().GetSpliced();
119  query_loc->SetInt(*spl.GetExons().front()->CreateRowSeq_interval(0, spl));
120 
121  const bool is_protein = (spl.GetProduct_type() == CSpliced_seg::eProduct_type_protein);
122 
123  string seq = GetCDSNucleotideSequence(*clean_align);
124  if (seq.size()%3 != 0) {
// Dump the input alignment to stderr before failing.
125  cerr << MSerial_AsnText << align << endl;
126  _ASSERT(seq.size()%3 == 0);
127  NCBI_USER_THROW("CDSNucleotideSequence not divisible by 3");
128  }
129 
130  int gcode = fg::GetGeneticCode(bsh);
131 
132  CRef<CGenetic_code::C_E> c_e(new CGenetic_code::C_E); c_e->SetId(gcode);
// NOTE(review): the declaration of `code` (listing 133) is missing from this extract.
134  code->Set().push_back(c_e);
135 
// NOTE(review): the declaration of `tbl` (listing 136) is missing from this extract.
137  const size_t kUnknownState = tbl.SetCodonState('N', 'N', 'N');
138 
139  TStarts starts;
140  set<TSeqRange> stops;
141 
142  size_t state = 0;
// k counts the nucleotides consumed so far; `codon` holds the letters of the current triplet.
143  int k = 0;
144  string codon = "NNN";
145 
146  ITERATE(string, s, seq) {
147  state = tbl.NextCodonState(state, *s);
148  codon[k%3] = *s;
149 
// Only act once a full triplet has been read.
150  if (++k%3)
151  continue;
152 
// Skip triplets whose state equals that of "NNN".
153  if (state == kUnknownState)
154  continue;
155 
156  if (tbl.IsOrfStart(state) || tbl.IsOrfStop(state)) {
// Protein products are addressed by codon index (one position per triplet);
// otherwise by the three nucleotide positions k-3 .. k-1.
157  if (is_protein) {
158  query_loc->SetInt().SetFrom((k-3)/3);
159  query_loc->SetInt().SetTo((k-3)/3);
160  } else {
161  query_loc->SetInt().SetFrom(k-3);
162  query_loc->SetInt().SetTo(k-1);
163  }
164  CConstRef<CSeq_loc> mapped_loc = mapper.Map(*query_loc);
165  TSeqPos mapped_pos = mapped_loc->GetStart(eExtreme_Biological);
// Codons whose start cannot be mapped are silently dropped.
166  if (mapped_pos == kInvalidSeqPos)
167  continue;
168  TSeqPos mapped_pos2 = mapped_loc->GetStop(eExtreme_Biological);
169 
// A codon may be both a start and a stop according to the table; it is then recorded in both.
170  if (tbl.IsOrfStart(state)) {
171  starts[TSeqRange(mapped_pos, mapped_pos2)] = codon;
172  }
173  if (tbl.IsOrfStop(state)) {
174  stops.insert(TSeqRange(mapped_pos, mapped_pos2));
175  }
176  }
177  }
178 
// Optional second pass: report runs of 'N' in the row-1 region of the cleaned alignment.
179  if (gaps != nullptr) {
180 
181  CRef<CSeq_loc> region_loc = clean_align->CreateRowSeq_loc(1);
// When the padded span reached a contig end, collapse the region to a single range and
// stretch it to that end.
182  if (trim_by_contig.first && region_loc->GetStart(eExtreme_Positional) < 3) {
183  region_loc = region_loc->Merge(CSeq_loc::fMerge_SingleRange, nullptr);
184  region_loc->SetInt().SetFrom(0);
185  }
186  if (trim_by_contig.second && int(region_loc->GetStop(eExtreme_Positional)) > genomic_length -3) {
187  region_loc = region_loc->Merge(CSeq_loc::fMerge_SingleRange, nullptr);
188  region_loc->SetInt().SetTo(genomic_length-1);
189  }
190 // cerr << MSerial_AsnText << *region_loc;
191 
192  CSeqVector region_vec(*region_loc, scope,
// NOTE(review): the remaining constructor argument(s) (listing 193) are missing from this extract.
194  string region_seq;
195  region_vec.GetSeqData(region_vec.begin(), region_vec.end(), region_seq);
196 
197  region_seq += 'X'; // to finish last run of Ns
198 
// The current run of Ns is [gap_begin, gap_end) in region_seq offsets; gap_end == k means
// the run ended on the previous character.
199  int gap_begin = -1;
200  int gap_end = -1;
201  int k = 0;
202 
// This inner mapper translates offsets within region_seq (0 .. size-1) onto region_loc;
// it shadows the alignment mapper declared earlier.
203  CRef<CSeq_id> id(new CSeq_id);
204  id->Assign(*region_loc->GetId());
205  CRef<CSeq_loc> query_loc(new CSeq_loc(*id, 0, region_vec.size()-1));
206  CSeq_loc_Mapper mapper(*query_loc, *region_loc);
207 
208  for (auto s: region_seq) {
209  if (s == 'N') {
210  if (gap_end != k) {
211  gap_begin = k;
212  }
213  gap_end = k+1;
214  } else if (gap_end == k) {
// First non-N after a run: map the finished run and record it.
215  query_loc->SetInt().SetFrom(gap_begin);
216  query_loc->SetInt().SetTo(gap_end-1);
217 
218  auto mapped_loc = mapper.Map(*query_loc);
219  TSeqPos mapped_pos = mapped_loc->GetStart(eExtreme_Biological);
220  TSeqPos mapped_pos2 = mapped_loc->GetStop(eExtreme_Biological);
221  if (mapped_pos == kInvalidSeqPos || mapped_pos2 == kInvalidSeqPos) {
222  NCBI_USER_THROW("Cannot map Ns run");
223  }
224  gaps->insert(TSignedSeqRange(mapped_pos, mapped_pos2));
225  }
226  ++k;
227  }
228 
229  auto strand = region_loc->GetStrand();
230 
// Contig start reached: insert a 10-position range ending at -1, or - if a recorded gap
// touches position 0 - replace that gap with a 10-position range ending at the gap's far end.
// On the minus strand the range is constructed with its endpoints in the opposite order.
231  if (trim_by_contig.first) {
232  int gap_stop = -1;
233  if (strand != eNa_strand_minus) {
234  if (!gaps->empty() && gaps->begin()->GetFrom()==0) {
235  gap_stop = gaps->begin()->GetTo();
236  gaps->erase(gaps->begin());
237  }
238  gaps->insert(TSignedSeqRange(gap_stop -9, gap_stop));
239  } else {
240  if (!gaps->empty() && gaps->begin()->GetTo()==0) {
241  gap_stop = gaps->begin()->GetFrom();
242  gaps->erase(gaps->begin());
243  }
244  gaps->insert(TSignedSeqRange(gap_stop, gap_stop -9));
245  }
246  }
// Contig end reached: the mirror image of the block above, starting at genomic_length or at
// the near end of a recorded gap that touches the last position.
247  if (trim_by_contig.second) {
248  int gap_start = genomic_length;
249  if (strand != eNa_strand_minus) {
250  if (!gaps->empty() && gaps->rbegin()->GetTo()==genomic_length-1) {
251  gap_start = gaps->rbegin()->GetFrom();
252  gaps->erase(prev(gaps->end()));
253  }
254  gaps->insert(TSignedSeqRange(gap_start, gap_start +9));
255  } else {
256  if (!gaps->empty() && gaps->rbegin()->GetFrom()==genomic_length-1) {
257  gap_start = gaps->rbegin()->GetTo();
258  gaps->erase(prev(gaps->end()));
259  }
260  gaps->insert(TSignedSeqRange(gap_start +9, gap_start));
261  }
262  }
263  }
264 
265  return make_pair(starts, stops);
266 }
267 
269 {
270  return FindStartsStops(align).second;
271 }
272 
274 {
275  return !FindStops(align).empty();
276 }
277 
// Assemble a product-coordinate nucleotide string from the row-1 (subject) sequence of each
// exon of a Spliced-seg alignment, padding with 'N' where the product has no aligned subject
// bases, and return the portion covered by the CDS ranges.
//
// Returns kEmptyStr when GetCdsOnMrna() yields no CDS feature for the product.
// Raises CException (eUnknown) for a non-Spliced-seg alignment and for a minus-strand CDS
// location on the mRNA.
//
// NOTE(review): this extract is missing listing lines 278, 291 and 319. The declaration index
// shows the signature as `string GetCDSNucleotideSequence(const CSeq_align &align)`; line 291
// is the `if` whose `else` at listing 308 selects the whole range (presumably a test on the
// product type); line 319 is the tail of the CSeqVector constructor call - confirm against
// the repository.
279 {
280  if (!align.GetSegs().IsSpliced()) {
281  NCBI_THROW(CException, eUnknown, "CInternalStopFinder supports Spliced-seg alignments only");
282  }
283 
284  string mRNA;
285 
286  const CSpliced_seg& spliced_seg = align.GetSegs().GetSpliced();
287 
// next_prod_start: product position up to which mRNA has been filled so far.
288  int next_prod_start = 0;
289  CSeq_loc::TRanges cds_ranges;
290 
// NOTE(review): the opening `if (...) {` of this branch (listing 291) is missing from this extract.
292  const CSeq_id& product_id = spliced_seg.GetProduct_id();
293  CMappedFeat cds_on_rna = GetCdsOnMrna(product_id, scope);
294  if (!cds_on_rna) {
295  /// No CDS
296  return kEmptyStr;
297  }
298  if (cds_on_rna.GetLocation().GetStrand() == eNa_strand_minus) {
299  NCBI_THROW(CException, eUnknown, "minus strand cdregion on mrna is not supported");
300  }
301 
302  ITERATE (CSeq_loc, loc_ci, cds_on_rna.GetLocation()) {
303  cds_ranges.push_back(loc_ci.GetRange());
304  }
// Unless the CDS is partial at its biological stop, drop its last three bases
// (presumably the terminal stop codon - TODO confirm).
305  if (!cds_on_rna.GetLocation().IsPartialStop(eExtreme_Biological)) {
306  cds_ranges.back().SetTo(cds_ranges.back().GetTo()-3);
307  }
308  } else {
// A single "whole" range marks that the entire assembled sequence is returned.
309  cds_ranges.push_back(TSeqRange::GetWhole());
310  }
311 
312  ITERATE( CSpliced_seg::TExons, exon, spliced_seg.GetExons() ) {
313 
314  int prod_pos_start = (*exon)->GetProduct_start().AsSeqPos();
315 
316  CRef<CSeq_loc> subject_loc(new CSeq_loc);
317  subject_loc->SetInt(*(*exon)->CreateRowSeq_interval(1, spliced_seg));
318  CSeqVector subject_vec(*subject_loc, scope,
// NOTE(review): the remaining constructor argument(s) (listing 319) are missing from this extract.
320  string subject_seq;
321  subject_vec.GetSeqData(subject_vec.begin(), subject_vec.end(), subject_seq);
// subj_pos: offset into subject_seq of the chunk currently being processed.
322  int subj_pos = 0;
323 
// Product positions skipped before this exon are filled with 'N'.
324  if (next_prod_start < prod_pos_start) {
325  mRNA.append(prod_pos_start - next_prod_start, 'N');
326  next_prod_start = prod_pos_start;
327  }
328 
329  if ((*exon)->IsSetParts()) {
330  ITERATE (CSpliced_exon::TParts, part_it, (*exon)->GetParts()) {
// chunk = pair(genomic length, product length); prod_pos_start is advanced to the product
// position just past this chunk before the comparisons below.
331  pair<int, int> chunk = ChunkSize(**part_it);
332  prod_pos_start += chunk.second;
333  if (chunk.first == 0) {
// No subject bases for this chunk: pad the not-yet-filled product positions with 'N'.
334  if (next_prod_start < prod_pos_start) {
335  mRNA.append(prod_pos_start - next_prod_start, 'N');
336  next_prod_start = prod_pos_start;
337  }
338  } else if (chunk.second > 0) {
// Chunk present on both sequences: copy only the trailing part of it that lies beyond
// next_prod_start, so product positions already filled are not duplicated.
339  if (next_prod_start < prod_pos_start) {
340  mRNA.append(subject_seq, subj_pos+chunk.second-(prod_pos_start - next_prod_start), prod_pos_start - next_prod_start);
341  next_prod_start = prod_pos_start;
342  }
343  }
// Chunks with product length 0 contribute nothing to mRNA but still advance subj_pos.
344  subj_pos += chunk.first;
345  }
346  } else {
// Exon without parts: the whole subject sequence is appended as-is.
347  mRNA.append(subject_seq);
348  next_prod_start += subject_seq.size();
349  }
350  }
351 
352  if (cds_ranges.front().IsWhole()) {
353  return mRNA;
354  } else {
355  string cds_seq;
356  for (const TSeqRange &range : cds_ranges) {
// Stop at the first CDS range that begins past the end of the assembled sequence.
357  if (range.GetFrom() >= mRNA.size()) {
358  break;
359  }
360  cds_seq += mRNA.substr(range.GetFrom(), range.GetLength());
361  }
362  return cds_seq;
363  }
364 }
365 
366 
367 // pair(genomic, product)
368 pair<int, int> ChunkSize(const CSpliced_exon_chunk& chunk)
369 {
370  int len = 0;
371  switch (chunk.Which()) {
373  len = chunk.GetGenomic_ins();
374  return make_pair(len, 0);
376  len = chunk.GetProduct_ins();
377  return make_pair(0, len);
379  len = chunk.GetMatch();
380  break;
382  len = chunk.GetMismatch();
383  break;
385  len = chunk.GetDiag();
386  break;
387  default:
388  NCBI_THROW(CException, eUnknown, "Spliced_exon_chunk type not set");
389  }
390  return make_pair(len, len);
391 }
392 
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CBioseq_Handle –.
CConstRef< objects::CSeq_align > AdjustAlignment(const objects::CSeq_align &align, TSeqRange range, EProductPositionsMode mode=eForceProductFrom0)
void SetFlags(TFeatureGeneratorFlags)
Definition: gene_model.cpp:195
void SetAllowedUnaligned(TSeqPos)
Definition: gene_model.cpp:215
CConstRef< objects::CSeq_align > CleanAlignment(const objects::CSeq_align &align)
Clean an alignment according to our best guess of its biological representation.
Definition: gene_model.cpp:221
static const CTrans_table & GetTransTable(int id)
string GetCDSNucleotideSequence(const CSeq_align &align)
set< TSeqPos > FindStops(const CSeq_align &align)
pair< map< TSeqRange, string >, set< TSeqRange > > FindStartStopRanges(const CSeq_align &align, int padding=0, set< TSignedSeqRange > *gaps=nullptr)
bool HasInternalStops(const CSeq_align &align)
pair< set< TSeqPos >, set< TSeqPos > > FindStartsStops(const CSeq_align &align, int padding=0)
CFeatureGenerator generator
CInternalStopFinder(CScope &scope)
CMappedFeat –.
Definition: mapped_feat.hpp:59
CScope –.
Definition: scope.hpp:92
CSeqVector –.
Definition: seq_vector.hpp:65
CRef< CSeq_loc > CreateRowSeq_loc(TDim row) const
Definition: Seq_align.cpp:2028
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
CSeq_loc_Mapper –.
CSpliced_exon_chunk –.
bool IsOrfStart(int state) const
static int SetCodonState(unsigned char ch1, unsigned char ch2, unsigned char ch3)
static int NextCodonState(int state, unsigned char ch)
bool IsOrfStop(int state) const
Definition: map.hpp:338
Definition: set.hpp:45
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator begin() const
Definition: set.hpp:135
bool empty() const
Definition: set.hpp:133
void erase(iterator pos)
Definition: set.hpp:151
const_iterator end() const
Definition: set.hpp:136
CMappedFeat GetCdsOnMrna(const objects::CSeq_id &rna_id, CScope &scope)
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
const TSeqPos kInvalidSeqPos
Define special value for invalid sequence position.
Definition: ncbimisc.hpp:878
#define NCBI_USER_THROW(message)
Throw a quick-and-dirty runtime exception of type 'CException' with the given error message and error...
Definition: ncbiexpt.hpp:715
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
@ eUnknown
Definition: app_popup.hpp:72
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
ENa_strand GetStrand(void) const
Get the location's strand.
Definition: Seq_loc.cpp:882
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
Definition: Seq_loc.cpp:915
CRef< CSeq_loc > Merge(TOpFlags flags, ISynonymMapper *syn_mapper) const
All functions create and return a new seq-loc object.
Definition: Seq_loc.cpp:5037
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
CPacked_seqint::TRanges TRanges
Definition: Seq_loc.hpp:103
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
bool IsPartialStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:3251
TSeqPos GetStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:963
@ fMerge_SingleRange
Definition: Seq_loc.hpp:332
CRef< CSeq_loc > Map(const CSeq_loc &src_loc)
Map seq-loc.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
TSeqPos GetBioseqLength(void) const
TInst_Topology GetInst_Topology(void) const
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
const CSeq_loc & GetLocation(void) const
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
TSeqPos size(void) const
Definition: seq_vector.hpp:291
const_iterator begin(void) const
Definition: seq_vector.hpp:298
const_iterator end(void) const
Definition: seq_vector.hpp:305
CRange< TSeqPos > TSeqRange
typedefs for sequence ranges
Definition: range.hpp:419
CRange< TSignedSeqPos > TSignedSeqRange
Definition: range.hpp:420
static TThisType GetWhole(void)
Definition: range.hpp:272
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define kEmptyStr
Definition: ncbistr.hpp:123
TMatch GetMatch(void) const
Get the variant data.
const TProduct_id & GetProduct_id(void) const
Get the Product_id member data.
TDiag GetDiag(void) const
Get the variant data.
TProduct_type GetProduct_type(void) const
Get the Product_type member data.
TMismatch GetMismatch(void) const
Get the variant data.
const TSpliced & GetSpliced(void) const
Get the variant data.
Definition: Seq_align_.cpp:219
TGenomic_ins GetGenomic_ins(void) const
Get the variant data.
list< CRef< CSpliced_exon > > TExons
const TExons & GetExons(void) const
Get the Exons member data.
list< CRef< CSpliced_exon_chunk > > TParts
bool IsSpliced(void) const
Check if variant Spliced is selected.
Definition: Seq_align_.hpp:778
TProduct_ins GetProduct_ins(void) const
Get the variant data.
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
E_Choice Which(void) const
Which variant is currently selected.
@ e_Product_ins
insertion in product sequence (i.e. gap in the genomic sequence)
@ e_Diag
both sequences are represented, there is sufficient similarity between product and genomic sequences....
@ e_Genomic_ins
insertion in genomic sequence (i.e. gap in the product sequence)
@ e_Match
both sequences represented, product and genomic sequences match
@ e_Mismatch
both sequences represented, product and genomic sequences do not match
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
pair< int, int > ChunkSize(const CSpliced_exon_chunk &chunk)
map< TSeqRange, string > TStarts
n padding
int len
range(_Ty, _Ty) -> range< _Ty >
int GetGeneticCode(const CBioseq_Handle &bsh)
T min(T x_, T y_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
Definition: inftrees.h:24
#define _ASSERT
Modified on Wed Jul 24 17:18:14 2024 by modify_doxy.py rev. 669887