NCBI C++ ToolKit
gnomon_objmgr.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: gnomon_objmgr.cpp 100465 2023-08-03 17:24:45Z souvorov $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Mike DiCuccio
27  *
28  * File Description: gnomon library parts requiring object manager support
29  * to allow apps not using these to be linked w/o object manager libs
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <algo/gnomon/gnomon.hpp>
35 #include <algo/gnomon/chainer.hpp>
38 #include "hmm.hpp"
39 #include "hmm_inlines.hpp"
40 
48 #include <objmgr/util/sequence.hpp>
52 
53 #include <stdio.h>
54 
55 #include <functional>
56 
58 BEGIN_SCOPE(gnomon)
60 
61 void debug()
62 {
63 }
64 
65 Int8 GetModelId(const CSeq_align& seq_align)
66 {
67  Int8 id = 0;
68  if(seq_align.CanGetId())
69  seq_align.GetId().back()->GetIdType(id); // 0 if no integer id
70 
71  return id;
72  // return CIdHandler::GetId(*seq_align.GetId().back());
73 }
74 
75 CAlignModel::CAlignModel(const CSeq_align& seq_align) :
76  CGeneModel(seq_align.GetSegs().GetSpliced().GetGenomic_strand()==eNa_strand_minus?eMinus:ePlus, GetModelId(seq_align), emRNA)
77 {
78 #ifdef _DEBUG
79  debug();
80 #endif
81 
82  const CSpliced_seg& sps = seq_align.GetSegs().GetSpliced();
83 
84  bool is_protein = false;
86  SetType(eProt);
87  is_protein = true;
88  }
89 
90  bool is_product_reversed = false;
92  Status() |= CGeneModel::eReversed;
93  is_product_reversed = true;
94  }
95 
96  /* for not gpipe alignments
97  CScope scope(*CObjectManager::GetInstance());
98  scope.AddDefaults();
99  CIdHandler idh(scope);
100  SetTargetId(*idh.ToCanonical(sps.GetProduct_id()));
101  */
102  SetTargetId(sps.GetProduct_id());
103 
104  int product_len = sps.CanGetProduct_length()?sps.GetProduct_length():0;
105  if (is_protein)
106  product_len *=3;
107  int prod_prev = -1;
108  bool prev_3_prime_splice = false;
109 
110  int target_len = product_len;
111 
112  vector<TSignedSeqRange> transcript_exons;
113  TInDels indels;
114 
115  if(!sps.CanGetGenomic_id())
116  NCBI_THROW(CGnomonException, eGenericError, "CSpliced_seg must have genomic id");
117 
118  const CSeq_id& genomic_id = sps.GetGenomic_id();
119 
120  string mismatches;
121  string mismstatus;
122 
123  if(seq_align.CanGetExt()) {
124  int count = 0;
125  ITERATE(CSeq_align::TExt, i, seq_align.GetExt()) {
126  if((*i)->IsSetType() && (*i)->GetType().IsStr()) {
127  string type = (*i)->GetType().GetStr();
128  if(type == "RNASeq-Counts") {
129  ITERATE(CUser_object::TData, j, (*i)->GetData()) {
130  if(*j && (*j)->CanGetLabel() && (*j)->GetLabel().IsStr()) {
131  string label = (*j)->GetLabel().GetStr();
132  if(NStr::EndsWith(label, "alignments") || label.find(' ') == string::npos)
133  count += (*j)->GetData().GetInt();
134  }
135  }
136  } else if(type == "MismatchedBases") {
137  mismatches = (*i)->GetData().front()->GetData().GetStr();
138  } else if(type == "MismatchedBasesStatus") {
139  mismstatus = (*i)->GetData().front()->GetData().GetStr();
140  }
141  }
142  }
143  if(count > 0)
144  SetWeight(count);
145  if(Strand() == eMinus) {
146  reverse(mismatches.begin(),mismatches.end());
147  reverse(mismstatus.begin(),mismstatus.end());
148  }
149  }
150 
151 #ifdef _DEBUG
152  bool ggap_model= false;
153 #endif
154 
155  ITERATE(CSpliced_seg::TExons, e_it, sps.GetExons()) {
156  const CSpliced_exon& exon = **e_it;
157  int prod_cur_start = exon.GetProduct_start().AsSeqPos();
158  int prod_cur_end = exon.GetProduct_end().AsSeqPos();
159  if (is_product_reversed) {
160  int tmp = prod_cur_start;
161  prod_cur_start = product_len - prod_cur_end -1;
162  prod_cur_end = product_len - tmp -1;
163  }
164  int nuc_cur_start = exon.GetGenomic_start();
165  int nuc_cur_end = exon.GetGenomic_end();
166 
167  // bool cur_5_prime_splice = exon.CanGetAcceptor_before_exon() && exon.GetAcceptor_before_exon().CanGetBases() && exon.GetAcceptor_before_exon().GetBases().size()==2;
168  bool cur_5_prime_splice = exon.CanGetAcceptor_before_exon() && exon.GetAcceptor_before_exon().CanGetBases();
169 
170  if (prod_prev+1 != prod_cur_start || !prev_3_prime_splice || !cur_5_prime_splice) {
171  AddHole();
172  if(!mismatches.empty()) // also will take care about first exon
173  mismatches = mismatches.substr(prod_cur_start - prod_prev -1);
174  }
175 
176  // prev_3_prime_splice = exon.CanGetDonor_after_exon() && exon.GetDonor_after_exon().CanGetBases() && exon.GetDonor_after_exon().GetBases().size()==2;
177  prev_3_prime_splice = exon.CanGetDonor_after_exon() && exon.GetDonor_after_exon().CanGetBases();
178 
179  string fs, ss;
181  fs = exon.GetAcceptor_before_exon().GetBases();
182  }
184  ss = exon.GetDonor_after_exon().GetBases();
185  }
186  if(Strand() == eMinus) {
187  swap(fs,ss);
188  }
189 
190  double eident = 0;
191  if(exon.CanGetScores() && exon.GetScores().CanGet()) {
192  CScore_set::Tdata scores = exon.GetScores().Get();
193  ITERATE(CScore_set::Tdata, it, scores) {
194  if((*it)->CanGetId() && (*it)->GetId().IsStr()) {
195  if((*it)->GetId().GetStr() == "idty") {
196  eident = (*it)->GetValue().GetReal();
197  break;
198  }
199  }
200  }
201  }
202 
203  if(!exon.CanGetGenomic_id() || !(exon.GetGenomic_id() < genomic_id || genomic_id < exon.GetGenomic_id())) { // normal exon
204  AddNormalExon(TSignedSeqRange(nuc_cur_start,nuc_cur_end), fs, ss, eident, Strand() == eMinus);
205  } else { // genomic gap
206  if(!exon.CanGetGenomic_strand())
207  NCBI_THROW(CGnomonException, eGenericError, "CSpliced_exon for gap filling must have genomic strand");
210  scope.AddDefaults();
211 
212  string transcript = GetDNASequence(fill_id,scope);
213  string fill_seq = transcript.substr(nuc_cur_start, nuc_cur_end-nuc_cur_start+1);
214 
215  CInDelInfo::SSource fill_src;
216  fill_src.m_acc = CIdHandler::ToString(*fill_id);
217  fill_src.m_strand = ePlus;
218  fill_src.m_range = TSignedSeqRange(nuc_cur_start, nuc_cur_end);
219 
220  if(exon.GetGenomic_strand() == eNa_strand_minus) {
221  fill_src.m_strand = eMinus;
222  ReverseComplement(fill_seq.begin(),fill_seq.end());
223  }
224 
225  AddGgapExon(eident, fill_seq, fill_src, Strand() == eMinus);
226 #ifdef _DEBUG
227  ggap_model = true;
228 #endif
229  }
230  transcript_exons.push_back(TSignedSeqRange(exon.GetProduct_start().AsSeqPos(), exon.GetProduct_end().AsSeqPos()));
231 
232  _ASSERT(transcript_exons.back().NotEmpty());
233 
234  int pos = 0;
235  int prod_pos = prod_cur_start;
236 
237  ITERATE(CSpliced_exon::TParts, p_it, exon.GetParts()) {
238  const CSpliced_exon_chunk& chunk = **p_it;
240  if(!mismstatus.empty() && (chunk.IsProduct_ins() || chunk.IsGenomic_ins() || chunk.IsMismatch())) {
241  if(mismstatus[0] == 'n')
242  indelstatus = CInDelInfo::eGenomeNotCorrect;
243  else if(mismstatus[0] == 'c')
244  indelstatus = CInDelInfo::eGenomeCorrect;
245  mismstatus = mismstatus.substr(1);
246  }
247  if (chunk.IsProduct_ins()) {
248  string v = kEmptyStr;
249  int product_ins = chunk.GetProduct_ins();
250  if(!mismatches.empty()) {
251  v = mismatches.substr(0,product_ins);
252  mismatches = mismatches.substr(product_ins);
253  }
254  if(Strand() == eMinus)
255  reverse(v.begin(),v.end());
256  CInDelInfo fs(Strand()==ePlus?nuc_cur_start+pos:nuc_cur_end-pos+1, product_ins, CInDelInfo::eDel, v);
257  if (Strand() == ePlus)
258  indels.push_back(fs);
259  else
260  indels.insert(indels.begin(), fs);
261  prod_pos += chunk.GetProduct_ins();
262  } else if (chunk.IsGenomic_ins()) {
263  const int genomic_ins = chunk.GetGenomic_ins();
264  if (Strand()==ePlus)
265  indels.push_back(CInDelInfo(nuc_cur_start+pos, genomic_ins, CInDelInfo::eIns));
266  else
267  indels.insert(indels.begin(), CInDelInfo(nuc_cur_end-pos-genomic_ins+1, genomic_ins, CInDelInfo::eIns));
268  pos += genomic_ins;
269  } else if (chunk.IsMatch()) {
270  pos += chunk.GetMatch();
271  prod_pos += chunk.GetMatch();
272  } else if (chunk.IsMismatch()) {
273  int mismatch_len = chunk.GetMismatch();
274  string v(mismatch_len,'N');
275  if(!mismatches.empty()) {
276  _ASSERT(mismatch_len <= (int)mismatches.length());
277  v = mismatches.substr(0,mismatch_len);
278  mismatches = mismatches.substr(mismatch_len);
279  }
280  if(Strand() == ePlus) {
281  indels.push_back(CInDelInfo(nuc_cur_start+pos, mismatch_len, CInDelInfo::eMism, v));
282  } else {
283  reverse(v.begin(),v.end());
284  indels.insert(indels.begin(), CInDelInfo(nuc_cur_end-pos-mismatch_len+1, mismatch_len, CInDelInfo::eMism, v));
285  }
286  pos += mismatch_len;
287  prod_pos += mismatch_len;
288  } else { // if (chunk.IsDiag())
289  pos += chunk.GetDiag();
290  prod_pos += chunk.GetDiag();
291  }
292  if(indelstatus != CInDelInfo::eUnknown) {
293  if(Strand() == ePlus)
294  indels.back().SetStatus(indelstatus);
295  else
296  indels.front().SetStatus(indelstatus);
297  }
298  }
299 
300  prod_prev = prod_cur_end;
301  }
302 
303  _ASSERT(mismatches.empty() || (product_len - prod_prev - 1 == (int)mismatches.length()));
304 
305  sort(transcript_exons.begin(),transcript_exons.end());
306  bool minusstrand = Strand() == eMinus;
307  EStrand orientation = (is_product_reversed == minusstrand) ? ePlus : eMinus;
308  if(orientation == eMinus)
309  reverse(transcript_exons.begin(),transcript_exons.end());
310 
311  if (is_protein) {
312  _ASSERT(orientation == Strand());
313  if (sps.CanGetModifiers()) {
315  if ((*m)->IsStop_codon_found()) {
316  target_len += 3;
317  if (Strand() == ePlus) {
318  ExtendRight( 3 );
319  _ASSERT((transcript_exons.back().GetTo()+1)%3 == 0);
320  transcript_exons.back().SetTo(transcript_exons.back().GetTo()+3);
321  } else {
322  ExtendLeft( 3 );
323  _ASSERT((transcript_exons.front().GetTo()+1)%3 == 0);
324  transcript_exons.front().SetTo(transcript_exons.front().GetTo()+3);
325  }
326  }
327  }
328  }
329  }
330 
331  //convert tandem indels into indel+mism
332  bool keepdoing = true;
333  while(keepdoing) {
334  keepdoing = false;
335  NON_CONST_ITERATE(TInDels, indl, indels) {
336  TInDels::iterator indl_next = indl;
337  if(++indl_next == indels.end())
338  break;
339 
340  if(indl->InDelEnd() == indl_next->Loc()) { // tandem
341  string new_seq = indl->GetInDelV()+indl_next->GetInDelV();
342  TSignedSeqPos new_seq_len = (TSignedSeqPos)new_seq.size();
343  if(indl->GetType() == indl_next->GetType()) { // combine same
344  *indl = CInDelInfo(indl->Loc(), indl->Len()+indl_next->Len(), indl->GetType(), new_seq);
345  indels.erase(indl_next);
346  keepdoing = true;
347  } else if(!indl->IsMismatch() && !indl_next->IsMismatch()) { // tandem indels
348  if(indl->Len() == indl_next->Len()) {
349  *indl = CInDelInfo(indl->Loc(), indl->Len(), CInDelInfo::eMism, new_seq);
350  indels.erase(indl_next);
351  } else if(indl->Len() < indl_next->Len()) {
352  *indl = CInDelInfo(indl->Loc(), indl->Len(), CInDelInfo::eMism, new_seq.substr(0,indl->Len()));
353  *indl_next = CInDelInfo(indl->InDelEnd(), indl_next->Len()-indl->Len(), indl_next->GetType(), new_seq.substr(indl->Len()));
354  } else { // indl_next->Len() < indl->Len()
355  *indl = CInDelInfo(indl->Loc(), indl->Len()-indl_next->Len(), indl->GetType(), new_seq.substr(0,new_seq_len-indl_next->Len()));
356  *indl_next = CInDelInfo(indl->InDelEnd(), indl_next->Len(), CInDelInfo::eMism, new_seq.substr(new_seq_len-indl_next->Len()));
357  }
358  keepdoing = true;
359  }
360  }
361  }
362  }
363 
364  m_alignmap = CAlignMap(Exons(), transcript_exons, indels, orientation, target_len );
365  FrameShifts() = indels;
366 
367  TSignedSeqRange newlimits = m_alignmap.ShrinkToRealPoints(Limits(),is_protein);
368  if(newlimits != Limits()) {
369  Clip(newlimits,CAlignModel::eRemoveExons);
370  }
371 
372  for (CGeneModel::TExons::const_iterator piece_begin = Exons().begin(); piece_begin != Exons().end(); ++piece_begin) {
373  _ASSERT( !piece_begin->m_fsplice );
374 
375  if(piece_begin->Limits().Empty()) { // ggap
376  _ASSERT(piece_begin->m_ssplice);
377  ++piece_begin;
378  _ASSERT(piece_begin->Limits().NotEmpty());
379  }
380 
381  CGeneModel::TExons::const_iterator piece_end;
382  for (piece_end = piece_begin; piece_end != Exons().end() && piece_end->m_ssplice; ++piece_end) ;
383  _ASSERT( piece_end != Exons().end() );
384 
385  CGeneModel::TExons::const_iterator piece_end_g = piece_end;
386  if(piece_end_g->Limits().Empty()) { // ggap
387  _ASSERT(piece_end_g->m_fsplice);
388  --piece_end_g;
389  _ASSERT(piece_end_g->Limits().NotEmpty());
390  }
391 
392  TSignedSeqRange piece_range(piece_begin->GetFrom(),piece_end_g->GetTo());
393 
394  piece_range = m_alignmap.ShrinkToRealPoints(piece_range, is_protein); // finds first projectable interval (on codon boundaries for proteins)
395 
396  /*
397  TSignedSeqRange pr;
398  while(pr != piece_range) {
399  pr = piece_range;
400  ITERATE(TInDels, i, indels) { // here we check that no indels touch our interval from outside
401  if((i->IsDeletion() && i->Loc() == pr.GetFrom()) || ((i->IsInsertion() || i->IsMismatch()) && i->Loc()+i->Len() == pr.GetFrom()))
402  pr.SetFrom(pr.GetFrom()+1);
403  else if(i->Loc() == pr.GetTo()+1)
404  pr.SetTo(pr.GetTo()-1);
405  }
406  if(pr != piece_range)
407  piece_range = m_alignmap.ShrinkToRealPoints(pr, is_protein);
408  }
409  */
410 
411  _ASSERT(piece_range.NotEmpty());
412  _ASSERT(piece_range.IntersectingWith(piece_begin->Limits()) && piece_range.IntersectingWith(piece_end_g->Limits()));
413 
414  if(piece_range.GetFrom() != piece_begin->GetFrom() || piece_range.GetTo() != piece_end_g->GetTo()) {
415  _ASSERT(!ggap_model);
416  Clip(piece_range, CGeneModel::eDontRemoveExons);
417  }
418 
419  piece_begin = piece_end;
420  }
421 
422  if (is_protein) {
423  TSignedSeqRange reading_frame = m_alignmap.MapRangeOrigToEdited(Limits(), true);
424  TSignedSeqRange start, stop;
425  if (sps.CanGetModifiers()) {
427  if ((*m)->IsStart_codon_found()) {
428  start = TSignedSeqRange(reading_frame.GetFrom(),reading_frame.GetFrom()+2);
429  reading_frame.SetFrom(start.GetTo()+1);
430  } else if ((*m)->IsStop_codon_found()) {
431  stop = TSignedSeqRange(reading_frame.GetTo()-2,reading_frame.GetTo());
432  reading_frame.SetTo(stop.GetFrom()-1);
433  }
434  }
435  }
436 
437  CCDSInfo cds_info_t;
438  cds_info_t.SetReadingFrame(reading_frame, true);
439  if (start.NotEmpty()) {
440  cds_info_t.SetStart(start, false);
441  }
442  if (stop.NotEmpty()) {
443  cds_info_t.SetStop(stop, false);
444  }
445  CCDSInfo cds_info_g = cds_info_t.MapFromEditedToOrig(m_alignmap);
446  if(cds_info_g.ReadingFrame().NotEmpty()) // successful projection
447  SetCdsInfo(cds_info_g);
448  else
449  SetCdsInfo(cds_info_t);
450  }
451 
452  if (sps.IsSetPoly_a()) {
453  Status() |= CGeneModel::ePolyA;
454  }
455 
456  if(seq_align.CanGetScore()) {
457  const CSeq_align::TScore& score = seq_align.GetScore();
458  ITERATE(CSeq_align::TScore, it, score) {
459  if((*it)->CanGetId() && (*it)->GetId().IsStr()) {
460  string scr = (*it)->GetId().GetStr();
461  if((scr == "N of matches") || (scr == "num_ident") || (scr == "matches")) {
462  double ident = (*it)->GetValue().GetInt();
463  ident /= seq_align.GetAlignLength();
464  SetIdent(ident);
465  } else if(scr == "rank" && (*it)->GetValue().GetInt() == 1) {
466  Status() |= CGeneModel::eBestPlacement;
467  } else if(scr == "ambiguous_orientation") {
469  } else if(scr == "count") {
470  _ASSERT(Weight() == 1 || Weight() == (*it)->GetValue().GetInt());
471  SetWeight((*it)->GetValue().GetInt());
472  }
473  }
474  }
475  }
476 }
477 
478 
479 
480 string CGeneModel::GetCdsDnaSequence (const CResidueVec& contig_sequence) const
481 {
482  if(ReadingFrame().Empty())
483  return kEmptyStr;
484 
486  CCDSInfo cds_info = GetCdsInfo();
487  if(cds_info.IsMappedToGenome())
488  cds_info = cds_info.MapFromOrigToEdited(amap);
489  int cds_len = cds_info.Cds().GetLength();
490  int cds_start = cds_info.Cds().GetFrom()-TranscriptLimits().GetFrom();
491 
492  CResidueVec mrna;
493  amap.EditedSequence(contig_sequence,mrna);
494 
495  string cds_seq(cds_len,'A');
496  copy(mrna.begin()+cds_start, mrna.begin()+cds_start+cds_len, cds_seq.begin());
497 
498  if(Status() & eReversed)
499  ReverseComplement(cds_seq.begin(),cds_seq.end());
500 
501  return cds_seq;
502 }
503 
504 string CGeneModel::GetProtein (const CResidueVec& contig_sequence) const
505 {
506  string cds_seq = GetCdsDnaSequence(contig_sequence);
507  string prot_seq;
508 
509  objects::CSeqTranslator::Translate(cds_seq, prot_seq, objects::CSeqTranslator::fIs5PrimePartial);
510 
511  if(PStop()) {
512  CCDSInfo cds_info = GetCdsInfo();
513  if(cds_info.IsMappedToGenome()) {
514  CAlignMap amap = GetAlignMap();
515  cds_info = cds_info.MapFromOrigToEdited(amap);
516  }
517  ITERATE(CCDSInfo::TPStops, stp, cds_info.PStops()) {
518  if(stp->m_status == CCDSInfo::eSelenocysteine)
519  prot_seq[(stp->GetFrom()- cds_info.Cds().GetFrom())/3] = 'U';
520  }
521  }
522 
523  return prot_seq;
524 }
525 
526 string CGeneModel::GetProtein (const CResidueVec& contig_sequence, const CGenetic_code* gencode) const
527 {
528  string cds_seq = GetCdsDnaSequence(contig_sequence);
529  string prot_seq;
530 
531  objects::CSeqTranslator::Translate(cds_seq, prot_seq, objects::CSeqTranslator::fDefault, gencode );
532  if (prot_seq[0] == '-') {
533  string first_triplet = cds_seq.substr(0, 3);
534  string first_aa;
535  objects::CSeqTranslator::Translate(first_triplet, first_aa, objects::CSeqTranslator::fIs5PrimePartial, gencode );
536  prot_seq = first_aa+prot_seq.substr(1);
537  }
538 
539  if(PStop()) {
540  CCDSInfo cds_info = GetCdsInfo();
541  if(cds_info.IsMappedToGenome()) {
542  CAlignMap amap = GetAlignMap();
543  cds_info = cds_info.MapFromOrigToEdited(amap);
544  }
545  ITERATE(CCDSInfo::TPStops, stp, cds_info.PStops()) {
546  if(stp->m_status == CCDSInfo::eSelenocysteine)
547  prot_seq[(stp->GetFrom()- cds_info.Cds().GetFrom())/3] = 'U';
548  }
549  }
550 
551  return prot_seq;
552 }
553 
554 //
555 // helper function - convert a vector<TSignedSeqRange> into a compact CSeq_loc
556 //
557 namespace {
558 
559 CRef<CSeq_loc> s_ExonDataToLoc(const vector<TSignedSeqRange>& vec,
560  ENa_strand strand, const CSeq_id& id)
561 {
562  CRef<CSeq_loc> loc(new CSeq_loc());
563 
565  ITERATE (vector<TSignedSeqRange>, iter, vec) {
567  ival->SetFrom(iter->GetFrom());
568  ival->SetTo(iter->GetTo());
569  ival->SetStrand(strand);
570  ival->SetId().Assign(id);
571 
572  data.push_back(ival);
573  }
574 
575  if (data.size() == 1) {
576  loc->SetInt(*data.front());
577  } else {
578  loc->SetPacked_int().Set().swap(data);
579  }
580 
581  return loc;
582 }
583 
584 } //end unnamed namespace
585 
587 {
588  TGeneModelList genes = GetGenes();
589 
591 
592  annot->SetNameDesc("Gnomon gene scan output");
593 
594  CSeq_annot::C_Data::TFtable& ftable = annot->SetData().SetFtable();
595 
596  unsigned int counter = 0;
597  string locus_tag_base("GNOMON_");
598  ITERATE (TGeneModelList, it, genes) {
599  const CGeneModel& igene = *it;
600  int strand = igene.Strand();
601  TSignedSeqRange cds_limits = igene.RealCdsLimits();
602 
603  vector<TSignedSeqRange> mrna_vec;
604  copy(igene.Exons().begin(), igene.Exons().end(), back_inserter(mrna_vec));
605  vector<TSignedSeqRange> cds_vec;
606 
607  for (size_t j = 0; j < mrna_vec.size(); ++j) {
608  TSignedSeqRange intersect(mrna_vec[j] & cds_limits);
609  if (!intersect.Empty()) {
610  cds_vec.push_back(intersect);
611  }
612  }
613 
614  // stop-codon removal from cds
615  if (igene.HasStop()) {
616  if (strand == ePlus) {
617  _ASSERT(cds_vec.back().GetLength()>=3);
618  cds_vec.back().SetTo(cds_vec.back().GetTo() - 3);
619  } else {
620  _ASSERT(cds_vec.front().GetLength()>=3);
621  cds_vec.front().SetFrom(cds_vec.front().GetFrom() + 3);
622  }
623  }
624 
625  //
626  // create our mRNA
627  CRef<CSeq_feat> feat_mrna;
628  if (mrna_vec.size()) {
629  feat_mrna = new CSeq_feat();
630  feat_mrna->SetData().SetRna().SetType(CRNA_ref::eType_mRNA);
631  feat_mrna->SetLocation
632  (*s_ExonDataToLoc(mrna_vec,
633  (strand == ePlus ? eNa_strand_plus : eNa_strand_minus), id));
634  } else {
635  continue;
636  }
637 
638  //
639  // create the accompanying CDS
640  CRef<CSeq_feat> feat_cds;
641  if (!cds_vec.empty()) {
642  feat_cds = new CSeq_feat();
643  feat_cds->SetData().SetCdregion().SetFrame(CCdregion::TFrame(1+(strand == ePlus?(igene.ReadingFrame().GetFrom()-cds_limits.GetFrom())%3:(cds_limits.GetTo()-igene.ReadingFrame().GetTo())%3)));
644 
645  feat_cds->SetLocation
646  (*s_ExonDataToLoc(cds_vec,
647  (strand == ePlus ? eNa_strand_plus : eNa_strand_minus), id));
648  }
649 
650  //
651  // create a dummy gene feature as well
652  CRef<CSeq_feat> feat_gene(new CSeq_feat());
653 
654  char buf[32];
655  sprintf(buf, "%04u", ++counter);
656  string name(locus_tag_base);
657  name += buf;
658  feat_gene->SetData().SetGene().SetLocus_tag(name);
659  feat_gene->SetLocation().SetInt()
660  .SetFrom(feat_mrna->GetLocation().GetTotalRange().GetFrom());
661  feat_gene->SetLocation().SetInt()
662  .SetTo(feat_mrna->GetLocation().GetTotalRange().GetTo());
663  feat_gene->SetLocation().SetInt().SetStrand
664  (strand == ePlus ? eNa_strand_plus : eNa_strand_minus);
665 
666  feat_gene->SetLocation().SetId
667  (sequence::GetId(feat_mrna->GetLocation(), 0));
668 
669  ftable.push_back(feat_gene);
670  ftable.push_back(feat_mrna);
671  if (feat_cds) {
672  ftable.push_back(feat_cds);
673  }
674  }
675  return annot;
676 }
677 
678 
679 //
680 //
681 // This function deduces the frame from 5p coordinate of the CDS which should be on the 5p codon boundary
682 //
683 //
684 double CCodingPropensity::GetScore(CConstRef<CHMMParameters> hmm_params, const CSeq_loc& cds, CScope& scope, int *const gccontent, double *const startscore)
685 {
686  *gccontent = 0;
687  const CSeq_id* seq_id = cds.GetId();
688  if (seq_id == NULL)
689  NCBI_THROW(CGnomonException, eGenericError, "CCodingPropensity: CDS has multiple ids or no id at all");
690 
691  // Need to know GC content in order to load correct models.
692  // Compute on the whole transcript, not just CDS.
693  CBioseq_Handle xcript_hand = scope.GetBioseqHandle(*seq_id);
694  CSeqVector xcript_vec = xcript_hand.GetSeqVector();
695  xcript_vec.SetIupacCoding();
696  unsigned int gc_count = 0;
697  CSeqVector_CI xcript_iter(xcript_vec);
698  for( ; xcript_iter; ++xcript_iter) {
699  if (*xcript_iter == 'G' || *xcript_iter == 'C') {
700  ++gc_count;
701  }
702  }
703  *gccontent = static_cast<unsigned int>(100.0 * gc_count / xcript_vec.size() + 0.5);
704 
705  const CMC3_CodingRegion<5>& cdr = dynamic_cast<const CMC3_CodingRegion<5>&>(hmm_params->GetParameter(CMC3_CodingRegion<5>::class_id(), *gccontent));
706  const CMC_NonCodingRegion<5>& ncdr = dynamic_cast<const CMC_NonCodingRegion<5>&>(hmm_params->GetParameter(CMC_NonCodingRegion<5>::class_id(), *gccontent));
707 
708  // Represent coding sequence as enum Nucleotides
709  CSeqVector vec(cds, scope);
710  vec.SetIupacCoding();
711  CEResidueVec seq;
712  seq.reserve(vec.size());
713  CSeqVector_CI iter(vec);
714  for( ; iter; ++iter) {
715  seq.push_back(fromACGT(*iter));
716  }
717 
718  // Sum coding and non-coding scores across coding sequence.
719  // Don't include stop codon!
720  double score = 0;
721  for (unsigned int i = 5; i < seq.size() - 3; ++i)
722  score += cdr.Score(seq, i, i % 3) - ncdr.Score(seq, i);
723 
724  if(startscore) {
725  //
726  // start evalustion needs 17 bases total (11 before ATG and 3 after)
727  // if there is not enough sequence it will be substituted by Ns which will degrade the score
728  //
729 
730  const CWMM_Start& start = dynamic_cast<const CWMM_Start&>(hmm_params->GetParameter(CWMM_Start::class_id(), *gccontent));
731 
732  int totallen = xcript_vec.size();
733  int left, right;
734  int extraNs5p = 0;
735  int extrabases = start.Left()+2; // (11) extrabases left of ATG needed for start (6) and noncoding (5) evaluation
736  if(cds.GetStrand() == eNa_strand_plus) {
737  int startposition = cds.GetTotalRange().GetFrom();
738  if(startposition < extrabases) {
739  left = 0;
740  extraNs5p = extrabases-startposition;
741  } else {
742  left = startposition-extrabases;
743  }
744  right = min(startposition+2+start.Right(),totallen-1);
745  } else {
746  int startposition = cds.GetTotalRange().GetTo();
747  if(startposition+extrabases >= totallen) {
748  right = totallen-1;
749  extraNs5p = startposition+extrabases-totallen+1;
750  } else {
751  right = startposition+extrabases;
752  }
753  left = max(0,startposition-2-start.Right());
754  }
755 
756 
757  CRef<CSeq_loc> startarea(new CSeq_loc());
758  CSeq_interval& d = startarea->SetInt();
759  d.SetStrand(cds.GetStrand());
760  CRef<CSeq_id> id(new CSeq_id());
761  id->Assign(*seq_id);
762  d.SetId(*id);
763  d.SetFrom(left);
764  d.SetTo(right);
765 
766  CSeqVector sttvec(*startarea, scope);
767  sttvec.SetIupacCoding();
768  CEResidueVec sttseq;
769  sttseq.resize(extraNs5p, enN); // fill with Ns if necessery
770  for(unsigned int i = 0; i < sttvec.size(); ++i) {
771  sttseq.push_back(fromACGT(sttvec[i]));
772  }
773  sttseq.resize(5+start.Left()+start.Right(), enN); // add Ns if necessery
774 
775  *startscore = start.Score(sttseq, extrabases+2);
776  if(*startscore != BadScore()) {
777  for(unsigned int i = 5; i < sttseq.size(); ++i) {
778  *startscore -= ncdr.Score(sttseq, i);
779  }
780  }
781  }
782 
783  return score;
784 }
785 
786 END_SCOPE(gnomon)
User-defined methods of the data storage class.
void EditedSequence(const In &original_sequence, Out &edited_sequence, bool includeholes=false) const
Definition: gnomon_seq.cpp:632
CBioseq_Handle –.
CCDSInfo MapFromEditedToOrig(const CAlignMap &amap) const
void SetStart(TSignedSeqRange r, bool confirmed=false)
CCDSInfo MapFromOrigToEdited(const CAlignMap &amap) const
bool IsMappedToGenome() const
TSignedSeqRange Cds() const
TSignedSeqRange ReadingFrame() const
void SetStop(TSignedSeqRange r, bool confirmed=false)
const TPStops & PStops() const
vector< SPStop > TPStops
void SetReadingFrame(TSignedSeqRange r, bool protein=false)
static double GetScore(CConstRef< CHMMParameters > hmm_params, const objects::CSeq_loc &cds, objects::CScope &scope, int *const gccontent, double *const startscore=0)
string GetProtein(const CResidueVec &contig_sequence) const
TSignedSeqRange TranscriptLimits() const
unsigned int & Status()
const TExons & Exons() const
TSignedSeqRange ReadingFrame() const
virtual CAlignMap GetAlignMap() const
TSignedSeqRange RealCdsLimits() const
string GetCdsDnaSequence(const CResidueVec &contig_sequence) const
const CCDSInfo & GetCdsInfo() const
bool HasStop() const
bool PStop(bool includeall=true) const
EStrand Strand() const
list< CGeneModel > GetGenes() const
CRef< objects::CSeq_annot > GetAnnot(const objects::CSeq_id &id)
const CInputModel & GetParameter(const string &type, int cgcontent) const
Definition: hmm.cpp:758
static string ToString(const CSeq_id &id)
Definition: id_handler.cpp:68
double Score(const CEResidueVec &seq, int i, int codonshift) const
double Score(const CEResidueVec &seq, int i) const
TSeqPos AsSeqPos() const
Definition: Product_pos.cpp:56
CScope –.
Definition: scope.hpp:92
CSeqVector –.
Definition: seq_vector.hpp:65
TSeqPos GetAlignLength(bool include_gaps=true) const
Get the length of this alignment.
Definition: Seq_align.cpp:1993
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CSpliced_exon_chunk –.
int Right() const
Definition: hmm.hpp:114
int Left() const
Definition: hmm.hpp:113
static string class_id()
Definition: hmm.hpp:150
double Score(const CEResidueVec &seq, int i) const
Definition: hmm.cpp:444
constexpr auto begin(const ct_const_array< T, N > &in) noexcept
constexpr auto end(const ct_const_array< T, N > &in) noexcept
bool Empty(const CNcbiOstrstream &src)
Definition: fileutil.cpp:523
static char tmp[3200]
Definition: utf8.c:42
char data[12]
Definition: iconv.c:80
vector< TResidue > CResidueVec
@ enN
double BadScore()
EStrand
@ eMinus
@ ePlus
void ReverseComplement(const BidirectionalIterator &first, const BidirectionalIterator &last)
list< CGeneModel > TGeneModelList
vector< CInDelInfo > TInDels
void debug()
USING_SCOPE(ncbi::objects)
Int8 GetModelId(const CSeq_align &seq_align)
EResidue fromACGT(TResidue c)
Definition: gnomon_seq.hpp:59
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
ENa_strand GetStrand(void) const
Get the location's strand.
Definition: Seq_loc.cpp:882
TRange GetTotalRange(void) const
Definition: Seq_loc.hpp:913
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
TSeqPos size(void) const
Definition: seq_vector.hpp:291
void SetIupacCoding(void)
Set coding to either Iupacaa or Iupacna depending on molecule type.
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
position_type GetLength(void) const
Definition: range.hpp:158
bool NotEmpty(void) const
Definition: range.hpp:152
bool Empty(void) const
Definition: range.hpp:148
CRange< TSignedSeqPos > TSignedSeqRange
Definition: range.hpp:420
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define kEmptyStr
Definition: ncbistr.hpp:123
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5430
static const char label[]
void SetFrom(TFrom value)
Assign a value to From data member.
Definition: Range_.hpp:231
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
void SetTo(TTo value)
Assign a value to To data member.
Definition: Range_.hpp:278
vector< CRef< CUser_field > > TData
const TDonor_after_exon & GetDonor_after_exon(void) const
Get the Donor_after_exon member data.
const TId & GetId(void) const
Get the Id member data.
Definition: Seq_align_.hpp:976
bool CanGetProduct_length(void) const
Check if it is safe to call GetProduct_length method.
const TGenomic_id & GetGenomic_id(void) const
Get the Genomic_id member data.
bool CanGetAcceptor_before_exon(void) const
Check if it is safe to call GetAcceptor_before_exon method.
bool CanGetBases(void) const
Check if it is safe to call GetBases method.
bool CanGetGenomic_id(void) const
Check if it is safe to call GetGenomic_id method.
vector< CRef< CScore > > TScore
Definition: Seq_align_.hpp:398
TMatch GetMatch(void) const
Get the variant data.
list< CRef< CScore > > Tdata
Definition: Score_set_.hpp:90
const TProduct_id & GetProduct_id(void) const
Get the Product_id member data.
TGenomic_start GetGenomic_start(void) const
Get the Genomic_start member data.
const TAcceptor_before_exon & GetAcceptor_before_exon(void) const
Get the Acceptor_before_exon member data.
bool CanGetGenomic_id(void) const
Check if it is safe to call GetGenomic_id method.
bool IsMismatch(void) const
Check if variant Mismatch is selected.
TProduct_length GetProduct_length(void) const
Get the Product_length member data.
bool CanGetProduct_type(void) const
Check if it is safe to call GetProduct_type method.
bool IsSetPoly_a(void) const
start of poly(A) tail on the transcript For sense transcripts: aligned product positions < poly-a <= ...
bool CanGetModifiers(void) const
Check if it is safe to call GetModifiers method.
TDiag GetDiag(void) const
Get the variant data.
bool CanGetScore(void) const
Check if it is safe to call GetScore method.
Definition: Seq_align_.hpp:890
TProduct_type GetProduct_type(void) const
Get the Product_type member data.
TMismatch GetMismatch(void) const
Get the variant data.
list< CRef< CUser_object > > TExt
Definition: Seq_align_.hpp:402
bool CanGetProduct_strand(void) const
Check if it is safe to call GetProduct_strand method.
const TParts & GetParts(void) const
Get the Parts member data.
const TProduct_start & GetProduct_start(void) const
Get the Product_start member data.
const TProduct_end & GetProduct_end(void) const
Get the Product_end member data.
const TSpliced & GetSpliced(void) const
Get the variant data.
Definition: Seq_align_.cpp:219
list< CRef< CSpliced_seg_modifier > > TModifiers
bool IsGenomic_ins(void) const
Check if variant Genomic_ins is selected.
bool IsMatch(void) const
Check if variant Match is selected.
bool CanGetGenomic_strand(void) const
Check if it is safe to call GetGenomic_strand method.
TGenomic_ins GetGenomic_ins(void) const
Get the variant data.
const TScores & GetScores(void) const
Get the Scores member data.
bool CanGetExt(void) const
Check if it is safe to call GetExt method.
Definition: Seq_align_.hpp:995
list< CRef< CSpliced_exon > > TExons
const TExons & GetExons(void) const
Get the Exons member data.
TGenomic_strand GetGenomic_strand(void) const
Get the Genomic_strand member data.
bool CanGetScores(void) const
Check if it is safe to call GetScores method.
const TExt & GetExt(void) const
Get the Ext member data.
const TBases & GetBases(void) const
Get the Bases member data.
list< CRef< CSpliced_exon_chunk > > TParts
bool CanGetDonor_after_exon(void) const
Check if it is safe to call GetDonor_after_exon method.
TGenomic_end GetGenomic_end(void) const
Get the Genomic_end member data.
const Tdata & Get(void) const
Get the member data.
Definition: Score_set_.hpp:165
TProduct_strand GetProduct_strand(void) const
Get the Product_strand member data.
bool CanGet(void) const
Check if it is safe to call Get method.
Definition: Score_set_.hpp:159
bool IsProduct_ins(void) const
Check if variant Product_ins is selected.
const TScore & GetScore(void) const
Get the Score member data.
Definition: Seq_align_.hpp:896
const TModifiers & GetModifiers(void) const
Get the Modifiers member data.
TProduct_ins GetProduct_ins(void) const
Get the variant data.
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
const TGenomic_id & GetGenomic_id(void) const
Get the Genomic_id member data.
bool CanGetId(void) const
Check if it is safe to call GetId method.
Definition: Seq_align_.hpp:970
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
void SetTo(TTo value)
Assign a value to To data member.
list< CRef< CSeq_interval > > Tdata
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
void SetId(TId &value)
Assign a value to Id data member.
void SetFrom(TFrom value)
Assign a value to From data member.
void SetStrand(TStrand value)
Assign a value to Strand data member.
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
list< CRef< CSeq_feat > > TFtable
Definition: Seq_annot_.hpp:193
string GetDNASequence(CConstRef< objects::CSeq_id > id, CScope &scope)
Definition: id_handler.cpp:130
char * buf
int i
constexpr auto sort(_Init &&init)
Magic spell ;-) needed for some weird compilers... very empiric.
T max(T x_, T y_)
T min(T x_, T y_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
The Object manager core.
static bool Translate(CSeq_feat &feat, string &prot)
Definition: nucprot.cpp:1393
TSignedSeqRange m_range
Definition: type.c:6
#define _ASSERT
#define ftable
Definition: utilfeat.h:37
Modified on Tue May 28 05:51:56 2024 by modify_doxy.py rev. 669887