NCBI C++ ToolKit
annot.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: annot.cpp 100690 2023-08-30 14:46:42Z souvorov $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Vyacheslav Chetvernin
27  *
28  * File Description:
29  *
30  * Builds annotation models out of chained alignments:
31  * selects good chains as alternatively spliced genes,
32  * selects good chains inside other chains' introns,
33  * other chains filtered to leave one chain per placement,
34  * gnomon is run to improve chains and predict models in regions w/o chains
35  *
36  */
37 
38 #include <ncbi_pch.hpp>
39 #include <corelib/ncbiapp.hpp>
40 #include <corelib/ncbienv.hpp>
41 #include <corelib/ncbiargs.hpp>
42 
43 #include <algo/gnomon/annot.hpp>
44 #include "gnomon_engine.hpp"
46 #include <algo/gnomon/gnomon.hpp>
48 
50 
53 BEGIN_SCOPE(gnomon)
54 
56 {
57 }
58 
60 {
61 }
62 
64 {
65 }
66 
67 bool s_AlignScoreOrder(const CGeneModel& ap, const CGeneModel& bp)
68 {
69  return (ap.Score() < bp.Score());
70 }
71 
73 {
74  NON_CONST_ITERATE(TGeneModelList, it, chains) {
75  it->RemoveShortHolesAndRescore(*m_gnomon);
76  }
77 }
78 
80  TSignedSeqPos left, TSignedSeqPos right)
81 {
82  TGeneModelList test_align;
83  test_align.push_back(chain);
84  int l = max((int)left,(int)chain.Limits().GetFrom()-10000);
85  int r = min(right,chain.Limits().GetTo()+10000);
86  cerr << "Testing alignment " << chain.ID() << " in fragment " << l << ' ' << r << endl;
87 
88  m_gnomon->ResetRange(l,r);
89  return m_gnomon->Run(test_align, false, false, false, false, mpp, nonconsensp, m_notbridgeable_gaps_len, m_inserted_seqs, m_pcsf_slice.get());
90 }
91 
93  bool leftwall, bool rightwall, bool leftanchor, bool rightanchor,
94  TSignedSeqPos left, TSignedSeqPos right,
95  TSignedSeqRange& tested_range)
96 {
97  bool already_tested = Include(tested_range, TSignedSeqRange(left,right));
98 
99  if (already_tested) {
100  for(TGeneModelList::iterator it = aligns.begin(); it != aligns.end(); it++) {
101  if(left <= it->Limits().GetTo() && it->Limits().GetFrom() <= right)
102  suspect_aligns.push_back(*it);
103  }
104  } else {
105  tested_range = TSignedSeqRange(left,right);
106 
107  bool found_bad_cluster = false;
108  for(TGeneModelList::iterator it = aligns.begin(); it != aligns.end(); ) {
109  if(it->Limits().GetTo() < left || it->Limits().GetFrom() > right) {
110  ++it;
111  continue;
112  }
113 
114  if ((it->Type() & (CGeneModel::eWall | CGeneModel::eNested))==0 &&
115  ExtendJustThisChain(*it, left, right) == BadScore()) {
116  found_bad_cluster = true;
117  cerr << "Deleting alignment " << it->ID() << endl;
118  it->Status() |= CGeneModel::eSkipped;
119  it->AddComment("Bad score prediction alone");
120  bad_aligns.push_back(*it);
121 
122  it = aligns.erase(it);
123  continue;
124  }
125  suspect_aligns.push_back(*it++);
126  }
127 
128  m_gnomon->ResetRange(left, right);
129  if(found_bad_cluster) {
130  cerr << "Testing w/o bad alignments in fragment " << left << ' ' << right << endl;
131  return m_gnomon->Run(suspect_aligns, leftwall, rightwall, leftanchor, rightanchor, mpp, nonconsensp, m_notbridgeable_gaps_len, m_inserted_seqs, m_pcsf_slice.get());
132  }
133  }
134  return BadScore();
135 }
136 
138  bool leftwall, bool rightwall, bool leftanchor, bool rightanchor)
139 {
140  double score = BadScore();
141  for(TGeneModelList::iterator it = suspect_aligns.begin(); it != suspect_aligns.end();) {
142  if((it->Type() & (CGeneModel::eWall | CGeneModel::eNested))!=0) {
143  ++it;
144  continue;
145  }
146  CGeneModel algn = *it;
147  it = suspect_aligns.erase(it);
148 
149  cerr << "Testing w/o " << algn.ID();
150  score = m_gnomon->Run(suspect_aligns, leftwall, rightwall, leftanchor, rightanchor, mpp, nonconsensp, m_notbridgeable_gaps_len, m_inserted_seqs, m_pcsf_slice.get());
151  if (score != BadScore()) {
152  cerr << "- Good. Deleting alignment " << algn.ID() << endl;
153  algn.Status() |= CGeneModel::eSkipped;
154  algn.AddComment("Good score prediction without");
155  bad_aligns.push_back(algn);
156  break;
157  } else {
158  cerr << " - Still bad." << endl;
159  }
160  suspect_aligns.insert(it,algn);
161  }
162  return score;
163 }
164 
166  bool leftwall, bool rightwall, bool leftanchor, bool rightanchor)
167 {
168  double score = BadScore();
169  for (TGeneModelList::iterator it = suspect_aligns.begin(); score == BadScore() && it != suspect_aligns.end(); ) {
170  if ((it->Type() & (CGeneModel::eWall | CGeneModel::eNested))!=0 || it->GoodEnoughToBeAnnotation()) {
171  ++it;
172  continue;
173  }
174  cerr << "Deleting alignment " << it->ID() << endl;
175  it->Status() |= CGeneModel::eSkipped;
176  it->AddComment("Bad score prediction in combination");
177  bad_aligns.push_back(*it);
178  it = suspect_aligns.erase(it);
179 
180  cerr << "Testing fragment " << left << ' ' << right << endl;
181  score = m_gnomon->Run(suspect_aligns, leftwall, rightwall, leftanchor, rightanchor, mpp, nonconsensp, m_notbridgeable_gaps_len, m_inserted_seqs, m_pcsf_slice.get());
182  }
183  return score;
184 }
185 
// Windowed prediction over [llimit, rlimit].
//
// Copies the alignments [il, ir) into a local list and repeatedly runs m_gnomon on a
// window [left, right]. Genes returned by m_gnomon->GetGenes() are appended to `models`;
// alignments rejected while resolving a BadScore() window are appended to `bad_aligns`.
//
//  llimit, rlimit                   - limits of the region to process
//  il, ir                           - range of alignments to use
//  models                           - receives the predicted models
//  leftmostwall, rightmostwall      - wall flags used at the ends of the region
//  leftmostanchor, rightmostanchor  - anchor flags used at the ends of the region
//  bad_aligns                       - receives the rejected alignments
186 void CGnomonAnnotator::Predict(TSignedSeqPos llimit, TSignedSeqPos rlimit, TGeneModelList::const_iterator il, TGeneModelList::const_iterator ir, TGeneModelList& models,
187  bool leftmostwall, bool rightmostwall, bool leftmostanchor, bool rightmostanchor, TGeneModelList& bad_aligns)
188 {
189  TGeneModelList aligns(il, ir);
190 
191  TSignedSeqPos left = llimit;
192  bool leftwall = leftmostwall;
193  bool leftanchor = leftmostanchor;
194 
195  TSignedSeqPos right = llimit+window;
196  bool rightwall = false;
197  bool rightanchor = false;
198 
// right end of the last window that returned BadScore(); rlimit+1 means "none"
199  Int8 prev_bad_right = rlimit+1;
// set when the same left end has to be predicted again with a new right end
200  bool do_it_again = false;
201 
202  m_gnomon->ResetRange(left, right);
203 
// NOTE(review): listing line 204 is absent from this extract - confirm its content
// against the original file.
205 
206  TGeneModelList suspect_aligns;
207  TSignedSeqRange tested_range;
208 
// busy_spots[i] != 0 for every position within `margin` of an alignment;
// a window is not allowed to end on such a position
209  TIVec busy_spots(rlimit+1,0);
210  ITERATE(TGeneModelList, it_c, aligns) {
211  int a = max(0,it_c->Limits().GetFrom()-margin);
212  int b = min(rlimit,it_c->Limits().GetTo()+margin);
213  for(int i = a; i<=b; ++i)
214  busy_spots[i] = 1;
215  }
216 
217  do {
// move the right end of the window to the first free spot
218  for( ; right < rlimit && busy_spots[right] != 0; ++right);
219 
// if less than half a window would be left over, take everything up to rlimit
220  if (right + (right-left)/2 >= rlimit) {
221  right = rlimit;
222  rightwall = rightmostwall;
223  rightanchor = rightmostanchor;
224  } else {
225  rightwall = false;
226  rightanchor = false;
227  }
228 
229  if (do_it_again)
230  rightwall = true;
231 
232  double score = BadScore();
233 
234  if (right < prev_bad_right) {
// this window has not failed yet: run with all alignments
235  suspect_aligns.clear();
236 
237  m_gnomon->ResetRange(left,right);
238 
239  cerr << left << ' ' << right << ' ' << m_gnomon->GetGCcontent() << endl;
240 
241  score = m_gnomon->Run(aligns, leftwall, rightwall, leftanchor, rightanchor, mpp, nonconsensp, m_notbridgeable_gaps_len, m_inserted_seqs, m_pcsf_slice.get());
242 
243  if(score == BadScore()) {
244  cerr << "Inconsistent alignments in fragment " << left << ' ' << right << '\n';
245 
246  score = TryWithoutObviouslyBadAlignments(aligns, suspect_aligns, bad_aligns,
247  leftwall, rightwall, leftanchor, rightanchor,
248  left, right, tested_range);
249  }
250 
251  if(score == BadScore()) {
252 
// still bad: remember this right end and retry with half of the window
253  prev_bad_right = right;
254  right = (left+right)/2;
255 
256  continue;
257  }
258  } else {
// the halved window grew back to (or past) the previously failing right end:
// eliminate suspect alignments, lowest score first
259  suspect_aligns.sort(s_AlignScoreOrder);
260 
261  score = TryToEliminateOneAlignment(suspect_aligns, bad_aligns,
262  leftwall, rightwall, leftanchor, rightanchor);
263  if (score == BadScore())
264  score = TryToEliminateAlignmentsFromTail(suspect_aligns, bad_aligns,
265  leftwall, rightwall, leftanchor, rightanchor);
266  if(score == BadScore()) {
267  cerr << "!!! BAD SCORE EVEN WITH FINISHED ALIGNMENTS !!! " << endl;
// keep the remaining alignments that are good enough as models
268  ITERATE(TGeneModelList, it, suspect_aligns) {
269  if ((it->Type() & (CGeneModel::eWall | CGeneModel::eNested))==0 && it->GoodEnoughToBeAnnotation())
270  models.push_back(*it);
271  }
272  }
273  }
274  prev_bad_right = rlimit+1;
275 
276  list<CGeneModel> genes = m_gnomon->GetGenes();
277 
278  TSignedSeqPos partial_start = right;
279 
// a last gene that is not RightComplete() is dropped here (unless this is a repeated
// run or the window already reaches rlimit); partial_start records where it began
280  if (right < rlimit && !genes.empty() && !genes.back().RightComplete() && !do_it_again) {
281  partial_start = genes.back().LeftComplete() ? genes.back().RealCdsLimits().GetFrom() : left;
282  _ASSERT ( partial_start < right );
283  genes.pop_back();
284  }
285 
286  do_it_again = false;
287 
// choose the left end of the next window
288  if (!genes.empty()) {
289  left = genes.back().ReadingFrame().GetTo()+1;
290  leftanchor = true;
291  } else if (partial_start < left+1000) {
292  do_it_again=true;
293  } else if (partial_start < right) {
294  int new_left = partial_start-100;
295  for( ; new_left > left && busy_spots[new_left] != 0; --new_left);
296  if(new_left > left+1000) {
297  left = new_left;
298  leftanchor = false;
299  } else {
300  do_it_again=true;
301  }
302  } else {
303  left = (left+right)/2+1;
304  leftanchor = false;
305  }
306 
307  models.splice(models.end(), genes);
308 
309  if (right >= rlimit)
310  break;
311 
312  if (!do_it_again)
313  leftwall = true;
314 
315  right = left + window;
316 
317  } while(left <= rlimit);
318 }
319 
321 {
322  return ((a.Type() & CGeneModel::eWall)!=0) ? a.Limits() : a.MaxCdsLimits();
323 }
324 
326 {
327  return m.RealCdsLimits().Empty() ? m.Limits() : m.RealCdsLimits();
328 }
329 
// Comparator for sorting gene models by position: ascending GetFrom() of the compared
// ranges; for equal GetFrom() the range with the larger GetTo() comes first.
// NOTE(review): listing lines 332-333, which must declare the ranges `a` and `b`, are
// absent from this extract - presumably they are derived from ap and bp; confirm
// against the original file.
330 bool s_AlignSeqOrder(const CGeneModel& ap, const CGeneModel& bp)
331 {
334 
335  return (a.GetFrom() != b.GetFrom() ?
336  a.GetFrom() < b.GetFrom() :
337  a.GetTo() > b.GetTo()
338  );
339 }
340 
341 void SaveWallModel(unique_ptr<CGeneModel>& wall_model, TGeneModelList& aligns)
342 {
343  if (wall_model.get() != 0 && wall_model->Type() == CGeneModel::eWall+CGeneModel::eGnomon) {
344  aligns.push_back(*wall_model);
345  }
346 }
347 
// Splits the models of one strand into clusters of overlapping ranges. For every cluster
// it either moves the models that still need work into `aligns`, or adds a wall model
// covering the cluster to `aligns`.
//
//  models - all models; rank-1 models that are not GoodEnoughToBeAnnotation() are moved out
//  aligns - receives the moved models and the wall models
//  strand - only models of this strand are considered
348 void FindPartials(TGeneModelList& models, TGeneModelList& aligns, EStrand strand)
349 {
// right end of the current cluster
350  TSignedSeqPos right = -1;
351  unique_ptr<CGeneModel> wall_model;
352 
// loop_it is advanced before *ir is used because ir may be spliced out of `models` below
353  for (TGeneModelList::iterator loop_it = models.begin(); loop_it != models.end();) {
354  TGeneModelList::iterator ir = loop_it;
355  ++loop_it;
356 
357  if(ir->Strand() != strand || (ir->Type()&CGeneModel::eNested)) { //at this point all nested are ab initio extensions yet without model/gene attributes
358  continue;
359  }
360 
// NOTE(review): listing line 361, which must declare `limits` (a range for *ir), is
// absent from this extract - confirm against the original file.
362 
363  if ( right < limits.GetFrom() ) { // new cluster
364  SaveWallModel(wall_model, aligns);
365  }
366 
367  if ( right < limits.GetFrom() ) { // new cluster
368  wall_model.reset( new CGeneModel(ir->Strand(),ir->ID(),CGeneModel::eWall+CGeneModel::eGnomon));
369  wall_model->SetGeneID(ir->GeneID());
370  wall_model->AddExon(limits);
371  }
372 
373  right = max(right, limits.GetTo());
374  if (ir->RankInGene() == 1 && !ir->GoodEnoughToBeAnnotation()) {
375  ir->Status() &= ~CGeneModel::eFullSupCDS;
376  aligns.splice(aligns.end(), models, ir);
// the type is no longer eWall+eGnomon, so SaveWallModel() will not save this wall
377  wall_model->SetType(CGeneModel::eGnomon);
378  } else if (limits.GetTo()- wall_model->Limits().GetTo() > 0) {
// the model reaches further right than the wall: extend the wall
379  wall_model->ExtendRight(limits.GetTo() - wall_model->Limits().GetTo());
380  }
381  }
// wall of the last cluster
382  SaveWallModel(wall_model, aligns);
383 }
384 
385 
387 {
388  if (models.empty() && int(m_gnomon->GetSeq().size()) < mincontig)
389  return;
390 
391  if (GnomonNeeded()) {
392 
393  //extend partial nested
394 
395  typedef list<TGeneModelList::iterator> TIterList;
396  typedef map<Int8,TIterList> TGIDIterlist;
397  TGIDIterlist genes;
398  NON_CONST_ITERATE(TGeneModelList, im, models) {
399  if(im->Type()&CGeneModel::eNested)
400  im->SetType(im->Type()-CGeneModel::eNested); // ignore flag set in chainer
401  genes[im->GeneID()].push_back(im);
402  }
403 
404  set<TSignedSeqRange> hosting_intervals;
405  ITERATE(TGIDIterlist, ig, genes) { // first - ID; second - list
406  bool coding_gene = false;
407  ITERATE(TIterList, im, ig->second) {
408  if((*im)->ReadingFrame().NotEmpty()) {
409  coding_gene = true;
410  break;
411  }
412  }
413 
414  TSignedSeqRange gene_lim_for_nested;
415  ITERATE(TIterList, im, ig->second) {
416  const CGeneModel& ai = **im;
417  if(coding_gene && ai.ReadingFrame().Empty())
418  continue;
419  TSignedSeqRange model_lim_for_nested = ai.Limits();
420  if(ai.ReadingFrame().NotEmpty())
421  model_lim_for_nested = ai.OpenCds() ? ai.MaxCdsLimits() : ai.RealCdsLimits(); // 'open' could be only a single variant gene
422  gene_lim_for_nested += model_lim_for_nested;
423  }
424 
425  vector<int> grange(gene_lim_for_nested.GetLength(),1);
426  ITERATE(TIterList, im, ig->second) { // exclude all positions included in CDS (any exons for not coding genes) and holes
427  const CGeneModel& ai = **im;
428  if(coding_gene && ai.ReadingFrame().Empty())
429  continue;
430 
431  TSignedSeqRange model_lim_for_nested = ai.Limits();
432  if(ai.ReadingFrame().NotEmpty())
433  model_lim_for_nested = ai.OpenCds() ? ai.MaxCdsLimits() : ai.RealCdsLimits(); // 'open' could be only a single variant gene
434 
435  for(int i = 0; i < (int)ai.Exons().size(); ++i) {
436  TSignedSeqRange overlap = (model_lim_for_nested & ai.Exons()[i].Limits());
437  for(int j = overlap.GetFrom(); j <= overlap.GetTo(); ++j)
438  grange[j-gene_lim_for_nested.GetFrom()] = 0;
439  }
440 
441  for(int i = 1; i < (int)ai.Exons().size(); ++i) {
442  if(!ai.Exons()[i-1].m_ssplice || !ai.Exons()[i].m_fsplice) {
443  TSignedSeqRange hole(ai.Exons()[i-1].Limits().GetTo()+1,ai.Exons()[i].Limits().GetFrom()-1);
444  _ASSERT(Include(model_lim_for_nested, hole));
445  for(int j = hole.GetFrom(); j <= hole.GetTo(); ++j)
446  grange[j-gene_lim_for_nested.GetFrom()] = 0;
447  }
448  }
449  }
450  _ASSERT(grange.front() == 0 && grange.back() == 0);
451 
452  int left = -1;
453  int right;
454  for(int j = 0; j < (int)grange.size(); ++j) {
455  if(left < 0) {
456  if(grange[j] == 1) {
457  left = j;
458  right = j;
459  }
460  } else if(grange[j] == 1) {
461  right = j;
462  } else {
463  TSignedSeqRange interval(left+gene_lim_for_nested.GetFrom(),right+gene_lim_for_nested.GetFrom());
464  hosting_intervals.insert(interval);
465  left = -1;
466  }
467  }
468  }
469 
470  typedef map<TSignedSeqRange,TIterList> TRangeModels;
471  TRangeModels nested_models;
472  ITERATE(TGIDIterlist, ig, genes) { // first - ID; second - list
473  TGeneModelList::iterator nested_modeli = ig->second.front();
474  if(!nested_modeli->GoodEnoughToBeAnnotation()) {
475  _ASSERT(ig->second.size() == 1);
476  TSignedSeqRange lim_for_nested = nested_modeli->RealCdsLimits().Empty() ? nested_modeli->Limits() : nested_modeli->RealCdsLimits();
477 
478  TSignedSeqRange hosting_interval;
479  ITERATE(set<TSignedSeqRange>, ii, hosting_intervals) {
480  TSignedSeqRange interval = *ii;
481  if(Include(interval,lim_for_nested)) {
482  if(hosting_interval.Empty())
483  hosting_interval = interval;
484  else
485  hosting_interval = (hosting_interval&interval);
486  }
487  }
488 
489  if(hosting_interval.NotEmpty()) {
490  TIterList nested(1,nested_modeli);
491  ITERATE(TGIDIterlist, igg, genes) { // first - ID; second - list
492  const TIterList& other_gene = igg->second;
493  if(igg == ig || !other_gene.front()->GoodEnoughToBeAnnotation())
494  continue;
495 
496  bool coding_gene = false;
497  ITERATE(TIterList, im, other_gene) {
498  if((*im)->ReadingFrame().NotEmpty()) {
499  coding_gene = true;
500  break;
501  }
502  }
503 
504  TSignedSeqRange finished_interval;
505  ITERATE(TIterList, im, other_gene) {
506  const CGeneModel& ai = **im;
507  if(coding_gene && ai.ReadingFrame().Empty())
508  continue;
509 
510  finished_interval += coding_gene ? ai.RealCdsLimits() : ai.Limits();
511  }
512  if(!finished_interval.IntersectingWith(hosting_interval) || Include(finished_interval,hosting_interval))
513  continue;
514 
515  if(Precede(finished_interval,lim_for_nested)) { // before partial model
516  hosting_interval.SetFrom(finished_interval.GetTo());
517  } else if(Precede(lim_for_nested,finished_interval)) { // after partial model
518  hosting_interval.SetTo(finished_interval.GetFrom());
519  } else if(CModelCompare::RangeNestedInIntron(finished_interval, *nested_modeli, true)) {
520  //} else { // overlaps partial model
521  // _ASSERT(CModelCompare::RangeNestedInIntron(finished_interval, *nested_modeli, true));
522  ITERATE(TIterList, im, other_gene) {
523  nested.push_back(*im);
524  }
525  }
526  }
527  _ASSERT(hosting_interval.NotEmpty());
528  nested_models[hosting_interval].splice(nested_models[hosting_interval].begin(),nested);
529  }
530  }
531  }
532 
533  bool scaffold_wall = wall;
534  wall = true;
535  ITERATE(TRangeModels, i, nested_models) {
536  TSignedSeqRange hosting_interval = i->first;
537 
538  TGeneModelList nested;
539  set<Int8> included_complete_models;
540  ITERATE(TIterList, im, i->second) {
541  nested.push_back(**im);
542 
543  if(!(*im)->GoodEnoughToBeAnnotation()) {
544  /* already done
545  if(((*im)->Type()&CGeneModel::eNested)) {
546  nested.back().SetType(nested.back().Type()-CGeneModel::eNested); // remove flag to allow ab initio extension
547  }
548  */
549 
550  if(nested.back().HasStart() && !Include(hosting_interval,nested.back().MaxCdsLimits())) {
551  CCDSInfo cds = nested.back().GetCdsInfo();
552  if(nested.back().Strand() == ePlus)
553  cds.Set5PrimeCdsLimit(cds.Start().GetFrom());
554  else
555  cds.Set5PrimeCdsLimit(cds.Start().GetTo());
556  nested.back().SetCdsInfo(cds);
557  }
558 
559 
560 #ifdef _DEBUG
561  nested.back().AddComment("partialnested");
562 #endif
563 
564  models.erase(*im);
565  } else {
566  included_complete_models.insert((*im)->ID());
567  }
568  }
569 
570  cerr << "Interval " << hosting_interval << '\t' << nested.size() << endl;
571 
572  Predict(nested, bad_aligns, hosting_interval.GetFrom()+1,hosting_interval.GetTo()-1);
573 
574  NON_CONST_ITERATE(TGeneModelList, im, nested) {
575  if(!im->Support().empty()) {
576  im->SetType(im->Type()|CGeneModel::eNested);
577  if(im->ID() == 0 || included_complete_models.find(im->ID()) == included_complete_models.end()) // include only models which we tried to extend
578  models.push_back(*im);
579  }
580  }
581  }
582  wall = scaffold_wall;
583  }
584  //at this point all nested models are marked as eNested and don't need ab initio any more
585 
586  Predict(models, bad_aligns, 0, TSignedSeqPos(m_gnomon->GetSeq().size())-1);
587 
588  ERASE_ITERATE(TGeneModelList, im, models) {
589  CGeneModel& model = *im;
590  TSignedSeqRange cds = model.RealCdsLimits();
591  if(cds.Empty())
592  continue;
593 
594  bool gapfilled = false;
595  int genome_cds = 0;
596  ITERATE(CGeneModel::TExons, ie, model.Exons()) {
597  if(ie->m_fsplice_sig == "XX" || ie->m_ssplice_sig == "XX")
598  gapfilled = true;
599  else
600  genome_cds += (cds&ie->Limits()).GetLength();
601  }
602 
603  if(gapfilled && genome_cds < 45) {
604  model.Status() |= CGeneModel::eSkipped;
605  model.AddComment("Most CDS in genomic gap");
606  bad_aligns.push_back(model);
607  models.erase(im);
608  }
609  }
610 }
611 
613 {
614  if (GnomonNeeded()) {
615 
616  models.sort(s_AlignSeqOrder);
617 
618  if(!models.empty()) {
619  for(auto it_loop = next(models.begin()); it_loop != models.end(); ) {
620  auto it = it_loop++;
621  if(it->RankInGene() != 1 || it->GoodEnoughToBeAnnotation() || it->Type()&CGeneModel::eNested)
622  continue;
623  auto it_prev = prev(it);
624  if(it_prev->RankInGene() != 1 || it_prev->GoodEnoughToBeAnnotation() || it_prev->Type()&CGeneModel::eNested)
625  continue;
626 
627  if(it->MaxCdsLimits().IntersectingWith(it_prev->MaxCdsLimits())) {
628  cerr << "Intersecting alignments " << it->ID() << " " << it_prev->ID() << " " << it->Score() << " " << it_prev->Score() << endl;
629  auto it_erase = (it->Score() < it_prev->Score()) ? it : it_prev;
630  it_erase->Status() |= CGeneModel::eSkipped;
631  it_erase->AddComment("Intersects with other partial");
632  bad_aligns.push_back(*it_erase);
633  models.erase(it_erase);
634  }
635  }
636  }
637 
638  TGeneModelList aligns;
639 
640  FindPartials(models, aligns, ePlus);
641  FindPartials(models, aligns, eMinus);
642 
643  aligns.sort(s_AlignSeqOrder);
644 
645  TGeneModelList models_tmp;
646  Predict(left, right, aligns.begin(), aligns.end(), models_tmp,(left!=0 || wall), wall, left!=0, false, bad_aligns);
647  ITERATE(TGeneModelList, it, models_tmp) {
648  if(!it->Support().empty() || it->RealCdsLen() >= minCdsLen)
649  models.push_back(*it);
650  }
651  }
652 
653  NON_CONST_ITERATE(TGeneModelList, it, models) {
654  CCDSInfo cds_info = it->GetCdsInfo();
655 
656  // removing fshifts in UTRs
657  TInDels fs;
658  TSignedSeqRange fullcds = cds_info.Cds();
659  ITERATE(TInDels, i, it->FrameShifts()) {
660  if(((i->IsInsertion() || i->IsMismatch()) && Include(fullcds,i->Loc())) ||
661  (i->IsDeletion() && i->Loc() > fullcds.GetFrom() && i->Loc() <= fullcds.GetTo())) {
662  fs.push_back(*i);
663  }
664  }
665  it->FrameShifts() = fs;
666 
667  // removing pstops in UTRs
668  CCDSInfo::TPStops pstops = cds_info.PStops();
669  cds_info.ClearPStops();
670  ITERATE(CCDSInfo::TPStops, ps, pstops) {
671  if(Include(fullcds,*ps))
672  cds_info.AddPStop(*ps);
673  }
674  it->SetCdsInfo(cds_info);
675 
676  if (it->PStop(false) || !it->FrameShifts().empty()) {
677  it->Status() |= CGeneModel::ePseudo;
678  }
679  if(it->OpenCds()) {
680  CCDSInfo cds_info = it->GetCdsInfo();
681  cds_info.SetScore(cds_info.Score(),false); // kill the Open flag
682  it->SetCdsInfo(cds_info);
683  }
684  }
685 }
686 
688  : seq(_seq)
689 {
690 }
691 
// Body of transform_model: counts the 'N' residues at both ends of the edited sequence
// of model `m` and, when there are any, computes limits shortened by these counts.
// NOTE(review): the signature (listing line 692) and listing lines 719 and 724 are
// absent from this extract. The cross-reference index gives the declaration as
// `virtual void transform_model(CGeneModel &a)`, while the body names the model `m`.
// Line 724 must open the block closed by the extra `}` near the end - its condition
// cannot be told from here. Confirm all three against the original file.
693 {
694  CAlignMap mrnamap(m.GetAlignMap());
695  CResidueVec vec;
696  mrnamap.EditedSequence(seq, vec);
697 
// number of leading (five_p) and trailing (three_p) 'N' residues in the edited sequence
698  int five_p, three_p;
699  for(five_p=0; five_p < (int)vec.size() && vec[five_p] == 'N'; ++five_p);
700  for(three_p=0; three_p < (int)vec.size() && vec[(int)vec.size()-1-three_p] == 'N'; ++three_p);
701 
702  if(five_p > 0 || three_p > 0) {
// for the minus strand the two counts are exchanged before being applied to the limits
703  int left = five_p;
704  int right = three_p;
705  if(m.Strand() == eMinus)
706  swap(left,right);
707 
708  TSignedSeqRange new_lim(m.Limits());
709  if(left > 0) {
710  _ASSERT(m.Exons().front().Limits().GetLength() > left);
711  new_lim.SetFrom(new_lim.GetFrom()+left);
712  }
713  if(right > 0) {
714  _ASSERT(m.Exons().back().Limits().GetLength() > right);
715  new_lim.SetTo(new_lim.GetTo()-right);
716  }
717 
// the score read here is written back into the CDS info below, with the 'open' flag false
718  double score = m.Score();
720  CCDSInfo cds_info = m.GetCdsInfo();
721  cds_info.SetScore(score, false);
722  m.SetCdsInfo(cds_info);
723 
// only models that are CAlignModel objects get their alignment map reset
725  CAlignModel* a = dynamic_cast<CAlignModel*>(&m);
726  if (a != NULL) {
727  a->ResetAlignMap();
728  }
729  }
730  }
731 }
732 
// Body of SetupArgDescriptions: registers the command line arguments of the annotator
// with `arg_desc`.
// NOTE(review): the signature (listing line 733) is absent from this extract; the
// cross-reference index gives it as `static void SetupArgDescriptions(CArgDescriptions *arg_desc)`.
// The name of the enclosing class is not visible here.
734 {
// NOTE(review): listing line 737, which must close this AddKey() call with the argument
// type, is absent from this extract (the index lists CArgDescriptions eInputFile as used
// in this file) - confirm against the original file.
735  arg_desc->AddKey("param", "param",
736  "Organism specific parameters",
738  arg_desc->AddDefaultKey("pcsf_factor","pcsf_factor","Normalisation factor for phyloPCSF scores",CArgDescriptions::eDouble,"0.1");
739  arg_desc->AddFlag("nognomon","Skips ab initio prediction and ab initio extension of partial chains.");
740  arg_desc->AddDefaultKey("window","window","Prediction window",CArgDescriptions::eInteger,"200000");
741  arg_desc->AddDefaultKey("margin","margin","The minimal distance between chains to place the end of prediction window",CArgDescriptions::eInteger,"1000");
742  arg_desc->AddFlag("open","Allow partial predictions at the ends of contigs. Used for poorly assembled genomes with lots of unfinished contigs.");
743  arg_desc->AddDefaultKey("mpp","mpp","Penalty for connection two protein containing chains into one model.",CArgDescriptions::eDouble,"10.0");
744  arg_desc->AddFlag("nonconsens","Allows to accept nonconsensus splices starts/stops to complete partial alignmet. If not allowed some partial alignments "
745  "may be rejected if there is no way to complete them.");
746  arg_desc->AddDefaultKey("ncsp","ncsp","Nonconsensus penalty",CArgDescriptions::eDouble,"25");
747 
748  arg_desc->AddFlag("norep","DO NOT mask lower case letters");
749  arg_desc->AddDefaultKey("mincont","mincont","Contigs shorter than that will be skipped unless they have alignments.",CArgDescriptions::eInteger,"1000");
750 
// the arguments below are registered under their own group
751  arg_desc->SetCurrentGroup("Prediction tuning");
752  arg_desc->AddFlag("singlest","Allow single exon EST chains as evidence");
753  arg_desc->AddDefaultKey("minlen","minlen","Minimal CDS length for pure ab initio models",CArgDescriptions::eInteger,"100");
754 }
755 
// Body of ReadArgs: copies the parsed command line arguments `args` into the annotator
// `annot`.
// NOTE(review): the signature (listing line 756) is absent from this extract; the
// cross-reference index gives it as
// `static void ReadArgs(CGnomonAnnotator *annot, const CArgs &args)`.
// The name of the enclosing class is not visible here.
757 {
// HMM parameters are read from the file named by "param"
758  CNcbiIfstream param_file(args["param"].AsString().c_str());
759  annot->SetHMMParameters(new CHMMParameters(param_file));
760  annot->m_pcsf_factor = args["pcsf_factor"].AsDouble();
761 
762  annot->window = args["window"].AsInteger();
763  annot->margin = args["margin"].AsInteger();
// "open" switches the walls off
764  annot->wall = !args["open"];
765  annot->mpp = args["mpp"].AsDouble();
766  bool nonconsens = args["nonconsens"];
// with "nonconsens" the value of "ncsp" is stored negated; without it BadScore() is stored
767  annot->nonconsensp = (nonconsens ? -args["ncsp"].AsDouble() : BadScore());
768  annot->do_gnomon = !args["nognomon"];
769 
770  annot->mincontig = args["mincont"].AsInteger();
771 
772  annot->minCdsLen = args["minlen"].AsInteger();
773 
// sequence masking is enabled unless "norep" is given
774  if (!args["norep"])
775  annot->EnableSeqMasking();
776 }
777 END_SCOPE(gnomon)
779 
USING_SCOPE(objects)
void SaveWallModel(unique_ptr< CGeneModel > &wall_model, TGeneModelList &aligns)
Definition: annot.cpp:341
TSignedSeqRange WalledCdsLimits(const CGeneModel &a)
Definition: annot.cpp:320
bool s_AlignScoreOrder(const CGeneModel &ap, const CGeneModel &bp)
Definition: annot.cpp:67
bool s_AlignSeqOrder(const CGeneModel &ap, const CGeneModel &bp)
Definition: annot.cpp:330
void FindPartials(TGeneModelList &models, TGeneModelList &aligns, EStrand strand)
Definition: annot.cpp:348
TSignedSeqRange GetWallLimits(const CGeneModel &m)
Definition: annot.cpp:325
void EditedSequence(const In &original_sequence, Out &edited_sequence, bool includeholes=false) const
Definition: gnomon_seq.cpp:622
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
void Set5PrimeCdsLimit(TSignedSeqPos p)
void SetScore(double score, bool open=false)
TSignedSeqRange Start() const
void AddPStop(SPStop stp)
TSignedSeqRange Cds() const
double Score() const
const TPStops & PStops() const
vector< SPStop > TPStops
void ClearPStops()
double Score() const
unsigned int & Status()
const TExons & Exons() const
TSignedSeqRange ReadingFrame() const
virtual CAlignMap GetAlignMap() const
void SetType(int t)
TSignedSeqRange RealCdsLimits() const
virtual void Clip(TSignedSeqRange limits, EClipMode mode, bool ensure_cds_invariant=true)
void SetCdsInfo(const CCDSInfo &cds_info)
Int8 ID() const
TSignedSeqRange Limits() const
int Type() const
void AddComment(const string &comment)
const CCDSInfo & GetCdsInfo() const
vector< CModelExon > TExons
TSignedSeqRange MaxCdsLimits() const
bool OpenCds() const
EStrand Strand() const
static void SetupArgDescriptions(CArgDescriptions *arg_desc)
Definition: annot.cpp:733
static void ReadArgs(CGnomonAnnotator *annot, const CArgs &args)
Definition: annot.cpp:756
void SetHMMParameters(CHMMParameters *params)
Definition: chainer.cpp:7275
unique_ptr< CGnomonEngine > m_gnomon
Definition: chainer.hpp:140
TGgapInfo m_inserted_seqs
Definition: chainer.hpp:148
unique_ptr< SPhyloCSFSlice > m_pcsf_slice
Definition: chainer.hpp:152
TIntMap m_notbridgeable_gaps_len
Definition: chainer.hpp:149
void Predict(TGeneModelList &models, TGeneModelList &bad_aligns)
Definition: annot.cpp:386
void RemoveShortHolesAndRescore(TGeneModelList chains)
Definition: annot.cpp:72
bool GnomonNeeded() const
Definition: annot.hpp:93
double TryToEliminateOneAlignment(TGeneModelList &suspect_aligns, TGeneModelList &bad_aligns, bool leftwall, bool rightwall, bool leftanchor, bool rightanchor)
Definition: annot.cpp:137
double ExtendJustThisChain(CGeneModel &chain, TSignedSeqPos left, TSignedSeqPos right)
Definition: annot.cpp:79
double TryWithoutObviouslyBadAlignments(TGeneModelList &aligns, TGeneModelList &suspect_aligns, TGeneModelList &bad_aligns, bool leftwall, bool rightwall, bool leftanchor, bool rightanchor, TSignedSeqPos left, TSignedSeqPos right, TSignedSeqRange &tested_range)
Definition: annot.cpp:92
double nonconsensp
Definition: annot.hpp:124
double TryToEliminateAlignmentsFromTail(TGeneModelList &suspect_aligns, TGeneModelList &bad_aligns, bool leftwall, bool rightwall, bool leftanchor, bool rightanchor)
Definition: annot.cpp:165
HMM model parameters just create it and pass to a Gnomon engine.
Definition: gnomon.hpp:55
static bool RangeNestedInIntron(TSignedSeqRange r, const CGeneModel &algn, bool check_in_holes=true)
Definition: map.hpp:338
Definition: set.hpp:45
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator find(const key_type &key) const
Definition: set.hpp:137
const_iterator end() const
Definition: set.hpp:136
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
vector< TResidue > CResidueVec
bool Precede(TSignedSeqRange l, TSignedSeqRange r)
vector< int > TIVec
double BadScore()
EStrand
@ eMinus
@ ePlus
bool Include(TSignedSeqRange big, TSignedSeqRange small)
list< CGeneModel > TGeneModelList
vector< CInDelInfo > TInDels
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define ERASE_ITERATE(Type, Var, Cont)
Non-constant version with ability to erase current element, if container permits.
Definition: ncbimisc.hpp:843
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
void AddFlag(const string &name, const string &comment, CBoolEnum< EFlagValue > set_value=eFlagHasValueIfSet, TFlags flags=0)
Add description for flag argument.
Definition: ncbiargs.cpp:2459
void AddKey(const string &name, const string &synopsis, const string &comment, EType type, TFlags flags=0)
Add description for mandatory key.
Definition: ncbiargs.cpp:2412
void SetCurrentGroup(const string &group)
Set current arguments group name.
Definition: ncbiargs.cpp:2632
void AddDefaultKey(const string &name, const string &synopsis, const string &comment, EType type, const string &default_value, TFlags flags=0, const string &env_var=kEmptyStr, const char *display_value=nullptr)
Add description for optional key with default value.
Definition: ncbiargs.cpp:2442
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eDouble
Convertible into a floating point number (double)
Definition: ncbiargs.hpp:594
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
#define NULL
Definition: ncbistd.hpp:225
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
position_type GetLength(void) const
Definition: range.hpp:158
bool NotEmpty(void) const
Definition: range.hpp:152
bool IntersectingWith(const TThisType &r) const
Definition: range.hpp:331
bool Empty(void) const
Definition: range.hpp:148
CRange< TSignedSeqPos > TSignedSeqRange
Definition: range.hpp:420
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
void SetFrom(TFrom value)
Assign a value to From data member.
Definition: Range_.hpp:231
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
void SetTo(TTo value)
Assign a value to To data member.
Definition: Range_.hpp:278
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
unsigned int a
Definition: ncbi_localip.c:102
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines command line argument related classes.
Defines unified interface to application:
T max(T x_, T y_)
T min(T x_, T y_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
static const TDS_WORD limits[]
Definition: num_limits.h:85
virtual void transform_model(CGeneModel &a)
Definition: annot.cpp:692
const CResidueVec & seq
Definition: annot.hpp:134
RemoveTrailingNs(const CResidueVec &seq)
Definition: annot.cpp:687
#define _ASSERT
Modified on Tue Nov 28 02:18:37 2023 by modify_doxy.py rev. 669887