NCBI C++ ToolKit
annot.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: annot.cpp 101964 2024-03-12 17:29:59Z souvorov $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Vyacheslav Chetvernin
27  *
28  * File Description:
29  *
30  * gnomon is run to improve chains and predict models in regions w/o chains
31  *
32  */
33 
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbiapp.hpp>
36 #include <corelib/ncbienv.hpp>
37 #include <corelib/ncbiargs.hpp>
38 
39 #include <algo/gnomon/annot.hpp>
40 #include "gnomon_engine.hpp"
42 #include <algo/gnomon/gnomon.hpp>
44 
46 
49 BEGIN_SCOPE(gnomon)
50 
52 {
    // NOTE(review): this and the two following definitions have empty bodies. The listing
    // lines that carried their signatures (51, 55 and 59) are absent from this extract,
    // so what they define cannot be determined from here.
53 }
54 
56 {
57 }
58 
60 {
61 }
62 
63 bool s_AlignScoreOrder(const CGeneModel& ap, const CGeneModel& bp)
64 {
65  return (ap.Score() < bp.Score());
66 }
67 
69 {
    // Calls RemoveShortHolesAndRescore(*m_gnomon) on every element of `chains`.
    // NOTE(review): the signature line (listing line 68) is absent from this extract; the
    // cross-reference index at the end of the listing gives it as
    // "void RemoveShortHolesAndRescore(TGeneModelList chains)". If `chains` really is taken
    // by value, the rescoring is applied to a copy of the caller's list - confirm.
70  NON_CONST_ITERATE(TGeneModelList, it, chains) {
71  it->RemoveShortHolesAndRescore(*m_gnomon);
72  }
73 }
74 
76  TSignedSeqPos left, TSignedSeqPos right)
77 {
    // Runs the gnomon engine on `chain` alone and returns the value of m_gnomon->Run().
    // NOTE(review): the first signature line (listing line 75) is absent from this extract;
    // the index at the end of the listing names the function ExtendJustThisChain.
78  TGeneModelList test_align;
79  test_align.push_back(chain);
    // Test window: [left, right] narrowed to at most 10000 positions beyond either end of the chain.
80  int l = max((int)left,(int)chain.Limits().GetFrom()-10000);
81  int r = min(right,chain.Limits().GetTo()+10000);
82  cerr << "Testing alignment " << chain.ID() << " in fragment " << l << ' ' << r << endl;
83 
84  m_gnomon->ResetRange(l,r);
    // The four `false` arguments sit in the positions where the other Run() calls in this file
    // pass leftwall, rightwall, leftanchor and rightanchor.
85  return m_gnomon->Run(test_align, false, false, false, false, mpp, nonconsensp, m_notbridgeable_gaps_len, m_inserted_seqs, m_pcsf_slice.get());
86 }
87 
89  bool leftwall, bool rightwall, bool leftanchor, bool rightanchor,
90  TSignedSeqPos left, TSignedSeqPos right,
91  TSignedSeqRange& tested_range)
92 {
    // Tests each alignment overlapping [left, right] on its own and drops those that cannot be
    // scored alone. Returns the score of a rerun without the dropped alignments, or BadScore()
    // when nothing was dropped or when the range had already been tested.
    // NOTE(review): the first signature line (listing line 88) is absent from this extract; the
    // body uses `aligns`, `suspect_aligns` and `bad_aligns`, which are presumably declared there.
93  bool already_tested = Include(tested_range, TSignedSeqRange(left,right));
94 
95  if (already_tested) {
    // Range was covered by the previous test: only collect the overlapping alignments
    // as suspects, without rerunning the engine.
96  for(TGeneModelList::iterator it = aligns.begin(); it != aligns.end(); it++) {
97  if(left <= it->Limits().GetTo() && it->Limits().GetFrom() <= right)
98  suspect_aligns.push_back(*it);
99  }
100  } else {
101  tested_range = TSignedSeqRange(left,right);
102 
103  bool found_bad_cluster = false;
104  for(TGeneModelList::iterator it = aligns.begin(); it != aligns.end(); ) {
    // alignments entirely outside the window are left untouched
105  if(it->Limits().GetTo() < left || it->Limits().GetFrom() > right) {
106  ++it;
107  continue;
108  }
109 
    // Only alignments that are neither eWall nor eNested are tested individually;
    // one that scores BadScore() alone is marked eSkipped and moved to bad_aligns.
110  if ((it->Type() & (CGeneModel::eWall | CGeneModel::eNested))==0 &&
111  ExtendJustThisChain(*it, left, right) == BadScore()) {
112  found_bad_cluster = true;
113  cerr << "Deleting alignment " << it->ID() << endl;
114  it->Status() |= CGeneModel::eSkipped;
115  it->AddComment("Bad score prediction alone");
116  bad_aligns.push_back(*it);
117 
118  it = aligns.erase(it);
119  continue;
120  }
    // everything else in the window (walls and nested included) stays a suspect
121  suspect_aligns.push_back(*it++);
122  }
123 
    // ExtendJustThisChain() changes the engine range, so restore the full window
124  m_gnomon->ResetRange(left, right);
125  if(found_bad_cluster) {
126  cerr << "Testing w/o bad alignments in fragment " << left << ' ' << right << endl;
127  return m_gnomon->Run(suspect_aligns, leftwall, rightwall, leftanchor, rightanchor, mpp, nonconsensp, m_notbridgeable_gaps_len, m_inserted_seqs, m_pcsf_slice.get());
128  }
129  }
130  return BadScore();
131 }
132 
134  bool leftwall, bool rightwall, bool leftanchor, bool rightanchor)
135 {
    // Takes the suspects out one at a time (in list order, skipping eWall/eNested entries),
    // reruns the engine without that one, and stops at the first removal that gives a score
    // other than BadScore(). That alignment is marked eSkipped and appended to bad_aligns;
    // every alignment whose removal did not help is put back.
    // Returns the score of the last run, or BadScore() if no run was made.
    // NOTE(review): the first signature line (listing line 133) is absent from this extract.
136  double score = BadScore();
137  for(TGeneModelList::iterator it = suspect_aligns.begin(); it != suspect_aligns.end();) {
138  if((it->Type() & (CGeneModel::eWall | CGeneModel::eNested))!=0) {
139  ++it;
140  continue;
141  }
    // keep a copy: the element is erased for the trial run and may be re-inserted below
142  CGeneModel algn = *it;
143  it = suspect_aligns.erase(it);
144 
145  cerr << "Testing w/o " << algn.ID();
146  score = m_gnomon->Run(suspect_aligns, leftwall, rightwall, leftanchor, rightanchor, mpp, nonconsensp, m_notbridgeable_gaps_len, m_inserted_seqs, m_pcsf_slice.get());
147  if (score != BadScore()) {
148  cerr << "- Good. Deleting alignment " << algn.ID() << endl;
149  algn.Status() |= CGeneModel::eSkipped;
150  algn.AddComment("Good score prediction without");
151  bad_aligns.push_back(algn);
152  break;
153  } else {
154  cerr << " - Still bad." << endl;
155  }
    // `it` is the element that followed the erased one, so inserting before it
    // restores the original position
156  suspect_aligns.insert(it,algn);
157  }
158  return score;
159 }
160 
162  bool leftwall, bool rightwall, bool leftanchor, bool rightanchor)
163 {
    // Permanently removes suspects one after another, in list order, rerunning the engine
    // after each removal until the score is no longer BadScore() or the list is exhausted.
    // Entries flagged eWall/eNested and entries that are GoodEnoughToBeAnnotation() are kept.
    // Each removed alignment is marked eSkipped and appended to bad_aligns.
    // NOTE(review): the first signature line (listing line 161) is absent from this extract;
    // the call site in Predict() names this function TryToEliminateAlignmentsFromTail.
164  double score = BadScore();
165  for (TGeneModelList::iterator it = suspect_aligns.begin(); score == BadScore() && it != suspect_aligns.end(); ) {
166  if ((it->Type() & (CGeneModel::eWall | CGeneModel::eNested))!=0 || it->GoodEnoughToBeAnnotation()) {
167  ++it;
168  continue;
169  }
170  cerr << "Deleting alignment " << it->ID() << endl;
171  it->Status() |= CGeneModel::eSkipped;
172  it->AddComment("Bad score prediction in combination");
173  bad_aligns.push_back(*it);
174  it = suspect_aligns.erase(it);
175 
    // NOTE(review): `left` and `right` are not among the parameters visible here, and the
    // call site passes no coordinates. If they resolve to the std::left / std::right stream
    // manipulators, this message prints no positions - confirm.
176  cerr << "Testing fragment " << left << ' ' << right << endl;
177  score = m_gnomon->Run(suspect_aligns, leftwall, rightwall, leftanchor, rightanchor, mpp, nonconsensp, m_notbridgeable_gaps_len, m_inserted_seqs, m_pcsf_slice.get());
178  }
179  return score;
180 }
181 
// Window-by-window prediction over [llimit, rlimit].
// The alignments in [il, ir) are copied into a local list, the engine is run on successive
// windows of roughly `window` positions, and the genes returned by m_gnomon->GetGenes() are
// appended to `models`. When a window cannot be scored it is first retried without alignments
// that fail on their own, then halved, and finally retried with suspect alignments removed;
// removed alignments are appended to `bad_aligns`.
// leftmostwall/leftmostanchor apply to the first window; rightmostwall/rightmostanchor apply
// to a window whose right end is rlimit.
182 void CGnomonAnnotator::Predict(TSignedSeqPos llimit, TSignedSeqPos rlimit, TGeneModelList::const_iterator il, TGeneModelList::const_iterator ir, TGeneModelList& models,
183  bool leftmostwall, bool rightmostwall, bool leftmostanchor, bool rightmostanchor, TGeneModelList& bad_aligns)
184 {
185  TGeneModelList aligns(il, ir);
186 
187  // TSignedSeqPos left = llimit;
188  int64_t left = llimit;
189  bool leftwall = leftmostwall;
190  bool leftanchor = leftmostanchor;
191 
192  // TSignedSeqPos right = llimit+window;
193  int64_t right = llimit+window;
194  bool rightwall = false;
195  bool rightanchor = false;
196 
    // rlimit+1 means "no failed window pending"; it is set to a real position when a window fails
197  Int8 prev_bad_right = rlimit+1;
198  bool do_it_again = false;
199 
200  m_gnomon->ResetRange(left, right);
201 
    // NOTE(review): listing line 202 is absent from this extract (the numbering jumps 201 -> 203).
203 
204  TGeneModelList suspect_aligns;
205  TSignedSeqRange tested_range;
206 
    // busy_spots[i] == 1 for every position within `margin` of an alignment;
    // a window is not allowed to end on such a position
207  TIVec busy_spots(rlimit+1,0);
208  ITERATE(TGeneModelList, it_c, aligns) {
209  int a = max(0,it_c->Limits().GetFrom()-margin);
210  int b = min(rlimit,it_c->Limits().GetTo()+margin);
211  for(int i = a; i<=b; ++i)
212  busy_spots[i] = 1;
213  }
214 
215  do {
    // push the right end of the window forward to the first free position
216  for( ; right < rlimit && busy_spots[right] != 0; ++right);
217 
    // if less than half a window would remain after this one, extend the window to rlimit
218  if (right + (right-left)/2 >= rlimit) {
219  right = rlimit;
220  rightwall = rightmostwall;
221  rightanchor = rightmostanchor;
222  } else {
223  rightwall = false;
224  rightanchor = false;
225  }
226 
227  if (do_it_again)
228  rightwall = true;
229 
230  double score = BadScore();
231 
232  if (right < prev_bad_right) {
    // a window that has not failed before: run on all alignments
233  suspect_aligns.clear();
234 
235  m_gnomon->ResetRange(left,right);
236 
237  cerr << left << ' ' << right << ' ' << m_gnomon->GetGCcontent() << endl;
238 
239  score = m_gnomon->Run(aligns, leftwall, rightwall, leftanchor, rightanchor, mpp, nonconsensp, m_notbridgeable_gaps_len, m_inserted_seqs, m_pcsf_slice.get());
240 
241  if(score == BadScore()) {
242  cerr << "Inconsistent alignments in fragment " << left << ' ' << right << '\n';
243 
244  score = TryWithoutObviouslyBadAlignments(aligns, suspect_aligns, bad_aligns,
245  leftwall, rightwall, leftanchor, rightanchor,
246  left, right, tested_range);
247  }
248 
249  if(score == BadScore()) {
250 
    // still bad: remember where it failed and retry with the window halved
251  prev_bad_right = right;
252  right = (left+right)/2;
253 
254  continue;
255  }
256  } else {
    // the window did not shrink below the previously failed one:
    // start removing suspects, lowest score first
257  suspect_aligns.sort(s_AlignScoreOrder);
258 
259  score = TryToEliminateOneAlignment(suspect_aligns, bad_aligns,
260  leftwall, rightwall, leftanchor, rightanchor);
261  if (score == BadScore())
262  score = TryToEliminateAlignmentsFromTail(suspect_aligns, bad_aligns,
263  leftwall, rightwall, leftanchor, rightanchor);
264  if(score == BadScore()) {
    // nothing helped: pass through the suspects that are good enough to be annotation as they are
265  cerr << "!!! BAD SCORE EVEN WITH FINISHED ALIGNMENTS !!! " << endl;
266  ITERATE(TGeneModelList, it, suspect_aligns) {
267  if ((it->Type() & (CGeneModel::eWall | CGeneModel::eNested))==0 && it->GoodEnoughToBeAnnotation())
268  models.push_back(*it);
269  }
270  }
271  }
272  prev_bad_right = rlimit+1;
273 
274  list<CGeneModel> genes = m_gnomon->GetGenes();
275 
276  TSignedSeqPos partial_start = right;
277 
    // a last gene that is not complete on the right is discarded here
    // (unless the window ends at rlimit or this is a repeated pass)
278  if (right < rlimit && !genes.empty() && !genes.back().RightComplete() && !do_it_again) {
279  partial_start = genes.back().LeftComplete() ? genes.back().RealCdsLimits().GetFrom() : left;
280  _ASSERT ( partial_start < right );
281  genes.pop_back();
282  }
283 
284  do_it_again = false;
285 
    // choose the left end of the next window
286  if (!genes.empty()) {
287  left = genes.back().ReadingFrame().GetTo()+1;
288  leftanchor = true;
289  } else if (partial_start < left+1000) {
290  do_it_again=true;
291  } else if (partial_start < right) {
    // step back 100 positions from the partial gene, then further back to a free position
292  int new_left = partial_start-100;
293  for( ; new_left > left && busy_spots[new_left] != 0; --new_left);
294  if(new_left > left+1000) {
295  left = new_left;
296  leftanchor = false;
297  } else {
298  do_it_again=true;
299  }
300  } else {
301  left = (left+right)/2+1;
302  leftanchor = false;
303  }
304 
305  models.splice(models.end(), genes);
306 
307  if (right >= rlimit)
308  break;
309 
310  if (!do_it_again)
311  leftwall = true;
312 
313  right = left + window;
314 
315  } while(left <= rlimit);
316 }
317 
319 {
    // For a model flagged eWall returns its full Limits(), otherwise its MaxCdsLimits().
    // NOTE(review): the signature line (listing line 318) is absent from this extract; the index
    // at the end of the listing gives it as "TSignedSeqRange WalledCdsLimits(const CGeneModel &a)".
320  return ((a.Type() & CGeneModel::eWall)!=0) ? a.Limits() : a.MaxCdsLimits();
321 }
322 
323 TSignedSeqRange GetWallLimits(const CGeneModel& m, bool external = false)
324 {
325  TSignedSeqRange model_lim_for_nested = m.Limits();
326  if(m.ReadingFrame().NotEmpty()) {
327  if(external)
328  model_lim_for_nested = m.OpenCds() ? m.MaxCdsLimits() : m.RealCdsLimits(); // open models can harbor in 5' introns; they are not alternative varinats
329  else
330  model_lim_for_nested = m.RealCdsLimits();
331  }
332 
333  return model_lim_for_nested;
334 }
335 pair<TSignedSeqRange, bool> GetGeneWallLimits(const list<TGeneModelList::iterator>& models, bool external = false)
336 {
337  bool coding_gene = false;
338  for(auto im : models) {
339  if(im->ReadingFrame().NotEmpty()) {
340  coding_gene = true;
341  break;
342  }
343  }
344 
345  TSignedSeqRange gene_lim;
346  for(auto im : models) {
347  if(coding_gene && im->ReadingFrame().Empty())
348  continue;
349  gene_lim += GetWallLimits(*im, external);
350  }
351 
352  return make_pair(gene_lim, coding_gene);
353 }
354 
// Ordering predicate for sorting models by position: lower GetFrom() first and, for equal
// GetFrom(), higher GetTo() first (the longer range precedes the one it contains).
// NOTE(review): listing lines 357-358, which must declare the ranges `a` and `b` compared
// below, are absent from this extract; presumably they are derived from `ap` and `bp` - confirm.
355 bool s_AlignSeqOrder(const CGeneModel& ap, const CGeneModel& bp)
356 {
359 
360  return (a.GetFrom() != b.GetFrom() ?
361  a.GetFrom() < b.GetFrom() :
362  a.GetTo() > b.GetTo()
363  );
364 }
365 
// Builds the input alignment list for a prediction run from the models of one strand.
//  - models flagged eNested add a single-exon placeholder of type eNested to `aligns`;
//  - other models that are GoodEnoughToBeAnnotation() add a single-exon placeholder of type eWall;
//  - remaining models with RankInGene() == 1 lose the eFullSupCDS status bit and are moved
//    (spliced) from `models` into `aligns`.
// Placeholders carry the strand, ID and GeneID of the model they stand for.
// NOTE(review): listing lines 378 and 384, which must declare the `limits` passed to
// AddExon(), are absent from this extract.
366 void FindPartials(TGeneModelList& models, TGeneModelList& aligns, EStrand strand)
367 {
368  for (TGeneModelList::iterator loop_it = models.begin(); loop_it != models.end();) {
    // advance before use: `ir` may be spliced out of `models` below
369  TGeneModelList::iterator ir = loop_it;
370  ++loop_it;
371 
372  if(ir->Strand() != strand)
373  continue;
374 
375  if(ir->Type()&CGeneModel::eNested) {
376  CGeneModel wall_model(ir->Strand(),ir->ID(), CGeneModel::eNested);
377  wall_model.SetGeneID(ir->GeneID());
379  wall_model.AddExon(limits);
380  aligns.push_back(wall_model);
381  } else if(ir->GoodEnoughToBeAnnotation()) {
382  CGeneModel wall_model(ir->Strand(),ir->ID(), CGeneModel::eWall);
383  wall_model.SetGeneID(ir->GeneID());
385  wall_model.AddExon(limits);
386  aligns.push_back(wall_model);
387  } else if(ir->RankInGene() == 1) {
388  ir->Status() &= ~CGeneModel::eFullSupCDS;
389  aligns.splice(aligns.end(), models, ir);
390  }
391  }
392 }
393 
395 {
    // Top-level prediction for one contig.
    //  1. When gnomon is needed: find genes lying inside introns/holes of other genes, drop one
    //     gene of each "partial gene in a hole" pair, flag complete nested genes eNested, and
    //     extend partial nested genes by running Predict() inside their hosting interval.
    //  2. Run Predict() over the whole sequence.
    //  3. Remove models whose CDS outside genomic gaps is shorter than 45 positions.
    // Removed models are marked eSkipped and appended to bad_aligns.
    // NOTE(review): the signature line (listing line 394) is absent from this extract; the index
    // at the end of the listing gives it as
    // "void Predict(TGeneModelList &models, TGeneModelList &bad_aligns)".
396  if (models.empty() && int(m_gnomon->GetSeq().size()) < mincontig)
397  return;
398 
399  if (GnomonNeeded()) {
400  typedef list<TGeneModelList::iterator> TIterList;
401  typedef map<Int8,TIterList> TGIDIterlist;
402  typedef TGIDIterlist::iterator TGIter;
    // orders gene entries by the GeneID of their first model
403  struct geneid_order {
404  bool operator()(TGIter a, TGIter b) const { return a->second.front()->GeneID() < b->second.front()->GeneID(); }
405  };
406  typedef tuple<TSignedSeqRange, bool, TGIter> TGenomeRange; // range, is hole, gene iterator
    // NOTE(review): `b` is taken by non-const reference while `a` is const; a comparator
    // normally takes both arguments as const references - confirm this is not an oversight.
407  struct grange_order {
408  bool operator()(const TGenomeRange& a, TGenomeRange& b) const {
409  if(get<0>(a) != get<0>(b))
410  return get<0>(a) < get<0>(b);
411  else if(get<1>(a) != get<1>(b))
412  return get<1>(a) < get<1>(b);
413  else
414  return geneid_order()(get<2>(a), get<2>(b));
415  }
416  };
    // a range is "less" only when it ends before the other begins,
    // so overlapping ranges compare as equivalent map keys
417  struct interval_order {
418  bool operator()(const TSignedSeqRange& a, const TSignedSeqRange& b) const { return a.GetTo() < b.GetFrom(); }
419  };
420  struct GenomeRangeMap : public map<TSignedSeqRange, list<TGenomeRange>, interval_order> { // map argument is the range for overlapping 'introns' - could be used for any kind of ranges
421  void Insert(const TGenomeRange& intron) { // combine ranges overlapping with intron and insert
422  list<TGenomeRange> clust(1, intron);
423  TSignedSeqRange range(get<0>(intron));
424  TSignedSeqRange intron_left(range.GetFrom(), range.GetFrom());
    // absorb every existing cluster that intersects the growing range
425  for(auto it = lower_bound(intron_left); it != end() && it->first.IntersectingWith(range); ) {
426  range.CombineWith(it->first);
427  clust.splice(clust.end(), it->second);
428  it = erase(it);
429  }
430  emplace(range, clust);
431  }
432  void Unique() { // remove duplicates from the lists
433  for(auto& range_intronlist : *this) {
434  auto& lst = range_intronlist.second;
435  lst.sort(grange_order());
436  lst.unique();
437  }
438  }
439  };
440 
    // group the models by GeneID, clearing any eNested flag they arrive with
441  TGIDIterlist genes;
442  NON_CONST_ITERATE(TGeneModelList, im, models) {
443  if(im->Type()&CGeneModel::eNested)
444  im->SetType(im->Type()-CGeneModel::eNested); // ignore flag set in chainer
445  genes[im->GeneID()].push_back(im);
446  }
447 
    // collect, per gene, every gap between consecutive exons that lies inside the gene's
    // wall limits; a gap counts as a hole when either flanking exon lacks its splice flag
448  GenomeRangeMap introns;
449  for(auto ig = genes.begin(); ig != genes.end(); ++ig) {
450  auto rslt = GetGeneWallLimits(ig->second, true);
451  TSignedSeqRange lim_for_nested = rslt.first;
452  bool coding = rslt.second;
453  for(auto im : ig->second) {
454  CGeneModel& m = *im;
455  if(coding && m.RealCdsLimits().Empty()) // ignore nocoding variants in coding genes
456  continue;
457  for(int i = 1; i < (int)m.Exons().size(); ++i) {
458  if(m.Exons()[i-1].m_ssplice_sig == "XX" || m.Exons()[i].m_fsplice_sig == "XX") // skip genomic gaps
459  continue;
460  TSignedSeqRange range(m.Exons()[i-1].Limits().GetTo(), m.Exons()[i].Limits().GetFrom());
461  if(Include(lim_for_nested, range)) {
462  bool is_hole = !m.Exons()[i-1].m_ssplice || !m.Exons()[i].m_fsplice;
463  TGenomeRange intron(range, is_hole, ig);
464  introns.Insert(intron);
465  }
466  }
467  }
468  }
469  introns.Unique(); // remove duplicate introns
470 
471  list<TGIter> genes_hosting_partial;
472  list<TGIter> nested_partial;
473  GenomeRangeMap finished_intervals;
474  if(!introns.empty()) {
475  list<TGIter> genes_to_remove;
    // classify every gene that fits entirely inside an intron/hole of another gene
476  for(auto ig = genes.begin(); ig != genes.end(); ++ig) {
477  TIterList& modelsi = ig->second;
478  auto gfront = modelsi.front();
479  TSignedSeqRange lim_for_nested = GetGeneWallLimits(modelsi).first;
480  auto iclust = introns.lower_bound(TSignedSeqRange(lim_for_nested.GetFrom(), lim_for_nested.GetFrom())); // first not to the left
481  if(iclust != introns.end() && Include(iclust->first, lim_for_nested)) {
482  for(TGenomeRange& intron : iclust->second) {
483  TSignedSeqRange& range = get<0>(intron);
484  if(Include(range, lim_for_nested)) {
485  bool is_hole = get<1>(intron);
486  auto host_it = get<2>(intron);
487  if(is_hole && !gfront->GoodEnoughToBeAnnotation()) { // partial gene in a hole - one gene will be removed
488  if(host_it->second.front()->Score() > gfront->Score()) // no variants in both - using front()
489  genes_to_remove.push_back(ig);
490  else
491  genes_to_remove.push_back(host_it);
492  } else {
493  if(gfront->GoodEnoughToBeAnnotation()) { // complete gene nested in an intron/hole - assign nested flag and calculate finished interval
494  for(auto im : modelsi)
495  im->SetType(im->Type()|CGeneModel::eNested);
496  } else {
497  genes_hosting_partial.push_back(host_it);
498  nested_partial.push_back(ig);
499  }
500  }
501  }
502  }
503  }
504 
505  if(gfront->GoodEnoughToBeAnnotation()) { // find finished intervals which may limit the extension of nested
506  bool found = false;
507  for( ; !found && iclust != introns.end() && iclust->first.IntersectingWith(lim_for_nested); ++iclust) {
508  for(TGenomeRange& intron : iclust->second) {
509  if(get<2>(intron) == ig)
510  continue;
511  TSignedSeqRange& range = get<0>(intron);
512  if(range.IntersectingWith(lim_for_nested) && !Include(lim_for_nested, range)) {
513  found = true;
514  TGenomeRange finished_interval(lim_for_nested, false, ig);
515  finished_intervals.Insert(finished_interval);
516  break;
517  }
518  }
519  }
520  }
521  }
    // deduplicate the three gene lists, then delete the genes chosen for removal
522  genes_to_remove.sort(geneid_order());
523  genes_to_remove.unique();
524  genes_hosting_partial.sort(geneid_order());
525  genes_hosting_partial.unique();
526  nested_partial.sort(geneid_order());
527  nested_partial.unique();
528  for(auto it : genes_to_remove) {
529  genes_hosting_partial.remove(it);
530  nested_partial.remove(it);
531  for(auto im : it->second) {
532  im->Status() |= CGeneModel::eSkipped;
533  im->AddComment("Partial gene in a hole");
534  bad_aligns.push_back(*im);
535  models.erase(im);
536  }
537  genes.erase(it);
538  }
539  }
540 
541  //extend partial nested
    // for every hosting gene, find the stretches inside its limits that are covered neither by
    // its exons nor by its holes; these are the intervals a partial nested gene may extend into
542  GenomeRangeMap hosting_intervals;
543  for(auto it : genes_hosting_partial) {
544  TIterList& lst = it->second;
545 
546  bool coding_gene = find_if(lst.begin(), lst.end(), [](TGeneModelList::iterator im){ return im->ReadingFrame().NotEmpty(); }) != lst.end();
547  // if external model is 'open' all 5' introns can harbor
548  // for nested model 'open' is ignored
549  TSignedSeqRange gene_lim_for_nested;
550  for(auto im : lst) {
551  const CGeneModel& ai = *im;
552  if(coding_gene && ai.ReadingFrame().Empty())
553  continue;
554  TSignedSeqRange model_lim_for_nested = ai.Limits();
555  if(ai.ReadingFrame().NotEmpty())
556  model_lim_for_nested = ai.OpenCds() ? ai.MaxCdsLimits() : ai.RealCdsLimits(); // 'open' could be only a single variant gene
557  gene_lim_for_nested += model_lim_for_nested;
558  }
559 
    // grange[k] == 1 means position gene_lim_for_nested.GetFrom()+k is still free
560  vector<int> grange(gene_lim_for_nested.GetLength(),1);
561  for(auto im : lst) { // exclude all positions included in CDS (any exons for not coding genes) and holes
562  const CGeneModel& ai = *im;
563  if(coding_gene && ai.ReadingFrame().Empty())
564  continue;
565 
566  TSignedSeqRange model_lim_for_nested = ai.Limits();
567  if(ai.ReadingFrame().NotEmpty())
568  model_lim_for_nested = ai.OpenCds() ? ai.MaxCdsLimits() : ai.RealCdsLimits(); // 'open' could be only a single variant gene
569 
570  for(int i = 0; i < (int)ai.Exons().size(); ++i) {
571  TSignedSeqRange overlap = (model_lim_for_nested & ai.Exons()[i].Limits());
572  for(int j = overlap.GetFrom(); j <= overlap.GetTo(); ++j)
573  grange[j-gene_lim_for_nested.GetFrom()] = 0;
574  }
575 
576  for(int i = 1; i < (int)ai.Exons().size(); ++i) {
577  if(!ai.Exons()[i-1].m_ssplice || !ai.Exons()[i].m_fsplice) {
578  TSignedSeqRange hole(ai.Exons()[i-1].Limits().GetTo()+1,ai.Exons()[i].Limits().GetFrom()-1);
579  _ASSERT(Include(model_lim_for_nested, hole));
580  for(int j = hole.GetFrom(); j <= hole.GetTo(); ++j)
581  grange[j-gene_lim_for_nested.GetFrom()] = 0;
582  }
583  }
584  }
585  _ASSERT(grange.front() == 0 && grange.back() == 0);
586 
    // turn every maximal run of free positions into a hosting interval; a run is emitted when
    // the first occupied position after it is reached
587  int left = -1;
588  int right;
589  for(int j = 0; j < (int)grange.size(); ++j) {
590  if(left < 0) {
591  if(grange[j] == 1) {
592  left = j;
593  right = j;
594  }
595  } else if(grange[j] == 1) {
596  right = j;
597  } else {
598  TSignedSeqRange interval(left+gene_lim_for_nested.GetFrom(),right+gene_lim_for_nested.GetFrom());
599  TGenomeRange hosting_interval(interval, false, it);
600  hosting_intervals.Insert(hosting_interval);
601  left = -1;
602  }
603  }
604  }
605 
    // for every partial nested gene pick the intersection of all hosting intervals that contain
    // it, trim it by neighbouring finished genes, and group the models by the resulting interval
606  typedef map<TSignedSeqRange,TIterList> TRangeModels;
607  TRangeModels nested_models;
608  for(auto ig : nested_partial) {
609  TGeneModelList::iterator nested_modeli = ig->second.front();
610  _ASSERT(ig->second.size() == 1);
611  TSignedSeqRange lim_for_nested = GetWallLimits(*nested_modeli);
612  TSignedSeqRange hosting_interval;
613  {
614  auto rslt = hosting_intervals.lower_bound(TSignedSeqRange(lim_for_nested.GetFrom(), lim_for_nested.GetFrom()));
615  if(rslt != hosting_intervals.end() && Include(rslt->first, lim_for_nested)) {
616  for(auto& grange : rslt->second) {
617  TSignedSeqRange& interval = get<0>(grange);
618  if(Include(interval,lim_for_nested)) {
619  if(hosting_interval.Empty())
620  hosting_interval = interval;
621  else
622  hosting_interval = (hosting_interval&interval);
623  }
624  }
625  }
626  }
627 
628  if(hosting_interval.NotEmpty()) {
629  TIterList nested(1,nested_modeli);
630  TSignedSeqRange left(hosting_interval.GetFrom(), hosting_interval.GetFrom());
631  for(auto it = finished_intervals.lower_bound(left); it != finished_intervals.end() && it->first.IntersectingWith(hosting_interval); ++it) {
632  for(auto& grange : it->second) {
633  TSignedSeqRange& finished_interval = get<0>(grange);
634  if(!finished_interval.IntersectingWith(hosting_interval) || Include(finished_interval, hosting_interval))
635  continue;
636 
637  if(Precede(finished_interval,lim_for_nested)) { // before partial model
638  hosting_interval.SetFrom(finished_interval.GetTo());
639  } else if(Precede(lim_for_nested,finished_interval)) { // after partial model
640  hosting_interval.SetTo(finished_interval.GetFrom());
641  } else if(CModelCompare::RangeNestedInIntron(finished_interval, *nested_modeli, true)) {
642  for(auto im : get<2>(grange)->second)
643  nested.push_back(im);
644  }
645  }
646  }
647  _ASSERT(hosting_interval.NotEmpty());
648  nested_models[hosting_interval].splice(nested_models[hosting_interval].begin(), nested);
649  }
650 
651  }
652 
    // run the prediction inside each hosting interval with `wall` forced on;
    // the caller's setting is restored afterwards
653  bool scaffold_wall = wall;
654  wall = true;
655  ITERATE(TRangeModels, i, nested_models) {
656  TSignedSeqRange hosting_interval = i->first;
657 
658  TGeneModelList nested;
659  set<Int8> included_complete_models;
660  ITERATE(TIterList, im, i->second) {
661  nested.push_back(**im);
662 
663  if(!(*im)->GoodEnoughToBeAnnotation()) {
    // a partial model whose maximal CDS does not fit in the interval gets its
    // 5' CDS limit set to its start
664  if(nested.back().HasStart() && !Include(hosting_interval,nested.back().MaxCdsLimits())) {
665  CCDSInfo cds = nested.back().GetCdsInfo();
666  if(nested.back().Strand() == ePlus)
667  cds.Set5PrimeCdsLimit(cds.Start().GetFrom());
668  else
669  cds.Set5PrimeCdsLimit(cds.Start().GetTo());
670  nested.back().SetCdsInfo(cds);
671  }
672  nested.back().AddComment("partialnested");
    // the partial model leaves `models`; its extended version is added back below
673  models.erase(*im);
674  } else {
675  included_complete_models.insert((*im)->ID());
676  }
677  }
678 
679  cerr << "Interval " << hosting_interval << '\t' << nested.size() << endl;
680 
681  Predict(nested, bad_aligns, hosting_interval.GetFrom()+1,hosting_interval.GetTo()-1);
682 
683  NON_CONST_ITERATE(TGeneModelList, im, nested) {
684  if(!im->Support().empty()) {
685  im->SetType(im->Type()|CGeneModel::eNested);
686  if(im->ID() == 0 || included_complete_models.find(im->ID()) == included_complete_models.end()) // include only models which we tried to extend
687  models.push_back(*im);
688  }
689  }
690  }
691  wall = scaffold_wall;
692  }
693  //at this point all nested models don't need ab initio any more
694 
695  Predict(models, bad_aligns, 0, TSignedSeqPos(m_gnomon->GetSeq().size())-1);
696 
    // drop gap-filled models that have fewer than 45 CDS positions outside "XX" exons
697  ERASE_ITERATE(TGeneModelList, im, models) {
698  CGeneModel& model = *im;
699  TSignedSeqRange cds = model.RealCdsLimits();
700  if(cds.Empty())
701  continue;
702 
703  bool gapfilled = false;
704  int genome_cds = 0;
705  ITERATE(CGeneModel::TExons, ie, model.Exons()) {
706  if(ie->m_fsplice_sig == "XX" || ie->m_ssplice_sig == "XX")
707  gapfilled = true;
708  else
709  genome_cds += (cds&ie->Limits()).GetLength();
710  }
711 
712  if(gapfilled && genome_cds < 45) {
713  model.Status() |= CGeneModel::eSkipped;
714  model.AddComment("Most CDS in genomic gap");
715  bad_aligns.push_back(model);
716  models.erase(im);
717  }
718  }
719 }
720 
722 {
    // Prediction within [left, right] followed by clean-up of every model in `models`.
    // When gnomon is needed: overlapping partial models are resolved, FindPartials() builds the
    // alignment list for both strands, the window-based Predict() is run, and its results are
    // appended to `models`. In all cases frameshifts and premature stops outside the CDS are
    // dropped, the ePseudo status is set where applicable and the "open" CDS flag is cleared.
    // NOTE(review): the signature line (listing line 721) is absent from this extract; callers
    // in this file invoke it as Predict(models, bad_aligns, left, right).
723  if (GnomonNeeded()) {
724 
725  models.sort(s_AlignSeqOrder);
726 
727  if(!models.empty()) {
    // among neighbouring rank-1 partial models that are not nested and whose maximal CDS
    // ranges intersect, keep the one with the higher score
728  for(auto it_loop = next(models.begin()); it_loop != models.end(); ) {
729  auto it = it_loop++;
730  if(it->RankInGene() != 1 || it->GoodEnoughToBeAnnotation() || it->Type()&CGeneModel::eNested)
731  continue;
732  auto it_prev = prev(it);
733  if(it_prev->RankInGene() != 1 || it_prev->GoodEnoughToBeAnnotation() || it_prev->Type()&CGeneModel::eNested)
734  continue;
735 
736  if(it->MaxCdsLimits().IntersectingWith(it_prev->MaxCdsLimits())) {
737  cerr << "Intersecting alignments " << it->ID() << " " << it_prev->ID() << " " << it->Score() << " " << it_prev->Score() << endl;
738  auto it_erase = (it->Score() < it_prev->Score()) ? it : it_prev;
739  it_erase->Status() |= CGeneModel::eSkipped;
740  it_erase->AddComment("Intersects with other partial");
741  bad_aligns.push_back(*it_erase);
742  models.erase(it_erase);
743  }
744  }
745  }
746 
747  TGeneModelList aligns;
748 
749  FindPartials(models, aligns, ePlus);
750  FindPartials(models, aligns, eMinus);
751 
752  aligns.sort(s_AlignSeqOrder);
753 
754  TGeneModelList models_tmp;
    // the left end is treated as a wall and anchor whenever the window does not start at 0
755  Predict(left, right, aligns.begin(), aligns.end(), models_tmp,(left!=0 || wall), wall, left!=0, false, bad_aligns);
    // keep predicted models that have support or whose real CDS is at least minCdsLen long
756  ITERATE(TGeneModelList, it, models_tmp) {
757  if(!it->Support().empty() || it->RealCdsLen() >= minCdsLen)
758  models.push_back(*it);
759  }
760  }
761 
762  NON_CONST_ITERATE(TGeneModelList, it, models) {
763  CCDSInfo cds_info = it->GetCdsInfo();
764 
765  // removing fshifts in UTRs
766  TInDels fs;
767  TSignedSeqRange fullcds = cds_info.Cds();
768  ITERATE(TInDels, i, it->FrameShifts()) {
769  if(((i->IsInsertion() || i->IsMismatch()) && Include(fullcds,i->Loc())) ||
770  (i->IsDeletion() && i->Loc() > fullcds.GetFrom() && i->Loc() <= fullcds.GetTo())) {
771  fs.push_back(*i);
772  }
773  }
774  it->FrameShifts() = fs;
775 
776  // removing pstops in UTRs
777  CCDSInfo::TPStops pstops = cds_info.PStops();
778  cds_info.ClearPStops();
779  ITERATE(CCDSInfo::TPStops, ps, pstops) {
780  if(Include(fullcds,*ps))
781  cds_info.AddPStop(*ps);
782  }
783  it->SetCdsInfo(cds_info);
784 
785  if (it->PStop(false) || !it->FrameShifts().empty()) {
786  it->Status() |= CGeneModel::ePseudo;
787  }
788  if(it->OpenCds()) {
    // this inner cds_info shadows the outer one and is re-read from the model
789  CCDSInfo cds_info = it->GetCdsInfo();
790  cds_info.SetScore(cds_info.Score(),false); // kill the Open flag
791  it->SetCdsInfo(cds_info);
792  }
793  }
794 }
795 
797  : seq(_seq)
798 {
    // Stores the `_seq` argument in the `seq` member; nothing else is done.
    // NOTE(review): the signature line (listing line 796) is absent from this extract, so the
    // class this constructor belongs to cannot be named from here.
799 }
800 
802 {
    // Detects runs of 'N' at both ends of the model's edited sequence and computes limits
    // with those runs trimmed off.
    // NOTE(review): the signature line (listing line 801) and listing lines 828 and 833 are
    // absent from this extract. As shown, `new_lim` is computed but never used, and the closing
    // brace at listing line 838 has no visible opener; presumably the missing lines clip the
    // model to `new_lim` and open a conditional block - confirm against the full source.
803  CAlignMap mrnamap(m.GetAlignMap());
804  CResidueVec vec;
805  mrnamap.EditedSequence(seq, vec);
806 
    // five_p / three_p: number of consecutive 'N' at the start / end of the edited sequence
807  int five_p, three_p;
808  for(five_p=0; five_p < (int)vec.size() && vec[five_p] == 'N'; ++five_p);
809  for(three_p=0; three_p < (int)vec.size() && vec[(int)vec.size()-1-three_p] == 'N'; ++three_p);
810 
811  if(five_p > 0 || three_p > 0) {
    // for a minus-strand model the two counts are swapped before being applied to the limits
812  int left = five_p;
813  int right = three_p;
814  if(m.Strand() == eMinus)
815  swap(left,right);
816 
817  TSignedSeqRange new_lim(m.Limits());
818  if(left > 0) {
819  _ASSERT(m.Exons().front().Limits().GetLength() > left);
820  new_lim.SetFrom(new_lim.GetFrom()+left);
821  }
822  if(right > 0) {
823  _ASSERT(m.Exons().back().Limits().GetLength() > right);
824  new_lim.SetTo(new_lim.GetTo()-right);
825  }
826 
    // the score is read here and written back with the `open` argument set to false
827  double score = m.Score();
829  CCDSInfo cds_info = m.GetCdsInfo();
830  cds_info.SetScore(score, false);
831  m.SetCdsInfo(cds_info);
832 
    // only models that are actually CAlignModel objects have their alignment map reset
834  CAlignModel* a = dynamic_cast<CAlignModel*>(&m);
835  if (a != NULL) {
836  a->ResetAlignMap();
837  }
838  }
839  }
840 }
841 
843 {
    // Registers the command-line arguments read back by ReadArgs(): the mandatory "param" key,
    // numeric keys with defaults (pcsf_factor, window, margin, mpp, ncsp, mincont, minlen) and
    // flags (nognomon, open, nonconsens, norep, singlest).
    // NOTE(review): the signature line (listing line 842) and listing line 846 are absent from
    // this extract, so the AddKey() call below appears unterminated here. The index at the end
    // of the listing gives the signature as
    // "static void SetupArgDescriptions(CArgDescriptions *arg_desc)".
844  arg_desc->AddKey("param", "param",
845  "Organism specific parameters",
847  arg_desc->AddDefaultKey("pcsf_factor","pcsf_factor","Normalisation factor for phyloPCSF scores",CArgDescriptions::eDouble,"0.1");
848  arg_desc->AddFlag("nognomon","Skips ab initio prediction and ab initio extension of partial chains.");
849  arg_desc->AddDefaultKey("window","window","Prediction window",CArgDescriptions::eInteger,"200000");
850  arg_desc->AddDefaultKey("margin","margin","The minimal distance between chains to place the end of prediction window",CArgDescriptions::eInteger,"1000");
851  arg_desc->AddFlag("open","Allow partial predictions at the ends of contigs. Used for poorly assembled genomes with lots of unfinished contigs.");
852  arg_desc->AddDefaultKey("mpp","mpp","Penalty for connection two protein containing chains into one model.",CArgDescriptions::eDouble,"10.0");
853  arg_desc->AddFlag("nonconsens","Allows to accept nonconsensus splices starts/stops to complete partial alignmet. If not allowed some partial alignments "
854  "may be rejected if there is no way to complete them.");
855  arg_desc->AddDefaultKey("ncsp","ncsp","Nonconsensus penalty",CArgDescriptions::eDouble,"25");
856 
857  arg_desc->AddFlag("norep","DO NOT mask lower case letters");
858  arg_desc->AddDefaultKey("mincont","mincont","Contigs shorter than that will be skipped unless they have alignments.",CArgDescriptions::eInteger,"1000");
859 
    // the arguments below are listed under their own help group
860  arg_desc->SetCurrentGroup("Prediction tuning");
861  arg_desc->AddFlag("singlest","Allow single exon EST chains as evidence");
862  arg_desc->AddDefaultKey("minlen","minlen","Minimal CDS length for pure ab initio models",CArgDescriptions::eInteger,"100");
863 }
864 
866 {
    // Copies the parsed command-line arguments into the annotator's settings.
    // NOTE(review): the signature line (listing line 865) is absent from this extract; the index
    // at the end of the listing gives it as
    // "static void ReadArgs(CGnomonAnnotator *annot, const CArgs &args)".
    // The HMM parameters are read from the file named by the "param" argument.
867  CNcbiIfstream param_file(args["param"].AsString().c_str());
868  annot->SetHMMParameters(new CHMMParameters(param_file));
869  annot->m_pcsf_factor = args["pcsf_factor"].AsDouble();
870 
871  annot->window = args["window"].AsInteger();
872  annot->margin = args["margin"].AsInteger();
    // "open" is an opt-out: walls are on unless the flag is given
873  annot->wall = !args["open"];
874  annot->mpp = args["mpp"].AsDouble();
    // the nonconsensus penalty is stored negated; without the "nonconsens" flag it is BadScore()
875  bool nonconsens = args["nonconsens"];
876  annot->nonconsensp = (nonconsens ? -args["ncsp"].AsDouble() : BadScore());
877  annot->do_gnomon = !args["nognomon"];
878 
879  annot->mincontig = args["mincont"].AsInteger();
880 
881  annot->minCdsLen = args["minlen"].AsInteger();
882 
    // sequence masking is enabled unless "norep" is given
883  if (!args["norep"])
884  annot->EnableSeqMasking();
885 }
886 END_SCOPE(gnomon)
888 
USING_SCOPE(objects)
pair< TSignedSeqRange, bool > GetGeneWallLimits(const list< TGeneModelList::iterator > &models, bool external=false)
Definition: annot.cpp:335
TSignedSeqRange WalledCdsLimits(const CGeneModel &a)
Definition: annot.cpp:318
bool s_AlignScoreOrder(const CGeneModel &ap, const CGeneModel &bp)
Definition: annot.cpp:63
bool s_AlignSeqOrder(const CGeneModel &ap, const CGeneModel &bp)
Definition: annot.cpp:355
TSignedSeqRange GetWallLimits(const CGeneModel &m, bool external=false)
Definition: annot.cpp:323
void FindPartials(TGeneModelList &models, TGeneModelList &aligns, EStrand strand)
Definition: annot.cpp:366
void EditedSequence(const In &original_sequence, Out &edited_sequence, bool includeholes=false) const
Definition: gnomon_seq.cpp:632
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
void Set5PrimeCdsLimit(TSignedSeqPos p)
void SetScore(double score, bool open=false)
TSignedSeqRange Start() const
void AddPStop(SPStop stp)
TSignedSeqRange Cds() const
double Score() const
const TPStops & PStops() const
vector< SPStop > TPStops
void ClearPStops()
void AddExon(TSignedSeqRange exon, const string &fs="", const string &ss="", double ident=0, const string &seq="", const CInDelInfo::SSource &src=CInDelInfo::SSource())
void SetGeneID(Int8 id)
double Score() const
unsigned int & Status()
const TExons & Exons() const
TSignedSeqRange ReadingFrame() const
virtual CAlignMap GetAlignMap() const
TSignedSeqRange RealCdsLimits() const
virtual void Clip(TSignedSeqRange limits, EClipMode mode, bool ensure_cds_invariant=true)
void SetCdsInfo(const CCDSInfo &cds_info)
Int8 ID() const
TSignedSeqRange Limits() const
int Type() const
void AddComment(const string &comment)
const CCDSInfo & GetCdsInfo() const
vector< CModelExon > TExons
TSignedSeqRange MaxCdsLimits() const
bool OpenCds() const
EStrand Strand() const
static void SetupArgDescriptions(CArgDescriptions *arg_desc)
Definition: annot.cpp:842
static void ReadArgs(CGnomonAnnotator *annot, const CArgs &args)
Definition: annot.cpp:865
void SetHMMParameters(CHMMParameters *params)
Definition: chainer.cpp:7560
unique_ptr< CGnomonEngine > m_gnomon
Definition: chainer.hpp:140
TGgapInfo m_inserted_seqs
Definition: chainer.hpp:148
unique_ptr< SPhyloCSFSlice > m_pcsf_slice
Definition: chainer.hpp:152
TIntMap m_notbridgeable_gaps_len
Definition: chainer.hpp:149
void Predict(TGeneModelList &models, TGeneModelList &bad_aligns)
Definition: annot.cpp:394
void RemoveShortHolesAndRescore(TGeneModelList chains)
Definition: annot.cpp:68
bool GnomonNeeded() const
Definition: annot.hpp:93
double TryToEliminateOneAlignment(TGeneModelList &suspect_aligns, TGeneModelList &bad_aligns, bool leftwall, bool rightwall, bool leftanchor, bool rightanchor)
Definition: annot.cpp:133
double ExtendJustThisChain(CGeneModel &chain, TSignedSeqPos left, TSignedSeqPos right)
Definition: annot.cpp:75
double TryWithoutObviouslyBadAlignments(TGeneModelList &aligns, TGeneModelList &suspect_aligns, TGeneModelList &bad_aligns, bool leftwall, bool rightwall, bool leftanchor, bool rightanchor, TSignedSeqPos left, TSignedSeqPos right, TSignedSeqRange &tested_range)
Definition: annot.cpp:88
double nonconsensp
Definition: annot.hpp:124
double TryToEliminateAlignmentsFromTail(TGeneModelList &suspect_aligns, TGeneModelList &bad_aligns, bool leftwall, bool rightwall, bool leftanchor, bool rightanchor)
Definition: annot.cpp:161
HMM model parameters just create it and pass to a Gnomon engine.
Definition: gnomon.hpp:55
static bool RangeNestedInIntron(TSignedSeqRange r, const CGeneModel &algn, bool check_in_holes=true)
Definition: map.hpp:338
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator find(const key_type &key) const
Definition: set.hpp:137
const_iterator end() const
Definition: set.hpp:136
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
static const TDS_WORD limits[]
Definition: num_limits.h:85
Int8 int64_t
vector< TResidue > CResidueVec
bool Precede(TSignedSeqRange l, TSignedSeqRange r)
vector< int > TIVec
double BadScore()
EStrand
@ eMinus
@ ePlus
bool Include(TSignedSeqRange big, TSignedSeqRange small)
list< CGeneModel > TGeneModelList
vector< CInDelInfo > TInDels
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define ERASE_ITERATE(Type, Var, Cont)
Non-constant version with ability to erase current element, if container permits.
Definition: ncbimisc.hpp:843
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
void AddFlag(const string &name, const string &comment, CBoolEnum< EFlagValue > set_value=eFlagHasValueIfSet, TFlags flags=0)
Add description for flag argument.
Definition: ncbiargs.cpp:2459
void AddKey(const string &name, const string &synopsis, const string &comment, EType type, TFlags flags=0)
Add description for mandatory key.
Definition: ncbiargs.cpp:2412
void SetCurrentGroup(const string &group)
Set current arguments group name.
Definition: ncbiargs.cpp:2632
void AddDefaultKey(const string &name, const string &synopsis, const string &comment, EType type, const string &default_value, TFlags flags=0, const string &env_var=kEmptyStr, const char *display_value=nullptr)
Add description for optional key with default value.
Definition: ncbiargs.cpp:2442
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eDouble
Convertible into a floating point number (double)
Definition: ncbiargs.hpp:594
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
#define NULL
Definition: ncbistd.hpp:225
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
position_type GetLength(void) const
Definition: range.hpp:158
bool NotEmpty(void) const
Definition: range.hpp:152
bool IntersectingWith(const TThisType &r) const
Definition: range.hpp:331
bool Empty(void) const
Definition: range.hpp:148
CRange< TSignedSeqPos > TSignedSeqRange
Definition: range.hpp:420
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
void SetFrom(TFrom value)
Assign a value to From data member.
Definition: Range_.hpp:231
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
void SetTo(TTo value)
Assign a value to To data member.
Definition: Range_.hpp:278
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
range(_Ty, _Ty) -> range< _Ty >
unsigned int a
Definition: ncbi_localip.c:102
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines command line argument related classes.
Defines unified interface to application:
T max(T x_, T y_)
T min(T x_, T y_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
virtual void transform_model(CGeneModel &a)
Definition: annot.cpp:801
const CResidueVec & seq
Definition: annot.hpp:134
RemoveTrailingNs(const CResidueVec &seq)
Definition: annot.cpp:796
#define _ASSERT
Modified on Thu May 02 14:27:02 2024 by modify_doxy.py rev. 669887