71 it->RemoveShortHolesAndRescore(*
m_gnomon);
79 test_align.push_back(chain);
82 cerr <<
"Testing alignment " << chain.
ID() <<
" in fragment " << l <<
' ' <<
r << endl;
89 bool leftwall,
bool rightwall,
bool leftanchor,
bool rightanchor,
96 for(TGeneModelList::iterator it = aligns.begin(); it != aligns.end(); it++) {
97 if(left <= it->Limits().GetTo() && it->Limits().GetFrom() <= right)
98 suspect_aligns.push_back(*it);
103 bool found_bad_cluster =
false;
104 for(TGeneModelList::iterator it = aligns.begin(); it != aligns.end(); ) {
105 if(it->Limits().GetTo() < left || it->Limits().GetFrom() > right) {
112 found_bad_cluster =
true;
113 cerr <<
"Deleting alignment " << it->ID() << endl;
115 it->AddComment(
"Bad score prediction alone");
116 bad_aligns.push_back(*it);
118 it = aligns.erase(it);
121 suspect_aligns.push_back(*it++);
125 if(found_bad_cluster) {
126 cerr <<
"Testing w/o bad alignments in fragment " << left <<
' ' << right << endl;
134 bool leftwall,
bool rightwall,
bool leftanchor,
bool rightanchor)
137 for(TGeneModelList::iterator it = suspect_aligns.begin(); it != suspect_aligns.end();) {
143 it = suspect_aligns.erase(it);
145 cerr <<
"Testing w/o " << algn.
ID();
148 cerr <<
"- Good. Deleting alignment " << algn.
ID() << endl;
150 algn.
AddComment(
"Good score prediction without");
151 bad_aligns.push_back(algn);
154 cerr <<
" - Still bad." << endl;
156 suspect_aligns.insert(it,algn);
162 bool leftwall,
bool rightwall,
bool leftanchor,
bool rightanchor)
165 for (TGeneModelList::iterator it = suspect_aligns.begin(); score ==
BadScore() && it != suspect_aligns.end(); ) {
170 cerr <<
"Deleting alignment " << it->ID() << endl;
172 it->AddComment(
"Bad score prediction in combination");
173 bad_aligns.push_back(*it);
174 it = suspect_aligns.erase(it);
176 cerr <<
"Testing fragment " << left <<
' ' << right << endl;
183 bool leftmostwall,
bool rightmostwall,
bool leftmostanchor,
bool rightmostanchor,
TGeneModelList& bad_aligns)
189 bool leftwall = leftmostwall;
190 bool leftanchor = leftmostanchor;
194 bool rightwall =
false;
195 bool rightanchor =
false;
197 Int8 prev_bad_right = rlimit+1;
198 bool do_it_again =
false;
207 TIVec busy_spots(rlimit+1,0);
209 int a =
max(0,it_c->Limits().GetFrom()-
margin);
210 int b =
min(rlimit,it_c->Limits().GetTo()+
margin);
211 for(
int i =
a;
i<=
b; ++
i)
216 for( ; right < rlimit && busy_spots[right] != 0; ++right);
218 if (right + (right-left)/2 >= rlimit) {
220 rightwall = rightmostwall;
221 rightanchor = rightmostanchor;
232 if (right < prev_bad_right) {
233 suspect_aligns.clear();
237 cerr << left <<
' ' << right <<
' ' <<
m_gnomon->GetGCcontent() << endl;
242 cerr <<
"Inconsistent alignments in fragment " << left <<
' ' << right <<
'\n';
245 leftwall, rightwall, leftanchor, rightanchor,
246 left, right, tested_range);
251 prev_bad_right = right;
252 right = (left+right)/2;
260 leftwall, rightwall, leftanchor, rightanchor);
263 leftwall, rightwall, leftanchor, rightanchor);
265 cerr <<
"!!! BAD SCORE EVEN WITH FINISHED ALIGNMENTS !!! " << endl;
268 models.push_back(*it);
272 prev_bad_right = rlimit+1;
274 list<CGeneModel> genes =
m_gnomon->GetGenes();
278 if (right < rlimit && !genes.empty() && !genes.back().RightComplete() && !do_it_again) {
279 partial_start = genes.back().LeftComplete() ? genes.back().RealCdsLimits().GetFrom() : left;
280 _ASSERT ( partial_start < right );
286 if (!genes.empty()) {
287 left = genes.back().ReadingFrame().GetTo()+1;
289 }
else if (partial_start < left+1000) {
291 }
else if (partial_start < right) {
292 int new_left = partial_start-100;
293 for( ; new_left > left && busy_spots[new_left] != 0; --new_left);
294 if(new_left > left+1000) {
301 left = (left+right)/2+1;
305 models.splice(models.end(), genes);
315 }
while(left <= rlimit);
333 return model_lim_for_nested;
335 pair<TSignedSeqRange, bool>
GetGeneWallLimits(
const list<TGeneModelList::iterator>& models,
bool external =
false)
337 bool coding_gene =
false;
338 for(
auto im : models) {
339 if(im->ReadingFrame().NotEmpty()) {
346 for(
auto im : models) {
347 if(coding_gene && im->ReadingFrame().Empty())
352 return make_pair(gene_lim, coding_gene);
360 return (
a.GetFrom() !=
b.GetFrom() ?
361 a.GetFrom() <
b.GetFrom() :
362 a.GetTo() >
b.GetTo()
368 for (TGeneModelList::iterator loop_it = models.begin(); loop_it != models.end();) {
369 TGeneModelList::iterator ir = loop_it;
372 if(ir->Strand() != strand)
380 aligns.push_back(wall_model);
381 }
else if(ir->GoodEnoughToBeAnnotation()) {
386 aligns.push_back(wall_model);
387 }
else if(ir->RankInGene() == 1) {
389 aligns.splice(aligns.end(), models, ir);
400 typedef list<TGeneModelList::iterator> TIterList;
402 typedef TGIDIterlist::iterator TGIter;
403 struct geneid_order {
404 bool operator()(TGIter
a, TGIter
b)
const {
return a->second.front()->GeneID() <
b->second.front()->GeneID(); }
406 typedef tuple<TSignedSeqRange, bool, TGIter> TGenomeRange;
407 struct grange_order {
408 bool operator()(
const TGenomeRange&
a, TGenomeRange&
b)
const {
409 if(get<0>(
a) != get<0>(
b))
410 return get<0>(
a) < get<0>(
b);
411 else if(get<1>(
a) != get<1>(
b))
412 return get<1>(
a) < get<1>(
b);
414 return geneid_order()(get<2>(
a), get<2>(
b));
417 struct interval_order {
420 struct GenomeRangeMap :
public map<TSignedSeqRange, list<TGenomeRange>, interval_order> {
421 void Insert(
const TGenomeRange& intron) {
422 list<TGenomeRange> clust(1, intron);
425 for(
auto it = lower_bound(intron_left); it != end() && it->first.IntersectingWith(
range); ) {
426 range.CombineWith(it->first);
427 clust.splice(clust.end(), it->second);
430 emplace(
range, clust);
433 for(
auto& range_intronlist : *
this) {
434 auto& lst = range_intronlist.second;
435 lst.sort(grange_order());
445 genes[im->GeneID()].push_back(im);
448 GenomeRangeMap introns;
449 for(
auto ig = genes.begin(); ig != genes.end(); ++ig) {
452 bool coding = rslt.second;
453 for(
auto im : ig->second) {
458 if(m.
Exons()[
i-1].m_ssplice_sig ==
"XX" || m.
Exons()[
i].m_fsplice_sig ==
"XX")
462 bool is_hole = !m.
Exons()[
i-1].m_ssplice || !m.
Exons()[
i].m_fsplice;
463 TGenomeRange intron(
range, is_hole, ig);
464 introns.Insert(intron);
471 list<TGIter> genes_hosting_partial;
472 list<TGIter> nested_partial;
473 GenomeRangeMap finished_intervals;
474 if(!introns.empty()) {
475 list<TGIter> genes_to_remove;
476 for(
auto ig = genes.begin(); ig != genes.end(); ++ig) {
477 TIterList& modelsi = ig->second;
478 auto gfront = modelsi.front();
481 if(iclust != introns.end() &&
Include(iclust->first, lim_for_nested)) {
482 for(TGenomeRange& intron : iclust->second) {
485 bool is_hole = get<1>(intron);
486 auto host_it = get<2>(intron);
487 if(is_hole && !gfront->GoodEnoughToBeAnnotation()) {
488 if(host_it->second.front()->Score() > gfront->Score())
489 genes_to_remove.push_back(ig);
491 genes_to_remove.push_back(host_it);
493 if(gfront->GoodEnoughToBeAnnotation()) {
494 for(
auto im : modelsi)
497 genes_hosting_partial.push_back(host_it);
498 nested_partial.push_back(ig);
505 if(gfront->GoodEnoughToBeAnnotation()) {
507 for( ; !found && iclust != introns.end() && iclust->first.IntersectingWith(lim_for_nested); ++iclust) {
508 for(TGenomeRange& intron : iclust->second) {
509 if(get<2>(intron) == ig)
514 TGenomeRange finished_interval(lim_for_nested,
false, ig);
515 finished_intervals.Insert(finished_interval);
522 genes_to_remove.sort(geneid_order());
523 genes_to_remove.unique();
524 genes_hosting_partial.sort(geneid_order());
525 genes_hosting_partial.unique();
526 nested_partial.sort(geneid_order());
527 nested_partial.unique();
528 for(
auto it : genes_to_remove) {
529 genes_hosting_partial.remove(it);
530 nested_partial.remove(it);
531 for(
auto im : it->second) {
533 im->AddComment(
"Partial gene in a hole");
534 bad_aligns.push_back(*im);
542 GenomeRangeMap hosting_intervals;
543 for(
auto it : genes_hosting_partial) {
544 TIterList& lst = it->second;
546 bool coding_gene = find_if(lst.begin(), lst.end(), [](TGeneModelList::iterator im){ return im->ReadingFrame().NotEmpty(); }) != lst.end();
557 gene_lim_for_nested += model_lim_for_nested;
560 vector<int> grange(gene_lim_for_nested.
GetLength(),1);
572 for(
int j = overlap.
GetFrom(); j <= overlap.
GetTo(); ++j)
573 grange[j-gene_lim_for_nested.
GetFrom()] = 0;
577 if(!ai.
Exons()[
i-1].m_ssplice || !ai.
Exons()[
i].m_fsplice) {
581 grange[j-gene_lim_for_nested.
GetFrom()] = 0;
585 _ASSERT(grange.front() == 0 && grange.back() == 0);
589 for(
int j = 0; j < (
int)grange.size(); ++j) {
595 }
else if(grange[j] == 1) {
599 TGenomeRange hosting_interval(interval,
false, it);
600 hosting_intervals.Insert(hosting_interval);
607 TRangeModels nested_models;
608 for(
auto ig : nested_partial) {
609 TGeneModelList::iterator nested_modeli = ig->second.front();
610 _ASSERT(ig->second.size() == 1);
615 if(rslt != hosting_intervals.end() &&
Include(rslt->first, lim_for_nested)) {
616 for(
auto& grange : rslt->second) {
618 if(
Include(interval,lim_for_nested)) {
619 if(hosting_interval.
Empty())
620 hosting_interval = interval;
622 hosting_interval = (hosting_interval&interval);
629 TIterList nested(1,nested_modeli);
631 for(
auto it = finished_intervals.lower_bound(left); it != finished_intervals.end() && it->first.IntersectingWith(hosting_interval); ++it) {
632 for(
auto& grange : it->second) {
637 if(
Precede(finished_interval,lim_for_nested)) {
639 }
else if(
Precede(lim_for_nested,finished_interval)) {
642 for(
auto im : get<2>(grange)->second)
643 nested.push_back(im);
648 nested_models[hosting_interval].splice(nested_models[hosting_interval].begin(), nested);
653 bool scaffold_wall =
wall;
655 ITERATE(TRangeModels,
i, nested_models) {
661 nested.push_back(**im);
663 if(!(*im)->GoodEnoughToBeAnnotation()) {
664 if(nested.back().HasStart() && !
Include(hosting_interval,nested.back().MaxCdsLimits())) {
665 CCDSInfo cds = nested.back().GetCdsInfo();
666 if(nested.back().Strand() ==
ePlus)
670 nested.back().SetCdsInfo(cds);
672 nested.back().AddComment(
"partialnested");
675 included_complete_models.
insert((*im)->ID());
679 cerr <<
"Interval " << hosting_interval <<
'\t' << nested.size() << endl;
684 if(!im->Support().empty()) {
686 if(im->ID() == 0 || included_complete_models.
find(im->ID()) == included_complete_models.
end())
687 models.push_back(*im);
691 wall = scaffold_wall;
703 bool gapfilled =
false;
706 if(ie->m_fsplice_sig ==
"XX" || ie->m_ssplice_sig ==
"XX")
709 genome_cds += (cds&ie->Limits()).
GetLength();
712 if(gapfilled && genome_cds < 45) {
715 bad_aligns.push_back(model);
727 if(!models.empty()) {
728 for(
auto it_loop =
next(models.begin()); it_loop != models.end(); ) {
730 if(it->RankInGene() != 1 || it->GoodEnoughToBeAnnotation() || it->Type()&
CGeneModel::eNested)
732 auto it_prev =
prev(it);
733 if(it_prev->RankInGene() != 1 || it_prev->GoodEnoughToBeAnnotation() || it_prev->Type()&
CGeneModel::eNested)
736 if(it->MaxCdsLimits().IntersectingWith(it_prev->MaxCdsLimits())) {
737 cerr <<
"Intersecting alignments " << it->ID() <<
" " << it_prev->ID() <<
" " << it->Score() <<
" " << it_prev->Score() << endl;
738 auto it_erase = (it->Score() < it_prev->Score()) ? it : it_prev;
740 it_erase->AddComment(
"Intersects with other partial");
741 bad_aligns.push_back(*it_erase);
742 models.erase(it_erase);
755 Predict(left, right, aligns.begin(), aligns.end(), models_tmp,(left!=0 ||
wall),
wall, left!=0,
false, bad_aligns);
757 if(!it->Support().empty() || it->RealCdsLen() >=
minCdsLen)
758 models.push_back(*it);
763 CCDSInfo cds_info = it->GetCdsInfo();
769 if(((
i->IsInsertion() ||
i->IsMismatch()) &&
Include(fullcds,
i->Loc())) ||
770 (
i->IsDeletion() &&
i->Loc() > fullcds.
GetFrom() &&
i->Loc() <= fullcds.
GetTo())) {
774 it->FrameShifts() = fs;
783 it->SetCdsInfo(cds_info);
785 if (it->PStop(
false) || !it->FrameShifts().empty()) {
789 CCDSInfo cds_info = it->GetCdsInfo();
791 it->SetCdsInfo(cds_info);
808 for(five_p=0; five_p < (
int)vec.size() && vec[five_p] ==
'N'; ++five_p);
809 for(three_p=0; three_p < (
int)vec.size() && vec[(
int)vec.size()-1-three_p] ==
'N'; ++three_p);
811 if(five_p > 0 || three_p > 0) {
827 double score = m.
Score();
844 arg_desc->
AddKey(
"param",
"param",
845 "Organism specific parameters",
848 arg_desc->
AddFlag(
"nognomon",
"Skips ab initio prediction and ab initio extension of partial chains.");
851 arg_desc->
AddFlag(
"open",
"Allow partial predictions at the ends of contigs. Used for poorly assembled genomes with lots of unfinished contigs.");
853 arg_desc->
AddFlag(
"nonconsens",
"Allows to accept nonconsensus splices starts/stops to complete partial alignmet. If not allowed some partial alignments "
854 "may be rejected if there is no way to complete them.");
857 arg_desc->
AddFlag(
"norep",
"DO NOT mask lower case letters");
861 arg_desc->
AddFlag(
"singlest",
"Allow single exon EST chains as evidence");
871 annot->
window = args[
"window"].AsInteger();
872 annot->
margin = args[
"margin"].AsInteger();
873 annot->
wall = !args[
"open"];
874 annot->
mpp = args[
"mpp"].AsDouble();
875 bool nonconsens = args[
"nonconsens"];
879 annot->
mincontig = args[
"mincont"].AsInteger();
881 annot->
minCdsLen = args[
"minlen"].AsInteger();
pair< TSignedSeqRange, bool > GetGeneWallLimits(const list< TGeneModelList::iterator > &models, bool external=false)
TSignedSeqRange WalledCdsLimits(const CGeneModel &a)
bool s_AlignScoreOrder(const CGeneModel &ap, const CGeneModel &bp)
bool s_AlignSeqOrder(const CGeneModel &ap, const CGeneModel &bp)
TSignedSeqRange GetWallLimits(const CGeneModel &m, bool external=false)
void FindPartials(TGeneModelList &models, TGeneModelList &aligns, EStrand strand)
void EditedSequence(const In &original_sequence, Out &edited_sequence, bool includeholes=false) const
void Set5PrimeCdsLimit(TSignedSeqPos p)
void SetScore(double score, bool open=false)
TSignedSeqRange Start() const
void AddPStop(SPStop stp)
TSignedSeqRange Cds() const
const TPStops & PStops() const
void AddExon(TSignedSeqRange exon, const string &fs="", const string &ss="", double ident=0, const string &seq="", const CInDelInfo::SSource &src=CInDelInfo::SSource())
const TExons & Exons() const
TSignedSeqRange ReadingFrame() const
virtual CAlignMap GetAlignMap() const
TSignedSeqRange RealCdsLimits() const
virtual void Clip(TSignedSeqRange limits, EClipMode mode, bool ensure_cds_invariant=true)
void SetCdsInfo(const CCDSInfo &cds_info)
TSignedSeqRange Limits() const
void AddComment(const string &comment)
const CCDSInfo & GetCdsInfo() const
vector< CModelExon > TExons
TSignedSeqRange MaxCdsLimits() const
static void SetupArgDescriptions(CArgDescriptions *arg_desc)
static void ReadArgs(CGnomonAnnotator *annot, const CArgs &args)
void SetHMMParameters(CHMMParameters *params)
unique_ptr< CGnomonEngine > m_gnomon
TGgapInfo m_inserted_seqs
unique_ptr< SPhyloCSFSlice > m_pcsf_slice
TIntMap m_notbridgeable_gaps_len
void Predict(TGeneModelList &models, TGeneModelList &bad_aligns)
void RemoveShortHolesAndRescore(TGeneModelList chains)
bool GnomonNeeded() const
double TryToEliminateOneAlignment(TGeneModelList &suspect_aligns, TGeneModelList &bad_aligns, bool leftwall, bool rightwall, bool leftanchor, bool rightanchor)
double ExtendJustThisChain(CGeneModel &chain, TSignedSeqPos left, TSignedSeqPos right)
double TryWithoutObviouslyBadAlignments(TGeneModelList &aligns, TGeneModelList &suspect_aligns, TGeneModelList &bad_aligns, bool leftwall, bool rightwall, bool leftanchor, bool rightanchor, TSignedSeqPos left, TSignedSeqPos right, TSignedSeqRange &tested_range)
double TryToEliminateAlignmentsFromTail(TGeneModelList &suspect_aligns, TGeneModelList &bad_aligns, bool leftwall, bool rightwall, bool leftanchor, bool rightanchor)
HMM model parameters; just create it and pass it to a Gnomon engine.
static bool RangeNestedInIntron(TSignedSeqRange r, const CGeneModel &algn, bool check_in_holes=true)
iterator_bool insert(const value_type &val)
const_iterator find(const key_type &key) const
const_iterator end() const
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
static const TDS_WORD limits[]
vector< TResidue > CResidueVec
bool Precede(TSignedSeqRange l, TSignedSeqRange r)
bool Include(TSignedSeqRange big, TSignedSeqRange small)
list< CGeneModel > TGeneModelList
vector< CInDelInfo > TInDels
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define ERASE_ITERATE(Type, Var, Cont)
Non-constant version with ability to erase current element, if container permits.
int TSignedSeqPos
Type for signed sequence position.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
void AddFlag(const string &name, const string &comment, CBoolEnum< EFlagValue > set_value=eFlagHasValueIfSet, TFlags flags=0)
Add description for flag argument.
void AddKey(const string &name, const string &synopsis, const string &comment, EType type, TFlags flags=0)
Add description for mandatory key.
void SetCurrentGroup(const string &group)
Set current arguments group name.
void AddDefaultKey(const string &name, const string &synopsis, const string &comment, EType type, const string &default_value, TFlags flags=0, const string &env_var=kEmptyStr, const char *display_value=nullptr)
Add description for optional key with default value.
@ eInputFile
Name of file (must exist and be readable)
@ eDouble
Convertible into a floating point number (double)
@ eInteger
Convertible into an integer number (int or Int8)
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
int64_t Int8
8-byte (64-bit) signed integer
position_type GetLength(void) const
bool NotEmpty(void) const
bool IntersectingWith(const TThisType &r) const
CRange< TSignedSeqPos > TSignedSeqRange
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
void SetFrom(TFrom value)
Assign a value to From data member.
TTo GetTo(void) const
Get the To member data.
TFrom GetFrom(void) const
Get the From member data.
void SetTo(TTo value)
Assign a value to To data member.
unsigned int
A callback function used to compare two keys in a database.
range(_Ty, _Ty) -> range< _Ty >
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines command line argument related classes.
Defines unified interface to application:
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
virtual void transform_model(CGeneModel &a)
RemoveTrailingNs(const CResidueVec &seq)