75 it->RemoveShortHolesAndRescore(*
m_gnomon);
83 test_align.push_back(chain);
86 cerr <<
"Testing alignment " << chain.
ID() <<
" in fragment " << l <<
' ' <<
r << endl;
93 bool leftwall,
bool rightwall,
bool leftanchor,
bool rightanchor,
100 for(TGeneModelList::iterator it = aligns.begin(); it != aligns.end(); it++) {
101 if(left <= it->Limits().GetTo() && it->Limits().GetFrom() <= right)
102 suspect_aligns.push_back(*it);
107 bool found_bad_cluster =
false;
108 for(TGeneModelList::iterator it = aligns.begin(); it != aligns.end(); ) {
109 if(it->Limits().GetTo() < left || it->Limits().GetFrom() > right) {
116 found_bad_cluster =
true;
117 cerr <<
"Deleting alignment " << it->ID() << endl;
119 it->AddComment(
"Bad score prediction alone");
120 bad_aligns.push_back(*it);
122 it = aligns.erase(it);
125 suspect_aligns.push_back(*it++);
129 if(found_bad_cluster) {
130 cerr <<
"Testing w/o bad alignments in fragment " << left <<
' ' << right << endl;
138 bool leftwall,
bool rightwall,
bool leftanchor,
bool rightanchor)
141 for(TGeneModelList::iterator it = suspect_aligns.begin(); it != suspect_aligns.end();) {
147 it = suspect_aligns.erase(it);
149 cerr <<
"Testing w/o " << algn.
ID();
152 cerr <<
"- Good. Deleting alignment " << algn.
ID() << endl;
154 algn.
AddComment(
"Good score prediction without");
155 bad_aligns.push_back(algn);
158 cerr <<
" - Still bad." << endl;
160 suspect_aligns.insert(it,algn);
166 bool leftwall,
bool rightwall,
bool leftanchor,
bool rightanchor)
169 for (TGeneModelList::iterator it = suspect_aligns.begin(); score ==
BadScore() && it != suspect_aligns.end(); ) {
174 cerr <<
"Deleting alignment " << it->ID() << endl;
176 it->AddComment(
"Bad score prediction in combination");
177 bad_aligns.push_back(*it);
178 it = suspect_aligns.erase(it);
180 cerr <<
"Testing fragment " << left <<
' ' << right << endl;
187 bool leftmostwall,
bool rightmostwall,
bool leftmostanchor,
bool rightmostanchor,
TGeneModelList& bad_aligns)
192 bool leftwall = leftmostwall;
193 bool leftanchor = leftmostanchor;
196 bool rightwall =
false;
197 bool rightanchor =
false;
199 Int8 prev_bad_right = rlimit+1;
200 bool do_it_again =
false;
209 TIVec busy_spots(rlimit+1,0);
211 int a =
max(0,it_c->Limits().GetFrom()-
margin);
212 int b =
min(rlimit,it_c->Limits().GetTo()+
margin);
213 for(
int i =
a;
i<=
b; ++
i)
218 for( ; right < rlimit && busy_spots[right] != 0; ++right);
220 if (right + (right-left)/2 >= rlimit) {
222 rightwall = rightmostwall;
223 rightanchor = rightmostanchor;
234 if (right < prev_bad_right) {
235 suspect_aligns.clear();
239 cerr << left <<
' ' << right <<
' ' <<
m_gnomon->GetGCcontent() << endl;
244 cerr <<
"Inconsistent alignments in fragment " << left <<
' ' << right <<
'\n';
247 leftwall, rightwall, leftanchor, rightanchor,
248 left, right, tested_range);
253 prev_bad_right = right;
254 right = (left+right)/2;
262 leftwall, rightwall, leftanchor, rightanchor);
265 leftwall, rightwall, leftanchor, rightanchor);
267 cerr <<
"!!! BAD SCORE EVEN WITH FINISHED ALIGNMENTS !!! " << endl;
270 models.push_back(*it);
274 prev_bad_right = rlimit+1;
276 list<CGeneModel> genes =
m_gnomon->GetGenes();
280 if (right < rlimit && !genes.empty() && !genes.back().RightComplete() && !do_it_again) {
281 partial_start = genes.back().LeftComplete() ? genes.back().RealCdsLimits().GetFrom() : left;
282 _ASSERT ( partial_start < right );
288 if (!genes.empty()) {
289 left = genes.back().ReadingFrame().GetTo()+1;
291 }
else if (partial_start < left+1000) {
293 }
else if (partial_start < right) {
294 int new_left = partial_start-100;
295 for( ; new_left > left && busy_spots[new_left] != 0; --new_left);
296 if(new_left > left+1000) {
303 left = (left+right)/2+1;
307 models.splice(models.end(), genes);
317 }
while(left <= rlimit);
335 return (
a.GetFrom() !=
b.GetFrom() ?
336 a.GetFrom() <
b.GetFrom() :
337 a.GetTo() >
b.GetTo()
344 aligns.push_back(*wall_model);
351 unique_ptr<CGeneModel> wall_model;
353 for (TGeneModelList::iterator loop_it = models.begin(); loop_it != models.end();) {
354 TGeneModelList::iterator ir = loop_it;
363 if ( right <
limits.GetFrom() ) {
367 if ( right <
limits.GetFrom() ) {
369 wall_model->SetGeneID(ir->GeneID());
370 wall_model->AddExon(
limits);
374 if (ir->RankInGene() == 1 && !ir->GoodEnoughToBeAnnotation()) {
376 aligns.splice(aligns.end(), models, ir);
378 }
else if (
limits.GetTo()- wall_model->Limits().GetTo() > 0) {
379 wall_model->ExtendRight(
limits.GetTo() - wall_model->Limits().GetTo());
395 typedef list<TGeneModelList::iterator> TIterList;
401 genes[im->GeneID()].push_back(im);
405 ITERATE(TGIDIterlist, ig, genes) {
406 bool coding_gene =
false;
407 ITERATE(TIterList, im, ig->second) {
408 if((*im)->ReadingFrame().NotEmpty()) {
415 ITERATE(TIterList, im, ig->second) {
422 gene_lim_for_nested += model_lim_for_nested;
425 vector<int> grange(gene_lim_for_nested.
GetLength(),1);
426 ITERATE(TIterList, im, ig->second) {
437 for(
int j = overlap.
GetFrom(); j <= overlap.
GetTo(); ++j)
438 grange[j-gene_lim_for_nested.
GetFrom()] = 0;
442 if(!ai.
Exons()[
i-1].m_ssplice || !ai.
Exons()[
i].m_fsplice) {
446 grange[j-gene_lim_for_nested.
GetFrom()] = 0;
450 _ASSERT(grange.front() == 0 && grange.back() == 0);
454 for(
int j = 0; j < (
int)grange.size(); ++j) {
460 }
else if(grange[j] == 1) {
464 hosting_intervals.
insert(interval);
471 TRangeModels nested_models;
472 ITERATE(TGIDIterlist, ig, genes) {
473 TGeneModelList::iterator nested_modeli = ig->second.front();
474 if(!nested_modeli->GoodEnoughToBeAnnotation()) {
475 _ASSERT(ig->second.size() == 1);
476 TSignedSeqRange lim_for_nested = nested_modeli->RealCdsLimits().
Empty() ? nested_modeli->Limits() : nested_modeli->RealCdsLimits();
481 if(
Include(interval,lim_for_nested)) {
482 if(hosting_interval.
Empty())
483 hosting_interval = interval;
485 hosting_interval = (hosting_interval&interval);
490 TIterList nested(1,nested_modeli);
491 ITERATE(TGIDIterlist, igg, genes) {
492 const TIterList& other_gene = igg->second;
493 if(igg == ig || !other_gene.front()->GoodEnoughToBeAnnotation())
496 bool coding_gene =
false;
497 ITERATE(TIterList, im, other_gene) {
498 if((*im)->ReadingFrame().NotEmpty()) {
505 ITERATE(TIterList, im, other_gene) {
515 if(
Precede(finished_interval,lim_for_nested)) {
517 }
else if(
Precede(lim_for_nested,finished_interval)) {
522 ITERATE(TIterList, im, other_gene) {
523 nested.push_back(*im);
528 nested_models[hosting_interval].splice(nested_models[hosting_interval].begin(),nested);
533 bool scaffold_wall =
wall;
535 ITERATE(TRangeModels,
i, nested_models) {
541 nested.push_back(**im);
543 if(!(*im)->GoodEnoughToBeAnnotation()) {
550 if(nested.back().HasStart() && !
Include(hosting_interval,nested.back().MaxCdsLimits())) {
551 CCDSInfo cds = nested.back().GetCdsInfo();
552 if(nested.back().Strand() ==
ePlus)
556 nested.back().SetCdsInfo(cds);
561 nested.back().AddComment(
"partialnested");
566 included_complete_models.
insert((*im)->ID());
570 cerr <<
"Interval " << hosting_interval <<
'\t' << nested.size() << endl;
575 if(!im->Support().empty()) {
577 if(im->ID() == 0 || included_complete_models.
find(im->ID()) == included_complete_models.
end())
578 models.push_back(*im);
582 wall = scaffold_wall;
594 bool gapfilled =
false;
597 if(ie->m_fsplice_sig ==
"XX" || ie->m_ssplice_sig ==
"XX")
600 genome_cds += (cds&ie->Limits()).
GetLength();
603 if(gapfilled && genome_cds < 45) {
606 bad_aligns.push_back(model);
618 if(!models.empty()) {
619 for(
auto it_loop =
next(models.begin()); it_loop != models.end(); ) {
621 if(it->RankInGene() != 1 || it->GoodEnoughToBeAnnotation() || it->Type()&
CGeneModel::eNested)
623 auto it_prev =
prev(it);
624 if(it_prev->RankInGene() != 1 || it_prev->GoodEnoughToBeAnnotation() || it_prev->Type()&
CGeneModel::eNested)
627 if(it->MaxCdsLimits().IntersectingWith(it_prev->MaxCdsLimits())) {
628 cerr <<
"Intersecting alignments " << it->ID() <<
" " << it_prev->ID() <<
" " << it->Score() <<
" " << it_prev->Score() << endl;
629 auto it_erase = (it->Score() < it_prev->Score()) ? it : it_prev;
631 it_erase->AddComment(
"Intersects with other partial");
632 bad_aligns.push_back(*it_erase);
633 models.erase(it_erase);
646 Predict(left, right, aligns.begin(), aligns.end(), models_tmp,(left!=0 ||
wall),
wall, left!=0,
false, bad_aligns);
648 if(!it->Support().empty() || it->RealCdsLen() >=
minCdsLen)
649 models.push_back(*it);
654 CCDSInfo cds_info = it->GetCdsInfo();
660 if(((
i->IsInsertion() ||
i->IsMismatch()) &&
Include(fullcds,
i->Loc())) ||
661 (
i->IsDeletion() &&
i->Loc() > fullcds.
GetFrom() &&
i->Loc() <= fullcds.
GetTo())) {
665 it->FrameShifts() = fs;
674 it->SetCdsInfo(cds_info);
676 if (it->PStop(
false) || !it->FrameShifts().empty()) {
680 CCDSInfo cds_info = it->GetCdsInfo();
682 it->SetCdsInfo(cds_info);
699 for(five_p=0; five_p < (
int)vec.size() && vec[five_p] ==
'N'; ++five_p);
700 for(three_p=0; three_p < (
int)vec.size() && vec[(
int)vec.size()-1-three_p] ==
'N'; ++three_p);
702 if(five_p > 0 || three_p > 0) {
718 double score = m.
Score();
735 arg_desc->
AddKey(
"param",
"param",
736 "Organism specific parameters",
739 arg_desc->
AddFlag(
"nognomon",
"Skips ab initio prediction and ab initio extension of partial chains.");
742 arg_desc->
AddFlag(
"open",
"Allow partial predictions at the ends of contigs. Used for poorly assembled genomes with lots of unfinished contigs.");
744 arg_desc->
AddFlag(
"nonconsens",
"Allows to accept nonconsensus splices starts/stops to complete partial alignmet. If not allowed some partial alignments "
745 "may be rejected if there is no way to complete them.");
748 arg_desc->
AddFlag(
"norep",
"DO NOT mask lower case letters");
752 arg_desc->
AddFlag(
"singlest",
"Allow single exon EST chains as evidence");
762 annot->
window = args[
"window"].AsInteger();
763 annot->
margin = args[
"margin"].AsInteger();
764 annot->
wall = !args[
"open"];
765 annot->
mpp = args[
"mpp"].AsDouble();
766 bool nonconsens = args[
"nonconsens"];
770 annot->
mincontig = args[
"mincont"].AsInteger();
772 annot->
minCdsLen = args[
"minlen"].AsInteger();
void SaveWallModel(unique_ptr< CGeneModel > &wall_model, TGeneModelList &aligns)
TSignedSeqRange WalledCdsLimits(const CGeneModel &a)
bool s_AlignScoreOrder(const CGeneModel &ap, const CGeneModel &bp)
bool s_AlignSeqOrder(const CGeneModel &ap, const CGeneModel &bp)
void FindPartials(TGeneModelList &models, TGeneModelList &aligns, EStrand strand)
TSignedSeqRange GetWallLimits(const CGeneModel &m)
void EditedSequence(const In &original_sequence, Out &edited_sequence, bool includeholes=false) const
void Set5PrimeCdsLimit(TSignedSeqPos p)
void SetScore(double score, bool open=false)
TSignedSeqRange Start() const
void AddPStop(SPStop stp)
TSignedSeqRange Cds() const
const TPStops & PStops() const
const TExons & Exons() const
TSignedSeqRange ReadingFrame() const
virtual CAlignMap GetAlignMap() const
TSignedSeqRange RealCdsLimits() const
virtual void Clip(TSignedSeqRange limits, EClipMode mode, bool ensure_cds_invariant=true)
void SetCdsInfo(const CCDSInfo &cds_info)
TSignedSeqRange Limits() const
void AddComment(const string &comment)
const CCDSInfo & GetCdsInfo() const
vector< CModelExon > TExons
TSignedSeqRange MaxCdsLimits() const
static void SetupArgDescriptions(CArgDescriptions *arg_desc)
static void ReadArgs(CGnomonAnnotator *annot, const CArgs &args)
void SetHMMParameters(CHMMParameters *params)
unique_ptr< CGnomonEngine > m_gnomon
TGgapInfo m_inserted_seqs
unique_ptr< SPhyloCSFSlice > m_pcsf_slice
TIntMap m_notbridgeable_gaps_len
void Predict(TGeneModelList &models, TGeneModelList &bad_aligns)
void RemoveShortHolesAndRescore(TGeneModelList chains)
bool GnomonNeeded() const
double TryToEliminateOneAlignment(TGeneModelList &suspect_aligns, TGeneModelList &bad_aligns, bool leftwall, bool rightwall, bool leftanchor, bool rightanchor)
double ExtendJustThisChain(CGeneModel &chain, TSignedSeqPos left, TSignedSeqPos right)
double TryWithoutObviouslyBadAlignments(TGeneModelList &aligns, TGeneModelList &suspect_aligns, TGeneModelList &bad_aligns, bool leftwall, bool rightwall, bool leftanchor, bool rightanchor, TSignedSeqPos left, TSignedSeqPos right, TSignedSeqRange &tested_range)
double TryToEliminateAlignmentsFromTail(TGeneModelList &suspect_aligns, TGeneModelList &bad_aligns, bool leftwall, bool rightwall, bool leftanchor, bool rightanchor)
HMM model parameters: just create the object and pass it to a Gnomon engine.
static bool RangeNestedInIntron(TSignedSeqRange r, const CGeneModel &algn, bool check_in_holes=true)
iterator_bool insert(const value_type &val)
const_iterator find(const key_type &key) const
const_iterator end() const
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
vector< TResidue > CResidueVec
bool Precede(TSignedSeqRange l, TSignedSeqRange r)
bool Include(TSignedSeqRange big, TSignedSeqRange small)
list< CGeneModel > TGeneModelList
vector< CInDelInfo > TInDels
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define ERASE_ITERATE(Type, Var, Cont)
Non-constant version with ability to erase current element, if container permits.
int TSignedSeqPos
Type for signed sequence position.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non-constant version of the ITERATE macro.
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
void AddFlag(const string &name, const string &comment, CBoolEnum< EFlagValue > set_value=eFlagHasValueIfSet, TFlags flags=0)
Add description for flag argument.
void AddKey(const string &name, const string &synopsis, const string &comment, EType type, TFlags flags=0)
Add description for mandatory key.
void SetCurrentGroup(const string &group)
Set current arguments group name.
void AddDefaultKey(const string &name, const string &synopsis, const string &comment, EType type, const string &default_value, TFlags flags=0, const string &env_var=kEmptyStr, const char *display_value=nullptr)
Add description for optional key with default value.
@ eInputFile
Name of file (must exist and be readable)
@ eDouble
Convertible into a floating point number (double)
@ eInteger
Convertible into an integer number (int or Int8)
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
int64_t Int8
8-byte (64-bit) signed integer
position_type GetLength(void) const
bool NotEmpty(void) const
bool IntersectingWith(const TThisType &r) const
CRange< TSignedSeqPos > TSignedSeqRange
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
void SetFrom(TFrom value)
Assign a value to From data member.
TTo GetTo(void) const
Get the To member data.
TFrom GetFrom(void) const
Get the From member data.
void SetTo(TTo value)
Assign a value to To data member.
unsigned int
A callback function used to compare two keys in a database.
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines command line argument related classes.
Defines unified interface to application:
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
static const TDS_WORD limits[]
virtual void transform_model(CGeneModel &a)
RemoveTrailingNs(const CResidueVec &seq)