51 #include <unordered_set>
52 #include <unordered_map>
211 :m_hmm_params(hmm_params), m_gnomon(gnomon), m_edited_contig_map(edited_contig_map), m_limits(
limits), m_contig_acc(contig_acc), m_idnext(1), m_idinc(1)
217 return m_data->MakeChains(models);
231 m_align(0), m_cds_info(0), m_align_map(0), m_left_member(0), m_right_member(0), m_sink_for_contained(0),
232 m_copy(0), m_contained(0), m_identical_count(0),
233 m_left_num(0), m_right_num(0), m_num(0),
234 m_splice_weight(0), m_left_splice_num(0), m_right_splice_num(0), m_splice_num(0),
235 m_type(
eCDS), m_left_cds(0), m_right_cds(0), m_cds(0), m_included(
false), m_postponed(
false), m_internal(
false),
236 m_marked_for_deletion(
false), m_marked_for_retention(
false), m_restricted_to_start(
false),
237 m_gapped_connection(
false), m_fully_connected_to_part(-1), m_not_for_chaining(
false),
243 void MarkIncludedForChain();
244 void MarkPostponed();
245 void MarkPostponedForChain();
248 TContained CollectCodingContainedForMemeber();
265 int m_type, m_left_cds, m_right_cds, m_cds;
286 tuple<TIDMap, TSignedSeqRange> PeaksAndLimits(
EStatus determinant,
int min_blob_weight,
int max_empty_dist,
int min_splice_dist);
287 tuple<TIVec, TSignedSeqRange> MainPeaks(
TIDMap& peak_weights,
double secondary_peak,
double tertiary_peak,
double tertiary_peak_coverage,
bool right_end);
293 void RestoreTrimmedEnds(
int trim);
294 void RemoveFshiftsFromUTRs();
296 void SetOpenForPartialyAlignedProteins(map<
string, pair<bool,bool> >& prot_complet);
297 pair<bool,bool> ValidPolyA(
int pos,
const CResidueVec& contig);
298 void ClipToCap(
int min_cap_blob,
int max_dist,
int min_flank_exon,
double secondary_peak,
bool recalulate_support =
true );
299 void ClipToPolyA(
const CResidueVec& contig,
int min_polya_blob,
int max_dist,
int min_flank_exon,
double secondary_peak,
double tertiary_peak,
double tertiary_peak_coverage,
bool recalulate_support =
true);
300 void CheckSecondaryCapPolyAEnds();
301 void ClipLowCoverageUTR(
double utr_clip_threshold,
bool recalulate_support =
true);
302 void CalculateDropLimits();
303 void CalculateSupportAndWeightFromMembers(
bool keep_all_evidence =
false);
307 void SetConfirmedStartStopForCompleteProteins(map<
string, pair<bool,bool> >& prot_complet,
const SMinScor& minscor);
310 void SetConsistentCoverage();
312 bool HarborsNested(
const CChain& other_chain,
bool check_in_holes)
const;
313 bool HarborsNested(
const CGene& other_gene,
bool check_in_holes)
const;
315 bool HasTrustedEvidence()
const;
339 typedef list<CGeneModel>::iterator
TIt;
343 bool IsAlternative(
const CChain&
a)
const;
344 bool IsAllowedAlternative(
const ncbi::gnomon::CGeneModel&,
int maxcomposite)
const;
347 bool Nested()
const {
return !m_nested_in_genes.empty(); }
349 bool HarborsNested(
const CChain& other_chain,
bool check_in_holes)
const;
350 bool HarborsNested(
const CGene& other_gene,
bool check_in_holes)
const;
354 set<CGene*> RemoveGeneFromOtherGenesSets();
370 (*i)->RemoveFromHarbored(
this);
372 (*i)->RemoveFromNestedIn(
this);
374 return m_harbors_genes;
382 if(RealCdsLimits().NotEmpty())
383 gene_lim_for_nested =
front()->OpenCds() ?
front()->MaxCdsLimits() : RealCdsLimits();
389 if(RealCdsLimits().NotEmpty() && (*it)->ReadingFrame().Empty())
392 if((*it)->ReadingFrame().NotEmpty())
393 model_lim_for_nested = (*it)->OpenCds() ? (*it)->MaxCdsLimits() : (*it)->RealCdsLimits();
412 return HarborsRange(other_lim_for_nested, check_in_holes);
424 return HarborsRange(other_lim_for_nested, check_in_holes);
435 common_cds += (ib->Limits()&
b.RealCdsLimits()&ia->Limits()&
a.RealCdsLimits()).
GetLength();
449 m_real_cds_limits +=
a.RealCdsLimits();
450 m_maxscore =
max(m_maxscore,
a.Score());
459 if (
a.Support().empty()) {
465 if(s->IsCore() && ++composite > maxcomposite)
return false;
468 if(
a.PStop(
false) || !
a.FrameShifts().empty())
475 vector<TSignedSeqRange> gene_gapfill_exons;
478 gene_gapfill_exons.push_back(e->
Limits());
480 vector<TSignedSeqRange> a_gapfill_exons;
483 a_gapfill_exons.push_back(e->
Limits());
485 if(gene_gapfill_exons != a_gapfill_exons)
488 bool a_share_intron =
false;
491 set<TSignedSeqRange> b_introns;
492 for(
int i = 1;
i < (
int)
b.Exons().size(); ++
i) {
493 if(
b.Exons()[
i-1].m_ssplice &&
b.Exons()[
i].m_fsplice) {
495 b_introns.insert(intron);
499 bool a_has_new_intron =
false;
500 for(
int i = 1;
i < (
int)
a.Exons().size(); ++
i) {
501 if(
a.Exons()[
i-1].m_ssplice &&
a.Exons()[
i].m_fsplice &&
a.Exons()[
i-1].m_ssplice_sig !=
"XX" &&
a.Exons()[
i].m_fsplice_sig !=
"XX") {
503 if(b_introns.insert(intron).second)
504 a_has_new_intron =
true;
506 a_share_intron =
true;
510 if(a_has_new_intron) {
512 }
else if(!gene_gapfill_exons.empty()) {
514 }
else if(
a.RealCdsLimits().NotEmpty() &&
b.RealCdsLimits().NotEmpty() && !
a.RealCdsLimits().IntersectingWith(
b.RealCdsLimits()) && (!
a.TrustedmRNA().empty() || !
a.TrustedProt().empty())) {
519 }
else if(
a.RealCdsLen() <=
b.RealCdsLen()){
524 return (a_share_intron || gene_gapfill_exons.empty());
531 if (
a.Strand() !=
front()->Strand())
534 bool gene_has_trusted =
false;
536 if((*it)->HasTrustedEvidence()) {
537 gene_has_trusted =
true;
542 bool has_common_splice =
false;
546 has_common_splice =
true;
551 if(has_common_splice && (!gene_has_trusted || !
a.HasTrustedEvidence()))
554 if(
a.ReadingFrame().NotEmpty() && RealCdsLimits().NotEmpty()) {
555 CAlignMap amap(
a.Exons(),
a.FrameShifts(),
a.Strand(),
a.GetCdsInfo().Cds());
557 for(
unsigned int j = 0; j <
a.Exons().
size(); ++j) {
558 for(
TSignedSeqPos k =
max(
a.Exons()[j].GetFrom(),
a.GetCdsInfo().Cds().GetFrom()); k <=
min(
a.Exons()[j].GetTo(),
a.GetCdsInfo().Cds().GetTo()); ++k) {
560 _ASSERT(p < (
int)acds_map.size());
567 bool has_common_cds =
false;
570 if(!
a.GetCdsInfo().Cds().IntersectingWith((*it)->GetCdsInfo().Cds()))
573 CAlignMap gmap((*it)->Exons(), (*it)->FrameShifts(), (*it)->Strand(), (*it)->GetCdsInfo().Cds());
575 for(
unsigned int j = 0; j < (*it)->Exons().
size(); ++j) {
576 for(
TSignedSeqPos k =
max((*it)->Exons()[j].GetFrom(),(*it)->GetCdsInfo().Cds().GetFrom()); k <=
min((*it)->Exons()[j].GetTo(),(*it)->GetCdsInfo().Cds().GetTo()); ++k) {
578 _ASSERT(p < (
int)cds_map.size());
584 for(
unsigned int i = 0;
i < acds_map.size(); ) {
586 for( ; j < cds_map.size() && (acds_map[
i] != cds_map[j] ||
i%3 != j%3); ++j);
587 if(j == cds_map.size()) {
593 for( ; j < cds_map.size() &&
i < acds_map.size() && acds_map[
i] == cds_map[j]; ++j, ++
i, ++
count);
596 has_common_cds =
true;
605 return has_common_cds;
608 return has_common_splice;
613 if (!
a.Support().empty() &&
b.Support().empty())
615 else if (
a.Support().empty() && !
b.Support().empty())
619 bool atrusted = !
a.TrustedmRNA().empty() || !
a.TrustedProt().empty();
620 bool btrusted = !
b.TrustedmRNA().empty() || !
b.TrustedProt().empty();
621 if(atrusted && !btrusted) {
623 }
else if(btrusted && !atrusted) {
625 }
else if(
a.ReadingFrame().NotEmpty() &&
b.ReadingFrame().Empty()) {
627 }
else if(
b.ReadingFrame().NotEmpty() &&
a.ReadingFrame().Empty()) {
629 }
else if(
a.ReadingFrame().NotEmpty()) {
631 double ds = 0.05*
fabs(
a.Score());
632 double as =
a.Score();
642 ds = 0.05*
fabs(
b.Score());
643 double bs =
b.Score();
657 else if(
a.m_splice_weight >
b.m_splice_weight)
659 else if(
a.m_splice_weight <
b.m_splice_weight)
661 else if(
a.Weight() >
b.Weight())
663 else if(
a.Weight() <
b.Weight())
665 else if(
a.Limits().GetLength() !=
b.Limits().GetLength())
666 return (
a.Limits().GetLength() <
b.Limits().GetLength());
668 return a.ID() <
b.ID();
670 double asize =
a.m_splice_weight;
671 double bsize =
b.m_splice_weight;
672 double ds = 0.025*(asize+bsize);
690 else if(bsize > asize)
692 else if(
a.Limits().GetLength() !=
b.Limits().GetLength())
693 return (
a.Limits().GetLength() <
b.Limits().GetLength());
695 return a.ID() <
b.ID();
714 bool gene_good_enough_to_be_annotation = allow_partialalts || gene.front()->GoodEnoughToBeAnnotation();
717 TSignedSeqRange gene_cds = (gene.size() > 1 || gene.front()->CompleteCds() || algn_good_enough_to_be_annotation) ? gene.
RealCdsLimits() : gene.front()->MaxCdsLimits();
720 if(!gene_good_enough_to_be_annotation && !algn_good_enough_to_be_annotation) {
722 for(
int i = 1;
i < (
int)
b.Exons().size(); ++
i) {
723 if(
b.Exons()[
i].m_ssplice_sig ==
"XX" &&
b.Exons()[
i].m_fsplice_sig ==
"XX" &&
b.Exons()[
i].Limits().IntersectingWith(gene_cds)) {
728 for(
int i = 1;
i < (
int)algn.
Exons().size(); ++
i) {
729 if(algn.
Exons()[
i].m_ssplice_sig ==
"XX" && algn.
Exons()[
i].m_fsplice_sig ==
"XX" && algn.
Exons()[
i].Limits().IntersectingWith(algn_cds)) {
747 return eNotCompatible;
748 }
else if(algn.
RealCdsLen() > altfrac/100*gene.front()->RealCdsLen() || algn.
Score() > altfrac/100*gene.front()->Score()) {
753 return eNotCompatible;
757 set<TSignedSeqRange> gene_gapfill_introns;
758 set<TSignedSeqRange> align_gapfill_introns;
761 for(
int i = 1;
i < (
int)
b.Exons().size(); ++
i) {
762 if(
b.Exons()[
i-1].m_ssplice_sig ==
"XX" ||
b.Exons()[
i].m_fsplice_sig ==
"XX") {
764 gene_gapfill_introns.insert(intron);
768 for(
int i = 1;
i < (
int)algn.
Exons().size(); ++
i) {
769 if(algn.
Exons()[
i-1].m_ssplice_sig ==
"XX" || algn.
Exons()[
i].m_fsplice_sig ==
"XX") {
771 align_gapfill_introns.insert(intron);
774 ITERATE(set<TSignedSeqRange>, ig, gene_gapfill_introns) {
775 ITERATE(set<TSignedSeqRange>, ia, align_gapfill_introns) {
776 if(ig->IntersectingWith(*ia))
777 return eNotCompatible;
781 if(algn.
HarborsNested(gene, gene_good_enough_to_be_annotation))
784 if(gene.
HarborsNested(algn, algn_good_enough_to_be_annotation))
795 return eNotCompatible;
799 if(gene_good_enough_to_be_annotation && algn_good_enough_to_be_annotation) {
800 if(gene.front()->Strand() != algn.
Strand() && allow_opposite_strand &&
811 return eNotCompatible;
818 for(TChainPointerList::iterator itloop = not_placed_yet.begin(); itloop != not_placed_yet.end(); ) {
819 TChainPointerList::iterator it = itloop++;
827 list<CGene*> possibly_nested;
829 bool good_model =
true;
830 for(list<CGene>::iterator itl = alts.begin(); good_model && itl != alts.end(); ++itl) {
835 possibly_nested.push_back(&(*itl));
845 alts.push_back(
CGene());
849 alts.back().Insert(algn);
850 not_placed_yet.erase(it);
853 ITERATE(list<CGene*>, itl, possibly_nested) {
854 (*itl)->AddToNestedIn(&alts.back());
855 alts.back().AddToHarbored(*itl);
864 for(TChainPointerList::iterator itloop = not_placed_yet.begin(); itloop != not_placed_yet.end(); ) {
865 TChainPointerList::iterator it = itloop++;
868 list<list<CGene>::iterator> included_in;
869 list<CGene*> possibly_nested;
870 list<CGene*> nested_in;
872 bool good_model =
true;
873 for(list<CGene>::iterator itl = alts.begin(); good_model && itl != alts.end(); ++itl) {
878 nested_in.push_back(&(*itl));
881 possibly_nested.push_back(&(*itl));
886 included_in.push_back(itl);
889 case eNotCompatibleNested:
890 if(itl->IsAlternative(algn))
891 included_in.push_back(itl);
904 CGene& gene = *included_in.front();
905 CChain& model = *gene.front();
912 if(algn_cds_len < 0.8*model_cds_len)
918 not_placed_yet.push_back(gene.front());
922 ITERATE(list<CGene*>, itl, nested_in) {
924 (*itl)->AddToHarbored(&gene);
926 ITERATE(list<CGene*>, itl, possibly_nested) {
927 (*itl)->AddToNestedIn(&gene);
931 not_placed_yet.erase(it);
939 for(TChainPointerList::iterator itloop = not_placed_yet.begin(); itloop != not_placed_yet.end(); ) {
940 TChainPointerList::iterator it = itloop++;
943 list<list<CGene>::iterator> included_in;
944 list<CGene*> possibly_nested;
946 bool good_model =
true;
947 for(list<CGene>::iterator itl = alts.begin(); good_model && itl != alts.end(); ++itl) {
952 possibly_nested.push_back(&(*itl));
956 included_in.push_back(itl);
964 if(good_model && !included_in.empty() && (allow_partialalts || included_in.front()->front()->GoodEnoughToBeAnnotation())) {
965 if(included_in.size() == 1) {
970 CGene& gene = *included_in.front();
972 not_placed_yet.erase(it);
974 ITERATE(list<CGene*>, itl, possibly_nested) {
976 (*itl)->AddToNestedIn(&gene);
982 bool allow_connection =
false;
985 bool cds_overlap =
true;
991 ITERATE(list<list<CGene>::iterator>, k, included_in) {
992 if(!(*k)->IsAlternative(
a)) {
1003 allow_connection =
true;
1007 if(allow_connection) {
1008 CGene& gene = *included_in.front();
1011 ITERATE(list<list<CGene>::iterator>, k, included_in) {
1012 if(k != included_in.begin()) {
1015 if(CheckCompatibility(*included_in.front(), **
l) == eAlternative) {
1017 (*l)->AddComment(
"Pass2b");
1019 included_in.front()->Insert(**
l);
1021 not_placed_yet.push_back(*
l);
1024 TChainPointerList::iterator idest = itloop;
1026 not_placed_yet.insert(idest, *
l);
1029 set<CGene*> nested_genes = (*k)->RemoveGeneFromOtherGenesSets();
1030 ITERATE(set<CGene*>,
i, nested_genes)
1031 possibly_nested.push_back(*
i);
1035 not_placed_yet.erase(it);
1037 ITERATE(list<CGene*>, itl, possibly_nested) {
1039 (*itl)->AddToNestedIn(&gene);
1055 list<CGene>::iterator included_in(alts.end());
1056 list<CGene*> possibly_nested;
1057 list<CGene*> nested_in;
1059 bool good_model =
true;
1060 for(list<CGene>::iterator itl = alts.begin(); good_model && itl != alts.end(); ++itl) {
1061 ECompat cmp = CheckCompatibility(*itl, algn);
1064 case eNotCompatibleNested:
1065 case eNotCompatible:
1066 rejected.push_back(&algn);
1068 ost <<
"Trumped by another model " << itl->front()->ID();
1070 if(
cmp == eNotCompatibleNested)
1075 if(!allow_partialalts && !itl->front()->GoodEnoughToBeAnnotation()) {
1076 rejected.push_back(&algn);
1078 ost <<
"Trumped by another model " << itl->front()->ID();
1081 }
else if(included_in == alts.end()) {
1085 rejected.push_back(&algn);
1087 ost <<
"Connects two genes " << itl->front()->ID() <<
" " << included_in->front()->ID();
1092 nested_in.push_back(&(*itl));
1095 possibly_nested.push_back(&(*itl));
1103 if(included_in != alts.end()) {
1107 included_in->Insert(algn);
1108 genep = &(*included_in);
1110 alts.push_back(
CGene());
1111 genep = &alts.back();
1115 alts.back().Insert(algn);
1117 ITERATE(list<CGene*>, itl, nested_in) {
1118 if((*itl)->HarborsNested(*genep,
true)) {
1120 (*itl)->AddToHarbored(genep);
1123 ITERATE(list<CGene*>, itl, possibly_nested) {
1125 (*itl)->AddToNestedIn(genep);
1139 TChainPointerList::iterator jt_loop = it;
1140 for(++jt_loop; jt_loop != not_placed_yet.end();) {
1141 TChainPointerList::iterator jt = jt_loop++;
1145 ost <<
"Trumped by similar chain " << ai.
ID();
1147 rejected.push_back(&aj);
1148 not_placed_yet.erase(jt);
1156 for(TChainPointerList::iterator it_loop = not_placed_yet.begin(); it_loop != not_placed_yet.end();) {
1157 TChainPointerList::iterator it = it_loop++;
1164 vector<const CChain*> candidates;
1169 candidates.push_back(&aj);
1173 for (
size_t i = 0; alive &&
i < candidates.size(); ++
i) {
1174 for (
size_t j =
i+1; alive && j < candidates.size(); ++j) {
1175 if(!candidates[
i]->Limits().IntersectingWith(candidates[j]->Limits())) {
1177 ost <<
"Overlapping tandem " << candidates[
i]->ID() - ai.
ID() <<
" " << candidates[j]->ID() - ai.
ID();
1179 rejected.push_back(*it);
1180 not_placed_yet.erase(it);
1195 it->SetGeneID(it->ID());
1196 it->SetRankInGene(0);
1197 not_placed_yet.push_back(&(*it));
1204 FilterOutSimilarsWithLowerScore(not_placed_yet, bad_aligns);
1205 FilterOutTandemOverlap(not_placed_yet, bad_aligns, 80);
1207 FindGeneSeeds(alts, not_placed_yet);
1208 ReplacePseudoGeneSeeds(alts, not_placed_yet);
1209 FindAltsForGeneSeeds(alts, not_placed_yet);
1210 PlaceAllYouCan(alts, not_placed_yet, bad_aligns);
1215 (*l)->SetGeneID(k->front()->ID());
1216 (*l)->SetRankInGene(++rank);
1245 if(alimits == blimits)
1283 gmembers.insert(&m);
1292 if(members_genes.empty())
1297 typedef map<CGene*,list<SChainMember*> > TGeneToMembers;
1298 typedef map<TIdLim, TGeneToMembers> TMembersInDiffGenes;
1299 TMembersInDiffGenes members_in_different_genes;
1303 CGene* genep = members_genes.front().second;
1304 members_in_different_genes[idlim][genep].push_back(mp);
1306 for(
int i = 1;
i < (
int)members_genes.size(); ++
i) {
1310 CGene* genep = members_genes[
i].second;
1311 if(idlim_prev != idlim) {
1312 TMembersInDiffGenes::iterator it = members_in_different_genes.find(idlim_prev);
1313 if(it->second.size() < 2)
1314 members_in_different_genes.erase(it);
1316 members_in_different_genes[idlim][genep].push_back(mp);
1321 TMembersInDiffGenes::iterator it = members_in_different_genes.find(idlim);
1322 if(it->second.size() < 2)
1323 members_in_different_genes.erase(it);
1326 ITERATE(TMembersInDiffGenes, imdg, members_in_different_genes) {
1327 ITERATE(TGeneToMembers, ig1, imdg->second) {
1328 CGene& gene1 = *ig1->first;
1336 typedef map<CChain*,TMemberPtrSet> TConflictMemebersInChains;
1337 TConflictMemebersInChains conflict_members_in_chains;
1339 ITERATE(TMembersInDiffGenes, imdg, members_in_different_genes) {
1340 ITERATE(TGeneToMembers, ig1, imdg->second) {
1341 CGene& gene1 = *ig1->first;
1343 CChain* chain1p_orig = *ic1;
1345 for(list<SChainMember*>::const_iterator im = ig1->second.begin(); im != ig1->second.end() && mbr1p_orig == 0; ++im) {
1346 if(binary_search(chain1p_orig->
m_members.begin(),chain1p_orig->
m_members.end(),*im, std::less<SChainMember*>()))
1349 for(TGeneToMembers::const_iterator ig2 = imdg->second.begin(); mbr1p_orig != 0 && ig2 != ig1; ++ig2) {
1350 CGene& gene2 = *ig2->first;
1352 CChain* chain1p = chain1p_orig;
1356 for(list<SChainMember*>::const_iterator im = ig2->second.begin(); im != ig2->second.end() && mbr2p == 0; ++im) {
1357 if(binary_search(chain2p->
m_members.begin(),chain2p->
m_members.end(),*im, std::less<SChainMember*>()))
1364 if(chain1p->
Exons().size() > 1)
1367 if(chain2p->
Exons().size() > 1)
1372 swap(chain1p,chain2p);
1377 CChain& chain1 = *chain1p;
1378 CChain& chain2 = *chain2p;
1382 conflict_members_in_chains[&chain2].insert(mbr2p);
1384 conflict_members_in_chains[&chain1].insert(mbr1p);
1385 }
else if(
Precede(core1,core2)) {
1387 conflict_members_in_chains[&chain2].insert(mbr2p);
1388 else if(
Precede(core2,align_lim))
1389 conflict_members_in_chains[&chain1].insert(mbr1p);
1393 conflict_members_in_chains[&chain1].insert(mbr1p);
1395 conflict_members_in_chains[&chain2].insert(mbr2p);
1398 conflict_members_in_chains[&chain1].insert(mbr1p);
1400 conflict_members_in_chains[&chain2].insert(mbr2p);
1403 conflict_members_in_chains[&chain2].insert(mbr2p);
1405 conflict_members_in_chains[&chain1].insert(mbr1p);
1407 conflict_members_in_chains[&chain1].insert(mbr1p);
1408 conflict_members_in_chains[&chain2].insert(mbr2p);
1412 conflict_members_in_chains[&chain1].insert(mbr1p);
1413 conflict_members_in_chains[&chain2].insert(mbr2p);
1422 for(
CGene& gene : genes) {
1423 for(
CChain* chainp : gene)
1439 ITERATE(TConflictMemebersInChains, it, conflict_members_in_chains) {
1440 CChain& chain = *it->first;
1446 hard_limits = (hard_limits & chain.
Limits());
1523 }
else if(alim.
GetTo() > noclip_limits.
GetTo()) {
1532 int left_splice = -1;
1533 int right_splice = -1;
1534 for(
int e = 1; e < (
int)chain.
Exons().size(); ++e) {
1535 if(left_splice < 0 && chain.
Exons()[e-1].m_ssplice &&
Include(new_limits,chain.
Exons()[e-1].GetTo()))
1536 left_splice = chain.
Exons()[e-1].GetTo();
1537 if(chain.
Exons()[e].m_fsplice &&
Include(new_limits,chain.
Exons()[e].GetFrom()))
1538 right_splice = chain.
Exons()[e].GetFrom();
1541 double left_weights_total = 0.;
1543 double right_weights_total = 0.;
1549 for(
int e = 1; e < (
int)
a.Exons().size(); ++e) {
1550 if(
a.Exons()[e-1].m_ssplice &&
a.Exons()[e-1].GetTo() == left_splice) {
1551 left_weights[alim.
GetFrom()] +=
a.Weight();
1552 left_weights_total +=
a.Weight();
1554 if(
a.Exons()[e].m_fsplice &&
a.Exons()[e].GetFrom() == right_splice) {
1555 right_weights[alim.
GetTo()] +=
a.Weight();
1556 right_weights_total +=
a.Weight();
1560 if(left_weights_total > 0.) {
1563 for(map<int,double>::reverse_iterator it = left_weights.rbegin(); it != left_weights.rend(); ++it) {
1564 if(
t < 0.9*left_weights_total)
1568 if(left < new_limits.
GetFrom())
1571 if(right_weights_total > 0.) {
1575 if(
t < 0.9*right_weights_total)
1579 if(right > new_limits.
GetTo())
1580 new_limits.
SetTo(right);
1595 if(new_limits != chain.
Limits()) {
1601 note +=
" overlap UTR clip";
1605 bool wasopen = chain.
OpenCds();
1611 m_gnomon->GetScore(chain, !no5pextension);
1613 if(wasopen != chain.
OpenCds() && (wasopen ==
false || cds.
HasStart())) {
1628 deque<const SChainMember*> not_visited(1,
this);
1629 while(!not_visited.empty()) {
1635 if(c < mbr->m_identical_count) {
1636 if(included_in_list.insert(mi).second) {
1637 contained.push_back(mi);
1639 included_in_list.insert(mi->
m_copy->begin(),mi->
m_copy->end());
1641 }
else if(included_in_list.find(mi) == included_in_list.end()) {
1642 not_visited.push_back(mi);
1645 not_visited.pop_front();
1653 AddCodingToContained(contained, included_in_list);
1663 AddCodingToContained(contained, included_in_list);
1666 left->AddCodingToContained(contained, included_in_list);
1670 right->AddCodingToContained(contained, included_in_list);
1679 deque<const SChainMember*> not_visited(1,
this);
1680 while(!not_visited.empty()) {
1684 if(c < mbr->m_identical_count) {
1685 if(included_in_list.insert(mi).second) {
1686 contained.push_back(mi);
1688 included_in_list.insert(mi->
m_copy->begin(),mi->
m_copy->end());
1690 }
else if(included_in_list.find(mi) == included_in_list.end()) {
1691 not_visited.push_back(mi);
1694 not_visited.pop_front();
1702 AddToContained(contained, included_in_list);
1712 AddToContained(contained, included_in_list);
1715 left->AddToContained(contained, included_in_list);
1719 right->AddToContained(contained, included_in_list);
1725 #define START_BONUS 600
1743 TContained contained = CollectContainedForChain();
1767 TContained contained = CollectContainedForChain();
1776 TContained contained = CollectContainedForChain();
1844 if(alimits == blimits)
1895 if(alimits == blimits)
1934 sort(container.begin(),container.end());
1935 container.erase( unique(container.begin(),container.end()), container.end() );
1945 void InsertMemberCopyWithoutCds(
SChainMember* copy_ofp);
1960 m_members.splice(m_members.end(),other.
m_members);
1961 m_copylist.splice(m_copylist.end(),other.
m_copylist);
1962 m_align_maps.splice(m_align_maps.end(),other.
m_align_maps);
1964 m_extra_cds.splice(m_extra_cds.end(),other.
m_extra_cds);
1965 insert(end(),other.begin(),other.end());
1973 InsertMember(mbr, copy_ofp);
1978 m_extra_cds.push_back(cds);
1979 InsertMemberCopyWithCds(m_extra_cds.back(), copy_ofp);
1987 InsertMember(mbr, copy_ofp);
2003 InsertMember(mbr, copy_ofp);
2009 m_members.push_back(m);
2010 push_back(&m_members.back());
2013 m_members.back().m_contained = &m_containedlist.back();
2019 m_members.back().m_align_map = &m_align_maps.back();
2021 m_members.back().m_align_map = copy_ofp->
m_align_map;
2025 if(copy_ofp->
m_copy == 0) {
2026 m_copylist.push_back(
TContained(1,copy_ofp));
2027 copy_ofp->
m_copy = &m_copylist.back();
2029 m_members.back().m_copy = copy_ofp->
m_copy;
2030 copy_ofp->
m_copy->push_back(&m_members.back());
2039 InsertMember(new_mbr, copy_ofp);
2047 InsertMember(*itcl);
2048 m_members.back().m_orig_align = orig_aligns[itcl->ID()];
2049 if(unmodified_aligns.count(itcl->ID()))
2050 m_members.back().m_unmd_align = &unmodified_aligns[itcl->ID()];
2072 bool small_flex =
false;
2082 if(big_limits == small_limits) {
2100 m_data->CutParts(models);
2106 if(!parts.empty()) {
2107 models.splice(models.begin(),parts);
2115 size_t initial_size = pointers.size();
2116 for(
size_t i = 0;
i < initial_size; ++
i) {
2123 clust.push_back(new_algn);
2131 size_t initial_size = pointers.size();
2132 for(
size_t i = 0;
i < initial_size; ++
i) {
2145 map<int, set<int> > oriented_splices;
2146 ITERATE(set<TSignedSeqRange>,
i, oriented_introns_plus) {
2147 oriented_splices[
ePlus].insert(
i->GetFrom());
2148 oriented_splices[
ePlus].insert(
i->GetTo());
2150 ITERATE(set<TSignedSeqRange>,
i, oriented_introns_minus) {
2151 oriented_splices[
eMinus].insert(
i->GetFrom());
2152 oriented_splices[
eMinus].insert(
i->GetTo());
2173 typedef vector<pair<CCDSInfo::SPStop,TSignedSeqRange> > TPstopIntron;
2174 TPstopIntron pstops_with_intron_plus;
2175 TPstopIntron pstops_with_intron_minus;
2179 TPstopIntron& pstops_with_intron = (algn.
Strand() ==
ePlus) ? pstops_with_intron_plus : pstops_with_intron_minus;
2182 left =
min(left,s->GetFrom());
2183 right =
max(right,s->GetTo());
2184 if(s->GetLength() == 3) {
2187 for(
int i = 1;
i < (
int)algn.
Exons().size(); ++
i) {
2189 pstops_with_intron.push_back(make_pair(*s,intron));
2195 uniq(pstops_with_intron_plus);
2196 uniq(pstops_with_intron_minus);
2208 TPstopIntron& pstops_with_intron = (algn.
Strand() ==
ePlus) ? pstops_with_intron_plus : pstops_with_intron_minus;
2209 if(pstops_with_intron.empty())
2216 ITERATE(TPstopIntron,
si, pstops_with_intron) {
2217 if(
si->second.GetLength() == 1) {
2221 for(
int i = 1;
i < (
int)algn.
Exons().size(); ++
i) {
2223 if(
si->second == intron &&
si->first == *s)
2236 ITERATE(TPstopIntron,
si, pstops_with_intron) {
2241 for(
int i = 0;
i < (
int)exons.size(); ++
i) {
2242 if(
Include(exons[
i].Limits(),
si->first.GetFrom())) {
2243 if(
si->second.GetLength() == 1) {
2244 if(
si->first.GetTo() <= exons[
i].GetTo())
2247 if(
i < (
int)exons.size()-1) {
2249 if(intron ==
si->second &&
si->first.GetTo() <= exons[
i+1].GetTo())
2274 double ms = GoodCDNAScore(algn);
2275 RemovePoorCds(algn,
ms);
2284 size_t initial_size = pointers.size();
2285 for(
size_t i = 0;
i < initial_size; ++
i) {
2310 initial_size = pointers.size();
2311 for(
unsigned int i = 0;
i < initial_size; ++
i) {
2343 if(mbr.
m_copy->front()->m_align->Strand() == algn.
Strand()) {
2361 if(indl->InDelEnd() > lim.
GetFrom() && indl->Loc() <= lim.
GetTo())
2362 fs.push_back(*indl);
2400 if(!jflex && jlimits.
GetTo()-ai_max_cds.
GetFrom() >= 5)
2408 if(!jflex && ai_max_cds.
GetTo()-jlimits.
GetFrom() >= 5)
2431 if(
abs(j_from-i_from)%3 != 0)
2448 set<int> left_exon_ends, right_exon_ends;
2451 for(
int i = 1;
i < (
int)algn.
Exons().size(); ++
i) {
2452 if(algn.
Exons()[
i-1].m_ssplice && algn.
Exons()[
i].m_fsplice) {
2464 if(ri != right_exon_ends.
end())
2468 if(li != left_exon_ends.
end())
2477 for(
int i = 0;
i < (
int)pointers.size(); ++
i) {
2489 string ssplice = ai.
Exons()[
i-1].m_ssplice_sig;
2490 string fsplice = ai.
Exons()[
i].m_fsplice_sig;
2491 if(ssplice ==
"XX" || fsplice ==
"XX")
2493 else if(ai.
Strand() ==
ePlus && ((ssplice !=
"GT" && ssplice !=
"GC") || fsplice !=
"AG"))
2495 else if(ai.
Strand() ==
eMinus && (ssplice !=
"AG" || (fsplice !=
"GT" && fsplice !=
"GC")))
2525 if(pointers[jfirst]->m_align->Limits() != ai.
Limits())
2527 for(
int j = jfirst; j < (
int)pointers.size() && pointers[j]->m_align->Limits().GetFrom() <= ai.
Limits().
GetTo(); ++j) {
2529 IncludeInContained(mi, mi);
2537 if(CanIncludeJinI(mi, mj))
2538 IncludeInContained(mi, mj);
2568 #define NON_CDNA_INTRON_PENALTY 20
2593 else if(mj.
m_type ==
eLeftUTR && (!ai_left_complete || (!j_rflexible && (aj.
Limits()&ai_rf).GetLength() > 5)))
2605 else if(mj.
m_type ==
eCDS && (!aj_right_complete || (!i_lflexible && (ai.
Limits()&aj_rf).GetLength() > 5)))
2619 if(j_rflexible || i_lflexible)
2621 if((ai.
Limits() & aj.
Limits()).GetLength() < intersect_limit)
2632 int cds_overlap = 0;
2636 if(genome_overlap < 0)
2648 if(cds_overlap%3 != 0)
2656 if(ai.
Exons()[
i-1].m_ssplice && ai.
Exons()[
i].m_fsplice) {
2658 if(
Include(ai_rf,intron) &&
Include(aj_rf,intron) && mrna_count[intron]+est_count[intron]+rnaseq_count[intron] == 0) {
2666 delta_cds = mi.
m_cds-cds_overlap;
2669 delta_splice_num = 0;
2670 if(delta_cds >= 0) {
2674 if(!j_rflexible && !i_lflexible)
2675 first = upper_bound(contained.begin(), contained.end(), &mj,
LeftOrder())-contained.begin();
2678 contained.back()->m_accumulated_num = contained.back()->m_align->Weight();
2679 contained.back()->m_accumulated_splice_num = contained.back()->m_splice_weight;
2680 for(
int i = (
int)contained.size()-2;
i >=
first; --
i) {
2681 contained[
i]->m_accumulated_num = contained[
i]->m_align->Weight()+contained[
i+1]->m_accumulated_num;
2682 contained[
i]->m_accumulated_splice_num = contained[
i]->m_splice_weight+contained[
i+1]->m_accumulated_splice_num;
2686 delta_num = contained[
first]->m_accumulated_num;
2687 delta_splice_num = contained[
first]->m_accumulated_splice_num;
2701 for(
auto p : micontained) {
2716 if(ai.
Exons()[
i-1].m_ssplice && ai.
Exons()[
i].m_fsplice) {
2718 if(
Include(ai_rf,intron) && mrna_count[intron]+est_count[intron]+rnaseq_count[intron] == 0) {
2737 TIVec right_ends(pointers.size());
2738 for(
int k = 0; k < (
int)pointers.size(); ++k) {
2739 auto& kalign = *pointers[k]->m_align;
2740 int rend = kalign.Limits().GetTo();
2742 rend = kalign.Limits().GetFrom();
2743 right_ends[k] = rend;
2749 LRIinit(mi, micontained);
2750 bool not_sorted =
true;
2753 TIVec::iterator
lb = lower_bound(right_ends.begin(),right_ends.end(),ai.
Limits().
GetFrom()-2*flex_len);
2754 TContained::iterator jfirst = pointers.begin();
2755 if(
lb != right_ends.end())
2756 jfirst = pointers.begin()+(
lb-right_ends.begin());
2757 for(TContained::iterator j = jfirst; j <
i; ++j) {
2761 if(aj.
Exons().back().m_fsplice_sig ==
"XX" || ai.
Exons().front().m_ssplice_sig ==
"XX")
2769 double delta_splice_num;
2770 if(LRCanChainItoJ(delta_cds, delta_num, delta_splice_num, mi, mj, micontained, not_sorted)) {
2775 bool better_connection =
false;
2777 better_connection = (newcds > mi.
m_left_cds);
2781 better_connection =
true;
2784 if(better_connection) {
2800 TIVec left_ends(pointers.size());
2801 for(
int k = 0; k < (
int)pointers.size(); ++k) {
2802 auto& kalign = *pointers[k]->m_align;
2803 int lend = kalign.Limits().GetFrom();
2805 lend = kalign.Limits().GetTo();
2806 left_ends[k] = lend;
2820 bool not_sorted =
true;
2823 TIVec::iterator
lb = lower_bound(left_ends.begin(),left_ends.end(),ai.
Limits().
GetTo()+2*flex_len,greater<int>());
2824 TContained::iterator jfirst = pointers.begin();
2825 if(
lb != left_ends.end())
2826 jfirst = pointers.begin()+(
lb-left_ends.begin());
2827 for(TContained::iterator j = jfirst; j <
i; ++j) {
2831 if(aj.
Exons().front().m_ssplice_sig ==
"XX" || ai.
Exons().back().m_fsplice_sig ==
"XX")
2850 if(mj.
m_type ==
eRightUTR && (!ai_right_complete || (!j_lflexible && (aj.
Limits()&ai_rf).GetLength() > 5)))
2862 if(mj.
m_type ==
eCDS && (!aj_left_complete || (!i_rflexible && (ai.
Limits()&aj_rf).GetLength() > 5)))
2878 if(j_lflexible || i_rflexible)
2882 if(intersect < intersect_limit)
continue;
2893 int cds_overlap = 0;
2897 if(genome_overlap < 0)
2909 if(cds_overlap%3 != 0)
2917 if(ai.
Exons()[
i-1].m_ssplice && ai.
Exons()[
i].m_fsplice) {
2919 if(
Include(ai_rf,intron) &&
Include(aj_rf,intron) && mrna_count[intron]+est_count[intron]+rnaseq_count[intron] == 0) {
2928 int delta_cds = mi.
m_cds-cds_overlap;
2936 if(!j_lflexible && !i_rflexible)
2937 first = upper_bound(micontained.begin(),micontained.end(),&mj,
RightOrder())-micontained.begin();
2940 micontained.back()->m_accumulated_num = micontained.back()->m_align->Weight();
2941 micontained.back()->m_accumulated_splice_num = micontained.back()->m_splice_weight;
2942 for(
int i = (
int)micontained.size()-2;
i >=
first; --
i) {
2943 micontained[
i]->m_accumulated_num = micontained[
i]->m_align->Weight()+micontained[
i+1]->m_accumulated_num;
2944 micontained[
i]->m_accumulated_splice_num = micontained[
i]->m_splice_weight+micontained[
i+1]->m_accumulated_splice_num;
2948 double delta_num = micontained[
first]->m_accumulated_num;
2949 double delta_splice_num = micontained[
first]->m_accumulated_splice_num;
2954 bool better_connection =
false;
2960 better_connection =
true;
2963 if(better_connection) {
2985 vector<const SChainMember*> mal;
2988 mal.push_back(left);
2991 mal.push_back(right);
2994 string note = to_string(mi.
m_align->
ID());
2995 ITERATE(vector<const SChainMember*>, imal, mal) {
2996 note = note+
" "+to_string((*imal)->m_align->ID());
3002 map<TSignedSeqRange,int>& mrna_count, map<TSignedSeqRange,int>& est_count, map<TSignedSeqRange,int>& rnaseq_count) {
3004 for(
int i = 1;
i < (
int)chain.
Exons().size() && good; ++
i) {
3005 if(chain.
Exons()[
i-1].m_ssplice && chain.
Exons()[
i].m_fsplice) {
3016 map<TSignedSeqRange,int>& mrna_count, map<TSignedSeqRange,int>& est_count, map<TSignedSeqRange,int>& rnaseq_count) {
3019 (*i)->m_marked_for_deletion = !
GoodSupportForIntrons(*(*i)->m_align, minscor, mrna_count, est_count, rnaseq_count);
3030 if(
a.Limits() !=
b.Limits())
3031 return a.Limits() <
b.Limits();
3036 return aflex < bflex;
3038 return *orig_aligns[
a.ID()]->GetTargetId() < *orig_aligns[
b.ID()]->GetTargetId();
3049 map<tuple<int, int>, TGeneModelList::iterator> special_aligns;
3051 for(TGeneModelList::iterator it = clust.begin(); it != clust.end(); ++it) {
3054 special_aligns.emplace(make_tuple(status, it->Limits().GetTo()), it);
3058 special_aligns.emplace(make_tuple(status, it->Limits().GetFrom()), it);
3064 for(TGeneModelList::iterator it = clust.begin(); it != clust.end(); ++it) {
3075 if(it->Strand() ==
ePlus) {
3076 pos = it->Limits().GetFrom();
3080 pos = it->Limits().GetTo();
3085 galign.
Status() |= status;
3086 clust.push_front(galign);
3087 auto rslt = special_aligns.emplace(make_tuple(status, pos), clust.begin());
3089 auto ialign = rslt.first->second;
3090 ialign->SetWeight(ialign->Weight()+galign.
Weight());
3102 if(it->Strand() ==
eMinus) {
3103 pos = it->Limits().GetFrom();
3107 pos = it->Limits().GetTo();
3112 galign.
Status() |= status;
3113 clust.push_front(galign);
3114 auto rslt = special_aligns.emplace(make_tuple(status, pos), clust.begin());
3116 auto ialign = rslt.first->second;
3117 ialign->SetWeight(ialign->Weight()+galign.
Weight());
3125 for(
auto& sa : special_aligns) {
3126 auto ialign = sa.second;
3127 double min_pos_weight = ((ialign->Status()&
CGeneModel::eCap) ? min_cap_weight : min_polya_weight);
3128 if(ialign->Limits().GetFrom() < 0 || ialign->Limits().GetTo() >= contig_len || ialign->Weight() < min_pos_weight)
3129 clust.erase(ialign);
3135 confirmed_ends.clear();
3136 all_frameshifts.clear();
3139 if(use_confirmed_ends) {
3141 auto rslt = confirmed_ends.emplace(align.
Exons().front().GetTo(), align.
Exons().front().GetFrom());
3143 rslt.first->second =
min(rslt.first->second, align.
Exons().front().GetFrom());
3146 auto rslt = confirmed_ends.emplace(align.
Exons().back().GetFrom(), align.
Exons().back().GetTo());
3148 rslt.first->second =
max(rslt.first->second, align.
Exons().back().GetTo());
3152 for(
int i = 1;
i < (
int)align.
Exons().size(); ++
i) {
3153 if(align.
Exons()[
i-1].m_ssplice && align.
Exons()[
i].m_fsplice) {
3158 oriented_introns_plus.insert(intron);
3160 oriented_introns_minus.insert(intron);
3164 mrna_count[intron] += align.
Weight();
3166 est_count[intron] += align.
Weight();
3168 rnaseq_count[intron] += align.
Weight();
3173 has_rnaseq = !rnaseq_count.empty();
3174 sort(all_frameshifts.begin(),all_frameshifts.end());
3175 if(!all_frameshifts.empty())
3176 uniq(all_frameshifts);
3187 for(
int i = 1;
i < (
int)align.
Exons().size(); ++
i) {
3188 if(align.
Exons()[
i-1].m_ssplice && align.
Exons()[
i].m_fsplice) {
3190 if(oriented_introns_plus.find(intron) != oriented_introns_plus.end())
3192 if(oriented_introns_minus.find(intron) != oriented_introns_minus.end())
3196 if(pluses > 0 && minuses == 0) {
3200 }
else if(minuses > 0 && pluses == 0) {
3209 CChainMembers allpointers(clust, orig_aligns, unmodified_aligns);
3211 DuplicateNotOriented(allpointers, clust);
3212 ReplicatePStops(allpointers);
3213 ScoreCdnas(allpointers);
3214 Duplicate5pendsAndShortCDSes(allpointers);
3215 DuplicateUTRs(allpointers);
3216 CalculateSpliceWeights(allpointers);
3217 FindContainedAlignments(allpointers);
3222 if(!(*ip)->m_not_for_chaining)
3223 pointers.push_back(*
ip);
3229 coding_pointers.push_back(*
i);
3232 LeftRight(coding_pointers);
3233 RightLeft(coding_pointers);
3237 array<map<TSignedSeqPos,TSignedSeqRange>, 2> coding_right_splices;
3238 array<map<TSignedSeqPos,TSignedSeqRange>, 2> coding_left_splices;
3239 array<set<TSignedSeqRange>, 2> coding_introns;
3260 m_gnomon->GetScore(chain,
false,
false,
true);
3267 (cdslen < minscor.m_minlen || (chain.
Score() < 2*minscor.m_min && cdslen < 2*minscor.m_cds_len)))
3271 for(
int i = 1;
i < (
int)chain.
Exons().size(); ++
i) {
3273 bool coding_donor =
Include(real_cds, donor);
3275 bool coding_acceptor =
Include(real_cds, acceptor);
3277 coding_right_splices[chain.
Strand()][donor].CombineWith(real_cds);
3279 coding_left_splices[chain.
Strand()][acceptor].CombineWith(real_cds);
3280 if(coding_donor && coding_acceptor)
3281 coding_introns[chain.
Strand()].emplace(donor, acceptor);
3301 for(
auto ip : pointers) {
3306 int strand =
ip->m_align->Strand();
3307 auto& crs = coding_right_splices[strand];
3308 auto& cls = coding_left_splices[strand];
3309 for(
int i = 1;
i < (
int)
ip->m_align->Exons().size(); ++
i) {
3311 auto rslt = crs.find(rsplice);
3313 ip->m_marked_for_deletion =
true;
3317 rslt = cls.find(lsplice);
3319 ip->m_marked_for_deletion =
true;
3324 if(!
ip->m_marked_for_deletion) {
3325 auto& cdi = coding_introns[strand];
3326 for(
auto& exon :
ip->m_align->Exons()) {
3327 if(
Include(cds, exon.Limits()))
3329 for(
auto intronp = cdi.upper_bound(exon.Limits()); intronp != cdi.end() && intronp->GetFrom() < exon.GetTo(); ++intronp) {
3330 if(
Include(exon.Limits(), *intronp) && !
Include(cds, *intronp)) {
3331 ip->m_marked_for_deletion =
true;
3335 if(
ip->m_marked_for_deletion)
3343 set<TSignedSeqRange> introns;
3344 set<TSignedSeqRange> est_introns;
3345 for(
auto p : pointers) {
3348 for(
unsigned i = 1;
i < exons.size(); ++
i) {
3349 if(!exons[
i-1].m_ssplice || !exons[
i].m_fsplice)
3351 introns.emplace(exons[
i-1].GetTo(), exons[
i].GetFrom());
3353 est_introns.emplace(exons[
i-1].GetTo(), exons[
i].GetFrom());
3356 bool enough_est = !introns.empty() && est_introns.size() > longreadsthreshold/100*introns.size();
3358 cerr <<
"Introns: " << introns.size() <<
" " << est_introns.size() <<
" " << enough_est << endl;
3360 int old_oep = intersect_limit;
3362 intersect_limit = 10000;
3364 if(p->m_align->Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible))
3366 if(p->m_align->Exons().size() != 1 || p->m_type == eCDS)
3368 if(p->m_copy != nullptr) {
3369 for(SChainMember* cp : *p->m_copy) {
3370 if(cp->m_cds_info->MaxCdsLimits() == TSignedSeqRange::GetWhole())
3375 return true; }), pointers.end());
3378 LeftRight(pointers);
3379 RightLeft(pointers);
3406 RemovePoorCds(chain, GoodCDNAScore(chain,
true));
3419 chain.
ClipToCap(min_cap_blob, max_dist, min_flank_exon, secondary_peak,
false);
3420 chain.
ClipToPolyA(contig, min_polya_blob, max_dist, min_flank_exon, secondary_peak, tertiary_peak, tertiary_peak_coverage,
false);
3425 m_gnomon->GetScore(chain, !no5pextension);
3428 RemovePoorCds(chain, GoodCDNAScore(chain));
3438 tmp_chains.push_back(chain);
3445 CreateChainsForPartialProteins(tmp_chains, pointers, unma_aligns, unma_members);
3453 for(
auto i : allpointers) {
3458 if(mi.
m_copy !=
nullptr) {
3459 for(
auto j : *mi.
m_copy) {
3461 for(
auto jc : *j->m_contained) {
3470 pointers.erase(
std::remove_if(pointers.begin(),pointers.end(),[](
SChainMember* p){ return p->m_type == eRightUTR; }), pointers.end());
3475 return p->m_align->Exons().size() == 1 && !(p->m_align->Status()&(CGeneModel::eLeftFlexible|CGeneModel::eRightFlexible)); }), pointers.end());
3478 array<set<TSignedSeqRange>, 2> non_coding_introns;
3479 for(
auto p : pointers) {
3481 for(
unsigned i = 1;
i < exons.size(); ++
i) {
3486 for(
auto p : pointers) {
3488 auto& ncdi = non_coding_introns[strand];
3490 for(
auto intronp = ncdi.upper_bound(exon.Limits()); intronp != ncdi.end() && intronp->GetFrom() < exon.GetTo(); ++intronp) {
3491 if(
Include(exon.Limits(), *intronp)) {
3503 LeftRight(pointers);
3504 RightLeft(pointers);
3531 chain.
ClipToCap(min_cap_blob, max_dist, min_flank_exon, secondary_peak,
false);
3532 chain.
ClipToPolyA(contig, min_polya_blob, max_dist, min_flank_exon, secondary_peak, tertiary_peak, tertiary_peak_coverage,
false);
3542 tmp_chains.push_back(chain);
3548 chain.
SetID(m_idnext);
3550 m_idnext += m_idinc;
3553 CombineCompatibleChains(tmp_chains);
3554 SetFlagsForChains(tmp_chains);
3556 intersect_limit = old_oep;
3558 list<CGene> genes = FindGenes(tmp_chains);
3560 if(genes.size() > 1) {
3561 TrimAlignmentsIncludedInDifferentGenes(genes);
3562 CombineCompatibleChains(tmp_chains);
3563 SetFlagsForChains(tmp_chains);
3566 if(genes.size() > 1)
3567 FindGenes(tmp_chains);
3571 it->RestoreTrimmedEnds(trim);
3572 chains.push_back(*it);
3575 enum { eFirstPeak = 1, eSecondPeak = 2, eThirdPeak = 4, eAs = 8};
3576 map<tuple<int, int, int>,
int> cap_polya_info;
3578 for(
auto& chain : tmp_chains) {
3599 for(
auto&
info : cap_polya_info) {
3601 char strand = get<1>(
info.first) ==
ePlus ?
'+' :
'-';
3603 cerr <<
m_contig_acc <<
' ' << determinant <<
' ' << strand <<
' ' << pos <<
' ';
3604 if(
info.second&eFirstPeak)
3605 cerr <<
":FirstPeak";
3606 if(
info.second&eSecondPeak)
3607 cerr <<
":SecondPeak";
3608 if(
info.second&eThirdPeak)
3609 cerr <<
":ThirdPeak";
3624 return ap->
ID() < bp->
ID();
3631 TIVec right_ends(pointers.size());
3632 vector<SChainMember> no_gap_members(pointers.size());
3633 for(
int k = 0; k < (
int)pointers.size(); ++k) {
3636 no_gap_members[k] = mi;
3641 int first_member = (
int)pointers.size()-1;
3643 for(
int i = (
int)pointers.size()-1;
i >= 0; --
i) {
3645 if(limi.
GetTo() >= leftpos) {
3653 int last_member = 0;
3655 for(
int i = 0;
i < (
int)pointers.size(); ++
i) {
3659 rightpos =
max(rightpos,limi.
GetTo());
3663 int fully_connected_right = 0;
3665 for(
int i = first_member;
i <= last_member; ++
i) {
3670 LRIinit(mi, micontained);
3677 int part_to_connect = (
int)parts.size()-1;
3678 while(part_to_connect >= 0 && ai.
Limits().
GetFrom() <= parts[part_to_connect]->Limits().GetFrom())
3681 if(fully_connected_right > 0 && ai.
Limits().
GetFrom() > fully_connected_right)
3686 bool not_sorted =
true;
3688 bool compatible_with_included_parts =
true;
3689 int last_included_part = -1;
3690 bool includes_first_part =
false;
3691 for(
int p = part_to_connect+1; p < (
int)parts.size(); ++p) {
3698 bool samestop = (parts[p]->GetCdsInfo().HasStop() == mi.
m_cds_info->
HasStop() && (!parts[p]->GetCdsInfo().HasStop() || parts[p]->GetCdsInfo().Stop() == mi.
m_cds_info->
Stop()));
3700 if(compatible && samestop && samefshifts) {
3701 last_included_part = p;
3703 includes_first_part =
true;
3705 compatible_with_included_parts =
false;
3711 compatible_with_included_parts =
false;
3719 if(!compatible_with_included_parts)
3722 _ASSERT(part_to_connect < 0 || part_to_connect == (
int)parts.size()-1 || mi.
m_type ==
eCDS);
3724 if(includes_first_part) {
3729 TIVec::iterator
lb = lower_bound(right_ends.begin(),right_ends.end(),(part_to_connect >= 0 ? parts[part_to_connect]->Limits().GetTo() : ai.
Limits().
GetFrom()));
3731 if(
lb != right_ends.end())
3732 jfirst = (
int)(
lb-right_ends.begin());
3734 for(
int j = jfirst; j <
i; ++j) {
3750 #define PGAP_PENALTY 120
3765 double delta_splice_num;
3766 if(LRCanChainItoJ(delta_cds, delta_num, delta_splice_num, mi, mj, micontained, not_sorted)) {
3771 bool better_connection =
false;
3773 better_connection = (newcds > mi.
m_left_cds);
3777 better_connection =
true;
3794 better_connection =
false;
3796 better_connection = (newcds > mi_no_gap.
m_left_cds);
3800 better_connection =
true;
3831 _ASSERT(std::less<SChainMember*>()(best_right, &no_gap_members.front()) || std::less<SChainMember*>()(&no_gap_members.back(), best_right));
3834 if(!std::less<SChainMember*>()(mp->m_left_member, &no_gap_members.front()) && !std::less<SChainMember*>()(&no_gap_members.back(), mp->m_left_member)) {
3836 SChainMember* p = pointers[mp->m_left_member-&no_gap_members.front()];
3850 bool operator()(
const vector<CGeneModel*>* ap,
const vector<CGeneModel*>* bp)
3852 const vector<CGeneModel*>& partsa = *ap;
3853 const vector<CGeneModel*>& partsb = *bp;
3856 ITERATE(vector<CGeneModel*>, k, partsa)
3857 align_lena += (*k)->AlignLen();
3860 ITERATE(vector<CGeneModel*>, k, partsb)
3861 align_lenb += (*k)->AlignLen();
3863 if(align_lena != align_lenb) {
3864 return align_lena > align_lenb;
3866 return *orig_aligns[partsa.front()->ID()]->GetTargetId() < *orig_aligns[partsb.front()->ID()]->GetTargetId();
3875 typedef map<Int8, vector<CGeneModel*> > TIdChainMembermap;
3876 TIdChainMembermap protein_parts;
3877 for(
int k = 0; k < (
int)pointers_all.size(); ++k) {
3885 vector<vector<CGeneModel*>*> gapped_sorted_protein_parts;
3887 vector<CGeneModel*>& parts =
ip->second;
3888 if(parts.size() > 1) {
3890 gapped_sorted_protein_parts.push_back(&parts);
3893 sort(gapped_sorted_protein_parts.begin(),gapped_sorted_protein_parts.end(),
AlignLenOrder(orig_aligns));
3896 vector<CGeneModel*>& parts = **
ip;
3897 Int8 id = parts.front()->ID();
3900 ITERATE(vector<CGeneModel*>, k, parts) {
3909 bool connected =
false;
3914 k->AddComment(
"Was connected "+orig_aligns[palign.
ID()]->TargetAccession());
3923 for(
int k = 0; k < (
int)pointers_all.size(); ++k) {
3940 bool compatible =
true;
3951 pointers.push_back(mip);
3954 SChainMember* best_right = FindOptimalChainForProtein(pointers, parts, palign);
3958 CChain chain(*best_right,
false);
3961 if(unmodified_aligns.count(
id)) {
3963 vector<TSignedSeqRange> new_holes;
3964 vector<TSignedSeqRange> remaining_holes;
3965 for(
int k = 1; k < (
int)chain.
Exons().size(); ++k) {
3970 remaining_holes.push_back(h);
3971 for(
int piece_begin = 0; piece_begin < (
int)unma.
Exons().size(); ++piece_begin) {
3972 int piece_end = piece_begin;
3973 for( ; piece_end < (
int)unma.
Exons().size() && unma.
Exons()[piece_end].m_ssplice; ++piece_end);
3975 new_holes.push_back(h);
3978 piece_begin = piece_end;
3983 if(!new_holes.empty()) {
3990 vector<TSignedSeqRange> existed_holes;
3991 for(
int k = 1; k < (
int)unma.
Exons().size(); ++k) {
3998 for(
int k = 1; k < (
int)palign.
Exons().size(); ++k) {
4003 bool connected =
true;
4004 ITERATE(vector<TSignedSeqRange>, h, remaining_holes) {
4012 bool existed =
false;
4013 ITERATE(vector<TSignedSeqRange>, h, existed_holes) {
4020 if(connected || existed) {
4031 unmacl.push_back(unma);
4034 vector<CGeneModel*> unmaparts;
4037 unmaparts.push_back(&(*im));
4040 CChainMembers unmapointers(unmacl, orig_aligns, unmodified_aligns);
4041 Duplicate5pendsAndShortCDSes(unmapointers);
4045 IncludeInContained(mi, mi);
4049 if(CanIncludeJinI(mi, mj))
4050 IncludeInContained(mi, mj);
4056 (*ip)->m_mem_id = -(*ip)->m_mem_id;
4057 pointers.push_back(*
ip);
4061 best_right = FindOptimalChainForProtein(pointers, unmaparts, unma);
4064 bool present =
false;
4066 present =
ip == &mj;
4069 if(CanIncludeJinI(mi, mj)) {
4070 IncludeInContained(mi, mj);
4076 chain =
CChain(*best_right,
false);
4078 unma_aligns.splice(unma_aligns.end(), unmacl);
4100 chain.
ClipToCap(min_cap_blob, max_dist, min_flank_exon, secondary_peak,
false);
4101 chain.
ClipToPolyA(contig, min_polya_blob, max_dist, min_flank_exon, secondary_peak, tertiary_peak, tertiary_peak_coverage,
false);
4105 m_gnomon->GetScore(chain, !no5pextension);
4112 chain.
AddComment(
"Connected "+orig_aligns[palign.
ID()]->TargetAccession());
4115 chains.push_back(chain);
4129 int len = right-left+1;
4131 vector<int> prot_cov[2][3];
4132 prot_cov[0][0].resize(
len,0);
4133 prot_cov[0][1].resize(
len,0);
4134 prot_cov[0][2].resize(
len,0);
4135 prot_cov[1][0].resize(
len,0);
4136 prot_cov[1][1].resize(
len,0);
4137 prot_cov[1][2].resize(
len,0);
4143 for(
int i = 0;
i < (
int)align.
Exons().size(); ++
i) {
4149 ++prot_cov[align.
Strand()][
abs(cdstr-jtr)%3][j-left];
4172 bool allcdnaintrons =
true;
4174 for(
int i = 1;
i < (
int)chain.
Exons().size() && allcdnaintrons; ++
i) {
4175 if(chain.
Exons()[
i-1].m_ssplice_sig !=
"XX" && chain.
Exons()[
i].m_fsplice_sig !=
"XX") {
4177 allcdnaintrons = (mrna_count[intron]+est_count[intron]+rnaseq_count[intron] > 0);
4181 if(allcdnaintrons && num >0)
4191 int rrf_from_proteins = 0;
4194 for(
int i = 0;
i < (
int)chain.
Exons().size(); ++
i) {
4198 if(j < left || j > right)
4202 int frame =
abs(cdstr-jtr)%3;
4203 if(jtr >= 0 && prot_cov[chain.
Strand()][frame][j-left] > 0) {
4205 lrf_from_proteins =
min(lrf_from_proteins,j);
4207 rrf_from_proteins =
max(rrf_from_proteins,j);
4231 for(TChainList::iterator itt = chains.begin(); itt != chains.end(); ++itt) {
4235 for(TChainList::iterator jt = chains.begin(); jt != chains.end();) {
4236 TChainList::iterator jtt = jt++;
4240 if(itt != jtt && itt->Strand() == jtt->Strand() && jtt->IsSubAlignOf(*itt) && itt->ReadingFrame().Empty() == jtt->ReadingFrame().Empty()) {
4241 if(itt->ReadingFrame().NotEmpty()) {
4246 bool same_frame = (itt->FShiftedLen(
a,
b,
false)-1)%3 == 0;
4254 if(!
Include(jtt->MaxCdsLimits(), itt->MaxCdsLimits()))
4264 bool same_stops =
true;
4266 if(
Include(jtt->Limits(),*istp) && find(jstops.begin(), jstops.end(), *istp) == jstops.end()) {
4288 if((*i)->m_copy != 0)
4289 support.insert((*i)->m_copy->begin(),(*i)->m_copy->end());
4301 if(support.insert(*i).second && (
Include(jlimits, il) || itt->HasCompatibleOverlap(*(*i)->m_align, 1))) {
4302 itt->m_was_combined =
true;
4303 itt->m_members.push_back(*
i);
4304 if((*i)->m_copy != 0)
4305 support.insert((*i)->m_copy->begin(),(*i)->m_copy->end());
4308 if(itt->m_was_combined) {
4310 itt->CalculateSupportAndWeightFromMembers();
4327 return minscor.m_min;
4361 if (algn.
Score() < minscor)
4365 #define SCAN_WINDOW 49
4379 vector<double> coverage_raw(mrna_len+
SCAN_WINDOW);
4394 m_coverage.resize(mrna_len);
4398 for(
int i = 0;
i < mrna_len; ++
i) {
4399 m_coverage[
i] = cov;
4405 CChain::CChain(
SChainMember& mbr,
bool full_support) : m_coverage_drop_left(-1), m_coverage_drop_right(-1), m_coverage_bump_left(-1), m_coverage_bump_right(-1), m_core_coverage(0), m_splice_weight(0), m_cap_peaks(3, -1), m_polya_peaks(3, -1), m_was_combined(
false) {
4422 int num = other_exons.size();
4427 first = other_exons.front().GetTo() >= exons.back().GetTo() ? 0 : 1;
4429 first = std::lower_bound(other_exons.begin(), other_exons.end(), exons.back().GetTo(), [](
const CModelExon& e,
TSignedSeqPos a) { return e.GetTo() < a; })-other_exons.begin();
4430 exons.back().Extend(other_exons[
first]);
4431 exons.insert(exons.end(), other_exons.begin()+
first+1, other_exons.end());
4443 int num = other_exons.size();
4445 if(other_exons.back().GetTo() < exons.front().GetFrom()) {
4446 exons.insert(exons.begin(), other_exons.begin(), other_exons.end());
4448 exons[num].m_fsplice =
false;
4449 exons[num].m_fsplice_sig.clear();
4450 exons[num-1].m_ssplice =
false;
4451 exons[num-1].m_ssplice_sig.clear();
4455 if(cds.
GetFrom() > exons[num].GetFrom())
4456 exons[num].Limits().SetFrom(cds.
GetFrom());
4459 if(other_cds.
GetTo() < exons[num-1].GetTo())
4460 exons[num-1].Limits().SetTo(other_cds.
GetTo());
4466 first = other_exons.back().GetFrom() <= exons.front().GetFrom() ? 1 : 0;
4468 first = std::lower_bound(other_exons.begin(), other_exons.end(), exons.front().GetFrom(), [](
const CModelExon& e,
TSignedSeqPos a) { return e.GetTo() < a; })-other_exons.begin();
4469 exons.front().Limits().SetFrom(other_exons[
first].GetFrom());
4470 if(other_exons[
first].m_fsplice) {
4471 exons.front().m_fsplice =
true;
4472 exons.front().m_fsplice_sig = other_exons[
first].m_fsplice_sig;
4474 exons.insert(exons.begin(), other_exons.begin(), other_exons.begin()+
first);
4487 m_exons.assign(exons.begin(), exons.end());
4490 m_exons.front().m_fsplice =
false;
4491 m_exons.front().m_fsplice_sig.clear();
4492 m_exons.back().m_ssplice =
false;
4493 m_exons.back().m_ssplice_sig.clear();