41 const char *
const k_aa_table =
"KNKNXTTTTTRSRSXIIMIXXXXXXQHQHXPPPPPRRRRRLLLLLXXXXXEDEDXAAAAAGGGGGVVVVVXXXXX*Y*YXSSSSS*CWCXLFLFXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX";
45 size_t len = src.size();
48 for(
size_t i = 0;
i <
len; ++
i)
60 size_t len = src.size();
63 for(
size_t i = 0;
i <
len; ++
i)
64 dst.push_back(
toACGT(src[
i]) );
89 template <
typename Res>
110 const Res * start_codon;
117 return equal(start_codon,start_codon+3,seq);
126 if(strand ==
ePlus) {
130 for (
int i = 1;
i <= 3; ++
i)
138 for (
int i = 1;
i <= 3; ++
i)
150 for (CEResidueVec::const_iterator pos = mrna.begin()+search_region.
GetFrom(); (pos = search(pos,mrna.end(),codon,codon+3)) < mrna.begin()+search_region.
GetTo(); ++pos) {
151 int l = (
int)(pos-mrna.begin());
153 if (fixed_frame==-1 || fixed_frame==frame)
154 positions[frame].push_back(
l);
165 for (
int i=1;
i <=3; ++
i)
167 for (
int f = 0;
f < 3; ++
f)
168 sort(stops[
f].begin(), stops[
f].end());
176 int codon_start = start+frame-3;
177 if(codon_start >= 0 &&
IsStopCodon(&seq_strand[codon_start]))
185 int left_cds_limit = -1;
186 int reading_frame_start = (
int)mrna.size();
187 int reading_frame_stop = (
int)mrna.size();
188 int right_cds_limit = (
int)mrna.size();
204 reading_frame_start = rf.
GetFrom();
205 _ASSERT(reading_frame_start >= 0);
206 reading_frame_stop = rf.
GetTo();
207 _ASSERT(reading_frame_stop >= 0);
209 if (reading_frame_start == 0 &&
IsStartCodon(&mrna[reading_frame_start]) && reading_frame_start+3 < reading_frame_stop)
210 reading_frame_start += 3;
212 _ASSERT( -1 <= left_cds_limit && left_cds_limit <= reading_frame_start );
213 _ASSERT( 0 <= reading_frame_start && reading_frame_start <= reading_frame_stop && reading_frame_stop <
int(mrna.size()) );
214 _ASSERT( reading_frame_stop <= right_cds_limit && right_cds_limit <=
int(mrna.size()) );
216 frame = reading_frame_start%3;
218 if (left_cds_limit<0) {
219 if (reading_frame_start >= 3) {
223 if (stops[frame].
size()>0)
224 left_cds_limit = stops[frame].back()+3;
229 reading_frame_start = reading_frame_stop-5;
232 reading_frame_start =
min(protrf.
GetFrom(),reading_frame_start);
236 if (left_cds_limit<0) {
240 starts[0].push_back(-3);
243 model_start = (
TSignedSeqPos)contig_seq.size()-1-model_start;
244 for (
int i = 0;
i<3; ++
i) {
245 if (frame == -1 || frame ==
i) {
248 stops[
i].push_back(
i-3);
250 starts[
i].push_back(
i-3);
268 starts[frame].push_back(start.
GetFrom());
269 }
else if(reading_frame_start-left_cds_limit >= 3) {
275 }
else if (right_cds_limit - reading_frame_stop >= 3) {
279 if (
int(mrna.size()) <= right_cds_limit) {
280 stops[mrna.size()%3].push_back((
int)mrna.size());
281 stops[(mrna.size()-1)%3].push_back((
int)mrna.size()-1);
282 stops[(mrna.size()-2)%3].push_back((
int)mrna.size()-2);
289 vector<int>::const_iterator it_stop = lower_bound(stops.begin(),stops.end(),start);
291 if(it_stop != stops.begin()) {
299 auto it_start = lower_bound(starts.begin(), starts.end(), stop);
300 if(it_start != starts.end()) {
327 if(!fs_only ||
len%3 != 0)
407 EEdgeType left_type,
EEdgeType right_type,
const string& left_edit_extra_seq,
const string& right_edit_extra_seq)
417 _ASSERT((
int)left_edit_extra_seq.length() == 0 || (
int)left_edit_extra_seq.length() == left_edite);
419 _ASSERT((
int)right_edit_extra_seq.length() == 0 || (
int)right_edit_extra_seq.length() == right_edite);
424 TInDels::const_iterator fsi_end,
EEdgeType type_a,
EEdgeType type_b,
const string& gseq_a,
const string& gseq_b)
426 TInDels::const_iterator fsi = fsi_begin;
427 for( ;fsi != fsi_end && fsi->Loc() < orig_a; ++fsi ) {
428 _ASSERT( !fsi->IntersectingWith(orig_a,orig_b) );
433 string left_edit_extra_seq = gseq_a;
436 for( ;fsi != fsi_end && fsi->Loc() == orig_a && !fsi->IsMismatch(); ++fsi ) {
437 if(fsi->IsInsertion()) {
439 orig_a += fsi->Len();
440 left_orige += fsi->Len();
442 edit_a += fsi->Len();
443 left_edite += fsi->Len();
444 left_edit_extra_seq += fsi->GetInDelV();
447 for( ; fsi != fsi_end && fsi->IsMismatch() && fsi->Loc() == orig_a+(
int)mism.size(); ++fsi)
448 mism += fsi->GetInDelV();
450 while(fsi != fsi_end && fsi->InDelEnd() <= orig_b+1) {
457 string right_edit_extra_seq;
458 for( ;fsi != fsi_end && fsi->Loc() ==
bb && !fsi->IsMismatch(); ++fsi ) {
459 if (fsi->IsInsertion()) {
460 right_orige += fsi->Len();
463 right_edite += fsi->Len();
464 right_edit_extra_seq += fsi->GetInDelV();
470 if(next_orig_a > orig_b) {
471 right_edit_extra_seq += gseq_b;
472 right_edite += gseq_b.length();
475 InsertOneToOneRange(orig_a, edit_a,
len, mism, left_orige, left_edite, right_orige, right_edite, type_a, tb, left_edit_extra_seq, right_edit_extra_seq);
477 orig_a = next_orig_a;
478 edit_a +=
len+right_edite;
480 left_orige = right_orige;
481 left_edite = right_edite;
482 left_edit_extra_seq = right_edit_extra_seq;
484 for( ; fsi != fsi_end && fsi->IsMismatch() && fsi->Loc() == orig_a+(
int)mism.size(); ++fsi)
485 mism += fsi->GetInDelV();
490 string right_edit_extra_seq;
492 if(orig_a+
len > orig_b) {
493 right_edit_extra_seq = gseq_b;
496 InsertOneToOneRange(orig_a, edit_a,
len, mism, left_orige, left_edite, 0, (
TSignedSeqPos)gseq_b.length(), type_a, tb, left_edit_extra_seq, right_edit_extra_seq);
502 left_edit_extra_seq.clear();
506 if(orig_a <= orig_b) {
507 int len = orig_b-orig_a+1;
509 InsertOneToOneRange(orig_a, edit_a,
len, mism, left_orige, left_edite, 0, (
TSignedSeqPos)gseq_b.length(), type_a, type_b, left_edit_extra_seq, gseq_b);
519 _ASSERT(transcript_exons.size() == exons.size());
520 _ASSERT(transcript_exons.size() == 1 || (orientation ==
ePlus && transcript_exons.front().GetFrom() < transcript_exons.back().GetFrom()) ||
521 (orientation ==
eMinus && transcript_exons.front().GetFrom() > transcript_exons.back().GetFrom()));
523 for(
unsigned int i = 0;
i < exons.size(); ++
i) {
525 diff += exonlen-(transcript_exons[
i].GetTo()-transcript_exons[
i].GetFrom()+1);
529 diff += (
f->IsDeletion()) ?
f->Len() : -
f->Len();
538 for(
unsigned int i = 0;
i < exons.size(); ++
i) {
539 if(exons[
i].Limits().Empty()) {
540 _ASSERT(
i == 0 || exons[
i-1].Limits().NotEmpty());
541 _ASSERT(
i == exons.size()-1 || exons[
i+1].Limits().NotEmpty());
547 if(
i > 0 && exons[
i-1].Limits().
Empty()) {
549 gseq_a = exons[
i-1].m_seq;
550 estart += gseq_a.length();
552 if(
i < exons.size()-1 && exons[
i+1].Limits().Empty()) {
554 gseq_b = exons[
i+1].m_seq;
560 estart =
InsertIndelRangesForInterval(exons[
i].GetFrom(), exons[
i].GetTo(), estart, indels.begin(), indels.end(), type_a, type_b, gseq_a, gseq_b);
563 if(
i != exons.size()-1) {
565 estart += transcript_exons[
i+1].GetFrom()-transcript_exons[
i].GetTo()-1;
567 estart += transcript_exons[
i].GetFrom()-transcript_exons[
i+1].GetTo()-1;
575 TInDels::const_iterator fsi_begin = indels.begin();
576 TInDels::const_iterator fsi_end = indels.end();
582 for(
unsigned int i = 0;
i < exons.size(); ++
i) {
583 if(exons[
i].Limits().
Empty()) {
584 _ASSERT(
i == 0 || exons[
i-1].Limits().NotEmpty());
585 _ASSERT(
i == exons.size()-1 || exons[
i+1].Limits().NotEmpty());
593 if(
i > 0 && exons[
i-1].Limits().
Empty()) {
595 gseq_a = exons[
i-1].m_seq;
596 estart += gseq_a.length();
598 if(
i < exons.size()-1 && exons[
i+1].Limits().Empty()) {
600 gseq_b = exons[
i+1].m_seq;
607 if(stop < lim.
GetFrom())
continue;
608 if(lim.
GetTo() < start)
break;
614 if(lim.
GetTo() <= stop) {
620 if(
i != exons.size()-1 && (!exons[
i+1].m_fsplice || !exons[
i].m_ssplice))
631 template <
class In,
class Out>
634 edited_sequence.clear();
639 s.insert(s.end(),
l,
'N');
652 for(
int i =
a;
i <
b; ++
i)
653 edited_sequence.push_back(original_sequence[
i]);
661 seq.insert(seq.end(),
l,
'N');
671 seq.insert(seq.end(),
l,
'N');
690 void CAlignMap::EditedSequence<CResidueVec,CResidueVec>(
const CResidueVec& original_sequence,
CResidueVec& edited_sequence,
bool includeholes)
const;
692 void CAlignMap::EditedSequence<CEResidueVec,CEResidueVec>(
const CEResidueVec& original_sequence,
CEResidueVec& edited_sequence,
bool includeholes)
const;
694 void CAlignMap::EditedSequence<string,string>(
const string& original_sequence,
string& edited_sequence,
bool includeholes)
const;
696 void CAlignMap::EditedSequence<CAlignCollapser::CPartialString,string>(
const CAlignCollapser::CPartialString& original_sequence,
string& edited_sequence,
bool includeholes)
const;
778 bool snapped =
false;
806 bool snapped =
false;
843 if(p <
a.front().GetExtendedFrom() || p >
a.back().GetExtendedTo())
return -1;
845 if(p <
a.front().GetFrom()) {
847 return b.front().GetExtendedFrom();
853 if(p >
a.back().GetTo()) {
855 return b.back().GetExtendedTo();
863 if(p >
a[num].GetTo()) {
864 if(
a[num].GetTypeTo() ==
eGgap)
869 return b[num+1].GetExtendedFrom();
871 return b[num].GetExtendedTo();
875 }
else if(p ==
a[num].GetTo()) {
877 return b[num].GetExtendedTo();
878 }
else if(p ==
a[num].GetFrom() && move_mode ==
eLeftEnd &&
b[num].GetTypeFrom() !=
eGgap) {
879 return b[num].GetExtendedFrom();
881 return b[num].GetTo();
883 }
else if(p ==
a[num].GetFrom()) {
885 return b[num].GetExtendedFrom();
887 return b[num].GetFrom();
890 return b[num].GetFrom()+p-
a[num].GetFrom();
TSignedSeqPos FShiftedMove(TSignedSeqPos orig_pos, int len) const
static int FindLowerRange(const vector< CAlignMap::SMapRange > &a, TSignedSeqPos p)
TSignedSeqRange ShrinkToRealPointsOnEdited(TSignedSeqRange edited_range) const
void InsertOneToOneRange(TSignedSeqPos orig_start, TSignedSeqPos edited_start, TSignedSeqPos len, const string &mism, TSignedSeqPos left_orige, TSignedSeqPos left_edite, TSignedSeqPos right_orige, TSignedSeqPos right_edite, EEdgeType left_type, EEdgeType right_type, const string &left_edit_extra_seq, const string &right_edit_extra_seq)
static TSignedSeqRange MapRangeAtoB(const vector< CAlignMap::SMapRange > &a, const vector< CAlignMap::SMapRange > &b, TSignedSeqRange r, ERangeEnd lend, ERangeEnd rend)
vector< SMapRange > m_edited_ranges
TSignedSeqPos InsertIndelRangesForInterval(TSignedSeqPos orig_a, TSignedSeqPos orig_b, TSignedSeqPos edit_a, TInDels::const_iterator fsi_begin, TInDels::const_iterator fsi_end, EEdgeType type_a, EEdgeType type_b, const string &gseq_a, const string &gseq_b)
TSignedSeqRange MapRangeEditedToOrig(TSignedSeqRange edited_range, bool withextras=true) const
TSignedSeqPos MapOrigToEdited(TSignedSeqPos orig_pos) const
void EditedSequence(const In &original_sequence, Out &edited_sequence, bool includeholes=false) const
TSignedSeqPos MapEditedToOrig(TSignedSeqPos edited_pos) const
TSignedSeqRange ShrinkToRealPoints(TSignedSeqRange orig_range, bool snap_to_codons=false) const
int FShiftedLen(TSignedSeqRange ab, ERangeEnd lend, ERangeEnd rend) const
static TSignedSeqPos MapAtoB(const vector< CAlignMap::SMapRange > &a, const vector< CAlignMap::SMapRange > &b, TSignedSeqPos p, ERangeEnd move_mode)
TSignedSeqRange MapRangeOrigToEdited(TSignedSeqRange orig_range, ERangeEnd lend, ERangeEnd rend) const
vector< SMapRange > m_orig_ranges
TSignedSeqRange Start() const
TSignedSeqRange ProtReadingFrame() const
TSignedSeqRange ReadingFrame() const
const CCDSInfo & GetCdsInfo() const
vector< CModelExon > TExons
static EResidue _fromACGT(TResidue x)
static const EResidue * _rev_codons(int i)
static const EResidue * _codons(int i)
static const Res * _rev_codons(int i)
static const Res * _codons(int i)
static Res _fromACGT(TResidue x)
bool Empty(const CNcbiOstrstream &src)
vector< TResidue > CResidueVec
bool Include(TSignedSeqRange big, TSignedSeqRange small)
vector< CInDelInfo > TInDels
objects::CSeqVectorTypes::TResidue TResidue
bool IsStopCodon(const Res *seq, int strand)
bool FindFirstStart(const vector< int > &starts, int stop, int &start)
bool FindUpstreamStop(const vector< int > &stops, int start, int &stop)
static const EResidue s_ecodons2[3]
const TResidue rev_codons[4][4]
const TResidue codons[4][4]
void FindAllStops(TIVec stops[], const CEResidueVec &mrna, TSignedSeqRange search_region, int fixed_frame)
const char *const k_aa_table
void FindStartsStops(const CGeneModel &model, const CEResidueVec &contig_seq, const CEResidueVec &mrna, const CAlignMap &mrnamap, TIVec starts[3], TIVec stops[3], int &frame, bool obeystart)
const EResidue k_toMinus[5]
void FindAllCodonInstances(TIVec positions[], const EResidue codon[], const CEResidueVec &mrna, TSignedSeqRange search_region, int fixed_frame)
const EResidue * ecodons[4]
static const EResidue s_ecodons0r[3]
static const EResidue s_ecodons1[3]
static const EResidue s_ecodons2r[3]
template bool IsStartCodon< TResidue >(const TResidue *seq, int strand)
const EResidue * rev_ecodons[4]
template bool IsStartCodon< EResidue >(const EResidue *seq, int strand)
void ReverseComplement(const CEResidueVec &src, CEResidueVec &dst)
bool IsStartCodon(const Res *seq, int strand)
template bool IsStopCodon< EResidue >(const EResidue *seq, int strand)
void Convert(const CResidueVec &src, CEResidueVec &dst)
template bool IsStopCodon< TResidue >(const TResidue *seq, int strand)
static const EResidue s_ecodons3r[3]
bool Partial5pCodonIsStop(const CEResidueVec &seq_strand, int start, int frame)
static const EResidue s_ecodons3[3]
void PushInDel(TInDels &indels, bool fs_only, TSignedSeqPos p, int len, CInDelInfo::EType type, const string &seq="")
void FindAllStarts(TIVec starts[], const CEResidueVec &mrna, TSignedSeqRange search_region, int fixed_frame)
static const EResidue s_ecodons0[3]
static const EResidue s_ecodons1r[3]
TResidue toACGT(EResidue c)
EResidue fromACGT(TResidue c)
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
int TSignedSeqPos
Type for signed sequence position.
position_type GetLength(void) const
bool NotEmpty(void) const
static TThisType GetEmpty(void)
static position_type GetWholeFrom(void)
CRange< TSignedSeqPos > TSignedSeqRange
static position_type GetWholeTo(void)
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
TTo GetTo(void) const
Get the To member data.
TFrom GetFrom(void) const
Get the From member data.
unsigned int
A callback function used to compare two keys in a database.
range(_Ty, _Ty) -> range< _Ty >
constexpr auto sort(_Init &&init)
const struct ncbi::grid::netcache::search::fields::SIZE size
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void Out(T t, int w, CNcbiOstream &to=cout)
static SLJIT_INLINE sljit_ins l(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)