16 indels.reserve(sam_indels.size());
23 if(indl->GetInDelV().empty())
24 insertlen = indl->Len();
26 deletion = indl->GetInDelV();
27 }
else if(loc+insertlen == indl->Loc()) {
28 if(indl->GetInDelV().empty())
29 insertlen += indl->Len();
31 deletion += indl->GetInDelV();
36 indels.push_back(
CLiteIndel(loc+insertlen,(
int)deletion.size(),deletion));
39 if(indl->GetInDelV().empty()) {
40 insertlen = indl->Len();
43 deletion = indl->GetInDelV();
52 indels.push_back(
CLiteIndel(loc+insertlen,(
int)deletion.size(),deletion));
70 size_t first_element = cigar.find_first_not_of(
"0123456789");
71 if(first_element != string::npos && cigar[first_element] ==
'I')
72 cigar[first_element] =
'S';
73 if(cigar[cigar.size()-1] ==
'I')
74 cigar[cigar.size()-1] =
'S';
83 istringstream istr_cigar(cigar);
86 const string& seq = ad.
m_seq;
87 while(istr_cigar >>
len >> c) {
95 for(
int l = 0;
l <
len; ++
l) {
96 if(seq[seq_pos] != contig[gstop]) {
98 sam_indels.push_back(
CLiteIndel(gstop+1,1,seq.substr(seq_pos,1)));
109 seq_pos +=
len; align_len +=
len; gstop +=
len;
116 seq_pos +=
len; align_len +=
len; gstop +=
len;
121 seq_pos +=
len; align_len +=
len;
126 align_len +=
len; gstop +=
len;
130 throw runtime_error(
"Alignments can't have introns");
138 m_ident = (double)matches/align_len;
154 string seq = contig.substr(
l,
r-
l);
165 seq += contig.substr(
l,
r-
l);
186 int length (sv.
size());
200 for(
int i = 0;
i < (
int)align.
Exons().size(); ++
i) {
204 if(
i->IntersectingWith(exon_lim.
GetFrom(), exon_lim.
GetTo()))
205 corrections.push_back(*
i);
207 while(!corrections.empty() && corrections.front().Loc() == exon_lim.
GetFrom()) {
208 exon_lim.
SetFrom(corrections.front().InDelEnd());
209 corrections.erase(corrections.begin());
211 while(!corrections.empty() && corrections.back().InDelEnd() == exon_lim.
GetTo()+1) {
212 exon_lim.
SetTo(corrections.back().Loc()-1);
213 corrections.pop_back();
222 errors += indl->Len();
223 if(indl->IsDeletion())
224 align_len += indl->Len();
225 if(!indl->IsInsertion()) {
226 string s = indl->GetInDelV();
227 ns +=
count(s.begin(), s.end(),
'N');
235 if(indl->IsMismatch()) {
236 indels.push_back(
CLiteIndel(indl->Loc(), indl->Len()));
237 indels.push_back(
CLiteIndel(indl->InDelEnd(), indl->Len(), indl->GetInDelV()));
238 }
else if(indl->IsInsertion()) {
239 indels.push_back(
CLiteIndel(indl->Loc(), indl->Len()));
241 indels.push_back(
CLiteIndel(indl->Loc(), indl->Len(), indl->GetInDelV()));
254 map<TSignedSeqRange,TSIMap> variations;
255 list<TSignedSeqRange> confirmed_ranges;
260 ITERATE(list<TSignedSeqRange>,
i, confirmed_ranges) {
267 aligns.back().SetTargetId(*
id);
272 TSIMap& seq_counts =
i->second;
274 if(!correctionsonly) {
277 const string& seq = j->first;
278 int count = j->second;
288 aligns.back().SetTargetId(*
id);
294 double selected_weight = 0;
296 const string& seq = j->first;
297 int count = j->second;
300 if(dist < selected_dist || (dist == selected_dist &&
count > selected_weight)) {
301 selected_cigar = cigar;
303 selected_dist = dist;
304 selected_weight =
count;
311 a.SetWeight(selected_weight);
314 aligns.back().SetTargetId(*
id);
322 #define ENTROPY_LEVEL_FOR_ALIGNS 0.51
327 string read = al->TranscriptSeq(
m_contigt);
328 string base =
m_contigt.substr(al->Limits().GetFrom(),al->Limits().GetLength());
332 all_alignsp.push_back(&(*al));
339 for(
int ir = 0; ir < (
int)all_alignsp.size(); ++ir) {
353 for(
int ir = 0; ir < reads_num; ++ir) {
354 for(
int p =
m_alignsp[ir]->Limits().GetFrom(); p <=
m_alignsp[ir]->Limits().GetTo(); ++p)
358 if((*indl)->IsDeletion()) {
359 int len = (*indl)->Len();
360 if(indl != indels.begin()) {
361 TLiteInDelsP::const_iterator
prev = indl-1;
362 if((*prev)->IsInsertion() && (*prev)->Loc()+(*prev)->Len() == (*indl)->Loc())
363 len -= (*prev)->Len();
366 deletion_len[(*indl)->Loc()] =
max(
len, deletion_len[(*indl)->Loc()]);
373 int contigp =
i->first;
374 if(del != deletion_len.
end() && contigp == del->first) {
375 int del_len = (del++)->second;
376 for(
int l = 0;
l < del_len; ++
l)
379 int alignp = contigp+shift;
384 int total_deletion_len = 0;
386 total_deletion_len +=
i->second;
388 m_base.reserve(contig_len+total_deletion_len);
390 for(
int p = 0; p < contig_len; ++p) {
392 if(rslt != deletion_len.
end()) {
393 int n = rslt->second;
405 for(
int p = 0; p < (
int)
m_base.length(); ++p) {
409 while(
r < reads_num &&
m_starts[
r]+dashes == p)
415 for(
int ir = 0; ir < reads_num; ++ir) {
422 list<pair<int,int> > indel_pos_length;
424 if((*indl)->IsDeletion()) {
425 if(!indel_pos_length.empty() && indel_pos_length.back().first-indel_pos_length.back().second == (*indl)->Loc()) {
426 _ASSERT(indel_pos_length.back().second < 0);
427 int new_len = indel_pos_length.back().second+(*indl)->Len();
429 indel_pos_length.back().second = new_len;
430 }
else if(new_len > 0) {
431 indel_pos_length.back().first = (*indl)->Loc();
432 indel_pos_length.back().second = new_len;
434 indel_pos_length.pop_back();
437 indel_pos_length.push_back(make_pair((*indl)->Loc(), (*indl)->Len()));
440 indel_pos_length.push_back(make_pair((*indl)->Loc(), -(*indl)->Len()));
444 list<pair<int,int> >::iterator indl = indel_pos_length.begin();
445 for(
int p = start+1; p <
base_length && p < start+(
int)read.size(); ) {
450 int insertp = p-start;
452 if(indl != indel_pos_length.end() && indl->second > 0 &&
m_contig_to_align[indl->first] == p) {
457 read.insert(insertp,
len,
'-');
458 }
else if(indl != indel_pos_length.end() && indl->second < 0 &&
m_contig_to_align[indl->first] == p) {
473 for(
int ir = 0; ir < reads_num; ++ir) {
475 const string& read =
m_reads[ir];
479 for(
int p = legit_range.
GetFrom(); p <= legit_range.
GetTo(); ++p) {
480 char c = read[p-start];
487 vector<const CLiteAlign*> all_alignsp;
490 int aligns_size = (
int)all_alignsp.size();
511 if(
Include(read_range,two_word_range)) {
517 seq_counts[read_seq] += w;
524 const string& read =
m_reads[ir];
526 int end = start+(
int)read.size()-1;
528 int first_legit_match = start;
530 while(shift <
m_min_edge || (first_legit_match <= end && (read[first_legit_match-start] ==
'-' || read[first_legit_match-start] !=
m_base[first_legit_match]))) {
531 if(
m_base[first_legit_match] !=
'-')
535 int last_legit_match = end;
537 while(shift <
m_min_edge || (last_legit_match >= start && (read[last_legit_match-start] ==
'-' || read[last_legit_match-start] !=
m_base[last_legit_match]))) {
538 if(
m_base[last_legit_match] !=
'-')
554 string maximal_bases(
m_base.size(),
'A');
555 for(
int p = 0; p < (
int)
m_base.size(); ++p) {
558 maximal_bases[p] =
'#';
574 maximal_bases[p] =
'#';
577 maximal_bases[p] = c;
582 string prev_strong_word;
583 for(
int p = 0; p < (
int)maximal_bases.size(); ) {
584 if(maximal_bases[p] ==
'#') {
593 if(strong_word_range.
Empty())
596 p = strong_word_range.
GetFrom()+1;
598 bool same_as_contig =
true;
599 for(
int pos = strong_word_range.
GetFrom(); pos <= strong_word_range.
GetTo() && same_as_contig; ++pos)
600 same_as_contig = (maximal_bases[pos] ==
m_base[pos]);
602 if(!same_as_contig) {
604 }
else if(first_gap >= 0) {
608 bool there_is_weak_range = prev_strong_word_range.
NotEmpty() && prev_strong_word_range.
GetTo()+1 < strong_word_range.
GetFrom();
616 if(there_is_weak_range) {
619 int accepted_cross = 0;
620 SeqCountsBetweenTwoStrongWords(prev_strong_word_range, prev_strong_word, strong_word_range, strong_word, seq_counts, total_cross, accepted_cross);
624 if(!seq_counts.
empty()) {
630 if(
i->second > most_frequent_variant->second)
631 most_frequent_variant =
i;
636 string var_seq = prev_strong_word+it->first+strong_word;
638 var_counts[var_seq] = it->second;
641 if(!var_counts.
empty()) {
643 int base_posr = swordr;
646 if(var_counts.
size() == 1 && var_counts.
begin()->first ==
m_contigt.substr(base_posl,base_posr-base_posl+1)) {
647 confirmed_ranges.back().SetTo(base_posr);
648 there_is_weak_range =
false;
651 variations.insert(var);
658 prev_strong_word_range = strong_word_range;
659 prev_strong_word = strong_word;
661 if(confirmed_ranges.empty() || there_is_weak_range || confirmed_ranges.back().GetTo()+1 < swordl)
664 confirmed_ranges.back().
SetTo(swordr);
670 const string& read =
m_reads[ir];
672 int stop = start+(
int)read.size()-1;
675 if(read[
r-start] !=
'-')
676 read_word.push_back(read[
r-start]);
685 read_word.push_back(
m_base[
r]);
697 if(
Include(legit_range,word_range)) {
700 if(word == read_word)
714 while(nextp < (
int)maximal_bases.size()) {
716 int word_start = nextp;
717 int word_end = nextp;
718 bool low_complexity =
false;
719 for( ; word_end < (
int)maximal_bases.size() && (
int)word.size() <
m_word && maximal_bases[word_end] !=
'#'; ++word_end) {
720 if(maximal_bases[word_end] !=
'-') {
721 word.push_back(
toupper(maximal_bases[word_end]));
722 if(
islower(maximal_bases[word_end]))
723 low_complexity =
true;
727 if((
int)word.size() <
m_word) {
728 if(maximal_bases[word_end] ==
'#') {
729 if(first_gap < 0) first_gap = word_end;
738 if(!low_complexity &&
CheckWord(word_range,word)) {
740 strong_word_range = word_range;
761 m_word = args[
"word"].AsInteger();
764 m_maxNs = args[
"maxNs"].AsInteger();
int Distance(const char *query, const char *subject) const
TInDels GetInDels(int sstart, const char *const query, const char *subject) const
const TExons & Exons() const
TSignedSeqRange Limits() const
CLiteAlign(TSignedSeqRange range, const TLiteInDels &indels, set< CLiteIndel > &indel_holder, double weight, double ident)
string TranscriptSeq(const string &contig) const
void InsertDashesInBase()
void InsertDashesInReads()
static void SetupArgDescriptions(CArgDescriptions *arg_desc)
int m_min_abs_support_for_variant
bool CheckWord(const TSignedSeqRange &word_range, const string &word)
void SelectAligns(vector< const CLiteAlign * > &all_alignsp)
string EmitSequenceFromBase(const TSignedSeqRange &word_range)
set< CLiteIndel > m_indel_holder
void ProcessArgs(const CArgs &args)
map< int, TCharIntMap > m_counts
void AddAlignment(const SSamData &align)
TAlignModelList GetVariationAlignList(bool correctionsonly)
TSignedSeqRange LegitRange(int ir)
void SetGenomic(const CConstRef< CSeq_id > &seqid, CScope &scope)
vector< const CLiteAlign * > m_alignsp
TIntMap m_contig_to_align
double m_min_rel_support_for_variant
int FindNextStrongWord(int nextp, const string &maximal_bases, string &strong_word, TSignedSeqRange &strong_word_range, int &first_gap)
string EmitSequenceFromRead(int ir, const TSignedSeqRange &word_range)
void SeqCountsBetweenTwoStrongWords(const TSignedSeqRange &prev_strong_word_range, const string &prev_strong_word, const TSignedSeqRange &strong_word_range, const string &strong_word, TSIMap &seq_counts, int &total_cross, int &accepted_cross)
void Variations(map< TSignedSeqRange, TSIMap > &variations, list< TSignedSeqRange > &confirmed_ranges)
double m_strong_consensus
void PrepareReads(const vector< const CLiteAlign * > &all_alignsp)
TIntMap m_align_to_contig
container_type::const_iterator const_iterator
container_type::iterator iterator
const_iterator begin() const
const_iterator end() const
iterator_bool insert(const value_type &val)
container_type::value_type value_type
const_iterator find(const key_type &key) const
iterator_bool insert(const value_type &val)
static int base_length[29]
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
double Entropy(const string &seq)
CCigar GlbAlign(const char *query, int querylen, const char *subject, int subjectlen, int gopen, int gapextend, const char delta[256][256])
list< CAlignModel > TAlignModelList
bool Include(TSignedSeqRange big, TSignedSeqRange small)
vector< CInDelInfo > TInDels
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
int TSignedSeqPos
Type for signed sequence position.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
void AddDefaultKey(const string &name, const string &synopsis, const string &comment, EType type, const string &default_value, TFlags flags=0, const string &env_var=kEmptyStr, const char *display_value=nullptr)
Add description for optional key with default value.
@ eDouble
Convertible into a floating point number (double)
@ eInteger
Convertible into an integer number (int or Int8)
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
position_type GetLength(void) const
bool NotEmpty(void) const
static TThisType GetEmpty(void)
CRange< TSignedSeqPos > TSignedSeqRange
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
void SetFrom(TFrom value)
Assign a value to From data member.
TTo GetTo(void) const
Get the To member data.
TFrom GetFrom(void) const
Get the From member data.
void SetTo(TTo value)
Assign a value to To data member.
unsigned int
A callback function used to compare two keys in a database.
range(_Ty, _Ty) -> range< _Ty >
constexpr auto sort(_Init &&init)
const struct ncbi::grid::netcache::search::fields::SIZE size
Int4 delta(size_t dimension_, const Int4 *score_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
static int match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket, PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb)
static SLJIT_INLINE sljit_ins l(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
TLiteInDels GroupInDels(const TLiteInDels &sam_indels)
#define ENTROPY_LEVEL_FOR_ALIGNS
vector< const CLiteIndel * > TLiteInDelsP
vector< CLiteIndel > TLiteInDels
list< CLiteAlign > TLiteAlignList