NCBI C++ ToolKit
variations.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #include <ncbi_pch.hpp>
5 
7 #include <objmgr/scope.hpp>
8 #include <objmgr/seq_vector.hpp>
9 #include <sstream>
10 
12 BEGIN_SCOPE(gnomon)
13 
// Body of GroupInDels. The signature line is absent from this listing; the
// cross-reference index at the end of the page gives it as
// "TLiteInDels GroupInDels(const TLiteInDels &sam_indels)".
//
// Merges runs of adjacent entries of sam_indels. Within a run, entries whose
// GetInDelV() is empty add their Len() to insertlen, and entries that carry a
// sequence are concatenated into deletion. A run continues while the next
// entry's Loc() equals loc+insertlen. Every finished run is emitted as up to
// two CLiteIndel objects: (loc, insertlen) and
// (loc+insertlen, deletion.size(), deletion).
15  TLiteInDels indels;
16  indels.reserve(sam_indels.size());
// loc < 0 means "no run has been started yet"
17  int loc = -1;
18  int insertlen = 0;
19  string deletion;
20  ITERATE(TLiteInDels, indl, sam_indels) {
21  if(loc < 0) {
// first entry: start the first run
22  loc = indl->Loc();
23  if(indl->GetInDelV().empty())
24  insertlen = indl->Len();
25  else
26  deletion = indl->GetInDelV();
27  } else if(loc+insertlen == indl->Loc()) {
// entry is adjacent to the current run: extend it
28  if(indl->GetInDelV().empty())
29  insertlen += indl->Len();
30  else
31  deletion += indl->GetInDelV();
32  } else {
// entry is not adjacent: flush the current run, then start a new one
33  if(insertlen > 0)
34  indels.push_back(CLiteIndel(loc, insertlen));
35  if(!deletion.empty())
36  indels.push_back(CLiteIndel(loc+insertlen,(int)deletion.size(),deletion));
37 
38  loc = indl->Loc();
39  if(indl->GetInDelV().empty()) {
40  insertlen = indl->Len();
41  deletion.clear();
42  } else {
43  deletion = indl->GetInDelV();
44  insertlen = 0;
45  }
46  }
47  }
// flush the last run, if any entry was seen
48  if(loc >= 0) {
49  if(insertlen > 0)
50  indels.push_back(CLiteIndel(loc, insertlen));
51  if(!deletion.empty())
52  indels.push_back(CLiteIndel(loc+insertlen,(int)deletion.size(),deletion));
53  }
54 
55  return indels;
56 }
57 
58 CLiteAlign::CLiteAlign(TSignedSeqRange range, const TLiteInDels& indels, set<CLiteIndel>& indel_holder, double weight, double ident) :
59  m_weight(weight), m_ident(ident), m_range(range) {
60 
61  m_indels.reserve(indels.size());
62  ITERATE( TLiteInDels, indl, indels) {
63  const CLiteIndel* indelp = &(*indel_holder.insert(*indl).first);
64  m_indels.push_back(indelp);
65  }
66 }
67 
68 CLiteAlign::CLiteAlign(const SSamData& ad, const string& contig, set<CLiteIndel>& indel_holder) : m_weight(ad.m_weight) {
69  string cigar = ad.m_cigar;
70  size_t first_element = cigar.find_first_not_of("0123456789");
71  if(first_element != string::npos && cigar[first_element] == 'I')
72  cigar[first_element] = 'S';
73  if(cigar[cigar.size()-1] == 'I')
74  cigar[cigar.size()-1] = 'S';
75 
76  TLiteInDels sam_indels;
77  int matches = 0;
78  int align_len = 0;
79  int seq_pos = 0; //position on seq
80  int gstart = ad.m_contigp-1; //initial position on contig
81  int gstop = gstart; //current position on contig
82 
83  istringstream istr_cigar(cigar);
84  int len;
85  char c;
86  const string& seq = ad.m_seq;
87  while(istr_cigar >> len >> c) {
88  switch(c) {
89  case 'S':
90  seq_pos += len;
91  case 'H':
92  break;
93 
94  case 'M':
95  for(int l = 0; l < len; ++l) {
96  if(seq[seq_pos] != contig[gstop]) { // mismatch
97  sam_indels.push_back(CLiteIndel(gstop,1));
98  sam_indels.push_back(CLiteIndel(gstop+1,1,seq.substr(seq_pos,1)));
99  } else {
100  ++matches;
101  }
102  ++seq_pos;
103  ++gstop;
104  ++align_len;
105  }
106  break;
107  case '=':
108  matches += len;
109  seq_pos += len; align_len += len; gstop += len;
110  break;
111 
112  case 'X':
113  case 'R':
114  sam_indels.push_back(CLiteIndel(gstop,len));
115  sam_indels.push_back(CLiteIndel(gstop+len,len,seq.substr(seq_pos,len)));
116  seq_pos += len; align_len += len; gstop += len;
117  break;
118 
119  case 'I':
120  sam_indels.push_back(CLiteIndel(gstop,len,seq.substr(seq_pos,len)));
121  seq_pos += len; align_len += len;
122  break;
123 
124  case 'D':
125  sam_indels.push_back(CLiteIndel(gstop,len));
126  align_len += len; gstop += len;
127  break;
128 
129  case 'N':
130  throw runtime_error("Alignments can't have introns");
131 
132  default:
133  break;
134  }
135  }
136 
137  m_range = TSignedSeqRange(gstart,gstop-1);
138  m_ident = (double)matches/align_len;
139 
140  TLiteInDels indels = GroupInDels(sam_indels);
141 
142  m_indels.reserve(indels.size());
143  ITERATE(TLiteInDels, indl, indels) {
144  const CLiteIndel* indelp = &(*indel_holder.insert(*indl).first);
145  m_indels.push_back(indelp);
146  }
147 }
148 
149 string CLiteAlign::TranscriptSeq(const string& contig) const {
150  int l = m_range.GetFrom();
151  int r = m_range.GetTo()+1;
152  if(!m_indels.empty())
153  r = m_indels.front()->Loc();
154  string seq = contig.substr(l,r-l);
155  for(int i = 0; i < (int)m_indels.size(); ++i) {
156  if(m_indels[i]->IsInsertion()) {
157  l = r+m_indels[i]->Len();
158  } else {
159  seq += m_indels[i]->GetInDelV();
160  l = r;
161  }
162  r = m_range.GetTo()+1;
163  if(i < (int)m_indels.size()-1)
164  r = m_indels[i+1]->Loc();
165  seq += contig.substr(l,r-l);
166  }
167 
168  return seq;
169 }
170 
// Body of SetGenomic. Several lines of this function are absent from this
// listing, including the signature and the declaration of `sv`. The
// cross-reference index at the end of the page gives the signature as
// "void SetGenomic(const CConstRef< CSeq_id > &seqid, CScope &scope)".
//
// Resets the per-contig state, reloads the contig sequence into m_contigt when
// the sequence id differs from the cached m_contig_id, and sets m_base to a
// copy of m_contigt.
172  m_max_len = 0;
173  m_reads.clear();
174  m_starts.clear();
175  m_alignsp.clear();
178  m_counts.clear();
179  m_aligns.clear();
181 
// the sequence is fetched only when the contig id has changed
182  if(m_contig_id != seqid->GetSeqIdString(true)) {
183  m_contig_id = seqid->GetSeqIdString(true);
184  CBioseq_Handle bh(scope.GetBioseqHandle(*seqid));
// NOTE(review): `sv` is declared on a line missing from this listing;
// presumably a CSeqVector obtained from bh - confirm against the real file
186  int length (sv.size());
187  sv.GetSeqData(0, length, m_contigt);
188  }
189  m_base = m_contigt;
190 }
191 
192 void CMultAlign::AddAlignment(const SSamData& alignd) {
193  if(count(alignd.m_seq.begin(),alignd.m_seq.end(),'N') <= m_maxNs) {
194  m_aligns.push_back(CLiteAlign(alignd, m_contigt, m_indel_holder));
195  }
196 }
197 
// Body of an AddAlignment overload that takes an alignment model named `align`.
// The signature line is absent from this listing.
//
// Every exon of `align` becomes a separate CLiteAlign. Corrections
// (FrameShifts) touching the exon ends are trimmed away together with the
// corresponding part of the exon. Exons shorter than 3*m_min_edge after
// trimming, or with more than m_maxNs 'N' characters in their correction
// sequences, are skipped.
199  TInDels all_corrections = align.FrameShifts();
200  for(int i = 0; i < (int)align.Exons().size(); ++i) {
201  TSignedSeqRange exon_lim = align.Exons()[i].Limits();
202  TInDels corrections;
// NOTE(review): the iterator `i` below shadows the exon index `i` of the
// enclosing for loop; the exon index is not used inside this inner loop
203  ITERATE(TInDels, i, all_corrections) {
204  if(i->IntersectingWith(exon_lim.GetFrom(), exon_lim.GetTo()))
205  corrections.push_back(*i);
206  }
// drop corrections that start exactly at the left exon end and move the end inwards
207  while(!corrections.empty() && corrections.front().Loc() == exon_lim.GetFrom()) {
208  exon_lim.SetFrom(corrections.front().InDelEnd());
209  corrections.erase(corrections.begin());
210  }
// same trimming at the right exon end
211  while(!corrections.empty() && corrections.back().InDelEnd() == exon_lim.GetTo()+1) {
212  exon_lim.SetTo(corrections.back().Loc()-1);
213  corrections.pop_back();
214  }
215  if(exon_lim.GetLength() < 3*m_min_edge)
216  continue;
217 
// ns: number of 'N' in correction sequences; errors: summed correction lengths;
// align_len: exon length plus the lengths of deletions
218  int ns = 0;
219  double errors = 0;
220  int align_len = exon_lim.GetLength();
221  ITERATE(TInDels, indl, corrections) {
222  errors += indl->Len();
223  if(indl->IsDeletion())
224  align_len += indl->Len();
225  if(!indl->IsInsertion()) {
226  string s = indl->GetInDelV();
227  ns += count(s.begin(), s.end(), 'N');
228  }
229  }
230  if(ns > m_maxNs)
231  continue;
232 
// convert the corrections to CLiteIndel; a mismatch becomes a pair of entries
233  TLiteInDels indels;
234  ITERATE(TInDels, indl, corrections) {
235  if(indl->IsMismatch()) {
236  indels.push_back(CLiteIndel(indl->Loc(), indl->Len()));
237  indels.push_back(CLiteIndel(indl->InDelEnd(), indl->Len(), indl->GetInDelV()));
238  } else if(indl->IsInsertion()) {
239  indels.push_back(CLiteIndel(indl->Loc(), indl->Len()));
240  } else {
241  indels.push_back(CLiteIndel(indl->Loc(), indl->Len(), indl->GetInDelV()));
242  }
243  }
244 
// NOTE(review): the last constructor argument is named `ident`, and the
// SSamData constructor stores matches/align_len there, but the value passed
// here is errors/align_len - confirm which meaning is intended
245  m_aligns.push_back(CLiteAlign(exon_lim, GroupInDels(indels), m_indel_holder, align.Weight(), errors/align_len));
246  }
247 }
248 
// Body of GetVariationAlignList. The signature line and the lines declaring
// `range` (inside the first loop) and the model `a` are absent from this
// listing. The cross-reference index at the end of the page gives the
// signature as "TAlignModelList GetVariationAlignList(bool correctionsonly)".
//
// Calls Variations() and converts its output into alignment models:
//  - one model per confirmed range, with id "Confirmed:..." or, when
//    correctionsonly is set, "CorrectionData:...";
//  - for every variable range either one model per variant sequence
//    ("Variant:...", weight = read count) or, when correctionsonly is set, a
//    single "CorrectionData:..." model for the selected variant.
// Coordinates in the ids are From+1 / To+1.
250 
251  SMatrix delta(1,1);
252  TAlignModelList aligns;
253 
254  map<TSignedSeqRange,TSIMap> variations;
255  list<TSignedSeqRange> confirmed_ranges;
256  Variations(variations, confirmed_ranges);
257 
258  string contig_acc = m_contig_id;
259 
260  ITERATE(list<TSignedSeqRange>, i, confirmed_ranges) {
// NOTE(review): `range` and `a` used below are declared on lines missing from
// this listing
262  string acc = (correctionsonly ? "CorrectionData:" : "Confirmed:")+contig_acc+":"+NStr::IntToString(range.GetFrom()+1)+":"+NStr::IntToString(range.GetTo()+1);
264  a.AddExon(range);
265  aligns.push_back(CAlignModel(a, a.GetAlignMap()));
266  CRef<CSeq_id> id(new CSeq_id(CSeq_id::e_Local, acc));
267  aligns.back().SetTargetId(*id);
268  }
269 
270  for(map<TSignedSeqRange,TSIMap>::iterator i = variations.begin(); i != variations.end(); ++i) {
271  TSignedSeqRange range = i->first;
272  TSIMap& seq_counts = i->second;
273 
274  if(!correctionsonly) {
// report every variant; num is a running index that becomes part of the id
275  int num = 0;
276  ITERATE(TSIMap, j, seq_counts) {
277  const string& seq = j->first;
278  int count = j->second;
279  string acc = "Variant:"+contig_acc+":"+NStr::IntToString(range.GetFrom()+1)+":"+NStr::IntToString(range.GetTo()+1)+":"+NStr::IntToString(++num)+":"+NStr::IntToString(count);
280 
// the first and last m_word characters are excluded from the global alignment
281  CCigar cigar = GlbAlign(seq.c_str()+m_word, (int)seq.size()-2*m_word, m_contigt.c_str()+range.GetFrom()+m_word, range.GetLength()-2*m_word, 1, 1, delta.matrix); // don't align anchors
283  a.AddExon(range);
284  a.FrameShifts() = cigar.GetInDels(range.GetFrom()+m_word, seq.c_str()+m_word, m_contigt.c_str()+range.GetFrom()+m_word);
285  a.SetWeight(count);
286  aligns.push_back(CAlignModel(a, a.GetAlignMap()));
287  CRef<CSeq_id> id(new CSeq_id(CSeq_id::e_Local, acc));
288  aligns.back().SetTargetId(*id);
289  }
290  } else {
// report one variant only: the one with the smallest Distance() to the contig,
// ties resolved in favour of the larger count
291  CCigar selected_cigar;
292  string selected_seq;
293  int selected_dist = numeric_limits<int>::max();
294  double selected_weight = 0;
295  ITERATE(TSIMap, j, seq_counts) {
296  const string& seq = j->first;
297  int count = j->second;
298  CCigar cigar = GlbAlign(seq.c_str()+m_word, (int)seq.size()-2*m_word, m_contigt.c_str()+range.GetFrom()+m_word, range.GetLength()-2*m_word, 1, 1, delta.matrix); // don't align anchors
299  int dist = cigar.Distance(seq.c_str()+m_word, m_contigt.c_str()+range.GetFrom()+m_word);
300  if(dist < selected_dist || (dist == selected_dist && count > selected_weight)) {
301  selected_cigar = cigar;
302  selected_seq = seq;
303  selected_dist = dist;
304  selected_weight = count;
305  }
306  }
307  string acc = "CorrectionData:" + contig_acc+":"+NStr::IntToString(range.GetFrom()+1)+":"+NStr::IntToString(range.GetTo()+1);
309  a.AddExon(range);
310  a.FrameShifts() = selected_cigar.GetInDels(range.GetFrom()+m_word, selected_seq.c_str()+m_word, m_contigt.c_str()+range.GetFrom()+m_word);
311  a.SetWeight(selected_weight);
312  aligns.push_back(CAlignModel(a, a.GetAlignMap()));
313  CRef<CSeq_id> id(new CSeq_id(CSeq_id::e_Local, acc));
314  aligns.back().SetTargetId(*id);
315  }
316  }
317 
318  return aligns;
319 }
320 
321 
322 #define ENTROPY_LEVEL_FOR_ALIGNS 0.51
323 
// Appends to all_alignsp pointers to the alignments that pass the entropy
// filter, then sorts the vector with AlignsLeftEndFirst.
// NOTE(review): the loop header that declares `al` is absent from this
// listing; presumably it iterates over m_aligns - confirm against the real file.
324 void CMultAlign::SelectAligns(vector<const CLiteAlign*>& all_alignsp) {
325 
327  string read = al->TranscriptSeq(m_contigt);
328  string base = m_contigt.substr(al->Limits().GetFrom(),al->Limits().GetLength());
// skip the alignment when either the read or the contig segment it covers has
// entropy below ENTROPY_LEVEL_FOR_ALIGNS
329  if(min(Entropy(read),Entropy(base)) < ENTROPY_LEVEL_FOR_ALIGNS)
330  continue;
331 
332  all_alignsp.push_back(&(*al));
333  }
334 
335  sort(all_alignsp.begin(),all_alignsp.end(),AlignsLeftEndFirst());
336 }
337 
338 void CMultAlign::PrepareReads(const vector<const CLiteAlign*>& all_alignsp) {
339  for(int ir = 0; ir < (int)all_alignsp.size(); ++ir) {
340  const CLiteAlign* al = all_alignsp[ir];
341  string read = al->TranscriptSeq(m_contigt);
342  m_alignsp.push_back(al);
343  m_reads.push_back(read);
344  m_starts.push_back(al->Limits().GetFrom());
345  }
346 }
347 
// Body of InsertDashesInBase. The signature line and the header of the loop
// over the contig positions are absent from this listing.
//
// Builds the two coordinate maps (m_contig_to_align, m_align_to_contig) and
// rebuilds m_base from m_contigt with '-' characters inserted in front of every
// contig position at which some read has a deletion; the number of dashes is
// the largest deletion length seen at that position.
349  int reads_num = (int)m_reads.size();
350  TSignedSeqPos contig_len = (TSignedSeqPos)m_contigt.size();
351 
// deletion_len: contig position -> largest net deletion length over all reads
352  TIntMap deletion_len;
353  for(int ir = 0; ir < reads_num; ++ir) {
// register every contig position covered by the read; the mapped value is
// replaced with the shifted coordinate further down
354  for(int p = m_alignsp[ir]->Limits().GetFrom(); p <= m_alignsp[ir]->Limits().GetTo(); ++p)
355  m_contig_to_align[p] = p;
356  TLiteInDelsP indels = m_alignsp[ir]->GetInDels();
357  ITERATE(TLiteInDelsP, indl, indels) {
358  if((*indl)->IsDeletion()) {
359  int len = (*indl)->Len();
// a deletion that directly follows an insertion is (part of) a mismatch: only
// the excess over the insertion length needs extra columns
360  if(indl != indels.begin()) {
361  TLiteInDelsP::const_iterator prev = indl-1;
362  if((*prev)->IsInsertion() && (*prev)->Loc()+(*prev)->Len() == (*indl)->Loc()) // mismatch
363  len -= (*prev)->Len();
364  }
365  if(len > 0)
366  deletion_len[(*indl)->Loc()] = max(len, deletion_len[(*indl)->Loc()]);
367  }
368  }
369  }
// shift: number of dash columns inserted so far
370  int shift = 0;
371  TIntMap::iterator del = deletion_len.begin();
// NOTE(review): the loop header declaring `i` is missing from this listing;
// the body writes i->second, which was set in m_contig_to_align above
373  int contigp = i->first;
374  if(del != deletion_len.end() && contigp == del->first) {
375  int del_len = (del++)->second;
// dash columns map to -1 in m_align_to_contig
376  for(int l = 0; l < del_len; ++l)
377  m_align_to_contig[contigp+shift++] = - 1;
378  }
379  int alignp = contigp+shift;
380  i->second = alignp;
381  m_align_to_contig[alignp] = contigp;
382  }
383 
384  int total_deletion_len = 0;
385  ITERATE(TIntMap, i, deletion_len)
386  total_deletion_len += i->second;
387 
// rebuild m_base: contig characters with the dashes placed before the position
// they belong to
388  m_base.reserve(contig_len+total_deletion_len);
389  m_base.clear();
390  for(int p = 0; p < contig_len; ++p) {
391  TIntMap::iterator rslt = deletion_len.find(p);
392  if(rslt != deletion_len.end()) {
393  int n = rslt->second;
394  m_base.insert(m_base.size(), n, '-');
395  }
396  m_base.push_back(m_contigt[p]);
397  }
398 }
399 
// Body of InsertDashesInReads. The signature line is absent from this listing.
//
// Converts m_starts from contig coordinates to coordinates in the dashed
// m_base, then pads every read with '-' so that its length matches the span it
// covers in m_base.
401  int reads_num = (int)m_reads.size();
402 
// walk m_base once, counting dashes, and shift the read starts accordingly;
// r advances monotonically through m_starts
403  int dashes = 0;
404  int r = 0;
405  for(int p = 0; p < (int)m_base.length(); ++p) {
406  if(m_base[p] == '-') {
407  ++dashes;
408  } else {
409  while(r < reads_num && m_starts[r]+dashes == p)
410  m_starts[r++] = p;
411  }
412  }
413 
414  int base_length = (int)m_base.length();
415  for(int ir = 0; ir < reads_num; ++ir) {
416  string& read = m_reads[ir];
417  int start = m_starts[ir];
// nothing to insert when the read already has the length of its span in m_base
418  if(m_contig_to_align[m_alignsp[ir]->Limits().GetTo()]-start+1 == (int)read.size())
419  continue;
420 
// collapse the read's indels into (location, signed length) pairs: deletions
// have positive length, insertions negative; an insertion immediately followed
// by a deletion is merged into the net effect
421  TLiteInDelsP indels = m_alignsp[ir]->GetInDels();
422  list<pair<int,int> > indel_pos_length; // deletion positive length
423  ITERATE(TLiteInDelsP, indl, indels) {
424  if((*indl)->IsDeletion()) {
425  if(!indel_pos_length.empty() && indel_pos_length.back().first-indel_pos_length.back().second == (*indl)->Loc()) { // mismatch
426  _ASSERT(indel_pos_length.back().second < 0);
427  int new_len = indel_pos_length.back().second+(*indl)->Len();
428  if(new_len < 0) { // still insertion
429  indel_pos_length.back().second = new_len;
430  } else if(new_len > 0) { // becomes deletion
431  indel_pos_length.back().first = (*indl)->Loc();
432  indel_pos_length.back().second = new_len;
433  } else { // pure mismatch
434  indel_pos_length.pop_back();
435  }
436  } else {
437  indel_pos_length.push_back(make_pair((*indl)->Loc(), (*indl)->Len()));
438  }
439  } else {
440  indel_pos_length.push_back(make_pair((*indl)->Loc(), -(*indl)->Len()));
441  }
442  }
443 
// walk the read's span in m_base; read.size() grows as dashes are inserted, so
// the loop bound is re-evaluated on every iteration
444  list<pair<int,int> >::iterator indl = indel_pos_length.begin();
445  for(int p = start+1; p < base_length && p < start+(int)read.size(); ) {
446  if(m_base[p] == '-') {
// a run of dashes in m_base: the read needs the run length minus whatever its
// own deletion at this place already provides
447  int len = 1;
448  while(p+len < base_length && m_base[p+len] == '-')
449  ++len;
450  int insertp = p-start;
451  p += len;
452  if(indl != indel_pos_length.end() && indl->second > 0 && m_contig_to_align[indl->first] == p) {
453  len -= indl->second;
454  ++indl;
455  }
456  if(len > 0)
457  read.insert(insertp,len,'-');
458  } else if(indl != indel_pos_length.end() && indl->second < 0 && m_contig_to_align[indl->first] == p) { // left of insertion
// the read lacks the columns covered by the insertion: fill them with dashes
459  int n = m_contig_to_align[indl->first-indl->second-1]-p+1;
460  read.insert(p-start,n,'-');
461  p += n;
462  ++indl;
463  } else {
464  ++p;
465  }
466  }
467  }
468 }
469 
// Body of GetCounts. The signature line is absent from this listing.
//
// For every read, adds the read weight to m_counts[p][c] for each position p of
// its LegitRange(), where c is the read character at that position. Also
// updates m_max_len with the longest read length seen.
471  int reads_num = (int)m_reads.size();
472  // m_counts.resize(m_base.size());
473  for(int ir = 0; ir < reads_num; ++ir) {
// the weight is converted to int after adding 0.5
474  int w = m_alignsp[ir]->Weight()+0.5;
475  const string& read = m_reads[ir];
476  m_max_len = max(m_max_len,(int)read.size());
477  int start = m_starts[ir];
478  TSignedSeqRange legit_range = LegitRange(ir);
479  for(int p = legit_range.GetFrom(); p <= legit_range.GetTo(); ++p) {
// p is a coordinate in m_base; p-start indexes into the read
480  char c = read[p-start];
481  m_counts[p][c] += w;
482  }
483  }
484 }
485 
// Body of PrepareStats. The signature line and two statements between
// PrepareReads() and GetCounts() are absent from this listing.
//
// Selects the usable alignments, reserves room in the per-read vectors, fills
// them with PrepareReads() and finally collects the per-position counts.
487  vector<const CLiteAlign*> all_alignsp;
488  SelectAligns(all_alignsp);
489 
490  int aligns_size = (int)all_alignsp.size();
491  m_reads.reserve(aligns_size);
492  m_starts.reserve(aligns_size);
493  m_alignsp.reserve(aligns_size);
494 
495  PrepareReads(all_alignsp);
// NOTE(review): two lines are missing here; presumably the calls to
// InsertDashesInBase() and InsertDashesInReads() - confirm against the real file
498  GetCounts();
499 }
500 
501 void CMultAlign::SeqCountsBetweenTwoStrongWords(const TSignedSeqRange& prev_strong_word_range, const string& prev_strong_word, const TSignedSeqRange& strong_word_range, const string& strong_word, TSIMap& seq_counts, int& total_cross, int& accepted_cross) {
502 
503  total_cross = 0;
504  accepted_cross = 0;
505  TSignedSeqRange two_word_range(prev_strong_word_range.GetFrom(),strong_word_range.GetTo());
506  TSignedSeqRange weak_range(prev_strong_word_range.GetTo()+1,strong_word_range.GetFrom()-1);
507  int ir_first = (int)(lower_bound(m_starts.begin(),m_starts.end(),two_word_range.GetFrom()-m_max_len)-m_starts.begin());
508  for(int ir = ir_first; ir < (int)m_alignsp.size() && m_starts[ir] <= two_word_range.GetFrom(); ++ir) {
509  int w = m_alignsp[ir]->Weight()+0.5;
510  TSignedSeqRange read_range(m_starts[ir],m_starts[ir]+(int)m_reads[ir].size()-1);
511  if(Include(read_range,two_word_range)) {
512  total_cross += w;
513  if(prev_strong_word == EmitSequenceFromRead(ir, prev_strong_word_range)
514  && strong_word == EmitSequenceFromRead(ir, strong_word_range)) {
515  accepted_cross += w;
516  string read_seq = EmitSequenceFromRead(ir, weak_range);
517  seq_counts[read_seq] += w;
518  }
519  }
520  }
521 }
522 
// Body of LegitRange. The signature line is absent from this listing; the
// cross-reference index at the end of the page gives it as
// "TSignedSeqRange LegitRange(int ir)".
//
// Returns the part of read ir (in m_base coordinates) that remains after
// skipping, from each end, at least m_min_edge non-dash columns of m_base and
// then any further columns where the read has '-' or differs from m_base.
524  const string& read = m_reads[ir];
525  int start = m_starts[ir];
526  int end = start+(int)read.size()-1;
527 
528  int first_legit_match = start;
529  int shift = 0;
// NOTE(review): while shift < m_min_edge the loop advances without checking
// first_legit_match <= end, and m_base[first_legit_match] is read on every
// pass; for a read with fewer than m_min_edge non-dash columns this looks like
// it can run past the read - confirm
530  while(shift < m_min_edge || (first_legit_match <= end && (read[first_legit_match-start] == '-' || read[first_legit_match-start] != m_base[first_legit_match]))) {
// only non-dash columns of m_base count towards the edge
531  if(m_base[first_legit_match] != '-')
532  ++shift;
533  ++first_legit_match;
534  }
// same procedure from the right end, moving leftwards
535  int last_legit_match = end;
536  shift = 0;
537  while(shift < m_min_edge || (last_legit_match >= start && (read[last_legit_match-start] == '-' || read[last_legit_match-start] != m_base[last_legit_match]))) {
538  if(m_base[last_legit_match] != '-')
539  ++shift;
540  --last_legit_match;
541  }
542 
543  return TSignedSeqRange(first_legit_match, last_legit_match);
544 }
545 
546 
// Finds confirmed and variable regions of the contig.
//
// variations       - out: range (contig coordinates) -> variant sequences with
//                    their supporting weight; each range spans the two strong
//                    words that anchor it
// confirmed_ranges - out: contig ranges covered by strong words (and by weak
//                    ranges whose only supported variant equals the contig)
//
// Does nothing when there are no alignments. Otherwise PrepareStats() is run,
// the most frequent character per m_base column is computed, and the columns
// are scanned for consecutive "strong words" (see FindNextStrongWord); the
// sequences reads show between two consecutive strong words become the
// variants.
547 void CMultAlign::Variations(map<TSignedSeqRange,TSIMap>& variations, list<TSignedSeqRange>& confirmed_ranges) {
548 
549  if(m_aligns.empty())
550  return;
551 
552  PrepareStats();
553 
// maximal_bases[p]: the character with the largest count at column p, or '#'
// when the column has no counts or its total is below m_min_coverage
554  string maximal_bases(m_base.size(),'A');
555  for(int p = 0; p < (int)m_base.size(); ++p) {
// NOTE(review): `rslt` is declared on a line missing from this listing;
// presumably the result of m_counts.find(p) - confirm against the real file
557  if(rslt == m_counts.end()) {
558  maximal_bases[p] = '#';
559  continue;
560  }
561 
562  const map<char,int>& count = rslt->second;
// c: most frequent character, w: its count, t: total count at this column
563  char c = 0;
564  int w = 0;
565  int t = 0;
566  for(map<char,int>::const_iterator i = count.begin(); i != count.end(); ++i) {
567  if(i->second > w) {
568  c = i->first;
569  w = i->second;
570  }
571  t += i->second;
572  }
573  if(t < m_min_coverage) {
574  maximal_bases[p] = '#';
575  } else {
576  _ASSERT(c != 0);
577  maximal_bases[p] = c;
578  }
579  }
580 
581  TSignedSeqRange prev_strong_word_range;
582  string prev_strong_word;
583  for(int p = 0; p < (int)maximal_bases.size(); ) {
584  if(maximal_bases[p] == '#') {
// a gap breaks the chain of strong words
585  prev_strong_word_range = TSignedSeqRange::GetEmpty();
586  ++p;
587  } else {
588  string strong_word;
589  TSignedSeqRange strong_word_range;
590  int first_gap;
591  FindNextStrongWord(p, maximal_bases, strong_word, strong_word_range, first_gap);
592 
593  if(strong_word_range.Empty()) // end of contig
594  break;
595 
// the next search starts one column after the start of this word
596  p = strong_word_range.GetFrom()+1;
597 
598  bool same_as_contig = true;
599  for(int pos = strong_word_range.GetFrom(); pos <= strong_word_range.GetTo() && same_as_contig; ++pos)
600  same_as_contig = (maximal_bases[pos] == m_base[pos]);
601 
602  if(!same_as_contig) { // genomic error
603  continue;
604  } else if(first_gap >= 0) { // there is a strong word after a gap
605  prev_strong_word_range = TSignedSeqRange::GetEmpty();
606  }
607 
608  bool there_is_weak_range = prev_strong_word_range.NotEmpty() && prev_strong_word_range.GetTo()+1 < strong_word_range.GetFrom();
609 
610  while(m_align_to_contig[strong_word_range.GetFrom()] < 0) // shift from possible dashes
611  strong_word_range.SetFrom(strong_word_range.GetFrom()+1);
612 
// strong word ends converted to contig coordinates
613  int swordl = m_align_to_contig[strong_word_range.GetFrom()];
614  int swordr = m_align_to_contig[strong_word_range.GetTo()];
615 
616  if(there_is_weak_range) { // there is a weak range
617  TSIMap seq_counts;
618  int total_cross = 0;
619  int accepted_cross = 0;
620  SeqCountsBetweenTwoStrongWords(prev_strong_word_range, prev_strong_word, strong_word_range, strong_word, seq_counts, total_cross, accepted_cross);
621 
622  // cout << "SeqCounts:" << endl;
623 
624  if(!seq_counts.empty()) {
625  TSIMap::const_iterator most_frequent_variant = seq_counts.begin();
626  for(TSIMap::const_iterator i = seq_counts.begin(); i != seq_counts.end(); ++i) {
627 
628  // cout << i->first << " " << i->second << endl;
629 
630  if(i->second > most_frequent_variant->second)
631  most_frequent_variant = i;
632  }
633 
// keep the variants whose support reaches both the absolute threshold and the
// threshold relative to the most frequent variant; the stored sequence
// includes both anchoring words
634  TSIMap var_counts;
635  ITERATE(TSIMap, it, seq_counts) {
636  string var_seq = prev_strong_word+it->first+strong_word;
637  if(it->second >= max(m_min_abs_support_for_variant,(int)(m_min_rel_support_for_variant*most_frequent_variant->second+0.5)))
638  var_counts[var_seq] = it->second;
639  }
640 
641  if(!var_counts.empty()) {
642  int base_posl = m_align_to_contig[prev_strong_word_range.GetFrom()];
643  int base_posr = swordr;
644  TSignedSeqRange var_range(base_posl,base_posr);
645 
646  if(var_counts.size() == 1 && var_counts.begin()->first == m_contigt.substr(base_posl,base_posr-base_posl+1)) { // confirmed weak range
// the single surviving variant equals the contig: extend the last confirmed
// range over the weak range instead of reporting a variation
647  confirmed_ranges.back().SetTo(base_posr);
648  there_is_weak_range = false;
649  } else {
650  map<TSignedSeqRange,TSIMap>::value_type var(var_range,var_counts);
651  variations.insert(var);
652  }
653  }
654  }
655  }
656 
657 
658  prev_strong_word_range = strong_word_range;
659  prev_strong_word = strong_word;
660 
// start a new confirmed range after a weak range or a gap in coverage,
// otherwise extend the last one
661  if(confirmed_ranges.empty() || there_is_weak_range || confirmed_ranges.back().GetTo()+1 < swordl)
662  confirmed_ranges.push_back(TSignedSeqRange(swordl,swordr));
663  else
664  confirmed_ranges.back().SetTo(swordr);
665  }
666  }
667 }
668 
669 string CMultAlign::EmitSequenceFromRead(int ir, const TSignedSeqRange& word_range) {
670  const string& read = m_reads[ir];
671  int start = m_starts[ir];
672  int stop = start+(int)read.size()-1;
673  string read_word;
674  for(int r = max(start,word_range.GetFrom()); r <= min(stop,word_range.GetTo()); ++r) {
675  if(read[r-start] != '-')
676  read_word.push_back(read[r-start]);
677  }
678  return read_word;
679 }
680 
// Body of EmitSequenceFromBase. The signature line is absent from this
// listing; the cross-reference index at the end of the page gives it as
// "string EmitSequenceFromBase(const TSignedSeqRange &word_range)".
//
// Returns the characters of m_base inside word_range with '-' removed.
682  string read_word;
683  for(int r = word_range.GetFrom(); r <= word_range.GetTo(); ++r) {
684  if(m_base[r] != '-')
685  read_word.push_back(m_base[r]);
686  }
687  return read_word;
688 }
689 
690 bool CMultAlign::CheckWord(const TSignedSeqRange& word_range, const string& word) {
691  int total = 0;
692  int match = 0;
693  int ir_first = (int)(lower_bound(m_starts.begin(),m_starts.end(),word_range.GetFrom()-m_max_len)-m_starts.begin());
694  for(int ir = ir_first; ir < (int)m_alignsp.size() && m_starts[ir] <= word_range.GetFrom(); ++ir) {
695  int w = m_alignsp[ir]->Weight()+0.5;
696  TSignedSeqRange legit_range = LegitRange(ir);
697  if(Include(legit_range,word_range)) {
698  total += w;
699  string read_word = EmitSequenceFromRead(ir, word_range);
700  if(word == read_word)
701  match += w;
702  }
703  }
704 
705  if(match > m_strong_consensus*total) {
706  return true;
707  } else {
708  return false;
709  }
710 }
711 
712 int CMultAlign::FindNextStrongWord(int nextp, const string& maximal_bases, string& strong_word, TSignedSeqRange& strong_word_range, int& first_gap) {
713  first_gap = -1;
714  while(nextp < (int)maximal_bases.size()) {
715  string word;
716  int word_start = nextp;
717  int word_end = nextp;
718  bool low_complexity = false;
719  for( ; word_end < (int)maximal_bases.size() && (int)word.size() < m_word && maximal_bases[word_end] != '#'; ++word_end) {
720  if(maximal_bases[word_end] != '-') {
721  word.push_back(toupper(maximal_bases[word_end]));
722  if(islower(maximal_bases[word_end]))
723  low_complexity = true;
724  }
725  }
726 
727  if((int)word.size() < m_word) {
728  if(maximal_bases[word_end] == '#') { // we reached a gap
729  if(first_gap < 0) first_gap = word_end;
730  nextp = word_end+1;
731  continue;
732  } else { // we reached the end
733  return word_end;
734  }
735  }
736 
737  TSignedSeqRange word_range(word_start,word_end-1);
738  if(!low_complexity && CheckWord(word_range,word)) {
739  strong_word = word;
740  strong_word_range = word_range;
741  return word_end;
742  }
743  ++nextp;
744  }
745 
746  return nextp;
747 }
748 
// Body of SetupArgDescriptions. The signature line is absent from this
// listing; the cross-reference index at the end of the page gives it as
// "static void SetupArgDescriptions(CArgDescriptions *arg_desc)".
//
// Registers the command-line keys read by ProcessArgs(), each with a default
// value.
750  arg_desc->AddDefaultKey("word", "word", "Elementary size for anchor regions", CArgDescriptions::eInteger, "8");
751  arg_desc->AddDefaultKey("min_edge", "min_edge", "This many bases are ignored on both side of alignment", CArgDescriptions::eInteger, "5");
752  arg_desc->AddDefaultKey("min_coverage", "min_coverage", "Minimal coverage on contig", CArgDescriptions::eInteger, "3");
753  arg_desc->AddDefaultKey("maxNs", "maxNs", "Maximal number of Ns allowed per alignment", CArgDescriptions::eInteger, "2");
754  arg_desc->AddDefaultKey("min_abs_support", "min_abs_support", "Minimal number of reads to support a variant", CArgDescriptions::eInteger, "3");
755  arg_desc->AddDefaultKey("min_rel_support", "min_rel_support", "Minimal fraction of reads to support a variant", CArgDescriptions::eDouble, "0.075");
// NOTE(review): "alignmnets" in the help text below is a typo for "alignments"
756  arg_desc->AddDefaultKey("strong_consensus", "strong_consensus", "Minimal fraction of alignmnets to support a strong word", CArgDescriptions::eDouble, "0.85");
757 }
758 
759 void CMultAlign::ProcessArgs(const CArgs& args) {
760 
761  m_word = args["word"].AsInteger();
762  m_min_edge = args["min_edge"].AsInteger();
763  m_min_coverage = args["min_coverage"].AsInteger();
764  m_maxNs = args["maxNs"].AsInteger();
765  m_min_rel_support_for_variant = args["min_rel_support"].AsDouble();
766  m_min_abs_support_for_variant = args["min_abs_support"].AsInteger();
767  m_strong_consensus = args["strong_consensus"].AsDouble();
768 }
769 
// Body of SetDefaultParams. The signature line and two statements are absent
// from this listing.
//
// Assigns built-in values to the tuning parameters; the values visible here
// equal the defaults registered in SetupArgDescriptions().
771  m_word = 8;
772  m_min_edge = 5;
773  m_min_coverage = 3;
774  m_maxNs = 2;
// NOTE(review): two lines are missing here; presumably the assignments of
// m_min_rel_support_for_variant and m_min_abs_support_for_variant - confirm
777  m_strong_consensus = 0.85;
778 }
779 
780 
781 END_SCOPE(gnomon)
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
CBioseq_Handle –.
int Distance(const char *query, const char *subject) const
Definition: glb_align.cpp:245
TInDels GetInDels(int sstart, const char *const query, const char *subject) const
Definition: glb_align.cpp:119
double Weight() const
const TExons & Exons() const
TInDels & FrameShifts()
TSignedSeqRange Limits() const
Definition: variations.hpp:57
CLiteAlign(TSignedSeqRange range, const TLiteInDels &indels, set< CLiteIndel > &indel_holder, double weight, double ident)
Definition: variations.cpp:58
double m_ident
Definition: variations.hpp:65
TSignedSeqRange m_range
Definition: variations.hpp:66
TLiteInDelsP m_indels
Definition: variations.hpp:67
string TranscriptSeq(const string &contig) const
Definition: variations.cpp:149
void InsertDashesInBase()
Definition: variations.cpp:348
vector< string > m_reads
Definition: variations.hpp:131
void InsertDashesInReads()
Definition: variations.cpp:400
static void SetupArgDescriptions(CArgDescriptions *arg_desc)
Definition: variations.cpp:749
int m_min_abs_support_for_variant
Definition: variations.hpp:152
bool CheckWord(const TSignedSeqRange &word_range, const string &word)
Definition: variations.cpp:690
void GetCounts()
Definition: variations.cpp:470
void SelectAligns(vector< const CLiteAlign * > &all_alignsp)
Definition: variations.cpp:324
string EmitSequenceFromBase(const TSignedSeqRange &word_range)
Definition: variations.cpp:681
set< CLiteIndel > m_indel_holder
Definition: variations.hpp:140
string m_contig_id
Definition: variations.hpp:144
string m_contigt
Definition: variations.hpp:143
void ProcessArgs(const CArgs &args)
Definition: variations.cpp:759
int m_min_coverage
Definition: variations.hpp:148
map< int, TCharIntMap > m_counts
Definition: variations.hpp:138
void AddAlignment(const SSamData &align)
Definition: variations.cpp:192
TAlignModelList GetVariationAlignList(bool correctionsonly)
Definition: variations.cpp:249
string m_base
Definition: variations.hpp:142
TSignedSeqRange LegitRange(int ir)
Definition: variations.cpp:523
void SetGenomic(const CConstRef< CSeq_id > &seqid, CScope &scope)
Definition: variations.cpp:171
vector< const CLiteAlign * > m_alignsp
Definition: variations.hpp:133
TIntMap m_contig_to_align
Definition: variations.hpp:135
void SetDefaultParams()
Definition: variations.cpp:770
double m_min_rel_support_for_variant
Definition: variations.hpp:151
void PrepareStats()
Definition: variations.cpp:486
int FindNextStrongWord(int nextp, const string &maximal_bases, string &strong_word, TSignedSeqRange &strong_word_range, int &first_gap)
Definition: variations.cpp:712
string EmitSequenceFromRead(int ir, const TSignedSeqRange &word_range)
Definition: variations.cpp:669
void SeqCountsBetweenTwoStrongWords(const TSignedSeqRange &prev_strong_word_range, const string &prev_strong_word, const TSignedSeqRange &strong_word_range, const string &strong_word, TSIMap &seq_counts, int &total_cross, int &accepted_cross)
Definition: variations.cpp:501
void Variations(map< TSignedSeqRange, TSIMap > &variations, list< TSignedSeqRange > &confirmed_ranges)
Definition: variations.cpp:547
TLiteAlignList m_aligns
Definition: variations.hpp:139
double m_strong_consensus
Definition: variations.hpp:153
vector< int > m_starts
Definition: variations.hpp:132
void PrepareReads(const vector< const CLiteAlign * > &all_alignsp)
Definition: variations.cpp:338
TIntMap m_align_to_contig
Definition: variations.hpp:136
CScope –.
Definition: scope.hpp:92
CSeqVector –.
Definition: seq_vector.hpp:65
size_type size() const
Definition: map.hpp:148
container_type::const_iterator const_iterator
Definition: map.hpp:53
container_type::iterator iterator
Definition: map.hpp:54
const_iterator begin() const
Definition: map.hpp:151
const_iterator end() const
Definition: map.hpp:152
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
bool empty() const
Definition: map.hpp:149
container_type::value_type value_type
Definition: map.hpp:52
void clear()
Definition: map.hpp:169
const_iterator find(const key_type &key) const
Definition: map.hpp:153
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
void clear()
Definition: set.hpp:153
static int base_length[29]
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
double Entropy(const string &seq)
Definition: glb_align.cpp:866
CCigar GlbAlign(const char *query, int querylen, const char *subject, int subjectlen, int gopen, int gapextend, const char delta[256][256])
Definition: glb_align.cpp:293
list< CAlignModel > TAlignModelList
@ ePlus
bool Include(TSignedSeqRange big, TSignedSeqRange small)
vector< CInDelInfo > TInDels
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
void AddDefaultKey(const string &name, const string &synopsis, const string &comment, EType type, const string &default_value, TFlags flags=0, const string &env_var=kEmptyStr, const char *display_value=nullptr)
Add description for optional key with default value.
Definition: ncbiargs.cpp:2442
@ eDouble
Convertible into a floating point number (double)
Definition: ncbiargs.hpp:594
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
Definition: Seq_id.cpp:2145
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
TSeqPos size(void) const
Definition: seq_vector.hpp:291
position_type GetLength(void) const
Definition: range.hpp:158
bool NotEmpty(void) const
Definition: range.hpp:152
static TThisType GetEmpty(void)
Definition: range.hpp:306
bool Empty(void) const
Definition: range.hpp:148
CRange< TSignedSeqPos > TSignedSeqRange
Definition: range.hpp:420
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5078
void SetFrom(TFrom value)
Assign a value to From data member.
Definition: Range_.hpp:231
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
void SetTo(TTo value)
Assign a value to To data member.
Definition: Range_.hpp:278
@ e_Local
local use
Definition: Seq_id_.hpp:95
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
n font weight
int i
yy_size_t n
int len
range(_Ty, _Ty) -> range< _Ty >
constexpr auto sort(_Init &&init)
const struct ncbi::grid::netcache::search::fields::SIZE size
unsigned int a
Definition: ncbi_localip.c:102
EIPRangeType t
Definition: ncbi_localip.c:101
int toupper(Uchar c)
Definition: ncbictype.hpp:73
int islower(Uchar c)
Definition: ncbictype.hpp:66
T max(T x_, T y_)
T min(T x_, T y_)
Int4 delta(size_t dimension_, const Int4 *score_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
#define Loc
static int match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket, PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb)
Definition: pcre2_match.c:594
#define count
static SLJIT_INLINE sljit_ins l(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
string m_cigar
Definition: variations.hpp:46
int m_contigp
Definition: variations.hpp:45
string m_seq
Definition: variations.hpp:47
#define _ASSERT
TLiteInDels GroupInDels(const TLiteInDels &sam_indels)
Definition: variations.cpp:14
#define ENTROPY_LEVEL_FOR_ALIGNS
Definition: variations.cpp:322
vector< const CLiteIndel * > TLiteInDelsP
Definition: variations.hpp:41
vector< CLiteIndel > TLiteInDels
Definition: variations.hpp:40
list< CLiteAlign > TLiteAlignList
Definition: variations.hpp:70
#define const
Definition: zconf.h:232
Modified on Fri Sep 20 14:57:21 2024 by modify_doxy.py rev. 669887