NCBI C++ ToolKit
exon_selector.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: exon_selector.cpp 100237 2023-07-11 16:24:15Z mozese2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information *
6  * This software/database is a "United States Government Work" under the
7  * terms of the United States Copyright Act. It was written as part of
8  * the author's official duties as a United States Government employee and
9  * thus cannot be copyrighted. This software/database is freely available
10  * to the public for use. The National Library of Medicine and the U.S.
11  * Government have not placed any restriction on its use or reproduction.
12  *
13  * Although all reasonable efforts have been taken to ensure the accuracy
14  * and reliability of the software and data, the NLM and the U.S.
15  * Government do not and cannot warrant the performance or results that
16  * may be obtained by using this software or data. The NLM and the U.S.
17  * Government disclaim all warranties, express or implied, including
18  * warranties of performance, merchantability or fitness for any particular
19  * purpose.
20  *
21  * Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  * Authors: Alexandre Souvorov
26  *
27  * File Description:
28  * Test application for selecting exon chains in SAM alignments
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbiapp.hpp>
34 #include <corelib/ncbienv.hpp>
35 #include <corelib/ncbiargs.hpp>
36 #include <math.h>
37 
38 #include <array>
39 
41 
43 {
44 public:
45  virtual void Init();
46  virtual int Run();
47 
48  typedef vector<string> TSamFields;
49  typedef list<TSamFields> TSplitAligns;
50  typedef list<pair<int, string>> TMDTag;
51  typedef list<pair<int, char>> TCigar;
52  typedef vector<char> TScript;
53  template<typename T>
55 
57  struct Exon {
58  int m_qfrom = 0;
59  int m_qto = 0;
60  int m_sfrom = 0;
61  int m_sto = 0;
62  int m_matches = 0;
63  int m_mismatches = 0;
64  int m_indels = 0;
65  int m_align_len = 0;
66  int m_score = 0;
67 
68  string m_tstrand;
72 
73  void RemoveLeftIndels(int gapopen1, int gapextend1, int gapopen2, int gapextend2) {
74  while(!m_script.empty() && (m_script.front() == 'D' || m_script.front() == 'I'))
75  m_script.erase(m_script.begin());
76  m_align_len = m_script.size();
77  while(!m_md.empty() && m_md.front().first < 0)
78  m_md.pop_front();
79  while(!m_cigar.empty() && (m_cigar.front().second == 'D' || m_cigar.front().second == 'I')) {
80  --m_indels;
81  int len = m_cigar.front().first;
82  m_score += min(gapopen1+gapextend1*len, gapopen2+gapextend2*len);
83  if(m_cigar.front().second == 'D')
84  m_sfrom += len;
85  else
86  m_qfrom += len;
87  m_cigar.pop_front();
88  }
89  }
90  void RemoveRightIndels(int gapopen1, int gapextend1, int gapopen2, int gapextend2) {
91  while(!m_script.empty() && (m_script.back() == 'D' || m_script.back() == 'I'))
92  m_script.pop_back();
93  m_align_len = m_script.size();
94  while(!m_md.empty() && m_md.back().first < 0)
95  m_md.pop_back();
96  while(!m_cigar.empty() && (m_cigar.back().second == 'D' || m_cigar.back().second == 'I')) {
97  --m_indels;
98  int len = m_cigar.back().first;
99  m_score += min(gapopen1+gapextend1*len, gapopen2+gapextend2*len);
100  if(m_cigar.back().second == 'D')
101  m_sto -= len;
102  else
103  m_qto -= len;
104  m_cigar.pop_back();
105  }
106  }
107 
109  int m_chain_score = 0;
110  int m_intron_len = 0;
111  Exon* m_prevp = nullptr;
112  };
113  typedef vector<Exon> TExons;
114 
117  string m_read;
118  string m_rseq;
119  int m_score = 0;
120  pair<int, int> m_range;
121  string m_cigar;
122  string m_md;
123  string m_tstrand;
124  int m_dist = 0;
125 
126  bool m_above_thresholds = false;
127  bool m_paired_read = false;
128  AlignCompartment* m_matep = nullptr; // if other mate was found
129  };
130  typedef list<AlignCompartment> TAlignCompartments;
131 
132  Exon GetNextExon(string& cigar_string, string& MD_tag, int qfrom, int sfrom);
133  void ExtractExonsFromSAM(const TSamFields& tags, TExons& exons);
134  void ChainExons();
135  void ConnectPairs();
136  bool CompatiblePair(const AlignCompartment& left, const AlignCompartment& right);
137  void SelectBestLocations();
138  void FormatResults();
139  void FormatSingle(AlignCompartment& compart, const string& contig, int mate, int strand, int count, int index);
140  void FormatPaired(AlignCompartment& compart, const string& contig, int mate, int strand, int count, int index, bool left);
141 
142  int m_matchscore = 1;
144  int m_gapopen1 = 2;
145  int m_gapopen2 = 32;
146  int m_gapextend1 = 1;
147  int m_gapextend2 = 0;
148 
151  double m_penalty;
154 
155  double m_min_entropy = 0.65;
156 
157  bool m_remove_repats = false;
158  bool m_read_through = false;
159  map<string, TMatrix<TSplitAligns>> m_read_aligns; // [contig][strand] list of SAM fields
161 };
162 
163 CExonSelectorApplication::Exon CExonSelectorApplication::GetNextExon(string& cigar_string, string& MD_tag, int qfrom, int sfrom) {
164  Exon e;
165  e.m_qfrom = qfrom;
166  e.m_sfrom = sfrom;
167  e.m_qto = e.m_qfrom-1;
168  e.m_sto = e.m_sfrom-1;
169  while(!cigar_string.empty()) {
170  size_t letter;
171  int len = stoi(cigar_string, &letter);
172  char c = cigar_string[letter];
173 
174  // capture exon portion of cigar
175  if(c != 'S' && c != 'N')
176  e.m_cigar.emplace_back(len, c);
177  // clip used cigar; keep intron for next exon
178  if(c != 'N' || e.m_align_len == 0)
179  cigar_string.erase(0, letter+1);
180 
181  if(c == 'S') {
182  if(e.m_align_len == 0) { //new query start
183  e.m_qfrom += len;
184  e.m_qto = e.m_qfrom-1;
185  }
186  } else if(c == 'N') {
187  if(e.m_align_len == 0) { // new subject start
188  e.m_sfrom += len;
189  e.m_sto = e.m_sfrom-1;
190  } else { // end of exon
192  return e;
193  }
194  } else if(c == 'M' || c == '=' || c =='X') {
195  e.m_align_len += len;
196  e.m_qto += len;
197  e.m_sto += len;
198 
199  while(len > 0) {
200  if(isdigit(MD_tag.front())) {
201  size_t idx;
202  int m = stoi(MD_tag, &idx);
203  if(e.m_md.empty() || e.m_md.back().first <= 0)
204  e.m_md.emplace_back(min(m,len), "");
205  else
206  e.m_md.back().first += min(m,len);
207  MD_tag.erase(0, idx); // remove matches
208  if(m > len) { // matches split between exons
209  MD_tag = to_string(m-len)+MD_tag; // return remaining matches
210  e.m_matches += len;
211  e.m_script.insert(e.m_script.end(), len, '=');
212  len = 0;
213  } else {
214  e.m_matches += m;
215  len -= m;
216  e.m_script.insert(e.m_script.end(), m, '=');
217  }
218  } else {
219  if(e.m_md.empty() || e.m_md.back().first != 0)
220  e.m_md.emplace_back(0, MD_tag.substr(0, 1));
221  else
222  e.m_md.back().second += MD_tag.front();
223  ++e.m_mismatches;
224  e.m_script.push_back('X');
225  --len;
226  MD_tag.erase(0, 1);
227  }
228  }
229  } else if(c == 'I') {
230  e.m_align_len += len;
231  e.m_script.insert(e.m_script.end(), len, 'I');
232  e.m_qto += len;
233  ++e.m_indels;
235  } else if(c == 'D') {
236  e.m_align_len += len;
237  e.m_script.insert(e.m_script.end(), len, 'D');
238  e.m_sto += len;
239  ++e.m_indels;
240  string del;
241  if(MD_tag.front() == '^') {
242  del = MD_tag.substr(1, len);
243  MD_tag.erase(0, len+1);
244  } else { // in case deletion was split between exons
245  del = MD_tag.substr(0, len);
246  MD_tag.erase(0, len);
247  }
248  if(e.m_md.empty() || e.m_md.back().first >= 0)
249  e.m_md.emplace_back(-1, del);
250  else
251  e.m_md.back().second += del;
253  } else {
254  cerr << "Unexpected symbol in CIGAR" << endl;
255  exit(1);
256  }
257  }
259  return e;
260 }
261 
262 double Entropy(const string& seq) {
263  int length = seq.size();
264  if(length == 0)
265  return 0;
266  double tA = 1.e-8;
267  double tC = 1.e-8;
268  double tG = 1.e-8;
269  double tT = 1.e-8;
270  ITERATE(string, i, seq) {
271  switch(*i) {
272  case 'A': tA += 1; break;
273  case 'C': tC += 1; break;
274  case 'G': tG += 1; break;
275  case 'T': tT += 1; break;
276  default: break;
277  }
278  }
279  double entropy = -(tA*log(tA/length)+tC*log(tC/length)+tG*log(tG/length)+tT*log(tT/length))/(length*log(4.));
280 
281  return entropy;
282 }
283 
285  TExons align_exons;
286  string MD_tag;
287  string tstrand;
288  for(int i = 11; i < (int)tags.size() && (MD_tag.empty() || tstrand.empty()); ++i) {
289  auto& tag = tags[i];
290  vector<string> fields;
291  NStr::Split(tag, ":", fields, 0);
292  if(fields.size() != 3)
293  continue;
294  if(fields[0] == "MD" && fields[1] == "Z")
295  MD_tag = fields[2];
296  if((fields[0] == "ts" || fields[0] == "TS" || fields[0] == "XS") && fields[1] == "A")
297  tstrand = tag;
298  }
299  if(MD_tag.empty()) {
300  cerr << "MD:Z tag in SAM file is expected" << endl;
301  exit(1);
302  }
303 
304  // extract exons
305  string cigar_string = tags[5];
306  int qfrom = 0;
307  int sfrom = std::stoi(tags[3])-1;
308  while(!cigar_string.empty()) {
309  align_exons.push_back(GetNextExon(cigar_string, MD_tag, qfrom, sfrom));
310  align_exons.back().m_tstrand = tstrand;
311  qfrom = align_exons.back().m_qto+1;
312  sfrom = align_exons.back().m_sto+1;
313  }
314 
315  // remove low complexity exons
316  const string& seq = tags[9];
317  while(!align_exons.empty() && Entropy(seq.substr(align_exons.front().m_qfrom, align_exons.front().m_qto-align_exons.front().m_qfrom+1)) < m_min_entropy)
318  align_exons.erase(align_exons.begin());
319  while(!align_exons.empty() && Entropy(seq.substr(align_exons.back().m_qfrom, align_exons.back().m_qto-align_exons.back().m_qfrom+1)) < m_min_entropy)
320  align_exons.pop_back();
321 
322  exons.insert(exons.end(), align_exons.begin(), align_exons.end());
323 }
324 
325 string Tag(const string& name, int value) {
326  return name+":i:"+to_string(value);
327 }
328 string Tag(const string& name, const string& value) {
329  return name+":Z:"+value;
330 }
331 string Tag(const string& name, char value) {
332  return name+":A:"+value;
333 }
334 
335 void CExonSelectorApplication::FormatPaired(AlignCompartment& compart, const string& contig, int mate, int strand, int count, int index, bool left) {
336  TSamFields fields(11);
337  fields[0] = compart.m_read;
338  int flag = 1+2+strand*16+(1-strand)*32+(mate+1)*64;
339  fields[1] = to_string(flag);
340  fields[2] = contig;
341  fields[3] = to_string(compart.m_range.first+1);
342  fields[4] = "0";
343  fields[5] = compart.m_cigar;
344  fields[6] = "=";
345  fields[7] = to_string(compart.m_matep->m_range.first+1);
346  int span = max(compart.m_range.second, compart.m_matep->m_range.second)-min(compart.m_range.first, compart.m_matep->m_range.first)+1;
347  if(!left)
348  span = -span;
349  fields[8] = to_string(span);
350  fields[9] = compart.m_rseq;
351  fields[10] = "*";
352  fields.push_back(Tag("NM", compart.m_dist));
353  fields.push_back(Tag("AS", compart.m_score+compart.m_matep->m_score));
354  if(!compart.m_tstrand.empty())
355  fields.push_back(compart.m_tstrand);
356  fields.push_back(Tag("MD", compart.m_md));
357  fields.push_back(Tag("NH", count));
358  fields.push_back(Tag("HI", index));
359  fields.push_back(Tag("MC", compart.m_matep->m_cigar));
360 
361  cout << fields[0];
362  for(unsigned i = 1; i < fields.size(); ++i)
363  cout << "\t" << fields[i];
364  cout << endl;
365 }
366 
367 void CExonSelectorApplication::FormatSingle(AlignCompartment& compart, const string& contig, int mate, int strand, int count, int index) {
368  TSamFields fields(11);
369  fields[0] = compart.m_read;
370  int flag = strand*16;
371  if(compart.m_paired_read) // paired ends (mates not connected)
372  flag += 1+8+(mate+1)*64; // if mates not connected the other mate reflected as 'not aligned' regrdless
373  fields[1] = to_string(flag);
374  fields[2] = contig;
375  fields[3] = to_string(compart.m_range.first+1);
376  fields[4] = "0";
377  fields[5] = compart.m_cigar;
378  fields[6] = "*";
379  fields[7] = "0";
380  fields[8] = "0";
381  fields[9] = compart.m_rseq;
382  fields[10] = "*";
383 
384  fields.push_back(Tag("NM", compart.m_dist));
385  fields.push_back(Tag("AS", compart.m_score));
386  if(!compart.m_tstrand.empty())
387  fields.push_back(compart.m_tstrand);
388  fields.push_back(Tag("MD", compart.m_md));
389  fields.push_back(Tag("NH", count));
390  fields.push_back(Tag("HI", index));
391 
392  cout << fields[0];
393  for(unsigned i = 1; i < fields.size(); ++i)
394  cout << "\t" << fields[i];
395  cout << endl;
396 }
397 
399  bool paired = false; // has mates, and they are connected
400  array<int, 2> mate_count = {0, 0};
401  for(auto& contig_aligns : m_compartments) {
402  for(int mate = 0; mate < 2; ++mate) {
403  for(int strand = 0; strand < 2; ++strand) {
404  TAlignCompartments& compact_aligns = m_compartments[contig_aligns.first][mate][strand];
405  for(auto& compart : compact_aligns) {
406  ++mate_count[mate];
407  if(compart.m_matep != nullptr)
408  paired = true;
409  }
410  }
411  }
412  }
413 
414  if(paired) {
415  int index = 0;
416  for(auto& contig_aligns : m_compartments) {
417  auto& contig = contig_aligns.first;
418  for(int mate = 0; mate < 2; ++mate) {
419  int strand = 0;
420  auto& left_aligns = contig_aligns.second[mate][strand];
421  for(auto& compart : left_aligns) {
422  FormatPaired(compart, contig, mate, strand, mate_count[0], ++index, true);
423  FormatPaired(*compart.m_matep, contig, 1-mate, 1-strand, mate_count[0], index, false);
424  }
425  }
426  }
427  } else {
428  array<int, 2> mate_index = {0, 0};
429  for(auto& contig_aligns : m_compartments) {
430  auto& contig = contig_aligns.first;
431  for(int mate = 0; mate < 2; ++mate) {
432  for(int strand = 0; strand < 2; ++strand) {
433  TAlignCompartments& compact_aligns = m_compartments[contig_aligns.first][mate][strand];
434  for(auto& compart : compact_aligns)
435  FormatSingle(compart, contig, mate, strand, mate_count[mate], ++mate_index[mate]);
436  }
437  }
438  }
439  }
440 }
441 
443  int pair_score = numeric_limits<int>::min();
445  for(auto& contig_aligns : m_compartments) {
446  for(int mate = 0; mate < 2; ++mate) {
447  for(int strand = 0; strand < 2; ++strand) {
448  TAlignCompartments& compact_aligns = contig_aligns.second[mate][strand];
449  for(auto& align : compact_aligns) {
450  mate_score[mate] = max(mate_score[mate], align.m_score);
451  if(align.m_matep != nullptr)
452  pair_score = max(pair_score, align.m_score+align.m_matep->m_score);
453  }
454  compact_aligns.remove_if([](const AlignCompartment& a){ return !a.m_above_thresholds; }); // remove after best score found; connected mates are above thresholds and will not be broken
455  }
456  }
457  }
458 
459  bool paired = pair_score >= max(mate_score[0], mate_score[1]);
460  for(auto& contig_aligns : m_compartments) {
461  for(int mate = 0; mate < 2; ++mate) {
462  for(int strand = 0; strand < 2; ++strand) {
463  TAlignCompartments& compact_aligns = contig_aligns.second[mate][strand];
464  if(paired)
465  compact_aligns.remove_if([&](const AlignCompartment& a){
466  if(a.m_matep == nullptr) return true;
467  if(a.m_score+a.m_matep->m_score == pair_score) return false;
468  a.m_matep->m_matep = nullptr; return true; });
469  else
470  compact_aligns.remove_if([&](AlignCompartment& a){ a.m_matep = nullptr; return a.m_score != mate_score[mate]; });
471  }
472  }
473  }
474 }
475 
477  if(!left.m_above_thresholds || !right.m_above_thresholds)
478  return false;
479  if(right.m_range.first < left.m_range.first)
480  return false; // sticks out on the left
481  int gap = right.m_range.first-left.m_range.second-1;
482  if(gap > m_max_intron) // too far
483  return false;
484  if(gap >= 0) // no overlap
485  return true;
486 
487  // check if introns same in overlapping region
488  vector<pair<int,int>> left_introns;
489  vector<pair<int,int>> right_introns;
490  // a small not spliced portion is allowed
491  for(unsigned i = 1; i < left.m_exons.size(); ++i) {
492  if(left.m_exons[i].m_sfrom > right.m_range.first+5)
493  left_introns.emplace_back(left.m_exons[i-1].m_sto+1, left.m_exons[i].m_sfrom-1);
494  }
495  for(unsigned i = 1; i < right.m_exons.size(); ++i) {
496  if(right.m_exons[i-1].m_sto < left.m_range.second-5)
497  right_introns.emplace_back(right.m_exons[i-1].m_sto+1, right.m_exons[i].m_sfrom-1);
498  }
499  return left_introns == right_introns;
500 }
501 
503  for(auto& contig_aligns : m_compartments) {
504  for(int left_mate = 0; left_mate < 2; ++left_mate) {
505  int right_mate = 1-left_mate;
506  int left_strand = 0;
507  int right_strand = 1;
508  auto& left_aligns = contig_aligns.second[left_mate][left_strand];
509  auto& right_aligns = contig_aligns.second[right_mate][right_strand];
510 
511  auto iright = right_aligns.begin();
512  for(auto ileft = left_aligns.begin(); ileft != left_aligns.end() && iright != right_aligns.end(); ++ileft) {
513  while(iright != right_aligns.end() && iright->m_range.second < ileft->m_range.second)
514  ++iright;
515  if(iright == right_aligns.end())
516  break;
517  auto inext = next(ileft);
518  if(inext != left_aligns.end() && inext->m_range.second < iright->m_range.second)
519  continue;
520 
521  // connect compatible mates
522  if(CompatiblePair(*ileft, *iright)) {
523  ileft->m_matep = &(*iright);
524  iright->m_matep = &(*ileft);
525  }
526  ++iright;
527  }
528  }
529  }
530 }
531 
532 
534  m_read_through = false;
535  for(auto& contig_aligns : m_read_aligns) {
536  auto& contig = contig_aligns.first;
537  for(int mate = 0; mate < 2; ++mate) {
538  for(int strand = 0; strand < 2; ++strand) {
539  TSplitAligns& orig_aligns = contig_aligns.second[mate][strand];
540  if(orig_aligns.empty())
541  continue;
542  string& read = orig_aligns.front()[0];
543  bool paired_read = stoi(orig_aligns.front()[1])&1;
544  string& rseq = orig_aligns.front()[9];
545  int read_len = rseq.size();
546 
547  // find exons
548  TExons exons;
549  for(auto& oa : orig_aligns)
550  ExtractExonsFromSAM(oa, exons);
551 
552  if(exons.empty())
553  continue;
554 
555  sort(exons.begin(), exons.end(), [](const Exon& e1, const Exon& e2){
556  if(e1.m_sto != e2.m_sto)
557  return e1.m_sto > e2.m_sto;
558  if(e1.m_sfrom != e2.m_sfrom)
559  return e1.m_sfrom < e2.m_sfrom;
560  if(e1.m_qto != e2.m_qto)
561  return e1.m_qto > e2.m_qto;
562  if(e1.m_qfrom != e2.m_qfrom)
563  return e1.m_qfrom < e2.m_qfrom;
564  if(e1.m_score != e2.m_score)
565  return e1.m_score > e2.m_score;
566  return e1.m_cigar < e2.m_cigar; });
567 
568  // remove identical exons
569  auto tail = unique(exons.begin(), exons.end(), [](const Exon& e1, const Exon& e2){
570  return e1.m_sfrom == e2.m_sfrom && e1.m_sto == e2.m_sto && e1.m_qfrom == e2.m_qfrom && e1.m_qto == e2.m_qto; });
571  exons.erase(tail, exons.end());
572 
573  // remove shorter exons with the same right end
574  for(auto i = exons.begin(); i != exons.end(); ++i) {
575  if(i->m_type == eBogus || i->m_qfrom > 10)
576  continue;
577  for(auto j = next(i); j != exons.end() && j->m_sto == i->m_sto && j->m_qto == i->m_qto; ++j)
578  j->m_type = eBogus;
579  }
580  exons.erase(remove_if(exons.begin(), exons.end(), [](Exon& e){ return e.m_type == eBogus; }), exons.end());
581 
582  sort(exons.begin(), exons.end(), [](const Exon& e1, const Exon& e2){
583  if(e1.m_sfrom != e2.m_sfrom)
584  return e1.m_sfrom < e2.m_sfrom;
585  if(e1.m_sto != e2.m_sto)
586  return e1.m_sto > e2.m_sto;
587  if(e1.m_qfrom != e2.m_qfrom)
588  return e1.m_qfrom < e2.m_qfrom;
589  return e1.m_qto > e2.m_qto; });
590 
591  // remove shorter exons with the same left end
592  for(auto i = exons.begin(); i != exons.end(); ++i) {
593  if(i->m_type == eBogus || read_len-i->m_qto-1 > 10)
594  continue;
595  for(auto j = next(i); j != exons.end() && j->m_sfrom == i->m_sfrom && j->m_qfrom == i->m_qfrom; ++j)
596  j->m_type = eBogus;
597  }
598  exons.erase(remove_if(exons.begin(), exons.end(), [](Exon& e){ return e.m_type == eBogus; }), exons.end());
599 
600  // find repeats
601  if(m_remove_repats) {
602  for(auto i = exons.begin(); i != exons.end() && !m_read_through; ++i) {
603  for(auto j = next(i); j != exons.end() && j->m_sfrom <= i->m_sto && !m_read_through; ++j) {
604  int soverlap = i->m_sto-j->m_sfrom+1;
605  int slen = min(i->m_sto-i->m_sfrom+1, j->m_sto-j->m_sfrom+1);
606  int qoverlap = i->m_qto-j->m_qfrom+1;
607  int qlen = min(i->m_qto-i->m_qfrom+1, j->m_qto-j->m_qfrom+1);
608  if(soverlap > 0.5*slen && qoverlap < 0.1*qlen)
609  m_read_through = true;
610  }
611  }
612  }
613 
614  int matches = 0;
615  int align_len = 0;
616  int qmin = numeric_limits<int>::max();
617  int qmax = 0;
618  for(auto& e : exons) {
619  matches += e.m_matches;
620  align_len += e.m_align_len;
621  qmin = min(qmin, e.m_qfrom);
622  qmax = max(qmax, e.m_qto);
623  }
624  double ident = double(matches)/align_len;
625  double cov = double(qmax-qmin+1)/read_len;
626 
627  double log2factor = 0.25;
628  int compart_penalty = ident*cov*m_penalty*read_len + 0.5;
629  int exon_num = exons.size();
630  int best_index = exon_num-1;
631  for(int i = exon_num-1; i >= 0; --i) {
632  auto& ei = exons[i];
633  int iscore = ei.m_matches;
634 
635  ei.m_chain_score = iscore; // start of everything
636  ei.m_type = eNewAlignment;
637  for(int j = i+1; j < exon_num; ++j) {
638  auto& ej = exons[j];
639  int ijscore = ej.m_chain_score+iscore;
640 
641  // if(ej.m_sfrom > ei.m_sto+m_max_intron) // too far
642  // break;
643  if(ej.m_sfrom <= ei.m_sto) // wrong genome order
644  continue;
645 
646  int overlap = ei.m_qto+1-ej.m_qfrom;
647  int intron = ej.m_sfrom-ei.m_sto-1;
648  if(overlap == 0 && intron > 20 && intron <= m_max_intron) { // intron within range
649  int score = ijscore;
650  double intron_len = ej.m_intron_len+intron;
651  double deltas = score-ei.m_chain_score-log2factor*log2((intron_len+1)/(ei.m_intron_len+1));
652  if(deltas > 0) {
653  ei.m_chain_score = score;
654  ei.m_prevp = &ej;
655  ei.m_type = eExtension;
656  ei.m_intron_len = intron_len;
657  }
658  } else if(overlap <= 0 && intron <= m_max_intron) { // no transcript overlap
659  int overlap_penalty = 0;
660  overlap_penalty = max(2, int(ident*abs(overlap)+0.5));
661  int score = ijscore-overlap_penalty;
662  double intron_len = ej.m_intron_len;
663  double deltas = score-ei.m_chain_score-log2factor*log2((intron_len+1)/(ei.m_intron_len+1));
664  if(deltas > 0) {
665  ei.m_chain_score = score;
666  ei.m_prevp = &ej;
667  ei.m_type = eNewAlignment;
668  ei.m_intron_len = intron_len+intron;
669  }
670  } else if(overlap > 0 && overlap < 10 && intron <= m_max_intron) { // small portion of transcript used twice
671  int overlap_penalty = 0;
672  int s = ei.m_script.size()-1;
673  int l = overlap;
674  while(s >= 0 && l > 0) {
675  if(ei.m_script[s] == '=')
676  ++overlap_penalty;
677  if(ei.m_script[s] != 'D')
678  --l;
679  --s;
680  }
681  s = 0;
682  l = overlap;
683  while(s < (int)ej.m_script.size() && l > 0) {
684  if(ej.m_script[s] == '=')
685  ++overlap_penalty;
686  if(ej.m_script[s] != 'D')
687  --l;
688  ++s;
689  }
690  overlap_penalty = max(2, overlap_penalty);
691  int score = ijscore-overlap_penalty;
692  double intron_len = ej.m_intron_len+intron;
693  double deltas = score-ei.m_chain_score-log2factor*log2((intron_len+1)/(ei.m_intron_len+1));
694  if(deltas > 0) {
695  ei.m_chain_score = score;
696  ei.m_prevp = &ej;
697  ei.m_type = eNewAlignment;
698  ei.m_intron_len = intron_len;
699  }
700  } else {
701  int score = ijscore-compart_penalty;
702  double intron_len = ej.m_intron_len;
703  double deltas = score-ei.m_chain_score-log2factor*log2((intron_len+1)/(ei.m_intron_len+1));
704  if(deltas > 0) { // start new compartment
705  ei.m_chain_score = score;
706  ei.m_prevp = &ej;
707  ei.m_type = eNewCompartment;
708  ei.m_intron_len = intron_len;
709  }
710  }
711  }
712  double deltas = ei.m_chain_score-exons[best_index].m_chain_score-log2factor*log2((double)(ei.m_intron_len+1)/(exons[best_index].m_intron_len+1));
713  if(deltas > 0)
714  best_index = i;
715  }
716 
717  /*
718  cerr << "Best index: " << best_index << " " << contig << " " << mate << " " << strand << " " << compart_penalty << endl;
719  for(int i = 0; i < (int)exons.size(); ++i) {
720  auto& e = exons[i];
721  cerr << i << "\t" << e.m_matches << "\t" << e.m_chain_score << "\t" << e.m_intron_len << "\t" << e.m_qfrom+1 << "\t" << e.m_qto+1 << "\t" << e.m_sfrom+1 << "\t" << e.m_sto+1 << "\t" << e.m_type << "\t" << strand << endl;
722  }
723  */
724 
725 
726  // not connected partial alignments initially are put in different compartments
727  TAlignCompartments& compartments = m_compartments[contig][mate][strand];
728  for(Exon* p = &exons[best_index]; p != nullptr; p = p->m_prevp) {
729  if(compartments.empty() || compartments.back().m_exons.back().m_type != eExtension)
730  compartments.emplace_back();
731  compartments.back().m_exons.push_back(*p);
732  }
733 
734  // remove end indels
735  for(auto iloop = compartments.begin(); iloop != compartments.end(); ) {
736  auto it = iloop++;
737  while(!it->m_exons.empty()) {
738  it->m_exons.front().RemoveLeftIndels(m_gapopen1, m_gapextend1, m_gapopen2, m_gapextend2);
739  if(it->m_exons.front().m_align_len == 0)
740  it->m_exons.erase(it->m_exons.begin());
741  else
742  break;
743  }
744  while(!it->m_exons.empty()) {
745  it->m_exons.back().RemoveRightIndels(m_gapopen1, m_gapextend1, m_gapopen2, m_gapextend2);
746  if(it->m_exons.back().m_align_len == 0)
747  it->m_exons.pop_back();
748  else
749  break;
750  }
751  if(it->m_exons.empty())
752  compartments.erase(it);
753  }
754 
755  // init compartment
756  for(auto& compartment : compartments) {
757  int matches = 0;
758  int align_len = 0;
759  string& cigar = compartment.m_cigar;
760  if(compartment.m_exons.front().m_qfrom > 0)
761  cigar += to_string(compartment.m_exons.front().m_qfrom)+"S";
762  string& tstrand = compartment.m_tstrand;
763  tstrand = compartment.m_exons.front().m_tstrand;
764  TMDTag md;
765  for(int i = 0; i < (int)compartment.m_exons.size(); ++i) {
766  auto& e = compartment.m_exons[i];
767  compartment.m_score += e.m_score;
768  matches += e.m_matches;
769  align_len += e.m_align_len;
770  if(i > 0) {
771  int intron = compartment.m_exons[i].m_sfrom-compartment.m_exons[i-1].m_sto-1;
772  cigar += to_string(intron)+"N";
773  }
774  for(auto& elem : compartment.m_exons[i].m_cigar)
775  cigar += to_string(elem.first)+elem.second;
776  compartment.m_dist += e.m_align_len-e.m_matches;
777  if(!tstrand.empty() && e.m_tstrand != tstrand)
778  tstrand.clear();
779  md.insert(md.end(), e.m_md.begin(), e.m_md.end());
780  }
781  int tail = rseq.size()-compartment.m_exons.back().m_qto-1;
782  if(tail > 0)
783  cigar += to_string(tail)+"S";
784 
785  if(compartment.m_exons.size() == 1)
786  tstrand.clear();
787 
788  string& md_tag = compartment.m_md;
789  for(auto i = md.begin(); i != md.end(); ++i) {
790  for(auto j = next(i); j != md.end(); j = next(i)) {
791  if(i->first > 0 && j->first > 0)
792  i->first += j->first;
793  else if(i->first == j->first)
794  i->second += j->second;
795  else
796  break;
797  md.erase(j);
798  }
799  if(i->first > 0)
800  md_tag += to_string(i->first);
801  else if(i->first == 0)
802  md_tag += i->second;
803  else
804  md_tag += "^"+i->second;
805  }
806  compartment.m_range.first = compartment.m_exons.front().m_sfrom;
807  compartment.m_range.second = compartment.m_exons.back().m_sto;
808  int read_span = compartment.m_exons.back().m_qto-compartment.m_exons.front().m_qfrom+1;
809  compartment.m_above_thresholds = double(read_span)/rseq.size() >= m_min_output_cov && double(matches)/align_len >= m_min_output_idty;
810  compartment.m_read = read;
811  compartment.m_rseq = rseq;
812  compartment.m_paired_read = paired_read;
813  }
814 
815  /*
816  for(auto& compartment : compartments) {
817  cerr << "Full path: " << compartment.m_score << endl;
818  for(int i = 0; i < (int)compartment.m_exons.size(); ++i) {
819  auto& e = compartment.m_exons[i];
820  cerr << i << "\t" << e.m_matches << "\t" << e.m_chain_score << "\t" << e.m_qfrom+1 << "\t" << e.m_qto+1 << "\t" << e.m_sfrom+1 << "\t" << e.m_sto+1 << "\t" << e.m_type << "\t" << strand << endl;
821  }
822  cerr << endl << endl;
823  }
824  */
825 
826  // remove lower score parts
827  list<TAlignCompartments::iterator> erased;
828  TAlignCompartments::reverse_iterator keeper = compartments.rbegin();
829  for(auto i = next(keeper); i != compartments.rend(); ++i) {
830  if(i->m_exons.back().m_type == eNewCompartment) {
831  keeper = i;
832  continue;
833  }
834  if(i->m_score > keeper->m_score) {
835  erased.push_back(prev(keeper.base()));
836  keeper = i;
837  } else {
838  erased.push_back(prev(i.base()));
839  }
840  }
841  for(auto i : erased)
842  compartments.erase(i);
843 
844 
845  /*
846  for(auto& compartment : compartments) {
847  cerr << "Score: " << compartment.m_score << endl;
848  for(int i = 0; i < (int)compartment.m_exons.size(); ++i) {
849  auto& e = compartment.m_exons[i];
850  cerr << i << "\t" << e.m_matches << "\t" << e.m_chain_score << "\t" << e.m_qfrom+1 << "\t" << e.m_qto+1 << "\t" << e.m_sfrom+1 << "\t" << e.m_sto+1 << "\t" << e.m_type << "\t" << strand << endl;
851  }
852  cerr << endl << endl;
853  }
854  */
855 
856  }
857  }
858  }
859 }
860 
862 {
864 
865  unique_ptr<CArgDescriptions> argdescr(new CArgDescriptions);
866  argdescr->SetUsageContext(GetArguments().GetProgramBasename(), "exon_selector expects SAM alignments at stdin collated by query, e.g. with 'sort -k 1,1'");
867  argdescr->AddDefaultKey("penalty", "penalty", "Per-compartment penalty", CArgDescriptions::eDouble, "0.35");
868  argdescr->AddDefaultKey ("max_intron", "max_intron", "Maximum intron length (in base pairs)", CArgDescriptions::eInteger, "1200000");
869  argdescr->AddDefaultKey("min_query_len", "min_query_len", "Minimum length for individual cDNA sequences.", CArgDescriptions::eInteger, "50");
870  argdescr->AddFlag("star","STAR aligner settings: -match 1 -mismatch 1 -gapopen1 2 -gapextend1 2 -gapopen2 2 -gapextend2 2 -min_output_identity 0.9 -min_output_coverage 0.9");
871  argdescr->AddFlag("minimap2","minimap2 aligner settings: -match 1 -mismatch 2 -gapopen1 2 -gapextend1 1 -gapopen2 32 -gapextend2 0 -min_output_identity 0.8 -min_output_coverage 0.3");
872  argdescr->AddOptionalKey("min_output_identity", "min_output_identity", "Minimal identity for output alignments",CArgDescriptions::eDouble);
873  argdescr->AddOptionalKey("min_output_coverage", "min_output_coverage", "Minimal coverage for output alignments",CArgDescriptions::eDouble);
874  argdescr->AddOptionalKey("match", "match", "Match score", CArgDescriptions::eInteger);
875  argdescr->AddOptionalKey("mismatch", "mismatch", "Mismatch penalty", CArgDescriptions::eInteger);
876  argdescr->AddOptionalKey("gapopen1", "gapopen1", "Gap open penalty. A gap of length k costs min{gapopen1+k*gapextend1,gapopen2+k*gapextend2}", CArgDescriptions::eInteger);
877  argdescr->AddOptionalKey("gapextend1", "gapextend1", "Gap extend penalty", CArgDescriptions::eInteger);
878  argdescr->AddOptionalKey("gapopen2", "gapopen2", "Gap open penalty", CArgDescriptions::eInteger);
879  argdescr->AddOptionalKey("gapextend2", "gapextend2", "Gap extend penalty", CArgDescriptions::eInteger);
880  argdescr->AddFlag("nocheck","Don't check if reads are collated (saves memory)");
881  argdescr->AddFlag("remove_repeats","Remove alignments which have repeats in the read");
882 
883  CArgAllow* constrain01 (new CArgAllow_Doubles(0.0, 1.0));
884  argdescr->SetConstraint("penalty", constrain01);
885  argdescr->SetConstraint("min_output_identity", constrain01);
886  argdescr->SetConstraint("min_output_coverage", constrain01);
887 
888  CArgAllow_Integers* constrain_minqlen (new CArgAllow_Integers(21,99999));
889  argdescr->SetConstraint("min_query_len", constrain_minqlen);
890 
891  CArgAllow_Integers* constrain_score (new CArgAllow_Integers(0,99999));
892  argdescr->SetConstraint("match", constrain_score);
893  argdescr->SetConstraint("mismatch", constrain_score);
894  argdescr->SetConstraint("gapopen1", constrain_score);
895  argdescr->SetConstraint("gapextend1", constrain_score);
896  argdescr->SetConstraint("gapopen2", constrain_score);
897  argdescr->SetConstraint("gapextend2", constrain_score);
898 
899  SetupArgDescriptions(argdescr.release());
900 }
901 
903 {
904  const CArgs& args(GetArgs());
905 
906  m_penalty = args["penalty"].AsDouble();
907  m_min_query_len = args["min_query_len"].AsInteger();
908  m_max_intron = args["max_intron"].AsInteger();
909  bool check = !args["nocheck"];
910  if(args["remove_repeats"])
911  m_remove_repats = true;
912 
913  if(args["star"] && args["minimap2"]) {
914  cerr << "-star and -minimap2 can't be used together" << endl;
915  exit(1);
916  }
917 
918  if(args["minimap2"] || !args["star"]) {
919  m_matchscore = 1;
920  m_mismatchpenalty = 2;
921  m_gapopen1 = 2;
922  m_gapopen2 = 32;
923  m_gapextend1 = 1;
924  m_gapextend2 = 0;
925  m_min_output_idty = 0.8;
926  m_min_output_cov = 0.3;
927  }
928 
929  if(args["star"]) {
930  m_matchscore = 1;
931  m_mismatchpenalty = 1;
932  m_gapopen1 = 2;
933  m_gapopen2 = 2;
934  m_gapextend1 = 2;
935  m_gapextend2 = 2;
936  m_min_output_idty = 0.9;
937  m_min_output_cov = 0.9;
938  }
939 
940  if(args["min_output_identity"])
941  m_min_output_idty = args["min_output_identity"].AsDouble();
942  if(args["min_output_coverage"])
943  m_min_output_cov = args["min_output_coverage"].AsDouble();
944 
945  if(args["match"])
946  m_matchscore = args["match"].AsInteger();
947  if(args["mismatch"])
948  m_mismatchpenalty = args["mismatch"].AsInteger();
949  if(args["gapopen1"])
950  m_gapopen1 = args["gapopen1"].AsInteger();
951  if(args["gapextend1"])
952  m_gapextend1 = args["gapextend1"].AsInteger();
953  if(args["gapopen2"])
954  m_gapopen2 = args["gapopen2"].AsInteger();
955  if(args["gapextend2"])
956  m_gapextend2 = args["gapextend2"].AsInteger();
957 
958  string line;
959  string read_old;
960  bool paired_read = false;
961  set<string> previous_reads;
962  while(getline(cin, line)) {
963  if(line.empty())
964  continue;
965  if(line[0] == '@') {
966  cout << line << endl;
967  continue;
968  }
969  vector<string> fields;
970  NStr::Split(line, "\t", fields, 0);
971  string read = fields[0];
972  if(read != read_old && !m_read_aligns.empty()) { // select and emit alignments for read; clear storage
973  if(check && !previous_reads.insert(read_old).second) {
974  cerr << "Input collated by reads is expected" << endl;
975  exit(1);
976  }
977  ChainExons();
978  if(paired_read)
979  ConnectPairs();
981  if(!m_read_through)
982  FormatResults();
983  m_read_aligns.clear();
984  m_compartments.clear();
985  }
986  read_old = read;
987  paired_read = stoi(fields[1])&1;
988 
989  if((int)fields[9].size() < m_min_query_len)
990  continue;
991 
992  // separate contigs/strands
993  string contig = fields[2];
994  int flag = std::stoi(fields[1]);
995  int strand = (flag&16) ? 1 : 0;
996  int mate = (flag&128) ? 1 : 0;
997  m_read_aligns[contig][mate][strand].push_back(fields);
998  }
999 
1000  // deal with the last read
1001  if(check && !previous_reads.insert(read_old).second) {
1002  cerr << "Input collated by reads is expected" << endl;
1003  exit(1);
1004  }
1005  ChainExons();
1006  if(paired_read)
1007  ConnectPairs();
1009  if(!m_read_through)
1010  FormatResults();
1011  m_read_aligns.clear();
1012  m_compartments.clear();
1013 
1014  return 0;
1015 }
1016 
1017 int main(int argc, const char* argv[])
1018 {
1019  // Execute main application function
1020  return CExonSelectorApplication().AppMain(argc, argv, 0, eDS_ToStderr);
1021 }
void remove_if(Container &c, Predicate *__pred)
Definition: chainer.hpp:69
CArgAllow_Doubles –.
Definition: ncbiargs.hpp:1781
CArgAllow_Integers –.
Definition: ncbiargs.hpp:1751
CArgAllow –.
Definition: ncbiargs.hpp:1488
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
virtual void Init()
Initialize the application.
bool CompatiblePair(const AlignCompartment &left, const AlignCompartment &right)
map< string, TMatrix< TAlignCompartments > > m_compartments
list< pair< int, char > > TCigar
void FormatSingle(AlignCompartment &compart, const string &contig, int mate, int strand, int count, int index)
list< TSamFields > TSplitAligns
map< string, TMatrix< TSplitAligns > > m_read_aligns
void ExtractExonsFromSAM(const TSamFields &tags, TExons &exons)
Exon GetNextExon(string &cigar_string, string &MD_tag, int qfrom, int sfrom)
virtual int Run()
Run the application.
list< pair< int, string > > TMDTag
vector< string > TSamFields
list< AlignCompartment > TAlignCompartments
void FormatPaired(AlignCompartment &compart, const string &contig, int mate, int strand, int count, int index, bool left)
Definition: map.hpp:338
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
#define md
Definition: compat-1.3.h:2001
double Entropy(const string &seq)
string Tag(const string &name, int value)
int main(int argc, const char *argv[])
USING_NCBI_SCOPE
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
#define check(s)
Definition: describecol2.c:21
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:819
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1195
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
@ eDouble
Convertible into a floating point number (double)
Definition: ncbiargs.hpp:594
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
EDiagSev SetDiagPostLevel(EDiagSev post_sev=eDiag_Error)
Set the threshold severity for posting the messages.
Definition: ncbidiag.cpp:6129
@ eDS_ToStderr
To standard error stream.
Definition: ncbidiag.hpp:1782
@ eDiag_Info
Informational message.
Definition: ncbidiag.hpp:651
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
exit(2)
int i
int len
constexpr auto sort(_Init &&init)
const struct ncbi::grid::netcache::search::fields::SIZE size
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
#define abs(a)
Definition: ncbi_heapmgr.c:130
unsigned int a
Definition: ncbi_localip.c:102
const char * tag
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines command line argument related classes.
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
Defines unified interface to application:
T log2(T x_)
T max(T x_, T y_)
T min(T x_, T y_)
void RemoveLeftIndels(int gapopen1, int gapextend1, int gapopen2, int gapextend2)
void RemoveRightIndels(int gapopen1, int gapextend1, int gapopen2, int gapextend2)
static Uint4 letter(char c)
Modified on Sun Apr 14 05:29:05 2024 by modify_doxy.py rev. 669887