NCBI C++ ToolKit
aligncollapser.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: aligncollapser.cpp 101798 2024-02-13 17:18:22Z souvorov $
2  ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Alexandre Souvorov
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
37 #include <corelib/ncbiargs.hpp>
38 #include <objmgr/bioseq_handle.hpp>
39 #include <objmgr/scope.hpp>
40 #include <objmgr/util/sequence.hpp>
41 #include "gnomon_seq.hpp"
42 
43 
45 BEGIN_SCOPE(gnomon)
46 USING_SCOPE(sequence);
47 
// Returns the accession stored in id_pool starting at index shift.
// Accessions are kept back to back in the pool, each terminated by a 0 character;
// the result holds the characters from shift up to (not including) that terminator.
// A shift outside the pool yields an empty string, and a missing terminator ends
// the accession at the end of the pool instead of reading past it.
string GetTargetAcc(int shift, const deque<char>& id_pool) {
    string target;
    if(shift < 0)
        return target;

    // bounded by the pool size: deque::operator[] does no range checking
    for(size_t i = shift; i < id_pool.size() && id_pool[i] != 0; ++i)
        target.push_back(id_pool[i]);

    return target;
}
55 
// Rebuilds a CAlignModel for one individual alignment that shares this object's flags and introns.
//  ali            - supplies the alignment ID, weight, genomic range and the offset of its target accession
//  target_id_pool - pool of accessions read through GetTargetAcc() at ali.m_target_id
// Returns the model built from `a` with an alignment map over its exons and the target id set.
56 CAlignModel CAlignCommon::GetAlignment(const SAlignIndividual& ali, const deque<char>& target_id_pool) const {
57 
 // copy the shared status flags onto the model
 59  if(isPolyA())
 60  a.Status() |= CGeneModel::ePolyA;
 61  if(isCap())
 62  a.Status() |= CGeneModel::eCap;
 63  if(isUnknown())
 65  a.SetID(ali.m_align_id >= 0 ? ali.m_align_id : -ali.m_align_id);
 66  a.SetWeight(ali.m_weight);
 // a negative m_align_id is stored as its absolute value and flagged as changed by the filter
 67  if(ali.m_align_id < 0)
 68  a.Status() |= CGeneModel::eChangedByFilter;
 69 
 70  if(m_introns.empty()) {
 // no introns: the whole range is a single exon
 71  a.AddExon(ali.m_range);
 72  } else {
 // fs/ss are the splice signatures passed to AddExon for the exon's two ends;
 // m_sig is read as two 2-character halves, and which half is used depends on the strand
 73  string fs;
 74  string ss;
 75  if(!m_introns.front().m_sig.empty()) {
 76  if(a.Strand() == ePlus)
 77  ss = m_introns.front().m_sig.substr(0,2);
 78  else
 79  ss = m_introns.front().m_sig.substr(2,2);
 80  }
 // first exon: from the alignment start to the start of the first intron
 81  a.AddExon(TSignedSeqRange(ali.m_range.GetFrom(), m_introns.front().m_range.GetFrom()), fs, ss);
 // internal exons: between consecutive introns
 82  for(int i = 0; i < (int)m_introns.size()-1; ++i) {
 83  if(!m_introns[i].m_sig.empty() && !m_introns[i+1].m_sig.empty()) {
 84  if(a.Strand() == ePlus) {
 85  fs = m_introns[i].m_sig.substr(2,2);
 86  ss = m_introns[i+1].m_sig.substr(0,2);
 87  } else {
 88  fs = m_introns[i].m_sig.substr(0,2);
 89  ss = m_introns[i+1].m_sig.substr(2,2);
 90  }
 91  }
 92  a.AddExon(TSignedSeqRange(m_introns[i].m_range.GetTo(), m_introns[i+1].m_range.GetFrom()), fs, ss);
 93  }
 94  if(!m_introns.back().m_sig.empty()) {
 95  if(a.Strand() == ePlus)
 96  fs = m_introns.back().m_sig.substr(2,2);
 97  else
 98  fs = m_introns.back().m_sig.substr(0,2);
 99  }
 // last exon: from the end of the last intron to the alignment end, with an empty second signature
 100  ss = "";
 101  a.AddExon(TSignedSeqRange(m_introns.back().m_range.GetTo(), ali.m_range.GetTo()), fs, ss);
 102  }
 103 
 104  CAlignMap amap(a.Exons(), a.FrameShifts(), a.Strand());
 105  CAlignModel align(a, amap);
 106 
 // the target accession is looked up in the pool at the offset kept in the individual alignment
 107  CRef<CSeq_id> target_id(CIdHandler::ToSeq_id(GetTargetAcc(ali.m_target_id, target_id_pool)));
 108  align.SetTargetId(*target_id);
 109 
 110  return align;
 111 }
112 
114  LeftAndLongFirstOrder(const deque<char>& idp) : id_pool(idp) {}
115  const deque<char>& id_pool;
116 
117  bool operator() (const SAlignIndividual& a, const SAlignIndividual& b) { // left and long first
118  if(a.m_range == b.m_range)
119  return GetTargetAcc(a.m_target_id,id_pool) < GetTargetAcc(b.m_target_id,id_pool);
120  else if(a.m_range.GetFrom() != b.m_range.GetFrom())
121  return a.m_range.GetFrom() < b.m_range.GetFrom();
122  else
123  return a.m_range.GetTo() > b.m_range.GetTo();
124  }
125 };
126 
127 bool OriginalOrder(const SAlignIndividual& a, const SAlignIndividual& b) { // the order in which alignments were added
128  return a.m_target_id < b.m_target_id;
129 }
130 
131 
132 
134 
135  m_flags = 0;
136  if(align.Type()&CGeneModel::eSR)
137  m_flags |= esr;
138  if(align.Type()&CGeneModel::eEST)
139  m_flags |= eest;
140  if(align.Status()&CGeneModel::ePolyA)
141  m_flags |= epolya;
142  if(align.Status()&CGeneModel::eCap)
143  m_flags |= ecap;
144 
147  m_flags |= eplus;
148  } else if(align.Strand() == ePlus){
149  m_flags |= eplus;
150  } else {
151  m_flags |= eminus;
152  }
153 
154  const CGeneModel::TExons& e = align.Exons();
155  for(int i = 1; i < (int)e.size(); ++i) {
156  if(e[i-1].m_ssplice && e[i].m_fsplice) {
157  string sig;
158  if(align.Strand() == ePlus)
159  sig = e[i-1].m_ssplice_sig+e[i].m_fsplice_sig;
160  else
161  sig = e[i].m_fsplice_sig+e[i-1].m_ssplice_sig;
162  SIntron intron(e[i-1].GetTo(), e[i].GetFrom(), align.Strand(), (align.Status()&CGeneModel::eUnknownOrientation) == 0, sig);
163  m_introns.push_back(intron);
164  }
165  }
166 }
167 
169  SAlignExtended(SAlignIndividual& ali, const set<int>& left_exon_ends, const set<int>& right_exon_ends) : m_ali(&ali), m_initial_right_end(ali.m_range.GetTo()) {
170 
172  right_exon_ends.lower_bound(m_ali->m_range.GetTo()); // leftmost compatible rexon
174  if(ri != right_exon_ends.end())
175  m_rlimb = *ri; // position of leftmost compatible rexon
176  m_rlima = -1;
177  if(ri != right_exon_ends.begin())
178  m_rlima = *(--ri); // position of the rightmost incompatible rexon
180  left_exon_ends.upper_bound(m_ali->m_range.GetFrom()); // leftmost not compatible lexon
182  if(li != left_exon_ends.end())
183  m_llimb = *li; // position of the leftmost not compatible lexon
184  }
185 
188  int m_rlimb;
189  int m_rlima;
190  int m_llimb;
191 };
192 
194  arg_desc->SetCurrentGroup("Collapsing and filtering");
195 
196  arg_desc->AddFlag("filtersr","Filter SR");
197  arg_desc->AddFlag("filterest","Filter EST");
198  arg_desc->AddFlag("no_lr_only_introns","Filter introns supported only by LR");
199  arg_desc->AddFlag("filtermrna","Filter mRNA");
200  arg_desc->AddFlag("filterprots","Filter proteins");
201  arg_desc->AddFlag("collapsest","Collaps EST");
202  arg_desc->AddFlag("collapssr","Collaps SR");
203  arg_desc->AddFlag("fillgenomicgaps","Use provided selfspecies cDNA for genomic gap filling");
204 
205  arg_desc->AddDefaultKey("max-extension", "MaxExtension",
206  "Maximal extension for one-exon collapsed alignments",
208 
209  arg_desc->AddDefaultKey("min-consensus-support", "MinConsensusSupport",
210  "Minimal number of support for consensus intron",
212 
213  arg_desc->AddDefaultKey("min-non-consensussupport", "MinNonconsensusSupport",
214  "Minimal number of support for non-consensus intron",
216 
217  arg_desc->AddDefaultKey("high-identity", "HighIdentity",
218  "Minimal exon identity threshold for accepted introns",
219  CArgDescriptions::eDouble, "0.98");
220 
221  arg_desc->AddDefaultKey("min-support-fraction", "MinSupportFraction",
222  "Minimal splice expression relative exon expression",
223  CArgDescriptions::eDouble, "0.03");
224 
225  arg_desc->AddDefaultKey("end-pair-support-cutoff", "EndPairSupportCutoff",
226  "Minimal expression relative to the mean for introns with the same splice",
228 
229  /*
230  arg_desc->AddDefaultKey("minest", "minest",
231  "Minimal EST support to trump expression checks",
232  CArgDescriptions::eInteger, "3");
233  */
234 
235  arg_desc->AddDefaultKey("min-edge-coverage", "MinEdgeCoverage",
236  "Minimal absolute expression for accepted single-exon alignments without polyA/Cap",
238 
239  arg_desc->AddDefaultKey("sharp-boundary", "SharpBoundary",
240  "Minimal relative expression for crossing splice",
242 
243  arg_desc->SetCurrentGroup("CAGE/PolyA arguments");
244  arg_desc->AddFlag("use-long-read-tss","Treat 5' ends of long reads like CAGE");
245 
246  arg_desc->SetCurrentGroup("");
247 }
248 
249 #define MAX_DIST_TO_FLANK_GAP 10000
// Sets up per-contig data; everything below runs only when m_range is not empty.
// Stores the contig name and scope, obtains the contig's Bioseq handle from the scope,
// initialises m_contig over [from,to] and fills m_genomic_gaps_len with the gaps found there.
// Throws CException (eUnknown) when the Bioseq handle cannot be retrieved.
250 void CAlignCollapser::InitContig(string contig, CScope* scope) {
 251  if(m_range.NotEmpty()) {
 252  m_contig_name = contig;
 253  m_scope = scope;
 254 
 255  CRef<CSeq_id> contigid(new CSeq_id);
 256  contigid->Assign(*CIdHandler::ToSeq_id(contig));
 // NOTE(review): contigid was just allocated above, so this fallback looks unreachable - confirm
 257  if(!contigid)
 258  contigid = new CSeq_id(CSeq_id::e_Local, contig);
 259 
 260  CBioseq_Handle bh (m_scope->GetBioseqHandle(*contigid));
 261  if (!bh) {
 262  NCBI_THROW(CException, eUnknown, "contig '"+contig+"' retrieval failed");
 263  }
 265  int length (sv.size());
 266 
 // default window is the whole sequence
 267  int from = 0;
 268  int to = length-1;
 271  to = min(length-1,m_range.GetTo()+2*MAX_DIST_TO_FLANK_GAP);
 272  }
 273  m_contig.Init(sv, from, to);
 274 
 // scan the window; each run of consecutive gap positions becomes one map entry: start position -> run length
 275  TIntMap::iterator current_gap = m_genomic_gaps_len.end();
 276  for(int i = from; i <=to; ++i) {
 277  if(sv.IsInGap(i)) {
 278  if(current_gap == m_genomic_gaps_len.end())
 279  current_gap = m_genomic_gaps_len.insert(TIntMap::value_type(i,1)).first;
 280  else
 281  ++current_gap->second;
 282  } else {
 283  current_gap = m_genomic_gaps_len.end();
 284  }
 285  }
 286 
 // when the window reaches a sequence end, a length-1 gap is recorded just outside that end
 287  if(from == 0)
 288  m_genomic_gaps_len[-1] = 1; // fake gap at the beginning
 289  if(to == length-1)
 290  m_genomic_gaps_len[length] = 1; // fake gap at the end
 291  }
 292 }
293 
// Constructs the collapser and reads its settings from the application's command-line arguments.
//  contig, scope          - passed on to InitContig() when scope is non-null and contig is not empty
//  nofilteringcollapsing  - when true, the filter/collapse switches below are forced off instead of read from the arguments
294 CAlignCollapser::CAlignCollapser(string contig, CScope* scope, bool nofilteringcollapsing) : m_count(0), m_long_read_count(0), m_scope(scope) {
 295  const CArgs& args = CNcbiApplication::Instance()->GetArgs();
 296 
 297  if(nofilteringcollapsing) {
 // NOTE(review): m_no_lr_only_introns is not assigned in this branch - confirm it is initialised elsewhere
 298  m_filtersr = false;
 299  m_filterest = false;
 300  m_filtermrna = false;
 301  m_filterprots = false;
 302  m_collapsest = false;
 303  m_collapssr = false;
 304  } else {
 305  m_filtersr = args["filtersr"];
 306  m_filterest = args["filterest"];
 307  m_no_lr_only_introns = args["no_lr_only_introns"];
 308  m_filtermrna = args["filtermrna"];
 309  m_filterprots = args["filterprots"];
 310  m_collapsest = args["collapsest"];
 311  m_collapssr = args["collapssr"];
 312  }
 // these settings are read regardless of nofilteringcollapsing
 313  m_fillgenomicgaps = args["fillgenomicgaps"];
 314  m_use_long_reads_tss = args["use-long-read-tss"];
 315  m_minident = args["high-identity"].AsDouble();
 316 
 317  if(m_scope != 0 && contig != "") {
 319  InitContig(contig, scope);
 320  }
 321 }
322 
323 
325  return ali.m_weight < 0;
326 }
327 
328 #define COVERED_FRACTION 0.75
329 bool AlignmentIsSupportedBySR(const CAlignModel& align, const vector<double>& coverage, int mincoverage, int left_end) {
330 
331  int align_len = align.AlignLen();
332 
333  int covered_length = 0;
334  ITERATE(CGeneModel::TExons, i, align.Exons()) {
335  for(int p = i->Limits().GetFrom(); p <= i->Limits().GetTo(); ++p)
336  if(coverage[p-left_end] >= mincoverage)
337  ++covered_length;
338  }
339 
340  return (covered_length >= COVERED_FRACTION*align_len);
341 }
342 
343 bool isGoodIntron(int a, int b, EStrand strand, const CAlignCollapser::TAlignIntrons& introns, bool check_introns_on_both_strands) {
344  SIntron intron_oriented_nosig(a, b, strand, true, "");
345  SIntron intron_notoriented_nosig(a, b, ePlus, false, "");
346  bool good_intron = (introns.find(intron_oriented_nosig) != introns.end() || introns.find(intron_notoriented_nosig) != introns.end());
347  if(!good_intron && check_introns_on_both_strands) {
348  SIntron intron_otherstrand_nosig(a, b, OtherStrand(strand), true, "");
349  good_intron = (introns.find(intron_otherstrand_nosig) != introns.end());
350  }
351 
352  return good_intron;
353 }
354 
355 
356 #define END_PART_LENGTH 35
357 
358 #define EXON_TO_SKIP 10
359 void CAlignCollapser::ClipESTorSR(CAlignModel& align, double clip_threshold, double min_lim) { // used for pool alignments with no indel or mismatch info
360  double cov = 0;
361  int nt = 0;
362  ITERATE(CGeneModel::TExons, e, align.Exons()) {
363  for(int i = e->GetFrom(); i <= e->GetTo(); ++i) {
364  cov += m_coverage[i-m_left_end];
365  ++nt;
366  }
367  }
368  cov /= nt;
369 
370  double threshold = max(clip_threshold*cov, min_lim);
371 
372  int left_lim = align.Limits().GetFrom();
373  int exnl = 0;
374  while(m_coverage[left_lim-m_left_end] < threshold) {
375  ++left_lim;
376  if(align.Exons()[exnl].GetTo()-EXON_TO_SKIP+1 <= left_lim) {
377  if(++exnl == (int)align.Exons().size()) {
378  align.ClearExons();
379  return;
380  }
381  left_lim = align.Exons()[exnl].GetFrom();
382  }
383  }
384  int right_lim = align.Limits().GetTo();
385  int exnr = (int)align.Exons().size()-1;
386  while(m_coverage[right_lim-m_left_end] < threshold) {
387  --right_lim;
388  if(align.Exons()[exnr].GetFrom()+EXON_TO_SKIP-1 >= right_lim) {
389  if(--exnr < exnl) {
390  align.ClearExons();
391  return;
392  }
393  right_lim = align.Exons()[exnr].GetTo();
394  }
395  }
396  _ASSERT(right_lim >= left_lim);
397 
398  align.Clip(TSignedSeqRange(left_lim, right_lim),CGeneModel::eRemoveExons);
399  if(align.AlignLen() < END_PART_LENGTH)
400  align.ClearExons();
401 }
402 
// Removes parts of align whose coverage in m_coverage is below
// lim = max(clip_threshold * mean coverage over the exon positions, min_lim).
// Steps, in order:
//  1. for eNotForChaining alignments, trim a low-coverage flank on each side that has more than 30 unaligned transcript bases;
//  2. at every junction that is not a spliced exon pair, trim the low-coverage exon ends next to the hole;
//  3. remove pieces shorter than END_PART_LENGTH;
//  4. clear ePolyA / eCap when the corresponding end of the alignment has moved.
// The alignment is emptied with ClearExons() whenever what remains is shorter than END_PART_LENGTH.
403 void CAlignCollapser::ClipNotSupportedFlanks(CAlignModel& align, double clip_threshold, double min_lim) {
 404 
 // mean coverage over all exon positions
 405  double cov = 0;
 406  int nt = 0;
 407  ITERATE(CGeneModel::TExons, e, align.Exons()) {
 408  for(int i = e->GetFrom(); i <= e->GetTo(); ++i) {
 409  cov += m_coverage[i-m_left_end];
 410  ++nt;
 411  }
 412  }
 413  cov /= nt;
 414 
 415  double lim = max(clip_threshold*cov, min_lim);
 416 
 // the map and the limits are taken before any clipping; old_limits is compared at the end
 417  CAlignMap amap = align.GetAlignMap();
 418  TSignedSeqRange old_limits = align.Limits();
 419 
 420  if(align.Type()&CGeneModel::eNotForChaining) {
 // number of transcript bases left unaligned on each side, swapped for minus orientation
 421  TSignedSeqRange tlim = align.TranscriptLimits();
 422  int not_aligned_left = tlim.GetFrom();
 423  int not_aligned_right = align.TargetLen()-1-tlim.GetTo();
 424  if(align.Orientation() == eMinus)
 425  swap(not_aligned_left,not_aligned_right);
 426 
 427  if(not_aligned_left > 30) {
 // walk right over exon positions (jumping across introns) while coverage stays below lim
 428  int l = align.Limits().GetFrom();
 429  int ie = 0;
 430  while(l < align.Limits().GetTo() && m_coverage[l-m_left_end] < lim) {
 431  if(l < align.Exons()[ie].GetTo())
 432  ++l;
 433  else
 434  l = align.Exons()[++ie].GetFrom();
 435  }
 436  if(l != align.Limits().GetFrom()) {
 437  TSignedSeqRange seg = amap.ShrinkToRealPoints(TSignedSeqRange(l,align.Limits().GetTo()), false);
 438  if(seg.Empty() || amap.FShiftedLen(seg,false) < END_PART_LENGTH) {
 439  align.ClearExons();
 440  return;
 441  } else {
 442  align.Clip(seg,CGeneModel::eRemoveExons);
 443  }
 444  }
 445  }
 446 
 447  if(not_aligned_right > 30) {
 // mirror image of the left side: walk left from the right end
 448  int r = align.Limits().GetTo();
 449  int ie = (int)align.Exons().size()-1;
 450  while(r > align.Limits().GetFrom() && m_coverage[r-m_left_end] < lim) {
 451  if(r > align.Exons()[ie].GetFrom())
 452  --r;
 453  else
 454  r = align.Exons()[--ie].GetTo();
 455  }
 456  if(r != align.Limits().GetTo()) {
 457  TSignedSeqRange seg = amap.ShrinkToRealPoints(TSignedSeqRange(align.Limits().GetFrom(),r), false);
 458  if(seg.Empty() || amap.FShiftedLen(seg,false) < END_PART_LENGTH) {
 459  align.ClearExons();
 460  return;
 461  } else {
 462  align.Clip(seg,CGeneModel::eRemoveExons);
 463  }
 464  }
 465  }
 466  }
 467 
 // protein alignments pass snap_to_codons=true to ShrinkToRealPoints
 468  bool snap_to_codons = align.Type()&CAlignModel::eProt;
 // every edit changes the exon list, so the scan restarts from the first junction after each one
 469  bool keepdoing = true;
 470  while(keepdoing) {
 471  keepdoing = false;
 472  for (int k = 1; k < (int)align.Exons().size(); ++k) {
 473  CModelExon exonl = align.Exons()[k-1];
 474  CModelExon exonr = align.Exons()[k];
 // only junctions that are not a spliced pair are examined
 475  if(!(exonl.m_ssplice && exonr.m_fsplice)) {
 // l: last position of the left exon, moved left past low-coverage positions
 476  int l = exonl.GetTo();
 477  TSignedSeqRange segl(align.Limits().GetFrom(),l);
 478  for( ; l >= exonl.GetFrom() && m_coverage[l-m_left_end] < lim; --l);
 479  if(l != exonl.GetTo())
 480  segl = amap.ShrinkToRealPoints(TSignedSeqRange(align.Limits().GetFrom(),max(align.Limits().GetFrom(),l)),snap_to_codons);
 481 
 // r: first position of the right exon, moved right past low-coverage positions
 482  int r = exonr.GetFrom();
 483  TSignedSeqRange segr(r,align.Limits().GetTo());
 484  for( ; r <= exonr.GetTo() && m_coverage[r-m_left_end] < lim; ++r);
 485  if(r != exonr.GetFrom())
 486  segr = amap.ShrinkToRealPoints(TSignedSeqRange(min(align.Limits().GetTo(),r),align.Limits().GetTo()), snap_to_codons);
 487 
 // keep whichever side is long enough; if both are and something was trimmed, cut out the middle
 488  if(segl.Empty() || amap.FShiftedLen(segl,false) < END_PART_LENGTH) {
 489  if(segr.Empty() || amap.FShiftedLen(segr,false) < END_PART_LENGTH) {
 490  align.ClearExons();
 491  return;
 492  } else {
 493  align.Clip(segr,CGeneModel::eRemoveExons);
 494  keepdoing = true;
 495  break;
 496  }
 497  } else if(segr.Empty() || amap.FShiftedLen(segr,false) < END_PART_LENGTH) {
 498  align.Clip(segl,CGeneModel::eRemoveExons);
 499  keepdoing = true;
 500  break;
 501  } else if(l != exonl.GetTo() || r != exonr.GetFrom()) {
 502  align.CutExons(TSignedSeqRange(segl.GetTo()+1,segr.GetFrom()-1));
 503  keepdoing = true;
 504  break;
 505  }
 506  }
 507  }
 508  }
 509 
 // a piece starts at an exon without m_fsplice and extends while exons have m_ssplice;
 // pieces shorter than END_PART_LENGTH are removed
 510  for(int prev_exon = -1; prev_exon < (int)align.Exons().size()-1; ++prev_exon) {
 511  int piece_begin = prev_exon+1;
 512  if(align.Exons()[piece_begin].m_fsplice)
 513  continue;
 514  int piece_end = piece_begin;
 515  for( ; piece_end < (int)align.Exons().size() && align.Exons()[piece_end].m_ssplice; ++piece_end);
 516  int a = align.Exons()[piece_begin].GetFrom();
 517  int b = align.Exons()[piece_end].GetTo();
 518  if(amap.FShiftedLen(a, b, false) < END_PART_LENGTH) {
 519  if(a == align.Limits().GetFrom() && b == align.Limits().GetTo()) {
 // the short piece is the whole alignment
 520  align.ClearExons();
 521  return;
 522  } else if(a == align.Limits().GetFrom()) {
 523  TSignedSeqRange seg(align.Exons()[piece_end+1].GetFrom(),align.Limits().GetTo());
 524  align.Clip(seg, CGeneModel::eRemoveExons);
 525  } else if(b == align.Limits().GetTo()) {
 526  TSignedSeqRange seg(align.Limits().GetFrom(),align.Exons()[piece_begin-1].GetTo());
 527  align.Clip(seg, CGeneModel::eRemoveExons);
 528  } else {
 529  TSignedSeqRange seg(a, b);
 530  align.CutExons(seg);
 531  }
 532  }
 533  }
 534 
 // the status bit is known to be set inside each test below, so the XOR clears it
 535  if((align.Status()&CGeneModel::ePolyA) &&
 536  ((align.Strand() == ePlus && align.Limits().GetTo() != old_limits.GetTo()) ||
 537  (align.Strand() == eMinus && align.Limits().GetFrom() != old_limits.GetFrom()))) { // clipped polyA
 538 
 539  align.Status() ^= CGeneModel::ePolyA;
 540  }
 541  if((align.Status()&CGeneModel::eCap) &&
 542  ((align.Strand() == eMinus && align.Limits().GetTo() != old_limits.GetTo()) ||
 543  (align.Strand() == ePlus && align.Limits().GetFrom() != old_limits.GetFrom()))) { // clipped cap
 544 
 545  align.Status() ^= CGeneModel::eCap;
 546  }
 547 }
548 
549 
550 #define CUT_MARGIN 15
551 
553 
554  CAlignMap amap = align.GetAlignMap();
555 
556  bool keepdoing = true;
557  while(keepdoing) {
558  keepdoing = false;
559  for (int k = 1; k < (int)align.Exons().size(); ++k) {
560  CModelExon exonl = align.Exons()[k-1];
561  CModelExon exonr = align.Exons()[k];
562  if(!(exonl.m_ssplice && exonr.m_fsplice) || isGoodIntron(exonl.GetTo(), exonr.GetFrom(), align.Strand(), m_align_introns, false))
563  continue;
564 
565  TSignedSeqRange segl;
566  if(exonl.GetTo()-CUT_MARGIN > align.Limits().GetFrom())
567  segl = amap.ShrinkToRealPoints(TSignedSeqRange(align.Limits().GetFrom(),exonl.GetTo()-CUT_MARGIN), true);
568 
569  TSignedSeqRange segr;
570  if(exonr.GetFrom()+CUT_MARGIN < align.Limits().GetTo())
571  segr = amap.ShrinkToRealPoints(TSignedSeqRange(exonr.GetFrom()+CUT_MARGIN,align.Limits().GetTo()), true);
572 
573  if(segl.Empty() || amap.FShiftedLen(segl,false) < END_PART_LENGTH) {
574  if(segr.Empty() || amap.FShiftedLen(segr,false) < END_PART_LENGTH) {
575  align.ClearExons();
576  return false;
577  } else {
578  align.Clip(segr,CGeneModel::eRemoveExons);
579  keepdoing = true;
580  break;
581  }
582  } else if(segr.Empty() || amap.FShiftedLen(segr,false) < END_PART_LENGTH) {
583  align.Clip(segl,CGeneModel::eRemoveExons);
584  keepdoing = true;
585  break;
586  } else {
587  align.CutExons(TSignedSeqRange(segl.GetTo()+1,segr.GetFrom()-1));
588  keepdoing = true;
589  break;
590  }
591  }
592  }
593 
594  return true;
595 }
596 
// Rebuilds align so that it relies only on introns accepted by isGoodIntron() against m_align_introns.
//  - Without eGapFiller: every piece is re-added after dropping the flanking exons that are joined by
//    not-accepted introns; a not-accepted intron left inside marks the alignment as not good.
//  - With eGapFiller: not-accepted introns are cut out with CutExons().
// align is replaced by the edited alignment in every case, and ePolyA / eCap are cleared when the
// corresponding end has moved.
// Returns false when an EST alignment was reduced to a single exon with changed limits, or when a
// non-gap-filler alignment still contains a not-accepted intron; true otherwise.
597 bool CAlignCollapser::RemoveNotSupportedIntronsFromTranscript(CAlignModel& align, bool check_introns_on_both_strands) const {
 598 
 // map of the original alignment; used for all coordinate shrinking below
 599  CAlignMap amap = align.GetAlignMap();
 600 
 601  CGeneModel editedmodel = align;
 602 
 603  if(!(editedmodel.Status()&CGeneModel::eGapFiller)) { //remove flanking bad introns AND exons
 604  editedmodel.ClearExons(); // empty alignment with all atributes and remove indels
 // one iteration per piece: piece_begin is its first exon, piece_end the first exon without m_ssplice
 605  for (CAlignModel::TExons::const_iterator piece_begin = align.Exons().begin(); piece_begin != align.Exons().end(); ++piece_begin) {
 606  _ASSERT( !piece_begin->m_fsplice );
 607 
 608  CAlignModel::TExons::const_iterator piece_end = piece_begin;
 609  for ( ; piece_end != align.Exons().end() && piece_end->m_ssplice; ++piece_end) ;
 610  _ASSERT( piece_end != align.Exons().end() );
 611 
 612  CAlignModel a = align;
 613  a.Clip(TSignedSeqRange(piece_begin->Limits().GetFrom(),piece_end->Limits().GetTo()),CGeneModel::eRemoveExons); // only one piece
 614 
 615  //remove flanking bad introns
 // from the left: move new_left to the exon after each not-accepted intron, stop at the first accepted one
 616  int new_left = a.Limits().GetFrom();
 617  for(int k = 1; k < (int)a.Exons().size(); ++k) {
 618  CModelExon exonl = a.Exons()[k-1];
 619  CModelExon exonr = a.Exons()[k];
 620  if(isGoodIntron(exonl.GetTo(), exonr.GetFrom(), a.Strand(), m_align_introns, check_introns_on_both_strands))
 621  break;
 622  else
 623  new_left = exonr.GetFrom();
 624  }
 // from the right: same, without going left of new_left
 625  int new_right = a.Limits().GetTo();
 626  for(int k = (int)a.Exons().size()-1; k > 0 && a.Exons()[k-1].GetTo() > new_left; --k) {
 627  CModelExon exonl = a.Exons()[k-1];
 628  CModelExon exonr = a.Exons()[k];
 629  if(isGoodIntron(exonl.GetTo(), exonr.GetFrom(), a.Strand(), m_align_introns, check_introns_on_both_strands))
 630  break;
 631  else
 632  new_right = exonl.GetTo();
 633  }
 634 
 635  TSignedSeqRange new_lim(new_left,new_right);
 636  if(new_lim != a.Limits()) {
 637  new_lim = amap.ShrinkToRealPoints(new_lim,false);
 638  a.Clip(new_lim,CGeneModel::eRemoveExons);
 639  _ASSERT(a.Limits().NotEmpty());
 640  }
 641 
 // pieces are separated by a hole in the rebuilt model
 642  if(!editedmodel.Exons().empty())
 643  editedmodel.AddHole();
 644 
 645  ITERATE(CGeneModel::TExons, e, a.Exons()) {
 646  editedmodel.AddExon(e->Limits(), e->m_fsplice_sig, e->m_ssplice_sig, e->m_ident);
 647  }
 648  editedmodel.FrameShifts().insert(editedmodel.FrameShifts().end(),a.FrameShifts().begin(),a.FrameShifts().end());
 649 
 // continue after this piece; the loop increment steps past piece_end
 650  piece_begin = piece_end;
 651  }
 652  }
 653 
 654 
 // an EST that ended up as a single exon with different limits is rejected
 655  bool good_alignment = true;
 656  if((align.Type()&CGeneModel::eEST) && (int)editedmodel.Exons().size() == 1 && editedmodel.Limits() != align.Limits())
 657  good_alignment = false;
 658 
 659 
 // each CutExons changes the exon list, so the scan restarts after every cut
 660  bool keepdoing = true;
 661  while(keepdoing) {
 662  keepdoing = false;
 663  for (int k = 1; k < (int)editedmodel.Exons().size() && good_alignment; ++k) {
 664  CModelExon exonl = editedmodel.Exons()[k-1];
 665  CModelExon exonr = editedmodel.Exons()[k];
 666  if(exonl.m_ssplice && exonr.m_fsplice && !isGoodIntron(exonl.GetTo(), exonr.GetFrom(), editedmodel.Strand(), m_align_introns, check_introns_on_both_strands)) {
 667  if(editedmodel.Status()&CGeneModel::eGapFiller) {
 // gap filler: cut the intron out, provided both sides shrink to non-empty ranges
 668  TSignedSeqRange segl = amap.ShrinkToRealPoints(TSignedSeqRange(editedmodel.Limits().GetFrom(),exonl.GetTo()-1), false);
 669  TSignedSeqRange segr = amap.ShrinkToRealPoints(TSignedSeqRange(exonr.GetFrom()+1,editedmodel.Limits().GetTo()), false);
 670  if(segl.NotEmpty() && segr.NotEmpty()) {
 671  editedmodel.CutExons(TSignedSeqRange(segl.GetTo()+1,segr.GetFrom()-1));
 672  keepdoing = true;
 673  break;
 674  }
 675  } else {
 676  good_alignment = false;
 677  }
 678  }
 679  }
 680  }
 681 
 // transcript ranges of the edited exons, needed to build the new alignment map
 682  vector<TSignedSeqRange> transcript_exons;
 683  ITERATE(CGeneModel::TExons, e, editedmodel.Exons()) {
 685  _ASSERT(te.NotEmpty());
 686  transcript_exons.push_back(te);
 687  }
 688 
 689  TSignedSeqRange old_limits = align.Limits();
 690 
 // build the edited alignment and put it in place of the original
 691  CAlignMap editedamap(editedmodel.Exons(), transcript_exons, editedmodel.FrameShifts(), align.Orientation(), align.GetAlignMap().TargetLen());
 692  CAlignModel editedalign(editedmodel, editedamap);
 693  editedalign.SetTargetId(*align.GetTargetId());
 694  align = editedalign;
 695 
 // the status bit is known to be set inside each test below, so the XOR clears it
 696  if((align.Status()&CGeneModel::ePolyA) &&
 697  ((align.Strand() == ePlus && align.Limits().GetTo() != old_limits.GetTo()) ||
 698  (align.Strand() == eMinus && align.Limits().GetFrom() != old_limits.GetFrom()))) { // clipped polyA
 699 
 700  align.Status() ^= CGeneModel::ePolyA;
 701  }
 702  if((align.Status()&CGeneModel::eCap) &&
 703  ((align.Strand() == eMinus && align.Limits().GetTo() != old_limits.GetTo()) ||
 704  (align.Strand() == ePlus && align.Limits().GetFrom() != old_limits.GetFrom()))) { // clipped cap
 705 
 706  align.Status() ^= CGeneModel::eCap;
 707  }
 708 
 709  return good_alignment;
 710 }
711 
712 #define MISM_PENALTY 10
713 #define INDEL_PENALTY 20
714 #define EXTRA_CUT 5
715 #define BIG_NOT_ALIGNED 20
716 void CAlignCollapser::CleanSelfTranscript(CAlignModel& align, const string& trans) const {
717 
718  string transcript = trans; // transcript as it appears on the genome
719  if(align.Orientation() == eMinus)
720  ReverseComplement(transcript.begin(),transcript.end());
721 
722  int tlen = align.TargetLen();
723  _ASSERT(tlen == (int)transcript.size());
724 
725  //expand not splices exons if identical
726  CGeneModel::TExons exons = align.Exons();
727  vector<TSignedSeqRange> transcript_exons;
728  transcript_exons.reserve(exons.size());
729  for(int ie = 0; ie < (int)exons.size(); ++ie) {
730  transcript_exons.push_back(align.TranscriptExon(ie));
731  }
732  if(align.Orientation() == eMinus) {
733  for(int ie = 0; ie < (int)exons.size(); ++ie) {
734  TSignedSeqRange& te = transcript_exons[ie];
735  te = TSignedSeqRange(tlen-1-te.GetTo(),tlen-1-te.GetFrom());
736  }
737  }
738  for(int ie = 0; ie < (int)exons.size(); ++ie) {
739  if(!exons[ie].m_fsplice) {
740  int glim = (ie > 0) ? exons[ie-1].GetTo() : -1;
741  int tlim = (ie > 0) ? transcript_exons[ie-1].GetTo() : -1;
742  int g = exons[ie].GetFrom();
743  int t = transcript_exons[ie].GetFrom();
744  while(g > glim+1 && t > tlim+1 && transcript[t-1] == m_contig[g-1]) {
745  --t;
746  --g;
747  }
748  if(g < exons[ie].GetFrom()) {
749  exons[ie].AddFrom(g-exons[ie].GetFrom());
750  exons[ie].m_fsplice_sig.clear();
751  transcript_exons[ie].SetFrom(t);
752  }
753  }
754  if(!exons[ie].m_ssplice) {
755  int glim = (ie+1 < (int)exons.size()) ? exons[ie+1].GetFrom() : m_contig.FullLength();
756  int tlim = (ie+1 < (int)exons.size()) ? transcript_exons[ie+1].GetFrom() : (int)transcript.size();
757  int g = exons[ie].GetTo();
758  int t = transcript_exons[ie].GetTo();
759  while(g < glim-1 && t < tlim-1 && transcript[t+1] == m_contig[g+1]) {
760  ++t;
761  ++g;
762  }
763  if(g > exons[ie].GetTo()) {
764  exons[ie].AddTo(g-exons[ie].GetTo());
765  exons[ie].m_ssplice_sig.clear();
766  transcript_exons[ie].SetTo(t);
767  }
768  }
769  }
770 
771  CAlignMap amap(exons,transcript_exons, align.FrameShifts(), ePlus, tlen);
772 
773  CGeneModel::TExons edited_exons;
774  vector<TSignedSeqRange> edited_transcript_exons;
775 
776  for (int piece_begin = 0; piece_begin < (int)exons.size(); ++piece_begin) {
777  _ASSERT( !exons[piece_begin].m_fsplice );
778  int piece_end = piece_begin;
779  for( ; exons[piece_end].m_ssplice; ++piece_end);
780  _ASSERT(piece_end < (int)exons.size());
781 
782  TInDels indels = align.GetInDels(exons[piece_begin].GetFrom(), exons[piece_end].GetTo(), false);
783  TInDels::const_iterator indl = indels.begin();
784 
785  string tseq;
786  string gseq;
787  TIVec exons_to_align;
788  int tp = transcript_exons[piece_begin].GetFrom();
789  for(int ie = piece_begin; ie <= piece_end; ++ie) {
790  TSignedSeqPos gp = exons[ie].GetFrom();
791  while(gp <= exons[ie].GetTo()) {
792  if(indl == indels.end() || indl->Loc() != gp) {
793  tseq.push_back(transcript[tp++]);
794  gseq.push_back(m_contig[gp++]);
795  } else if(indl->IsDeletion()) {
796  tseq += transcript.substr(tp,indl->Len());
797  gseq.insert(gseq.end(),indl->Len(),'-');
798  tp += indl->Len();
799  ++indl;
800  } else {
801  tseq.insert(tseq.end(),indl->Len(),'-');
802  gseq += m_contig.substr(gp,indl->Len());
803  gp += indl->Len();
804  ++indl;
805  }
806  }
807  if(indl != indels.end() && indl->Loc() == gp) { // deletion at the end of exon
808  _ASSERT(indl->IsDeletion());
809  tseq += transcript.substr(tp,indl->Len());
810  gseq.insert(gseq.end(), indl->Len(), '-');
811  tp += indl->Len();
812  ++indl;
813  }
814  exons_to_align.push_back((int)gseq.size()-1);
815  }
816  _ASSERT(tseq.size() == gseq.size() && indl == indels.end());
817 
818  TIVec score(tseq.size());
819  for(int i = 0; i < (int)score.size(); ++i) {
820  if(tseq[i] == gseq[i] && tseq[i] != 'N')
821  score[i] = 1;
822  else if(tseq[i] == '-' || gseq[i] == '-')
823  score[i] = -INDEL_PENALTY;
824  else
825  score[i] = -MISM_PENALTY;
826  if(i > 0)
827  score[i] += score[i-1];
828  score[i] = max(0,score[i]);
829  }
830 
831  int align_right = (int)(max_element(score.begin(),score.end())-score.begin());
832 
833  if(score[align_right] > 0) { // there is at least one match
834  int align_left = align_right;
835  while(align_left > 0 && score[align_left-1] > 0)
836  --align_left;
837 
838  int agaps = (int)count(tseq.begin(), tseq.begin()+align_left, '-');
839  int bgaps = (int)count(tseq.begin(), tseq.begin()+align_right, '-');
840  TSignedSeqRange trange(transcript_exons[piece_begin].GetFrom()+align_left-agaps, transcript_exons[piece_begin].GetFrom()+align_right-bgaps);
841 
842  TSignedSeqRange grange = amap.MapRangeEditedToOrig(trange, false);
843  _ASSERT(grange.NotEmpty());
844 
845  int pb = piece_begin;
846  while(exons[pb].GetTo() < grange.GetFrom())
847  ++pb;
848  int pe = piece_end;
849  while(exons[pe].GetFrom() > grange.GetTo())
850  --pe;
851  _ASSERT(pe >= pb);
852 
853  double lident = 0; // left exon identity
854  int len = 0;
855  for(int i = align_left; i <= (pe > pb ? exons_to_align[pb-piece_begin] : align_right); ++i) {
856  ++len;
857  if(tseq[i] == gseq[i])
858  ++lident;
859  }
860  lident /= len;
861 
862  double rident = 0; // right exon identity
863  len = 0;
864  for(int i = align_right; i >= (pe > pb ? exons_to_align[pe-1-piece_begin]+1 : align_left); --i) {
865  ++len;
866  if(tseq[i] == gseq[i])
867  ++rident;
868  }
869  rident /= len;
870 
871  for( int ie = pb; ie <= pe; ++ie) {
872  CModelExon e = exons[ie];
873  TSignedSeqRange t = transcript_exons[ie];
874  if(ie == pb) {
875  e.m_fsplice = false;
876  e.Limits().SetFrom(grange.GetFrom());
877  t.SetFrom(trange.GetFrom());
878  e.m_fsplice_sig.clear();
879  e.m_ident = lident;
880  }
881  if(ie == pe) {
882  e.m_ssplice = false;
883  e.Limits().SetTo(grange.GetTo());
884  t.SetTo(trange.GetTo());
885  e.m_ssplice_sig.clear();
886  e.m_ident = rident;
887  }
888 
889  edited_exons.push_back(e);
890  edited_transcript_exons.push_back(t);
891  }
892  }
893  piece_begin = piece_end;
894  }
895 
896 
897  CGeneModel editedmodel = align;
898  editedmodel.ClearExons(); // empty alignment with all atributes
899  TInDels edited_indels;
900 
901  for (int piece_begin = 0; piece_begin < (int)edited_exons.size(); ++piece_begin) {
902  _ASSERT( !edited_exons[piece_begin].m_fsplice );
903  int piece_end = piece_begin;
904  for( ; edited_exons[piece_end].m_ssplice; ++piece_end);
905  _ASSERT(piece_end < (int)edited_exons.size());
906 
907  //find splices if possible
908  if(!(align.Status()&CGeneModel::eUnknownOrientation)) {
909  TSignedSeqRange& elim = edited_exons[piece_begin].Limits();
910  TSignedSeqRange& tlim = edited_transcript_exons[piece_begin];
911  int distance_to_lgap = -1;
913  if(igap != m_genomic_gaps_len.begin()) {
914  --igap;
915  distance_to_lgap = elim.GetFrom()-(igap->first+igap->second);
916  }
917  if(distance_to_lgap == 0) { // ubutting gap
918  edited_exons[piece_begin].m_fsplice_sig = "NN";
919  } else if(tlim.GetFrom() > BIG_NOT_ALIGNED && (piece_begin == 0 || tlim.GetFrom() > edited_transcript_exons[piece_begin-1].GetTo()+1)) {
920  string splice = (align.Strand() == ePlus) ? "AG" : "AC";
921  for(int p = max(0,elim.GetFrom()-2); p <= min(elim.GetFrom()+EXTRA_CUT, elim.GetTo()-MISM_PENALTY)-2; ++p) {
922  if(m_contig[p] == splice[0] && m_contig[p+1] == splice[1]) {
923  tlim.SetFrom(tlim.GetFrom()+p+2-elim.GetFrom());
924 
925  int del_len = 0;
926  ITERATE(TInDels, indl, align.FrameShifts()) {
927  if(indl->IsDeletion() && Include(elim, indl->Loc()))
928  del_len += indl->Len();
929  }
930  double errors = (1.-edited_exons[piece_begin].m_ident)*(elim.GetLength()+del_len);
931  elim.SetFrom(p+2);
932  edited_exons[piece_begin].m_ident = 1.-errors/(elim.GetLength()+del_len); // splices won't clip indels or mismatches
933  if(align.Strand() == eMinus)
934  ReverseComplement(splice.begin(),splice.end());
935  edited_exons[piece_begin].m_fsplice_sig = splice;
936  _ASSERT(elim.NotEmpty());
937 
938  break;
939  }
940  }
941  }
942  }
943  if(!(align.Status()&CGeneModel::eUnknownOrientation)) {
944  TSignedSeqRange& elim = edited_exons[piece_end].Limits();
945  TSignedSeqRange& tlim = edited_transcript_exons[piece_end];
946  int distance_to_rgap = -1;
948  if(igap != m_genomic_gaps_len.end())
949  distance_to_rgap = igap->first-elim.GetTo()-1;
950  if(distance_to_rgap == 0) { // ubutting gap
951  edited_exons[piece_end].m_ssplice_sig = "NN";
952  } else if(tlen-tlim.GetTo()-1 > BIG_NOT_ALIGNED && (piece_end == (int)edited_exons.size()-1 || tlim.GetTo() < edited_transcript_exons[piece_end+1].GetFrom()-1)) {
953  string splice = (align.Strand() == ePlus) ? "GT" : "CT";
954  for(int p = min((int)m_contig.FullLength()-1,elim.GetTo()+2); p >= max(elim.GetTo()-EXTRA_CUT, elim.GetFrom()+MISM_PENALTY)+2; --p) {
955  if(m_contig[p-1] == splice[0] && m_contig[p] == splice[1]) {
956  tlim.SetTo(tlim.GetTo()-elim.GetTo()+p-2);
957 
958  int del_len = 0;
959  ITERATE(TInDels, indl, align.FrameShifts()) {
960  if(indl->IsDeletion() && Include(elim, indl->Loc()))
961  del_len += indl->Len();
962  }
963  double errors = (1.-edited_exons[piece_end].m_ident)*(elim.GetLength()+del_len);
964  elim.SetTo(p-2);
965  edited_exons[piece_end].m_ident = 1.-errors/(elim.GetLength()+del_len); // splices won't clip indels
966  if(align.Strand() == eMinus)
967  ReverseComplement(splice.begin(),splice.end());
968  edited_exons[piece_end].m_ssplice_sig = splice;
969  _ASSERT(elim.NotEmpty());
970 
971  break;
972  }
973  }
974  }
975  }
976 
977  for(int ie = piece_begin; ie <= piece_end; ++ie) {
978  CModelExon& e = edited_exons[ie];
979  editedmodel.AddExon(e.Limits(), e.m_fsplice_sig, e.m_ssplice_sig, e.m_ident);
980  }
981  editedmodel.AddHole();
982  ITERATE(TInDels, indl, align.FrameShifts()) {
983  if(indl->Loc() > edited_exons[piece_begin].GetFrom() && indl->Loc() < edited_exons[piece_end].GetTo())
984  edited_indels.push_back(*indl);
985  }
986 
987  piece_begin = piece_end;
988  }
989 
990  if(align.Orientation() == eMinus) {
991  for(int ie = 0; ie < (int)edited_transcript_exons.size(); ++ie) {
992  TSignedSeqRange& te = edited_transcript_exons[ie];
993  te = TSignedSeqRange(tlen-1-te.GetTo(),tlen-1-te.GetFrom());
994  }
995  }
996  CAlignMap editedamap(editedmodel.Exons(),edited_transcript_exons, edited_indels, align.Orientation(), tlen);
997  editedmodel.FrameShifts() = edited_indels;
998  CAlignModel editedalign(editedmodel, editedamap);
999  editedalign.SetTargetId(*align.GetTargetId());
1000 
1001  align = editedalign;
1002 }
1003 
1004 int TotalFrameShift(const TInDels& indels, int a, int b) {
1005  int fs = 0;
1006  ITERATE(TInDels, indl, indels) {
1007  if(indl->IsMismatch() || !indl->IntersectingWith(a, b))
1008  continue;
1009  if(indl->IsInsertion())
1010  fs += indl->Len();
1011  else
1012  fs -= indl->Len();
1013  }
1014 
1015  return fs%3;
1016 }
1017 
1018 
1019 
1021 
1022  return TotalFrameShift(indels, range.GetFrom(), range.GetTo());
1023 }
1024 
1025 
1026 
1028  const CArgs& args = CNcbiApplication::Instance()->GetArgs();
1030  int right_end = 0;
1031 
1032  size_t count_m_aligns = 0;
1033  ITERATE(Tdata, i, m_aligns) {
1034  ITERATE(deque<SAlignIndividual>, k, i->second) {
1035  m_left_end = min(m_left_end, k->m_range.GetFrom());
1036  right_end = max(right_end, k->m_range.GetTo());
1037  ++count_m_aligns;
1038  }
1039  }
1040  if (count_m_aligns == 0 && m_long_read_count == 0) {
1041  for(TAlignModelList::iterator it = m_aligns_for_filtering_only.begin(); it != m_aligns_for_filtering_only.end(); ) {
1042  TAlignModelList::iterator i = it++;
1043  CAlignModel& align = *i;
1044  if(align.Type()&CGeneModel::eNotForChaining)
1046  }
1047 
1048  return;
1049  }
1050 
1052  const SIntron& intron = it->first;
1053 
1054  // cerr << "Intron: " << m_contig_name << ':' << intron.m_range.GetFrom()+1 << ':' << intron.m_range.GetTo()+1 << ' ' << it->second.m_weight << ' ' << it->second.m_sr_support << ' ' << it->second.m_est_support << ' ' << it->second.m_other_support << '\n';
1055 
1056  int a = intron.m_range.GetFrom();
1057  int b = intron.m_range.GetTo();
1058  m_left_end = min(m_left_end, a);
1059  right_end = max(right_end, b);
1060  }
1061 
1063  m_left_end = min(m_left_end,i->Limits().GetFrom());
1064  right_end = max(right_end,i->Limits().GetTo());
1065  }
1066 
1067  int len = right_end-m_left_end+1;
1068 
1069  cerr << "Before filtering: " << m_align_introns.size() << " introns, " << m_count << " alignments" << endl;
1070 
1071  if(m_filterest) { // remove less supported fuzzy introns
1072  map<TSignedSeqRange, tuple<double,bool>> intervals; // [intron interval] weight, don't delete
1073  for(auto it = m_align_introns.begin(); it != m_align_introns.end(); ++it) {
1074  const SIntron& intron = it->first;
1075  SIntronData& data = it->second;
1076  auto& rslt = intervals[intron.m_range];
1077  get<0>(rslt) += data.m_weight;
1078  bool not_long = data.m_sr_support > 0 || data.m_other_support > 0;
1079  if(not_long || data.m_ident >= m_minident)
1080  get<1>(rslt) = true;
1081  }
1082 
1083  multimap<int, TAlignIntrons::iterator> left_intronsp;
1084  multimap<int, TAlignIntrons::iterator> right_intronsp;
1085  for(auto it = m_align_introns.begin(); it != m_align_introns.end(); ++it) {
1086  const SIntron& intron = it->first;
1087  if(!get<1>(intervals[intron.m_range])) {
1088  left_intronsp.emplace(intron.m_range.GetFrom(), it);
1089  right_intronsp.emplace(intron.m_range.GetTo(), it);
1090  }
1091  }
1092 
1093  set<SIntron> deleted_introns;
1094  int delta = 6;
1095  for(auto& interval : intervals) {
1096  TSignedSeqRange range = interval.first;
1097  auto il = left_intronsp.lower_bound(range.GetFrom()-delta); // >= acceptable left for left intron end
1098  if(il == left_intronsp.end() || il->first > range.GetFrom()+delta) // no close left ends
1099  continue;
1100  auto ir = left_intronsp.upper_bound(range.GetFrom()+delta); // > acceptable right or end
1101  set<TAlignIntrons::value_type*> close_lefts;
1102  for(auto i = il; i != ir; ++i) {
1103  const SIntron& intronb = i->second->first;
1104  if(intronb.m_range != range)
1105  close_lefts.insert(&(*i->second));
1106  }
1107  if(close_lefts.empty())
1108  continue;
1109 
1110  auto jl = right_intronsp.lower_bound(range.GetTo()-delta); // >= acceptable left for right intron end
1111  if(jl == right_intronsp.end() || jl->first > range.GetTo()+delta) // no close right end
1112  continue;
1113  auto jr = right_intronsp.upper_bound(range.GetTo()+delta); // > acceptable right or end
1114  set<TAlignIntrons::value_type*> close_rights;
1115  for(auto j = jl; j != jr; ++j) {
1116  const SIntron& intronb = j->second->first;
1117  if(intronb.m_range != range)
1118  close_rights.insert(&(*j->second));
1119  }
1120  if(close_rights.empty())
1121  continue;
1122 
1123  vector<TAlignIntrons::value_type*> close(min(close_lefts.size(),close_rights.size()), nullptr);
1124  auto close_end = set_intersection(close_lefts.begin(), close_lefts.end(), close_rights.begin(), close_rights.end(), close.begin());
1125  for(auto it = close.begin(); it != close_end; ++it) {
1126  const SIntron& intronb = (*it)->first;
1127  if(get<0>(intervals[intronb.m_range]) < get<0>(interval.second))
1128  deleted_introns.insert(intronb);
1129  }
1130  }
1131  for(auto& intron : deleted_introns) {
1132  // cerr << "Fuzzy intron: " << m_contig_name << ':' << intron.m_range.GetFrom()+1 << ':' << intron.m_range.GetTo()+1 << '\n';
1133  m_align_introns.erase(intron);
1134  }
1135  }
1136 
1137 
1138 #define COVERAGE_WINDOW 20
1139  //coverage calculation
1140  m_coverage.resize(len,0.);
1141  ITERATE(Tdata, i, m_aligns) {
1142  int factor = 1;
1143  if(i->first.isEST() && i->first.GetIntrons().size() > 1)
1144  factor = i->first.GetIntrons().size();
1145  ITERATE(deque<SAlignIndividual>, k, i->second) { // short reads and collapsed EST
1146  float weight = k->m_weight*factor;
1147  TSignedSeqRange range = k->m_range;
1148  for(int l = range.GetFrom(); l <= range.GetTo(); ++l) // add coverage for all alignment range
1150  ITERATE(CAlignCommon::Tintrons, in, i->first.GetIntrons()) { // substract intron ranges
1151  for(int l = in->m_range.GetFrom()+1; l <= in->m_range.GetTo()-1; ++l)
1153  }
1154  }
1155  }
1156  for(auto& align : m_aligns_for_filtering_only) {
1157  if((align.Type()&CGeneModel::eProt) || (align.Type()&CGeneModel::eNotForChaining))
1158  continue;
1159  // any cDNA for chaining
1160  for(auto& exon : align.Exons()) {
1161  for(int l = exon.GetFrom(); l <= exon.GetTo(); ++l)
1162  m_coverage[l-m_left_end] += align.Weight();
1163  }
1164  }
1165 
1166  vector<double> left_coverage(len,0.); // average from the left side (including point)
1167  double wsum = 0;
1168  for(int i = 0; i < len; ++i) {
1169  wsum += m_coverage[i];
1170  int ipast = i - COVERAGE_WINDOW;
1171  if(ipast >= 0)
1172  wsum -= m_coverage[ipast];
1173  left_coverage[i] = wsum/COVERAGE_WINDOW;
1174  }
1175  vector<double> right_coverage(len,0.); // average from the right side (including point)
1176  wsum = 0;
1177  for(int i = len-1; i >= 0; --i) {
1178  wsum += m_coverage[i];
1179  int ipast = i + COVERAGE_WINDOW;
1180  if(ipast < len)
1181  wsum -= m_coverage[ipast];
1182  right_coverage[i] = wsum/COVERAGE_WINDOW;
1183  }
1184 
1185  //initial intron filtering
1186  int minconsensussupport = args["min-consensus-support"].AsInteger();
1187  int minnonconsensussupport = args["min-non-consensussupport"].AsInteger();
1188  // int minest = args["minest"].AsInteger();
1190  TAlignIntrons::iterator intron = it++;
1191  bool bad_intron = false;
1192  SIntronData& id = intron->second;
1193 
1194  if(!id.m_selfsp_support)
1195  bad_intron = true;
1196 
1197  if(id.m_keep_anyway)
1198  continue;
1199 
1200  if(intron->first.m_sig == "CTAC" && intron->first.m_oriented)
1201  bad_intron = true;
1202 
1203  // if(intron->first.m_sig == "GTAG") {
1204  if(intron->first.m_sig == "GTAG" || (intron->second.m_intron_num >= 3 && intron->first.m_sig == "GCAG")) {
1205  if(id.m_weight < minconsensussupport)
1206  bad_intron = true;
1207  } else {
1208  if(id.m_weight < minnonconsensussupport)
1209  bad_intron = true;
1210  }
1211 
1212  if(bad_intron) {
1213  // cerr << "Deleted intron1: " << m_contig_name << ':' << intron->first.m_range.GetFrom()+1 << ':' << intron->first.m_range.GetTo()+1 << '\n';
1214  m_align_introns.erase(intron);
1215  }
1216  }
1217 
1218  //filter low expressed splices
1219  double minspliceexpression = args["min-support-fraction"].AsDouble();
1220  double minintronexpression = args["end-pair-support-cutoff"].AsDouble();
1221 
1222  typedef multimap<int,CAlignCollapser::TAlignIntrons::iterator> TIntronsBySplice;
1223  TIntronsBySplice introns_by_left_splice;
1225  introns_by_left_splice.insert(TIntronsBySplice::value_type(intron->first.m_range.GetFrom(),intron));
1226  }
1227  for(TIntronsBySplice::iterator a = introns_by_left_splice.begin(); a != introns_by_left_splice.end(); ) {
1228  int splice = a->first;
1229  TIntronsBySplice::iterator b = introns_by_left_splice.upper_bound(splice); // first with different splice
1230 
1231  double weight = 0;
1232  int number = 0;
1233  for(TIntronsBySplice::iterator i = a; i != b; ++i) {
1234  ++number;
1235  weight += i->second->second.m_weight*i->second->second.m_intron_num;
1236  }
1237  double mean = weight/number;
1238 
1239  for(TIntronsBySplice::iterator it = a; it != b; ) {
1240  TIntronsBySplice::iterator i = it++;
1241  SIntronData& id = i->second->second;
1242  if(!id.m_keep_anyway && (id.m_weight*id.m_intron_num < minintronexpression*mean || weight < minspliceexpression*left_coverage[splice-m_left_end])) {
1243  id.m_weight = -1;
1244  introns_by_left_splice.erase(i);
1245  }
1246  }
1247 
1248  a = b;
1249  }
1250 
1251  TIntronsBySplice introns_by_right_splice;
1253  introns_by_right_splice.insert(TIntronsBySplice::value_type(intron->first.m_range.GetTo(),intron));
1254  }
1255  for(TIntronsBySplice::iterator a = introns_by_right_splice.begin(); a != introns_by_right_splice.end(); ) {
1256  int splice = a->first;
1257  TIntronsBySplice::iterator b = introns_by_right_splice.upper_bound(splice); // first with different splice
1258 
1259  double weight = 0;
1260  int number = 0;
1261  for(TIntronsBySplice::iterator i = a; i != b; ++i) {
1262  ++number;
1263  weight += i->second->second.m_weight*i->second->second.m_intron_num;
1264  }
1265  double mean = weight/number;
1266 
1267  for(TIntronsBySplice::iterator it = a; it != b; ) {
1268  TIntronsBySplice::iterator i = it++;
1269  SIntronData& id = i->second->second;
1270  if(!id.m_keep_anyway && (id.m_weight*id.m_intron_num < minintronexpression*mean || weight < minspliceexpression*right_coverage[splice-m_left_end])) {
1271  id.m_weight = -1;
1272  introns_by_right_splice.erase(i);
1273  }
1274  }
1275 
1276  a = b;
1277  }
1278 
1280  TAlignIntrons::iterator intron = it++;
1281  if(intron->second.m_weight < 0) {
1282  // cerr << "Deleted intron2: " << m_contig_name << ':' << intron->first.m_range.GetFrom()+1 << ':' << intron->first.m_range.GetTo()+1 << '\n';
1283  m_align_introns.erase(intron);
1284  }
1285  }
1286 
1287  //remove/cut pool alignments with bad introns
1288  for(Tdata::iterator it = m_aligns.begin(); it != m_aligns.end(); ) {
1289  Tdata::iterator data = it++;
1290  const CAlignCommon& alc = data->first;
1291 
1292  if((alc.isEST() && !m_filterest) || (alc.isSR() && !m_filtersr))
1293  continue;
1294 
1295  const CAlignCommon::Tintrons& introns = alc.GetIntrons();
1296  if(introns.empty())
1297  continue;
1298 
1299  bool all_good = true;
1300  for(int i = 0; all_good && i < (int)introns.size(); ++i) {
1301  all_good = (m_align_introns.find(introns[i]) != m_align_introns.end());
1302  }
1303  if(all_good)
1304  continue;
1305 
1306  const deque<char>& id_pool = m_target_id_pool[alc];
1307  ITERATE(deque<SAlignIndividual>, i, data->second) {
1308  CAlignModel align(alc.GetAlignment(*i, id_pool));
1309  CGeneModel::TExons& e = align.Exons();
1310  for(unsigned int l = 1; l < e.size(); ++l) {
1311  SIntron intron(e[l-1].GetTo(), e[l].GetFrom(), align.Strand(), (align.Status()&CGeneModel::eUnknownOrientation) == 0, ""); // signature not used
1312  if(!m_align_introns.count(intron)) {
1313  e[l-1].m_ssplice = false;
1314  e[l].m_fsplice = false;
1315  }
1316  }
1317  TAlignModelList aligns = GetAlignParts(align, false);
1318  for(auto& a : aligns) {
1319  if(alc.isEST())
1320  a.Status() |= CGeneModel::eChangedByFilter;
1321  if(a.Exons().size() > 1) {
1322  CAlignCommon c(a);
1323  m_aligns[c].push_back(SAlignIndividual(a, m_target_id_pool[c]));
1324  }
1325  }
1326  }
1327 
1328  // delete initial alignments and ids
1329  m_target_id_pool.erase(data->first);
1330  m_aligns.erase(data);
1331  }
1332 
1333  //splices which should not be crossed
1334  double mincrossexpression = args["sharp-boundary"].AsDouble();
1335  TIVec left_plus(len,right_end); // closest left + strand splice 'on the right' from the current position
1336  TIVec left_minus(len,right_end); // closest left - strand splice 'on the right' from the current position
1337  TIVec right_plus(len,m_left_end); // closest right + strand splice 'on the left' from the current position
1338  TIVec right_minus(len,m_left_end); // closest right - strand splice 'on the left' from the current position
1339  for(auto& intron_data : m_align_introns) {
1340  const SIntron& intron = intron_data.first;
1341  SIntronData& data = intron_data.second;
1342  int a = intron.m_range.GetFrom();
1343  int b = intron.m_range.GetTo();
1344 
1345  double two_side_exon_coverage = max(left_coverage[a-m_left_end],right_coverage[b-m_left_end]);
1346 
1347  // if(right_coverage[a+1-m_left_end] < mincrossexpression*left_coverage[a-m_left_end]) {
1348  if(right_coverage[a+1-m_left_end] < mincrossexpression*two_side_exon_coverage) {
1349  if(!intron.m_oriented || intron.m_strand == ePlus) {
1350  data.m_not_cross = true;
1351  left_plus[a-m_left_end] = a;
1352  }
1353  if(!intron.m_oriented || intron.m_strand == eMinus) {
1354  data.m_not_cross = true;
1355  left_minus[a-m_left_end] = a;
1356  }
1357  }
1358 
1359  // if(left_coverage[b-1-m_left_end] < mincrossexpression*right_coverage[b-m_left_end]) {
1360  if(left_coverage[b-1-m_left_end] < mincrossexpression*two_side_exon_coverage) {
1361  if(!intron.m_oriented || intron.m_strand == ePlus) {
1362  data.m_not_cross = true;
1363  right_plus[b-m_left_end] = b;
1364  }
1365  if(!intron.m_oriented || intron.m_strand == eMinus) {
1366  data.m_not_cross = true;
1367  right_minus[b-m_left_end] = b;
1368  }
1369  }
1370 
1371  // cerr << "Intron: " << intron.m_range << " " << data.m_not_cross << " " << right_coverage[a+1-m_left_end] << " " << left_coverage[b-1-m_left_end] << " " << two_side_exon_coverage << endl;
1372  }
1373 
1374  for(int i = 1; i < len; ++i) {
1375  right_plus[i] = max(right_plus[i],right_plus[i-1]);
1376  right_minus[i] = max(right_minus[i],right_minus[i-1]);
1377  }
1378  for(int i = len-2; i >= 0; --i) {
1379  left_plus[i] = min(left_plus[i],left_plus[i+1]);
1380  left_minus[i] = min(left_minus[i],left_minus[i+1]);
1381  }
1382 
1383  double clip_threshold = args["utrclipthreshold"].AsDouble();
1384  for(Tdata::iterator it = m_aligns.begin(); it != m_aligns.end(); ) {
1385  Tdata::iterator data = it++;
1386  const CAlignCommon& alc = data->first;
1387  deque<SAlignIndividual>& aligns = data->second;
1388 
1389  if((alc.isEST() && !m_filterest) || (alc.isSR() && !m_filtersr))
1390  continue;
1391 
1392  const deque<char>& id_pool = m_target_id_pool[alc];
1393  NON_CONST_ITERATE(deque<SAlignIndividual>, i, aligns) {
1394  CAlignModel align(alc.GetAlignment(*i, id_pool));
1395  int cap_polya0 = align.Status()&(CGeneModel::eCap|CGeneModel::ePolyA);
1396  TSignedSeqRange lim = align.Limits();
1397  auto exnum = align.Exons().size();
1398  if(alc.isSR())
1399  ClipESTorSR(align, clip_threshold, 2);
1400  else
1401  ClipESTorSR(align, 0, 2);
1402  if(align.Limits() == lim)
1403  continue;
1404 
1405  // cerr << "Clipped read: " << i->m_align_id << " " << lim.GetFrom()+1 << " " << lim.GetTo()+1 << " " << align.Limits().GetFrom()+1 << " " << align.Limits().GetTo()+1 << " " << exnum << " " << align.Exons().size() << endl;
1406 
1407  if(align.Limits().Empty()) {
1408  i->m_weight = -1;
1409  continue;
1410  }
1411  int cap_polya1 = align.Status()&(CGeneModel::eCap|CGeneModel::ePolyA);
1412  if(exnum != align.Exons().size() || cap_polya0 != cap_polya1) {
1413  if(align.Exons().size() > 1 || exnum == 1) {
1414  CAlignCommon c(align);
1415  m_aligns[c].push_back(SAlignIndividual(align, m_target_id_pool[c]));
1416  }
1417  i->m_weight = -1;
1418  continue;
1419  }
1420  i->m_range = align.Limits();
1421  }
1422 
1423  aligns.erase(remove_if(aligns.begin(),aligns.end(),AlignmentMarkedForDeletion),aligns.end());
1424  if(aligns.empty())
1425  m_aligns.erase(data);
1426  }
1427 
1428  //filter/cut low abundance one-exon and crossing splices
1429  int minsingleexpression = args["min-edge-coverage"].AsInteger();
1430  int trim = args["trim"].AsInteger();
1431  int total = 0;
1432  if(m_filtersr ) {
1433  for(Tdata::iterator it = m_aligns.begin(); it != m_aligns.end(); ) {
1434  Tdata::iterator data = it++;
1435  const CAlignCommon& alc = data->first;
1436  deque<SAlignIndividual>& aligns = data->second;
1437 
1438  if(!alc.isSR())
1439  continue;
1440 
1441  if(alc.GetIntrons().empty()) { // not spliced
1442  NON_CONST_ITERATE(deque<SAlignIndividual>, i, aligns) {
1443  int a = i->m_range.GetFrom()+trim;
1444  int b = i->m_range.GetTo()-trim;
1445  if(b > a) {
1446  if((m_coverage[a-m_left_end] < minsingleexpression || m_coverage[b-m_left_end] < minsingleexpression) && !alc.isPolyA() && !alc.isCap())
1447  i->m_weight = -1;
1448  else if((alc.isUnknown() || alc.isPlus()) && ((right_plus[b-m_left_end] > a && !alc.isCap()) || (left_plus[a-m_left_end] < b && !alc.isPolyA())))
1449  i->m_weight = -1;
1450  else if((alc.isUnknown() || alc.isMinus()) && ((right_minus[b-m_left_end] > a && !alc.isPolyA()) || (left_minus[a-m_left_end] < b && !alc.isCap())))
1451  i->m_weight = -1;
1452  } else {
1453  i->m_weight = -1;
1454  }
1455  }
1456  } else {
1457  const deque<char>& id_pool = m_target_id_pool[alc];
1458  NON_CONST_ITERATE(deque<SAlignIndividual>, i, aligns) {
1459  CAlignModel align(alc.GetAlignment(*i, id_pool));
1460  TSignedSeqRange new_lim = align.Limits();
1461  if(align.Exons().front().Limits().GetLength() > trim) {
1462  int a = align.Exons().front().Limits().GetFrom()+trim;
1463  int b = align.Exons().front().Limits().GetTo();
1464  if((alc.isUnknown() || alc.isPlus()) && (right_plus[b-m_left_end] > a && !alc.isCap())) // crosses right plus splice
1465  new_lim.SetFrom(right_plus[b-m_left_end]);
1466  if((alc.isUnknown() || alc.isMinus()) && (right_minus[b-m_left_end] > a && !alc.isPolyA())) // crosses right minus splice
1467  new_lim.SetFrom(right_minus[b-m_left_end]);
1468  _ASSERT(new_lim.GetFrom() <= align.Exons().front().GetTo());
1469  }
1470  if(align.Exons().back().Limits().GetLength() > trim) {
1471  int a = align.Exons().back().Limits().GetFrom();
1472  int b = align.Exons().back().Limits().GetTo()-trim;
1473  if((alc.isUnknown() || alc.isPlus()) && (left_plus[a-m_left_end] < b && !alc.isPolyA())) // crosses left plus splice
1474  new_lim.SetTo(left_plus[a-m_left_end]);
1475  if((alc.isUnknown() || alc.isMinus()) && (left_minus[a-m_left_end] < b && !alc.isCap())) // crosses left minus splice
1476  new_lim.SetTo(left_minus[a-m_left_end]);
1477  _ASSERT(new_lim.GetTo() >= align.Exons().back().GetFrom());
1478  }
1479  i->m_range = new_lim;
1480 
1481  //delete if retained intron in short reads internal exon
1482  for(int n = 1; n < (int)align.Exons().size()-1 && i->m_weight > 0; ++n) {
1483  int a = align.Exons()[n].Limits().GetFrom();
1484  int b = align.Exons()[n].Limits().GetTo();
1485 
1486  pair<TIntronsBySplice::iterator,TIntronsBySplice::iterator> eqr(introns_by_right_splice.end(),introns_by_right_splice.end());
1487  if((alc.isUnknown() || alc.isPlus()) && right_plus[b-m_left_end] > a) // crosses right plus splice
1488  eqr = introns_by_right_splice.equal_range(right_plus[b-m_left_end]);
1489  else if((alc.isUnknown() || alc.isMinus()) && right_minus[b-m_left_end] > a) // crosses right minus splice
1490  eqr = introns_by_right_splice.equal_range(right_minus[b-m_left_end]);
1491  for(TIntronsBySplice::iterator ip = eqr.first; ip != eqr.second; ++ip) {
1492  if(ip->second->first.m_range.GetFrom() > a)
1493  i->m_weight = -1;
1494  }
1495  }
1496  }
1497  }
1498  aligns.erase(remove_if(aligns.begin(),aligns.end(),AlignmentMarkedForDeletion),aligns.end());
1499  if(aligns.empty()) {
1500  m_target_id_pool.erase(data->first);
1501  m_aligns.erase(data);
1502  }
1503  }
1504  }
1505 
1506  // remove retained introns in est (keep spliced parts)
1507  if(m_filterest) {
1508  for(Tdata::iterator it = m_aligns.begin(); it != m_aligns.end(); ) {
1509  Tdata::iterator data = it++;
1510  const CAlignCommon& alc = data->first;
1511  const CAlignCommon::Tintrons& introns = alc.GetIntrons();
1512  deque<SAlignIndividual>& aligns_ind = data->second;
1513 
1514  if(!alc.isEST())
1515  continue;
1516 
1517  if(introns.empty()) { // one-exon
1518  for(SAlignIndividual& ali : aligns_ind) {
1519  int a = ali.m_range.GetFrom();
1520  int b = ali.m_range.GetTo();
1521  if(ali.m_range.GetLength() > 2*trim) {
1522  if((m_coverage[a+trim-m_left_end] < minsingleexpression || m_coverage[b-trim-m_left_end] < minsingleexpression)) // low coverage
1523  ali.m_weight = -1;
1524  else if((alc.isUnknown() || alc.isPlus()) && (right_plus[b-m_left_end] > a || left_plus[a-m_left_end] < b)) // crossing splice
1525  ali.m_weight = -1;
1526  else if((alc.isUnknown() || alc.isMinus()) && (right_minus[b-m_left_end] > a || left_minus[a-m_left_end] < b)) // crossing splice
1527  ali.m_weight = -1;
1528  } else {
1529  ali.m_weight = -1;
1530  }
1531  }
1532  } else {
1533  list<int> strands;
1534  if(alc.isUnknown()) {
1535  strands.push_back(ePlus);
1536  strands.push_back(eMinus);
1537  } else {
1538  strands.push_back(alc.isPlus() ? ePlus : eMinus);
1539  }
1540 
1541  set<int> bad_exons;
1542  for(int strand : strands) {
1543  for(int i = 0; i < (int)introns.size()-1; ++i) {
1544  TSignedSeqRange exon(introns[i].m_range.GetTo(), introns[i+1].m_range.GetFrom());
1545  SIntron se(exon.GetFrom(), exon.GetTo(), strand, true, "");
1546  for(auto ip = m_align_introns.upper_bound(se); ip != m_align_introns.end() && ip->first.m_strand == strand && ip->first.m_range.GetFrom() < exon.GetTo(); ++ip) {
1547  if(Include(exon, ip->first.m_range) && ip->second.m_not_cross)
1548  bad_exons.insert(i+1);
1549  }
1550  }
1551  }
1552 
1553  const deque<char>& id_pool = m_target_id_pool[alc];
1554  for(SAlignIndividual& ali : aligns_ind) {
1555  bool save_and_delete = false;
1556  TSignedSeqRange new_lim = ali.m_range;
1557  // first exon
1558  int a = ali.m_range.GetFrom();
1559  int b = introns.front().m_range.GetFrom();
1560  if((alc.isUnknown() || alc.isPlus()) && right_plus[b-m_left_end] > a) { // crosses right plus splice
1561  new_lim.SetFrom(right_plus[b-m_left_end]);
1562  if(alc.isCap())
1563  save_and_delete = true;
1564  }
1565 
1566  if((alc.isUnknown() || alc.isMinus()) && right_minus[b-m_left_end] > a) { // crosses right minus splice
1567  new_lim.SetFrom(right_minus[b-m_left_end]);
1568  if(alc.isPolyA())
1569  save_and_delete = true;
1570  }
1571  _ASSERT(new_lim.GetFrom() <= ali.m_range.GetTo());
1572  // last exon
1573  a = introns.back().m_range.GetTo();
1574  b = ali.m_range.GetTo();
1575  if((alc.isUnknown() || alc.isPlus()) && left_plus[a-m_left_end] < b) { // crosses left plus splice
1576  new_lim.SetTo(left_plus[a-m_left_end]);
1577  if(alc.isPolyA())
1578  save_and_delete = true;
1579  }
1580  if((alc.isUnknown() || alc.isMinus()) && left_minus[a-m_left_end] < b) { // crosses left minus splice
1581  new_lim.SetTo(left_minus[a-m_left_end]);
1582  if(alc.isCap())
1583  save_and_delete = true;
1584  }
1585  _ASSERT(new_lim.GetTo() >= ali.m_range.GetFrom());
1586  if(!save_and_delete)
1587  ali.m_range = new_lim;
1588 
1589  // check if first exon still has notspliced intron
1590  for(int strand : strands) {
1591  TSignedSeqRange exon(new_lim.GetFrom(), introns.front().m_range.GetFrom());
1592  SIntron se(exon.GetFrom(), exon.GetTo(), strand, true, "");
1593  for(auto ip = m_align_introns.upper_bound(se); ip != m_align_introns.end() && ip->first.m_strand == strand && ip->first.m_range.GetFrom() < exon.GetTo(); ++ip) {
1594  if(Include(exon, ip->first.m_range) && ip->second.m_not_cross)
1595  bad_exons.insert(0);
1596  }
1597  }
1598  // check if last exon still has notspliced intron
1599  for(int strand : strands) {
1600  TSignedSeqRange exon(introns.back().m_range.GetTo(), new_lim.GetTo());
1601  SIntron se(exon.GetFrom(), exon.GetTo(), strand, true, "");
1602  for(auto ip = m_align_introns.upper_bound(se); ip != m_align_introns.end() && ip->first.m_strand == strand && ip->first.m_range.GetFrom() < exon.GetTo(); ++ip) {
1603  if(Include(exon, ip->first.m_range) && ip->second.m_not_cross)
1604  bad_exons.insert(introns.size());
1605  }
1606  }
1607 
1608  if(save_and_delete || !bad_exons.empty()) {
1609  CAlignModel align(alc.GetAlignment(ali, id_pool));
1610  if(new_lim != align.Limits())
1611  align.Clip(new_lim, CGeneModel::eRemoveExons);
1612 
1613  if(!bad_exons.empty()) {
1614  CGeneModel::TExons& e = align.Exons();
1615  for(unsigned i : bad_exons) {
1616  if(i != 0)
1617  e[i-1].m_ssplice = false;
1618  e[i].m_fsplice = false;
1619  e[i].m_ssplice = false;
1620  if(i != introns.size())
1621  e[i+1].m_fsplice = false;
1622  }
1623  TAlignModelList aligns = GetAlignParts(align, false);
1624  for(auto& a : aligns) {
1625  if(a.Exons().size() > 1) {
1626  a.Status() |= CGeneModel::eChangedByFilter;
1627  CAlignCommon c(a);
1628  m_aligns[c].push_back(SAlignIndividual(a, m_target_id_pool[c]));
1629  }
1630  }
1631  } else {
1633  CAlignCommon c(align);
1634  m_aligns[c].push_back(SAlignIndividual(align, m_target_id_pool[c]));
1635  }
1636 
1637  ali.m_weight = -1;
1638  }
1639  }
1640  }
1641 
1642  aligns_ind.erase(remove_if(aligns_ind.begin(),aligns_ind.end(),AlignmentMarkedForDeletion), aligns_ind.end());
1643  if(aligns_ind.empty()) {
1644  m_target_id_pool.erase(data->first);
1645  m_aligns.erase(data);
1646  }
1647  }
1648  }
1649 
1650  for(auto& a : m_aligns)
1651  total += a.second.size();
1652 
1653  //filter other alignments
1654 
1655  //filter introns
1656  for(TAlignModelList::iterator it = m_aligns_for_filtering_only.begin(); it != m_aligns_for_filtering_only.end(); ) {
1657  TAlignModelList::iterator i = it++;
1658  CAlignModel& align = *i;
1659 
1660  if(align.Type()&CAlignModel::eProt) {
1661  CAlignModel a = align;
1662  a.Status() |= CGeneModel::eUnmodifiedAlign;
1663  m_aligns_for_filtering_only.push_front(a);
1664  }
1665 
1666  int intronnum = 0;
1667  ITERATE(CGeneModel::TExons, e, align.Exons()) {
1668  if(e->m_fsplice)
1669  ++intronnum;
1670  }
1671 
1672  if((align.Type()&CGeneModel::eEST) && !m_filterest)
1673  continue;
1674  if((align.Type()&CGeneModel::emRNA) && !m_filtermrna)
1675  continue;
1676  if((align.Type()&CGeneModel::eProt) && !m_filterprots)
1677  continue;
1678 
1679  if(!AlignmentIsSupportedBySR(align, m_coverage, minsingleexpression, m_left_end)) {
1680  if(align.Type()&(CGeneModel::emRNA|CGeneModel::eProt)) {
1681  continue;
1682  } else if(align.Type()&CGeneModel::eNotForChaining) {
1684  continue;
1685  }
1686  }
1687 
1688  bool good_alignment = true;
1689 
1690  //clip alignments with bad introns
1691  if(align.Type()&CAlignModel::eProt) {
1692  good_alignment = RemoveNotSupportedIntronsFromProt(align);
1693  } else if(align.Type()&CGeneModel::eNotForChaining) {
1694  good_alignment = RemoveNotSupportedIntronsFromTranscript(align, true);
1695  } else {
1696  CAlignModel reversed = align;
1697  good_alignment = RemoveNotSupportedIntronsFromTranscript(align, false);
1699  reversed.ReverseComplementModel();
1700  bool good_reversed_alignment = RemoveNotSupportedIntronsFromTranscript(reversed, false);
1701  if(reversed.Exons().size() > align.Exons().size()) {
1702  align = reversed;
1703  good_alignment = good_reversed_alignment;
1704  }
1705  }
1706  }
1707 
1708  if(!align.Exons().empty())
1709  ClipNotSupportedFlanks(align, clip_threshold);
1710 
1711  if(align.Exons().empty() || (!good_alignment && !(align.Type()&CGeneModel::eNotForChaining)) || !AlignmentIsSupportedBySR(align, m_coverage, minsingleexpression, m_left_end)) {
1713  continue;
1714  }
1715 
1716  ITERATE(CGeneModel::TExons, e, align.Exons()) {
1717  if(e->m_fsplice)
1718  --intronnum;
1719  }
1720 
1721  if(intronnum > 0 && !(align.Type()&CGeneModel::eNotForChaining))
1723  }
1724 
1725  TIVec self_coverage(len,0);
1726 
1727  //modify contig near correction indels which will ensure their clipping near self species cDNA edges (as mismatches)
1729  if(indl->GetStatus() != CInDelInfo::eGenomeNotCorrect)
1730  continue;
1731  if(indl->IsDeletion()) {
1732  m_contig[indl->Loc()] = tolower(m_contig[indl->Loc()]);
1733  m_contig[indl->Loc()-1] = tolower(m_contig[indl->Loc()]-1);
1734  } else {
1735  for(int p = indl->Loc(); p < indl->Loc()+indl->Len(); ++p)
1736  m_contig[p] = tolower(m_contig[p]);
1737  }
1738  }
1739 
1740 
1741  //clean self species cDNA edges and calculate self coverage
1742  for(TAlignModelList::iterator it = m_aligns_for_filtering_only.begin(); it != m_aligns_for_filtering_only.end(); ) {
1743  TAlignModelList::iterator i = it++;
1744  CAlignModel& align = *i;
1745 
1746  if(align.Status()&CGeneModel::eGapFiller) {
1747  string transcript = GetDNASequence(align.GetTargetId(),*m_scope);
1748 
1749  CleanSelfTranscript(align, transcript);
1750 
1751  ITERATE(CGeneModel::TExons, ie, align.Exons()) {
1752  int a = max(m_left_end, ie->GetFrom()); // TSA could be slightly extended in the area without alignments
1753  int b = min(right_end, ie->GetTo());
1754  for(int p = a; p <= b; ++p) {
1755  ++self_coverage[p-m_left_end];
1756  }
1757  }
1758  }
1759  }
1760 
1761  //restore contig
1762  m_contig.ToUpper();
1763 
1764  typedef pair<TSignedSeqRange,TInDels> TGapEnd;
1765  set<TGapEnd> right_gends; //rightmost exon befor gap
1766  set<TGapEnd> left_gends; // leftmost exon before gap
1767 
1768 #define MIN_EXON 10
1769 #define DESIRED_CHUNK 100
1770  //cut NotForChaining and fill gaps
1771  for(TAlignModelList::iterator it = m_aligns_for_filtering_only.begin(); it != m_aligns_for_filtering_only.end(); ) {
1772  TAlignModelList::iterator i = it++;
1773  CAlignModel& align = *i;
1774 
1775  if(!(align.Status()&CGeneModel::eGapFiller)) {
1776  if(align.Type()&CGeneModel::eNotForChaining)
1778  continue;
1779  }
1780 
1781  //collect fshifts
1782  ITERATE(CGeneModel::TExons, ie, align.Exons()) {
1783  TInDels fs = align.GetInDels(ie->GetFrom(), ie->GetTo(), true);
1784  left_gends.insert(TGapEnd(ie->Limits(),fs));
1785  right_gends.insert(TGapEnd(ie->Limits(),fs));
1786  }
1787 
1788  if(!(align.Type()&CGeneModel::eNotForChaining)) {
1790  if(editedalign.Exons().size() > align.Exons().size()) {
1791  m_aligns_for_filtering_only.push_front(editedalign);
1793  }
1794  } else {
1795  align.Status() &= ~CGeneModel::ePolyA;
1796  align.Status() &= ~CGeneModel::eCap;
1797  if(align.Exons().front().Limits().GetLength() > MIN_EXON) {
1798  CAlignModel a = align;
1799 
1800  TSignedSeqRange l = a.Exons().front().Limits();
1801  int len = l.GetLength();
1802  if(!align.Exons().front().m_ssplice && len > DESIRED_CHUNK) {
1803  l.SetTo(l.GetFrom()+DESIRED_CHUNK-1);
1804  len = DESIRED_CHUNK;
1805  }
1806  for(int ie = 0; len < DESIRED_CHUNK-2*MIN_EXON && a.Exons()[ie].m_ssplice; ++ie) {
1807  if(a.Exons()[ie+1].m_ssplice) {
1808  l.SetTo(a.Exons()[ie+1].GetTo());
1809  len += a.Exons()[ie+1].Limits().GetLength();
1810  } else {
1811  l.SetTo(min(a.Exons()[ie+1].GetTo(),a.Exons()[ie+1].GetFrom()+DESIRED_CHUNK-len-1));
1812  }
1813  }
1814  if(l.NotEmpty())
1815  l = a.GetAlignMap().ShrinkToRealPoints(l, false);
1816  if(l.NotEmpty()) {
1817  a.Clip(l, CGeneModel::eRemoveExons);
1819  if(editedalign.Exons().size() > a.Exons().size()) {
1820  m_aligns_for_filtering_only.push_front(editedalign);
1821  }
1822  }
1823  }
1824 
1825  for(int ie = 0; ie < (int)align.Exons().size()-1; ++ie) {
1826  if((!align.Exons()[ie].m_ssplice || !align.Exons()[ie+1].m_fsplice) &&
1827  align.Exons()[ie].Limits().GetLength() > MIN_EXON && align.Exons()[ie+1].Limits().GetLength() > MIN_EXON) {
1828  CAlignModel a = align;
1829 
1830  int left = a.Exons()[ie].GetFrom();
1831  int len = a.Exons()[ie].Limits().GetLength();
1832  if(!a.Exons()[ie].m_fsplice && len > DESIRED_CHUNK) {
1833  left = a.Exons()[ie].GetTo()-DESIRED_CHUNK+1;
1834  len = DESIRED_CHUNK;
1835  }
1836  for(int iie = ie; len < DESIRED_CHUNK-2*MIN_EXON && a.Exons()[iie].m_fsplice; --iie) {
1837  if(a.Exons()[iie-1].m_fsplice) {
1838  left = a.Exons()[iie-1].GetFrom();
1839  len += a.Exons()[iie-1].Limits().GetLength();
1840  } else {
1841  left = max(a.Exons()[iie-1].GetFrom(),a.Exons()[iie-1].GetTo()-DESIRED_CHUNK+len+1);
1842  }
1843  }
1844  int right = a.Exons()[ie+1].GetTo();
1845  len = a.Exons()[ie+1].Limits().GetLength();
1846  if(!a.Exons()[ie+1].m_ssplice && len > DESIRED_CHUNK) {
1847  right = a.Exons()[ie+1].GetFrom()+DESIRED_CHUNK-1;
1848  len = DESIRED_CHUNK;
1849  }
1850  for(int iie = ie+1; len < DESIRED_CHUNK-2*MIN_EXON && a.Exons()[iie].m_ssplice; ++iie) {
1851  if(a.Exons()[iie+1].m_ssplice) {
1852  right = a.Exons()[iie+1].GetTo();
1853  len += a.Exons()[iie+1].Limits().GetLength();
1854  } else {
1855  right = min(a.Exons()[iie+1].GetTo(),a.Exons()[iie+1].GetFrom()+DESIRED_CHUNK-len-1);
1856  }
1857  }
1858  if(left >= 0 && right >= 0) {
1859  TSignedSeqRange l(left, right);
1860  l = a.GetAlignMap().ShrinkToRealPoints(l, false);
1861  if(l.NotEmpty()) {
1862  a.Clip(l, CGeneModel::eRemoveExons);
1864  if(editedalign.Exons().size() > a.Exons().size()) {
1865  m_aligns_for_filtering_only.push_front(editedalign);
1866  }
1867  }
1868  }
1869  }
1870  }
1871 
1872  if(align.Exons().back().Limits().GetLength() > MIN_EXON) {
1873  CAlignModel a = align;
1874 
1875  TSignedSeqRange l = a.Exons().back().Limits();
1876  int len = l.GetLength();
1877  if(!align.Exons().back().m_fsplice && len > DESIRED_CHUNK) {
1878  l.SetFrom(a.Exons().back().GetTo()-DESIRED_CHUNK+1);
1879  len = DESIRED_CHUNK;
1880  }
1881  for(int ie = (int)a.Exons().size()-1; len < DESIRED_CHUNK-2*MIN_EXON && a.Exons()[ie].m_fsplice; --ie) {
1882  if(a.Exons()[ie-1].m_fsplice) {
1883  l.SetFrom(a.Exons()[ie-1].GetFrom());
1884  len += a.Exons()[ie-1].Limits().GetLength();
1885  } else {
1886  l.SetFrom(max(a.Exons()[ie-1].GetFrom(),a.Exons()[ie-1].GetTo()-DESIRED_CHUNK+len+1));
1887  }
1888  }
1889  if(l.NotEmpty())
1890  l = a.GetAlignMap().ShrinkToRealPoints(l, false);
1891  if(l.NotEmpty()) {
1892  a.Clip(l, CGeneModel::eRemoveExons);
1894  if(editedalign.Exons().size() > a.Exons().size()) {
1895  m_aligns_for_filtering_only.push_front(editedalign);
1896  }
1897  }
1898  }
1899 
1901  }
1902  }
1903 
1904  enum EnpPoint { eUnknown = 0, eRightPlus = 1, eRightMinus = 2, eLeftPlus = 4, eLeftMinus = 8};
1905  vector<unsigned char> end_status(len, 0);
1906 
1907  //include gap's boundaries in no cross splices
1909  const CAlignModel& align = *i;
1910  for(int ie = 0; ie < (int)align.Exons().size(); ++ie) {
1911  if(align.Exons()[ie].Limits().Empty()) {
1912  if(ie > 0) {
1913  int a = align.Exons()[ie-1].GetTo();
1914  int al = a-m_left_end;
1915  // if(a < right_end && self_coverage[al+1] == 0) { // TSA could be slightly extended in the area without alignments; include only at drop
1916  if(a < right_end) {
1917  if((align.Status()&CGeneModel::eUnknownOrientation) || align.Strand() == ePlus) {
1918  left_plus[al] = a;
1919  end_status[al] |= eLeftPlus;
1920  }
1921  if((align.Status()&CGeneModel::eUnknownOrientation) || align.Strand()== eMinus) {
1922  left_minus[al] = a;
1923  end_status[al] |= eLeftMinus;
1924  }
1925  }
1926  }
1927  if(ie < (int)align.Exons().size()-1) {
1928  int b = align.Exons()[ie+1].GetFrom();
1929  int bl = b-m_left_end;
1930  // if(b > m_left_end && self_coverage[bl-1] == 0) { // TSA could be slightly extended in the area without alignments; include only at drop
1931  if(b > m_left_end) {
1932  if((align.Status()&CGeneModel::eUnknownOrientation) || align.Strand() == ePlus) {
1933  right_plus[bl] = b;
1934  end_status[bl] |= eRightPlus;
1935  }
1936  if((align.Status()&CGeneModel::eUnknownOrientation) || align.Strand()== eMinus) {
1937  right_minus[bl] = b;
1938  end_status[bl] |= eRightMinus;
1939  }
1940  }
1941  }
1942  }
1943  }
1944  }
1945 
1946  for(int i = 1; i < len; ++i) {
1947  right_plus[i] = max(right_plus[i],right_plus[i-1]);
1948  right_minus[i] = max(right_minus[i],right_minus[i-1]);
1949  }
1950  for(int i = len-2; i >= 0; --i) {
1951  left_plus[i] = min(left_plus[i],left_plus[i+1]);
1952  left_minus[i] = min(left_minus[i],left_minus[i+1]);
1953  }
1954 
1955 
1956 #define FS_FUZZ 10
1957 #define MAX_CLIP 200
1958 #define SMALL_CLIP 30
1959 
1960  //trim 3'/5' exons crossing splices (including hole boundaries)
1961  for(TAlignModelList::iterator it = m_aligns_for_filtering_only.begin(); it != m_aligns_for_filtering_only.end(); ) {
1962  TAlignModelList::iterator i = it++;
1963  CAlignModel& align = *i;
1964 
1966  continue;
1967 
1968  CAlignMap amap = align.GetAlignMap();
1969 
1970  if((align.Type()&CGeneModel::eEST) && !m_filterest)
1971  continue;
1972  if((align.Type()&CGeneModel::emRNA) && !m_filtermrna)
1973  continue;
1974  if((align.Type()&CGeneModel::eProt) && !m_filterprots)
1975  continue;
1976 
1977  bool snap_to_codons = align.Type()&CAlignModel::eProt;
1978  bool good_alignment = true;
1979 
1980  bool keepdoing = true;
1981  while(keepdoing && good_alignment) {
1982  keepdoing = false;
1983  for(int ie = 0; ie < (int)align.Exons().size(); ++ie) {
1984  const CModelExon& e = align.Exons()[ie];
1985 
1986  if(!e.m_fsplice && e.Limits().GetLength() > trim && e.GetTo() <= right_end &&
1987  (ie != 0 || (align.Strand() == ePlus && !(align.Status()&CGeneModel::eCap) && !align.HasStart()) || (align.Strand() == eMinus && !(align.Status()&CGeneModel::ePolyA) && !align.HasStop()))) {
1988  int l = e.GetFrom();
1989  int r = e.GetTo();
1990  int new_l = l;
1991 
1992  TIVec* rights = 0;
1993  EnpPoint endp = eUnknown;
1994  if(((align.Status()&CGeneModel::eUnknownOrientation) || align.Strand() == ePlus) && right_plus[r-m_left_end] > l+trim) { // crosses right plus splice
1995  new_l = right_plus[r-m_left_end];
1996  rights = &right_plus;
1997  endp = eRightPlus;
1998  }
1999  if(((align.Status()&CGeneModel::eUnknownOrientation) || align.Strand() == eMinus) && right_minus[r-m_left_end] > l+trim) { // crosses right minus splice
2000  new_l = right_minus[r-m_left_end];
2001  rights = &right_minus;
2002  endp = eRightMinus;
2003  }
2004 
2005  if(new_l != l && (end_status[new_l-m_left_end]&endp) && (align.Type()&CAlignModel::eProt)) {
2006  // try to extend
2007  while(new_l-l > MAX_CLIP && (end_status[new_l-m_left_end]&endp))
2008  new_l = max(l,(*rights)[new_l-1-m_left_end]);
2009  TInDels pindels = align.GetInDels(true);
2010  int firstclip = new_l;
2011  for(int putativel = new_l; (new_l > l+SMALL_CLIP || TotalFrameShift(pindels, l, new_l)) && (end_status[new_l-m_left_end]&endp) && new_l == putativel; ) {
2012  putativel = max(l,(*rights)[new_l-1-m_left_end]);
2013  for(set<TGapEnd>::iterator ig = left_gends.begin(); ig != left_gends.end(); ++ig) {
2014  if(ig->first.GetFrom() <= putativel && ig->first.GetTo() >= firstclip) {
2015  int prot_fs = TotalFrameShift(pindels, putativel, firstclip+FS_FUZZ);
2016  int tsa_fs = TotalFrameShift(ig->second, putativel-FS_FUZZ, firstclip);
2017  if(prot_fs == tsa_fs)
2018  new_l = putativel;
2019  }
2020  }
2021  }
2022  //check if undertrimmed
2023  if(end_status[new_l-m_left_end]&endp) {
2024  for(int i = 0; i < (int)pindels.size() && pindels[i].Loc() <= new_l+FS_FUZZ; ++i)
2025  new_l = max(new_l,pindels[i].Loc());
2026  }
2027  }
2028 
2029  if(new_l != l) {
2030  _ASSERT(new_l <= r);
2031  if((align.Type()&CGeneModel::eEST) && (int)align.Exons().size() == 1) {
2032  good_alignment = false;
2033  break;
2034  }
2035 
2036  TSignedSeqRange seg = amap.ShrinkToRealPoints(TSignedSeqRange(new_l,align.Limits().GetTo()),snap_to_codons);
2037  if(seg.Empty() || amap.FShiftedLen(seg,false) < END_PART_LENGTH) { // nothing left on right
2038  if(ie == 0 || amap.FShiftedLen(TSignedSeqRange(align.Limits().GetFrom(),align.Exons()[ie-1].GetTo())) < END_PART_LENGTH) { // no alignment left
2039  good_alignment = false;
2040  } else { // left side is kept
2041  align.Clip(TSignedSeqRange(align.Limits().GetFrom(),align.Exons()[ie-1].GetTo()),CGeneModel::eRemoveExons);
2042  }
2043  } else { // trim
2044  if(ie == 0) { // first exon
2045  if(align.Type()&CGeneModel::eProt)
2046  align.Clip(seg,CGeneModel::eRemoveExons);
2047  else
2048  align.CutExons(TSignedSeqRange(align.Limits().GetFrom(),seg.GetFrom()-1)); // Clip() is not friendly to gapfillers
2049  } else {
2050  align.CutExons(TSignedSeqRange(align.Exons()[ie-1].GetTo()+1,seg.GetFrom()-1));
2051  }
2052  }
2053  keepdoing = true;
2054  break;
2055  }
2056  }
2057 
2058  if(!e.m_ssplice && e.Limits().GetLength() > trim && e.GetFrom() >= m_left_end &&
2059  (ie != (int)align.Exons().size()-1 || (align.Strand() == ePlus && !(align.Status()&CGeneModel::ePolyA) && !align.HasStop()) || (align.Strand() == eMinus && !(align.Status()&CGeneModel::eCap) && !align.HasStart()))) {
2060  int l = e.GetFrom();
2061  int r = e.GetTo();
2062  int new_r = r;
2063 
2064  TIVec* lefts = 0;
2065  EnpPoint endp = eUnknown;
2066  if(((align.Status()&CGeneModel::eUnknownOrientation) || align.Strand() == ePlus) && left_plus[l-m_left_end] < r-trim) { // crosses left plus splice
2067  new_r = left_plus[l-m_left_end];
2068  lefts = &left_plus;
2069  endp = eLeftPlus;
2070  }
2071  if(((align.Status()&CGeneModel::eUnknownOrientation) || align.Strand() == eMinus) && left_minus[l-m_left_end] < r-trim) { // crosses left minus splice
2072  new_r = left_minus[l-m_left_end];
2073  lefts = &left_minus;
2074  endp = eLeftMinus;
2075  }
2076 
2077  if(new_r != r && (end_status[new_r-m_left_end]&endp) && (align.Type()&CAlignModel::eProt)) {
2078  // try to extend
2079  while(r-new_r > MAX_CLIP && (end_status[new_r-m_left_end]&endp))
2080  new_r = min(r,(*lefts)[new_r+1-m_left_end]);
2081  TInDels pindels = align.GetInDels(true);
2082  int firstclip = new_r;
2083  for(int putativer = new_r; (new_r < r-SMALL_CLIP || TotalFrameShift(pindels, new_r, r)) && (end_status[new_r-m_left_end]&endp) && new_r == putativer; ) {
2084  putativer = min(r,(*lefts)[new_r+1-m_left_end]);
2085  for(set<TGapEnd>::iterator ig = right_gends.begin(); ig != right_gends.end(); ++ig) {
2086  if(ig->first.GetFrom() <= firstclip && ig->first.GetTo() >= putativer) {
2087  int prot_fs = TotalFrameShift(pindels, firstclip-FS_FUZZ, putativer);
2088  int tsa_fs = TotalFrameShift(ig->second, firstclip, putativer+FS_FUZZ);
2089  if(prot_fs == tsa_fs)
2090  new_r = putativer;
2091  }
2092  }
2093  }
2094  //check if undertrimmed
2095  if(end_status[new_r-m_left_end]&endp) {
2096  for(int i = (int)pindels.size()-1; i >= 0 && pindels[i].Loc() >= new_r-FS_FUZZ; --i)
2097  new_r = min(new_r,pindels[i].Loc()-1);
2098  }
2099  }
2100 
2101  if(new_r != r) {
2102  _ASSERT(new_r >= l);
2103  if((align.Type()&CGeneModel::eEST) && (int)align.Exons().size() == 1) {
2104  good_alignment = false;
2105  break;
2106  }
2107 
2108  TSignedSeqRange seg = amap.ShrinkToRealPoints(TSignedSeqRange(align.Limits().GetFrom(),new_r),snap_to_codons);
2109  if(seg.Empty() || amap.FShiftedLen(seg,false) < END_PART_LENGTH) { // nothing left on left
2110  if(ie == (int)align.Exons().size()-1 || amap.FShiftedLen(TSignedSeqRange(align.Exons()[ie+1].GetFrom(),align.Limits().GetTo())) < END_PART_LENGTH) { // no alignment left
2111  good_alignment = false;
2112  } else { // right side is kept
2113  align.Clip(TSignedSeqRange(align.Exons()[ie+1].GetFrom(),align.Limits().GetTo()),CGeneModel::eRemoveExons);
2114  }
2115  } else { // trim
2116  if(ie == (int)align.Exons().size()-1) { // last exon
2117  if(align.Type()&CGeneModel::eProt)
2118  align.Clip(seg,CGeneModel::eRemoveExons);
2119  else
2120  align.CutExons(TSignedSeqRange(seg.GetTo()+1, align.Limits().GetTo())); // Clip() is not friendly to gapfillers
2121  } else {
2122  align.CutExons(TSignedSeqRange(seg.GetTo()+1,align.Exons()[ie+1].GetFrom()-1));
2123  }
2124  }
2125  keepdoing = true;
2126  break;
2127  }
2128  }
2129  }
2130  }
2131 
2132  if(!good_alignment)
2134  }
2135 
2136  //clean genomic gaps
2137  sort(m_correction_data.m_correction_indels.begin(),m_correction_data.m_correction_indels.end(),GenomicGapsOrder()); // accsession is used if the sequence is same
2138  m_correction_data.m_correction_indels.erase( unique(m_correction_data.m_correction_indels.begin(),m_correction_data.m_correction_indels.end()), m_correction_data.m_correction_indels.end() ); // uses == for CInDelInfo which ignores accession
2139 
2140  total += m_aligns_for_filtering_only.size();
2141  cerr << "After filtering: " << m_align_introns.size() << " introns, " << total << " alignments" << endl;
2142 }
2143 
2145 
2146  ITERATE(CGeneModel::TExons, i, align.Exons()) {
2147  if(i->Limits().NotEmpty()) {
2148  CInDelInfo p(i->GetFrom(), 1, CInDelInfo::eDel);
2149  TInDels::const_iterator ig = lower_bound(m_correction_data.m_correction_indels.begin(), m_correction_data.m_correction_indels.end(), p); // first equal or greater
2150  for( ; ig != m_correction_data.m_correction_indels.end() && ig->Loc() <= i->GetTo(); ++ig) {
2151  if(ig->GetSource().m_range.NotEmpty()) // exon overlaps with inserted gap
2152  return false;
2153  }
2154  }
2155  }
2156 
2157  clsset.Insert(align);
2158  return true;
2159 }
2160 
2162 {
2163  bool operator() (TAlignModelList::const_iterator a, TAlignModelList::const_iterator b) { // left and long first
2164  if(a->Limits() == b->Limits()) {
2165  if(a->Ident() != b->Ident())
2166  return a->Ident() > b->Ident();
2167  else
2168  return a->TargetAccession() < b->TargetAccession();
2169  } else if(a->Limits().GetFrom() != b->Limits().GetFrom()) {
2170  return a->Limits().GetFrom() < b->Limits().GetFrom();
2171  } else {
2172  return a->Limits().GetTo() > b->Limits().GetTo();
2173  }
2174  }
2175 };
2176 
2177 // one-exon alignments are equal
2178 // gapfilled exons compared by seq; real exons compared by range and splices
2179 // real exon < gapfilled exon;
2180 bool OneExonCompare(const CModelExon& a, const CModelExon& b) {
2181  if(!a.m_seq.empty() || !b.m_seq.empty()) { // at least one is gapfilling
2182  return a.m_seq < b.m_seq;
2183  } else if(b.Limits().Empty()) { // b is from one-exon alignment
2184  return false;
2185  } else if(a.Limits().Empty()) { // a is from one-exon alignment and b is not
2186  return true;
2187  } else if(a.m_fsplice != b.m_fsplice) {
2188  return a.m_fsplice < b.m_fsplice;
2189  } else if(a.m_ssplice != b.m_ssplice) {
2190  return a.m_ssplice < b.m_ssplice;
2191  } else {
2192  return a.Limits() < b.Limits();
2193  }
2194 }
2195 
2197 {
2198  bool operator() (const CGeneModel::TExons& a, const CGeneModel::TExons& b) const {
2199  if(a.size() != b.size()) {
2200  return a.size() < b.size();
2201  } else {
2202  for(int i = 0; i < (int)a.size(); ++i) {
2203  if(OneExonCompare(a[i],b[i]))
2204  return true;
2205  if(OneExonCompare(b[i],a[i]))
2206  return false;
2207  }
2208  return false;
2209  }
2210  }
2211 };
2212 
2214  if(m_count == 0)
2215  return;
2216 
2217  FilterAlignments();
2218 
2220  if(i->Type() == CGeneModel::emRNA) {
2221  CBioseq_Handle bh (m_scope->GetBioseqHandle(*i->GetTargetId()));
2222  const CMolInfo* molinfo = GetMolInfo(bh);
2223  if(molinfo && molinfo->IsSetTech() && molinfo->GetTech() == CMolInfo::eTech_tsa)
2224  i->Status() |= CGeneModel::eTSA; // used to exclude from CDS projection
2225  }
2226  CheckAndInsert(*i, clsset);
2227  }
2228 }
2229 
2231 
2232  cerr << "Added " << m_count << " alignments to collapser for contig " << m_contig_name << endl;
2233 
2234  if(m_count == 0)
2235  return;
2236 
2237  FilterAlignments();
2238 
2239  const CArgs& args = CNcbiApplication::Instance()->GetArgs();
2240  int oep = args["oep"].AsInteger();
2241  int max_extend = args["max-extension"].AsInteger();
2242 
2243  set<int> left_exon_ends, right_exon_ends;
2245  const SIntron& intron = it->first;
2246  int a = intron.m_range.GetFrom();
2247  int b = intron.m_range.GetTo();
2248  left_exon_ends.insert(b);
2249  right_exon_ends.insert(a);
2250  }
2251 
2252  TAlignModelList rnaseq_or_est;
2254  const CAlignCommon& alc = i->first;
2255  const deque<char>& id_pool = m_target_id_pool[alc];
2256  deque<SAlignIndividual>& alideque = i->second;
2257  sort(alideque.begin(),alideque.end(),LeftAndLongFirstOrder(id_pool));
2258 
2259  if((alc.isSR()&&!m_collapssr) || (alc.isEST()&&!m_collapsest)) { // don't collaps
2260  ITERATE(deque<SAlignIndividual>, k, alideque) {
2261  CAlignModel align(alc.GetAlignment(*k, id_pool));
2262  rnaseq_or_est.push_back(align);
2263  }
2264  } else {
2265  bool leftisfixed = (alc.isCap() && alc.isPlus()) || (alc.isPolyA() && alc.isMinus());
2266  bool rightisfixed = (alc.isPolyA() && alc.isPlus()) || (alc.isCap() && alc.isMinus());
2267  bool notspliced = alc.GetIntrons().empty();
2268 
2269  typedef list<SAlignExtended> TEA_List;
2270  TEA_List extended_aligns;
2271 
2272  NON_CONST_ITERATE(deque<SAlignIndividual>, k, alideque) {
2273  SAlignIndividual& aj = *k;
2274  bool collapsed = false;
2275 
2276  for(TEA_List::iterator itloop = extended_aligns.begin(); itloop != extended_aligns.end(); ) {
2277  TEA_List::iterator ita = itloop++;
2278  SAlignIndividual& ai = *ita->m_ali;
2279 
2280  if(aj.m_range.GetFrom() >= min((leftisfixed ? ai.m_range.GetFrom():ai.m_range.GetTo())+1,ita->m_llimb)) { // extendent align is completed
2281  CAlignModel align(alc.GetAlignment(ai, id_pool));
2282  rnaseq_or_est.push_back(align);
2283  extended_aligns.erase(ita);
2284  } else if(!collapsed) { // even if collapsed must check extended_aligns to the end to purge finished
2285  if(rightisfixed && ai.m_range.GetTo() != aj.m_range.GetTo())
2286  continue;
2287  if(notspliced && aj.m_range.GetTo() > ai.m_range.GetTo()) {
2288  if(ai.m_range.GetTo()-aj.m_range.GetFrom()+1 < oep)
2289  continue;
2290  if(aj.m_range.GetTo()-ita->m_initial_right_end > max_extend)
2291  continue;
2292  if(aj.m_range.GetFrom()-ai.m_range.GetFrom() > max_extend)
2293  continue;
2294  }
2295  if(aj.m_range.GetTo() > (alc.isEST() ? ita->m_initial_right_end : ita->m_rlimb) || aj.m_range.GetTo() <= ita->m_rlima)
2296  continue;
2297 
2298  ai.m_weight += aj.m_weight;
2299  if(aj.m_range.GetTo() > ai.m_range.GetTo())
2300  ai.m_range.SetTo(aj.m_range.GetTo());
2301  collapsed = true;
2302  }
2303  }
2304 
2305  if(!collapsed)
2306  extended_aligns.push_back(SAlignExtended(aj,left_exon_ends,right_exon_ends));
2307  }
2308 
2309  ITERATE(TEA_List, ita, extended_aligns) {
2310  CAlignModel align(alc.GetAlignment(*ita->m_ali, id_pool));
2311  rnaseq_or_est.push_back(align);
2312  }
2313  }
2314  }
2315 
2316  //stranded intervals (start->len)
2317  TIntMap strandedplus_len;
2318  TIntMap strandedminus_len;
2319  ITERATE(TAlignModelList, ia, rnaseq_or_est) {
2320  const CAlignModel& align = *ia;
2321  if((align.Type()&CGeneModel::eSR) && !(align.Status()&CGeneModel::eUnknownOrientation) &&
2322  !(align.Status()&CGeneModel::ePolyA) && !(align.Status()&CGeneModel::eCap) && align.Exons().size() == 1) { // ORINTEED notspliced rnaseq
2323  TIntMap* mp = (align.Strand() == ePlus) ? &strandedplus_len : &strandedminus_len;
2324  if(mp->empty() || mp->rbegin()->first+mp->rbegin()->second < align.Limits().GetFrom()) { // abutting intervals are united
2325  (*mp)[align.Limits().GetFrom()] = align.Limits().GetLength();
2326  } else {
2327  mp->rbegin()->second = max(mp->rbegin()->second, align.Limits().GetTo()-mp->rbegin()->first+1);
2328  }
2329  }
2330  }
2331 
2332  int forced_orientation = 0;
2333  NON_CONST_ITERATE(TAlignModelList, ia, rnaseq_or_est) {
2334  CAlignModel& align = *ia;
2335  if((align.Type()&CGeneModel::eSR) && (align.Status()&CGeneModel::eUnknownOrientation) && align.Exons().size() == 1) { // NOTORINTEED notspliced rnaseq
2336  bool included_in_plus = false;
2337  TIntMap::iterator plus = strandedplus_len.lower_bound(align.Limits().GetTo());
2338  if(plus != strandedplus_len.begin() && (--plus)->first <= align.Limits().GetFrom() && plus->first+plus->second > align.Limits().GetTo())
2339  included_in_plus = true;
2340  bool included_in_minus = false;
2341  TIntMap::iterator minus = strandedminus_len.lower_bound(align.Limits().GetTo());
2342  if(minus != strandedminus_len.begin() && (--minus)->first <= align.Limits().GetFrom() && minus->first+minus->second > align.Limits().GetTo())
2343  included_in_minus = true;
2344 
2345  if(included_in_plus != included_in_minus) {
2347  align.SetStrand(included_in_plus ? ePlus : eMinus);
2348  ++forced_orientation;
2349  }
2350  }
2351  }
2352  cerr << "Forced orintation: " << forced_orientation << endl;
2353 
2354  int total = 0;
2355  ITERATE(TAlignModelList, ia, rnaseq_or_est) {
2356  if(CheckAndInsert(*ia, clsset))
2357  ++total;
2358  }
2359 
2360  if(m_collapsest && m_fillgenomicgaps) { // collaps ests used for gapfilling
2361  typedef map<CGeneModel::TExons, vector<TAlignModelList::iterator>, MultiExonsCompare> TEstHolder;
2362  TEstHolder est_for_collapsing;
2364  if(i->Type() == CGeneModel::eEST) {
2365  CGeneModel::TExons exons = i->Exons();
2366  if(exons.size() == 1) {
2367  exons.front().Limits() = TSignedSeqRange::GetEmpty();
2368  _ASSERT(exons.front().m_seq.empty());
2369  } else {
2370  if(exons.front().m_ssplice_sig != "XX")
2371  exons.front().Limits().SetFrom(exons.front().GetTo());
2372  if(exons.back().m_fsplice_sig != "XX")
2373  exons.back().Limits().SetTo(exons.back().GetFrom());
2374  }
2375  est_for_collapsing[exons].push_back(i);
2376  }
2377  }
2378 
2379  NON_CONST_ITERATE(TEstHolder, i, est_for_collapsing) {
2380  sort(i->second.begin(),i->second.end(),LeftAndLongFirstOrderForAligns());
2381  list<TAlignModelList::iterator> ests(i->second.begin(),i->second.end());
2382  for(list<TAlignModelList::iterator>::iterator ihost = ests.begin(); ihost != ests.end(); ++ihost) {
2383  CAlignModel& host = **ihost;
2384  set<int>::const_iterator ri = right_exon_ends.lower_bound(host.Limits().GetTo()); // leftmost compatible rexon
2385  int rlima = -1;
2386  if(ri != right_exon_ends.begin())
2387  rlima = *(--ri); // position of the rightmost incompatible rexon
2388  set<int>::const_iterator li = left_exon_ends.upper_bound(host.Limits().GetFrom()); // leftmost not compatible lexon
2389  int llimb = numeric_limits<int>::max() ;
2390  if(li != left_exon_ends.end())
2391  llimb = *li; // position of the leftmost not compatible lexon
2392 
2393  list<TAlignModelList::iterator>::iterator iloop = ihost;
2394  for(++iloop; iloop != ests.end(); ) {
2395  list<TAlignModelList::iterator>::iterator iguest = iloop++;
2396  CAlignModel& guest = **iguest;
2397 
2398  if(guest.Limits().GetFrom() >= min(host.Limits().GetTo()+1,llimb)) // host is completed
2399  break;
2400 
2401  if(guest.Limits().GetTo() > host.Limits().GetTo() || guest.Limits().GetTo() <= rlima)
2402  continue;
2403 
2404  if(host.Strand() != guest.Strand() || (host.Status()&CGeneModel::eUnknownOrientation) != (guest.Status()&CGeneModel::eUnknownOrientation))
2405  continue;
2406  if((guest.Status()&CGeneModel::ePolyA) || (host.Status()&CGeneModel::ePolyA)) {
2407  if((guest.Status()&CGeneModel::ePolyA) != (host.Status()&CGeneModel::ePolyA)
2408  || (guest.Strand() == ePlus && guest.Limits().GetTo() != host.Limits().GetTo())
2409  || (guest.Strand() == eMinus && guest.Limits().GetFrom() != host.Limits().GetFrom()))
2410  continue;
2411  }
2412  if((guest.Status()&CGeneModel::eCap) || (host.Status()&CGeneModel::eCap)) {
2413  if((guest.Status()&CGeneModel::eCap) != (host.Status()&CGeneModel::eCap)
2414  || (guest.Strand() == eMinus && guest.Limits().GetTo() != host.Limits().GetTo())
2415  || (guest.Strand() == ePlus && guest.Limits().GetFrom() != host.Limits().GetFrom()))
2416  continue;
2417  }
2418 
2419  host.SetWeight(host.Weight()+guest.Weight());
2420  m_aligns_for_filtering_only.erase(*iguest);
2421  ests.erase(iguest);
2422  }
2423  }
2424  }
2425  }
2426 
2428  if(i->Type() == CGeneModel::emRNA) {
2429  CBioseq_Handle bh (m_scope->GetBioseqHandle(*i->GetTargetId()));
2430  const CMolInfo* molinfo = GetMolInfo(bh);
2431  if(molinfo && molinfo->IsSetTech() && molinfo->GetTech() == CMolInfo::eTech_tsa)
2432  i->Status() |= CGeneModel::eTSA; // used to exclude from CDS projection
2433  }
2434  if(CheckAndInsert(*i, clsset))
2435  ++total;
2436  }
2437 
2438  size_t flex_cap = 0;
2439  size_t flex_polya = 0;
2440  for(auto& status_align : m_special_aligns) {
2441  if(status_align.second.Limits().GetFrom() >= 0 && status_align.second.Limits().GetTo() < m_contig.FullLength() && CheckAndInsert(status_align.second, clsset)) {
2442  if(status_align.second.Status()&CGeneModel::eCap)
2443  ++flex_cap;
2444  else
2445  ++flex_polya;
2446  }
2447  }
2448 
2449  cerr << "After collapsing: " << total << " alignments " << flex_cap << " Flexible caps " << flex_polya << " Flexible polyas" << endl;
2450 }
2451 
2452 
2454 
2455  CGeneModel editedmodel = align;
2456  editedmodel.ClearExons(); // empty alignment with all atributes
2457  vector<TSignedSeqRange> transcript_exons;
2458 
2459  string acc = align.TargetAccession();
2460  bool chainer_tsa = (acc.find("ChainerTSA") != string::npos);
2461 
2462  string left_seq, right_seq;
2463  CInDelInfo::SSource left_src;
2464  CInDelInfo::SSource right_src;
2465  TSignedSeqRange left_texon, right_texon;
2466  TSignedSeqRange tlim = align.TranscriptLimits();
2467  string transcript = GetDNASequence(align.GetTargetId(),*m_scope);
2468  if(tlim.GetFrom() > 30 && ((align.Status()&CGeneModel::ePolyA) == 0 || (align.Status()&CGeneModel::eReversed) == 0)) {
2469  left_seq = transcript.substr(0,tlim.GetFrom());
2470  left_texon = TSignedSeqRange(0,tlim.GetFrom()-1);
2471  left_src.m_acc = align.TargetAccession();
2472  left_src.m_strand = ePlus;
2473  left_src.m_range = left_texon;
2474  }
2475  if(tlim.GetTo() < align.TargetLen()-30 && ((align.Status()&CGeneModel::ePolyA) == 0 || (align.Status()&CGeneModel::eReversed) != 0)) {
2476  right_seq = transcript.substr(tlim.GetTo()+1);
2477  right_texon = TSignedSeqRange(tlim.GetTo()+1,align.TargetLen()-1);
2478  right_src.m_acc = align.TargetAccession();
2479  right_src.m_strand = ePlus;
2480  right_src.m_range = right_texon;
2481  }
2482  if(align.Orientation() == eMinus) {
2483  swap(left_seq, right_seq);
2484  swap(left_texon, right_texon);
2485  swap(left_src, right_src);
2486  }
2487 
2488  if(!left_seq.empty() && (fill&efill_left) != 0 && !chainer_tsa) {
2490  if(ig != m_genomic_gaps_len.begin() && (--ig)->first > align.Limits().GetFrom()-MAX_DIST_TO_FLANK_GAP) { // there is gap on left
2491  transcript_exons.push_back(left_texon);
2492  editedmodel.AddExon(TSignedSeqRange::GetEmpty(), "XX", "XX", 1, left_seq, left_src);
2493 
2494  if(align.Orientation() == eMinus) {
2495  ReverseComplement(left_seq.begin(),left_seq.end());
2496  left_src.m_strand = eMinus;
2497  }
2498  m_correction_data.m_correction_indels.push_back(CInDelInfo(max(0,ig->first+2*ig->second/3), (int)left_seq.length(), CInDelInfo::eDel, left_seq, left_src)); // 1/3 of gap length will separate genes abatting the same gap
2499  }
2500  }
2501 
2502  for(int i = 0; i < (int)align.Exons().size(); ++i) {
2503  transcript_exons.push_back(align.TranscriptExon(i));
2504  const CModelExon& e = align.Exons()[i];
2505  editedmodel.AddExon(e.Limits(),e.m_fsplice_sig, e.m_ssplice_sig, e.m_ident);
2506 
2507  if(i < (int)align.Exons().size()-1 && (!e.m_ssplice || !align.Exons()[i+1].m_fsplice)) {
2508  if((fill&efill_middle) != 0) {
2509  TSignedSeqRange texon = align.GetAlignMap().MapRangeOrigToEdited(TSignedSeqRange(e.GetTo(),align.Exons()[i+1].GetFrom()),false);
2510  TIntMap::iterator ig = m_genomic_gaps_len.lower_bound(e.GetTo()); // first gap on right
2511  if(ig != m_genomic_gaps_len.end() && ig->first < align.Exons()[i+1].GetFrom() && texon.GetLength() > 2) { // there is a gap
2512  texon.SetFrom(texon.GetFrom()+1);
2513  texon.SetTo(texon.GetTo()-1);
2514  transcript_exons.push_back(texon);
2515  string seq = transcript.substr(texon.GetFrom(),texon.GetLength());
2516  CInDelInfo::SSource src;
2517  src.m_acc = align.TargetAccession();
2518  src.m_strand = ePlus;
2519  src.m_range = texon;
2520  editedmodel.AddExon(TSignedSeqRange::GetEmpty(), "XX", "XX", 1, seq, src);
2521 
2522  if(align.Orientation() == eMinus) {
2523  ReverseComplement(seq.begin(),seq.end());
2524  src.m_strand = eMinus;
2525  }
2526  m_correction_data.m_correction_indels.push_back(CInDelInfo(ig->first+ig->second/2, (int)seq.length(), CInDelInfo::eDel, seq, src));
2527  } else {
2528  editedmodel.AddHole();
2529  }
2530  } else {
2531  editedmodel.AddHole();
2532  }
2533  }
2534  }
2535 
2536  if(!right_seq.empty() && (fill&efill_right) != 0 && !chainer_tsa) {
2538  if(ig != m_genomic_gaps_len.end() && ig->first < align.Limits().GetTo()+MAX_DIST_TO_FLANK_GAP) { // there is gap on right
2539  transcript_exons.push_back(right_texon);
2540  editedmodel.AddExon(TSignedSeqRange::GetEmpty(), "XX", "XX", 1, right_seq, right_src);
2541 
2542  if(align.Orientation() == eMinus) {
2543  ReverseComplement(right_seq.begin(),right_seq.end());
2544  right_src.m_strand = eMinus;
2545  }
2546  m_correction_data.m_correction_indels.push_back(CInDelInfo(ig->first+ig->second/3, (int)right_seq.length(), CInDelInfo::eDel, right_seq, right_src)); // 1/3 of gap length will separate genes abutting the same gap
2547  }
2548  }
2549 
2550  CAlignMap editedamap(editedmodel.Exons(), transcript_exons, align.FrameShifts(), align.Orientation(), align.GetAlignMap().TargetLen());
2551  editedmodel.FrameShifts() = align.FrameShifts();
2552  CAlignModel editedalign(editedmodel, editedamap);
2553  editedalign.SetTargetId(*align.GetTargetId());
2554 
2555  return editedalign;
2556 }
2557 
2558 #define COLLAPS_CHUNK 500000
2560 
      // Body of the alignment-intake routine operating on one alignment `a`.
      // NOTE(review): the signature line (listing line 2559) is not present in this listing;
      // presumably this is CAlignCollapser::AddAlignment(CAlignModel& a) - confirm against the header.
      //
      // Processing order, as visible below:
      //   1. "CORRECTIONDATA" accessions feed m_correction_data and are not stored as alignments.
      //   2. "CAPINFO"/"POLYAINFO" accessions are recorded in m_special_aligns.
      //   3. Long reads get per-exon identity, optional low-complexity trimming and optional phony cap info.
      //   4. Spliced introns are registered in m_align_introns.
      //   5. The alignment is stored for collapsing (m_aligns) or for filtering only.
      // The accession is upper-cased first so that all keyword searches below are case-insensitive.
2561  string acc = a.TargetAccession();
2562  for(char& c : acc)
2563  c = toupper(c);
2564 
      // --- 1. Correction data ---
2565  if(acc.find("CORRECTIONDATA") != string::npos) {
2566  if(!m_genomic_gaps_len.empty()) {
      // Locate the last gap whose key is <= the alignment's right end and drop the alignment if it overlaps that gap.
      // gap->first is used as the gap start and gap->second as its length.
2567  TIntMap::iterator gap = m_genomic_gaps_len.upper_bound(a.Limits().GetTo()); // gap clearly on the right (could be end)
2568  if(gap != m_genomic_gaps_len.begin())
2569  --gap; // existing gap (not end)
2570  if(gap->first <= a.Limits().GetTo() && gap->first+gap->second-1 >= a.Limits().GetFrom()) // overlap
2571  return;
2572  }
2573 
      // The aligned range is recorded as a confirmed interval.
2574  m_correction_data.m_confirmed_intervals.push_back(a.Limits());
2575 
      // Mismatches are stored base by base in m_replacements, keyed by position (Loc()+offset).
2576  TInDels corrections = a.FrameShifts();
2577  ITERATE(TInDels, i, corrections) {
2578  if(i->IsMismatch()) {
2579  string seq = i->GetInDelV();
2580  for(int l = 0; l < i->Len(); ++l)
2581  m_correction_data.m_replacements[i->Loc()+l] = seq[l];
2582  } else {
      // NOTE(review): listing lines 2583-2584 (the handling of non-mismatch corrections) are missing from this listing.
2585  }
2586  }
2587 
2588  return;
2589  }
2590 
      // --- 2. Cap / polyA information ---
2591  int status = 0;
2592  if(acc.find("CAPINFO") != string::npos)
2593  status = CGeneModel::eCap;
2594  else if(acc.find("POLYAINFO") != string::npos)
2595  status = CGeneModel::ePolyA;
2596 
2597  if(status != 0) {
      // Cap on the plus strand or polyA on the minus strand: anchor at the left end and mark eRightFlexible;
      // otherwise anchor at the right end and mark eLeftFlexible.
2598  int pos;
2599  if(((status&CGeneModel::eCap) && a.Strand() == ePlus) || ((status&CGeneModel::ePolyA) && a.Strand() == eMinus)) {
2600  status |= CGeneModel::eRightFlexible;
2601  pos = a.Limits().GetFrom();
2602  } else {
2603  status |= CGeneModel::eLeftFlexible;
2604  pos = a.Limits().GetTo();
2605  }
2606  if(a.Exons().size() == 1) {
      // Single-exon alignment: store it as is under the key (status, pos);
      // if that key already exists only the stored weight is increased. Processing stops here.
2607  a.Status() |= status;
2608  auto rslt = m_special_aligns.emplace(make_tuple(status, pos), a);
2609  if(!rslt.second) { // same position exists
2610  auto& stored = rslt.first->second;
2611  stored.SetWeight(stored.Weight()+a.Weight());
2612  }
2613  return;
2614  } else {
      // Multi-exon alignment: store a synthetic one-exon eSR model of SPECIAL_ALIGN_LEN bases anchored at pos.
      // There is no return in this branch, so `a` itself continues through the steps below.
2615  int spec_extend = SPECIAL_ALIGN_LEN-1;
2616  CGeneModel galign(a.Strand(), a.ID(), CGeneModel::eSR);
2617  galign.SetWeight(a.Weight());
2618  if(a.Strand() == ePlus)
2619  galign.AddExon(TSignedSeqRange(pos, pos+spec_extend));
2620  else
2621  galign.AddExon(TSignedSeqRange(pos-spec_extend, pos));
2622  if(galign.Limits().GetFrom() >= 0) { // can't check right end because we don't know the contig length yet (will check in chainer)
2623  galign.Status() |= status;
2624  auto rslt = m_special_aligns.emplace(make_tuple(status, pos), CAlignModel(galign, galign.GetAlignMap()));
2625  if(!rslt.second) { // same position exists
2626  auto& stored = rslt.first->second;
2627  stored.SetWeight(stored.Weight()+a.Weight());
2628  }
2629  }
2630  }
2631  }
2632 
      // --- 3. Long reads ---
      // An alignment is treated as a long read when its accession contains "SRA" but not "RNASEQ_COLLAPSE".
2633  bool long_read = acc.find("SRA") != string::npos && acc.find("RNASEQ_COLLAPSE") == string::npos;
2634 
      // If the first exon carries no identity (m_ident == 0), derive a per-exon identity from the indel list:
      // mismatches and insertions reduce the match count, deletions lengthen the aligned length.
      // `it` is shared across exons, so each indel is consumed by the first exon it intersects.
2635  if(long_read && a.Exons().front().m_ident == 0) {
2636  TInDels& indels = a.FrameShifts();
2637 
2638  auto it = indels.begin();
2639  for(CModelExon& exon : a.Exons()) {
2640  int align_len = exon.Limits().GetLength();
2641  double matches = align_len;
2642  for( ;it != indels.end() && it->IntersectingWith(exon.GetFrom(), exon.GetTo()); ++it) {
2643  if(it->IsMismatch() || it->IsInsertion())
2644  matches -= it->Len();
2645  if(it->IsDeletion())
2646  align_len += it->Len();
2647  }
2648  if(matches > 0)
2649  exon.m_ident = matches/align_len;
2650  }
2651  }
2652 
      // Low-complexity trimming of continuous long reads (only when m_filterest is set):
      // exons whose transcript sequence has Entropy() below 0.65 are dropped from either end.
2653  if(m_filterest && long_read && a.Continuous()) {
2654  string transcript;
2655  CAlignMap amap = a.GetAlignMap();
2656  amap.EditedSequence(m_contig, transcript);
      // `shift` converts transcript coordinates into indices of the `transcript` string.
2657  int shift = a.TranscriptLimits().GetFrom();
2658 
2659  TSignedSeqRange lim = a.Limits();
      // Scan from the left: stop at the first exon that is not low complexity.
      // If even the last exon is low complexity, lim becomes empty and the alignment is dropped.
2660  for(int i = 0; i < (int)a.Exons().size(); ++i) {
2661  auto te = a.TranscriptExon(i);
2662  string eseq = transcript.substr(te.GetFrom()-shift, te.GetLength()); // transcript sequence corresponding to the exon
2663  if(Entropy(eseq) < 0.65) {
2664  if(i == (int)a.Exons().size()-1) {
2665  lim = TSignedSeqRange::GetEmpty();
2666  } else {
2667  lim.SetFrom(a.Exons()[i+1].GetFrom());
      // Report only when the removed exon was joined to its neighbour by a spliced intron.
2668  if(a.Exons()[i].m_ssplice && a.Exons()[i+1].m_fsplice)
2669  cerr << "Low complexity: " << a.ID() << " " << m_contig_name << ':' << a.Exons()[i].GetTo()+1 << ':' << a.Exons()[i+1].GetFrom()+1 << ' ' << eseq << "\n";
2670  }
2671  } else {
2672  break;
2673  }
2674  }
2675  if(lim.Empty()) {
2676  cerr << "Erased: " << a.ID() << "\n";
2677  return;
2678  }
2679  // at least one exon is good
      // Scan from the right; exon 0 is never tested here (loop runs while i > 0).
2680  for(int i = (int)a.Exons().size()-1; i > 0; --i) {
2681  auto te = a.TranscriptExon(i);
2682  string eseq = transcript.substr(te.GetFrom()-shift, te.GetLength()); // transcript sequence corresponding to the exon
2683  if(Entropy(eseq) < 0.65) {
2684  lim.SetTo(a.Exons()[i-1].GetTo());
2685  if(a.Exons()[i-1].m_ssplice && a.Exons()[i].m_fsplice)
2686  cerr << "Low complexity: " << a.ID() << " " << m_contig_name << ':' << a.Exons()[i-1].GetTo()+1 << ':' << a.Exons()[i].GetFrom()+1 << ' ' << eseq << "\n";
2687  } else {
2688  break;
2689  }
2690  }
      // Clip to the retained range; a read reduced to one exon without cap/polyA status is flagged eUnknownOrientation.
2691  if(lim != a.Limits()) {
2692  a.Clip(lim, CGeneModel::eRemoveExons);
2693  if(a.Exons().size() == 1 && !(a.Status()&CGeneModel::eCap) && !(a.Status()&CGeneModel::ePolyA))
2694  a.Status() |= CGeneModel::eUnknownOrientation;
2695  }
2696  }
2697 
2698  //Capinfo from not capped long reads
      // Used only for long reads that are not already capped, are oriented, and whose unaligned
      // 5' transcript tail does not exceed NOT_ALIGNED_PHONY_CAGE (5'/3' tails are swapped for eReversed).
2699  if(m_use_long_reads_tss) {
2700  bool use_alignment = !(a.Status()&CGeneModel::eCap) && long_read; // not capped long read
2701  if(a.Status()&CGeneModel::eUnknownOrientation)
2702  use_alignment = false; // not oriented
2703  TSignedSeqRange tlim = a.TranscriptLimits();
2704  int not_aligned_5p = tlim.GetFrom();
2705  int not_aligned_3p = a.TargetLen()-1-tlim.GetTo();
2706  if(a.Status()&CGeneModel::eReversed)
2707  swap(not_aligned_5p, not_aligned_3p);
2708  if(not_aligned_5p > NOT_ALIGNED_PHONY_CAGE)
2709  use_alignment = false; // not aligned
2710 
2711  if(use_alignment) {
      // Build a synthetic one-exon eSR cap model at the alignment's left end (plus strand) or right end (otherwise)
      // and merge it into m_special_aligns the same way as explicit cap info above.
2712  status = CGeneModel::eCap;
2713  int spec_extend = SPECIAL_ALIGN_LEN-1;
2714  CGeneModel galign(a.Strand(), a.ID(), CGeneModel::eSR);
2715  galign.SetWeight(a.Weight());
2716  int pos;
2717  if(a.Strand() == ePlus) {
2718  pos = a.Limits().GetFrom();
2719  galign.AddExon(TSignedSeqRange(pos, pos+spec_extend));
2720  status |= CGeneModel::eRightFlexible;
2721  } else {
2722  pos = a.Limits().GetTo();
2723  galign.AddExon(TSignedSeqRange(pos-spec_extend, pos));
2724  status |= CGeneModel::eLeftFlexible;
2725  }
2726  if(galign.Limits().GetFrom() >= 0) { // can't check right end because we don't know the contig length yet (will check in chainer)
2727  galign.Status() |= status;
2728  auto rslt = m_special_aligns.emplace(make_tuple(status, pos), CAlignModel(galign, galign.GetAlignMap()));
2729  if(!rslt.second) { // same position exists
2730  auto& stored = rslt.first->second;
2731  stored.SetWeight(stored.Weight()+a.Weight());
2732  }
2733  }
2734  }
2735  }
2736 
2737  if((a.Type()&CGeneModel::eSR) && !a.Continuous()) // ignore SR with internal gaps
2738  return;
2739 
      // From here on the routine works on a copy; the gap-filler flag is cleared when gap filling is disabled.
2740  CAlignModel align(a);
2741  if(!m_fillgenomicgaps)
2742  align.Status() &= ~CGeneModel::eGapFiller;
2743 
      // NOTE(review): listing line 2744 (the condition guarding the following return) is missing from this listing.
2745  return;
2746 
      // Unoriented alignments are normalized to the plus strand.
2747  if((align.Status()&CGeneModel::eUnknownOrientation) && align.Strand() == eMinus)
2748  align.ReverseComplementModel();
2749 
      // Extend the overall range covered by accepted alignments.
2750  m_range += align.Limits();
2751 
      // --- 4. Intron bookkeeping ---
      // Every pair of adjacent exons joined by splices on both sides contributes to m_align_introns.
2752  const CGeneModel::TExons& e = align.Exons();
2753  for(unsigned int l = 1; l < e.size(); ++l) {
2754  if(e[l-1].m_ssplice && e[l].m_fsplice) {
2755  double ident = min(e[l-1].m_ident, e[l].m_ident);
2756  if(ident == 0.)
2757  ident = 1.; // collapsed SRA and proteins don't have ident information
2758  if(ident < m_minident && !long_read) // ignore low identity introns from transcripts which are not long reads
2759  continue;
2760 
      // The splice signature is assembled in an order that depends on the strand.
2761  string sig;
2762  if(align.Strand() == ePlus)
2763  sig = e[l-1].m_ssplice_sig+e[l].m_fsplice_sig;
2764  else
2765  sig = e[l].m_fsplice_sig+e[l-1].m_ssplice_sig;
      // The 4th argument of SIntron is true when the alignment is oriented (eUnknownOrientation not set).
      // operator[] creates the SIntronData entry on first sight of this intron.
2766  SIntron intron(e[l-1].GetTo(),e[l].GetFrom(), align.Strand(), (align.Status()&CGeneModel::eUnknownOrientation) == 0, sig);
2767  SIntronData& id = m_align_introns[intron];
2768 
      // Introns from alignment types whose filtering is switched off are always kept.
2769  if(((align.Type()&CGeneModel::eSR) && !m_filtersr) ||
2770  ((align.Type()&CGeneModel::eEST) && !m_filterest) ||
2771  ((align.Type()&CGeneModel::emRNA) && !m_filtermrna) ||
2772  ((align.Type()&CGeneModel::eProt) && !m_filterprots)) {
2773 
2774  id.m_keep_anyway = true;
2775  }
2776 
      // Self-supporting introns: SR, long reads (unless m_no_lr_only_introns), or gap fillers with a GTAG
      // signature whose flanking exons are both longer than 15 bases with identity above 0.99.
2777  if((align.Type()&CGeneModel::eSR) || (long_read && !m_no_lr_only_introns) ||
2778  (align.Status()&CGeneModel::eGapFiller && sig == "GTAG" &&
2779  e[l-1].Limits().GetLength() > 15 && e[l-1].m_ident > 0.99 &&
2780  e[l].Limits().GetLength() > 15 && e[l].m_ident > 0.99)) {
2781 
2782  id.m_selfsp_support = true;
2783  }
2784 
      // Per-type support counters receive the alignment weight plus 0.5.
2785  if(align.Type()&CGeneModel::eSR)
2786  id.m_sr_support += align.Weight()+0.5;
2787  else if(align.Type()&CGeneModel::eEST)
2788  id.m_est_support += align.Weight()+0.5;
2789  else
2790  id.m_other_support += align.Weight()+0.5;
2791 
      // Track the largest intron count among supporting alignments, the total weight and the best identity.
2792  id.m_intron_num = max(id.m_intron_num, (int)e.size()-1);
2793  id.m_weight += align.Weight();
2794 
2795  id.m_ident = max(id.m_ident,ident);
2796  }
2797  }
2798 
      // --- 5. Storage ---
      // SR and non-gap-filler EST alignments go to m_aligns (split into continuous parts when they have holes);
      // everything else is kept in m_aligns_for_filtering_only.
2799  // if((align.Type()&CGeneModel::eSR) || ((align.Type()&CGeneModel::eEST) && !(align.Status()&CGeneModel::eGapFiller) && m_collapsest)) { // add alignments for collapsing
2800  if((align.Type()&CGeneModel::eSR) || ((align.Type()&CGeneModel::eEST) && !(align.Status()&CGeneModel::eGapFiller))) { // add alignments for collapsing (long reads always included even if no collapsest - big memory saving)
2801  if(align.Continuous()) {
2802  CAlignCommon c(align);
2803  m_aligns[c].push_back(SAlignIndividual(align, m_target_id_pool[c]));
2804  } else {
2805  TAlignModelList aligns = GetAlignParts(align, false);
2806  ITERATE(TAlignModelList, i, aligns) {
2807  CAlignCommon c(*i);
2808  m_aligns[c].push_back(SAlignIndividual(*i, m_target_id_pool[c]));
2809  }
2810  }
2811  } else {
2812  m_aligns_for_filtering_only.push_back(align);
2813  }
2814 
      // NOTE(review): listing line 2816 (the statement controlled by this `if`) is missing from this listing.
2815  if(long_read)
2817 
      // Every COLLAPS_CHUNK-th added alignment triggers an intermediate collapse of identical alignments.
2818  if(++m_count%COLLAPS_CHUNK == 0) {
2819  cerr << "Added " << m_count << " alignments to collapser" << endl;
2820  CollapsIdentical();
2821  }
2822 }
2823 
// Body of the routine that merges identical alignments within each group and compacts the
// associated target-id pool.
// NOTE(review): the function header and the header of the loop that defines `i` (listing lines 2824-2825)
// are not present in this listing; presumably this is CAlignCollapser::CollapsIdentical() iterating over m_aligns - confirm.
// For the current group, `alideque` holds its alignments and `id_pool` holds their zero-terminated
// target ids stored back to back; each alignment refers to its id through the offset m_target_id.
2826  deque<SAlignIndividual>& alideque = i->second;
2827  deque<char>& id_pool = m_target_id_pool[i->first];
2828  if(!alideque.empty()) {
2829 
2830  //remove identicals
      // After sorting, alignments with equal m_range are adjacent. `ali` is the last retained element,
      // `farp` scans ahead. A duplicate adds its weight to `ali` and has every character of its id
      // overwritten with 0, which marks those pool bytes for removal below.
2831  sort(alideque.begin(),alideque.end(),LeftAndLongFirstOrder(id_pool));
2832  deque<SAlignIndividual>::iterator ali = alideque.begin();
2833  for(deque<SAlignIndividual>::iterator farp = ali+1; farp != alideque.end(); ++farp) {
2834  _ASSERT(farp > ali);
2835  if(farp->m_range == ali->m_range) {
2836  ali->m_weight += farp->m_weight;
2837  for(deque<char>::iterator p = id_pool.begin()+farp->m_target_id; *p != 0; ++p) {
2838  _ASSERT(p < id_pool.end());
2839  *p = 0;
2840  }
2841  } else {
      // A distinct range: move it down next to the previously retained element.
2842  *(++ali) = *farp;
2843  }
2844  }
2845  _ASSERT(ali-alideque.begin()+1 <= (int)alideque.size());
2846  alideque.resize(ali-alideque.begin()+1); // ali - last retained element
2847 
2848 
2849 
2850  //clean up id pool and reset shifts
      // The alignments are re-sorted with OriginalOrder so that they can be matched against the pool in one pass
      // (presumably this orders them by m_target_id - confirm against OriginalOrder).
      // `id` is the write position, `farp` the read position, `shift` the number of zero bytes skipped so far.
2851  sort(alideque.begin(),alideque.end(),OriginalOrder);
2852  deque<char>::iterator id = id_pool.begin();
2853  int shift = 0;
2854  ali = alideque.begin();
2855  for(deque<char>::iterator farp = id; farp != id_pool.end(); ) {
      // Skip the zero bytes left behind by erased ids.
2856  while(farp != id_pool.end() && *farp == 0) {
2857  ++farp;
2858  ++shift;
2859  }
2860  if(farp != id_pool.end()) {
2861 
      // If the surviving id at this offset belongs to the current alignment, rebase its offset
      // by the number of bytes removed before it and advance to the next alignment.
2862  if(farp-id_pool.begin() == ali->m_target_id) {
2863  ali->m_target_id -= shift;
2864  _ASSERT(ali->m_target_id >= 0);
2865  ++ali;
2866  }
2867 
2868 
      // Copy the id characters down to the write position, then its terminating zero.
2869  _ASSERT(farp >= id);
2870  while(*farp != 0) {
2871  *id++ = *farp++;
2872  }
2873  *id++ = *farp++;
2874  }
2875  }
2876  id_pool.resize(id-id_pool.begin()); // id - next after last retained element
2877 
      // Every retained alignment must have been matched to an id in the pool.
2878  _ASSERT(ali == alideque.end());
2879  }
2880  }
2881 }
2882 
2883 
2884 END_SCOPE(gnomon)
2885 END_SCOPE(ncbi)
2886 
2887 
#define EXTRA_CUT
#define COVERED_FRACTION
#define COVERAGE_WINDOW
bool OneExonCompare(const CModelExon &a, const CModelExon &b)
#define FS_FUZZ
#define END_PART_LENGTH
#define INDEL_PENALTY
bool OriginalOrder(const SAlignIndividual &a, const SAlignIndividual &b)
bool isGoodIntron(int a, int b, EStrand strand, const CAlignCollapser::TAlignIntrons &introns, bool check_introns_on_both_strands)
#define COLLAPS_CHUNK
#define MISM_PENALTY
#define EXON_TO_SKIP
#define CUT_MARGIN
#define MIN_EXON
string GetTargetAcc(int shift, const deque< char > &id_pool)
#define BIG_NOT_ALIGNED
USING_SCOPE(sequence)
#define SMALL_CLIP
int TotalFrameShift(const TInDels &indels, int a, int b)
#define MAX_CLIP
bool AlignmentMarkedForDeletion(const SAlignIndividual &ali)
#define MAX_DIST_TO_FLANK_GAP
bool AlignmentIsSupportedBySR(const CAlignModel &align, const vector< double > &coverage, int mincoverage, int left_end)
#define DESIRED_CHUNK
#define NOT_ALIGNED_PHONY_CAGE
#define SPECIAL_ALIGN_LEN
void remove_if(Container &c, Predicate *__pred)
Definition: chainer.hpp:69
TSignedSeqPos FullLength() const
void Init(const CSeqVector &sv, TSignedSeqPos from, TSignedSeqPos to)
string substr(TSignedSeqPos p, TSignedSeqPos l) const
CAlignModel FillGapsInAlignmentAndAddToGenomicGaps(const CAlignModel &align, int fill)
TSignedSeqRange m_range
bool RemoveNotSupportedIntronsFromProt(CAlignModel &align)
CAlignCollapser(string contig="", CScope *scope=0, bool nofilteringcollapsing=false)
void AddAlignment(CAlignModel &align)
void InitContig(string contig, CScope *scope)
vector< double > m_coverage
CPartialString m_contig
void ClipNotSupportedFlanks(CAlignModel &align, double clip_threshold, double min_lim=0)
bool RemoveNotSupportedIntronsFromTranscript(CAlignModel &align, bool check_introns_on_both_strands) const
void GetOnlyOtherAlignments(TAlignModelClusterSet &clsset)
bool CheckAndInsert(const CAlignModel &align, TAlignModelClusterSet &clsset) const
TAlignIntrons m_align_introns
SCorrectionData m_correction_data
void ClipESTorSR(CAlignModel &align, double clip_threshold, double min_lim)
TIntMap m_genomic_gaps_len
void GetCollapsedAlgnments(TAlignModelClusterSet &clsset)
TAlignModelList m_aligns_for_filtering_only
map< tuple< int, int >, CAlignModel > m_special_aligns
void CleanSelfTranscript(CAlignModel &align, const string &trans) const
static void SetupArgDescriptions(CArgDescriptions *arg_desc)
bool isCap() const
bool isSR() const
bool isPlus() const
CAlignModel GetAlignment(const SAlignIndividual &ali, const deque< char > &target_id_pool) const
bool isEST() const
bool isMinus() const
Tintrons m_introns
bool isUnknown() const
bool isPolyA() const
vector< SIntron > Tintrons
const Tintrons & GetIntrons() const
int TargetLen() const
TSignedSeqRange MapRangeEditedToOrig(TSignedSeqRange edited_range, bool withextras=true) const
Definition: gnomon_seq.cpp:966
void EditedSequence(const In &original_sequence, Out &edited_sequence, bool includeholes=false) const
Definition: gnomon_seq.cpp:632
TSignedSeqRange ShrinkToRealPoints(TSignedSeqRange orig_range, bool snap_to_codons=false) const
Definition: gnomon_seq.cpp:764
int FShiftedLen(TSignedSeqRange ab, ERangeEnd lend, ERangeEnd rend) const
Definition: gnomon_seq.cpp:993
TSignedSeqRange MapRangeOrigToEdited(TSignedSeqRange orig_range, ERangeEnd lend, ERangeEnd rend) const
Definition: gnomon_seq.cpp:938
string TargetAccession() const
virtual void Clip(TSignedSeqRange limits, EClipMode mode, bool ensure_cds_invariant=true)
CConstRef< objects::CSeq_id > GetTargetId() const
virtual void CutExons(TSignedSeqRange hole)
int TargetLen() const
virtual CAlignMap GetAlignMap() const
void SetTargetId(const objects::CSeq_id &id)
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
CBioseq_Handle –.
void AddExon(TSignedSeqRange exon, const string &fs="", const string &ss="", double ident=0, const string &seq="", const CInDelInfo::SSource &src=CInDelInfo::SSource())
void ClearExons()
virtual void CutExons(TSignedSeqRange hole)
TSignedSeqRange TranscriptLimits() const
EStrand Orientation() const
double Weight() const
unsigned int & Status()
const TExons & Exons() const
void SetWeight(double w)
virtual CAlignMap GetAlignMap() const
bool Continuous() const
int AlignLen() const
TSignedSeqRange TranscriptExon(int i) const
void ReverseComplementModel()
void SetStrand(EStrand s)
TSignedSeqRange Limits() const
int Type() const
vector< CModelExon > TExons
bool HasStop() const
bool HasStart() const
TInDels GetInDels(bool fs_only) const
TInDels & FrameShifts()
EStrand Strand() const
static CRef< CSeq_id > ToSeq_id(const string &str)
Definition: id_handler.cpp:73
void Insert(const typename Cluster::TModel &a)
string m_fsplice_sig
double m_ident
string m_ssplice_sig
TSignedSeqPos GetFrom() const
const TSignedSeqRange & Limits() const
TSignedSeqPos GetTo() const
static CNcbiApplication * Instance(void)
Singleton method.
Definition: ncbiapp.cpp:264
CScope –.
Definition: scope.hpp:92
CSeqVector –.
Definition: seq_vector.hpp:65
void erase(iterator pos)
Definition: map.hpp:167
size_type size() const
Definition: map.hpp:148
container_type::const_iterator const_iterator
Definition: map.hpp:53
const_iterator begin() const
Definition: map.hpp:151
const_iterator end() const
Definition: map.hpp:152
const_iterator lower_bound(const key_type &key) const
Definition: map.hpp:154
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
bool empty() const
Definition: map.hpp:149
const_iterator upper_bound(const key_type &key) const
Definition: map.hpp:155
container_type::value_type value_type
Definition: map.hpp:52
const_iterator find(const key_type &key) const
Definition: map.hpp:153
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator begin() const
Definition: set.hpp:135
parent_type::iterator iterator
Definition: set.hpp:80
const_iterator upper_bound(const key_type &key) const
Definition: set.hpp:139
bool empty() const
Definition: set.hpp:133
const_iterator end() const
Definition: set.hpp:136
const_iterator lower_bound(const key_type &key) const
Definition: set.hpp:138
parent_type::const_iterator const_iterator
Definition: set.hpp:79
constexpr auto end(const ct_const_array< T, N > &in) noexcept
int close(int fd)
Definition: connection.cpp:45
static const char ip[]
Definition: des.c:75
struct parameters_t * pb[]
char data[12]
Definition: iconv.c:80
double Entropy(const string &seq)
Definition: glb_align.cpp:866
vector< int > TIVec
list< CAlignModel > TAlignModelList
EStrand
@ eMinus
@ ePlus
bool Include(TSignedSeqRange big, TSignedSeqRange small)
void ReverseComplement(const BidirectionalIterator &first, const BidirectionalIterator &last)
EStrand OtherStrand(EStrand s)
vector< CInDelInfo > TInDels
list< Model > GetAlignParts(const Model &algn, bool settrimflags)
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
void AddFlag(const string &name, const string &comment, CBoolEnum< EFlagValue > set_value=eFlagHasValueIfSet, TFlags flags=0)
Add description for flag argument.
Definition: ncbiargs.cpp:2459
void SetCurrentGroup(const string &group)
Set current arguments group name.
Definition: ncbiargs.cpp:2632
void AddDefaultKey(const string &name, const string &synopsis, const string &comment, EType type, const string &default_value, TFlags flags=0, const string &env_var=kEmptyStr, const char *display_value=nullptr)
Add description for optional key with default value.
Definition: ncbiargs.cpp:2442
@ eDouble
Convertible into a floating point number (double)
Definition: ncbiargs.hpp:594
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
const CMolInfo * GetMolInfo(const CBioseq &bioseq)
Retrieve the MolInfo object for a given bioseq handle.
Definition: sequence.cpp:284
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
bool IsInGap(TSeqPos pos) const
true if sequence at 0-based position 'pos' has gap Note: this method is not MT-safe,...
Definition: seq_vector.hpp:277
TSeqPos size(void) const
Definition: seq_vector.hpp:291
position_type GetLength(void) const
Definition: range.hpp:158
bool NotEmpty(void) const
Definition: range.hpp:152
static TThisType GetEmpty(void)
Definition: range.hpp:306
bool Empty(void) const
Definition: range.hpp:148
CRange< TSignedSeqPos > TSignedSeqRange
Definition: range.hpp:420
static TThisType GetWhole(void)
Definition: range.hpp:272
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
void SetFrom(TFrom value)
Assign a value to From data member.
Definition: Range_.hpp:231
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
void SetTo(TTo value)
Assign a value to To data member.
Definition: Range_.hpp:278
@ e_Local
local use
Definition: Seq_id_.hpp:95
TTech GetTech(void) const
Get the Tech member data.
Definition: MolInfo_.hpp:497
bool IsSetTech(void) const
Check if a value has been assigned to Tech data member.
Definition: MolInfo_.hpp:472
@ eTech_tsa
transcriptome shotgun assembly
Definition: MolInfo_.hpp:146
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
string GetDNASequence(CConstRef< objects::CSeq_id > id, CScope &scope)
Definition: id_handler.cpp:130
n font weight
int i
yy_size_t n
int len
range(_Ty, _Ty) -> range< _Ty >
constexpr auto sort(_Init &&init)
double value_type
The numeric datatype used by the parser.
Definition: muParserDef.h:228
Magic spell ;-) needed for some weird compilers... very empiric.
unsigned int a
Definition: ncbi_localip.c:102
EIPRangeType t
Definition: ncbi_localip.c:101
Defines command line argument related classes.
int tolower(Uchar c)
Definition: ncbictype.hpp:72
int toupper(Uchar c)
Definition: ncbictype.hpp:73
T max(T x_, T y_)
T minus(T x_)
T plus(T x_)
T min(T x_, T y_)
std::istream & in(std::istream &in_, double &x_)
Int4 delta(size_t dimension_, const Int4 *score_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
static BOOL number
Definition: pcregrep.c:193
TSignedSeqRange m_range
bool operator()(TAlignModelList::const_iterator a, TAlignModelList::const_iterator b)
const deque< char > & id_pool
LeftAndLongFirstOrder(const deque< char > &idp)
bool operator()(const SAlignIndividual &a, const SAlignIndividual &b)
bool operator()(const CGeneModel::TExons &a, const CGeneModel::TExons &b) const
SAlignIndividual * m_ali
SAlignExtended(SAlignIndividual &ali, const set< int > &left_exon_ends, const set< int > &right_exon_ends)
TSignedSeqRange m_range
TSignedSeqPos m_target_id
list< TSignedSeqRange > m_confirmed_intervals
map< int, char > m_replacements
TInDels m_correction_indels
bool m_oriented
TSignedSeqRange m_range
#define _ASSERT
int g(Seg_Gsm *spe, Seq_Mtf *psm, Thd_Gsm *tdg)
Definition: thrddgri.c:44
#define const
Definition: zconf.h:232
Modified on Wed Apr 17 13:09:03 2024 by modify_doxy.py rev. 669887