NCBI C++ ToolKit
splign.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: splign.cpp 100425 2023-07-31 13:44:51Z mozese2 $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Yuri Kapustin, Boris Kiryutin
27 *
28 * File Description:
29 * CSplign class implementation
30 *
31 */
32 
33 
34 #include <ncbi_pch.hpp>
35 
36 #include "splign_util.hpp"
37 #include "splign_exon_trim.hpp"
38 #include "messages.hpp"
39 
47 
48 #include <algo/sequence/orf.hpp>
49 
50 #include <objmgr/scope.hpp>
51 #include <objmgr/bioseq_handle.hpp>
52 #include <objmgr/seq_vector.hpp>
54 
56 
68 
69 #include <util/value_convert.hpp>
70 
71 #include <math.h>
72 #include <algorithm>
73 #include <iostream>
74 
76 
78 
79 namespace {
80 
81  // define cut-off strategy at the termini:
82 
83  // (1) - pre-processing
84  // For non-covered ends longer than kNonCoveredEndThreshold use
85  // m_max_genomic_ext. For shorter ends use k * query_len^(1/kPower)
86 
87  const Uint4 kNonCoveredEndThreshold (55);
88  const double kPower (2.5);
89 
90  // (2) - post-processing
91  // term exons shorter than kMinTermExonSize with identity lower than
92  // kMinTermExonIdty will be converted to gaps
93  const size_t kMinTermExonSize (28);
94  const double kMinTermExonIdty (0.9);
95 
96  //if flanking exon is closer than kFlankExonProx to the beginning/(end or polyA) of mRNA, it is NOT treated as adjacent to a gap
97  const int kFlankExonProx (20);
98 
99  //maximum length to cut to splice (exons near gaps)
100  const int kMaxCutToSplice (6);
101 
104 
105 
106  //original Yuri's EST scores
107 
108  const int kEstMatchScore (1000);
109  const int kEstMismatchScore (-1011);
110  const int kEstGapOpeningScore(-1460);
111  const int kEstGapExtensionScore(-464);
112 
113  const int kEstGtAgSpliceScore(-4988);
114  const int kEstGcAgSpliceScore(-5999);
115  const int kEstAtAcSpliceScore(-7010);
116  const int kEstNonConsensusSpliceScore(-13060);
117 }
118 
119 
120 
121 //original Yuri's mRNA scores
122 
124  return eMrnaScoring;
125 }
126 
128  return 1000;
129 }
130 
132  return -1044;
133 }
134 
136  return -3070;
137 }
138 
140  return -173;
141 }
142 
144  return -4270;
145 }
146 
148  return -5314;
149 }
150 
152  return -6358;
153 }
154 
156  return -7395;
157 }
158 
160  m_CanResetHistory (false),
161  //basic scores
162  m_ScoringType(s_GetDefaultScoringType()),
163  m_MatchScore(s_GetDefaultMatchScore()),
164  m_MismatchScore(s_GetDefaultMismatchScore()),
165  m_GapOpeningScore(s_GetDefaultGapOpeningScore()),
166  m_GapExtensionScore(s_GetDefaultGapExtensionScore()),
167  m_GtAgSpliceScore(s_GetDefaultGtAgSpliceScore()),
168  m_GcAgSpliceScore(s_GetDefaultGcAgSpliceScore()),
169  m_AtAcSpliceScore(s_GetDefaultAtAcSpliceScore()),
170  m_NonConsensusSpliceScore(s_GetDefaultNonConsensusSpliceScore()),
171  //end of basic scores
172  m_MinExonIdty(s_GetDefaultMinExonIdty()),
173  m_MinPolyaExtIdty(s_GetDefaultPolyaExtIdty()),
174  m_MinPolyaLen(s_GetDefaultMinPolyaLen()),
175  m_MinHoleLen(s_GetDefaultMinHoleLen()),
176  m_TrimToCodons(s_GetDefaultTrimToCodons()),
177  m_CompartmentPenalty(s_GetDefaultCompartmentPenalty()),
178  m_MinCompartmentIdty(s_GetDefaultMinCompartmentIdty()),
179  m_MinSingletonIdty(s_GetDefaultMinCompartmentIdty()),
180  m_MinSingletonIdtyBps(numeric_limits<size_t>::max()),
181  m_TestType(kTestType_production_default),
182  m_endgaps (true),
183  m_strand (true),
184  m_nopolya (false),
185  m_cds_start (0),
186  m_cds_stop (0),
187  m_max_genomic_ext (s_GetDefaultMaxGenomicExtent()),
188  m_MaxIntron (CCompartmentFinder<THit>::s_GetDefaultMaxIntron()),
189  m_MaxPartExonIdentDrop (s_GetDefaultMaxPartExonIdentDrop()),
190  m_model_id (0),
191  m_MaxCompsPerQuery (0),
192  m_MinPatternHitLength (13)
193 {
194 }
195 
197 {
198 }
199 
201 {
202  return new CVersion(CVersionInfo(2, 1, 0));
203 }
204 
205 
207 {
209  return s_Version.Get();
210 }
211 
212 
214  return m_aligner;
215 }
216 
217 
219  return m_aligner;
220 }
221 
223  CRef<TAligner>& aligner = SetAligner();
224  aligner->SetWm (GetMatchScore());
225  aligner->SetWms (GetMismatchScore());
226  aligner->SetWg (GetGapOpeningScore());
227  aligner->SetWs (GetGapExtensionScore());
228  aligner->SetScoreMatrix(NULL);
229  aligner->SetWi(0, GetGtAgSpliceScore());
230  aligner->SetWi(1, GetGcAgSpliceScore());
231  aligner->SetWi(2, GetAtAcSpliceScore());
232  aligner->SetWi(3, GetNonConsensusSpliceScore());
233 }
234 
236  CRef<CSplicedAligner> aligner(
237  static_cast<CSplicedAligner*>(new CSplicedAligner16));
238  return aligner;
239 }
240 
241 
243 {
245 
246  if(low_query_quality) {
247  aligner->SetWm (kEstMatchScore);
248  aligner->SetWms (kEstMismatchScore);
249  aligner->SetWg (kEstGapOpeningScore);
250  aligner->SetWs (kEstGapExtensionScore);
251  aligner->SetScoreMatrix(NULL);
252  aligner->SetWi(0, kEstGtAgSpliceScore);
253  aligner->SetWi(1, kEstGcAgSpliceScore);
254  aligner->SetWi(2, kEstAtAcSpliceScore);
255  aligner->SetWi(3, kEstNonConsensusSpliceScore);
256  }
257  else {
258  aligner->SetWm (s_GetDefaultMatchScore());
259  aligner->SetWms (s_GetDefaultMismatchScore());
260  aligner->SetWg (s_GetDefaultGapOpeningScore());
262  aligner->SetScoreMatrix(NULL);
263  aligner->SetWi(0, s_GetDefaultGtAgSpliceScore());
264  aligner->SetWi(1, s_GetDefaultGcAgSpliceScore());
265  aligner->SetWi(2, s_GetDefaultAtAcSpliceScore());
267  }
268 
269  return aligner;
270 }
271 
272 //note: SetScoringType call with mRNA or EST type is going to switch basic scores to preset values
273 
275 {
277  if(type == eMrnaScoring) {
286  } else if(type == eEstScoring) {
287  SetMatchScore(kEstMatchScore);
288  SetMismatchScore(kEstMismatchScore);
289  SetGapOpeningScore(kEstGapOpeningScore);
290  SetGapExtensionScore(kEstGapExtensionScore);
291  SetGtAgSpliceScore(kEstGtAgSpliceScore);
292  SetGcAgSpliceScore(kEstGcAgSpliceScore);
293  SetAtAcSpliceScore(kEstAtAcSpliceScore);
294  SetNonConsensusSpliceScore(kEstNonConsensusSpliceScore);
295  }
296 }
297 
299  return m_ScoringType;
300 }
301 
302 void CSplign::SetMatchScore(int score) {
303  m_MatchScore = score;
304 }
305 
306 int CSplign::GetMatchScore(void) const {
307  return m_MatchScore;
308 }
309 
310 void CSplign::SetMismatchScore(int score) {
311  m_MismatchScore = score;
312 }
313 
314 int CSplign::GetMismatchScore(void) const {
315  return m_MismatchScore;
316 }
317 
319  m_GapOpeningScore = score;
320 }
321 
323  return m_GapOpeningScore;
324 }
325 
327  m_GapExtensionScore = score;
328 }
329 
331  return m_GapExtensionScore;
332 }
333 //
335  m_GtAgSpliceScore = score;
336 }
337 
339  return m_GtAgSpliceScore;
340 }
341 
343  m_GcAgSpliceScore = score;
344 }
345 
347  return m_GcAgSpliceScore;
348 }
349 
351  m_AtAcSpliceScore = score;
352 }
353 
355  return m_AtAcSpliceScore;
356 }
357 
360 }
361 
364 }
365 
367  m_endgaps = on;
368 }
369 
370 bool CSplign::GetEndGapDetection( void ) const {
371  return m_endgaps;
372 }
373 
374 void CSplign::SetPolyaDetection( bool on ) {
375  m_nopolya = !on;
376 }
377 
378 bool CSplign::GetPolyaDetection( void ) const {
379  return !m_nopolya;
380 }
381 
382 void CSplign::SetStrand( bool strand ) {
383  m_strand = strand;
384 }
385 
386 bool CSplign::GetStrand( void ) const {
387  return m_strand;
388 }
389 
390 void CSplign::SetMinExonIdentity( double idty )
391 {
392  if(!(0 <= idty && idty <= 1)) {
394  }
395  else {
396  m_MinExonIdty = idty;
397  }
398 }
399 
400 void CSplign::SetPolyaExtIdentity( double idty )
401 {
402  if(!(0 <= idty && idty <= 1)) {
404  }
405  else {
406  m_MinPolyaExtIdty = idty;
407  }
408 }
409 
411  m_MinPolyaLen = len;
412 }
413 
415  m_MinHoleLen = len;
416 }
417 
418 void CSplign::SetTrimToCodons(bool val) {
420 }
421 
423 {
424  if(!(0 <= idty && idty <= 1)) {
426  }
427  else {
428  m_MinCompartmentIdty = idty;
429  }
430 }
431 
433 {
434  if(!(0 <= idty && idty <= 1)) {
436  }
437  else {
438  m_MinSingletonIdty = idty;
439  }
440 }
441 
443 {
444  m_MinSingletonIdtyBps = idty_bps;
445 }
446 
448 {
449  return 35000;
450 }
451 
452 
454 {
455  m_max_genomic_ext = mge;
456 }
457 
458 
460 {
461  return m_max_genomic_ext;
462 }
463 
464 
465 void CSplign::SetMaxIntron(size_t max_intron)
466 {
467  m_MaxIntron = max_intron;
468 }
469 
470 
471 size_t CSplign::GetMaxIntron(void) const
472 {
473  return m_MaxIntron;
474 }
475 
476 
477 double CSplign::GetMinExonIdentity( void ) const {
478  return m_MinExonIdty;
479 }
480 
482 {
483  return 0.75;
484 }
485 
486 double CSplign::GetPolyaExtIdentity( void ) const {
487  return m_MinPolyaExtIdty;
488 }
489 
491 {
492  return 1.;
493 }
494 
495 size_t CSplign::GetMinPolyaLen( void ) const {
496  return m_MinPolyaLen;
497 }
498 
500 {
501  return 1;
502 }
503 
504 size_t CSplign::GetMinHoleLen( void ) const {
505  return m_MinHoleLen;
506 }
507 
509 {
510  return 0;
511 }
512 
513 bool CSplign::GetTrimToCodons( void ) const {
514  return m_TrimToCodons;
515 }
516 
518 {
519  return false;
520 }
521 
523  return m_MinCompartmentIdty;
524 }
525 
527  return m_MinSingletonIdty;
528 }
529 
531  return m_MinSingletonIdtyBps;
532 }
533 
535 {
536  return 0.70;
537 }
538 
540  m_MaxPartExonIdentDrop = ident;
541 }
542 
544  return m_MaxPartExonIdentDrop;
545 }
546 
548 {
549  return 0.25;
550 }
551 
552 void CSplign::SetTestType(const string& test_type) {
554 }
555 
556 string CSplign::GetTestType(void) const {
557  return m_TestType;
558 }
559 
561  m_MaxCompsPerQuery = m;
562 }
563 
564 size_t CSplign::GetMaxCompsPerQuery(void) const {
565  return m_MaxCompsPerQuery;
566 }
567 
568 
569 
571 {
572  return m_Scope;
573 }
574 
575 
577 {
578  return m_Scope;
579 }
580 
581 
582 void CSplign::PreserveScope(bool preserve_scope)
583 {
584  m_CanResetHistory = ! preserve_scope;
585 }
586 
587 
588 void CSplign::SetCompartmentPenalty(double penalty)
589 {
590  if(penalty < 0 || penalty > 1) {
592  }
593  m_CompartmentPenalty = penalty;
594 }
595 
597 {
598  return 0.55;
599 }
600 
601 double CSplign::GetCompartmentPenalty( void ) const
602 {
603  return m_CompartmentPenalty;
604 }
605 
606 bool CSplign::x_IsInGap(size_t pos)
607 {
608  if( pos+1 == 0 || pos >= m_genomic.size() ) return true;//outside genome boundaries
609  TSeqPos pos1 = Convert(pos);
610  if(m_GenomicSeqMap && m_GenomicSeqMap->ResolvedRangeIterator(GetScope(),
611  pos1, 1, eNa_strand_plus, size_t(-1), CSeqMap::fFindGap)) {//gap
612  return true;
613  }
614  return false;
615 }
616 
617 
618 void CSplign::x_LoadSequence(vector<char>* seq,
619  const CSeq_id& seqid,
620  THit::TCoord start,
621  THit::TCoord finish,
622  bool retain, bool is_genomic, bool genomic_strand)
623 {
624 
625  try {
626 
627  if(m_Scope.IsNull()) {
628  NCBI_THROW(CAlgoAlignException, eInternal, "Splign scope not set");
629  }
630 
631  CBioseq_Handle bh (m_Scope->GetBioseqHandle(seqid));
632 
633  if( !is_genomic ) m_mrna_bio_handle = bh;
634 
635  if(retain && m_CanResetHistory) {
636  m_Scope->ResetHistory(); // this does not remove the sequence
637  // referenced to by 'bh'
638  }
639 
640  if(bh) {
641 
643  const TSeqPos dim (sv.size());
644  if(dim == 0) {
646  eNoSeqData,
647  string("Sequence is empty: ")
648  + seqid.AsFastaString());
649  }
650  if(finish >= dim) {
651  finish = dim - 1;
652  }
653 
654  if(start > finish) {
655  CNcbiOstrstream ostr;
656  ostr << "Invalid sequence interval requested for "
657  << seqid.GetSeqIdString(true) << ":\t"
658  << start << '\t' << finish;
659  const string err = CNcbiOstrstreamToString(ostr);
660  NCBI_THROW(CAlgoAlignException, eNoSeqData, err);
661  }
662 
663  string s;
664  sv.GetSeqData(start, finish + 1, s);
665  if(is_genomic) {//get SeqMap data
666  ENa_strand strand = eNa_strand_minus;
667  if(genomic_strand) strand = eNa_strand_plus;
668  CRef<CSeq_id> tmp_id(new CSeq_id());
669  tmp_id->Assign(seqid);
670  CSeq_loc tmp_loc(*tmp_id, start, finish, strand);
672  }
673  seq->resize(1 + finish - start);
674  copy(s.begin(), s.end(), seq->begin());
675  }
676  else {
677  NCBI_THROW(CAlgoAlignException, eNoSeqData,
678  string("ID not found: ") + seqid.AsFastaString());
679  }
680 
681  if(!retain && m_CanResetHistory) {
682  m_Scope->RemoveFromHistory(bh);
683  }
684  }
685 
686  catch(CAlgoAlignException& e) {
687  NCBI_RETHROW_SAME(e, "CSplign::x_LoadSequence(): Sequence data problem");
688  }
689 
690  if(!is_genomic) {
691  if(seq == &m_mrna) {
692  m_mrna_polya.clear();
693  m_mrna_polya.resize(seq->size());
694  copy(seq->begin(), seq->end(), m_mrna_polya.begin());
695  }
696  CSeq_id_Handle idh_key = CSeq_id_Handle::GetHandle(seqid);
697  if(m_MaskMap.find(idh_key) != m_MaskMap.end()) {
698  const TSeqRangeColl& mask_ranges = m_MaskMap.find(idh_key)->second;
699  x_MaskSequence(seq, mask_ranges, start, finish);
700  }
701  }
702 }
703 
704 
705 void CSplign::x_MaskSequence(vector<char>* seq,
706  const TSeqRangeColl& mask_ranges,
707  THit::TCoord start,
708  THit::TCoord finish)
709 {
710  //cerr<<start<<"\t"<<finish<<endl;
711  TSeqRange loop_range;
712  for(TSeqPos loop = start; loop <= finish; loop++) {
713  loop_range.SetFrom(loop);
714  loop_range.SetLength(1);
715  if(mask_ranges.IntersectingWith(loop_range)) {
716  //cerr<<loop<<"\t"<<loop_range<<endl;
717  (*seq)[loop] = 'N';
718  }
719  }
720  //ITERATE(vector<char>, ci, (*seq)) {
721  // cerr << *ci;
722  //}
723  //cerr<<endl;
724 }
725 
726 
727 
729 {
730  m_Scope.Reset(NULL);
731  m_pattern.clear();
732  m_alnmap.clear();
733  m_genomic.clear();
734  m_mrna.clear();
735  m_mrna_polya.clear();
736 }
737 
738 
741 {
742  THitRef hr (new THit);
743  hr->SetQueryStart(q0);
744  hr->SetSubjStart(s0);
745  hr->SetQueryStop(q - 1);
746  hr->SetSubjStop(s - 1);
747  hr->SetLength(q - q0);
748  hr->SetMismatches(0);
749  hr->SetGaps(0);
750  hr->SetEValue(0);
751  hr->SetScore(2*(q - q0));
752  hr->SetIdentity(1);
753  return hr;
754 }
755 
756 
758 {
759  const char * Seq1 (&m_mrna.front());
760  const char * Seq2 (&m_genomic.front());
761  THitRefs rv;
762  ITERATE(THitRefs, ii, (*phitrefs)) {
763 
764  const THitRef & h (*ii);
765  const double idty (h->GetIdentity());
766  const bool diag (h->GetGaps() == 0 && h->GetQuerySpan() == h->GetSubjSpan());
767  if(idty == 1 || idty < .95 || h->GetLength() < 100 || !diag) {
768  rv.push_back(h);
769  }
770  else {
771 
772  int q0 (-1), s0 (-1), q1 (h->GetQueryMax());
773  int q (h->GetQueryMin()), s (h->GetSubjMin());
774  size_t new_hits (0);
775  while(q <= q1) {
776  if(Seq1[q++] != Seq2[s++]) {
777  if(q0 != -1 && q >= q0 + int(m_MinPatternHitLength)) {
778  THitRef hr (sx_NewHit(q0, q, s0, s));
779  hr->SetQueryId(h->GetQueryId());
780  hr->SetSubjId(h->GetSubjId());
781  rv.push_back(hr);
782  ++new_hits;
783  }
784  q0 = s0 = -1;
785  }
786  else {
787  if(q0 == -1) {
788  q0 = q;
789  s0 = s;
790  }
791  }
792  }
793 
794  if(q0 != -1 && q >= q0 + int(m_MinPatternHitLength)) {
795  THitRef hr (sx_NewHit(q0, q, s0, s));
796  hr->SetQueryId(h->GetQueryId());
797  hr->SetSubjId(h->GetSubjId());
798  rv.push_back(hr);
799  ++new_hits;
800  }
801 
802  if(new_hits == 0) {
803  rv.push_back(h);
804  }
805  }
806  }
807 
808  *phitrefs = rv;
809 }
810 
811 
813 {
814  m_alnmap.clear();
815  m_pattern.clear();
816 
817  // sort the input by min query coordinate
818  typedef CHitComparator<THit> THitComparator;
819  THitComparator sorter (THitComparator::eQueryMin);
820  stable_sort(phitrefs->begin(), phitrefs->end(), sorter);
821 
822  // check that no two consecutive hits are farther than the max intron
823  // (extra short hits skipped)
824  // (throw out a hit if it intersects with previous on subject (genome), )
825  size_t prev (0);
826  TSeqPos prevSmin, prevSmax;
827  NON_CONST_ITERATE(THitRefs, ii, *phitrefs) {
828 
829  THitRef& h (*ii);
830 
831  if(h->GetQuerySpan() < m_MinPatternHitLength) {
832  h.Reset(0);
833  continue;
834  }
835 
836  if(prev > 0) {
837 
838  const bool non_intersect = ( prevSmax < h->GetSubjMin() ) || ( prevSmin > h->GetSubjMax() );
839  if(!non_intersect) {//throw out intersecting hit
840  h.Reset(0);
841  continue;
842  }
843 
844  const bool consistent (h->GetSubjStrand()?
845  (h->GetSubjStart() < prev + m_MaxIntron):
846  (h->GetSubjStart() + m_MaxIntron > prev));
847 
848  if(!consistent) {
849  const string errmsg (g_msg_CompartmentInconsistent
850  + string(" (extra long introns)"));
851  NCBI_THROW(CAlgoAlignException, eIntronTooLong, errmsg);
852  }
853  }
854 
855  prev = h->GetSubjStop();
856  prevSmin = h->GetSubjMin();
857  prevSmax = h->GetSubjMax();
858  }
859 
860  phitrefs->erase(remove_if(phitrefs->begin(), phitrefs->end(),
862  phitrefs->end());
863 
864  // save each hit longer than the minimum and test whether the hit is perfect
865  vector<size_t> pattern0;
866  vector<pair<bool,double> > imperfect;
867  double max_idty (0.0);
868  for(size_t i (0), n (phitrefs->size()); i < n; ++i) {
869 
870  const THitRef & h ((*phitrefs)[i]);
871  const bool valid (true);
872  if(valid) {
873 
874  pattern0.push_back(h->GetQueryMin());
875  pattern0.push_back(h->GetQueryMax());
876  pattern0.push_back(h->GetSubjMin());
877  pattern0.push_back(h->GetSubjMax());
878  const double idty (h->GetIdentity());
879  const bool imprf (idty < 1.00
880  || h->GetQuerySpan() != h->GetSubjSpan()
881  || h->GetMismatches() > 0
882  || h->GetGaps() > 0);
883  imperfect.push_back(pair<bool,double>(imprf, idty));
884  if(idty > max_idty) {
885  max_idty = idty;
886  }
887  }
888  }
889 
890  if(max_idty < .85 && pattern0.size() >= 4) {
891  m_BoundingRange = pair<size_t, size_t>(pattern0[2], pattern0.back());
892  }
893  else {
894  m_BoundingRange = pair<size_t, size_t>(0, 0);
895  }
896 
897  const size_t dim (pattern0.size());
898 
899  const char* Seq1 (&m_mrna.front());
900  const size_t SeqLen1 (m_polya_start < kMax_UInt? m_polya_start: m_mrna.size());
901  const char* Seq2 (&m_genomic.front());
902  const size_t SeqLen2 (m_genomic.size());
903 
904  // verify conditions on the input hit pattern
905  CNcbiOstrstream ostr_err;
906  bool some_error (false), bad_input (false);
907  if(dim % 4 == 0) {
908 
909  for(size_t i (0); i < dim; i += 4) {
910 
911  if(pattern0[i] > pattern0[i+1] || pattern0[i+2] > pattern0[i+3]) {
912  ostr_err << "Pattern hits must be specified in plus strand";
913  some_error = bad_input = true;
914  break;
915  }
916 
917  if(i > 4) {
918  if(pattern0[i] <= pattern0[i-3] || pattern0[i+2] <= pattern0[i-1]) {
920  << string(" (hits not sorted)");
921  some_error = true;
922  break;
923  }
924  }
925 
926  const bool br1 (pattern0[i+1] >= SeqLen1);
927  const bool br2 (pattern0[i+3] >= SeqLen2);
928  if(br1 || br2) {
929 
930  ostr_err << "Pattern hits out of range ("
931  << "query = "
932  << phitrefs->front()->GetQueryId()->GetSeqIdString(true)
933  << "subj = "
934  << phitrefs->front()->GetSubjId()->GetSeqIdString(true)
935  << "):" << endl;
936 
937  if(br1) {
938  ostr_err << "\tquery_pattern_max = " << pattern0[i+1]
939  << "; query_len = " << SeqLen1 << endl;
940  }
941 
942  if(br2) {
943  ostr_err << "\tsubj_pattern_max = " << pattern0[i+3]
944  << "; subj_len = " << SeqLen2 << endl;
945  }
946 
947  some_error= true;
948  break;
949  }
950  }
951 
952  }
953  else {
954  ostr_err << "Pattern dimension must be a multiple of four";
955  some_error = bad_input = true;
956  }
957 
958  if(some_error) {
959  ostr_err << " (query = "
960  << phitrefs->front()->GetQueryId()->AsFastaString()
961  << " , subj = "
962  << phitrefs->front()->GetSubjId()->AsFastaString() << ')'
963  << endl;
964  }
965 
966  const string err = CNcbiOstrstreamToString(ostr_err);
967  if(err.size() > 0) {
968  if(bad_input) {
969  NCBI_THROW(CAlgoAlignException, eBadParameter, err);
970  }
971  else {
973  }
974  }
975 
976  SAlnMapElem map_elem;
977  map_elem.m_box[0] = map_elem.m_box[2] = 0;
978  map_elem.m_pattern_start = map_elem.m_pattern_end = -1;
979 
980  // build the alignment map
981  CBandAligner nwa;
982  for(size_t i = 0; i < dim; i += 4) {
983 
984  size_t L1, R1, L2, R2;
985  size_t max_seg_size (0);
986 
987  const bool imprf (imperfect[i/4].first);
988  if(imprf) {
989 
990  // TODO:
991  // a better approach is to find indels and mismatches
992  // and split at the SplitQualifyingHits() stage
993  // to pass here only perfect diags
994  const size_t len1 (pattern0[i+1] - pattern0[i] + 1);
995  const size_t len2 (pattern0[i+3] - pattern0[i+2] + 1);
996  const size_t maxlen (max(len1, len2));
997  const size_t lendif (len1 < len2? len2 - len1: len1 - len2);
998  size_t band (size_t((1 - imperfect[i/4].second) * maxlen) + 2);
999  if(band < lendif) band += lendif;
1000  nwa.SetBand(band);
1001  nwa.SetSequences(Seq1 + pattern0[i], len1,
1002  Seq2 + pattern0[i+2], len2,
1003  false);
1004  nwa.Run();
1005  max_seg_size = nwa.GetLongestSeg(&L1, &R1, &L2, &R2);
1006  }
1007  else {
1008 
1009  L1 = 1;
1010  R1 = pattern0[i+1] - pattern0[i] - 1;
1011  L2 = 1;
1012  R2 = pattern0[i+3] - pattern0[i+2] - 1;
1013  max_seg_size = R1 - L1 + 1;
1014  }
1015 
1016  if(max_seg_size) {
1017 
1018  // make the core
1019  {{
1020  size_t cut ((1 + R1 - L1) / 5);
1021  if(cut > 20) cut = 20;
1022 
1023  const size_t l1 (L1 + cut), l2 (L2 + cut);
1024  const size_t r1 (R1 - cut), r2 (R2 - cut);
1025  if(l1 < r1 && l2 < r2) {
1026  L1 = l1; L2 = l2;
1027  R1 = r1; R2 = r2;
1028  }
1029  }}
1030 
1031  size_t q0 (pattern0[i] + L1);
1032  size_t s0 (pattern0[i+2] + L2);
1033  size_t q1 (pattern0[i] + R1);
1034  size_t s1 (pattern0[i+2] + R2);
1035 
1036  if(imprf) {
1037 
1038  const size_t hitlen_q (pattern0[i + 1] - pattern0[i] + 1);
1039  const size_t sh (size_t(hitlen_q / 4));
1040 
1041  size_t delta (sh > L1? sh - L1: 0);
1042  q0 += delta;
1043  s0 += delta;
1044 
1045  const size_t h2s_right (hitlen_q - R1 - 1);
1046  delta = sh > h2s_right? sh - h2s_right: 0;
1047  q1 -= delta;
1048  s1 -= delta;
1049 
1050  if(q0 > q1 || s0 > s1) {
1051 
1052  // the longest segment too short
1053  q0 = pattern0[i] + L1;
1054  s0 = pattern0[i+2] + L2;
1055  q1 = pattern0[i] + R1;
1056  s1 = pattern0[i+2] + R2;
1057  }
1058  }
1059 
1060  m_pattern.push_back(q0); m_pattern.push_back(q1);
1061  m_pattern.push_back(s0); m_pattern.push_back(s1);
1062 
1063  const size_t pattern_dim = m_pattern.size();
1064  if(map_elem.m_pattern_start == -1) {
1065  map_elem.m_pattern_start = pattern_dim - 4;
1066  }
1067  map_elem.m_pattern_end = pattern_dim - 1;
1068  }
1069 
1070  map_elem.m_box[1] = pattern0[i+1];
1071  map_elem.m_box[3] = pattern0[i+3];
1072  }
1073 
1074  map_elem.m_box[1] = SeqLen1 - 1;
1075  map_elem.m_box[3] = SeqLen2 - 1;
1076  m_alnmap.push_back(map_elem);
1077 }
1078 
1079 
1080 CSplign::TOrfPair CSplign::GetCds(const THit::TId& id, const vector<char> * seq_data)
1081 {
1082  TOrfPair rv (TOrf(0, 0), TOrf(0, 0));
1083 
1085  const string strid (id->AsFastaString());
1087 
1088  if(ii != ie) {
1089  rv = ii->second;
1090  }
1091  else {
1092 
1093 
1094  vector<char> seq;
1095  if(seq_data == 0) {
1096  x_LoadSequence(&seq, *id, 0, kMaxCoord, false);
1097  seq_data = & seq;
1098  }
1099 
1100  // Assign CDS to the max ORF longer than 90 bps and starting from ATG
1101  //
1102  vector<CRef<CSeq_loc> > orfs;
1103  vector<string> start_codon;
1104  start_codon.push_back("ATG");
1105 
1106  COrf::FindOrfs(*seq_data, orfs, 90, 1, start_codon);
1107  TSeqPos max_len_plus (0), max_len_minus (0);
1108  TSeqPos max_from_plus (0), max_from_minus (0);
1109  TSeqPos max_to_plus (0), max_to_minus (0);
1110  ITERATE (vector<CRef<CSeq_loc> >, orf, orfs) {
1111 
1112  const TSeqPos len (sequence::GetLength(**orf, NULL));
1113  const ENa_strand orf_strand ((*orf)->GetInt().GetStrand());
1114  const bool antisense (orf_strand == eNa_strand_minus);
1115 
1116  if(antisense) {
1117  if(len > max_len_minus) {
1118  max_len_minus = len;
1119  max_from_minus = (*orf)->GetInt().GetTo();
1120  max_to_minus = (*orf)->GetInt().GetFrom();
1121  }
1122  }
1123  else {
1124  if(len > max_len_plus) {
1125  max_len_plus = len;
1126  max_from_plus = (*orf)->GetInt().GetFrom();
1127  max_to_plus = (*orf)->GetInt().GetTo();
1128  }
1129  }
1130  }
1131 
1132  if(max_len_plus > 0) {
1133  rv.first = TOrf(max_from_plus, max_to_plus);
1134  }
1135 
1136  if(max_len_minus > 0) {
1137  rv.second = TOrf(max_from_minus, max_to_minus);
1138  }
1139 
1140  m_OrfMap[strid] = rv;
1141  }
1142 
1143  return rv;
1144 }
1145 
1146 
1148 {
1149  ac.m_Id = ++m_model_id;
1150  ac.m_Segments = m_segments;
1152  ac.m_Msg = "Ok";
1153  ac.m_Cds_start = m_cds_start;
1154  ac.m_Cds_stop = m_cds_stop;
1155  ac.m_QueryLen = m_mrna.size();
1157 }
1158 
1159 
1160 // PRE: Input Blast hits.
1161 // POST: TResults - a vector of aligned compartments.
1162 void CSplign::Run(THitRefs* phitrefs)
1163 {
1164  if(!phitrefs) {
1165  NCBI_THROW(CAlgoAlignException, eInternal, "Unexpected NULL pointers");
1166  }
1167 
1168  THitRefs& hitrefs = *phitrefs;
1169 
1170  // make sure query hit is in plus strand
1171  NON_CONST_ITERATE(THitRefs, ii, hitrefs) {
1172 
1173  THitRef& h = *ii;
1174  if(h.NotNull() && h->GetQueryStrand() == false) {
1175  h->FlipStrands();
1176  }
1177  }
1178 
1179  if(m_aligner.IsNull()) {
1181  }
1182 
1183  if(hitrefs.size() == 0) {
1185  }
1186 
1187  m_result.clear();
1188 
1189  THit::TId id_query (hitrefs.front()->GetQueryId());
1190 
1191  const THit::TCoord mrna_size (objects::sequence::GetLength(*id_query, m_Scope));
1192  if(mrna_size == kMaxCoord) {
1193  NCBI_THROW(CAlgoAlignException, eNoSeqData,
1194  string("Sequence not found: ") + id_query->AsFastaString());
1195  }
1196 
1197  // iterate through compartments
1198  const THit::TCoord min_singleton_idty_final (Convert(
1199  min(size_t(m_MinSingletonIdty * mrna_size),
1201 
1203  THit::TCoord(m_MinCompartmentIdty * mrna_size),
1204  min_singleton_idty_final,
1205  true);
1208  comps.Run(hitrefs.begin(), hitrefs.end(), GetScope());
1209  } else {
1210  comps.Run(hitrefs.begin(), hitrefs.end());
1211  }
1212 
1213  pair<size_t,size_t> dim (comps.GetCounts()); // (count_total, count_unmasked)
1214  if(dim.second > 0) {
1215 
1216  // pre-load cDNA
1217  m_mrna.clear();
1218 
1219  x_LoadSequence(&m_mrna, *id_query, 0, kMaxCoord, false);
1220 
1221  const TOrfPair orfs (GetCds(id_query, & m_mrna));
1222  if(m_strand) {
1223  m_cds_start = orfs.first.first;
1224  m_cds_stop = orfs.first.second;
1225  }
1226  else {
1227  m_cds_start = orfs.second.first;
1228  m_cds_stop = orfs.second.second;
1229  }
1230 
1231  if(!m_strand) {
1232  // make a reverse complimentary
1233  reverse (m_mrna.begin(), m_mrna.end());
1234  transform(m_mrna.begin(), m_mrna.end(), m_mrna.begin(), SCompliment());
1235 
1236  reverse (m_mrna_polya.begin(), m_mrna_polya.end());
1237  transform(m_mrna_polya.begin(), m_mrna_polya.end(), m_mrna_polya.begin(), SCompliment());
1238  }
1239 
1240  // compartments share the space between them
1241  THit::TCoord smin (0), smax (kMax_UInt);
1242  bool same_strand (false);
1243 
1244  const THit::TCoord* box (comps.GetBox(0));
1245  if(m_MaxCompsPerQuery > 0 && dim.second > m_MaxCompsPerQuery) {
1246  dim.second = m_MaxCompsPerQuery;
1247  }
1248 
1249  for(size_t i (0); i < dim.first; ++i, box += 4) {
1250 
1251  if(i + 1 == dim.first) {
1252  smax = kMax_UInt;
1253  same_strand = false;
1254  }
1255  else {
1256  bool strand_this (comps.GetStrand(i));
1257  bool strand_next (comps.GetStrand(i+1));
1258  same_strand = strand_this == strand_next;
1259  smax = same_strand? (box + 4)[2]: kMax_UInt;
1260  }
1261 
1262  try {
1263 
1264  if(smax < box[3]) {
1265  // alert if not ordered by lower subject coordinate
1266  NCBI_THROW(CAlgoAlignException, eInternal,
1267  "Unexpected order of compartments");
1268  }
1269 
1270  if(comps.GetStatus(i)) {
1271  THitRefs comp_hits;
1272  comps.Get(i, comp_hits);
1273 
1274  if(smax < box[3]) smax = box[3];
1275  if(smin > box[2]) smin = box[2];
1276 
1277  SAlignedCompartment ac (x_RunOnCompartment(&comp_hits, smin,smax));
1279  m_result.push_back(ac);
1280  }
1281  }
1282 
1283  catch(CAlgoAlignException& e) {
1284 
1285  if(e.GetSeverity() == eDiag_Fatal) {
1286  throw;
1287  }
1288 
1289  m_result.push_back(SAlignedCompartment(0, e.GetMsg().c_str()));
1290 
1291  const CException::TErrCode errcode (e.GetErrCode());
1292  if(errcode != CAlgoAlignException::eNoAlignment) {
1293  m_result.back().m_Status = SAlignedCompartment::eStatus_Error;
1294  }
1295 
1296  ++m_model_id;
1297  }
1298 
1299  smin = same_strand? box[3]: 0;
1300  }
1301  }
1302 }
1303 
1304 
1307 {
1308  const CRef<CSeq_loc> seqloc (compartment->GetBounds().front());
1309  const THit::TCoord subj_min (seqloc->GetStart(eExtreme_Positional));
1310  const THit::TCoord subj_max (seqloc->GetStop(eExtreme_Positional));
1311 
1312  THitRefs hitrefs;
1313  ITERATE(CSeq_align_set::Tdata, ii, compartment->GetSegs().GetDisc().Get()) {
1314  }
1315 
1316  return AlignSingleCompartment(&hitrefs, subj_min, subj_max, result);
1317 }
1318 
1319 
1321  THit::TCoord subj_min,
1322  THit::TCoord subj_max,
1324 {
1325  m_mrna.resize(0);
1326 
1327  THit::TId id_query (phitrefs->front()->GetQueryId());
1328 
1329  x_LoadSequence(&m_mrna, *id_query, 0, kMaxCoord, false);
1330 
1331  const TOrfPair orfs (GetCds(id_query, & m_mrna));
1332  if(m_strand) {
1333  m_cds_start = orfs.first.first;
1334  m_cds_stop = orfs.first.second;
1335  }
1336  else {
1337  m_cds_start = orfs.second.first;
1338  m_cds_stop = orfs.second.second;
1339  }
1340 
1341  if(!m_strand) {
1342  reverse (m_mrna.begin(), m_mrna.end());
1343  transform(m_mrna.begin(), m_mrna.end(), m_mrna.begin(), SCompliment());
1344 
1345  reverse (m_mrna_polya.begin(), m_mrna_polya.end());
1346  transform(m_mrna_polya.begin(), m_mrna_polya.end(), m_mrna_polya.begin(), SCompliment());
1347  }
1348 
1349  bool rv (true);
1350  try {
1351 
1352  SAlignedCompartment ac (x_RunOnCompartment(phitrefs, subj_min, subj_max));
1354  *result = ac;
1355  m_mrna.resize(0);
1356  }
1357 
1358  catch(CAlgoAlignException& e) {
1359 
1360  m_mrna.resize(0);
1361 
1362  if(e.GetSeverity() == eDiag_Fatal) {
1363  throw;
1364  }
1365 
1366  *result = SAlignedCompartment(0, e.GetMsg().c_str());
1367 
1368  const CException::TErrCode errcode (e.GetErrCode());
1369  if(errcode != CAlgoAlignException::eNoAlignment) {
1371  }
1372 
1373  ++m_model_id;
1374  rv = false;
1375  }
1376 
1377  return rv;
1378 }
1379 
1380 bool CSplign::IsPolyA(const char * seq, size_t polya_start, size_t dim) {
1381  const double kMinPercAInPolya (0.80);
1382  if( polya_start + GetMinPolyaLen() > dim ) return false;
1383  size_t cnt = 0;
1384  for(size_t i = polya_start; i<dim; ++i) {
1385  if(seq[i] == 'A') ++cnt;
1386  }
1387  if(cnt >= (dim - polya_start)*kMinPercAInPolya) return true;
1388  return false;
1389 }
1390 
1391 // naive polya detection; sense direction assumed
1392 size_t CSplign::s_TestPolyA(const char * seq, size_t dim, size_t cds_stop)
1393 {
1394  const size_t kMaxNonA (3), kMinAstreak (5);
1395  Int8 i (dim - 1), i0 (dim);
1396  for(size_t count_non_a (0), astreak (0); i >= 0 && count_non_a < kMaxNonA; --i) {
1397 
1398  if(seq[i] != 'A') {
1399  ++count_non_a;
1400  astreak = 0;
1401  }
1402  else {
1403  if(++astreak >= kMinAstreak) {
1404  i0 = i;
1405  }
1406  }
1407  }
1408 
1409  const size_t len (dim - i0);
1410  size_t rv;
1411  if(len >= kMinAstreak) {
1412  rv = i0;
1413  if(0 < cds_stop && cds_stop < dim && rv <= cds_stop) {
1414  rv = cds_stop + 1;
1415  }
1416  }
1417  else {
1418  rv = kMax_UInt;
1419  }
1420 
1421  return rv;
1422 }
1423 
1424 
1425 // PRE: Hits (initial, not transformed) representing the compartment;
1426 // maximum genomic sequence span;
1427 // pre-loaded and appropriately transformed query sequence.
1428 // POST: A set of segments packed into the aligned compartment.
1429 
1431  size_t range_left,
1432  size_t range_right)
1433 {
      // Overview of the visible steps:
      //  - mirror the hits when the query is antisense (m_strand == false);
      //  - detect the poly-A tail and cleave off hits beyond it;
      //  - drop short terminal hits;
      //  - estimate how far the genomic span extends beyond the hits,
      //    clamp it to [range_left, range_right] and to non-bridgeable gaps;
      //  - load the genomic sequence, bring the hits to local plus-strand
      //    coordinates and run x_Run();
      //  - extend the last exon, then classify the remaining query tail as
      //    poly-A or as a gap;
      //  - reject the result if the matched residue count is too low;
      //  - convert the segment coordinates back to the original ones.
      //
      // NOTE(review): the first line of the signature and the declaration of
      // the result object `rv` are not visible in this listing. Most throw
      // statements are missing as well, so several `if` bodies below look empty.
1435 
1436  try {
1437  m_segments.clear();
1438 
1439  if(range_left > range_right) {
      // NOTE(review): body (presumably a throw) missing from this listing
1441  }
1442 
1443  if(phitrefs->size() == 0) {
      // NOTE(review): body (presumably a throw) missing from this listing
1445  }
1446 
1447  const TSeqPos mrna_size (Convert(m_mrna.size()));
1448 
1449  if(m_strand == false) {
1450 
1451  // adjust the hits
      // mirror each query interval onto the reverse-complemented query
      // (old min becomes the new stop, old max the new start) and flip
      // the subject strand
1452  for(size_t i (0), n (phitrefs->size()); i < n; ++i) {
1453 
1454  THitRef& h ((*phitrefs)[i]);
1455  THit::TCoord a0 (mrna_size - h->GetQueryMin() - 1);
1456  THit::TCoord a1 (mrna_size - h->GetQueryMax() - 1);
1457  const bool new_strand (!(h->GetSubjStrand()));
1458  h->SetQueryStart(a1);
1459  h->SetQueryStop(a0);
1460  h->SetSubjStrand(new_strand);
1461  }
1462  }
1463 
      // NOTE(review): the beginning of the statement below is missing; it
      // apparently selects between kMax_UInt (no tail) and the result of
      // s_TestPolyA() and, judging by the use right after, stores it in
      // m_polya_start - confirm against the original file.
1465  kMax_UInt:
1466  s_TestPolyA(&m_mrna_polya.front(), m_mrna_polya.size(), m_cds_stop));
1467 
1468  // cleave off hits beyond polya
1469  if(m_polya_start < kMax_UInt) {
1470  CleaveOffByTail(phitrefs, m_polya_start);
1471  }
1472 
1473  // keep short terminal hits out of the pattern
      // ii walks in from the left, jj from the right; a terminal hit is
      // reset when its query span is below m_MinPatternHitLength, or below
      // twice that value when its identity is under .9999. Each side stops
      // at the first hit that is long enough (b0 / b1 become false).
1474  THitRefs::iterator ii (phitrefs->begin()), jj (phitrefs->end() - 1);
1475  const size_t min_termhitlen1 (m_MinPatternHitLength);
1476  const size_t min_termhitlen2 (2*m_MinPatternHitLength);
1477  bool b0 (true), b1 (true);
1478  while(b0 && b1 && ii < jj) {
1479 
      // skip hits that are already null
1480  while(ii->IsNull() && ii < jj) ++ii;
1481  while(jj->IsNull() && ii < jj) --jj;
1482 
1483  if(ii < jj) {
1484 
1485  const double hit_idty ((*ii)->GetIdentity());
1486  const size_t min_termhitlen (
1487  hit_idty < .9999? min_termhitlen2: min_termhitlen1);
1488 
1489  if((*ii)->GetQuerySpan() < min_termhitlen) {
1490  ii++ -> Reset(0);
1491  }
1492  else {
1493  b0 = false;
1494  }
1495  }
1496 
1497  if(ii < jj) {
1498 
1499  const double hit_idty ((*jj)->GetIdentity());
1500  const size_t min_termhitlen (
1501  hit_idty < .9999? min_termhitlen2: min_termhitlen1);
1502 
1503  if((*jj)->GetQuerySpan() < min_termhitlen) {
1504  jj-- -> Reset(0);
1505  }
1506  else {
1507  b1 = false;
1508  }
1509  }
1510  }
1511 
      // compact the vector; the remove_if predicate is on a line missing
      // from this listing (presumably it selects the hits reset above)
1512  phitrefs->erase(remove_if(phitrefs->begin(), phitrefs->end(),
1514  phitrefs->end());
1515 
1516  if(phitrefs->size() == 0) {
      // NOTE(review): body (presumably a throw) missing from this listing
1519  }
1520 
1521 
1522  // find regions of interest on mRna (query) and contig (subj)
1523  THit::TCoord span [4];
1524  CHitFilter<THit>::s_GetSpan(*phitrefs, span);
1525  THit::TCoord qmin (span[0]), qmax (span[1]), smin (span[2]), smax (span[3]);
1526 
1527  const bool ctg_strand (phitrefs->front()->GetSubjStrand());
1528 
1529  // m1: estimate terminal genomic extents based on uncovered end sizes
1530  const THit::TCoord extent_left_m1 (Convert(x_GetGenomicExtent(qmin)));
      // rspace: query residues to the right of the hits, not counting the
      // poly-A tail when one was detected
1531  const THit::TCoord rspace ((m_polya_start < kMax_UInt?
1532  m_polya_start: mrna_size) - qmax - 1 );
1533  const THit::TCoord extent_right_m1 (Convert(x_GetGenomicExtent(rspace)));
1534 
1535  // m2: estimate genomic extents using compartment hits
1536  THit::TCoord fixed_left (kMaxCoord / 2), fixed_right(fixed_left);
1537 
      // m2 only replaces the default on a side where at most
      // kTermLenCutOff_m2 query residues are left uncovered
1538  const size_t kTermLenCutOff_m2 (10);
1539  const bool fix_left (qmin <= kTermLenCutOff_m2);
1540  const bool fix_right (rspace <= kTermLenCutOff_m2);
1541  if(fix_left || fix_right) {
1542 
1543  if(phitrefs->size() > 1) {
1544 
1545  // select based on the max intron length
      // "intron" here is the distance between the subject starts of
      // consecutive hits
1546  THit::TCoord max_intron (0);
1547  THit::TCoord prev_start (phitrefs->front()->GetSubjStart());
1548 
1549  ITERATE(THitRefs, ii, (*phitrefs)) {
1550 
1551  const THit::TCoord cur_start ((*ii)->GetSubjStart());
1552  const THit::TCoord intron (cur_start >= prev_start?
1553  cur_start - prev_start:
1554  prev_start - cur_start);
1555  if(intron > max_intron) {
1556  max_intron = intron;
1557  }
1558  prev_start = cur_start;
1559  }
1560 
1561  const double factor (2.5);
1562  if(fix_left) { fixed_left = THit::TCoord(max_intron * factor); }
1563  if(fix_right) { fixed_right = THit::TCoord(max_intron * factor); }
1564  }
1565  else {
1566  // stay conservative for single-hit compartments
1567  const THit::TCoord single_hit_extent (300);
1568  if(fix_left) { fixed_left = single_hit_extent; }
1569  if(fix_right) { fixed_right = single_hit_extent; }
1570  }
1571  }
1572 
1573  const THit::TCoord extent_left_m2 (100 + max(fixed_left, qmin));
1574  const THit::TCoord extent_right_m2 (100 + max(fixed_right, rspace));
1575 
      // the final extent on each side is the smaller of the m1 and m2 estimates
1576  const THit::TCoord extent_left (min(extent_left_m1, extent_left_m2));
1577  THit::TCoord extent_right (min(extent_right_m1, extent_right_m2));
1578 
1579  //add polya length to extent
1580  THit::TCoord poly_length = m_polya_start < kMax_UInt ? mrna_size - m_polya_start : 0;
1581  if(extent_right < poly_length) extent_right = poly_length;
1582 
      // widen the subject span; on the minus strand the query-left extent
      // applies to the high subject coordinates and vice versa.
      // NOTE(review): the clamp to zero goes through an int cast of the
      // difference, so it presumably relies on coordinates fitting into int -
      // confirm for very large subject coordinates.
1583  if(ctg_strand) {
1584  smin = max(0, int(smin - extent_left));
1585  smax += extent_right;
1586  }
1587  else {
1588  smin = max(0, int(smin - extent_right));
1589  smax += extent_left;
1590  }
1591 
1592  // regardless of hits, entire cDNA is aligned (without the tail, if any)
1593  qmin = 0;
1594  qmax = m_polya_start < kMax_UInt? m_polya_start - 1: mrna_size - 1;
1595 
1596  // make sure to obey the genomic range specified
1597  if(smin < range_left) {
1598  smin = range_left;
1599  }
1600  if(smax > range_right) {
1601  smax = range_right;
1602  }
1603 
1604  //prohibit extension from going over non-bridgeable gaps
1605  if(phitrefs->size() > 1) {
1606  THit::TId id_query (phitrefs->front()->GetSubjId());
1607  CRef<CSeq_id> tmp_id(new CSeq_id());
1608  tmp_id->Assign(*id_query);
      // span[2] / span[3]: subject min / max of the hits as computed above
1609  TSeqPos hitmin(span[2]);
1610  TSeqPos hitmax(span[3]);
1611 
1612  //left
1613  if(hitmin > smin) {
1614  CSeq_loc tmp_loc(*tmp_id, smin, hitmin - 1, eNa_strand_plus);
      // NOTE(review): the line creating `smap` is missing from this
      // listing; presumably it is a sequence map built from tmp_loc.
1616  if(smap) {
1617  TSeqPos tmplen = hitmin - smin;
1618  CSeqMap_CI smit = smap->ResolvedRangeIterator(GetScope(), 0, tmplen, eNa_strand_plus, size_t(-1), CSeqMap::fFindGap);
1619  for(;smit; ++smit) {
1620  if(smit.GetType() == CSeqMap::eSeqGap) {
      // NOTE(review): the line obtaining `slit` (the gap literal) is
      // missing from this listing.
1622  if(slit && slit->GetBridgeability() == CSeq_literal::e_NotBridgeable) {
      // move smin to just past the non-bridgeable gap.
      // NOTE(review): smin is advanced cumulatively while `pos`
      // appears to be measured from the start of the scanned
      // region; with more than one such gap this may overshoot
      // (only guarded by the debug assertion) - confirm.
1623  TSeqPos pos = smit.GetEndPosition();//exclusive
1624  _ASSERT( smin + pos <= hitmin );
1625  smin += pos;
1626  }
1627  }
1628  }
1629  }
1630  }
1631  //right
1632  if(smax > hitmax) {
1633  CSeq_loc tmp_loc(*tmp_id, hitmax + 1, smax, eNa_strand_plus);
      // NOTE(review): as on the left side, the line creating `smap` is
      // missing from this listing.
1635  if(smap) {
1636  TSeqPos tmplen = smax - hitmax;
1637  CSeqMap_CI smit = smap->ResolvedRangeIterator(GetScope(), 0, tmplen, eNa_strand_plus, size_t(-1), CSeqMap::fFindGap);
1638  for(;smit; ++smit) {
1639  if(smit.GetType() == CSeqMap::eSeqGap) {
      // NOTE(review): the line obtaining `slit` is missing here too.
1641  if(slit && slit->GetBridgeability() == CSeq_literal::e_NotBridgeable) {
      // pull smax back to the non-bridgeable gap.
      // NOTE(review): the loop does not stop at the first such
      // gap, so a later gap would assign a larger smax again
      // (only guarded by the debug assertion) - confirm.
1642  TSeqPos pos = smit.GetPosition();
1643  _ASSERT( hitmax + pos < smax );
1644  smax = hitmax + pos;
1645  }
1646  }
1647  }
1648  }
1649  }
1650 
1651  }
1652 
1653 
1654  m_genomic.clear();
1655  x_LoadSequence(&m_genomic, *(phitrefs->front()->GetSubjId()),
1656  smin, smax, true, true, ctg_strand);
1657 
1658  // adjust smax if beyond the end
1659  THit::TCoord genomic_size = Convert(m_genomic.size());
1660  const THit::TCoord ctg_end (smin + genomic_size);
1661  if(smax >= ctg_end) {
1662  smax = ctg_end > 0? ctg_end - 1: 0;
1663  }
1664 
1665  if(ctg_strand == false) {
1666 
1667  // make reverse complementary
1668  // for the contig's area of interest
1669  reverse (m_genomic.begin(), m_genomic.end());
1670  transform(m_genomic.begin(), m_genomic.end(), m_genomic.begin(),
1671  SCompliment());
1672  }
1673 
      // every hit must lie inside [smin, smax]; on the minus strand its
      // subject interval is then mirrored within that range
1674  NON_CONST_ITERATE(THitRefs, ii, *phitrefs) {
1675 
1676  THitRef& h (*ii);
1677 
1678  const THit::TCoord hsmin (h->GetSubjMin());
1679  const THit::TCoord hsmax (h->GetSubjMax());
1680  if(!(smin <= hsmin && hsmax <= smax)) {
      // the message reports the range in 1-based coordinates
1681  CNcbiOstrstream ostr;
1682  ostr << "\nOne of compartment hits:\n" << *h
1683  << "\n goes outside the genome range = (" << smin+1 << ", " << smax+1 << ')'
1684  <<"\n allowed for the compartment";
1685  const string errmsg = CNcbiOstrstreamToString(ostr);
      // NOTE(review): the statement using errmsg (presumably a throw) is
      // missing from this listing.
1687  }
1688 
1689  if(ctg_strand == false) {
1690 
1691  THit::TCoord a2 (smax - (hsmax - smin));
1692  THit::TCoord a3 (smax - (hsmin - smin));
1693  h->SetSubjStart(a2);
1694  h->SetSubjStop(a3);
1695  }
1696  }
1697 
1698  rv.m_QueryStrand = m_strand;
1699  rv.m_SubjStrand = ctg_strand;
1700 
1701  // shift hits so that they originate from zero
1702  NON_CONST_ITERATE(THitRefs, ii, *phitrefs) {
1703  (*ii)->Shift(-(Int4)qmin, -(Int4)smin);
1704  }
1705 
1706  x_SplitQualifyingHits(phitrefs);
1707  x_SetPattern(phitrefs);
1708  rv.m_Score = x_Run(&m_mrna.front(), &m_genomic.front());
1709 
1710  const size_t seg_dim (m_segments.size());
1711  if(seg_dim == 0) {
      // NOTE(review): body (presumably a throw) missing from this listing
1713  }
1714 
1715  //trim holes to codons
1716  if( GetTrimToCodons() ) {
1717  CSplignTrim trim(&m_genomic.front(), (int)m_genomic.size(), m_aligner, m_MaxPartExonIdentDrop);
      // NOTE(review): the call that uses `trim` is missing from this listing
1719  }
1720 
1721  //look for the last exon
1722  Int8 last_exon = -1;
1723  for(Int8 i = m_segments.size(); i > 0; ) {
1724  --i;
1725  if(m_segments[i].m_exon) {
1726  last_exon = i;
1727  break;
1728  }
1729  }
1730 
1731  if(last_exon == -1) {//no exons found
      // NOTE(review): body (presumably a throw) missing from this listing
1733  }
1734 
1735 
1736  // try to extend the last exon as long as it's a good match (min exon identity at the end required)
1737  TSegment& s (const_cast<TSegment&>(m_segments[last_exon]));
1738 
      // p / q walk the query and the genomic sequence in parallel, starting
      // right after the current end of the last exon
1739  const char* p0 = &m_mrna.front() + s.m_box[1] + 1;
1740  const char* q0 = &m_genomic.front() + s.m_box[3] + 1;
1741  const char* p = p0;
1742  const char* q = q0;
1743  const char* pe = &m_mrna.front() + mrna_size;
1744  const char* qe = &m_genomic.front() + m_genomic.size();
1745 
      // sh = length of the longest ungapped extension that ends on a match
      // and whose match ratio is at least GetMinExonIdentity(); a query 'N'
      // never counts as a match
1746  int match_num = 0;
1747  size_t sh = 0, ct =0;
1748  for(; p < pe && q < qe; ++p, ++q, ++ct) {
1749  if(toupper(*p) != 'N' && *p == *q) {
1750  ++match_num;
1751  if( match_num >= (ct+1)*GetMinExonIdentity() ) { // % ident
1752  sh = ct+1;
1753  }
1754  }
1755  }
1756 
1757  // cut low identity flank region in extension
      // scan the extension backwards from its end; at each mismatch, if the
      // matches counted since the last cut fall below ct*kMinExonFlankIdty,
      // the extension is shortened to end before that mismatch and the
      // counters restart
1758  const double kMinExonFlankIdty (GetPolyaExtIdentity());
1759  if(sh) {
1760  p = p0+(sh-1);
1761  q = q0+(sh-1);
1762  ct = 1;
1763  match_num = 0;
1764  for(;p>=p0;--p,--q,++ct) {
1765  if(toupper(*p) != 'N' && *p == *q) {
1766  ++match_num;
1767  } else {
1768  if( match_num < ct*kMinExonFlankIdty) {//cut flank
1769  sh = p - p0;
1770  ct = 1;
1771  match_num = 0;
1772  }
1773  }
1774  }
1775  }
1776 
1777  if(sh) {
1778  // resize
1779  s.m_box[1] += sh;
1780  s.m_box[3] += sh;
      // extend the transcript: 'M' for a match, 'R' for a replacement
1781  for(ct = 0,p = p0, q = q0; ct < sh; ++p, ++q, ++ct) {
1782  if(toupper(*p) != 'N' && *p == *q) {
1783  s.m_details.append(1, 'M');
1784  } else {
1785  s.m_details.append(1, 'R');
1786  }
1787  }
1788  s.Update(m_aligner);
1789 
1790  // fix annotation
      // when the annotation ends with '>' plus two characters, replace those
      // two with the genomic characters following the extended exon (or
      // blanks past the end of the genomic sequence)
1791  const size_t ann_dim = s.m_annot.size();
1792  if(ann_dim > 2 && s.m_annot[ann_dim - 3] == '>') {
1793  s.m_annot[ann_dim - 2] = q < qe? *q: ' ';
1794  s.m_annot[ann_dim - 1] = q < (qe-1)? *(q+1): ' ';
1795  }
1796  }
1797 
      // drop every segment that follows the last exon
1798  m_segments.resize(last_exon + 1);
1799 
1800  //check if the rest is polya or a gap
1801  THit::TCoord coord = Convert(s.m_box[1]);
1802  ++coord;
      // NOTE(review): one statement is missing from this listing here
1804  if(coord < mrna_size ) {//there is unaligned flanking part of mRNA
1805  if(!m_nopolya && IsPolyA(&m_mrna_polya.front(), coord, m_mrna_polya.size())) {//polya
1806  m_polya_start = coord;
1807  } else {//gap
      // NOTE(review): one statement is missing from this listing here
1809  if( ( (int)mrna_size - (int)s.m_box[1] - 1 ) >= kFlankExonProx &&
1810  ! x_IsInGap( s.m_box[3] + 1) ) {//a sequence gap, but not a genomic gap, cut to splice
      // walk the transcript backwards, looking for a matched position
      // that is immediately followed by "GT" in the genomic sequence
1811  int seq1_pos = (int)s.m_box[1];
1812  int seq2_pos = (int)s.m_box[3];
1813  size_t det_pos = s.m_details.size() - 1;
      // NOTE(review): size_t arithmetic - if det_pos is smaller than
      // kMaxCutToSplice this presumably wraps around, which would make
      // the loop condition below false from the start; confirm.
1814  size_t min_det_pos = det_pos - kMaxCutToSplice;
1815  int min_pos = (int)s.m_box[0] + 8;//exon should not be too short
1816  while(seq1_pos >= min_pos && det_pos >= min_det_pos) {
1817  if( (size_t)(seq2_pos + 2) < m_genomic.size() && s.m_details[det_pos] == 'M' &&
1818  toupper(m_genomic[seq2_pos+1]) == 'G' && toupper(m_genomic[seq2_pos+2]) == 'T' ) {//GT point
      // a GT right after the current exon end needs no cut, hence
      // the check below; the search stops at the first GT either way
1819  if( det_pos + 1 < s.m_details.size() ) {//resize
1820  s.m_box[1] = seq1_pos;
1821  s.m_box[3] = seq2_pos;
1822  s.m_details.resize(det_pos + 1);
1823  s.Update(m_aligner);
1824  // update the last two annotation symbols
1825  size_t adim = s.m_annot.size();
1826  if(adim > 0 && s.m_annot[adim-1] == '>') {
1827  s.m_annot += "GT";
1828  } else if(adim > 2 && s.m_annot[adim-3] == '>') {
1829  s.m_annot[adim-2] = 'G';
1830  s.m_annot[adim-1] = 'T';
1831  }
1832  coord = seq1_pos+1;
1833  }
1834  break;
1835  }
      // step back one transcript symbol: M and R consume both
      // sequences, I only the genomic one, D only the query
1836  switch(s.m_details[det_pos]) {
1837  case 'M' :
1838  --seq1_pos;
1839  --seq2_pos;
1840  break;
1841  case 'R' :
1842  --seq1_pos;
1843  --seq2_pos;
1844  break;
1845  case 'I' :
1846  --seq2_pos;
1847  break;
1848  case 'D' :
1849  --seq1_pos;
1850  break;
1851  }
1852  --det_pos;
1853  }
1854  }
1855  }
1856 
      // a trailing gap segment is appended on both the poly-A and the gap path
1857  TSegment ss;
1858  ss.m_box[0] = coord;
1859  ss.m_box[1] = mrna_size - 1;
1860  ss.SetToGap();
1861  m_segments.push_back(ss);
1862  }
1863  }
1864 
1865 
1866 
1867  // scratch it if the total coverage is too low
      // mcount accumulates identity * length over the exons
1868  double mcount (0);
1869  ITERATE(TSegments, jj, m_segments) {
1870  if(jj->m_exon) {
1871  mcount += jj->m_idty * jj->m_len;
1872  }
1873  }
1874 
      // the threshold is the smaller of the relative limit
      // (m_MinSingletonIdty * qmax) and the absolute one (m_MinSingletonIdtyBps)
1875  const size_t min_singleton_idty_final (
1876  min(size_t(m_MinSingletonIdty * qmax), m_MinSingletonIdtyBps));
1877 
1878  if(mcount < min_singleton_idty_final) {
      // NOTE(review): body (presumably a throw) missing from this listing
1880  }
1881 
1882  // convert coordinates back to original
      // NOTE(review): the loop header is missing from this listing;
      // presumably it iterates `jj` over m_segments.
1884 
1885  if(rv.m_QueryStrand) {
1886  jj->m_box[0] += qmin;
1887  jj->m_box[1] += qmin;
1888  }
1889  else {
1890  jj->m_box[0] = mrna_size - jj->m_box[0] - 1;
1891  jj->m_box[1] = mrna_size - jj->m_box[1] - 1;
1892  }
1893 
      // subject coordinates are converted for exons only
1894  if(jj->m_exon) {
1895  if(rv.m_SubjStrand) {
1896  jj->m_box[2] += smin;
1897  jj->m_box[3] += smin;
1898  }
1899  else {
1900  jj->m_box[2] = smax - jj->m_box[2];
1901  jj->m_box[3] = smax - jj->m_box[3];
1902  }
1903  }
1904  }
1905 
      // antisense query: mirror the poly-A start back as well
1906  if(!rv.m_QueryStrand && m_polya_start > 0 && m_polya_start < mrna_size) {
1907  m_polya_start = mrna_size - m_polya_start - 1;
1908  }
1909  } // try
1910 
1911  catch(CAlgoAlignException& e) {
1912 
      // the exception is always re-thrown; `severe` only controls an extra
      // action whose statement is missing from this listing
1913  const CException::TErrCode errcode (e.GetErrCode());
1914  bool severe (true);
1915  switch(errcode) {
      // NOTE(review): the case labels are missing from this listing
1920  // case CAlgoAlignException::ePattern:
1921  severe = false;
1922  break;
1923  }
1924 
1925  if(severe) {
      // NOTE(review): body missing from this listing
1927  }
1928  throw;
1929  }
1930 
1931  return rv;
1932 }
1933 
1934 // at this level and below, plus strand is assumed for both sequences
1935 float CSplign::x_Run(const char* Seq1, const char* Seq2)
1936 {
1937  typedef TSegments TSegmentVector;
1938  TSegmentVector segments;
1939 
1940 //#define DBG_DUMP_PATTERN
1941 #ifdef DBG_DUMP_PATTERN
1942  cerr << "Pattern:" << endl;
1943 #endif
1944 
1945  const size_t map_dim (m_alnmap.size());
1946  if(map_dim != 1) {
1947  NCBI_THROW(CAlgoAlignException, eInternal, "Multiple maps not supported");
1948  }
1949 
1950  float rv (0);
1951  size_t cds_start (0), cds_stop (0);
1952  for(size_t i (0); i < map_dim; ++i) {
1953 
1954  const SAlnMapElem& zone (m_alnmap[i]);
1955 
1956  // setup sequences
1957  const size_t len1 (zone.m_box[1] - zone.m_box[0] + 1);
1958  const size_t len2 (zone.m_box[3] - zone.m_box[2] + 1);
1959 
1960  // remap cds if antisense
1961  if(m_strand) {
1962  cds_start = m_cds_start;
1963  cds_stop = m_cds_stop;
1964  }
1965  else {
1966  cds_start = len1 - m_cds_start - 1;
1967  cds_stop = len1 - m_cds_stop - 1;
1968  }
1969 
1970  m_aligner->SetSequences(Seq1 + zone.m_box[0], len1,
1971  Seq2 + zone.m_box[2], len2,
1972  false);
1973 
1974  // prepare the pattern
1975  vector<size_t> pattern;
1976  if(m_pattern.size() > 0) {
1977 
1978  if(zone.m_pattern_start < 0) {
1979  NCBI_THROW(CAlgoAlignException, eInternal,
1980  "CSplign::x_Run(): Invalid alignment pattern");
1981  }
1982 
1983  copy(m_pattern.begin() + zone.m_pattern_start,
1984  m_pattern.begin() + zone.m_pattern_end + 1,
1985  back_inserter(pattern));
1986  }
1987 
1988  for(size_t j (0), pt_dim (pattern.size()); j < pt_dim; j += 4) {
1989 
1990 #ifdef DBG_DUMP_PATTERN
1991  cerr << (1 + pattern[j]) << '\t' << (1 + pattern[j+1]) << '\t'
1992  << "(len = " << (pattern[j+1] - pattern[j] + 1) << ")\t"
1993  << (1 + pattern[j+2]) << '\t' << (1 + pattern[j+3])
1994  << "(len = " << (pattern[j+3] - pattern[j+2] + 1) << ")\t"
1995  << endl;
1996 #undef DBG_DUMP_PATTERN
1997 #endif
1998 
1999  pattern[j] -= zone.m_box[0];
2000  pattern[j+1] -= zone.m_box[0];
2001  pattern[j+2] -= zone.m_box[2];
2002  pattern[j+3] -= zone.m_box[2];
2003  }
2004 
2005  // setup the aligner
2006  m_aligner->SetPattern(pattern);
2007  m_aligner->SetEndSpaceFree(true, true, true, true);
2008  m_aligner->SetCDS(cds_start, cds_stop);
2009 
2010  rv += m_aligner->Run();
2011  m_aligner->CheckPreferences();
2012 
2013 // #define DBG_DUMP_TYPE2
2014 #ifdef DBG_DUMP_TYPE2
2015  {{
2016  CNWFormatter fmt (*m_aligner);
2017  string txt;
2019  cerr << txt;
2020  }}
2021 #undef DBG_DUMP_TYPE2
2022 #endif
2023 
2024  CNWFormatter formatter (*m_aligner);
2025  formatter.MakeSegments(&segments);
2026 
2027  // append a gap
2028  if(i + 1 < map_dim) {
2029  segments.push_back(TSegment());
2030  TSegment& g (segments.back());
2031  g.m_box[0] = zone.m_box[1] + 1;
2032  g.m_box[1] = m_alnmap[i+1].m_box[0] - 1;
2033  g.m_box[2] = zone.m_box[3] + 1;
2034  g.m_box[3] = m_alnmap[i+1].m_box[2] - 1;
2035  g.SetToGap();
2036  }
2037  } // zone iterations end
2038 
2039 
2040 //#define DUMP_ORIG_SEGS
2041 #ifdef DUMP_ORIG_SEGS
2042  cerr << "Orig segments:" << endl;
2043  ITERATE(TSegmentVector, ii, segments) {
2044  cerr << ii->m_exon << '\t' << ii->m_idty << '\t' << ii->m_len << '\t'
2045  << ii->m_box[0] << '\t' << ii->m_box[1] << '\t'
2046  << ii->m_box[2] << '\t' << ii->m_box[3] << '\t'
2047  << ii->m_annot << '\t' << ii->m_score << endl;
2048  }
2049 #endif
2050 
2051  if(segments.size() == 0) {
2053  }
2054 
2055  // segment-level postprocessing
2056 
2057  const size_t SeqLen2 (m_genomic.size());
2058  const size_t SeqLen1 (m_polya_start == kMax_UInt?
2059  m_mrna.size():
2060  m_polya_start);
2061 
2062  // if the limiting range is set, clear all segments beyond the range
2063  if(m_BoundingRange.second > 0) {
2064 
2065  NON_CONST_ITERATE(TSegmentVector, ii, segments) {
2066  if(ii->m_exon &&
2067  (ii->m_box[3] < m_BoundingRange.first
2068  || ii->m_box[2] > m_BoundingRange.second))
2069  {
2070  ii->SetToGap();
2071  }
2072  }
2073  }
2074 
2075  m_segments.resize(0);
2076 
2077  /// postprocessing starts here
2078 
2079  bool is_test = false;
2080  bool is_test_plus = false;
2081  if( GetTestType() == kTestType_20_28 ) {
2082  is_test = true;
2083  } else if( GetTestType() == kTestType_20_28_plus ) {
2084  is_test = true;
2085  is_test_plus = true;
2086  }
2087 
2088  CSplignTrim trim(&m_genomic.front(), (int)m_genomic.size(), m_aligner, m_MaxPartExonIdentDrop);
2089 
2090  //partial trimming near sequence gaps
2091  {{
2092  if (is_test) { // test mode
2093  bool first = true;
2094  TSegmentVector::iterator prev;
2095  NON_CONST_ITERATE(TSegmentVector, ii, segments) {
2096  if(ii->m_exon == false) continue;
2097  if(first) {
2098  first = false;
2099  } else {
2100  if(prev->m_exon) {//if not exon, it will be trimmed later anyway
2101  /* old logic
2102  TSeqPos from = prev->m_box[3];
2103  TSeqPos length = ii->m_box[2] - prev->m_box[3] + 1;
2104  if(m_GenomicSeqMap && m_GenomicSeqMap->ResolvedRangeIterator(GetScope(), from, length, eNa_strand_plus, size_t(-1), CSeqMap::fFindGap)) { //gap, trim.
2105 
2106  // TEST OUTPUT
2107  CSeqMap_CI smit = m_GenomicSeqMap->ResolvedRangeIterator(GetScope(), from, length, eNa_strand_plus, size_t(-1), CSeqMap::fFindGap);
2108  CConstRef<CSeq_literal> slit = smit.GetRefGapLiteral();
2109  string type = "not_gap";
2110  if(smit.GetType() == CSeqMap::eSeqGap) type = "gap";
2111  cout<<"Type: "<<type;
2112  if(slit) {
2113  cout<<" Position: "<<smit.GetPosition()+1;
2114  cout<<" Length: "<<smit.GetLength();
2115  cout<<" End Position: "<<smit.GetEndPosition()+1;
2116  }
2117  cout<<endl;
2118 
2119  if(is_test_plus) {
2120  trim.ImproveFromRight(*prev);
2121  trim.ImproveFromLeft(*ii);
2122  } else {
2123  prev->ImproveFromRight1(Seq1, Seq2, m_aligner);
2124  ii->ImproveFromLeft1(Seq1, Seq2, m_aligner);
2125  }
2126  */
2127 
2128  //trim only if one exon abuts a gap, and the other one not
2129  bool prev_abuts_gap = x_IsInGap(prev->m_box[3] + 1);
2130  bool abuts_gap = x_IsInGap( ii->m_box[2] - 1 );
2131  if( abuts_gap && !prev_abuts_gap ) trim.ImproveFromRight(*prev);
2132  if( !abuts_gap && prev_abuts_gap ) trim.ImproveFromLeft(*ii);
2133 
2134  //add an alignment gap if needed
2135  if( ii->m_box[0] > prev->m_box[1] + 1) {
2136  TSegment sgap;
2137  sgap.m_box[0] = prev->m_box[1] + 1;
2138  sgap.m_box[2] = prev->m_box[3] + 1;
2139  sgap.m_box[1] = ii->m_box[0] - 1;
2140  sgap.m_box[3] = ii->m_box[2] - 1;
2141  sgap.SetToGap();
2142  ii = segments.insert(ii, sgap);
2143  ++ii;
2144  }
2145  //} end of old logic
2146  }
2147  }
2148  prev = ii;
2149  }
2150  if(first) NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);//no exons found
2151  }
2152  }}
2153 
2154  /// trimming iterations
2155  while(true) {
2156 
2157  bool continue_iterations = false;
2158 
2159  if(m_segments.size() > 0) {
2160  segments.resize(m_segments.size());
2161  copy(m_segments.begin(), m_segments.end(), segments.begin());
2162  m_segments.resize(0);
2163  }
2164 
2165  if(segments.size() == 0) {
2166  return 0;
2167  }
2168 
2169  size_t exon_count0 (0);
2170  ITERATE(TSegmentVector, ii, segments) {
2171  if(ii->m_exon) ++exon_count0;
2172  }
2173 
2174  //extend 100% near gaps and flanks
2175  if ( is_test ) { // test mode
2176  bool first_exon = true;
2177  TSegment *last_exon = NULL;
2178  for(size_t k0 = 0; k0 < segments.size(); ++k0) {
2179  TSegment& s = segments[k0];
2180  Int8 ext_len = 0;
2181  Int8 ext_max = 0;
2182  if(s.m_exon) {
2183  if(first_exon) {
2184  first_exon = false;
2185  ext_len = s.CanExtendLeft(m_mrna, m_genomic);
2186  s.ExtendLeft(m_mrna, m_genomic, ext_len, m_aligner);
2187  } else if( !segments[k0-1].m_exon ) {//extend near gap
2188  //extend previous exon to right
2189  ext_len = last_exon->CanExtendRight(m_mrna, m_genomic);
2190  //exons should not intersect
2191  ext_max = min(s.m_box[0] - last_exon->m_box[1], s.m_box[2] - last_exon->m_box[3]) - 1;
2192  last_exon->ExtendRight(m_mrna, m_genomic, min(ext_len, ext_max), m_aligner);
2193  //extend current exon to left
2194  ext_len = s.CanExtendLeft(m_mrna, m_genomic);
2195  ext_max = min(s.m_box[0] - last_exon->m_box[1], s.m_box[2] - last_exon->m_box[3]) - 1;
2196  s.ExtendLeft(m_mrna, m_genomic, min(ext_len, ext_max), m_aligner);
2197  }
2198  last_exon = &s;
2199  }
2200  }
2201  if(last_exon == NULL) {
2202  NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);//no exons
2203  } else {
2204  int ext_len = last_exon->CanExtendRight(m_mrna, m_genomic);
2205  last_exon->ExtendRight(m_mrna, m_genomic, ext_len, m_aligner);
2206  }
2207 
2208  //extension sometimes leads to gap dissapearance
2209  //remove/correct gaps after extension of exons
2210  {{
2211  TSegmentVector tmp_segments;
2212  TSegment g;
2213  g.SetToGap();
2214  int prev_exon_index = -1;
2215  for(size_t k0 = 0; k0 < segments.size(); ++k0) {
2216  if(segments[k0].m_exon) {
2217  if(prev_exon_index == -1) {//first exon
2218  if(segments[k0].m_box[0] > 0) {//gap at the beginning
2219  g.m_box[0] = 0;
2220  g.m_box[1] = segments[k0].m_box[0] - 1;
2221  g.m_box[2] = 0;
2222  g.m_box[3] = segments[k0].m_box[2] - 1;
2223  g.m_len = g.m_box[1] - g.m_box[0] + 1;
2224  tmp_segments.push_back(g);
2225  }
2226 
2227  } else {
2228  if( segments[prev_exon_index].m_box[1] + 1 < segments[k0].m_box[0] ) {// is there a gap?
2229  g.m_box[0] = segments[prev_exon_index].m_box[1] + 1;
2230  g.m_box[1] = segments[k0].m_box[0] - 1;
2231  g.m_box[2] = segments[prev_exon_index].m_box[3] + 1;
2232  g.m_box[3] = segments[k0].m_box[2] - 1;
2233  g.m_len = g.m_box[1] - g.m_box[0] + 1;
2234  tmp_segments.push_back(g);
2235  }
2236  }
2237  prev_exon_index = (int)k0;
2238  tmp_segments.push_back(segments[k0]);
2239  }
2240  }
2241 
2242  //check right end
2243  if(prev_exon_index >= 0) {
2244  if(segments[prev_exon_index].m_box[1] + 1 < SeqLen1) {
2245  g.m_box[0] = segments[prev_exon_index].m_box[1] + 1;
2246  g.m_box[1] = SeqLen1 - 1;
2247  g.m_box[2] = segments[prev_exon_index].m_box[3] + 1;
2248  g.m_box[3] = SeqLen2 - 1;
2249  g.m_len = g.m_box[1] - g.m_box[0] + 1;
2250  tmp_segments.push_back(g);
2251  }
2252  } else {
2253  NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);//no exons
2254  }
2255  segments.swap(tmp_segments);
2256  }}
2257  }
2258 
2259 
2260 
2261  //PARTIAL TRIMMING OF TERMINAL EXONS
2262  if( !is_test ) {//default production
2263  // Go from the ends and see if we can improve term exons
2264  //note that it continue trimming of exons until s.m_idty is high
2265  size_t k0 (0);
2266  while(k0 < segments.size()) {
2267 
2268  TSegment& s = segments[k0];
2269  if(s.m_exon) {
2270 
2271  const size_t len (1 + s.m_box[1] - s.m_box[0]);
2272  const double min_idty (len >= kMinTermExonSize?
2273  m_MinExonIdty:
2274  max(m_MinExonIdty, kMinTermExonIdty));
2275  s.ImproveFromLeft(Seq1, Seq2, m_aligner);
2276  if(s.m_idty >= min_idty) {
2277  break;
2278  }
2279  }
2280  ++k0;
2281  }
2282 
2283  size_t k1 (segments.size() - 1);
2284  while(k1 >= k0) {
2285 
2286  TSegment& s (segments[k1]);
2287  if(s.m_exon) {
2288 
2289  const size_t len (1 + s.m_box[1] - s.m_box[0]);
2290  const double min_idty (len >= kMinTermExonSize?
2291  m_MinExonIdty:
2292  max(m_MinExonIdty, kMinTermExonIdty));
2293  s.ImproveFromRight(Seq1, Seq2, m_aligner);
2294  if(s.m_idty >= min_idty) {
2295  break;
2296  }
2297  }
2298  --k1;
2299  }
2300  } else { // test mode
2301  //trim terminal exons only
2302  //first exon
2303  NON_CONST_ITERATE(TSegmentVector, ii, segments) {
2304  if(ii->m_exon) {
2305  if(is_test_plus) {
2306  trim.Cut50FromLeft(*ii);
2307  } else {
2308  ii->ImproveFromLeft1(Seq1, Seq2, m_aligner);
2309  }
2310  break;
2311  }
2312  }
2313  //last exon
2314  NON_CONST_REVERSE_ITERATE(TSegmentVector, ii, segments) {
2315  if(ii->m_exon) {
2316  if(is_test_plus) {
2317  trim.Cut50FromRight(*ii);
2318  } else {
2319  ii->ImproveFromRight1(Seq1, Seq2, m_aligner);
2320  }
2321  break;
2322  }
2323  }
2324  }
2325 
2326  //partial trimming, exons near <GAP>s, terminal exons are trimmed already
2327  for(unsigned int k0 = 0; k0 < segments.size(); ++k0) {
2328  if(!segments[k0].m_exon) {
2329  if( k0 > 0 && segments[k0-1].m_exon) {
2330  if(is_test) {
2331  if(is_test_plus) {
2332  if( x_IsInGap(segments[k0-1].m_box[3] + 1) ||
2333  CSplignTrim::HasAbuttingExonOnRight(segments, k0-1) ) {
2334  //abuting an exon or a sequence gap on the genome, do not trim
2335  } else {
2336  if( ( (int)SeqLen1 - (int)segments[k0-1].m_box[1] - 1 ) >= kFlankExonProx ) {
2337  trim.ImproveFromRight(segments[k0-1]);
2338  } else {
2339  trim.Cut50FromRight(segments[k0-1]);
2340  }
2341  }
2342  } else {
2343  segments[k0-1].ImproveFromRight1(Seq1, Seq2, m_aligner);
2344  }
2345  } else {
2346  segments[k0-1].ImproveFromRight(Seq1, Seq2, m_aligner);
2347  }
2348  }
2349  if( k0 + 1 < segments.size() && segments[k0+1].m_exon) {
2350  if(is_test) {
2351  if(is_test_plus) {
2352  if( x_IsInGap(segments[k0+1].m_box[2] - 1) ||
2353  CSplignTrim::HasAbuttingExonOnLeft(segments, k0+1) ) {
2354  //abuting an exon or a sequence gap on the genome, do not trim
2355  } else {
2356  if( (int)segments[k0+1].m_box[0] >= kFlankExonProx ) {
2357  trim.ImproveFromLeft(segments[k0+1]);
2358  } else {
2359  trim.Cut50FromLeft(segments[k0+1]);
2360  }
2361  }
2362  } else {
2363  segments[k0+1].ImproveFromLeft1(Seq1, Seq2, m_aligner);
2364  }
2365  } else {
2366  segments[k0+1].ImproveFromLeft(Seq1, Seq2, m_aligner);
2367  }
2368  }
2369  }
2370  }
2371 
2372 
2373  //CORRECTIONS AFTER PARTIAL TRIMMING
2374 
2375  if( segments.size() == 0 ) {
2376  NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);//no exons
2377  }
2378 
2379  // indicate any slack space on the left
2380  if(segments[0].m_box[0] > 0) {
2381 
2382  TSegment g;
2383  g.m_box[0] = 0;
2384  g.m_box[1] = segments[0].m_box[0] - 1;
2385  g.m_box[2] = 0;
2386  g.m_box[3] = segments[0].m_box[2] - 1;
2387  g.SetToGap();
2388  segments.insert(segments.begin(), g);
2389  }
2390 
2391  // same on the right
2392  TSegment& seg_last (segments.back());
2393  if(seg_last.m_box[1] + 1 < SeqLen1) {
2394 
2395  TSegment g;
2396  g.m_box[0] = seg_last.m_box[1] + 1;
2397  g.m_box[1] = SeqLen1 - 1;
2398  g.m_box[2] = seg_last.m_box[3] + 1;
2399  g.m_box[3] = SeqLen2 - 1;
2400  g.SetToGap();
2401  segments.push_back(g);
2402  }
2403 
2404  //WHOLE EXON TRIMMING
2405 
2406  //drop low complexity terminal exons
2407  {{
2408  bool first_exon = true;
2409  TSegment *last_exon = NULL;
2410  NON_CONST_ITERATE(TSegmentVector, ii, segments) {
2411  if(ii->m_exon) {
2412  //first exon
2413  if(first_exon) {
2414  if(ii->IsLowComplexityExon(Seq1) ) {
2415  ii->SetToGap();
2416  }
2417  first_exon = false;
2418  } else {//make sure that if exon is single, it is checked once
2419  last_exon = &*ii;
2420  }
2421  }
2422  }
2423  //last exon
2424  if( last_exon != 0 ) {
2425  if(last_exon->IsLowComplexityExon(Seq1) ) {
2426  last_exon->SetToGap();
2427  }
2428  }
2429  }}
2430 
2431 
2432  //throw away low identity exons
2433  NON_CONST_ITERATE(TSegmentVector, ii, segments) {
2434  if(ii->m_exon == false) continue;
2435  if(ii->m_idty < m_MinExonIdty) {
2436  if( is_test ) {//try to trim it first
2437  TSegment sl(*ii), sr(*ii);
2438  if(is_test_plus) {
2439  trim.ImproveFromLeft(sl);
2440  trim.ImproveFromRight(sr);
2441  } else {
2442  sl.ImproveFromLeft1(Seq1, Seq2, m_aligner);
2443  sr.ImproveFromRight1(Seq1, Seq2, m_aligner);
2444  }
2445  if( sl.m_details == ii->m_details && sr.m_details == ii->m_details ) {//did not help
2446  ii->SetToGap();
2447  } else {
2448  //pick better one
2449  if(is_test_plus) {
2450  //there is a splice on left, but no splice on right
2451  if( sr.m_details != ii->m_details && ii != segments.begin() && (ii-1)->m_exon && ( (ii+1) == segments.end() || !(ii+1)->m_exon ) ) {
2452  *ii = sr;
2453  //splice on right, no splice on left
2454  } else if( sl.m_details != ii->m_details && (ii+1) != segments.end() && (ii+1)->m_exon && ( ii == segments.begin() || !(ii-1)->m_exon) ) {
2455  *ii = sl;
2456  } else if(sl.m_details == ii->m_details ||
2457  (sr.m_details != ii->m_details && sr.m_score > sl.m_score ) ) {
2458  *ii = sr;
2459  } else {
2460  *ii = sl;
2461  }
2462  } else {
2463  if(sl.m_details != ii->m_details || sr.m_details != ii->m_details){
2464  if(sl.m_details == ii->m_details ||
2465  (sr.m_details != ii->m_details && sr.m_score > sl.m_score ) ) {
2466  *ii = sr;
2467  } else {
2468  *ii = sl;
2469  }
2470  }
2471  }
2472  //add gaps if needed
2473  if(ii != segments.begin() && (ii)->m_box[0] > (ii - 1)->m_box[1] + 1) {
2474  TSegment sgap;
2475  sgap.m_box[0] = (ii - 1)->m_box[1] + 1;
2476  sgap.m_box[2] = (ii - 1)->m_box[3] + 1;
2477  sgap.m_box[1] = ii->m_box[0] - 1;
2478  sgap.m_box[3] = ii->m_box[2] - 1;
2479  sgap.SetToGap();
2480  ii = segments.insert(ii, sgap);
2481  continue_iterations = true;
2482  ++ii;
2483  }
2484  if( (ii+1) != segments.end() && (ii+1)->m_box[0] > ii->m_box[1] + 1 ) {
2485  TSegment sgap;
2486  ++ii;
2487  sgap.m_box[0] = (ii - 1)->m_box[1] + 1;
2488  sgap.m_box[2] = (ii - 1)->m_box[3] + 1;
2489  sgap.m_box[1] = ii->m_box[0] - 1;
2490  sgap.m_box[3] = ii->m_box[2] - 1;
2491  sgap.SetToGap();
2492  ii = segments.insert(ii, sgap);
2493  continue_iterations = true;
2494  }
2495  }
2496  } else {// end of test mode
2497  //old style, just throw away
2498  ii->SetToGap();
2499  }
2500  } else if(ii->m_idty < .9 && ii->m_len < 20) {
2501  // 20_90 rule for short exons preceded/followed by non-consensus splices
2502  bool nc_prev (false), nc_next (false);
2503  if(ii != segments.begin() && (ii - 1)->m_exon) {
2504  nc_prev = ! TSegment::s_IsConsensusSplice(
2505  (ii - 1)->GetDonor(),
2506  ii->GetAcceptor());
2507  }
2508  if( (ii+1) != segments.end() && (ii + 1)->m_exon) {
2509  nc_next = ! TSegment::s_IsConsensusSplice(
2510  ii->GetDonor(),
2511  (ii + 1)->GetAcceptor());
2512  }
2513  if( nc_prev || nc_next ) {
2514  ii->SetToGap();
2515  }
2516  }
2517 
2518  }
2519 
2520 
2521  //GP-12069
2522  //apply 40/85 rule for all 'stand alone' exons
2523  if( is_test ) {//test mode
2524  for(size_t k (0); k < segments.size(); ++k) {
2525  TSegment& s (segments[k]);
2526  if(s.m_exon == false) continue;
2527  //the first exon or gap on left
2528  if( ( k == 0 ) || ( ! segments[k-1].m_exon ) ) {
2529  //the last exon or gap on right
2530  if( ( k + 1 == segments.size() ) || ( ! segments[k+1].m_exon ) ) {
2531  //stand alone
2532  if (s.m_idty < .85 && s.m_len < 40) {
2533  s.SetToGap();
2534  }
2535  }
2536  }
2537  }
2538  }
2539 
2540 
2541  // turn to gaps exons with low identity
2542  //20_28_90 TEST MODE
2543  // turn to gaps exons with combination of shortness and low identity
2544  // deal with exons adjacent to gaps
2545  if ( is_test ) { // test mode
2546  for(size_t k (0); k < segments.size(); ++k) {
2547  TSegment& s (segments[k]);
2548  if(s.m_exon == false) continue;
2549 
2550  bool drop (false);
2551 
2552  enum EAdjustExon {
2553  eNo,
2554  eSoft, //for exons close to mRNA edge
2555  eHard
2556  } adj;
2557  adj = eNo;
2558 
2559  if(k == 0) {//the first segment is an exon
2560  if( (int)s.m_box[0] >= kFlankExonProx ) {
2561  adj = eHard;
2562  } else {
2563  if(adj == eNo) adj = eSoft;
2564  }
2565  }
2566 
2567  if( k + 1 == segments.size() ) {//the last segment is an exon
2568  if( ( (int)SeqLen1 - (int)s.m_box[1] - 1 ) >= kFlankExonProx ) {
2569  adj = eHard;
2570  } else {
2571  if(adj == eNo) adj = eSoft;//prevent switch from Hard to Soft
2572  }
2573  }
2574 
2575  if(k > 0 && ( ! segments[k-1].m_exon ) ) {//gap on left
2576  if( (int)s.m_box[0] >= kFlankExonProx ) {
2577  adj = eHard;
2578  } else {
2579  if(adj == eNo) adj = eSoft;
2580  }
2581  }
2582 
2583  if(k + 1 < segments.size() && (! segments[k+1].m_exon ) ) {//gap on right
2584  if( ( (int)SeqLen1 - (int)s.m_box[1] - 1 ) >= kFlankExonProx ) {
2585  adj = eHard;
2586  } else {
2587  if(adj == eNo) adj = eSoft;
2588  }
2589  }
2590 
2591  if(adj == eSoft) {//20_90 rule
2592  if (s.m_idty < .9 && s.m_len < 20) {
2593  drop = true;
2594  }
2595  } else if(adj == eHard) {
2596  if( s.m_len < 20 ) {// 20 rule
2597  drop = true;
2598  }
2599  if ( s.m_idty < kMinTermExonIdty && s.m_len < kMinTermExonSize ) {// 28_90 rule
2600  drop = true;
2601  }
2602  }
2603  if(drop) {
2604  s.SetToGap();
2605  }
2606  }
2607  }
2608 
2609 
2610  // turn to gaps short weak terminal exons
2611  {{
2612  // find the two leftmost exons
2613  size_t exon_count (0);
2614  TSegment* term_segs[] = {0, 0};
2615  for(size_t i = 0; i < segments.size(); ++i) {
2616  TSegment& s = segments[i];
2617  if(s.m_exon) {
2618  term_segs[exon_count] = &s;
2619  if(++exon_count == 2) {
2620  break;
2621  }
2622  }
2623  }
2624 
2625  if(exon_count == 2) {
2626  x_ProcessTermSegm(term_segs, 0);
2627  }
2628  }}
2629 
2630  {{
2631  // find the two rightmost exons
2632  size_t exon_count (0);
2633  TSegment* term_segs[] = {0, 0};
2634  for(Int8 i = segments.size() - 1; i >= 0; --i) {
2635  TSegment& s = segments[i];
2636  if(s.m_exon) {
2637  term_segs[exon_count] = &s;
2638  if(++exon_count == 2) {
2639  break;
2640  }
2641  }
2642  }
2643 
2644  if(exon_count == 2) {
2645  x_ProcessTermSegm(term_segs, 1);
2646  }
2647  }}
2648 
2649  // turn to gaps extra-short exons preceded/followed by gaps
2650  bool gap_prev (false);
2651  for(size_t k (0); k < segments.size(); ++k) {
2652 
2653  TSegment& s (segments[k]);
2654  if(s.m_exon == false) {
2655  gap_prev = true;
2656  }
2657  else {
2658  size_t length (s.m_box[1] - s.m_box[0] + 1);
2659  bool gap_next (false);
2660  if(k + 1 < segments.size()) {
2661  gap_next = !segments[k+1].m_exon;
2662  }
2663  if(length <= 10 && (gap_prev || gap_next)) {
2664  s.SetToGap();
2665  }
2666  gap_prev = false;
2667  }
2668  }
2669 
2670 
2671  // merge all adjacent gaps
2672  int gap_start_idx (-1);
2673  if(segments.size() && segments[0].m_exon == false) {
2674  gap_start_idx = 0;
2675  }
2676 
2677  for(size_t k (0); k < segments.size(); ++k) {
2678  TSegment& s (segments[k]);
2679  if(!s.m_exon) {
2680  if(gap_start_idx == -1) {
2681  gap_start_idx = int(k);
2682  if(k > 0) {
2683  s.m_box[0] = segments[k-1].m_box[1] + 1;
2684  s.m_box[2] = segments[k-1].m_box[3] + 1;
2685  }
2686  }
2687  }
2688  else {
2689  if(gap_start_idx >= 0) {
2690  TSegment& g = segments[gap_start_idx];
2691  g.m_box[1] = s.m_box[0] - 1;
2692  g.m_box[3] = s.m_box[2] - 1;
2693  g.m_len = g.m_box[1] - g.m_box[0] + 1;
2694  g.m_details.resize(0);
2695  m_segments.push_back(g);
2696  gap_start_idx = -1;
2697  }
2698  m_segments.push_back(s);
2699  }
2700  }
2701 
2702  if(gap_start_idx >= 0) {
2703  TSegment& g (segments[gap_start_idx]);
2704  g.m_box[1] = segments[segments.size()-1].m_box[1];
2705  g.m_box[3] = segments[segments.size()-1].m_box[3];
2706  g.m_len = g.m_box[1] - g.m_box[0] + 1;
2707  g.m_details.resize(0);
2708  m_segments.push_back(g);
2709  }
2710 
2711  size_t exon_count1 (0);
2712  ITERATE(TSegments, ii, m_segments) {
2713  if(ii->m_exon) ++exon_count1;
2714  }
2715 
2716  if(exon_count1 == 0 ) {
2717  NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);//no exons
2718  }
2719 
2720  if(exon_count0 == exon_count1 && continue_iterations == false) break;
2721 
2722  } // end of trimming iterations
2723 
2724  //cut to AG/GT
2725  {{
2726 
2727 
2728  if ( is_test ) { // test mode
2729  bool first_exon = true;
2730  size_t sdim = m_segments.size();
2731  int last_exon_index = -1;
2732  for(size_t k0 = 0; k0 < sdim; ++k0) {
2733  if(m_segments[k0].m_exon) {
2734  last_exon_index = (int)k0;
2735  }
2736  }
2737  for(unsigned int k0 = 0; k0 < sdim; ++k0) {
2738  TSegment& s = m_segments[k0];
2739  bool cut_from_left = false;
2740  bool cut_from_right = false;
2741  if(s.m_exon) {
2742  //check left
2743  if( s.m_box[2] == 0 || x_IsInGap(s.m_box[2] - 1) ||
2745  //abuting an exon or a contig border or a sequence gap on the genome, do not cut
2746  first_exon = false;
2747  } else {
2748  if(first_exon) {
2749  if( (int)s.m_box[0] >= kFlankExonProx ) {//gap on left
2750  cut_from_left = true;
2751  }
2752  first_exon = false;
2753  } else if( ! m_segments[k0-1].m_exon ) {//gap on left
2754  cut_from_left = true;
2755  }
2756  }
2757  //check right
2758  if( x_IsInGap(s.m_box[3] + 1) ||
2760  //abuting an exon or a sequence gap on the genome, do not cut
2761  } else {
2762  if( last_exon_index == (int)k0 ) {
2763  if( ( (int)SeqLen1 - (int)s.m_box[1] - 1 ) >= kFlankExonProx ) {//gap on right
2764  cut_from_right = true;
2765  }
2766  } else if(k0 + 1 < sdim && (! m_segments[k0+1].m_exon ) ) {//gap on right
2767  cut_from_right = true;
2768  }
2769  }
2770  //try to cut from left
2771  if(cut_from_left) {
2772  int seq1_pos = (int)s.m_box[0];
2773  int seq2_pos = (int)s.m_box[2];
2774  int det_pos = 0;
2775  int max_pos = (int)s.m_box[1] - 8;//exon should not be too short
2776  while(seq1_pos <= max_pos && det_pos <= kMaxCutToSplice) {
2777  if( seq2_pos > 1 && s.m_details[det_pos] == 'M' &&
2778  toupper(Seq2[seq2_pos-2]) == 'A' && toupper(Seq2[seq2_pos-1]) == 'G' ) {//AG point
2779  if(det_pos > 0) {//resize
2780  s.m_box[0] = seq1_pos;
2781  s.m_box[2] = seq2_pos;
2782  s.m_details.erase(0, det_pos);
2783  s.Update(m_aligner);
2784  // update the first two annotation symbols
2785  if(s.m_annot.size() > 0 && s.m_annot[0] == '<') {
2786  s.m_annot = "AG" + s.m_annot;
2787  } else if(s.m_annot.size() > 2 && s.m_annot[2] == '<') {
2788  s.m_annot[0] = 'A';
2789  s.m_annot[1] = 'G';
2790  }
2791  if( k0>0 && ( !m_segments[k0-1].m_exon ) ) {//adjust previos gap
2792  TSegment& g = m_segments[k0-1];
2793  g.m_box[1] = s.m_box[0] - 1;
2794  g.m_box[3] = s.m_box[2] - 1;
2795  g.m_len = g.m_box[1] - g.m_box[0] + 1;
2796  }
2797  }
2798  break;
2799  }
2800  switch(s.m_details[det_pos]) {
2801  case 'M' :
2802  ++seq1_pos;
2803  ++seq2_pos;
2804  break;
2805  case 'R' :
2806  ++seq1_pos;
2807  ++seq2_pos;
2808  break;
2809  case 'I' :
2810  ++seq2_pos;
2811  break;
2812  case 'D' :
2813  ++seq1_pos;
2814  break;
2815  }
2816  ++det_pos;
2817  }
2818  }
2819 
2820  //try to cut from right
2821  if(cut_from_right) {
2822  int seq1_pos = (int)s.m_box[1];
2823  int seq2_pos = (int)s.m_box[3];
2824  size_t det_pos = s.m_details.size() - 1;
2825  size_t min_det_pos = det_pos - kMaxCutToSplice;
2826  int min_pos = (int)s.m_box[0] + 8;//exon should not be too short
2827  while(seq1_pos >= min_pos && det_pos >= min_det_pos) {
2828  if( (size_t)(seq2_pos + 2) < m_genomic.size() && s.m_details[det_pos] == 'M' &&
2829  toupper(Seq2[seq2_pos+1]) == 'G' && toupper(Seq2[seq2_pos+2]) == 'T' ) {//GT point
2830  if( det_pos + 1 < s.m_details.size() ) {//resize
2831  s.m_box[1] = seq1_pos;
2832  s.m_box[3] = seq2_pos;
2833  s.m_details.resize(det_pos + 1);
2834  s.Update(m_aligner);
2835  // update the last two annotation symbols
2836  size_t adim = s.m_annot.size();
2837  if(adim > 0 && s.m_annot[adim-1] == '>') {
2838  s.m_annot += "GT";
2839  } else if(adim > 2 && s.m_annot[adim-3] == '>') {
2840  s.m_annot[adim-2] = 'G';
2841  s.m_annot[adim-1] = 'T';
2842  }
2843  if( k0 + 1 < sdim && ( !m_segments[k0+1].m_exon ) ) {//adjust next gap
2844  TSegment& g = m_segments[k0+1];
2845  g.m_box[0] = s.m_box[1] + 1;
2846  g.m_box[2] = s.m_box[3] + 1;
2847  g.m_len = g.m_box[1] - g.m_box[0] + 1;
2848  }
2849  }
2850  break;
2851  }
2852  switch(s.m_details[det_pos]) {
2853  case 'M' :
2854  --seq1_pos;
2855  --seq2_pos;
2856  break;
2857  case 'R' :
2858  --seq1_pos;
2859  --seq2_pos;
2860  break;
2861  case 'I' :
2862  --seq2_pos;
2863  break;
2864  case 'D' :
2865  --seq1_pos;
2866  break;
2867  }
2868  --det_pos;
2869  }
2870  }
2871  }
2872  }
2873  }
2874  }}
2875 
2876  //throw away bad stand alone exons
2877  {{
2878  if(is_test_plus) {
2879  bool adjust = false;
2880  bool prev_exon = false;
2881  size_t ssize = m_segments.size();
2882  for(size_t pp = 0; pp <ssize ; ++pp) {
2883  if(m_segments[pp].m_exon) {
2884  if( !prev_exon && ( pp == ssize - 1 || !m_segments[pp+1].m_exon ) ) { //stand alone exon
2885  if(trim.ThrowAway20_28_90(m_segments[pp])) {
2886  adjust = true;
2887  }
2888  }
2889  prev_exon = true;
2890  } else {
2891  prev_exon = false;
2892  }
2893  }
2894  if(adjust) {
2895  trim.AdjustGaps(m_segments);
2896  }
2897  }
2898  }}
2899 
2900  // stich small holes
2901  {{
2902  size_t min_hole_len = GetMinHoleLen();
2903  if( min_hole_len > 0) { //find small holes and stich
2904  trim.AdjustGaps(m_segments);//make sure there is no adjacent gaps
2905  TSeqPos pos1 = 0, pos2 = 2;
2906  for(; pos2 < m_segments.size(); ++pos1, ++pos2) {
2907  if( m_segments[pos1].m_exon && !m_segments[pos1+1].m_exon && m_segments[pos2].m_exon &&
2908  m_segments[pos1].m_box[1] + min_hole_len >= m_segments[pos2].m_box[0] &&
2909  m_segments[pos1].m_box[3] + min_hole_len >= m_segments[pos2].m_box[2] ) {
2910 
2911  trim.JoinExons(m_segments, pos1, pos2);//note: m_segments changes here!
2912  }
2913  }
2914  }
2915  }}
2916 
2917  // cut gaps to codons
2918  {{
2919  bool cut_to_codons = true;
2920  if( cut_to_codons ) {
2921  }
2922  }}
2923 
2924  /// there is more postprocessing to follow
2925 
2926  if( m_segments.size() == 0 ) {
2927  NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);//no exons
2928  }
2929 
2930 
2931 //#define DUMP_PROCESSED_SEGS
2932 #ifdef DUMP_PROCESSED_SEGS
2933  cerr << "Processed segments:" << endl;
2934  ITERATE(TSegments, ii, m_segments) {
2935  cerr << ii->m_box[0] << '\t' << ii->m_box[1] << '\t'
2936  << ii->m_box[2] << '\t' << ii->m_box[3] << '\t'
2937  << ii->m_annot << '\t' << ii->m_score << endl;
2938  }
2939 #endif
2940 
2941  rv /= m_aligner->GetWm();
2942  return rv;
2943 }
2944 
2945 
2947 {
      // NOTE(review): the line holding this function's signature (listing line
      // 2946) is absent from this extract; only the body is visible.
      // Returns the share of 'M' characters in a transcript concatenated over
      // all of m_Segments.
      // Exons contribute their own m_details; every non-exon segment contributes
      // m_len 'D' characters, so it enlarges the denominator only.
2948  string trans;
2949  for(size_t i (0), dim (m_Segments.size()); i < dim; ++i) {
2950  const TSegment & s (m_Segments[i]);
2951  if(s.m_exon) {
2952  trans.append(s.m_details);
2953  }
2954  else {
2955  trans.append(s.m_len, 'D');
2956  }
2957  }
      // Count the 'M' characters of the combined transcript.
2958  size_t matches = 0;
2959  ITERATE(string, ii, trans) {
2960  if(*ii == 'M') {
2961  ++matches;
2962  }
2963  }
      // No guard for an empty transcript: in that case this evaluates 0.0 / 0.
2964  return double(matches) / trans.size();
2965 }
2966 
2967 
2968 
2970 {
      // NOTE(review): the line holding this function's signature (listing line
      // 2969) is absent from this extract; `box` is declared there.
      // Fills box[0..3] with the bounding box of all exon segments:
      // box[0]/box[1] = min/max over m_box[0..1], box[2]/box[3] = min/max over
      // m_box[2..3]. Non-exon segments are ignored.
      // Start from an "empty" box (min = kMax_UInt, max = 0); these values are
      // what the caller gets back when m_Segments holds no exon.
2971  box[0] = box[2] = kMax_UInt;
2972  box[1] = box[3] = 0;
2973  ITERATE(TSegments, ii, m_Segments) {
2974  const TSegment& s (*ii);
2975  if(s.m_exon) {
2976 
      // First coordinate pair: order the two ends so that a <= b, whichever
      // way round they are stored, then widen the box.
2977  size_t a, b;
2978  if(s.m_box[0] <= s.m_box[1]) {
2979  a = s.m_box[0];
2980  b = s.m_box[1];
2981  }
2982  else {
2983  b = s.m_box[0];
2984  a = s.m_box[1];
2985  }
2986  if(a < box[0]) {
2987  box[0] = a;
2988  }
2989  if(b > box[1]) {
2990  box[1] = b;
2991  }
2992 
      // Second coordinate pair: same treatment for m_box[2..3].
2993  if(s.m_box[2] <= s.m_box[3]) {
2994  a = s.m_box[2];
2995  b = s.m_box[3];
2996  }
2997  else {
2998  b = s.m_box[2];
2999  a = s.m_box[3];
3000  }
3001  if(a < box[2]) {
3002  box[2] = a;
3003  }
3004  if(b > box[3]) {
3005  box[3] = b;
3006  }
3007  }
3008  }
3009 }
3010 
3011 
3012 bool CSplign::x_ProcessTermSegm(TSegment** term_segs, Uint1 side) const
3013 {
3014  bool turn2gap (false);
3015 
3016  const size_t exon_size (1 + term_segs[0]->m_box[1] -
3017  term_segs[0]->m_box[0]);
3018 
3019  const double idty (term_segs[0]->m_idty);
3020 
3021 
3022  if( GetTestType() == kTestType_production_default ) {//default production
3023  if(exon_size < kMinTermExonSize && idty < kMinTermExonIdty ) {
3024  turn2gap = true;
3025  }
3026  }
3027 
3028  if(exon_size < kMinTermExonSize) {
3029 
3030  // verify that the intron is not too long
3031 
3032  size_t a, b;
3033  const char *dnr, *acc;
3034  if(side == 0) {
3035  a = term_segs[0]->m_box[3];
3036  b = term_segs[1]->m_box[2];
3037  dnr = term_segs[0]->GetDonor();
3038  acc = term_segs[1]->GetAcceptor();
3039  }
3040  else {
3041  a = term_segs[1]->m_box[3];
3042  b = term_segs[0]->m_box[2];
3043  dnr = term_segs[1]->GetDonor();
3044  acc = term_segs[0]->GetAcceptor();
3045  }
3046 
3047  const size_t intron_len (b - a);
3048 
3049  const bool consensus (TSegment::s_IsConsensusSplice(dnr, acc));
3050 
3051  size_t max_ext ((idty < .96 || !consensus || exon_size < 16)?
3052  m_max_genomic_ext: (5000 * kMinTermExonSize));
3053 
3054  if(consensus) {
3055  if(exon_size < 8) {
3056  max_ext = 10 * exon_size;
3057  }
3058  }
3059  else if(exon_size < 16) {
3060  max_ext = 1;
3061  }
3062 
3063  const size_t max_intron_len (x_GetGenomicExtent(exon_size, max_ext));
3064  if(intron_len > max_intron_len) {
3065  turn2gap = true;
3066  }
3067  }
3068 
3069  if(turn2gap) {
3070 
3071  // turn the segment into a gap
3072  TSegment& s = *(term_segs[0]);
3073  s.SetToGap();
3074  s.m_len = exon_size;
3075  }
3076 
3077  return turn2gap;
3078 }
3079 
3080 
3081 size_t CSplign::x_GetGenomicExtent(const size_t query_len, size_t max_ext) const
3082 {
3083  if(max_ext == 0) {
3084  max_ext = m_max_genomic_ext;
3085  }
3086 
3087  size_t rv (0);
3088  if(query_len >= kNonCoveredEndThreshold) {
3089  rv = m_max_genomic_ext;
3090  }
3091  else {
3092  const double k (pow(kNonCoveredEndThreshold, - 1. / kPower) * max_ext);
3093  const double drv (k * pow(query_len, 1. / kPower));
3094  rv = size_t(drv);
3095  }
3096 
3097  return rv;
3098 }
3099 
3100 
3101 ////////////////////////////////////
3102 
namespace splign_local {

    // Helpers that pack values into, and unpack them from, a flat char
    // buffer. Each call advances the cursor `p` past the bytes it handled.
    // No helper checks the buffer bounds; callers size the buffer up front.
    //
    // Fixed-size values are transferred byte by byte rather than through a
    // T* cast of the cursor: the cursor follows variable-length strings, so
    // it is not guaranteed to be aligned for T, and a typed store/load at
    // such an address is undefined behaviour. The bytes written are the
    // same as before, so existing buffers stay readable.

    // Write the object representation of n (sizeof(T) bytes) at p.
    // T is expected to be a plain fixed-size type (bool, integer, double).
    template<typename T>
    void ElemToBuffer(const T& n, char*& p)
    {
        const char* const src = reinterpret_cast<const char*>(&n);
        std::copy(src, src + sizeof(T), p);
        p += sizeof(T);
    }

    // Write the characters of s followed by a terminating zero byte.
    template<>
    void ElemToBuffer(const std::string& s, char*& p)
    {
        p = std::copy(s.begin(), s.end(), p);
        *p++ = 0;
    }

    // Read sizeof(T) bytes at p into n.
    template<typename T>
    void ElemFromBuffer(T& n, const char*& p)
    {
        std::copy(p, p + sizeof(T), reinterpret_cast<char*>(&n));
        p += sizeof(T);
    }

    // Read a zero-terminated string at p; the cursor moves past the
    // terminator. The terminator must be present in the buffer.
    template<>
    void ElemFromBuffer(std::string& s, const char*& p)
    {
        s = p;
        p += s.size() + 1;
    }
}
3134 
3135 
3137 {
      // NOTE(review): the signature line (listing line 3136) and the body of
      // the null-target check (listing line 3141) are absent from this extract.
      // Serializes this segment into *target as one flat byte sequence:
      // m_exon, m_idty, m_len, m_box[0..3], m_annot, m_details, m_score.
3138  using namespace splign_local;
3139 
3140  if(target == 0) {
3142  }
3143 
      // Each of the two strings takes its length plus one byte for the
      // terminating zero written by ElemToBuffer().
3144  const size_t total_size = sizeof m_exon + sizeof m_idty +
3145  sizeof m_len + sizeof m_box + m_annot.size() + 1 +
3146  m_details.size() + 1 + sizeof m_score;
3147 
3148  target->resize(total_size);
3149 
      // p is the write cursor; every ElemToBuffer() call advances it.
3150  char* p = &target->front();
3151  ElemToBuffer(m_exon, p);
3152  ElemToBuffer(m_idty, p);
3153  ElemToBuffer(m_len, p);
3154  for(size_t i = 0; i < 4; ++i) {
3155  ElemToBuffer(m_box[i], p);
3156  }
3157  ElemToBuffer(m_annot, p);
3158  ElemToBuffer(m_details, p);
3159  ElemToBuffer(m_score, p);
3160 }
3161 
3162 
3164 {
      // NOTE(review): the signature line (listing line 3163) and the body of
      // the size check (listing line 3171) are absent from this extract.
      // Restores this segment from `source`, reading the fields in the order
      // m_exon, m_idty, m_len, m_box[0..3], m_annot, m_details, m_score.
3165  using namespace splign_local;
3166 
      // Smallest possible image: all fixed-size fields plus one terminator
      // byte for each of the two strings (i.e. both strings empty).
      // The second '+' at the start of the continuation line is a unary plus.
3167  const size_t min_size = sizeof m_exon + sizeof m_idty + sizeof m_len +
3168  + sizeof m_box + 1 + 1 + sizeof m_score;
3169 
3170  if(source.size() < min_size) {
3172  }
3173 
      // p is the read cursor; every ElemFromBuffer() call advances it.
      // Beyond the min_size check above, the reads below are not checked
      // against the end of `source`.
3174  const char* p = &source.front();
3175  ElemFromBuffer(m_exon, p);
3176  ElemFromBuffer(m_idty, p);
3177  ElemFromBuffer(m_len, p);
3178 
3179  for(size_t i = 0; i < 4; ++i) {
3180  ElemFromBuffer(m_box[i], p);
3181  }
3182 
3183  ElemFromBuffer(m_annot, p);
3184  ElemFromBuffer(m_details, p);
3185  ElemFromBuffer(m_score, p);
3186 }
3187 
3188 
3190 {
      // NOTE(review): the signature line (listing line 3189) and the body of
      // the null-target check (listing line 3194) are absent from this extract.
      // Serializes the whole compartment into *target. Layout: the core fields
      // (m_Id ... m_Score, with m_Msg stored zero-terminated), followed, for
      // every element of m_Segments, by a size_t byte count and that segment's
      // own buffer.
3191  using namespace splign_local;
3192 
3193  if(target == 0) {
3195  }
3196 
3197  const size_t core_size (
3198  sizeof m_Id + sizeof m_Status + m_Msg.size() + 1
3199  + sizeof m_QueryStrand + sizeof m_SubjStrand
3200  + sizeof m_Cds_start + sizeof m_Cds_stop
3201  + sizeof m_QueryLen
3202  + sizeof m_PolyA
3203  + sizeof m_Score);
3204 
      // The core fields are staged in a temporary buffer first.
3205  vector<char> core (core_size);
3206 
3207  char* p = &core.front();
3208  ElemToBuffer(m_Id, p);
3209  ElemToBuffer(m_Status, p);
3210  ElemToBuffer(m_Msg, p);
3211  ElemToBuffer(m_QueryStrand, p);
3212  ElemToBuffer(m_SubjStrand, p);
3213  ElemToBuffer(m_Cds_start, p);
3214  ElemToBuffer(m_Cds_stop, p);
3215  ElemToBuffer(m_QueryLen, p);
3216  ElemToBuffer(m_PolyA, p);
3217  ElemToBuffer(m_Score, p);
3218 
      // Serialize each segment separately so that the sizes are known
      // before the target is resized.
3219  typedef vector<TNetCacheBuffer> TBuffers;
3220  TBuffers vb (m_Segments.size());
3221  size_t ibuf (0);
3222  ITERATE(TSegments, ii, m_Segments) {
3223  ii->ToBuffer(&vb[ibuf++]);
3224  }
3225 
      // Total = core + one size_t prefix per segment + all segment payloads.
3226  size_t total_size (core_size + sizeof(size_t) * m_Segments.size());
3227  ITERATE(TBuffers, ii, vb) {
3228  total_size += ii->size();
3229  }
3230 
3231  target->resize(total_size);
3232  TNetCacheBuffer::iterator it = target->begin();
3233  copy(core.begin(), core.end(), it);
3234  it += core_size;
3235 
3236  ITERATE(TBuffers, ii, vb) {
      // This p shadows the outer cursor of the same name.
      // NOTE(review): the size is stored through a size_t* cast; the offset
      // depends on string lengths, so size_t alignment is not guaranteed here.
3237  char* p = &(*it);
3238  const size_t seg_buf_size = ii->size();
3239  *((size_t*)p) = seg_buf_size;
3240  it += sizeof (size_t);
3241  copy(ii->begin(), ii->end(), it);
3242  it += seg_buf_size;
3243  }
3244 }
3245 
3246 
3248 {
      // NOTE(review): the signature line (listing line 3247) and the body of
      // the size check (listing line 3262) are absent from this extract.
      // Restores the compartment from `source`: first the core fields
      // (m_Id ... m_Score), then a run of size-prefixed segment images.
3249  using namespace splign_local;
3250 
      // Smallest possible image: all fixed-size core fields plus the single
      // terminator byte of an empty m_Msg.
3251  const size_t min_size (
3252  sizeof m_Id
3253  + sizeof m_Status
3254  + 1
3255  + sizeof m_QueryStrand + sizeof m_SubjStrand
3256  + sizeof m_Cds_start + sizeof m_Cds_stop
3257  + sizeof m_QueryLen
3258  + sizeof m_PolyA
3259  + sizeof m_Score );
3260 
3261  if(source.size() < min_size) {
3263  }
3264 
3265  const char* p (&source.front());
3266  ElemFromBuffer(m_Id, p);
3267  ElemFromBuffer(m_Status, p);
3268  ElemFromBuffer(m_Msg, p);
3269  ElemFromBuffer(m_QueryStrand, p);
3270  ElemFromBuffer(m_SubjStrand, p);
3271  ElemFromBuffer(m_Cds_start, p);
3272  ElemFromBuffer(m_Cds_stop, p);
3273  ElemFromBuffer(m_QueryLen, p);
3274  ElemFromBuffer(m_PolyA, p);
3275  ElemFromBuffer(m_Score, p);
3276 
      // Consume segment records until the cursor passes the last byte.
      // Segments are appended to m_Segments (it is not cleared here), and
      // seg_buf_size is used as read from the buffer, without comparing it
      // to the number of bytes remaining.
3277  const char* pe (&source.back());
3278  while(p <= pe) {
3279  size_t seg_buf_size (0);
3280  ElemFromBuffer(seg_buf_size, p);
3281  m_Segments.push_back(TSegment());
3282  TSegment& seg (m_Segments.back());
3283  seg.FromBuffer(TNetCacheBuffer(p, p + seg_buf_size));
3284  p += seg_buf_size;
3285  }
3286 }
3287 
3289  TScoreSets * output_stats,
3290  TOrf cds,
3291  EStatFlags flags)
3292 {
      // NOTE(review): the first signature line (listing line 3288, which
      // declares `sas`) and listing line 3299 (the right-hand side of the
      // GetProduct_type() test below) are absent from this extract.
      // Computes one score set per alignment in `sas`, stores them in
      // *output_stats (previous contents are discarded) and returns how many
      // were stored.
3293 
      // Input is accepted only if `sas` is non-null and non-empty, its FIRST
      // alignment is a spliced-seg passing the product-type test, and
      // output_stats is non-null. Later alignments are not inspected here.
3294  const
3295  bool valid_input (sas.GetPointer() && sas->CanGet() && sas->Get().size()
3296  && sas->Get().front()->CanGetSegs()
3297  && sas->Get().front()->GetSegs().IsSpliced()
3298  && sas->Get().front()->GetSegs().GetSpliced().GetProduct_type()
3300  && output_stats);
3301 
3302  if(!valid_input) {
3303  NCBI_THROW(CAlgoAlignException, eBadParameter,
3304  "CSplign::s_ComputeStats(): Invalid input");
3305  }
3306 
3307  output_stats->resize(0);
3308 
      // Delegate to the single-alignment overload; `false` is passed for its
      // embed_scoreset argument.
3309  ITERATE(CSeq_align_set::Tdata, ii1, sas->Get()) {
3310  CRef<CScore_set> ss (s_ComputeStats(*ii1, false, cds, flags));
3311  output_stats->push_back(ss);
3312  }
3313 
3314  return output_stats->size();
3315 }
3316 
3317 
namespace {
    // Sentinel states of the reading-frame tracker used by s_ComputeStats().
    // Real frame offsets there are >= -2, so every sentinel is below that.
    const int kFrame_end     = -5;   // query position reached the CDS stop
    const int kFrame_not_set = -10;  // CDS start has not been reached yet
    const int kFrame_lost    = -20;  // an unaligned stretch fell inside the CDS
}
3323 
3324 
3326  bool embed_scoreset,
3327  TOrf cds,
3328  EStatFlags flags)
3329 {
      // NOTE(review): the first signature line (listing line 3325, which
      // declares `sa`) is absent from this extract, as are listing lines 3331
      // (start of the throw below) and 3402, 3408, 3413, 3418 (case labels).
      // Builds and returns a score set for the alignment `sa`:
      //  - with eSF_BasicNonCds: the scores added by
      //    CScoreBuilderBase::AddSplignScores();
      //  - with eSF_BasicCds and a non-zero cds pair: in-frame match count and
      //    in-frame identity computed over the CDS.
      // With embed_scoreset the resulting scores are also copied into `sa`.
      // At least one of the two basic flags has to be set.
3330  if(!(flags & (eSF_BasicNonCds | eSF_BasicCds))) {
3332  "CSplign::s_ComputeStats(): mode not yet supported.");
3333  }
3334 
      // CDS statistics need the flag and a cds pair that is not (0, 0).
3335  const bool cds_stats ((flags & eSF_BasicCds) && (cds.first + cds.second > 0));
3336 
3337  // set individual scores
3338  CRef<CScore_set> ss (new CScore_set);
3339  CScore_set::Tdata & scores (ss->Set());
3340 
3341  if(flags & eSF_BasicNonCds) {
3342  CSeq_align::TScore score_vec;
3343  CScoreBuilderBase().AddSplignScores(*sa, score_vec);
3344  scores.assign(score_vec.begin(), score_vec.end());
3345  }
3346 
3347  if(cds_stats) {
3348 
3349  typedef CSeq_align::TSegs::TSpliced TSpliced;
3350  const TSpliced & spliced (sa->GetSegs().GetSpliced());
3351  if(spliced.GetProduct_type() != CSpliced_seg::eProduct_type_transcript) {
3352  NCBI_THROW(CAlgoAlignException, eBadParameter,
3353  "CSplign::s_ComputeStats(): Unsupported product type");
3354  }
3355 
      // qstrand: true unless the product strand is minus. The CDS must run
      // the same way (first < second for plus).
      // The inner cds_stats test is always true inside this branch.
3356  const bool qstrand (spliced.GetProduct_strand() != eNa_strand_minus);
3357  if(cds_stats) {
3358  const bool cds_strand (cds.first < cds.second);
3359  if(qstrand ^ cds_strand) {
3360  NCBI_THROW(CAlgoAlignException, eBadParameter,
3361  "CSplign::s_ComputeStats(): Transcript orientation not "
3362  "matching specified CDS orientation.");
3363  }
3364  }
3365 
3366  typedef TSpliced::TExons TExons;
3367  const TExons & exons (spliced.GetExons());
3368 
      // These four counters are accumulated below but not read again in
      // this function.
3369  size_t matches (0),
3370  aligned_query_bases (0), // matches, mismatches and indels
3371  aln_length_exons (0),
3372  aln_length_gaps (0);
3373 
      // Without an explicit poly-A position the end of the walk is qlen on
      // the plus strand and TSeqPos(-1) on the minus strand.
3374  const TSeqPos qlen (spliced.GetProduct_length());
3375  const TSeqPos polya (spliced.CanGetPoly_a()?
3376  spliced.GetPoly_a(): (qstrand? qlen: TSeqPos(-1)));
3377 
      // Pass 1: turn the exons into one transcript string `xcript` using
      // 'M' (match), 'R' (mismatch), 'D' (product insertion),
      // 'I' (genomic insertion) and 'X' (query bases not covered by an exon).
      // qprev is the query position of the previous exon's far end; it starts
      // one step before the first query base in walking direction.
3378  typedef CSpliced_exon TExon;
3379  TSeqPos qprev (qstrand? TSeqPos(-1): qlen);
3380  string xcript;
3381  ITERATE(TExons, ii2, exons) {
3382 
3383  const TExon & exon (**ii2);
3384  const TSeqPos qmin (exon.GetProduct_start().GetNucpos()),
3385  qmax (exon.GetProduct_end().GetNucpos());
3386 
      // Query bases skipped between the previous exon and this one.
3387  const TSeqPos qgap (qstrand? qmin - qprev - 1: qprev - qmax - 1);
3388 
3389  if(qgap > 0) {
3390  aln_length_gaps += qgap;
3391  if(cds_stats) xcript.append(qgap, 'X');
3392  }
3393 
3394  typedef TExon::TParts TParts;
3395  const TParts & parts (exon.GetParts());
3396  string errmsg;
3397  ITERATE(TParts, ii3, parts) {
3398  const CSpliced_exon_chunk & part (**ii3);
3399  const CSpliced_exon_chunk::E_Choice choice (part.Which());
3400  TSeqPos len (0);
3401  switch(choice) {
      // (case label missing from this extract) match chunk
3403  len = part.GetMatch();
3404  matches += len;
3405  aligned_query_bases += len;
3406  if(cds_stats) xcript.append(len, 'M');
3407  break;
      // (case label missing from this extract) mismatch chunk
3409  len = part.GetMismatch();
3410  aligned_query_bases += len;
3411  if(cds_stats) xcript.append(len, 'R');
3412  break;
      // (case label missing from this extract) product-insertion chunk
3414  len = part.GetProduct_ins();
3415  aligned_query_bases += len;
3416  if(cds_stats) xcript.append(len, 'D');
3417  break;
      // (case label missing from this extract) genomic-insertion chunk
3419  len = part.GetGenomic_ins();
3420  if(cds_stats) xcript.append(len, 'I');
3421  break;
3422  default:
3423  errmsg = "Unexpected spliced exon chunk part: "
3424  + part.SelectionName(choice);
3425  NCBI_THROW(CAlgoAlignException, eBadParameter, errmsg);
3426  }
3427  aln_length_exons += len;
3428  }
3429 
3430  qprev = qstrand? qmax: qmin;
3431  } // TExons
3432 
      // Trailing uncovered stretch between the last exon and the poly-A
      // position (appended without the qgap > 0 test used above).
3433  const TSeqPos qgap (qstrand? polya - qprev - 1: qprev - polya - 1);
3434  aln_length_gaps += qgap;
3435  if(cds_stats) xcript.append(qgap, 'X');
3436 
3437  if(!qstrand && qlen <= 0) {
3438  NCBI_THROW(CAlgoAlignException, eBadParameter,
3439  "CSplign::s_ComputeStats(): Cannot compute "
3440  "inframe stats - transcript length not set.");
3441  }
3442 
      // Pass 2: walk the transcript, tracking the query position and the
      // reading frame. `frame` holds a real offset in [-2, 2] while inside the
      // CDS, or one of the kFrame_* sentinels (all below -2) otherwise.
      // matches_frame is indexed by frame + 2, so slot [2] counts the matches
      // seen while the frame offset is 0.
3443  int qpos (qstrand? -1: int(qlen));
3444  int qinc (qstrand? +1: -1);
3445  int frame (kFrame_not_set);
3446  size_t aln_length_cds (0);
3447  int matches_frame[] = {0, 0, 0, 0, 0};
3448  const Int8 cds_start (cds.first), cds_stop (cds.second);
3449  for(string::const_iterator ie (xcript.end()), ii(xcript.begin());
3450  ii != ie && frame != kFrame_end; ++ii)
3451  {
3452 
3453  switch(*ii) {
3454 
      // 'M', 'R' and 'D' consume one query base: the frame is switched on at
      // cds_start and the walk is stopped at cds_stop (the stop position
      // itself is not counted, because kFrame_end is below -2).
3455  case 'M':
3456  qpos += qinc;
3457  if(frame == kFrame_not_set && qpos == cds_start) frame = 0;
3458  if(qpos == cds_stop) frame = kFrame_end;
3459  if(frame >= -2) {
3460  ++aln_length_cds;
3461  ++matches_frame[frame + 2];
3462  }
3463  break;
3464 
3465  case 'R':
3466  qpos += qinc;
3467  if(frame == kFrame_not_set && qpos == cds_start) frame = 0;
3468  if(qpos == cds_stop) frame = kFrame_end;
3469  if(frame >= -2) ++aln_length_cds;
3470  break;
3471 
      // A product insertion shifts the frame offset by +1 (modulo 3).
3472  case 'D':
3473  qpos += qinc;
3474  if(frame == kFrame_not_set && qpos == cds_start) frame = 0;
3475  if(qpos == cds_stop) frame = kFrame_end;
3476  if(frame >= -2) {
3477  ++aln_length_cds;
3478  frame = (frame + 1) % 3;
3479  }
3480  break;
3481 
      // A genomic insertion consumes no query base and shifts the frame
      // offset by -1; the % result can be negative, hence the [-2, 2] range.
3482  case 'I':
3483  if(frame >= -2) {
3484  ++aln_length_cds;
3485  frame = (frame - 1) % 3;
3486  }
3487  break;
3488 
      // An uncovered query base inside the CDS range loses the frame:
      // kFrame_lost is below -2, so later columns no longer add to
      // aln_length_cds or matches_frame, although the cds_stop test can
      // still end the walk.
3489  case 'X':
3490  qpos += qinc;
3491  if( (qstrand && cds_start <= qpos && qpos < cds_stop) ||
3492  (!qstrand && cds_start >= qpos && qpos > cds_stop) )
3493  {
3494  frame = kFrame_lost;
3495  ++aln_length_cds;
3496  }
3497  break;
3498  }
3499  }
3500 
      // Report the in-frame (frame offset 0) match count ...
3501  {
3502  CRef<CScore> score_matches_inframe (new CScore());
3503  score_matches_inframe->SetId().SetId(eCS_InframeMatches);
3504  score_matches_inframe->SetValue().SetInt(matches_frame[2]);
3505  scores.push_back(score_matches_inframe);
3506  }
3507 
      // ... and its ratio to the CDS alignment length. There is no guard for
      // aln_length_cds == 0 (e.g. when cds_start is never reached).
3508  {
3509  CRef<CScore> score_inframe_identity (new CScore());
3510  score_inframe_identity->SetId().SetId(eCS_InframeIdentity);
3511  score_inframe_identity->SetValue().
3512  SetReal(double(matches_frame[2]) / aln_length_cds);
3513  scores.push_back(score_inframe_identity);
3514  }
3515  }
3516 
3517 
      // Optionally copy the computed scores into the alignment itself: its
      // score container is resized to match and overwritten element by element.
3518  if(embed_scoreset) {
3519  CSeq_align::TScore & sa_score (sa->SetScore());
3520  sa_score.resize(scores.size());
3521  copy(scores.begin(), scores.end(), sa_score.begin());
3522  }
3523 
3524  return ss;
3525 }
3526 
3527 
3529 
3530 
@ eExtreme_Positional
numerical value
Definition: Na_strand.hpp:63
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
void remove_if(Container &c, Predicate *__pred)
Definition: chainer.hpp:69
void transform(Container &c, UnaryFunction *op)
Definition: chainer.hpp:86
CBioseq_Handle –.
TParent::TCoord TCoord
void Run(typename THitRefs::iterator start, typename THitRefs::iterator finish, CScope *scope=NULL, const vector< pair< TCoord, TCoord > > *gaps=NULL)
Execute: identify compartments.
bool GetStrand(size_t i) const
void SetMaxIntron(TCoord mi)
Assign the maximum intron length, in base pairs.
void Get(size_t idx, THitRefs &compartment) const
Retrieve a compartment by index.
const TCoord * GetBox(size_t i) const
bool GetStatus(size_t i) const
pair< size_t, size_t > GetCounts(void) const
Retrieve the compartment counts.
CConstRef –.
Definition: ncbiobj.hpp:1266
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
static void FindOrfs(const string &seq, TLocVec &results, unsigned int min_length_bp=3, int genetic_code=1, const vector< string > &allowable_starts=vector< string >(), bool longest_orfs=true, size_t max_seq_gap=k_default_max_seq_gap)
Find ORFs in both orientations.
Definition: orf.cpp:336
bool IntersectingWith(const TRange &r) const
Definition: range_coll.hpp:187
CRef –.
Definition: ncbiobj.hpp:618
CSafeStatic<>::
void AddSplignScores(const CSeq_align &align, CSeq_align::TScore &scores)
Compute the six splign scores.
CScore_set –.
Definition: Score_set.hpp:66
Definition: Score.hpp:57
Iterator over CSeqMap.
Definition: seq_map_ci.hpp:252
CSeqVector –.
Definition: seq_vector.hpp:65
CSpliced_exon_chunk –.
void ImproveFromLeft(TSeg &s)
void Cut50FromLeft(TSeg &s)
static bool HasAbuttingExonOnLeft(TSegs segments, TSeqPos p)
void Cut50FromRight(TSeg &s)
void TrimHolesToCodons(TSegs &segments, objects::CBioseq_Handle &mrna_bio_handle, bool mrna_strand, TSeqPos mrna_len)
void ImproveFromRight(TSeg &s)
bool ThrowAway20_28_90(TSeg &s)
void JoinExons(TSegs &segments, TSeqPos p1, TSeqPos p2)
void AdjustGaps(TSegs &segments)
static bool HasAbuttingExonOnRight(TSegs segments, TSeqPos p)
EStatFlags
Definition: splign.hpp:334
@ eSF_BasicCds
Definition: splign.hpp:336
@ eSF_BasicNonCds
Definition: splign.hpp:335
bool m_nopolya
Definition: splign.hpp:462
void SetMismatchScore(int score)
Definition: splign.cpp:310
~CSplign()
Definition: splign.cpp:196
void SetMinHoleLen(size_t len)
Definition: splign.cpp:414
void SetPolyaDetection(bool on)
Definition: splign.cpp:374
static int s_GetDefaultNonConsensusSpliceScore(void)
Definition: splign.cpp:155
int GetGcAgSpliceScore(void) const
Definition: splign.cpp:346
EScoringType GetScoringType(void) const
Definition: splign.cpp:298
void Run(THitRefs *hitrefs)
Definition: splign.cpp:1162
void SetCompartmentPenalty(double penalty)
Definition: splign.cpp:588
int m_NonConsensusSpliceScore
Definition: splign.hpp:402
int GetGapExtensionScore(void) const
Definition: splign.cpp:330
list< CRef< objects::CScore_set > > TScoreSets
Definition: splign.hpp:339
void SetMinSingletonIdentity(double idty)
Definition: splign.cpp:432
int m_GcAgSpliceScore
Definition: splign.hpp:400
size_t x_GetGenomicExtent(const size_t query_extent, size_t max_ext=0) const
Definition: splign.cpp:3081
CRef< objects::CScope > & SetScope(void)
Definition: splign.cpp:576
int m_MatchScore
Definition: splign.hpp:395
void SetMatchScore(int score)
Definition: splign.cpp:302
void SetMinPolyaLen(size_t len)
Definition: splign.cpp:410
EScoringType
Definition: splign.hpp:177
@ eMrnaScoring
Definition: splign.hpp:178
@ eEstScoring
Definition: splign.hpp:179
SAlignedCompartment x_RunOnCompartment(THitRefs *hitrefs, size_t range_left, size_t range_right)
Definition: splign.cpp:1430
void SetMaxIntron(size_t max_intron)
Definition: splign.cpp:465
string m_TestType
Definition: splign.hpp:437
bool GetEndGapDetection(void) const
Definition: splign.cpp:370
double GetPolyaExtIdentity(void) const
Definition: splign.cpp:486
size_t m_cds_stop
Definition: splign.hpp:468
static bool s_GetDefaultTrimToCodons(void)
Definition: splign.cpp:517
TSegments m_segments
Definition: splign.hpp:489
size_t m_MinSingletonIdtyBps
Definition: splign.hpp:435
bool x_ProcessTermSegm(TSegment **term_segs, Uint1 side) const
Definition: splign.cpp:3012
TResults m_result
Definition: splign.hpp:493
void SetMinSingletonIdentityBps(size_t idty)
Definition: splign.cpp:442
void SetMinExonIdentity(double idty)
Definition: splign.cpp:390
pair< size_t, size_t > m_BoundingRange
Definition: splign.hpp:486
static double s_GetDefaultMinCompartmentIdty(void)
Definition: splign.cpp:534
size_t GetMinPolyaLen(void) const
Definition: splign.cpp:495
int GetMismatchScore(void) const
Definition: splign.cpp:314
static int s_GetDefaultGapOpeningScore(void)
Definition: splign.cpp:135
void SetGapOpeningScore(int score)
Definition: splign.cpp:318
vector< size_t > m_pattern
Definition: splign.hpp:391
pair< size_t, size_t > TOrf
Definition: splign.hpp:319
bool m_endgaps
Definition: splign.hpp:445
static int s_GetDefaultGtAgSpliceScore(void)
Definition: splign.cpp:143
CConstRef< objects::CSeqMap > m_GenomicSeqMap
Definition: splign.hpp:472
void SetTestType(const string &test_type)
Definition: splign.cpp:552
static size_t s_GetDefaultMinPolyaLen(void)
Definition: splign.cpp:499
bool GetPolyaDetection(void) const
Definition: splign.cpp:378
void SetStrand(bool strand)
Definition: splign.cpp:382
double GetCompartmentPenalty(void) const
Definition: splign.cpp:601
CSplign(void)
Definition: splign.cpp:159
EScoringType m_ScoringType
Definition: splign.hpp:394
int m_GapExtensionScore
Definition: splign.hpp:398
size_t GetMinHoleLen(void) const
Definition: splign.cpp:504
int GetAtAcSpliceScore(void) const
Definition: splign.cpp:354
bool GetTrimToCodons(void) const
Definition: splign.cpp:513
TSeqPos m_polya_start
Definition: splign.hpp:461
void PreserveScope(bool preserve=true)
Controls whether to clean the scope object's cache on a new sequence.
Definition: splign.cpp:582
void SetMaxPartExonIdentDrop(double ident)
Definition: splign.cpp:539
void SetGcAgSpliceScore(int score)
Definition: splign.cpp:342
vector< char > m_genomic
Definition: splign.hpp:471
int m_AtAcSpliceScore
Definition: splign.hpp:401
int m_GapOpeningScore
Definition: splign.hpp:397
static size_t s_GetDefaultMaxGenomicExtent(void)
Definition: splign.cpp:447
vector< char > m_mrna_polya
Definition: splign.hpp:463
size_t m_MaxCompsPerQuery
Definition: splign.hpp:495
double m_MinExonIdty
Definition: splign.hpp:405
vector< char > m_mrna
Definition: splign.hpp:459
size_t m_MinPolyaLen
Definition: splign.hpp:411
bool AlignSingleCompartment(THitRefs *hitrefs, THit::TCoord range_left, THit::TCoord range_right, SAlignedCompartment *result)
Definition: splign.cpp:1320
void SetMaxCompsPerQuery(size_t m)
Definition: splign.cpp:560
void x_LoadSequence(vector< char > *seq, const objects::CSeq_id &seqid, THit::TCoord start, THit::TCoord finish, bool retain, bool is_genomic=false, bool genomic_strand=true)
Definition: splign.cpp:618
size_t GetMaxIntron(void) const
Definition: splign.cpp:471
static CVersionAPI & s_GetVersion(void)
Retrieve the library's version object.
Definition: splign.cpp:206
bool GetStrand(void) const
Definition: splign.cpp:386
string GetTestType(void) const
Definition: splign.cpp:556
size_t m_max_genomic_ext
Definition: splign.hpp:475
CRef< objects::CScope > GetScope(void) const
Access the scope object that the library will use to retrieve the sequences.
Definition: splign.cpp:570
static int s_GetDefaultGcAgSpliceScore(void)
Definition: splign.cpp:147
void SetScoringType(EScoringType type)
Definition: splign.cpp:274
double m_MinSingletonIdty
Definition: splign.hpp:433
CRef< objects::CScope > m_Scope
Definition: splign.hpp:387
void SetNonConsensusSpliceScore(int score)
Definition: splign.cpp:358
int m_GtAgSpliceScore
Definition: splign.hpp:399
static double s_GetDefaultCompartmentPenalty(void)
Definition: splign.cpp:596
bool m_CanResetHistory
Definition: splign.hpp:388
bool m_strand
Definition: splign.hpp:460
vector< THitRef > THitRefs
Definition: splign.hpp:295
void SetAlignerScores(void)
Definition: splign.cpp:222
static int s_GetDefaultMatchScore(void)
Definition: splign.cpp:127
static double s_GetDefaultMinExonIdty(void)
Definition: splign.cpp:481
void SetMaxGenomicExtent(size_t mge)
Definition: splign.cpp:453
static int s_GetDefaultMismatchScore(void)
Definition: splign.cpp:131
size_t GetMaxGenomicExtent(void) const
Definition: splign.cpp:459
static double s_GetDefaultMaxPartExonIdentDrop(void)
Definition: splign.cpp:547
static size_t s_ComputeStats(CRef< objects::CSeq_align_set > sas, TScoreSets *output_stats, TOrf cds=TOrf(0, 0), EStatFlags flags=eSF_BasicNonCds)
Generate statistics based on splign-generated seq-align-set, with each seq-align corresponding to an ...
Definition: splign.cpp:3288
size_t m_model_id
Definition: splign.hpp:492
double m_MinCompartmentIdty
Definition: splign.hpp:427
CConstRef< TAligner > GetAligner(void) const
Definition: splign.cpp:218
void x_FinalizeAlignedCompartment(SAlignedCompartment &ac)
Definition: splign.cpp:1147
size_t m_MinHoleLen
Definition: splign.hpp:417
double GetMinCompartmentIdentity(void) const
Definition: splign.cpp:522
static EScoringType s_GetDefaultScoringType(void)
Definition: splign.cpp:123
static size_t s_TestPolyA(const char *seq, size_t dim, size_t cds_stop=0)
Definition: splign.cpp:1392
void x_SplitQualifyingHits(THitRefs *phitrefs)
Definition: splign.cpp:757
size_t GetMaxCompsPerQuery(void) const
Definition: splign.cpp:564
double m_MaxPartExonIdentDrop
Definition: splign.hpp:482
@ eCS_InframeMatches
Definition: splign.hpp:329
@ eCS_InframeIdentity
Definition: splign.hpp:330
void SetTrimToCodons(bool)
Definition: splign.cpp:418
size_t m_MaxIntron
Definition: splign.hpp:478
int GetMatchScore(void) const
Definition: splign.cpp:306
void SetAtAcSpliceScore(int score)
Definition: splign.cpp:350
bool IsPolyA(const char *seq, size_t polya_start, size_t dim)
Definition: splign.cpp:1380
void SetPolyaExtIdentity(double idty)
Definition: splign.cpp:400
size_t m_MinPatternHitLength
Definition: splign.hpp:496
bool m_TrimToCodons
Definition: splign.hpp:421
void SetGapExtensionScore(int score)
Definition: splign.cpp:326
double GetMinSingletonIdentity(void) const
Definition: splign.cpp:526
CRef< TAligner > & SetAligner(void)
Access the spliced aligner core object.
Definition: splign.cpp:213
float x_Run(const char *seq1, const char *seq2)
Definition: splign.cpp:1935
size_t GetMinSingletonIdentityBps(void) const
Definition: splign.cpp:530
bool x_IsInGap(size_t pos)
Definition: splign.cpp:606
TSIHToMaskRanges m_MaskMap
Definition: splign.hpp:441
TStrIdToOrfs m_OrfMap
Definition: splign.hpp:455
static double s_GetDefaultPolyaExtIdty(void)
Definition: splign.cpp:490
void SetMinCompartmentIdentity(double idty)
Definition: splign.cpp:422
vector< TSegment > TSegments
Definition: splign.hpp:238
static CRef< CSplicedAligner > s_CreateDefaultAligner(void)
Definition: splign.cpp:235
int GetNonConsensusSpliceScore(void) const
Definition: splign.cpp:362
TOrfPair GetCds(const THit::TId &id, const vector< char > *seq_data=0)
Definition: splign.cpp:1080
size_t m_cds_start
Definition: splign.hpp:467
double m_CompartmentPenalty
Definition: splign.hpp:424
pair< TOrf, TOrf > TOrfPair
Definition: splign.hpp:320
void SetEndGapDetection(bool on)
Definition: splign.cpp:366
int GetGtAgSpliceScore(void) const
Definition: splign.cpp:338
objects::CBioseq_Handle m_mrna_bio_handle
Definition: splign.hpp:458
double GetMaxPartExonIdentDrop(void) const
Definition: splign.cpp:543
static size_t s_GetDefaultMinHoleLen(void)
Definition: splign.cpp:508
void SetGtAgSpliceScore(int score)
Definition: splign.cpp:334
double m_MinPolyaExtIdty
Definition: splign.hpp:408
int GetGapOpeningScore(void) const
Definition: splign.cpp:322
void x_SetPattern(THitRefs *hitrefs)
Definition: splign.cpp:812
double GetMinExonIdentity(void) const
Definition: splign.cpp:477
CRef< TAligner > m_aligner
Definition: splign.hpp:384
static THitRef sx_NewHit(THit::TCoord q0, THit::TCoord q, THit::TCoord s0, THit::TCoord s)
Definition: splign.cpp:739
void ClearMem(void)
Definition: splign.cpp:728
CNWFormatter::SSegment TSegment
Definition: splign.hpp:237
static int s_GetDefaultAtAcSpliceScore(void)
Definition: splign.cpp:151
void x_MaskSequence(vector< char > *seq, const TSeqRangeColl &mask_ranges, THit::TCoord start, THit::TCoord finish)
Definition: splign.cpp:705
int m_MismatchScore
Definition: splign.hpp:396
static int s_GetDefaultGapExtensionScore(void)
Definition: splign.cpp:139
vector< SAlnMapElem > m_alnmap
Definition: splign.hpp:452
CVersionInfo –.
container_type::const_iterator const_iterator
Definition: map.hpp:53
const_iterator end() const
Definition: map.hpp:152
const_iterator find(const key_type &key) const
Definition: map.hpp:153
static const char s_Version[]
Definition: clparser.cpp:210
static uch flags
static void test_type(TDSSOCKET *tds, TDSCOLUMN *col)
Definition: all_types.c:18
#define T(s)
Definition: common.h:230
#define true
Definition: bool.h:35
#define false
Definition: bool.h:36
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
static int type
Definition: getdata.c:31
void ImproveFromLeft(const char *seq1, const char *seq2, CConstRef< CSplicedAligner > aligner)
void ExtendRight(const vector< char > &mrna, const vector< char > &genomic, Int8 ext_len, const CNWAligner *aligner)
void ImproveFromRight(const char *seq1, const char *seq2, CConstRef< CSplicedAligner > aligner)
void AsText(string *output, ETextFormatType type, size_t line_width=100) const
void MakeSegments(vector< SSegment > *psegments) const
int CanExtendRight(const vector< char > &mrna, const vector< char > &genomic) const
const char * GetDonor(void) const
void ImproveFromRight1(const char *seq1, const char *seq2, CConstRef< CSplicedAligner > aligner)
void FromBuffer(const TNetCacheBuffer &buf)
Definition: splign.cpp:3163
static bool s_IsConsensusSplice(const char *donor, const char *acceptor, bool semi_as_cons=false)
void ToBuffer(TNetCacheBuffer *buf) const
Definition: splign.cpp:3136
vector< char > TNetCacheBuffer
const char * GetAcceptor(void) const
int CanExtendLeft(const vector< char > &mrna, const vector< char > &genomic) const
void Update(const CNWAligner *aligner)
void ExtendLeft(const vector< char > &mrna, const vector< char > &genomic, Int8 ext_len, const CNWAligner *aligner)
bool IsLowComplexityExon(const char *rna_seq)
void SetBand(size_t band)
virtual void SetSequences(const char *seq1, size_t len1, const char *seq2, size_t len2, bool verify=true)
Definition: nw_aligner.cpp:140
size_t GetLongestSeg(size_t *q0, size_t *q1, size_t *s0, size_t *s1) const
void SetWs(TScore value)
Definition: nw_aligner.hpp:122
static void s_GetSpan(const THitRefs &hitrefs, TCoord span[4])
Get sequence span for a set of alignments (hits).
Definition: hit_filter.hpp:175
virtual TScore Run(void)
Definition: nw_aligner.cpp:503
void SetWg(TScore value)
Definition: nw_aligner.hpp:121
void SetScoreMatrix(const SNCBIPackedScoreMatrix *scoremat)
void SetWms(TScore value)
void SetWm(TScore value)
void SetWi(unsigned char splice_type, TScore value)
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define NON_CONST_REVERSE_ITERATE(Type, Var, Cont)
Non constant version of REVERSE_ITERATE macro.
Definition: ncbimisc.hpp:834
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
@ eDiag_Fatal
Fatal error – guarantees exit (or abort)
Definition: ncbidiag.hpp:655
TErrCode GetErrCode(void) const
Get error code.
Definition: ncbiexpt.cpp:453
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
#define NCBI_RETHROW_SAME(prev_exception, message)
Generic macro to re-throw the same exception.
Definition: ncbiexpt.hpp:749
int TErrCode
Definition: ncbiexpt.hpp:889
EDiagSev GetSeverity(void) const
Get exception severity.
Definition: ncbiexpt.hpp:999
CException & SetSeverity(EDiagSev severity)
Set exception severity.
Definition: ncbiexpt.cpp:321
const string AsFastaString(void) const
Definition: Seq_id.cpp:2266
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
Definition: Seq_id.cpp:2145
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
Definition: Seq_loc.cpp:915
TSeqPos GetStop(ESeqLocExtremes ext) const
Definition: Seq_loc.cpp:963
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
TSeqPos GetEndPosition(void) const
return end position of current segment in sequence (exclusive)
Definition: seq_map_ci.hpp:679
CSeqMap::ESegmentType GetType(void) const
Definition: seq_map_ci.hpp:651
TSeqPos GetPosition(void) const
return position of current segment in sequence
Definition: seq_map_ci.hpp:665
CConstRef< CSeq_literal > GetRefGapLiteral(void) const
return CSeq_literal with gap data, or null if the segment is either not a gap or an unspecified gap
Definition: seq_map_ci.cpp:292
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
TSeqPos size(void) const
Definition: seq_vector.hpp:291
static CConstRef< CSeqMap > GetSeqMapForSeq_loc(const CSeq_loc &loc, CScope *scope)
Definition: seq_map.cpp:1162
CSeqMap_CI ResolvedRangeIterator(CScope *scope, TSeqPos from, TSeqPos length, ENa_strand strand=eNa_strand_plus, size_t maxResolve=size_t(-1), TFlags flags=fDefaultFlags) const
Iterate segments in the range with specified strand coordinates.
Definition: seq_map.cpp:868
@ fFindGap
Definition: seq_map.hpp:130
@ eSeqGap
gap
Definition: seq_map.hpp:97
bool NotNull(void) const THROWS_NONE
Check if pointer is not null – same effect as NotEmpty().
Definition: ncbiobj.hpp:744
TObjectType * GetPointer(void) THROWS_NONE
Get pointer.
Definition: ncbiobj.hpp:998
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool IsNull(void) const THROWS_NONE
Check if pointer is null – same effect as Empty().
Definition: ncbiobj.hpp:735
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
#define numeric_limits
Pre-declaration of the "numeric_limits<>" template Forcibly overrides (using preprocessor) the origin...
Definition: ncbi_limits.hpp:92
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
#define kMax_UInt
Definition: ncbi_limits.h:185
TThisType & SetLength(position_type length)
Definition: range.hpp:194
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define CVersion
void SetFrom(TFrom value)
Assign a value to From data member.
Definition: Range_.hpp:231
vector< CRef< CScore > > TScore
Definition: Seq_align_.hpp:398
TMatch GetMatch(void) const
Get the variant data.
list< CRef< CScore > > Tdata
Definition: Score_set_.hpp:90
Tdata & Set(void)
Assign a value to data member.
Definition: Score_set_.hpp:171
static string SelectionName(E_Choice index)
Retrieve selection name (for diagnostic purposes).
TMismatch GetMismatch(void) const
Get the variant data.
TGenomic_ins GetGenomic_ins(void) const
Get the variant data.
list< CRef< CSeq_align > > Tdata
TProduct_ins GetProduct_ins(void) const
Get the variant data.
E_Choice Which(void) const
Which variant is currently selected.
@ e_Product_ins
insertion in product sequence (i.e. gap in the genomic sequence)
@ e_Genomic_ins
insertion in genomic sequence (i.e. gap in the product sequence)
@ e_Match
both sequences represented, product and genomic sequences match
@ e_Mismatch
both sequences represented, product and genomic sequences do not match
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
yy_size_t n
int len
const CharType(& source)[N]
Definition: pointer.h:1149
void ElemToBuffer(const string &s, char *&p)
Definition: splign.cpp:3113
void ElemFromBuffer(string &s, const char *&p)
Definition: splign.cpp:3128
unsigned int a
Definition: ncbi_localip.c:102
int toupper(Uchar c)
Definition: ncbictype.hpp:73
T max(T x_, T y_)
T min(T x_, T y_)
Int4 delta(size_t dimension_, const Int4 *score_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
static unsigned cnt[256]
string GetDonor(const objects::CSpliced_exon &exon)
USING_SCOPE(objects)
static CVersionAPI * s_CreateVersion(void)
Definition: splign.cpp:200
const string kTestType_20_28
Definition: splign.hpp:55
const string kTestType_20_28_plus
Definition: splign.hpp:54
const string kTestType_production_default
Definition: splign.hpp:56
void CleaveOffByTail(CSplign::THitRefs *phitrefs, TSeqPos polya_start)
Definition: splign_util.cpp:45
const char g_msg_NoAlignment[]
Definition: messages.hpp:38
const char g_msg_CompartmentInconsistent[]
Definition: messages.hpp:39
const char g_msg_AlignedNotSpecified[]
Definition: messages.hpp:13
const char g_msg_BadIdentityThreshold[]
Definition: messages.hpp:4
const char g_msg_EmptyHitVectorPassed[]
Definition: messages.hpp:16
const char g_msg_NetCacheBufferIncomplete[]
Definition: messages.hpp:32
const char g_msg_QueryCoverageOutOfRange[]
Definition: messages.hpp:7
const char g_msg_NullPointerPassed[]
Definition: messages.hpp:29
const char g_msg_NoExonsAboveIdtyLimit[]
Definition: messages.hpp:26
const char g_msg_NoHitsAfterFiltering[]
Definition: messages.hpp:21
const char g_msg_InvalidRange[]
Definition: messages.hpp:19
ECompartmentStatus m_Status
Definition: splign.hpp:252
vector< char > TNetCacheBuffer
Definition: splign.hpp:288
void GetBox(Uint4 *box) const
Definition: splign.cpp:2969
void ToBuffer(TNetCacheBuffer *buf) const
Definition: splign.cpp:3189
void FromBuffer(const TNetCacheBuffer &buf)
Definition: splign.cpp:3247
double GetIdentity(void) const
Definition: splign.cpp:2946
Definition: type.c:6
#define _ASSERT
int g(Seg_Gsm *spe, Seq_Mtf *psm, Thd_Gsm *tdg)
Definition: thrddgri.c:44
else result
Definition: token2.c:20
const value_slice::CValueConvert< value_slice::SRunTimeCP, FROM > Convert(const FROM &value)
Modified on Sat Apr 20 12:19:15 2024 by modify_doxy.py rev. 669887