NCBI C++ ToolKit
prosplign.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*
2 * $Id: prosplign.cpp 100543 2023-08-10 16:43:54Z grichenk $
3 *
4 * =========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * =========================================================================
27 *
28 * Author: Vyacheslav Chetvernin
29 *
30 * =========================================================================
31 */
32 
33 #include <ncbi_pch.hpp>
36 
37 #include "scoring.hpp"
38 #include "PSeq.hpp"
39 #include "NSeq.hpp"
40 #include "nucprot.hpp"
41 #include "Ali.hpp"
42 #include "AliSeqAlign.hpp"
43 #include "Info.hpp"
44 
46 #include <objects/seq/seq__.hpp>
49 #include <objmgr/util/sequence.hpp>
51 #include <objmgr/seq_vector.hpp>
52 
55 USING_SCOPE(ncbi::prosplign);
56 
59 
61 {
62  if (!arg_desc->Exist("score_matrix")) {
63  arg_desc->AddDefaultKey
64  ("score_matrix",
65  "score_matrix",
66  "Aminoacid substitution matrix",
69  }
70  if (!arg_desc->Exist("allow_alt_starts")) {
71  arg_desc->AddFlag("allow_alt_starts", "treat alternative starts same way as ATG for ASN flag 'start-codon-found' (this is an ASN output oprion)");
72  }
73 
74 }
75 
77 {
80 }
81 
83 {
84  SetAltStarts(args["allow_alt_starts"]);
85  SetScoreMatrix(args["score_matrix"].AsString());
86 }
87 
89 {
90  score_matrix_name = matrix_name;
91  return *this;
92 }
94 {
95  return score_matrix_name;
96 }
97 
99 {
100  allow_alt_starts = allow_alt_start;
101  return *this;
102 }
103 
105 {
106  return allow_alt_starts;
107 }
108 
110 {
112 
113  arg_desc->AddDefaultKey
114  ("min_intron_len",
115  "min_intron_len",
116  "min_intron_len",
119  arg_desc->AddDefaultKey
120  ("gap_opening",
121  "gap_opening",
122  "Gap Opening Cost",
125  arg_desc->AddDefaultKey
126  ("gap_extension",
127  "gap_extension",
128  "Gap Extension Cost for one aminoacid (three bases)",
131  arg_desc->AddDefaultKey
132  ("frameshift_opening",
133  "frameshift_opening",
134  "Frameshift Opening Cost",
137  arg_desc->AddDefaultKey
138  ("intron_GT",
139  "intron_GT",
140  "GT/AG intron opening cost",
143  arg_desc->AddDefaultKey
144  ("intron_GC",
145  "intron_GC",
146  "GC/AG intron opening cost",
149  arg_desc->AddDefaultKey
150  ("intron_AT",
151  "intron_AT",
152  "AT/AC intron opening cost",
155  arg_desc->AddDefaultKey
156  ("intron_non_consensus",
157  "intron_non_consensus",
158  "Non Consensus Intron opening Cost",
161  arg_desc->AddDefaultKey
162  ("inverted_intron_extension",
163  "inverted_intron_extension",
164  "intron_extension cost for 1 base = 1/(inverted_intron_extension*3)",
167 }
168 ///////////////////////////////////////////////////////////////////////////
170 {
180 }
181 
183 {
184  SetMinIntronLen(args["min_intron_len"].AsInteger());
185  SetGapOpeningCost(args["gap_opening"].AsInteger());
186  SetGapExtensionCost(args["gap_extension"].AsInteger());
187  SetFrameshiftOpeningCost(args["frameshift_opening"].AsInteger());
188  SetGTIntronCost(args["intron_GT"].AsInteger());
189  SetGCIntronCost(args["intron_GC"].AsInteger());
190  SetATIntronCost(args["intron_AT"].AsInteger());
191  SetNonConsensusIntronCost(args["intron_non_consensus"].AsInteger());
192  SetInvertedIntronExtensionCost(args["inverted_intron_extension"].AsInteger());
193 }
195 {
197 
198  arg_desc->AddFlag("full", "output global alignment as is (all postprocessing options are ingoned)");
199  arg_desc->AddDefaultKey
200  ("cut_flank_partial_codons",
201  "cut_flank_partial_codons",
202  "cut partial codons and adjacent mismatches",
205  arg_desc->AddDefaultKey
206  ("fill_holes",
207  "fill_holes",
208  "postprocessing: postprocess flank regions only. Holes between good pieces will be filled back. It may decrease positives and identity",
211  try {
212  arg_desc->AddDefaultKey
213  ("min_hole_len",
214  "min_hole_len",
215  "postprocessing: fill back holes with both unaligned portions of nuc. and prot. less than min_hole_len;"
216  " 0 - don\'t fill",
219  } catch (CArgException &) {
220  /// Ignore exception, which would happen if an application sets up
221  /// command-line arguments for both splign and prosplign, creating a
222  /// duplicate argument
223  }
224  arg_desc->SetConstraint("min_hole_len", new CArgAllow_Integers(0, 10000));
225  arg_desc->AddDefaultKey
226  ("cut_trailing_Ns",
227  "cut_trailing_Ns",
228  "postprocessing: remove Ns at the end of good pieces. It may slightly decrease positives and identity",
231  arg_desc->AddDefaultKey
232  ("flank_positives",
233  "flank_positives",
234  "postprocessing: any length flank of a good piece should not be worse than this",
237  arg_desc->SetConstraint("flank_positives", new CArgAllow_Integers(0, 100));
238  arg_desc->AddDefaultKey
239  ("total_positives",
240  "total_positives",
241  "postprocessing: good piece total percentage threshold",
244  arg_desc->SetConstraint("total_positives", new CArgAllow_Integers(0, 100));
245  arg_desc->AddDefaultKey
246  ("max_bad_len",
247  "max_bad_len",
248  "postprocessing: any part of a good piece longer than max_bad_len should not be worse than min_positives",
251  arg_desc->SetConstraint("max_bad_len", new CArgAllow_Integers(0, 10000));
252  arg_desc->AddDefaultKey
253  ("min_positives",
254  "min_positives",
255  "postprocessing: any part of a good piece longer than max_bad_len should not be worse than min_positives",
258  arg_desc->SetConstraint("min_positives", new CArgAllow_Integers(0, 100));
259 
260  arg_desc->AddDefaultKey
261  ("min_exon_ident",
262  "pct",
263  "postprocessing: any full or partial exon in the output won't have lower percentage of identity",
266  arg_desc->SetConstraint("min_exon_ident", new CArgAllow_Integers(0, 100));
267  arg_desc->AddDefaultKey
268  ("min_exon_positives",
269  "pct",
270  "postprocessing: any full or partial exon in the output won't have lower percentage of positives",
273  arg_desc->SetConstraint("min_exon_positives", new CArgAllow_Integers(0, 100));
274 
275  arg_desc->AddDefaultKey
276  ("min_flanking_exon_len",
277  "min_flanking_exon_len",
278  "postprocessing: minimum number of bases in the first and last exon",
281  arg_desc->SetConstraint("min_flanking_exon_len", new CArgAllow_Integers(3,10000));
282  arg_desc->AddDefaultKey
283  ("min_good_len",
284  "min_good_len",
285  "postprocessing: good piece should not be shorter",
288  arg_desc->SetConstraint("min_good_len", new CArgAllow_Integers(3,10000));
289 
290  arg_desc->AddDefaultKey
291  ("cut_flanks_with_posit_drop",
292  "cut_flanks_with_posit_drop",
293  "cut flanks if drop of positives is more than cut_flanks_with_posit_dropoff threshold",
296  arg_desc->AddDefaultKey
297  ("cut_flanks_with_posit_dropoff",
298  "cut_flanks_with_posit_dropoff",
299  "percentage threshold for cut_flanks_with_posit_drop",
302  arg_desc->SetConstraint("cut_flanks_with_posit_dropoff", new CArgAllow_Integers(0, 100));
303  arg_desc->AddDefaultKey
304  ("cut_flanks_with_posit_window",
305  "cut_flanks_with_posit_window",
306  "window size for cut_flanks_with_posit_drop."
307  " Positives will be counted for a flank and for a window next to the flank."
308  " If difference (in percentage) is more than cut_flanks_with_posit_dropoff, flank will be dropped",
311  arg_desc->SetConstraint("cut_flanks_with_posit_window", new CArgAllow_Integers(0, 100000));
312 
313  arg_desc->AddDefaultKey
314  ("cut_flanks_with_posit_max_len",
315  "cut_flanks_with_posit_max_len",
316  "maximum length to cut for cut_flanks_with_posit_drop",
319  arg_desc->SetConstraint("cut_flanks_with_posit_max_len", new CArgAllow_Integers(-1, 100000));
320 
321  arg_desc->AddDefaultKey
322  ("cut_flanks_with_posit_gap_ratio",
323  "cut_flanks_with_posot_gap_ratio",
324  "gap ratio for cut_flanks_with_posit_drop."
325  " Gaps will be counted as 1 for opening and 1/gap_ratio for extention while trimming flanks."
326  " Setting gap_ratio to more than 1 will affect cut_flanks_with_posit_dropoff value",
329  arg_desc->SetConstraint("cut_flanks_with_posit_gap_ratio", new CArgAllow_Integers(1, 1000));
330 
331  arg_desc->AddDefaultKey
332  ("start_bonus",
333  "start_bonus",
334  "postprocessing: reward for start codon match",
337  arg_desc->SetConstraint("start_bonus", new CArgAllow_Integers(0, 1000));
338  arg_desc->AddDefaultKey
339  ("stop_bonus",
340  "stop_bonus",
341  "postprocessing: reward for stop codon at the end (not implemented)",
344  arg_desc->SetConstraint("stop_bonus", new CArgAllow_Integers(0, 1000));
345 }
346 
347 ///////////////////////////////////////////////////////////////////////////
349 {
350  switch (mode) {
351  case eWithHoles:
352 
358 
363 
366 
369 
372 
375 
378 
379  break;
380  case ePassThrough:
381 
387 
389  SetFillHoles(false);
390  SetMinHoleLen(0);
391  SetCutNs(false);
392 
395 
396  SetMaxBadLen(0);
397  SetMinPositives(0);
398 
399  SetMinExonId(0);
400  SetMinExonPos(0);
401 
403  SetMinGoodLen(0);
404 
405  SetStartBonus(0);
406  SetStopBonus(0);
407  }
408 }
409 
411 {
412  if (args["full"]) {
413 
419 
421  SetFillHoles(false);
422  SetMinHoleLen(0);
423  SetCutNs(false);
424 
427 
428  SetMaxBadLen(0);
429  SetMinPositives(0);
430 
431  SetMinExonId(0);
432  SetMinExonPos(0);
433 
435  SetMinGoodLen(0);
436 
437  SetStartBonus(0);
438  SetStopBonus(0);
439  } else {
440 
441  SetCutFlanksWithPositDrop(args["cut_flanks_with_posit_drop"].AsBoolean());
442  SetCutFlanksWithPositDropoff(args["cut_flanks_with_posit_dropoff"].AsInteger());
443  SetCutFlanksWithPositWindow(args["cut_flanks_with_posit_window"].AsInteger());
444  SetCutFlanksWithPositMaxLen(args["cut_flanks_with_posit_max_len"].AsInteger());
445  SetCutFlanksWithPositGapRatio(args["cut_flanks_with_posit_gap_ratio"].AsInteger());
446 
447  SetCutFlankPartialCodons(args["cut_flank_partial_codons"].AsBoolean());
448  SetFillHoles(args["fill_holes"].AsBoolean());
449  SetMinHoleLen(args["min_hole_len"].AsInteger());
450  SetCutNs(args["cut_trailing_Ns"].AsBoolean());
451  SetFlankPositives(args["flank_positives"].AsInteger());
452  SetTotalPositives(args["total_positives"].AsInteger());
453  SetMaxBadLen(args["max_bad_len"].AsInteger());
454  SetMinPositives(args["min_positives"].AsInteger());
455 
456  SetMinExonId(args["min_exon_ident"].AsInteger());
457  SetMinExonPos(args["min_exon_positives"].AsInteger());
458 
459  SetMinFlankingExonLen(args["min_flanking_exon_len"].AsInteger());
460  SetMinGoodLen(args["min_good_len"].AsInteger());
461  SetStartBonus(args["start_bonus"].AsInteger());
462  SetStopBonus(args["stop_bonus"].AsInteger());
463  }
464 }
465 
467 {
469  return *this;
470 }
472 {
473  return min_intron_len;
474 }
476 {
477  gap_opening = val;
478  return *this;
479 }
481 {
482  return gap_opening;
483 }
484 
486 {
487  gap_extension = val;
488  return *this;
489 }
491 {
492  return gap_extension;
493 }
494 
496 {
498  return *this;
499 }
501 {
502  return frameshift_opening;
503 }
504 
506 {
507  intron_GT = val;
508  return *this;
509 }
511 {
512  return intron_GT;
513 }
515 {
516  intron_GC = val;
517  return *this;
518 }
520 {
521  return intron_GC;
522 }
524 {
525  intron_AT = val;
526  return *this;
527 }
529 {
530  return intron_AT;
531 }
532 
534 {
536  return *this;
537 }
539 {
540  return intron_non_consensus;
541 }
542 
544 {
546  return *this;
547 }
549 {
551 }
552 
554 {
555  return GetTotalPositives() == 0 && GetFlankPositives() == 0;
556 }
557 
559 {
561  return *this;
562 }
564 {
566 }
567 
569 {
571  return *this;
572 }
574 {
576 }
577 
579 {
581  return *this;
582 }
584 {
586 }
587 
589 {
591  return *this;
592 }
594 {
596 }
597 
599 {
601  return *this;
602 }
604 {
606 }
607 
609 {
611  return *this;
612 }
614 {
616 }
617 
619 {
620  fill_holes = val;
621  return *this;
622 }
624 {
625  return fill_holes;
626 }
627 
629 {
630  min_hole_len = val;
631  return *this;
632 }
634 {
635  return min_hole_len;
636 }
637 
639 {
640  cut_ns = val;
641  return *this;
642 }
644 {
645  return cut_ns;
646 }
647 
649 {
650  min_exon_id = val;
651  return *this;
652 }
654 {
655  return min_exon_id;
656 }
657 
659 {
660  min_exon_pos = val;
661  return *this;
662 }
664 {
665  return min_exon_pos;
666 }
667 
669 {
671  return *this;
672 }
674 {
675  return flank_positives;
676 }
677 
679 {
681  return *this;
682 }
684 {
685  return total_positives;
686 }
687 
689 {
690  max_bad_len = val;
691  return *this;
692 }
694 {
695  return max_bad_len;
696 }
697 
699 {
700  min_positives = val;
701  return *this;
702 }
703 
705 {
706  return min_positives;
707 }
708 
710 {
712  return *this;
713 }
714 
716 {
717  return min_flanking_exon_len;
718 }
720 {
721  min_good_len = val;
722  return *this;
723 }
724 
726 {
727  return min_good_len;
728 }
729 
731 {
732  start_bonus = val;
733  return *this;
734 }
735 
737 {
738  return start_bonus;
739 }
741 {
742  stop_bonus = val;
743  return *this;
744 }
745 
747 {
748  return stop_bonus;
749 }
750 
751 
752 ////////////////////////////////////////////////////////////////////////////////
753 
754 
755 
757 public:
758  static CImplementation* create(CProSplignScoring scoring, bool intronless, bool one_stage, bool just_second_stage, bool old);
760  m_scoring(scoring), m_matrix(m_scoring.GetScoreMatrix(), m_scoring.sm_koef) {}
761  virtual ~CImplementation() {}
762  virtual CImplementation* clone()=0;
763 
764  // returns score, bigger is better.
765  // if genomic strand is unknown call twice with opposite strands and compare scores
766  int FindGlobalAlignment_stage1(CScope& scope, const CSeq_id& protein, const CSeq_loc& genomic);
768  CRef<CSeq_align> FindGlobalAlignment(CScope& scope, const CSeq_id& protein, const CSeq_loc& genomic_orig)
769  {
770  CSeq_loc genomic;
771  genomic.Assign(genomic_orig);
772  FindGlobalAlignment_stage1(scope, protein, genomic);
774  }
775 
776  bool HasStartOnNuc(const CSpliced_seg& sps);
777  bool HasStopOnNuc(const CSpliced_seg& sps);
778  void SeekStartStop(CSeq_align& seq_align);
779 
781  {
782  return m_scoring;
783  }
784 
785  const CSubstMatrix& GetSubstMatrix() const
786  {
787  return m_matrix;
788  }
789 
790  virtual const vector<pair<int, int> >& GetExons() const
791  {
792  NCBI_THROW(CProSplignException, eGenericError, "method relevant only for two stage prosplign");
793  }
794  virtual vector<pair<int, int> >& SetExons()
795  {
796  NCBI_THROW(CProSplignException, eGenericError, "method relevant only for two stage prosplign");
797  }
798  virtual void GetFlanks(bool& lgap, bool& rgap) const
799  {
800  NCBI_THROW(CProSplignException, eGenericError, "method relevant only for two stage prosplign");
801  }
802  virtual void SetFlanks(bool lgap, bool rgap)
803  {
804  NCBI_THROW(CProSplignException, eGenericError, "method relevant only for two stage prosplign");
805  }
806 
807  void Interrupt(void)
808  {
810  }
811 
812  void SetInterruptCallback( CProSplign::TInterruptFnPtr prg_callback, void* data)
813  {
814  m_Interrupt.SetInterruptCallback(prg_callback, data);
815  }
816 
817  void SetScope(CScope &scope)
818  { m_scope = &scope; }
819 
820  void SetTranslationTable(int gcode)
822 
823 private:
824  virtual int stage1() = 0;
825  virtual void stage2(CAli& ali) = 0;
826 
827 protected:
829  CSubstMatrix m_matrix;//scaled to be in the same scale as m_scoring
830 
834  shared_ptr<CPSeq> m_protseq;
835  shared_ptr<CNSeq> m_cnseq;
836 
838 
839 };
840 
842 public:
844  virtual COneStage* clone() { return new COneStage(*this); }
845 
846 private:
847  virtual int stage1();
848  virtual void stage2(CAli& ali);
849 
851 };
852 
854 {
855  m_bi.Init((int)m_protseq->seq.size(), (int)m_cnseq->size());//backtracking
857 }
858 
860 {
861  BackAlignNog(m_bi, ali);
862 }
863 
865 public:
866  CTwoStage(CProSplignScoring scoring, bool just_second_stage) :
867  CProSplign::CImplementation(scoring),
868  m_just_second_stage(just_second_stage), m_lgap(false), m_rgap(false) {}
869 
870  virtual const vector<pair<int, int> >& GetExons() const
871  {
872  return m_igi;
873  }
874  virtual vector<pair<int, int> >& SetExons()
875  {
876  return m_igi;
877  }
878  virtual void GetFlanks(bool& lgap, bool& rgap) const
879  {
880  lgap = m_lgap;
881  rgap = m_rgap;
882  }
883  virtual void SetFlanks(bool lgap, bool rgap)
884  {
885  m_lgap = lgap;
886  m_rgap = rgap;
887  }
888 protected:
890  vector<pair<int, int> > m_igi;
891  bool m_lgap;//true if the first one in igi is a gap
892  bool m_rgap;//true if the last one in igi is a gap
893 };
894 
895 class CTwoStageOld : public CTwoStage {
896 public:
897  CTwoStageOld(CProSplignScoring scoring, bool just_second_stage) : CTwoStage(scoring,just_second_stage) {}
898  virtual CTwoStageOld* clone() { return new CTwoStageOld(*this); }
899 private:
900  virtual int stage1();
901  virtual void stage2(CAli& ali);
902 };
903 
904 class CTwoStageNew : public CTwoStage {
905 public:
906  CTwoStageNew(CProSplignScoring scoring, bool just_second_stage) : CTwoStage(scoring,just_second_stage) {}
907  virtual CTwoStageNew* clone() { return new CTwoStageNew(*this); }
908 private:
909  virtual int stage1();
910  virtual void stage2(CAli& ali);
911 };
912 
914 {
916  return 0;
917  int score = FindIGapIntrons(m_Interrupt, m_igi, m_protseq->seq, *m_cnseq,
921  m_lgap = !m_igi.empty() && m_igi.front().first == 0;
922  m_rgap = !m_igi.empty() && m_igi.back().first + m_igi.back().second == int(m_cnseq->size());
923  return score;
924 }
925 
927 {
928  CNSeq cfrnseq;
929  cfrnseq.Init(*m_cnseq, m_igi);
930 
931  CBackAlignInfo bi;
932  bi.Init((int)m_protseq->seq.size(), (int)cfrnseq.size()); //backtracking
933 
934  FrAlign(m_Interrupt, bi, m_protseq->seq, cfrnseq,
938 
939  FrBackAlign(bi, ali);
940  CAli new_ali(m_igi, m_lgap, m_rgap, ali);
941  ali = new_ali;
942 }
943 
945 {
947  return 0;
949 }
950 
952 {
953  CNSeq cfrnseq;
954  cfrnseq.Init(*m_cnseq, m_igi);
955 
956  CBackAlignInfo bi;
957  bi.Init((int)m_protseq->seq.size(), (int)cfrnseq.size()); //backtracking
958 
960 
961  FrBackAlign(bi, ali);
962  CAli new_ali(m_igi, m_lgap, m_rgap, ali);
963  ali = new_ali;
964 }
965 
967 public:
969 private:
970  virtual void stage2(CAli& ali);
971 protected:
973 };
974 
975 class CIntronlessOld : public CIntronless {
976 public:
978  virtual CIntronlessOld* clone() { return new CIntronlessOld(*this); }
979 private:
980  virtual int stage1();
981 };
982 
983 class CIntronlessNew : public CIntronless {
984 public:
986  virtual CIntronlessNew* clone() { return new CIntronlessNew(*this); }
987 private:
988  virtual int stage1();
989 };
990 
992 {
993  m_bi.Init((int)m_protseq->seq.size(), (int)m_cnseq->size());//backtracking
994  return FrAlign(m_Interrupt, m_bi, m_protseq->seq, *m_cnseq,
998 }
999 
1001 {
1002  m_bi.Init((int)m_protseq->seq.size(), (int)m_cnseq->size());//backtracking
1004 }
1005 
1007 {
1008  FrBackAlign(m_bi, ali);
1009 }
1010 
1011 CProSplign::CImplementation* CProSplign::CImplementation::create(CProSplignScoring scoring, bool intronless, bool one_stage, bool just_second_stage, bool old)
1012 {
1013  if (intronless) {
1014  if (old)
1015  return new CIntronlessOld(scoring);
1016  else
1017  return new CIntronlessNew(scoring);
1018  } else {
1019  if (one_stage) {
1020  return new COneStage(scoring);
1021  } else {
1022  if (old)
1023  return new CTwoStageOld(scoring, just_second_stage);
1024  else
1025  return new CTwoStageNew(scoring, just_second_stage);
1026  }
1027  }
1028 }
1029 
1030 
1031 const vector<pair<int, int> >& CProSplign::GetExons() const
1032 {
1033  return m_implementation->GetExons();
1034 }
1035 
1036 vector<pair<int, int> >& CProSplign::SetExons()
1037 {
1038  return m_implementation->SetExons();
1039 }
1040 
1041 void CProSplign::GetFlanks(bool& lgap, bool& rgap) const
1042 {
1043  m_implementation->GetFlanks(lgap, rgap);
1044 }
1045 
1046 void CProSplign::SetFlanks(bool lgap, bool rgap)
1047 {
1048  m_implementation->SetFlanks(lgap, rgap);
1049 }
1050 
1051 
1052 CProSplign::CProSplign( CProSplignScoring scoring, bool intronless) :
1053  m_implementation(CImplementation::create(scoring,intronless,false,false,false))
1054 {
1055 }
1056 
1057 CProSplign::CProSplign( CProSplignScoring scoring, bool intronless, bool one_stage, bool just_second_stage, bool old) :
1058  m_implementation(CImplementation::create(scoring,intronless,one_stage,just_second_stage,old))
1059 {
1060 }
1061 
1063 {
1064 }
1065 
1067 {
1068  m_implementation->SetTranslationTable(gcode);
1069 }
1070 
1071 namespace {
1072 /// true if first and last aa are aligned, nothing about inside holes
1073 bool IsProteinSpanWhole(const CSpliced_seg& sps)
1074 {
1075  CSpliced_seg::TExons exons = sps.GetExons();
1076  if (exons.empty())
1077  return false;
1078  const CProt_pos& prot_start_pos = exons.front()->GetProduct_start().GetProtpos();
1079  const CProt_pos& prot_stop_pos = exons.back()->GetProduct_end().GetProtpos();
1080 
1081  return prot_start_pos.GetAmin()==0 && prot_start_pos.GetFrame()==1 &&
1082  prot_stop_pos.GetAmin()+1 == sps.GetProduct_length() && prot_stop_pos.GetFrame() == 3;
1083 }
1084 }
1085 
1087 {
1088  m_implementation->Interrupt();
1089 }
1090 
1092 {
1093  m_implementation->SetInterruptCallback(prg_callback, data);
1094 }
1095 
1096  //Use this method to set/change genetic code field in ASN
1097 
1098 void CProSplign::AssignGeneticCode(CScope& scope, const CSeq_id& gid, int gcode) {
1099  CBioseq_Handle hp = scope.GetBioseqHandle(gid);
1100 //cout<<MSerial_AsnText<<*hp.GetTopLevelEntry().GetCompleteSeq_entry()<<endl;
1101  list< CRef< CSeqdesc > > & ldesc = hp.GetTopLevelEntry().GetEditHandle().SetDescr().Set();
1102  bool not_found = true;
1103  NON_CONST_ITERATE(list< CRef< CSeqdesc > >, it, ldesc) {
1104  if((*it)->IsSource()) {
1105  (*it)->SetSource().SetOrg().SetOrgname().SetGcode(gcode);
1106  not_found = false;
1107  }
1108  }
1109  if(not_found) {
1110  CRef< CSeqdesc > desc(new CSeqdesc);
1111  desc->SetSource().SetOrg().SetOrgname().SetGcode(gcode);
1112  ldesc.push_back(desc);
1113  }
1114 }
1115 
1116 
1117 CRef<CSeq_align> CProSplign::FindGlobalAlignment(CScope& scope, const CSeq_id& protein, const CSeq_loc& genomic_orig)
1118 {
1119  CRef<CSeq_loc> genomic(new CSeq_loc);
1120  genomic->Assign(genomic_orig);
1121  CConstRef<CSeq_id> nucid(genomic->GetId());
1122  if ( ! nucid )
1123  NCBI_THROW(CProSplignException, eGenericError, "genomic seq-loc has multiple ids or no id at all");
1124 
1125  if (genomic->IsWhole()) {
1126  // change to Interval, because Whole doesn't allow strand change - it's always unknown.
1127  genomic->Assign(
1128  *new CSeq_loc(
1129  *SerialClone(*nucid), 0,
1130  sequence::GetLength(*nucid, &scope)-1));
1131  }
1132 
1133  //check if from <=to
1134  TSeqPos from = genomic->GetTotalRange().GetFrom();
1135  TSeqPos to = genomic->GetTotalRange().GetTo();
1136  if(from > to) {
1137  NCBI_THROW(CProSplignException, eGenericError, "genomic seq-loc has from > to");
1138  }
1139 
1141 
1142  switch (genomic->GetStrand()) {
1143  case eNa_strand_plus:
1144  case eNa_strand_minus:
1145  result = m_implementation->FindGlobalAlignment(
1146  scope, protein, *genomic);
1147  break;
1148  case eNa_strand_unknown:
1149  case eNa_strand_both:
1150  case eNa_strand_both_rev:
1151  // do both
1152  {
1153  unique_ptr<CImplementation> plus_data(m_implementation->clone());
1154  genomic->SetStrand(eNa_strand_plus);
1155  int plus_score = plus_data->FindGlobalAlignment_stage1(scope, protein, *genomic);
1156 
1157  genomic->SetStrand(eNa_strand_minus);
1158  int minus_score = m_implementation->FindGlobalAlignment_stage1(scope, protein, *genomic);
1159 
1160  if (minus_score <= plus_score)
1161  m_implementation = std::move(plus_data);
1162  }
1163 
1164  result = m_implementation->FindGlobalAlignment_stage2();
1165  break;
1166  default:
1167  genomic->SetStrand(eNa_strand_plus);
1168  result = m_implementation->FindGlobalAlignment(
1169  scope, protein, *genomic);
1170  break;
1171  }
1172 
1173  //remove genomic bounds if set
1174  if (result->CanGetBounds()) {
1175  NON_CONST_ITERATE(CSeq_align::TBounds, b, result->SetBounds()) {
1176  if ((*b)->GetId() != NULL && (*b)->GetId()->Match(*nucid)) {
1177  result->SetBounds().erase(b);
1178  break;
1179  }
1180  }
1181  }
1182  //add genomic_orig as genomic bounds
1183  CRef<CSeq_loc> genomic_bounds(new CSeq_loc);
1184  genomic_bounds->Assign(genomic_orig);
1185  result->SetBounds().push_back(genomic_bounds);
1186 
1187  return result;
1188 }
1189 
1190 int CProSplign::CImplementation::FindGlobalAlignment_stage1(CScope& scope, const CSeq_id& protein, const CSeq_loc& genomic)
1191 {
1192  int gcode = 1;
1193  try {
1194  const CSeq_id* sid = genomic.GetId();
1195  CBioseq_Handle hp = scope.GetBioseqHandle(*sid);
1196  gcode = sequence::GetOrg_ref(hp).GetGcode();
1197  } catch (...) {}
1199 
1200  m_scope = &scope;
1201  m_protein = &protein;
1202  m_genomic.Reset(new CSeq_loc);
1203  m_genomic->Assign(genomic);
1204  m_protseq.reset(new CPSeq(*m_scope, *m_protein));
1205  m_cnseq.reset(new CNSeq(*m_scope, *m_genomic));
1206 
1207  return stage1();
1208 }
1209 
1211 {
1212  CAli ali;
1213  stage2(ali);
1214 
1215  CAliToSeq_align cpa(ali, *m_scope, *m_protein, *m_genomic);
1216  CRef<CSeq_align> seq_align = cpa.MakeSeq_align(*m_protseq, *m_cnseq);
1217 
1218  SeekStartStop(*seq_align);
1219 
1220  if (!IsProteinSpanWhole(seq_align->GetSegs().GetSpliced()))
1221  seq_align->SetType(CSeq_align::eType_disc);
1222 
1223  return seq_align;
1224 }
1225 
1227 {
1228  CRef<CSeq_align> refined_align(new CSeq_align);
1229  refined_align->Assign(seq_align);
1230 
1231  if (output_options.IsPassThrough()) {
1232  prosplign::SetScores(*refined_align, scope, output_options.GetScoreMatrix());
1233  return refined_align;
1234  }
1235 
1236  CProteinAlignText alignment_text(scope, seq_align, output_options.GetScoreMatrix());
1237  list<CNPiece> good_parts = FindGoodParts( alignment_text, output_options, m_implementation->GetScaleScoring(), m_implementation->GetSubstMatrix() );
1238  if (good_parts.empty()) {
1239  return CRef<CSeq_align>();
1240  }
1241 
1242  prosplign::RefineAlignment(scope, *refined_align, good_parts/*, output_options.GetCutFlankPartialCodons()*/);
1243 
1244  if (good_parts.size()!=1 || !IsProteinSpanWhole(refined_align->GetSegs().GetSpliced())) {
1245  refined_align->SetType(CSeq_align::eType_disc);
1246  }
1247 
1248  m_implementation->SetScope(scope);
1249  m_implementation->SeekStartStop(*refined_align);
1250  prosplign::SetScores(*refined_align, scope, output_options.GetScoreMatrix());
1251 
1252  return refined_align;
1253 }
1254 
1255 
1257 {
1258  const CSpliced_exon& exon = *sps.GetExons().front();
1259  if (exon.GetProduct_start().GetProtpos().GetFrame()!=1)
1260  return false;
1261  const CSpliced_exon_chunk& chunk = *exon.GetParts().front();
1262  if (chunk.IsProduct_ins() || chunk.IsGenomic_ins())
1263  return false;
1264  int len = 0;
1265  if (chunk.IsDiag()) {
1266  len = chunk.GetDiag();
1267  } else if (chunk.IsMatch()) {
1268  len = chunk.GetMatch();
1269  } else if (chunk.IsMismatch()) {
1270  len = chunk.GetMismatch();
1271  }
1272  if (len < 3)
1273  return false;
1274 
1275  CSeq_id nucid;
1276  nucid.Assign(sps.GetGenomic_id());
1277  CSeq_loc genomic_seqloc(nucid,exon.GetGenomic_start(), exon.GetGenomic_end(),sps.GetGenomic_strand());
1278 
1279  CSeqVector genomic_seqvec(genomic_seqloc, *m_scope, CBioseq_Handle::eCoding_Iupac);
1280  CSeqVector_CI genomic_ci(genomic_seqvec);
1281 
1282  string buf;
1283  genomic_ci.GetSeqData(buf, 3);
1284  if(buf.size() != 3) return false;
1285 
1286  return m_matrix.GetTranslationTable().TranslateStartTriplet(buf) == 'M';
1287 }
1288 
1290 {
1291  const CSpliced_exon& exon = *sps.GetExons().back();
1292  if (exon.GetProduct_end().GetProtpos().GetFrame()!=3)
1293  return false;
1294 
1295  if (sps.GetGenomic_strand()==eNa_strand_minus &&
1296  exon.GetGenomic_start()<3)
1297  return false;
1298 
1299  //need to check before because TSeqPos is unsigned
1300  if(sps.GetGenomic_strand()!=eNa_strand_plus && exon.GetGenomic_start()<3) return false;
1301 
1302  TSeqPos stop_codon_start = sps.GetGenomic_strand()==eNa_strand_plus?exon.GetGenomic_end()+1:exon.GetGenomic_start()-3;
1303  TSeqPos stop_codon_end = sps.GetGenomic_strand()==eNa_strand_plus?exon.GetGenomic_end()+3:exon.GetGenomic_start()-1;
1304 
1305  CSeq_id nucid;
1306  nucid.Assign(sps.GetGenomic_id());
1307 
1308  TSeqPos seq_end = sequence::GetLength(nucid, m_scope)-1;
1309  //if (sps.GetGenomic_strand()==eNa_strand_plus?seq_end<stop_codon_end:stop_codon_start<0) //wrong. stop_codon_start is unsigned
1310  if (sps.GetGenomic_strand()==eNa_strand_plus && seq_end<stop_codon_end)
1311  return false;
1312 
1313  CSeq_loc genomic_seqloc(nucid,stop_codon_start, stop_codon_end,sps.GetGenomic_strand());
1314 
1315  CSeqVector genomic_seqvec(genomic_seqloc, *m_scope, CBioseq_Handle::eCoding_Iupac);
1316  CSeqVector_CI genomic_ci(genomic_seqvec);
1317 
1318  string buf;
1319  genomic_ci.GetSeqData(buf, 3);
1320  if(buf.size() != 3) return false;
1321 
1322  return m_matrix.GetTranslationTable().TranslateTriplet(buf) == '*';
1323  //return buf.size()==3 && (buf=="TAA" || buf=="TGA" || buf=="TAG");
1324 }
1325 
1326 
1328 {
1329  CSpliced_seg& sps = seq_align.SetSegs().SetSpliced();
1330 
1331  if (sps.IsSetModifiers()) {
1332  for (CSpliced_seg::TModifiers::iterator m = sps.SetModifiers().begin(); m != sps.SetModifiers().end(); ) {
1333  if ((*m)->IsStart_codon_found() || (*m)->IsStop_codon_found())
1334  m = sps.SetModifiers().erase(m);
1335  else
1336  ++m;
1337  }
1338  if (sps.GetModifiers().empty())
1339  sps.ResetModifiers();
1340  }
1341 
1342  if (!sps.SetExons().empty()) {
1343  //start, stop
1344  if(HasStartOnNuc(sps)) {
1346  modi->SetStart_codon_found(true);
1347  sps.SetModifiers().push_back(modi);
1348 
1349  CSpliced_exon& exon = *sps.SetExons().front();
1350  if (exon.GetProduct_start().GetProtpos().GetAmin()==0) {
1351  CSeq_id protid;
1352  protid.Assign(sps.GetProduct_id());
1353  CPSeq pseq(*m_scope,protid);
1354 
1355  CRef<CSpliced_exon_chunk> chunk = exon.SetParts().front();
1356  _ASSERT( !chunk->IsMatch() || pseq.HasStart() );
1357  if (pseq.HasStart() && !chunk->IsMatch()) {
1358  _ASSERT( chunk->IsDiag() );
1359  int len = chunk->GetDiag();
1360  _ASSERT( len >= 3 );
1361  if (len > 3) {
1362  chunk->SetDiag(len-3);
1363  chunk.Reset(new CSpliced_exon_chunk);
1364  exon.SetParts().push_front(chunk);
1365  }
1366  chunk->SetMatch(3);
1367  }
1368  }
1369  }
1370  if(HasStopOnNuc(sps)) {
1372  modi->SetStop_codon_found(true);
1373  sps.SetModifiers().push_back(modi);
1374  }
1375  }
1376 }
1377 
1378 
list< CNPiece > FindGoodParts(const CProteinAlignText &alignment_text, CProSplignOutputOptionsExt m_options, const CProSplignScaledScoring &scoring, const CSubstMatrix &matrix)
Definition: Info.cpp:107
void RefineAlignment(objects::CScope &scope, objects::CSeq_align &seq_align, const list< CNPiece > &good_parts)
void SetScores(objects::CSeq_align &seq_align, objects::CScope &scope, const string &matrix_name="BLOSUM62")
void FrBackAlign(CBackAlignInfo &bi, CAli &ali)
Definition: nucprot.cpp:684
int FindIGapIntrons(const CProSplignInterrupt &interrupt, vector< pair< int, int > > &igi, const PSEQ &pseq, const CNSeq &nseq, int g, int e, int f, const CProSplignScaledScoring &scoring, const CSubstMatrix &matrix)
Definition: nucprot.cpp:508
int FindFGapIntronNog(const CProSplignInterrupt &interrupt, vector< pair< int, int > > &igi, const PSEQ &pseq, const CNSeq &nseq, bool &left_gap, bool &right_gap, const CProSplignScaledScoring &scoring, const CSubstMatrix &matrix)
Definition: nucprot.cpp:260
int FrAlign(const CProSplignInterrupt &interrupt, CBackAlignInfo &bi, const PSEQ &pseq, const CNSeq &nseq, int g, int e, int f, const CProSplignScaledScoring &scoring, const CSubstMatrix &matrix)
Definition: nucprot.cpp:171
void BackAlignNog(CTBackAlignInfo< CBMode > &bi, CAli &ali)
Definition: nucprot.cpp:1143
int FrAlignFNog1(const CProSplignInterrupt &interrupt, CBackAlignInfo &bi, const PSEQ &pseq, const CNSeq &nseq, const CProSplignScaledScoring &scoring, const CSubstMatrix &matrix, bool left_gap, bool right_gap)
Definition: nucprot.cpp:774
int AlignFNog(const CProSplignInterrupt &interrupt, CTBackAlignInfo< CBMode > &bi, const PSEQ &pseq, const CNSeq &nseq, const CProSplignScaledScoring &scoring, const CSubstMatrix &matrix)
Definition: nucprot.cpp:937
#define false
Definition: bool.h:36
CRef< CSeq_align > MakeSeq_align(const CPSeq &cpseq, const CNSeq &cnseq) const
Definition: Ali.hpp:60
CArgAllow_Integers –.
Definition: ncbiargs.hpp:1751
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgException –.
Definition: ncbiargs.hpp:120
CArgs –.
Definition: ncbiargs.hpp:379
CBioseq_Handle –.
virtual CIntronlessNew * clone()
Definition: prosplign.cpp:986
CIntronlessNew(CProSplignScoring scoring)
Definition: prosplign.cpp:985
virtual int stage1()
Definition: prosplign.cpp:1000
CIntronlessOld(CProSplignScoring scoring)
Definition: prosplign.cpp:977
virtual CIntronlessOld * clone()
Definition: prosplign.cpp:978
virtual int stage1()
Definition: prosplign.cpp:991
CIntronless(CProSplignScoring scoring)
Definition: prosplign.cpp:968
CBackAlignInfo m_bi
Definition: prosplign.cpp:972
virtual void stage2(CAli &ali)
Definition: prosplign.cpp:1006
Definition: NSeq.hpp:52
int size(void) const
Definition: NSeq.hpp:71
void Init(CScope &scope, CSeq_loc &genomic)
Definition: NSeq.cpp:83
COneStage(CProSplignScoring scoring)
Definition: prosplign.cpp:843
CTBackAlignInfo< CBMode > m_bi
Definition: prosplign.cpp:850
virtual int stage1()
Definition: prosplign.cpp:853
virtual COneStage * clone()
Definition: prosplign.cpp:844
virtual void stage2(CAli &ali)
Definition: prosplign.cpp:859
int GetGcode(void) const
Definition: Org_ref.cpp:134
Definition: PSeq.hpp:51
bool HasStart(void)
Definition: PSeq.hpp:55
void Interrupt(void)
Definition: nucprot.hpp:186
void SetInterruptCallback(TInterruptFnPtr prg_callback, void *data)
Definition: nucprot.hpp:199
Scoring parameters object.
Definition: prosplign.hpp:54
CProSplignOptions_Base & SetScoreMatrix(const string &matrix_name)
Definition: prosplign.cpp:88
static void SetupArgDescriptions(CArgDescriptions *argdescr)
Definition: prosplign.cpp:60
CProSplignOptions_Base & SetAltStarts(bool allow_alt_start)
Definition: prosplign.cpp:98
CProSplignOptions_Base()
creates scoring parameter object with default values
Definition: prosplign.cpp:76
const string & GetScoreMatrix() const
Definition: prosplign.cpp:93
static const bool default_allow_alt_starts
Definition: prosplign.hpp:69
static const char * default_score_matrix_name
Definition: prosplign.hpp:68
bool GetAltStarts() const
Definition: prosplign.cpp:104
Output filtering parameters.
Definition: prosplign.hpp:156
static const int default_cut_flanks_with_posit_window
Definition: prosplign.hpp:247
CProSplignOutputOptions & SetCutFlanksWithPositDropoff(int)
Definition: prosplign.cpp:568
CProSplignOutputOptions & SetMinExonPos(int)
minimum exon positives percentage
Definition: prosplign.cpp:658
bool cut_flanks_with_posit_drop
???
Definition: prosplign.hpp:272
bool GetCutFlankPartialCodons() const
Definition: prosplign.cpp:613
int GetTotalPositives() const
Definition: prosplign.cpp:683
CProSplignOutputOptions & SetMinGoodLen(int)
good piece should not be shorter than that
Definition: prosplign.cpp:719
CProSplignOutputOptions & SetMinPositives(int)
Definition: prosplign.cpp:698
int GetStopBonus() const
Definition: prosplign.cpp:746
CProSplignOutputOptions & SetMinFlankingExonLen(int)
minimum number of bases in the first and last exon
Definition: prosplign.cpp:709
int GetCutFlanksWithPositGapRatio() const
Definition: prosplign.cpp:603
int GetMaxBadLen() const
Definition: prosplign.cpp:693
CProSplignOutputOptions & SetCutFlanksWithPositMaxLen(int)
max flank size to cut
Definition: prosplign.cpp:588
static const int default_total_positives
Definition: prosplign.hpp:257
static const int default_cut_flanks_with_posit_dropoff
Definition: prosplign.hpp:246
CProSplignOutputOptions(EMode mode=eWithHoles)
Definition: prosplign.cpp:348
int GetCutFlanksWithPositWindow() const
Definition: prosplign.cpp:583
int GetCutFlanksWithPositMaxLen() const
Definition: prosplign.cpp:593
CProSplignOutputOptions & SetMinHoleLen(int)
fill back small holes between good pieces holes with both unaligned protein and nucleotide portions l...
Definition: prosplign.cpp:628
static const bool default_fill_holes
Definition: prosplign.hpp:252
static const int default_flank_positives
Definition: prosplign.hpp:256
CProSplignOutputOptions & SetCutFlanksWithPositWindow(int)
window size
Definition: prosplign.cpp:578
CProSplignOutputOptions & SetCutNs(bool)
cut trailing Ns at the ends of good pieces.
Definition: prosplign.cpp:638
bool GetFillHoles() const
Definition: prosplign.cpp:623
bool GetCutFlanksWithPositDrop() const
Definition: prosplign.cpp:563
CProSplignOutputOptions & SetCutFlankPartialCodons(bool)
cut partial codons and adjacent at the beginning and at the end good pieces called at the end of post...
Definition: prosplign.cpp:608
static const bool default_cut_flanks_with_posit_drop
Definition: prosplign.hpp:245
static const int default_cut_flanks_with_posit_max_len
Definition: prosplign.hpp:248
static const int default_min_hole_len
Definition: prosplign.hpp:253
CProSplignOutputOptions & SetCutFlanksWithPositDrop(bool)
cut flanks if drop of positives is more than a dropoff in comparison to positives in a window next to...
Definition: prosplign.cpp:558
static const int default_min_good_len
Definition: prosplign.hpp:266
CProSplignOutputOptions & SetTotalPositives(int)
good piece total percentage threshold
Definition: prosplign.cpp:678
CProSplignOutputOptions & SetCutFlanksWithPositGapRatio(int)
count gaps as 1+1/gap_ratio, gap_ratio = 1 - standard behaviour.
Definition: prosplign.cpp:598
int GetMinHoleLen() const
Definition: prosplign.cpp:633
int GetMinFlankingExonLen() const
Definition: prosplign.cpp:715
int GetMinGoodLen() const
Definition: prosplign.cpp:725
static void SetupArgDescriptions(CArgDescriptions *argdescr)
Definition: prosplign.cpp:194
int GetFlankPositives() const
Definition: prosplign.cpp:673
int GetCutFlanksWithPositDropoff() const
Definition: prosplign.cpp:573
static const int default_cut_flanks_with_posit_gap_ratio
Definition: prosplign.hpp:249
CProSplignOutputOptions & SetMinExonId(int)
minimum exon identity
Definition: prosplign.cpp:648
int GetMinExonPos() const
Definition: prosplign.cpp:663
static const int default_min_flanking_exon_len
Definition: prosplign.hpp:265
CProSplignOutputOptions & SetMaxBadLen(int)
any part of a good piece longer than max_bad_len should not be worse than min_positives
Definition: prosplign.cpp:688
int GetStartBonus() const
Definition: prosplign.cpp:736
bool IsPassThrough() const
Definition: prosplign.cpp:553
CProSplignOutputOptions & SetStopBonus(int)
reward for stop codon at the end. Not implemented yet
Definition: prosplign.cpp:740
static const int default_max_bad_len
Definition: prosplign.hpp:259
static const int default_start_bonus
Definition: prosplign.hpp:268
CProSplignOutputOptions & SetStartBonus(int)
reward (in # of positives?) for start codon match.
Definition: prosplign.cpp:730
CProSplignOutputOptions & SetFlankPositives(int)
any length flank of a good piece should not be worse than this percentage threshold
Definition: prosplign.cpp:668
int GetMinExonId() const
Definition: prosplign.cpp:653
@ ePassThrough
all zeroes - no filtering
Definition: prosplign.hpp:162
@ eWithHoles
default filtering parameters
Definition: prosplign.hpp:160
CProSplignOutputOptions & SetFillHoles(bool)
fill back holes between good pieces.
Definition: prosplign.cpp:618
static const int default_stop_bonus
???
Definition: prosplign.hpp:269
static const bool default_cut_ns
Definition: prosplign.hpp:254
static const int default_min_positives
Definition: prosplign.hpp:260
static const bool default_cut_flank_partial_codons
Definition: prosplign.hpp:251
static const int default_min_exon_pos
Definition: prosplign.hpp:263
static const int default_min_exon_id
Definition: prosplign.hpp:262
int GetMinPositives() const
Definition: prosplign.cpp:704
CProSplignScoring & SetFrameshiftOpeningCost(int)
Definition: prosplign.cpp:495
int inverted_intron_extension
Definition: prosplign.hpp:146
CProSplignScoring & SetInvertedIntronExtensionCost(int)
Inverted Intron Extension Cost intron_extension cost for 1 base = 1/(inverted_intron_extension*3)
Definition: prosplign.cpp:543
int GetGapOpeningCost() const
Definition: prosplign.cpp:480
int GetMinIntronLen() const
Definition: prosplign.cpp:471
int GetFrameshiftOpeningCost() const
Definition: prosplign.cpp:500
static const int default_min_intron_len
Definition: prosplign.hpp:125
int GetInvertedIntronExtensionCost() const
Definition: prosplign.cpp:548
CProSplignScoring()
creates scoring parameter object with default values
Definition: prosplign.cpp:169
CProSplignScoring & SetGapOpeningCost(int)
in addition to ScoreMatrix prosplign uses following costs (negate to get a score)
Definition: prosplign.cpp:475
int GetGCIntronCost() const
Definition: prosplign.cpp:519
static const int default_intron_GT
Definition: prosplign.hpp:131
static const int default_gap_extension
Definition: prosplign.hpp:128
static const int default_intron_GC
Definition: prosplign.hpp:132
static const int default_intron_non_consensus
Definition: prosplign.hpp:134
static const int default_frameshift_opening
Definition: prosplign.hpp:129
CProSplignScoring & SetATIntronCost(int)
AT/AC intron opening cost.
Definition: prosplign.cpp:523
int GetATIntronCost() const
Definition: prosplign.cpp:528
static const int default_gap_opening
Definition: prosplign.hpp:127
static const int default_inverted_intron_extension
Definition: prosplign.hpp:135
CProSplignScoring & SetNonConsensusIntronCost(int)
Non Consensus Intron Cost should not exceed a sum of lowest two intron opening costs,...
Definition: prosplign.cpp:533
int GetGapExtensionCost() const
Definition: prosplign.cpp:490
CProSplignScoring & SetGapExtensionCost(int)
Gap Extension Cost for one amino acid (three bases)
Definition: prosplign.cpp:485
CProSplignScoring & SetGCIntronCost(int)
GC/AG intron opening cost.
Definition: prosplign.cpp:514
CProSplignScoring & SetMinIntronLen(int)
Definition: prosplign.cpp:466
CProSplignScoring & SetGTIntronCost(int)
GT/AG intron opening cost.
Definition: prosplign.cpp:505
static void SetupArgDescriptions(CArgDescriptions *argdescr)
Definition: prosplign.cpp:109
int GetGTIntronCost() const
Definition: prosplign.cpp:510
int GetNonConsensusIntronCost() const
Definition: prosplign.cpp:538
static const int default_intron_AT
Definition: prosplign.hpp:133
CImplementation(CProSplignScoring scoring)
Definition: prosplign.cpp:759
CProSplignInterrupt m_Interrupt
Definition: prosplign.cpp:837
bool HasStartOnNuc(const CSpliced_seg &sps)
Definition: prosplign.cpp:1256
shared_ptr< CNSeq > m_cnseq
Definition: prosplign.cpp:835
void SeekStartStop(CSeq_align &seq_align)
Definition: prosplign.cpp:1327
const CProSplignScaledScoring & GetScaleScoring() const
Definition: prosplign.cpp:780
void SetTranslationTable(int gcode)
Definition: prosplign.cpp:820
const CSeq_id * m_protein
Definition: prosplign.cpp:832
CProSplignScaledScoring m_scoring
Definition: prosplign.cpp:828
static CImplementation * create(CProSplignScoring scoring, bool intronless, bool one_stage, bool just_second_stage, bool old)
Definition: prosplign.cpp:1011
const CSubstMatrix & GetSubstMatrix() const
Definition: prosplign.cpp:785
virtual void SetFlanks(bool lgap, bool rgap)
Definition: prosplign.cpp:802
virtual void GetFlanks(bool &lgap, bool &rgap) const
Definition: prosplign.cpp:798
CRef< CSeq_loc > m_genomic
Definition: prosplign.cpp:833
virtual void stage2(CAli &ali)=0
shared_ptr< CPSeq > m_protseq
Definition: prosplign.cpp:834
int FindGlobalAlignment_stage1(CScope &scope, const CSeq_id &protein, const CSeq_loc &genomic)
Definition: prosplign.cpp:1190
void SetInterruptCallback(CProSplign::TInterruptFnPtr prg_callback, void *data)
Definition: prosplign.cpp:812
bool HasStopOnNuc(const CSpliced_seg &sps)
Definition: prosplign.cpp:1289
virtual const vector< pair< int, int > > & GetExons() const
Definition: prosplign.cpp:790
CRef< CSeq_align > FindGlobalAlignment_stage2()
Definition: prosplign.cpp:1210
void SetScope(CScope &scope)
Definition: prosplign.cpp:817
virtual CImplementation * clone()=0
virtual vector< pair< int, int > > & SetExons()
Definition: prosplign.cpp:794
CRef< CSeq_align > FindGlobalAlignment(CScope &scope, const CSeq_id &protein, const CSeq_loc &genomic_orig)
Definition: prosplign.cpp:768
spliced protein to genomic alignment
Definition: prosplign.hpp:299
bool(* TInterruptFnPtr)(void *callback_data)
User interrupt logic for GBENCH.
Definition: prosplign.hpp:318
void AssignGeneticCode(objects::CScope &scope, const objects::CSeq_id &gid, int gcode)
Definition: prosplign.cpp:1098
CProSplign(CProSplignScoring scoring=CProSplignScoring(), bool intronless=false)
By default ProSplign looks for introns.
Definition: prosplign.cpp:1052
void SetInterruptCallback(TInterruptFnPtr prg_callback, void *data)
Definition: prosplign.cpp:1091
void Interrupt(void)
for MT usage set a signal for core algorithm to interrupt calculations after this method is called fr...
Definition: prosplign.cpp:1086
CRef< objects::CSeq_align > FindGlobalAlignment(objects::CScope &scope, const objects::CSeq_id &protein, const objects::CSeq_loc &genomic)
Globally aligns protein to a region on genomic sequence.
Definition: prosplign.cpp:1117
void GetFlanks(bool &lgap, bool &rgap) const
Definition: prosplign.cpp:1041
unique_ptr< CImplementation > m_implementation
Definition: prosplign.hpp:367
void SetFlanks(bool lgap, bool rgap)
Definition: prosplign.cpp:1046
vector< pair< int, int > > & SetExons()
Definition: prosplign.cpp:1036
const vector< pair< int, int > > & GetExons() const
Definition: prosplign.cpp:1031
void SetTranslationTable(int gcode)
Definition: prosplign.cpp:1066
CRef< objects::CSeq_align > RefineAlignment(objects::CScope &scope, const objects::CSeq_align &seq_align, CProSplignOutputOptions output_options=CProSplignOutputOptions())
Refines Spliced-seg alignment by removing bad pieces according to output_options.
Definition: prosplign.cpp:1226
CProt_pos_Base::TFrame GetFrame() const
Definition: Prot_pos.hpp:82
Text representation of ProSplign alignment.
Definition: alntext.hpp:60
CScope –.
Definition: scope.hpp:92
CSeqVector –.
Definition: seq_vector.hpp:65
CSpliced_exon_chunk –.
CSpliced_seg_modifier –.
Substitution Matrix for Scoring Amino-Acid Alignments.
Definition: nucprot.hpp:123
void SetTranslationTable(const CTranslationTable *trans_table)
Definition: nucprot.cpp:91
void Init(int oilen, int ojlen)
CTwoStageNew(CProSplignScoring scoring, bool just_second_stage)
Definition: prosplign.cpp:906
virtual CTwoStageNew * clone()
Definition: prosplign.cpp:907
virtual int stage1()
Definition: prosplign.cpp:944
virtual void stage2(CAli &ali)
Definition: prosplign.cpp:951
CTwoStageOld(CProSplignScoring scoring, bool just_second_stage)
Definition: prosplign.cpp:897
virtual void stage2(CAli &ali)
Definition: prosplign.cpp:926
virtual int stage1()
Definition: prosplign.cpp:913
virtual CTwoStageOld * clone()
Definition: prosplign.cpp:898
vector< pair< int, int > > m_igi
Definition: prosplign.cpp:890
bool m_rgap
Definition: prosplign.cpp:892
bool m_just_second_stage
Definition: prosplign.cpp:889
virtual const vector< pair< int, int > > & GetExons() const
Definition: prosplign.cpp:870
virtual void GetFlanks(bool &lgap, bool &rgap) const
Definition: prosplign.cpp:878
CTwoStage(CProSplignScoring scoring, bool just_second_stage)
Definition: prosplign.cpp:866
virtual vector< pair< int, int > > & SetExons()
Definition: prosplign.cpp:874
virtual void SetFlanks(bool lgap, bool rgap)
Definition: prosplign.cpp:883
bool m_lgap
Definition: prosplign.cpp:891
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
void AddFlag(const string &name, const string &comment, CBoolEnum< EFlagValue > set_value=eFlagHasValueIfSet, TFlags flags=0)
Add description for flag argument.
Definition: ncbiargs.cpp:2459
void SetConstraint(const string &name, const CArgAllow *constraint, EConstraintNegate negate=eConstraint)
Set additional user defined constraint on argument value.
Definition: ncbiargs.cpp:2591
bool Exist(const string &name) const
Check if there is already an argument description with specified name.
Definition: ncbiargs.cpp:2654
void AddDefaultKey(const string &name, const string &synopsis, const string &comment, EType type, const string &default_value, TFlags flags=0, const string &env_var=kEmptyStr, const char *display_value=nullptr)
Add description for optional key with default value.
Definition: ncbiargs.cpp:2442
@ eBoolean
{'true', 't', 'false', 'f'}, case-insensitive
Definition: ncbiargs.hpp:590
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
C * SerialClone(const C &src)
Create on heap a clone of the source object.
Definition: serialbase.hpp:512
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
ENa_strand GetStrand(void) const
Get the location's strand.
Definition: Seq_loc.cpp:882
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
Definition: Seq_loc.cpp:337
TRange GetTotalRange(void) const
Definition: Seq_loc.hpp:913
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
void SetStrand(ENa_strand strand)
Set the strand for all of the location's ranges.
Definition: Seq_loc.cpp:5196
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
const COrg_ref & GetOrg_ref(const CBioseq_Handle &handle)
Return the org-ref associated with a given sequence.
Definition: sequence.cpp:264
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
void SetDescr(TDescr &v) const
CSeq_entry_EditHandle GetEditHandle(void) const
Get 'edit' version of handle.
CSeq_entry_Handle GetTopLevelEntry(void) const
Get top level Seq-entry handle.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer)
Fill the buffer string with the sequence data for the interval [start, stop).
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5083
void SetOrg(TOrg &value)
Assign a value to Org data member.
Definition: BioSource_.cpp:108
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
const TProtpos & GetProtpos(void) const
Get the variant data.
TModifiers & SetModifiers(void)
Assign a value to Modifiers data member.
const TGenomic_id & GetGenomic_id(void) const
Get the Genomic_id member data.
TMatch GetMatch(void) const
Get the variant data.
const TProduct_id & GetProduct_id(void) const
Get the Product_id member data.
TGenomic_start GetGenomic_start(void) const
Get the Genomic_start member data.
bool IsMismatch(void) const
Check if variant Mismatch is selected.
void SetSegs(TSegs &value)
Assign a value to Segs data member.
Definition: Seq_align_.cpp:310
TExons & SetExons(void)
Assign a value to Exons data member.
TProduct_length GetProduct_length(void) const
Get the Product_length member data.
TDiag GetDiag(void) const
Get the variant data.
bool IsSetModifiers(void) const
alignment descriptors / modifiers this provides us a set for extension Check if a value has been assi...
TMismatch GetMismatch(void) const
Get the variant data.
TGenomic_strand GetGenomic_strand(void) const
Get the Genomic_strand member data.
TAmin GetAmin(void) const
Get the Amin member data.
Definition: Prot_pos_.hpp:220
void SetType(TType value)
Assign a value to Type data member.
Definition: Seq_align_.hpp:818
const TParts & GetParts(void) const
Get the Parts member data.
const TProduct_start & GetProduct_start(void) const
Get the Product_start member data.
const TProduct_end & GetProduct_end(void) const
Get the Product_end member data.
const TSpliced & GetSpliced(void) const
Get the variant data.
Definition: Seq_align_.cpp:219
list< CRef< CSeq_loc > > TBounds
Definition: Seq_align_.hpp:400
bool IsGenomic_ins(void) const
Check if variant Genomic_ins is selected.
bool IsMatch(void) const
Check if variant Match is selected.
list< CRef< CSpliced_exon > > TExons
const TExons & GetExons(void) const
Get the Exons member data.
TParts & SetParts(void)
Assign a value to Parts data member.
bool IsDiag(void) const
Check if variant Diag is selected.
TGenomic_end GetGenomic_end(void) const
Get the Genomic_end member data.
bool IsProduct_ins(void) const
Check if variant Product_ins is selected.
const TModifiers & GetModifiers(void) const
Get the Modifiers member data.
void ResetModifiers(void)
Reset Modifiers data member.
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
@ eType_disc
discontinuous alignment
Definition: Seq_align_.hpp:104
bool IsWhole(void) const
Check if variant Whole is selected.
Definition: Seq_loc_.hpp:522
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
@ eNa_strand_both_rev
in reverse orientation
Definition: Na_strand_.hpp:69
@ eNa_strand_both
in forward orientation
Definition: Na_strand_.hpp:68
TSource & SetSource(void)
Select the variant.
Definition: Seqdesc_.cpp:572
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
char * buf
int len
mdb_mode_t mode
Definition: lmdb++.h:38
USING_SCOPE(ncbi::objects)
#define _ASSERT
else result
Definition: token2.c:20
Modified on Wed Oct 04 02:23:48 2023 by modify_doxy.py rev. 669887