62 if (!arg_desc->
Exist(
"score_matrix")) {
66 "Aminoacid substitution matrix",
70 if (!arg_desc->
Exist(
"allow_alt_starts")) {
71 arg_desc->
AddFlag(
"allow_alt_starts",
"treat alternative starts same way as ATG for ASN flag 'start-codon-found' (this is an ASN output oprion)");
128 "Gap Extension Cost for one aminoacid (three bases)",
132 (
"frameshift_opening",
133 "frameshift_opening",
134 "Frameshift Opening Cost",
140 "GT/AG intron opening cost",
146 "GC/AG intron opening cost",
152 "AT/AC intron opening cost",
156 (
"intron_non_consensus",
157 "intron_non_consensus",
158 "Non Consensus Intron opening Cost",
162 (
"inverted_intron_extension",
163 "inverted_intron_extension",
164 "intron_extension cost for 1 base = 1/(inverted_intron_extension*3)",
198 arg_desc->
AddFlag(
"full",
"output global alignment as is (all postprocessing options are ingoned)");
200 (
"cut_flank_partial_codons",
201 "cut_flank_partial_codons",
202 "cut partial codons and adjacent mismatches",
208 "postprocessing: postprocess flank regions only. Holes between good pieces will be filled back. It may decrease positives and identity",
215 "postprocessing: fill back holes with both unaligned portions of nuc. and prot. less than min_hole_len;"
228 "postprocessing: remove Ns at the end of good pieces. It may slightly decrease positives and identity",
234 "postprocessing: any length flank of a good piece should not be worse than this",
241 "postprocessing: good piece total percentage threshold",
248 "postprocessing: any part of a good piece longer than max_bad_len should not be worse than min_positives",
255 "postprocessing: any part of a good piece longer than max_bad_len should not be worse than min_positives",
263 "postprocessing: any full or partial exon in the output won't have lower percentage of identity",
268 (
"min_exon_positives",
270 "postprocessing: any full or partial exon in the output won't have lower percentage of positives",
276 (
"min_flanking_exon_len",
277 "min_flanking_exon_len",
278 "postprocessing: minimum number of bases in the first and last exon",
285 "postprocessing: good piece should not be shorter",
291 (
"cut_flanks_with_posit_drop",
292 "cut_flanks_with_posit_drop",
293 "cut flanks if drop of positives is more than cut_flanks_with_posit_dropoff threshold",
297 (
"cut_flanks_with_posit_dropoff",
298 "cut_flanks_with_posit_dropoff",
299 "percentage threshold for cut_flanks_with_posit_drop",
304 (
"cut_flanks_with_posit_window",
305 "cut_flanks_with_posit_window",
306 "window size for cut_flanks_with_posit_drop."
307 " Positives will be counted for a flank and for a window next to the flank."
308 " If difference (in percentage) is more than cut_flanks_with_posit_dropoff, flank will be dropped",
314 (
"cut_flanks_with_posit_max_len",
315 "cut_flanks_with_posit_max_len",
316 "maximum length to cut for cut_flanks_with_posit_drop",
322 (
"cut_flanks_with_posit_gap_ratio",
323 "cut_flanks_with_posot_gap_ratio",
324 "gap ratio for cut_flanks_with_posit_drop."
325 " Gaps will be counted as 1 for opening and 1/gap_ratio for extention while trimming flanks."
326 " Setting gap_ratio to more than 1 will affect cut_flanks_with_posit_dropoff value",
334 "postprocessing: reward for start codon match",
341 "postprocessing: reward for stop codon at the end (not implemented)",
450 SetCutNs(args[
"cut_trailing_Ns"].AsBoolean());
771 genomic.Assign(genomic_orig);
790 virtual const vector<pair<int, int> >&
GetExons()
const
870 virtual const vector<pair<int, int> >&
GetExons()
const
1058 m_implementation(
CImplementation::create(scoring,intronless,one_stage,just_second_stage,old))
1078 const CProt_pos& prot_start_pos = exons.front()->GetProduct_start().GetProtpos();
1079 const CProt_pos& prot_stop_pos = exons.back()->GetProduct_end().GetProtpos();
1104 if((*it)->IsSource()) {
1105 (*it)->SetSource().SetOrg().SetOrgname().SetGcode(gcode);
1112 ldesc.push_back(desc);
1120 genomic->
Assign(genomic_orig);
1146 scope, protein, *genomic);
1155 int plus_score = plus_data->FindGlobalAlignment_stage1(scope, protein, *genomic);
1158 int minus_score =
m_implementation->FindGlobalAlignment_stage1(scope, protein, *genomic);
1160 if (minus_score <= plus_score)
1169 scope, protein, *genomic);
1174 if (
result->CanGetBounds()) {
1176 if ((*b)->GetId() !=
NULL && (*b)->GetId()->Match(*nucid)) {
1184 genomic_bounds->
Assign(genomic_orig);
1185 result->SetBounds().push_back(genomic_bounds);
1194 const CSeq_id* sid = genomic.GetId();
1218 SeekStartStop(*seq_align);
1229 refined_align->
Assign(seq_align);
1233 return refined_align;
1238 if (good_parts.empty()) {
1244 if (good_parts.size()!=1 || !IsProteinSpanWhole(refined_align->
GetSegs().
GetSpliced())) {
1252 return refined_align;
1284 if(
buf.size() != 3)
return false;
1286 return m_matrix.GetTranslationTable().TranslateStartTriplet(
buf) ==
'M';
1313 CSeq_loc genomic_seqloc(nucid,stop_codon_start, stop_codon_end,sps.
GetGenomic_strand());
1320 if(
buf.size() != 3)
return false;
1322 return m_matrix.GetTranslationTable().TranslateTriplet(
buf) ==
'*';
1333 if ((*m)->IsStart_codon_found() || (*m)->IsStop_codon_found())
1344 if(HasStartOnNuc(sps)) {
1346 modi->SetStart_codon_found(
true);
1353 CPSeq pseq(*m_scope,protid);
1357 if (pseq.
HasStart() && !chunk->IsMatch()) {
1359 int len = chunk->GetDiag();
1362 chunk->SetDiag(
len-3);
1370 if(HasStopOnNuc(sps)) {
1372 modi->SetStop_codon_found(
true);
list< CNPiece > FindGoodParts(const CProteinAlignText &alignment_text, CProSplignOutputOptionsExt m_options, const CProSplignScaledScoring &scoring, const CSubstMatrix &matrix)
void RefineAlignment(objects::CScope &scope, objects::CSeq_align &seq_align, const list< CNPiece > &good_parts)
void SetScores(objects::CSeq_align &seq_align, objects::CScope &scope, const string &matrix_name="BLOSUM62")
void FrBackAlign(CBackAlignInfo &bi, CAli &ali)
int FindIGapIntrons(const CProSplignInterrupt &interrupt, vector< pair< int, int > > &igi, const PSEQ &pseq, const CNSeq &nseq, int g, int e, int f, const CProSplignScaledScoring &scoring, const CSubstMatrix &matrix)
int FindFGapIntronNog(const CProSplignInterrupt &interrupt, vector< pair< int, int > > &igi, const PSEQ &pseq, const CNSeq &nseq, bool &left_gap, bool &right_gap, const CProSplignScaledScoring &scoring, const CSubstMatrix &matrix)
int FrAlign(const CProSplignInterrupt &interrupt, CBackAlignInfo &bi, const PSEQ &pseq, const CNSeq &nseq, int g, int e, int f, const CProSplignScaledScoring &scoring, const CSubstMatrix &matrix)
void BackAlignNog(CTBackAlignInfo< CBMode > &bi, CAli &ali)
int FrAlignFNog1(const CProSplignInterrupt &interrupt, CBackAlignInfo &bi, const PSEQ &pseq, const CNSeq &nseq, const CProSplignScaledScoring &scoring, const CSubstMatrix &matrix, bool left_gap, bool right_gap)
int AlignFNog(const CProSplignInterrupt &interrupt, CTBackAlignInfo< CBMode > &bi, const PSEQ &pseq, const CNSeq &nseq, const CProSplignScaledScoring &scoring, const CSubstMatrix &matrix)
CRef< CSeq_align > MakeSeq_align(const CPSeq &cpseq, const CNSeq &cnseq) const
virtual CIntronlessNew * clone()
CIntronlessNew(CProSplignScoring scoring)
CIntronlessOld(CProSplignScoring scoring)
virtual CIntronlessOld * clone()
CIntronless(CProSplignScoring scoring)
virtual void stage2(CAli &ali)
void Init(CScope &scope, CSeq_loc &genomic)
COneStage(CProSplignScoring scoring)
CTBackAlignInfo< CBMode > m_bi
virtual COneStage * clone()
virtual void stage2(CAli &ali)
void SetInterruptCallback(TInterruptFnPtr prg_callback, void *data)
Scoring parameters object.
CProSplignOptions_Base & SetScoreMatrix(const string &matrix_name)
static void SetupArgDescriptions(CArgDescriptions *argdescr)
CProSplignOptions_Base & SetAltStarts(bool allow_alt_start)
CProSplignOptions_Base()
creates scoring parameter object with default values
const string & GetScoreMatrix() const
static const bool default_allow_alt_starts
static const char * default_score_matrix_name
bool GetAltStarts() const
Output filtering parameters.
static const int default_cut_flanks_with_posit_window
CProSplignOutputOptions & SetCutFlanksWithPositDropoff(int)
CProSplignOutputOptions & SetMinExonPos(int)
minimum exon positives percentage
bool cut_flanks_with_posit_drop
???
bool GetCutFlankPartialCodons() const
int GetTotalPositives() const
int cut_flanks_with_posit_max_len
CProSplignOutputOptions & SetMinGoodLen(int)
good piece should not be shorter than that
CProSplignOutputOptions & SetMinPositives(int)
CProSplignOutputOptions & SetMinFlankingExonLen(int)
minimum number of bases in the first and last exon
int GetCutFlanksWithPositGapRatio() const
CProSplignOutputOptions & SetCutFlanksWithPositMaxLen(int)
max flank size to cut
static const int default_total_positives
static const int default_cut_flanks_with_posit_dropoff
CProSplignOutputOptions(EMode mode=eWithHoles)
int GetCutFlanksWithPositWindow() const
int GetCutFlanksWithPositMaxLen() const
int cut_flanks_with_posit_window
CProSplignOutputOptions & SetMinHoleLen(int)
fill back small holes between good pieces holes with both unaligned protein and nucleotide portions l...
static const bool default_fill_holes
static const int default_flank_positives
CProSplignOutputOptions & SetCutFlanksWithPositWindow(int)
window size
CProSplignOutputOptions & SetCutNs(bool)
cut trailing Ns at the ends of good pieces.
bool GetFillHoles() const
int cut_flanks_with_posit_gap_ratio
bool GetCutFlanksWithPositDrop() const
CProSplignOutputOptions & SetCutFlankPartialCodons(bool)
cut partial codons and adjacent at the beginning and at the end good pieces called at the end of post...
static const bool default_cut_flanks_with_posit_drop
static const int default_cut_flanks_with_posit_max_len
static const int default_min_hole_len
CProSplignOutputOptions & SetCutFlanksWithPositDrop(bool)
cut flanks if drop of positives is more than a dropoff in comparison to positives in a window next to...
static const int default_min_good_len
bool cut_flank_partial_codons
CProSplignOutputOptions & SetTotalPositives(int)
good piece total percentage threshold
CProSplignOutputOptions & SetCutFlanksWithPositGapRatio(int)
count gaps as 1+1/gap_ratio, gap_ratio = 1 - standard behaviour.
int GetMinHoleLen() const
int GetMinFlankingExonLen() const
int GetMinGoodLen() const
static void SetupArgDescriptions(CArgDescriptions *argdescr)
int GetFlankPositives() const
int GetCutFlanksWithPositDropoff() const
int min_flanking_exon_len
static const int default_cut_flanks_with_posit_gap_ratio
CProSplignOutputOptions & SetMinExonId(int)
minimum exon identity
int GetMinExonPos() const
static const int default_min_flanking_exon_len
CProSplignOutputOptions & SetMaxBadLen(int)
any part of a good piece longer than max_bad_len should not be worse than min_positives
int GetStartBonus() const
bool IsPassThrough() const
CProSplignOutputOptions & SetStopBonus(int)
reward for stop codon at the end. Not implemented yet
static const int default_max_bad_len
static const int default_start_bonus
CProSplignOutputOptions & SetStartBonus(int)
reward (in # of positives?) for start codon match.
CProSplignOutputOptions & SetFlankPositives(int)
any length flank of a good piece should not be worse than this percentage threshold
@ ePassThrough
all zeroes - no filtering
@ eWithHoles
default filtering parameters
int cut_flanks_with_posit_dropoff
CProSplignOutputOptions & SetFillHoles(bool)
fill back holes between good pieces.
static const int default_stop_bonus
???
static const bool default_cut_ns
static const int default_min_positives
static const bool default_cut_flank_partial_codons
static const int default_min_exon_pos
static const int default_min_exon_id
int GetMinPositives() const
CProSplignScoring & SetFrameshiftOpeningCost(int)
int inverted_intron_extension
CProSplignScoring & SetInvertedIntronExtensionCost(int)
Inverted Intron Extension Cost intron_extension cost for 1 base = 1/(inverted_intron_extension*3)
int GetGapOpeningCost() const
int GetMinIntronLen() const
int GetFrameshiftOpeningCost() const
static const int default_min_intron_len
int GetInvertedIntronExtensionCost() const
CProSplignScoring()
creates scoring parameter object with default values
CProSplignScoring & SetGapOpeningCost(int)
in addition to ScoreMatrix prosplign uses following costs (negate to get a score)
int GetGCIntronCost() const
static const int default_intron_GT
static const int default_gap_extension
static const int default_intron_GC
static const int default_intron_non_consensus
static const int default_frameshift_opening
CProSplignScoring & SetATIntronCost(int)
AT/AC intron opening cost.
int GetATIntronCost() const
static const int default_gap_opening
static const int default_inverted_intron_extension
CProSplignScoring & SetNonConsensusIntronCost(int)
Non Consensus Intron Cost should not exceed a sum of lowest two intron opening costs,...
int GetGapExtensionCost() const
CProSplignScoring & SetGapExtensionCost(int)
Gap Extension Cost for one aminoacid (three bases)
CProSplignScoring & SetGCIntronCost(int)
GC/AG intron opening cost.
CProSplignScoring & SetMinIntronLen(int)
CProSplignScoring & SetGTIntronCost(int)
GT/AG intron opening cost.
static void SetupArgDescriptions(CArgDescriptions *argdescr)
int GetGTIntronCost() const
int GetNonConsensusIntronCost() const
static const int default_intron_AT
CImplementation(CProSplignScoring scoring)
CProSplignInterrupt m_Interrupt
bool HasStartOnNuc(const CSpliced_seg &sps)
shared_ptr< CNSeq > m_cnseq
void SeekStartStop(CSeq_align &seq_align)
const CProSplignScaledScoring & GetScaleScoring() const
void SetTranslationTable(int gcode)
const CSeq_id * m_protein
CProSplignScaledScoring m_scoring
static CImplementation * create(CProSplignScoring scoring, bool intronless, bool one_stage, bool just_second_stage, bool old)
const CSubstMatrix & GetSubstMatrix() const
virtual void SetFlanks(bool lgap, bool rgap)
virtual void GetFlanks(bool &lgap, bool &rgap) const
CRef< CSeq_loc > m_genomic
virtual void stage2(CAli &ali)=0
shared_ptr< CPSeq > m_protseq
int FindGlobalAlignment_stage1(CScope &scope, const CSeq_id &protein, const CSeq_loc &genomic)
void SetInterruptCallback(CProSplign::TInterruptFnPtr prg_callback, void *data)
bool HasStopOnNuc(const CSpliced_seg &sps)
virtual const vector< pair< int, int > > & GetExons() const
CRef< CSeq_align > FindGlobalAlignment_stage2()
virtual ~CImplementation()
void SetScope(CScope &scope)
virtual CImplementation * clone()=0
virtual vector< pair< int, int > > & SetExons()
CRef< CSeq_align > FindGlobalAlignment(CScope &scope, const CSeq_id &protein, const CSeq_loc &genomic_orig)
spliced protein to genomic alignment
bool(* TInterruptFnPtr)(void *callback_data)
User interrupt logic for GBENCH.
void AssignGeneticCode(objects::CScope &scope, const objects::CSeq_id &gid, int gcode)
CProSplign(CProSplignScoring scoring=CProSplignScoring(), bool intronless=false)
By default ProSplign looks for introns.
void SetInterruptCallback(TInterruptFnPtr prg_callback, void *data)
void Interrupt(void)
for MT usage set a signal for core algorithm to interrupt calculations after this method is called fr...
CRef< objects::CSeq_align > FindGlobalAlignment(objects::CScope &scope, const objects::CSeq_id &protein, const objects::CSeq_loc &genomic)
Globally aligns protein to a region on genomic sequence.
void GetFlanks(bool &lgap, bool &rgap) const
unique_ptr< CImplementation > m_implementation
void SetFlanks(bool lgap, bool rgap)
vector< pair< int, int > > & SetExons()
const vector< pair< int, int > > & GetExons() const
void SetTranslationTable(int gcode)
CRef< objects::CSeq_align > RefineAlignment(objects::CScope &scope, const objects::CSeq_align &seq_align, CProSplignOutputOptions output_options=CProSplignOutputOptions())
Refines Spliced-seg alignment by removing bad pieces according to output_options.
CProt_pos_Base::TFrame GetFrame() const
Text representation of ProSplign alignment.
Substitution Matrix for Scoring Amino-Acid Alignments.
void SetTranslationTable(const CTranslationTable *trans_table)
void Init(int oilen, int ojlen)
CTwoStageNew(CProSplignScoring scoring, bool just_second_stage)
virtual CTwoStageNew * clone()
virtual void stage2(CAli &ali)
CTwoStageOld(CProSplignScoring scoring, bool just_second_stage)
virtual void stage2(CAli &ali)
virtual CTwoStageOld * clone()
vector< pair< int, int > > m_igi
virtual const vector< pair< int, int > > & GetExons() const
virtual void GetFlanks(bool &lgap, bool &rgap) const
CTwoStage(CProSplignScoring scoring, bool just_second_stage)
virtual vector< pair< int, int > > & SetExons()
virtual void SetFlanks(bool lgap, bool rgap)
unsigned int TSeqPos
Type for sequence locations and lengths.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
void AddFlag(const string &name, const string &comment, CBoolEnum< EFlagValue > set_value=eFlagHasValueIfSet, TFlags flags=0)
Add description for flag argument.
void SetConstraint(const string &name, const CArgAllow *constraint, EConstraintNegate negate=eConstraint)
Set additional user defined constraint on argument value.
bool Exist(const string &name) const
Check if there is already an argument description with specified name.
void AddDefaultKey(const string &name, const string &synopsis, const string &comment, EType type, const string &default_value, TFlags flags=0, const string &env_var=kEmptyStr, const char *display_value=nullptr)
Add description for optional key with default value.
@ eBoolean
{'true', 't', 'false', 'f'}, case-insensitive
@ eString
An arbitrary string.
@ eInteger
Convertible into an integer number (int or Int8)
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
C * SerialClone(const C &src)
Create on heap a clone of the source object.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
ENa_strand GetStrand(void) const
Get the location's strand.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Override Assign() to incorporate cache invalidation.
TRange GetTotalRange(void) const
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
void SetStrand(ENa_strand strand)
Set the strand for all of the location's ranges.
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
const COrg_ref & GetOrg_ref(const CBioseq_Handle &handle)
Return the org-ref associated with a given sequence.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
void SetDescr(TDescr &v) const
CSeq_entry_EditHandle GetEditHandle(void) const
Get 'edit' version of handle.
CSeq_entry_Handle GetTopLevelEntry(void) const
Get top level Seq-entry handle.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer)
Fill the buffer string with the sequence data for the interval [start, stop).
void Reset(void)
Reset reference object.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
void SetOrg(TOrg &value)
Assign a value to Org data member.
TTo GetTo(void) const
Get the To member data.
TFrom GetFrom(void) const
Get the From member data.
const TProtpos & GetProtpos(void) const
Get the variant data.
TModifiers & SetModifiers(void)
Assign a value to Modifiers data member.
const TGenomic_id & GetGenomic_id(void) const
Get the Genomic_id member data.
TMatch GetMatch(void) const
Get the variant data.
const TProduct_id & GetProduct_id(void) const
Get the Product_id member data.
TGenomic_start GetGenomic_start(void) const
Get the Genomic_start member data.
bool IsMismatch(void) const
Check if variant Mismatch is selected.
void SetSegs(TSegs &value)
Assign a value to Segs data member.
TExons & SetExons(void)
Assign a value to Exons data member.
TProduct_length GetProduct_length(void) const
Get the Product_length member data.
TDiag GetDiag(void) const
Get the variant data.
bool IsSetModifiers(void) const
alignment descriptors / modifiers this provides us a set for extension Check if a value has been assi...
TMismatch GetMismatch(void) const
Get the variant data.
TGenomic_strand GetGenomic_strand(void) const
Get the Genomic_strand member data.
TAmin GetAmin(void) const
Get the Amin member data.
void SetType(TType value)
Assign a value to Type data member.
const TParts & GetParts(void) const
Get the Parts member data.
const TProduct_start & GetProduct_start(void) const
Get the Product_start member data.
const TProduct_end & GetProduct_end(void) const
Get the Product_end member data.
const TSpliced & GetSpliced(void) const
Get the variant data.
list< CRef< CSeq_loc > > TBounds
bool IsGenomic_ins(void) const
Check if variant Genomic_ins is selected.
bool IsMatch(void) const
Check if variant Match is selected.
list< CRef< CSpliced_exon > > TExons
const TExons & GetExons(void) const
Get the Exons member data.
TParts & SetParts(void)
Assign a value to Parts data member.
bool IsDiag(void) const
Check if variant Diag is selected.
TGenomic_end GetGenomic_end(void) const
Get the Genomic_end member data.
bool IsProduct_ins(void) const
Check if variant Product_ins is selected.
const TModifiers & GetModifiers(void) const
Get the Modifiers member data.
void ResetModifiers(void)
Reset Modifiers data member.
const TSegs & GetSegs(void) const
Get the Segs member data.
@ eType_disc
discontinuous alignment
bool IsWhole(void) const
Check if variant Whole is selected.
@ eNa_strand_both_rev
in reverse orientation
@ eNa_strand_both
in forward orientation
TSource & SetSource(void)
Select the variant.
unsigned int
A callback function used to compare two keys in a database.
USING_SCOPE(ncbi::objects)