67 : m_ErrorMode(eError_Throw)
68 , m_SubstMatrixName(
"BLOSUM62")
84 length += it->IntersectionWith(
range).GetLength();
100 size_t rows = data.size();
101 size_t size = data[0].size();
102 for (
size_t i = 1;
i < rows; ++
i ) {
105 "Rows have different lengths");
108 for (
size_t a = 0;
a <
size; ++
a) {
109 bool is_mismatch =
false;
111 for (
size_t b = 1;
b < rows; ++
b) {
112 if (data[
b][
a] != c) {
144 if ( !prod_bsh || !genomic_bsh ) {
148 " in order to count identities/mismatches");
165 if (
range.IsReversed()) {
175 end_offset = range_it->GetToOpen() - r1.
GetFrom();
176 string::const_iterator pit = prod_data.begin()
178 string::const_iterator pit_end = prod_data.begin()
180 string::const_iterator git = gen_data.begin()
182 string::const_iterator git_end = gen_data.begin()
185 for ( ; pit != pit_end && git != git_end; ++pit, ++git)
187 bool match = (*pit == *git);
188 *identities +=
match;
189 *mismatches += !
match;
208 codon[0] = codon[1] = codon[2] =
'N';
217 size_t i = last_r1.
GetTo() + 1;
219 for ( ;
i != r1.
GetFrom() && count < 3; ++
i, ++count) {
220 codon[
i % 3 ] =
'N';
228 if (
range.IsReversed()) {
241 for (
size_t i = 0;
i < gen_data.size(); ++
i, ++prod_pos) {
242 codon[ prod_pos % 3 ] = gen_data[
i];
245 if (prod_pos % 3 == 2) {
247 char residue = (prod_pos == 2
254 if (residue == prod[prod_pos / 3] &&
255 residue !=
'X' && residue !=
'-') {
286 int* identities,
int* mismatches,
290 _ASSERT(identities && mismatches);
291 if (ranges.empty()) {
300 if (ranges.begin()->IsWhole() &&
304 *identities += num_ident;
305 *mismatches += (
len - num_ident);
317 for (
int seg = 0; seg < vec.
GetNumSegs(); ++seg) {
318 bool has_gap =
false;
331 seg_stop = vec.
GetStop(0, seg);
339 range_it->GetFrom()+
offset,
340 range_it->GetTo()+
offset);
341 data[
i] += seq_string;
354 identities, mismatches, ranges);
361 "identity + mismatch function not implemented for std-seg");
366 int aln_identities = 0;
367 int aln_mismatches = 0;
368 bool has_non_standard =
false;
380 switch (chunk.
Which()) {
385 part_start+part_len-1));
392 part_start+part_len-1));
399 part_start+part_len-1)))
401 has_non_standard =
true;
412 part_start += part_len;
415 has_non_standard =
true;
419 if ( !has_non_standard ) {
420 *identities += aln_identities;
421 *mismatches += aln_mismatches;
431 identities, mismatches);
434 "Can't calculate identities/mismatches for "
435 "alignment with genomic sequence " +
437 "; Loader can't load all required "
438 "components of sequence");
456 double* pct_identity,
461 size_t count_aligned = 0;
479 *pct_identity = 100.0f * double(*identities) / count_aligned;
497 "failed to retrieve sequence: " +
id.AsFastaString());
539 double* pct_coverage,
unsigned query = 0)
541 if (!ranges.
empty() && ranges.
begin()->IsWhole() &&
550 if(ranges.
empty() || !ranges.
begin()->IsWhole()){
559 const objects::CBioseq_Handle& bsh_seq = scope.
GetBioseqHandle(query_id);
563 "Can't get sequence data for " + query_id.AsFastaString() +
564 " in order to calculate coverage");
566 seq_len = bsh_seq.GetBioseqLength();
584 if (is_protein_to_genomic) {
598 *pct_coverage = 100.0f * double(covered_bases) / double(seq_len);
607 int* positives,
int* negatives)
614 "num_positives and num_negatives scores only defined "
615 "for protein alignment");
619 const string& dna = pro_text.
GetDNA();
621 for(string::size_type
i=0;
i<
match.size(); ++
i) {
627 *positives += increment;
632 *negatives += increment;
653 double pct_identity = 0;
655 &identities, &mismatches, &pct_identity,
type);
667 double pct_identity = 0;
669 &identities, &mismatches, &pct_identity,
type,
682 double pct_identity = 0;
684 &identities, &mismatches, &pct_identity,
type, ranges);
693 double pct_coverage = 0;
706 double pct_coverage = 0;
716 double pct_coverage = 0;
740 int& identities,
int& mismatches)
772 int& identities,
int& mismatches)
803 int& identities,
int& mismatches)
830 int& positives,
int& negatives)
925 double pct_identity = 0;
954 switch (e.GetErrCode()) {
967 <<
"CScoreBuilderBase::AddScore(): error computing score: "
975 string GetDonor(
const objects::CSpliced_exon& exon) {
976 if( exon.CanGetDonor_after_exon() && exon.GetDonor_after_exon().CanGetBases() ) {
977 return exon.GetDonor_after_exon().GetBases();
983 if( exon.CanGetAcceptor_before_exon() && exon.GetAcceptor_before_exon().CanGetBases() ) {
984 return exon.GetAcceptor_before_exon().GetBases();
991 if(donor.length()<2 || acc.length()<2)
return false;
1001 if(don2 ==
'T' || don2 ==
'C')
return true;
1020 "CScoreBuilderBase::AddSplignScores(): Unsupported product type");
1025 typedef TSpliced::TExons TExons;
1026 const TExons & exons (spliced.GetExons());
1029 aligned_query_bases (0),
1030 aln_length_exons (0),
1031 aln_length_gaps (0),
1033 splices_consensus (0);
1035 const TSeqPos qlen (spliced.GetProduct_length());
1036 const TSeqPos polya (spliced.CanGetPoly_a()?
1037 spliced.GetPoly_a(): (qstrand? qlen:
TSeqPos(-1)));
1038 const TSeqPos prod_length_no_polya (qstrand? polya: qlen - 1 - polya);
1045 const TExon & exon (**ii2);
1046 const TSeqPos qmin (exon.GetProduct_start().GetNucpos()),
1047 qmax (exon.GetProduct_end().GetNucpos());
1049 const TSeqPos qgap (qstrand? qmin - qprev - 1: qprev - qmax - 1);
1052 aln_length_gaps += qgap;
1055 else if (ii2 != exons.begin()) {
1060 typedef TExon::TParts TParts;
1061 const TParts & parts (exon.GetParts());
1071 aligned_query_bases +=
len;
1075 aligned_query_bases +=
len;
1079 aligned_query_bases +=
len;
1085 errmsg =
"Unexpected spliced exon chunk part: "
1089 aln_length_exons +=
len;
1093 qprev = qstrand? qmax: qmin;
1096 const TSeqPos qgap (qstrand? polya - qprev - 1: qprev - polya - 1);
1097 aln_length_gaps += qgap;
1099 for (CSeq_align::TScore::iterator it = scores.begin(); it != scores.end(); )
1102 if ((*it)->GetId().IsStr()) {
1105 . find((*it)->GetId().GetStr());
1107 score_type = score->second;
1113 it = scores.erase(it);
1121 score_matches->SetId().SetStr(
1123 score_matches->SetValue().SetInt(matches);
1124 scores.push_back(score_matches);
1128 score_overall_identity->SetId().SetStr(
1130 score_overall_identity->SetValue().
1131 SetReal(
double(matches)/(aln_length_exons + aln_length_gaps));
1132 scores.push_back(score_overall_identity);
1136 score_splices->SetId().SetStr(
1138 score_splices->SetValue().SetInt(splices_total);
1139 scores.push_back(score_splices);
1143 score_splices_consensus->SetId().SetStr(
1145 score_splices_consensus->SetValue().SetInt(splices_consensus);
1146 scores.push_back(score_splices_consensus);
1150 score_coverage->SetId().SetStr(
1152 score_coverage->SetValue().
1153 SetReal(
double(aligned_query_bases) / prod_length_no_polya);
1154 scores.push_back(score_coverage);
1158 score_exon_identity->SetId().SetStr(
1160 score_exon_identity->SetValue().
1161 SetReal(
double(matches) / aln_length_exons);
1162 scores.push_back(score_exon_identity);
1189 "CScoreBuilderBase::ComputeScore(): "
1190 "generic 'score' computation is undefined");
1200 "CScoreBuilderBase::ComputeScore(): "
1201 "BLAST scores are available in CScoreBuilder, "
1202 "not CScoreBuilderBase");
1209 if (ranges.
empty() || !ranges.
begin()->IsWhole()) {
1211 "positive-count score not supported within a range");
1216 if (ranges.
empty() || !ranges.
begin()->IsWhole()) {
1218 "positive-count score not supported within a range");
1235 double pct_identity = 0;
1237 &identities, &mismatches, &pct_identity,
1239 return pct_identity;
1247 double pct_identity = 0;
1249 &identities, &mismatches, &pct_identity,
1251 return pct_identity;
1259 double pct_identity = 0;
1261 &identities, &mismatches, &pct_identity,
1263 return pct_identity;
1269 double pct_coverage = 0;
1271 return pct_coverage;
1280 "High-quality percent coverage not supported "
1281 "for standard seg representation");
1283 if (ranges.
empty() || !ranges.
begin()->IsWhole()) {
1285 "High-quality percent coverage not supported "
1294 SetExcludeExternal());
1297 if(feat_it->GetData().GetRegion() ==
"alignable" &&
1298 feat_it->GetAnnot().IsNamed() &&
1299 feat_it->GetAnnot().GetName() ==
"NCBI_GPIPE")
1301 alignable_range = feat_it->GetRange();
1305 double pct_coverage = 0;
1309 return pct_coverage;
1320 if (ranges.
empty() || !ranges.
begin()->IsWhole()) {
1322 "splign scores not supported within a range");
1329 if ((*it)->GetValue().IsInt()) {
1330 return (*it)->GetValue().GetInt();
1332 return (*it)->GetValue().GetReal();
1336 NCBI_ASSERT(
false,
"Should never reach this point");
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
void ConvertSeqAlignToPairwiseAln(CPairwiseAln &pairwise_aln, const objects::CSeq_align &sa, objects::CSeq_align::TDim row_1, objects::CSeq_align::TDim row_2, CAlnUserOptions::EDirection direction=CAlnUserOptions::eBothDirections, const TAlnSeqIdVec *ids=0)
Build pairwise alignment from the selected rows of a seq-align.
CAlignRange Represents an element of pairwise alignment of two sequences.
TSignedSeqPos GetStop(TNumrow row, TNumseg seg, int offset=0) const
TSignedSeqPos GetStart(TNumrow row, TNumseg seg, int offset=0) const
TDim GetNumRows(void) const
TNumseg GetNumSegs(void) const
Default IAlnSeqId implementation based on CSeq_id_Handle.
string & GetSeqString(string &buffer, TNumrow row, TSeqPos seq_from, TSeqPos seq_to) const
bool IsSetWidths(void) const
static const CTrans_table & GetTransTable(int id)
Data loader exceptions, used by GenBank loader.
A pairwise aln is a collection of ranges for a pair of rows.
Text representation of ProSplign alignment.
const string & GetDNA() const
const string & GetMatch() const
const string & GetProtein() const
TThisType & IntersectWith(const TRange &r)
const_iterator begin() const
position_type GetCoveredLength(void) const
Returns total length covered by ranges in this collection, i.e.
int GetPositiveCount(CScope &scope, const CSeq_align &align)
counts based on substitution matrix for protein alignments
TSeqPos GetAlignLength(const CSeq_align &align, bool ungapped=false)
Compute the length of the alignment (= length of all segments, gaps + aligned)
@ eError_Report
Print error messages, but do not fail.
@ eError_Throw
Throw exceptions on errors.
int GetIdentityCount(CScope &scope, const CSeq_align &align)
Compute the number of identities in the alignment.
void AddSplignScores(const CSeq_align &align, CSeq_align::TScore &scores)
Compute the six splign scores.
void AddScore(CScope &scope, CSeq_align &align, CSeq_align::EScoreType score)
int GetGapCount(const CSeq_align &align)
Compute the number of gaps in the alignment.
double ComputeScore(CScope &scope, const CSeq_align &align, CSeq_align::EScoreType score)
EPercentIdentityType
Compute percent identity (range 0-100)
int GetNegativeCount(CScope &scope, const CSeq_align &align)
double GetPercentCoverage(CScope &scope, const CSeq_align &align, unsigned query=0)
Compute percent coverage of the query (sequence 0) (range 0-100)
double GetPercentIdentity(CScope &scope, const CSeq_align &align, EPercentIdentityType type=eGapped)
virtual ~CScoreBuilderBase()
Destructor.
void x_GetMatrixCounts(CScope &scope, const CSeq_align &align, int *positives, int *negatives)
int GetMismatchCount(CScope &scope, const CSeq_align &align)
Compute the number of mismatches in the alignment.
CScoreBuilderBase()
Default constructor.
EErrorMode GetErrorMode(void) const
void GetMatrixCounts(CScope &scope, const CSeq_align &align, int &positives, int &negatives)
void SetSubstMatrix(const string &name)
int GetGapBaseCount(const CSeq_align &align)
Compute the number of gap bases in the alignment (= length of all gap segments)
static SIZE_TYPE ReverseComplement(const string &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst)
EScoreType
enum controlling known named scores
@ eScore_PercentIdentity_GapOpeningOnly
@ eScore_PercentIdentity_Gapped
@ eScore_ConsensusSplices
@ eScore_PercentIdentity_Ungapped
@ eScore_HighQualityPercentCoverage
TSeqPos GetNumGapOpeningsWithinRanges(const CRangeCollection< TSeqPos > &ranges, TDim row=-1) const
TSeqPos GetTotalGapCount(TDim row=-1) const
Retrieves the total number of gaps in the given row an alignment; all gaps by default.
TSeqPos GetNumGapOpeningsWithinRange(const TSeqRange &range, TDim row=-1) const
static string ScoreName(EScoreType score)
void SetNamedScore(const string &id, int score)
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
bool GetNamedScore(const string &id, int &score) const
Get score.
TSeqPos GetTotalGapCountWithinRanges(const CRangeCollection< TSeqPos > &ranges, TDim row=-1) const
void ResetNamedScore(const string &name)
TSeqPos GetAlignLengthWithinRange(const TSeqRange &range, bool include_gaps=true) const
Get the length of this alignment within a specified range By default, this function computes an align...
TSeqPos GetTotalGapCountWithinRange(const TSeqRange &range, TDim row=-1) const
TSeqPos GetAlignLength(bool include_gaps=true) const
Get the length of this alignment.
TSeqPos GetAlignLengthWithinRanges(const CRangeCollection< TSeqPos > &ranges, bool include_gaps=true) const
Get the length of this alignment within a specified range By default, this function computes an align...
static const TScoreNameMap & ScoreNameMap()
static bool IsIntegerScore(EScoreType score)
TSeqPos GetNumGapOpenings(TDim row=-1) const
Retrieves the number of gap openings in a given row in an alignment (ignoring how many gaps are in th...
@ eUnsupported
Operation that is undefined for the given input Seq-align, and which is impossible to perform.
@ eNotImplemented
Attempt to use unimplemented funtionality.
char GetStartResidue(int state) const
char GetCodonResidue(int state) const
static int SetCodonState(unsigned char ch1, unsigned char ch2, unsigned char ch3)
container_type::const_iterator const_iterator
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
#define NCBI_ASSERT(expr, mess)
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
void Error(CExceptionArgs_Base &args)
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
#define NCBI_RETHROW_SAME(prev_exception, message)
Generic macro to re-throw the same exception.
const string AsFastaString(void) const
const COrg_ref & GetOrg_ref(const CBioseq_Handle &handle)
Return the org-ref associated with a given sequence.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
CSeq_inst::TMol GetSequenceType(const CSeq_id &id, TGetFlags flags=0)
Get molecular type of sequence (protein/dna/rna) Return CSeq_inst::eMol_not_set if sequence is not fo...
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
unsigned char Uchar
Alias for unsigned char.
TThisType & Set(position_type from, position_type to)
CRange< TSeqPos > TSeqRange
typedefs for sequence ranges
static TThisType GetWhole(void)
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
TTo GetTo(void) const
Get the To member data.
TFrom GetFrom(void) const
Get the From member data.
TGcode GetGcode(void) const
Get the Gcode member data.
const TOrgname & GetOrgname(void) const
Get the Orgname member data.
const TDenseg & GetDenseg(void) const
Get the variant data.
E_Choice Which(void) const
Which variant is currently selected.
bool IsSetParts(void) const
basic seqments always are in biologic order Check if a value has been assigned to Parts data member.
vector< CRef< CScore > > TScore
TMatch GetMatch(void) const
Get the variant data.
bool IsSetProduct_strand(void) const
should be 'plus' or 'minus' Check if a value has been assigned to Product_strand data member.
static string SelectionName(E_Choice index)
Retrieve selection name (for diagnostic purposes).
TProduct_length GetProduct_length(void) const
Get the Product_length member data.
bool IsSetPoly_a(void) const
start of poly(A) tail on the transcript For sense transcripts: aligned product positions < poly-a <= ...
TDiag GetDiag(void) const
Get the variant data.
TProduct_type GetProduct_type(void) const
Get the Product_type member data.
TMismatch GetMismatch(void) const
Get the variant data.
const TParts & GetParts(void) const
Get the Parts member data.
const TProduct_start & GetProduct_start(void) const
Get the Product_start member data.
const TProduct_end & GetProduct_end(void) const
Get the Product_end member data.
const TSpliced & GetSpliced(void) const
Get the variant data.
TGenomic_ins GetGenomic_ins(void) const
Get the variant data.
list< CRef< CSpliced_exon > > TExons
const TExons & GetExons(void) const
Get the Exons member data.
bool IsStd(void) const
Check if variant Std is selected.
list< CRef< CSpliced_exon_chunk > > TParts
bool IsSetProduct_length(void) const
length of the product, in bases/residues from this (or from poly-a if present), a 3' unaligned length...
TPoly_a GetPoly_a(void) const
Get the Poly_a member data.
bool IsSpliced(void) const
Check if variant Spliced is selected.
TProduct_strand GetProduct_strand(void) const
Get the Product_strand member data.
list< CRef< CSeq_align > > Tdata
TProduct_ins GetProduct_ins(void) const
Get the variant data.
const TDisc & GetDisc(void) const
Get the variant data.
const Tdata & Get(void) const
Get the member data.
const TSegs & GetSegs(void) const
Get the Segs member data.
bool IsDenseg(void) const
Check if variant Denseg is selected.
E_Choice Which(void) const
Which variant is currently selected.
@ e_Product_ins
insertion in product sequence (i.e. gap in the genomic sequence)
@ e_Diag
both sequences are represented, there is sufficient similarity between product and genomic sequences....
@ e_Genomic_ins
insertion in genomic sequence (i.e. gap in the product sequence)
@ e_Match
both sequences represented, product and genomic sequences match
@ e_Mismatch
both sequences represented, product and genomic sequences do not match
@ eProduct_type_transcript
@ e_Region
named region (globin locus)
EMol
molecule class in living organism
@ eMol_not_set
> cdna = rna
range(_Ty, _Ty) -> range< _Ty >
const struct ncbi::grid::netcache::search::fields::SIZE size
static int match(register const pcre_uchar *eptr, register const pcre_uchar *ecode, const pcre_uchar *mstart, int offset_top, match_data *md, eptrblock *eptrb, unsigned int rdepth)
static void s_GetCountIdentityMismatch(CScope &scope, const CSeq_align &align, int *identities, int *mismatches, const CRangeCollection< TSeqPos > &ranges=CRangeCollection< TSeqPos >(TSeqRange::GetWhole()))
static void s_GetNucIdentityMismatch(const vector< string > &data, int *identities, int *mismatches)
calculate mismatches and identities in a seq-align
static bool s_SequenceIsProtein(CScope &scope, const CSeq_id &id)
calculate the percent coverage
bool IsConsSplice(const string &donor, const string acc)
static TSeqPos s_IntersectionLength(const CRangeCollection< TSeqPos > &ranges, const TSeqRange &range)
Get length of intersection between a range and a range collection.
static void s_GetPercentIdentity(CScope &scope, const CSeq_align &align, int *identities, int *mismatches, double *pct_identity, CScoreBuilderBase::EPercentIdentityType type, const CRangeCollection< TSeqPos > &ranges=CRangeCollection< TSeqPos >(TSeqRange::GetWhole()))
calculate the percent identity we also return the count of identities and mismatches
static void s_GetPercentCoverage(CScope &scope, const CSeq_align &align, const CRangeCollection< TSeqPos > &ranges, double *pct_coverage, unsigned query=0)
string GetAcceptor(const objects::CSpliced_exon &exon)
static void s_GetSplicedSegIdentityMismatch(CScope &scope, const CSeq_align &align, const CRangeCollection< TSeqPos > &ranges, int *identities, int *mismatches)
string GetDonor(const objects::CSpliced_exon &exon)
static bool s_IsProteinToGenomic(CScope &scope, const CSeq_align &align)