45 #include "../core/blast_psi_priv.h"
84 if ( !score_blk->protein_alphabet ) {
86 "BlastScoreBlk is not configured for a protein alphabet");
91 score_blk->kbp_psi[0]->Lambda = pssm->GetPssm().GetLambdaUngapped();
92 }
else if (score_blk->kbp_std[0]->Lambda > 0.0) {
93 score_blk->kbp_psi[0]->Lambda = score_blk->kbp_std[0]->Lambda;
97 score_blk->kbp_psi[0]->K = pssm->GetPssm().GetKappaUngapped();
98 }
else if (score_blk->kbp_std[0]->K > 0.0) {
99 score_blk->kbp_psi[0]->K = score_blk->kbp_std[0]->K;
101 score_blk->kbp_psi[0]->logK =
log(score_blk->kbp_psi[0]->K);
104 score_blk->kbp_psi[0]->H = pssm->GetPssm().GetHUngapped();
105 }
else if (score_blk->kbp_std[0]->K > 0.0) {
106 score_blk->kbp_psi[0]->H = score_blk->kbp_std[0]->H;
111 score_blk->kbp_gap_psi[0]->Lambda = pssm->GetPssm().GetLambda();
112 }
else if (score_blk->kbp_gap_std[0]->Lambda > 0.0) {
113 score_blk->kbp_gap_psi[0]->Lambda = score_blk->kbp_gap_std[0]->Lambda;
117 score_blk->kbp_gap_psi[0]->K = pssm->GetPssm().GetKappa();
118 }
else if (score_blk->kbp_gap_std[0]->K > 0.0) {
119 score_blk->kbp_gap_psi[0]->K = score_blk->kbp_gap_std[0]->K;
121 score_blk->kbp_gap_psi[0]->logK =
log(score_blk->kbp_gap_psi[0]->K);
124 score_blk->kbp_gap_psi[0]->H = pssm->GetPssm().GetH();
125 }
else if (score_blk->kbp_gap_std[0]->H > 0.0) {
126 score_blk->kbp_gap_psi[0]->H = score_blk->kbp_gap_std[0]->H;
130 const size_t kQueryLength = pssm->GetPssm().GetNumColumns();
134 bool missing_scores =
false;
136 unique_ptr< CNcbiMatrix<int> > scores
138 _ASSERT(score_blk->psi_matrix->pssm->ncols == scores->GetCols());
139 _ASSERT(score_blk->psi_matrix->pssm->nrows == scores->GetRows());
141 for (
TSeqPos c = 0; c < scores->GetCols(); c++) {
142 for (
TSeqPos r = 0;
r < scores->GetRows();
r++) {
143 score_blk->psi_matrix->pssm->data[c][
r] = (*scores)(
r, c);
146 }
catch (
const std::runtime_error&) {
147 missing_scores =
true;
151 bool missing_freq_ratios =
false;
153 bool freq_ratios_all_zeros =
true;
156 unique_ptr< CNcbiMatrix<double> > freq_ratios
158 _ASSERT(score_blk->psi_matrix->pssm->ncols ==
159 freq_ratios->GetCols());
160 _ASSERT(score_blk->psi_matrix->pssm->nrows ==
161 freq_ratios->GetRows());
163 for (
TSeqPos c = 0; c < freq_ratios->GetCols(); c++) {
164 for (
TSeqPos r = 0;
r < freq_ratios->GetRows();
r++) {
165 score_blk->psi_matrix->freq_ratios[c][
r] =
166 (*freq_ratios)(
r, c);
168 freq_ratios_all_zeros =
false;
172 }
catch (
const std::runtime_error&) {
173 missing_freq_ratios =
true;
176 if (missing_scores && missing_freq_ratios) {
178 "Missing scores and frequency ratios in PSSM");
185 freq_ratios_all_zeros) {
187 os <<
"Frequency ratios for PSSM are all zeros, frequency ratios for ";
188 os << options->GetMatrixName() <<
" will be used during traceback ";
189 os <<
"in composition based statistics";
193 messages.front().push_back(sm);
201 os <<
"Composition-based score adjustment conditioned on "
202 <<
"sequence properties and unconditional composition-based score "
203 <<
"adjustment is not supported with PSSMs, resetting to default "
204 <<
"value of standard composition-based statistics";
208 messages.front().push_back(sm);
222 typename list<T>::const_iterator itr =
source.begin();
223 if (by_row ==
true) {
225 for (
SIZE_TYPE c = 0; c < num_columns; c++) {
230 for (
SIZE_TYPE c = 0; c < num_columns; c++) {
242 if ( !pssm_asn.GetPssm().CanGetFinalData() ||
243 !pssm_asn.GetPssm().GetFinalData().CanGetScores() ||
244 pssm_asn.GetPssm().GetFinalData().GetScores().empty() ) {
245 throw runtime_error(
"Cannot obtain scores from ASN.1 PSSM");
248 const CPssm& pssm = pssm_asn.GetPssm();
250 (
size_t)pssm.
GetNumRows()*pssm_asn.GetPssm().GetNumColumns());
252 unique_ptr< CNcbiMatrix<int> > retval
260 return retval.release();
267 if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
268 !pssm_asn.GetPssm().GetIntermediateData().CanGetFreqRatios() ||
269 pssm_asn.GetPssm().GetIntermediateData().GetFreqRatios().empty() ) {
270 throw runtime_error(
"Cannot obtain frequency ratios from ASN.1 PSSM");
273 const CPssm& pssm = pssm_asn.GetPssm();
275 (
size_t)pssm.
GetNumRows()*pssm_asn.GetPssm().GetNumColumns());
277 unique_ptr< CNcbiMatrix<double> > retval
283 return retval.release();
288 (
const objects::CPssmWithParameters& pssm_asn)
290 if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
291 !pssm_asn.GetPssm().GetIntermediateData().CanGetResFreqsPerPos() ||
292 pssm_asn.GetPssm().GetIntermediateData().GetResFreqsPerPos().empty() )
297 const CPssm& pssm = pssm_asn.GetPssm();
299 (
size_t)pssm.
GetNumRows()*pssm_asn.GetPssm().GetNumColumns());
301 unique_ptr< CNcbiMatrix<int> > retval
307 return retval.release();
312 (
const objects::CPssmWithParameters& pssm_asn)
314 if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
315 !pssm_asn.GetPssm().GetIntermediateData().
316 CanGetWeightedResFreqsPerPos() ||
317 pssm_asn.GetPssm().GetIntermediateData().
318 GetWeightedResFreqsPerPos().
empty() ) {
322 const CPssm& pssm = pssm_asn.GetPssm();
324 GetWeightedResFreqsPerPos().
size() ==
325 (
size_t)pssm.
GetNumRows()*pssm_asn.GetPssm().GetNumColumns());
327 unique_ptr< CNcbiMatrix<double> > retval
333 return retval.release();
338 (
const objects::CPssmWithParameters& pssm_asn,
339 vector<double>& retval)
342 if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
343 !pssm_asn.GetPssm().GetIntermediateData().CanGetInformationContent() ||
348 const CPssm& pssm = pssm_asn.GetPssm();
351 back_inserter(retval));
356 (
const objects::CPssmWithParameters& pssm_asn,
357 vector<double>& retval)
360 if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
362 GetIntermediateData().CanGetGaplessColumnWeights() ||
367 const CPssm& pssm = pssm_asn.GetPssm();
370 back_inserter(retval));
375 vector<double>& retval)
378 if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
379 !pssm_asn.GetPssm().GetIntermediateData().CanGetSigma() ||
380 pssm_asn.GetPssm().GetIntermediateData().GetSigma().empty() ) {
383 const CPssm& pssm = pssm_asn.GetPssm();
386 back_inserter(retval));
391 (
const objects::CPssmWithParameters& pssm_asn, vector<int>& retval)
394 if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
396 GetIntermediateData().CanGetIntervalSizes() ||
401 const CPssm& pssm = pssm_asn.GetPssm();
404 back_inserter(retval));
409 (
const objects::CPssmWithParameters& pssm_asn, vector<int>& retval)
412 if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
414 GetIntermediateData().CanGetNumMatchingSeqs() ||
419 const CPssm& pssm = pssm_asn.GetPssm();
422 back_inserter(retval));
430 _ASSERT(pssm.GetParams().GetRpsdbparams().IsSetMatrixName());
431 pssm.SetParams().SetRpsdbparams().SetGapOpen(gap_open);
432 pssm.SetParams().SetRpsdbparams().SetGapExtend(gap_extend);
444 if (pssm.GetPssm().CanGetFinalData()) {
445 _ASSERT(pssm.GetPssm().GetFinalData().GetScores().size() ==
449 const size_t diff = (size_t)
BLASTAA_SIZE - pssm.GetPssm().GetNumRows();
451 pssm.SetPssm().SetIntermediateData().SetFreqRatios();
453 if (pssm.GetPssm().GetByRow() ==
true) {
454 freq_ratios.resize(pssm.GetPssm().GetNumColumns() *
BLASTAA_SIZE, 0.0);
456 CPssmIntermediateData::TFreqRatios::iterator itr = freq_ratios.begin();
457 for (
int c = 0; c < pssm.GetPssm().GetNumColumns(); c++) {
458 advance(itr, pssm.GetPssm().GetNumRows());
459 freq_ratios.insert(itr, diff, 0.0);
476 (
size_t)pssm->GetPssm().GetNumColumns());
477 unique_ptr< CNcbiMatrix<double> > freq_ratios
487 if (pssm->GetPssm().GetNumRows() !=
492 pssm->SetPssm().SetFinalData().SetScores() =
494 pssm->SetPssm().SetFinalData().SetLambda() =
496 pssm->SetPssm().SetFinalData().SetKappa() =
498 pssm->SetPssm().SetFinalData().SetH() =
512 (score_type ==
"e_value" || score_type ==
"sum_e")) {
550 CPsiBlastAlignmentProcessor::operator()
551 (
const objects::CSeq_align_set& alignments,
552 double evalue_inclusion_threshold,
560 if (e < evalue_inclusion_threshold) {
572 if ( !pssm.CanGetPssm() ) {
574 "Missing PSSM data");
577 bool missing_scores(
false);
578 if ( !pssm.GetPssm().CanGetFinalData() ||
579 !pssm.GetPssm().GetFinalData().CanGetScores() ||
580 pssm.GetPssm().GetFinalData().GetScores().empty() ) {
581 missing_scores =
true;
584 bool missing_freq_ratios(
false);
585 if ( !pssm.GetPssm().CanGetIntermediateData() ||
586 !pssm.GetPssm().GetIntermediateData().CanGetFreqRatios() ||
587 pssm.GetPssm().GetIntermediateData().GetFreqRatios().empty() ) {
588 missing_freq_ratios =
true;
591 if (missing_freq_ratios && missing_scores) {
593 "PSSM data must contain either scores or frequency ratios");
595 if (missing_scores && require_scores) {
597 "PSSM data must contain scores (did you run the PSSM engine?)");
601 if (!missing_scores &&
602 pssm.GetPssm().GetFinalData().CanGetScalingFactor() &&
603 pssm.GetPssm().GetFinalData().GetScalingFactor() != 1) {
604 string msg(
"PSSM has a scaling factor of ");
607 .GetScalingFactor());
608 msg +=
". PSI-BLAST does not accept scaled PSSMs";
612 if ( !pssm.HasQuery() ) {
614 "Missing query sequence in PSSM");
616 if ( !pssm.GetQuery().IsSeq() ) {
618 "Query sequence in ASN.1 PSSM is not a single Bioseq");
621 if ( !pssm.GetPssm().GetIsProtein() ) {
623 "PSSM does not represent protein scoring matrix");
636 string excpt_msg(
"PSI-BLAST only accepts ");
638 excpt_msg +=
"one protein sequence as query";
640 excpt_msg +=
"protein sequences as subjects";
654 if (e.
GetMsg().find(
"Incompatible sequence codings") !=
NPOS) {
663 static_cast<unsigned>(sblk->
length));
665 excpt_msg.assign(
"PSI-BLAST cannot accept nucleotide ");
666 excpt_msg += (qf_type ==
eQFT_Query ?
"queries" :
"subjects");
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Auxiliary functions for BLAST.
Declarations of static arrays used to define some NCBI encodings to be used in a toolkit independent ...
Declares the BLAST exception class.
The structures and functions in blast_options.
#define BLAST_EXPECT_VALUE
Default parameters for saving hits.
Declares the CBlastOptionsHandle and CBlastOptionsFactory classes.
const double kEpsilon
Small constant to test against 0.
Definitions and prototypes used by blast_stat.c to calculate BLAST statistics.
#define BLAST_SCORE_MIN
minimum allowed score (for one letter comparison).
SPsiBlastScoreMatrix * SPsiBlastScoreMatrixNew(size_t ncols)
Allocates a new SPsiBlastScoreMatrix structure of dimensions ncols by BLASTAA_SIZE.
@ eDeltaBlast
Delta Blast.
Defines BLAST error codes (user errors included)
Handle to the options to the BLAST algorithm.
Encapsulates ALL the BLAST algorithm's options.
NCBI C++ Object Manager free implementation of IQueryFactory.
Computes a PSSM as specified in PSI-BLAST.
static const double kInvalidStat
Error or Warning Message from search.
typedef for the messages for an entire BLAST search, which could be comprised of multiple query seque...
@ eCompositionBasedStats
Composition-based statistics as in NAR 29:2994-3005, 2001.
@ eNoCompositionBasedStats
Don't use composition based statistics.
static SQLCHAR output[256]
static void QueryFactory(CRef< IQueryFactory > query_factory, const CBlastOptionsHandle &opts_handle, EQueryFactoryType query_factory_type=eQFT_Query)
Function to perform sanity checks on the query factory.
static CNcbiMatrix< int > * GetResidueFrequencies(const objects::CPssmWithParameters &pssm)
Returns matrix of BLASTAA_SIZE by query size (dimensions are opposite of what is stored in the BlastS...
CRef< objects::CPssmWithParameters > Run()
Runs the PSSM engine to compute the PSSM.
void PsiBlastSetupScoreBlock(BlastScoreBlk *score_blk, CConstRef< objects::CPssmWithParameters > pssm, TSearchMessages &messages, CConstRef< CBlastOptions > options)
Setup CORE BLAST score block structure with data from the scoremat PSSM.
void Convert2Matrix(const list< T > &source, CNcbiMatrix< T > &dest, bool by_row, SIZE_TYPE num_rows, SIZE_TYPE num_columns)
Convert a list of values into a CNcbiMatrix.
static CNcbiMatrix< int > * GetScores(const objects::CPssmWithParameters &pssm)
Returns matrix of BLASTAA_SIZE by query size (dimensions are opposite of what is stored in the BlastS...
#define BLASTAA_SIZE
Size of aminoacid alphabet.
static void GetSigma(const objects::CPssmWithParameters &pssm, vector< double > &retval)
Data used in sequence weights computation.
static CNcbiMatrix< double > * GetWeightedResidueFrequencies(const objects::CPssmWithParameters &pssm)
Returns matrix of BLASTAA_SIZE by query size (dimensions are opposite of what is stored in the BlastS...
virtual BLAST_SequenceBlk * GetSequenceBlk()=0
Accessor for the BLAST_SequenceBlk structure.
CRef< ILocalQueryData > MakeLocalQueryData(const CBlastOptions *opts)
Creates and caches an ILocalQueryData.
EQueryFactoryType
Enumeration to specify the different uses of the query factory.
int GetGapExtensionCost() const
static double s_GetBitScore(const CScore &score)
Returns the bit_score from this score object.
static void s_AdjustFrequencyRatiosMatrixToMatchScoreMatrix(objects::CPssmWithParameters &pssm)
After creating the PSSM from frequency ratios, adjust the frequency ratios matrix to match the dimens...
static double s_GetEvalue(const CScore &score)
Returns the evalue from this score object.
static CNcbiMatrix< double > * GetFreqRatios(const objects::CPssmWithParameters &pssm)
Returns matrix of BLASTAA_SIZE by query size (dimensions are opposite of what is stored in the BlastS...
const CBlastOptions & GetOptions() const
Return the object which this object is a handle for.
static void GetNumMatchingSeqs(const objects::CPssmWithParameters &pssm, vector< int > &retval)
Gets the number of matching sequences per position of the PSSM.
void PsiBlastComputePssmScores(CRef< objects::CPssmWithParameters > pssm, const CBlastOptions &opts)
Given a PSSM with frequency ratios and options, invoke the PSSM engine to compute the scores.
virtual size_t GetNumQueries()=0
Get the number of queries.
double GetLowestEvalue(const objects::CDense_seg::TScores &scores, double *bit_score)
Returns the lowest score from the list of scores in CDense_seg::TScores.
void PsiBlastAddAncillaryPssmData(objects::CPssmWithParameters &pssm, int gap_open, int gap_extend)
Even though the query sequence and the matrix gap costs are not a product of the PSSM engine,...
static void Pssm(const objects::CPssmWithParameters &pssm, bool require_scores=false)
Perform validation on the PSSM.
static void GetGaplessColumnWeights(const objects::CPssmWithParameters &pssm, vector< double > &retval)
Returns the relative gapless PSSM column weights to pseudocounts for the provided PSSM.
virtual size_t GetSeqLength(size_t index)=0
Get the length of the sequence indicated by index.
int GetGapOpeningCost() const
static void GetInformationContent(const objects::CPssmWithParameters &pssm, vector< double > &retval)
Returns the information content per position of the PSSM.
const char * GetMatrixName() const
static void GetIntervalSizes(const objects::CPssmWithParameters &pssm, vector< int > &retval)
Length of the aligned regions per position of the query sequence.
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
const string & GetMsg(void) const
Get message string.
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define USING_SCOPE(ns)
Use the specified namespace.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
NCBI_NS_STD::string::size_type SIZE_TYPE
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
const TStr & GetStr(void) const
Get the variant data.
const TFreqRatios & GetFreqRatios(void) const
Get the FreqRatios member data.
const TNumMatchingSeqs & GetNumMatchingSeqs(void) const
Get the NumMatchingSeqs member data.
TNumRows GetNumRows(void) const
Get the NumRows member data.
const TGaplessColumnWeights & GetGaplessColumnWeights(void) const
Get the GaplessColumnWeights member data.
TH GetH(void) const
Get the H member data.
TKappa GetKappa(void) const
Get the Kappa member data.
const TScores & GetScores(void) const
Get the Scores member data.
const TWeightedResFreqsPerPos & GetWeightedResFreqsPerPos(void) const
Get the WeightedResFreqsPerPos member data.
const TIntervalSizes & GetIntervalSizes(void) const
Get the IntervalSizes member data.
const TSigma & GetSigma(void) const
Get the Sigma member data.
const TInformationContent & GetInformationContent(void) const
Get the InformationContent member data.
const TFinalData & GetFinalData(void) const
Get the FinalData member data.
TNumColumns GetNumColumns(void) const
Get the NumColumns member data.
const TIntermediateData & GetIntermediateData(void) const
Get the IntermediateData member data.
TByRow GetByRow(void) const
Get the ByRow member data.
list< double > TFreqRatios
const TResFreqsPerPos & GetResFreqsPerPos(void) const
Get the ResFreqsPerPos member data.
const TPssm & GetPssm(void) const
Get the Pssm member data.
TLambda GetLambda(void) const
Get the Lambda member data.
bool IsReal(void) const
Check if variant Real is selected.
const TValue & GetValue(void) const
Get the Value member data.
vector< CRef< CScore > > TScores
list< CRef< CSeq_align > > Tdata
TReal GetReal(void) const
Get the variant data.
const TId & GetId(void) const
Get the Id member data.
constexpr bool empty(list< Ts... >) noexcept
const struct ncbi::grid::netcache::search::fields::SIZE size
const CharType(& source)[N]
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
NOTE: This file contains work in progress and the APIs are likely to change, please do not rely on th...
Declarations of auxiliary functions/classes for PSI-BLAST.
C++ API for the PSI-BLAST PSSM engine.
Structure to hold a sequence.
Uint1 * sequence_start
Start of sequence, usually one byte before sequence as that byte is a NULL sentinel byte.
Int4 length
Length of sequence.
Uint1 * sequence
Sequence used for search (could be translation).
Structure used for scoring calculations.