53 #include "../core/blast_psi_priv.h"
71 if ( !pssm_input_msa ) {
73 "IPssmInputData is NULL");
78 "IPssmInputData returns NULL PSIBlastOptions");
83 "IPssmInputData returns NULL query sequence");
88 "Query length provided by IPssmInputData is 0");
99 if ( !pssm_input_freqratios ) {
101 "IPssmInputFreqRatios is NULL");
104 if ( !pssm_input_freqratios->
GetQuery() ) {
106 "IPssmInputFreqRatiosFreqRatios returns NULL query sequence");
109 const unsigned int kQueryLength = pssm_input_freqratios->
GetQueryLength();
110 if (kQueryLength == 0) {
112 "Query length provided by IPssmInputFreqRatiosFreqRatios is 0");
117 "Number of columns returned by IPssmInputFreqRatiosFreqRatios does "
118 "not match query length");
122 "Number of rows returned by IPssmInputFreqRatiosFreqRatios differs "
135 if ( !pssm_input_msa->
GetData() ) {
137 "IPssmInputData returns NULL multiple sequence alignment");
142 string msg(
"IPssmInputData returns invalid PSIBlastOptions: ");
157 if ( !pssm_input->
GetData() ) {
159 "IPssmInputData returns NULL multiple sequence alignment");
164 string msg(
"IPssmInputData returns invalid PSIBlastOptions: ");
183 "PSSM frequency ratios cannot have negative values");
189 : m_PssmInput(
input), m_PssmInputFreqRatios(
NULL)
197 : m_PssmInput(
NULL), m_PssmInputFreqRatios(
input)
205 m_PssmInputFreqRatios(
NULL),
206 m_PssmInputCdd(
input)
209 input->GetMatrixName(),
input->GetGapExistence(),
210 input->GetGapExtension());
222 switch (error_code) {
224 retval =
"No error detected";
228 retval =
"Bad argument to function detected";
232 retval =
"Out of memory";
236 retval =
"Error computing sequence weights";
240 retval =
"No matrix frequency ratios were found for requested matrix";
244 retval =
"PSSM has positive average score";
248 retval =
"No sequences left after purging biased sequences in ";
249 retval +=
"multiple sequence alignment";
253 retval =
"Gap found in query sequence";
257 retval =
"Found column with no sequences aligned in it";
261 retval =
"Found column with only GAP residues";
265 retval =
"Found flanking gap at start of alignment";
269 retval =
"Found flanking gap at end of alignment";
273 retval =
"Errors in conserved domain profile";
277 retval =
"Unknown error code returned from PSSM engine: " +
300 "processing input data strategies are null");
313 for (
size_t c = 0; c < m.
GetCols(); c++) {
368 if (
query.NotEmpty()) {
403 if (
query.NotEmpty()) {
441 if (
query.NotEmpty()) {
450 unsigned int query_length)
454 unsigned char* retval =
NULL;
455 retval = (
unsigned char*)
malloc(
sizeof(
unsigned char)*(query_length + 2));
461 memcpy((
void*) &retval[1], (
void*)
query, query_length);
468 const int kNumQueries = 1;
513 unsigned int query_length,
514 const char* matrix_name,
574 "Unknown error when setting up BlastScoreBlk");
626 const char* matrix_name,
635 string mtx(matrix_name);
637 retval->
SetParams().SetRpsdbparams().SetMatrixName(
mtx);
652 asn1_pssm.
SetH(pssm->
h);
656 if (asn1_pssm.
GetByRow() ==
false) {
657 for (
unsigned int i = 0;
i < pssm->
ncols;
i++) {
658 for (
unsigned int j = 0; j < pssm->
nrows; j++) {
660 push_back(pssm->
pssm[
i][j]);
664 for (
unsigned int i = 0;
i < pssm->
nrows;
i++) {
665 for (
unsigned int j = 0; j < pssm->
ncols; j++) {
667 push_back(pssm->
pssm[j][
i]);
677 if ( !diagnostics ) {
695 if (asn1_pssm.
GetByRow() ==
false) {
696 for (
unsigned int i = 0;
i < pssm->
ncols;
i++) {
697 for (
unsigned int j = 0; j < pssm->
nrows; j++) {
702 for (
unsigned int i = 0;
i < pssm->
nrows;
i++) {
703 for (
unsigned int j = 0; j < pssm->
ncols; j++) {
713 if (asn1_pssm.
GetByRow() ==
false) {
714 for (
unsigned int i = 0;
i < pssm->
ncols;
i++) {
715 for (
unsigned int j = 0; j < pssm->
nrows; j++) {
721 for (
unsigned int i = 0;
i < pssm->
nrows;
i++) {
722 for (
unsigned int j = 0; j < pssm->
ncols; j++) {
733 if (asn1_pssm.
GetByRow() ==
false) {
734 for (
unsigned int i = 0;
i < pssm->
ncols;
i++) {
735 for (
unsigned int j = 0; j < pssm->
nrows; j++) {
740 for (
unsigned int i = 0;
i < pssm->
nrows;
i++) {
741 for (
unsigned int j = 0; j < pssm->
ncols; j++) {
756 if (diagnostics->
sigma) {
760 sigma.push_back(diagnostics->
sigma[
i]);
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Contains C++ wrapper classes to structures in algo/blast/core as well as some auxiliary functions to ...
Blast_Message * Blast_MessageFree(Blast_Message *blast_msg)
Deallocates message memory.
The structures and functions in blast_options.
Int2 PSIBlastOptionsValidate(const PSIBlastOptions *psi_options, Blast_Message **blast_msg)
Validates the PSI BLAST options so that they have sane values.
Int2 BlastScoringOptionsNew(EBlastProgramType program, BlastScoringOptions **options)
Allocate memory for BlastScoringOptions and fill with default values.
Int2 BlastScoringOptionsSetMatrix(BlastScoringOptions *opts, const char *matrix_name)
Resets matrix name option.
const double kPSSM_NoImpalaScaling
Value used to indicate that no IMPALA-style scaling should be performed when scaling a PSSM.
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm.
int PSICreatePssmFromCDD(const PSICdMsa *cd_msa, const PSIBlastOptions *options, BlastScoreBlk *sbp, const PSIDiagnosticsRequest *request, PSIMatrix **pssm, PSIDiagnosticsResponse **diagnostics)
Main entry point to core PSSM engine for computing CDD-based PSSMs.
int PSICreatePssmFromFrequencyRatios(const Uint1 *query, Uint4 query_length, BlastScoreBlk *sbp, double **freq_ratios, double impala_scaling_factor, PSIMatrix **pssm)
Top-level function to create a PSSM given a matrix of frequency ratios and perform scaling on the res...
int PSICreatePssmWithDiagnostics(const PSIMsa *msap, const PSIBlastOptions *options, BlastScoreBlk *sbp, const PSIDiagnosticsRequest *request, PSIMatrix **pssm, PSIDiagnosticsResponse **diagnostics)
Main entry point to the core PSSM engine, which allows the caller to request diagnostics information.
#define PSIERR_BADPARAM
Bad parameter used in function.
#define PSIERR_ENDINGGAP
Found flanking gap at end of alignment.
#define PSIERR_COLUMNOFGAPS
Found an entire column full of GAP residues.
#define PSIERR_OUTOFMEM
Out of memory.
#define PSIERR_BADPROFILE
Errors in conserved domain profile.
#define PSIERR_POSITIVEAVGSCORE
Positive average score found when scaling matrix.
#define PSIERR_NOALIGNEDSEQS
After purge stage of PSSM creation, no sequences are left.
#define PSIERR_NOFREQRATIOS
No frequency ratios were found for the given scoring matrix.
#define PSIERR_STARTINGGAP
Found flanking gap at start of alignment.
#define PSIERR_BADSEQWEIGHTS
Sequence weights do not add to 1.
#define PSI_SUCCESS
Successful operation.
#define PSIERR_UNALIGNEDCOLUMN
Found an entire column with no participating sequences.
#define PSIERR_GAPINQUERY
GAP residue found in query sequence.
BlastQueryInfo * BlastQueryInfoNew(EBlastProgramType program, int num_queries)
Allocate memory for query information structure.
Utilities to initialize/set up BLAST.
Int2 BlastSetup_ScoreBlkInit(BLAST_SequenceBlk *query_blk, const BlastQueryInfo *query_info, const BlastScoringOptions *scoring_options, EBlastProgramType program_number, BlastScoreBlk **sbpp, double scale_factor, Blast_Message **blast_message, GET_MATRIX_PATH get_path)
Initializes the score block structure.
Internal auxiliary setup classes/functions for C++ BLAST APIs.
Definitions and prototypes used by blast_stat.c to calculate BLAST statistics.
BlastScoreBlk * BlastScoreBlkFree(BlastScoreBlk *sbp)
Deallocates BlastScoreBlk as well as all associated structures.
Int2 BlastSeqBlkSetSequence(BLAST_SequenceBlk *seq_blk, const Uint1 *sequence, Int4 seqlen)
Stores the sequence in the sequence block structure.
Int2 BlastSeqBlkNew(BLAST_SequenceBlk **retval)
Allocates a new sequence block structure.
Wrapper class for BLAST_SequenceBlk .
Defines BLAST error codes (user errors included)
Wrapper class for BlastQueryInfo .
Wrapper class for BlastScoringOptions .
Defines system exceptions that occur while running BLAST.
size_t GetRows() const
get the number of rows in this matrix
size_t GetCols() const
get the number of columns in this matrix
Wrapper class for PSIDiagnosticsResponse .
Wrapper class for PSIMatrix .
Exception class for the CPssmEngine class.
CSeq_entry & SetQuery()
Retrieve the query sequence.
void SetHUngapped(double val)
void SetLambdaUngapped(double val)
void SetKappa(double val)
void SetKappaUngapped(double val)
void SetLambda(double val)
virtual void Process()=0
Algorithm to produce multiple sequence alignment structure should be implemented in this method.
static void s_CheckAgainstNullData(IPssmInputData *pssm_input_msa)
This function makes sure that none of the required data is returned as NULL or "empty".
CRef< objects::CPssmWithParameters > x_CreatePssmFromMsa()
Using IPssmInputData as a delegate to provide input data in the form of a multiple sequence alignment...
CRef< objects::CPssmWithParameters > x_CreatePssmFromFreqRatios()
Using IPssmInputFreqRatios as a delegate to provide the input PSSM's frequency ratios,...
CBlastScoreBlk m_ScoreBlk
Blast score block structure.
CRef< objects::CPssmWithParameters > Run()
Runs the PSSM engine to compute the PSSM.
const Blast_KarlinBlk * GetPsiGappedKarlinBlk() const
Retrieve PSI-BLAST gapped Karlin parameters.
const char * x_GetMatrixName() const
Private interface to retrieve matrix name from its data source interface.
virtual void Process(void)=0
Pre-process CDs used for PSSM computation.
static unsigned char * x_GuardProteinQuery(const unsigned char *query, unsigned int query_length)
Copies query sequence and adds protein sentinel bytes at the beginning and at the end of the sequence...
virtual const char * GetMatrixName()
Obtain the name of the underlying matrix to use when building the PSSM.
CRef< objects::CPssmWithParameters > x_CreatePssmFromCDD()
Using IPssmInputCdd as a delegate to provide data in the form of multiple alignment of CDs,...
static CRef< objects::CPssmWithParameters > x_PSIMatrix2Asn1(const PSIMatrix *pssm, const char *matrix_name, const PSIBlastOptions *opts=NULL, const PSIDiagnosticsResponse *diagnostics=NULL)
Converts the PSIMatrix structure into an ASN.1 CPssmWithParameters object.
#define BLASTAA_SIZE
Size of aminoacid alphabet.
CPssmEngine()
Default constructor available for derived test classes.
size_t m_NumCols
number of columns in the matrix (for deallocation)
virtual unsigned char * GetQuery()=0
Get the query sequence used as master for the multiple sequence alignment in ncbistdaa encoding.
virtual const PSIDiagnosticsRequest * GetDiagnosticsRequest(void)
Get diagnostics options.
void x_InitializeScoreBlock(const unsigned char *query, unsigned int query_length, const char *matrix_name, int gap_existence, int gap_extension)
Initializes the BlastScoreBlk data member required to run the PSSM engine.
BlastScoreBlk * Get() const
virtual const PSIDiagnosticsRequest * GetDiagnosticsRequest()
Obtain the diagnostics data that is requested from the PSSM engine. Its results will be populated in t...
virtual double GetImpalaScaleFactor()
int x_GetGapExtension() const
Private interface to retrieve gap extension cost from data source.
virtual int GetGapExistence()
Obtain the gap existence value for the underlying matrix used to build the PSSM.
virtual unsigned int GetQueryLength()=0
Get the query's length.
int x_GetGapExistence() const
Private interface to retrieve gap existence cost from data source.
IPssmInputFreqRatios * m_PssmInputFreqRatios
Pointer to input data to create PSSM from frequency ratios.
const Blast_KarlinBlk * GetPsiUngappedKarlinBlk() const
Retrieve PSI-BLAST ungapped Karlin parameters.
IPssmInputData * m_PssmInput
Handle to strategy to process raw PSSM input data.
virtual int GetGapExtension()
Obtain the gap extension value for the underlying matrix used to build the PSSM.
virtual const PSIBlastOptions * GetOptions()=0
Obtain the options for the PSSM engine.
virtual const PSIBlastOptions * GetOptions(void)=0
Get CDD-related PSI-BLAST options.
SNcbiMatrix2DoubleMatrix(const CNcbiMatrix< double > &m)
Constructor.
static void s_Validate(IPssmInputData *pssm_input_msa)
Performs validation on data provided before invoking the CORE PSSM engine.
IPssmInputCdd * m_PssmInputCdd
Pointer to strategy to process raw PSSM input data. Note: Only one m_PssmInput* should be non-NULL.
char * BlastFindMatrixPath(const char *matrix_name, Boolean is_prot)
Returns the path to a specified matrix.
virtual const CNcbiMatrix< double > & GetData()=0
Obtain a matrix of frequency ratios with this->GetQueryLength() columns and BLASTAA_SIZE rows.
virtual PSICdMsa * GetData(void)=0
Get CD data for PSSM computation.
unsigned int x_GetQueryLength() const
Private interface to retrieve query length from its data source interface.
void Reset(BlastScoreBlk *p=NULL)
~CPssmEngine()
Destructor.
void SetUngappedStatisticalParams(CConstRef< CBlastAncillaryData > ancillary_data)
Sets the Karlin & Altschul parameters in the BlastScoreBlk to be used in PSSM generation.
virtual void Process()=0
Algorithm to produce the PSSM's frequency ratios should be implemented in this method.
virtual PSIMsa * GetData()=0
Obtain the multiple sequence alignment structure.
static std::string x_ErrorCodeToString(int error_code)
Convert a PSSM return status into a string.
virtual CRef< objects::CBioseq > GetQueryForPssm()
Get a CBioseq object for attachment into the CPssmWithParameters that CPssmEngine produces (only atta...
unsigned char * x_GetQuery() const
Private interface to retrieve query sequence from its data source interface.
double ** m_Data
double** representation of a CNcbiMatrix
~SNcbiMatrix2DoubleMatrix()
Destructor.
Uint1 GetSentinelByte(EBlastEncoding encoding) THROWS((CBlastException))
Convenience function to centralize the knowledge of which sentinel bytes we use for supported encodin...
BlastQueryInfo * x_InitializeQueryInfo(unsigned int query_length)
Initializes the core BlastQueryInfo structure for a single protein sequence.
@ eBlastEncodingProtein
NCBIstdaa.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
element_type * release(void)
Release will release ownership of pointer to caller.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
bool NotEmpty(void) const THROWS_NONE
Check if CConstRef is not empty – pointing to an object and has a non-null value.
uint32_t Uint4
4-byte (32-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define USING_SCOPE(ns)
Use the specified namespace.
#define END_SCOPE(ns)
End the previously defined scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
#define BEGIN_SCOPE(ns)
Define a new scope.
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
static string & ToUpper(string &str)
Convert string to upper case – string& version.
void SetParams(TParams &value)
Assign a value to Params data member.
list< int > TNumMatchingSeqs
void SetIsProtein(TIsProtein value)
Assign a value to IsProtein data member.
list< int > TResFreqsPerPos
void SetByRow(TByRow value)
Assign a value to ByRow data member.
void SetPssm(TPssm &value)
Assign a value to Pssm data member.
void SetIntermediateData(TIntermediateData &value)
Assign a value to IntermediateData data member.
list< double > TWeightedResFreqsPerPos
void SetFinalData(TFinalData &value)
Assign a value to FinalData data member.
list< double > TNumIndeptObsr
TByRow GetByRow(void) const
Get the ByRow member data.
list< double > TFreqRatios
list< int > TIntervalSizes
void SetNumColumns(TNumColumns value)
Assign a value to NumColumns data member.
list< double > TInformationContent
void SetNumRows(TNumRows value)
Assign a value to NumRows data member.
list< double > TGaplessColumnWeights
TSeq & SetSeq(void)
Select the variant.
static const int kScaleFactor
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
C++ API for the PSI-BLAST PSSM engine.
Int4 query_length
Length of this query, strand or frame.
Int4 query_offset
Offset of this query, strand or frame in the concatenated super-query.
The query related information.
BlastContextInfo * contexts
Information per context.
Uint4 max_length
Length of the longest among the concatenated queries.
Structure used for scoring calculations.
Blast_KarlinBlk ** kbp
Karlin-Altschul parameters.
Blast_KarlinBlk ** kbp_psi
K-A parameters for position-based alignments.
Blast_KarlinBlk ** kbp_gap
K-A parameters for gapped alignments.
Blast_KarlinBlk * kbp_ideal
Ideal values (for query with average database composition).
Blast_KarlinBlk ** kbp_gap_psi
K-A parameters for psi alignments.
Int4 gap_open
Extra penalty for starting a gap.
Int4 gap_extend
Penalty for each gap residue.
double K
K value used in statistics.
double Lambda
Lambda value used in statistics.
double H
H value used in statistics.
double logK
natural log of K value used in statistics
Structure to hold a message from the core of the BLAST engine.
char * message
User message to be saved.
Options used in protein BLAST only (PSI, PHI, RPS and translated BLAST) Some of these possibly should...
double impala_scaling_factor
Scaling factor as used in IMPALA to do the matrix rescaling.
Int4 pseudo_count
Pseudocount constant.
This structure contains the diagnostics information requested using the PSIDiagnosticsRequest structu...
double * information_content
position information content (query_length elements)
Uint4 ** residue_freqs
observed residue frequencies per position of the PSSM (Dimensions are query_length by alphabet_size)
double ** weighted_residue_freqs
Weighted observed residue frequencies per position of the PSSM.
Uint4 * interval_sizes
interval sizes of aligned regions (query_length elements)
Uint4 alphabet_size
Specifies length of alphabet.
Uint4 query_length
Specifies the number of positions in the PSSM.
double * gapless_column_weights
Weights for columns without gaps (query_length elements)
double * independent_observations
Effective number of observations per column.
Uint4 * num_matching_seqs
number of matching sequences per query position (query_length elements)
double * sigma
sigma (query_length elements)
double ** frequency_ratios
PSSM's frequency ratios (Dimensions are query_length by alphabet_size)
This is the main return value from the PSSM engine.
double ung_lambda
Ungapped Lambda Karlin-Altschul parameter.
double kappa
Kappa Karlin-Altschul parameter.
int ** pssm
Position-specific score matrix.
double ung_kappa
Ungapped Kappa Karlin-Altschul parameter.
Uint4 ncols
Number of columns in PSSM (query_length)
double ung_h
Ungapped H Karlin-Altschul parameter.
double lambda
Lambda Karlin-Altschul parameter.
Uint4 nrows
Number of rows in PSSM (alphabet_size)
double h
H Karlin-Altschul parameter.
Auxiliary class to convert from a CNcbiMatrix into a double** as required by the C API.