51 #ifndef ALGO_BLAST_CORE___BLAST_PSI_PRIV__H
52 #define ALGO_BLAST_CORE___BLAST_PSI_PRIV__H
108 unsigned int data_type_sz);
130 unsigned int ncols,
unsigned int nrows);
142 unsigned int ncols,
unsigned int nrows);
366 #define PSI_SUCCESS (0)
368 #define PSIERR_BADPARAM (-1)
370 #define PSIERR_OUTOFMEM (-2)
372 #define PSIERR_BADSEQWEIGHTS (-3)
374 #define PSIERR_NOFREQRATIOS (-4)
376 #define PSIERR_POSITIVEAVGSCORE (-5)
378 #define PSIERR_NOALIGNEDSEQS (-6)
380 #define PSIERR_GAPINQUERY (-7)
382 #define PSIERR_UNALIGNEDCOLUMN (-8)
384 #define PSIERR_COLUMNOFGAPS (-9)
386 #define PSIERR_STARTINGGAP (-10)
388 #define PSIERR_ENDINGGAP (-11)
390 #define PSIERR_BADPROFILE (-12)
392 #define PSIERR_UNKNOWN (-255)
453 Boolean nsg_compatibility_mode,
494 Boolean nsg_compatibility_mode,
530 const double* std_probs);
546 const double* std_probs,
569 const double* std_probs,
585 double scaling_factor);
600 unsigned int seq_index,
643 const double* std_probs,
693 const double* std_prob,
714 double** freq_ratios,
715 const double* std_prob,
719 #ifdef DEBUG_PSSM_ENGINE
720 void __printMsa(
const char* filename,
const _PSIPackedMsa* msa);
#define NCBI_XBLAST_EXPORT
NULL operations for other cases.
High level definitions and declarations for the PSSM engine of PSI-BLAST.
int _PSIConvertFreqRatiosToPSSM(_PSIInternalPssmData *internal_pssm, const Uint1 *query, const BlastScoreBlk *sbp, const double *std_probs)
Converts the PSSM's frequency ratios obtained in the previous stage to a PSSM of scores.
int _PSIComputeAlignmentBlocks(const _PSIMsa *msa, _PSIAlignedBlock *aligned_block)
Main function to compute aligned blocks' properties for each position within multiple alignment (stag...
int _PSIComputeFreqRatios(const _PSIMsa *msa, const _PSISequenceWeights *seq_weights, const BlastScoreBlk *sbp, const _PSIAlignedBlock *aligned_blocks, Int4 pseudo_count, Boolean nsg_compatibility_mode, _PSIInternalPssmData *internal_pssm)
Main function to compute the PSSM's frequency ratios (stage 5).
void ** _PSIAllocateMatrix(unsigned int ncols, unsigned int nrows, unsigned int data_type_sz)
Generic 2 dimensional matrix allocator.
void _PSIStructureGroupCustomization(_PSIMsa *msa)
Enable NCBI structure group customization to discard the query sequence, as this really isn't the res...
struct _PSIPackedMsaCell _PSIPackedMsaCell
Compact version of the PSIMsaCell structure.
int _PSIComputeFreqRatiosFromCDs(const PSICdMsa *cd_msa, const _PSISequenceWeights *seq_weights, const BlastScoreBlk *sbp, Int4 pseudo_count, _PSIInternalPssmData *internal_pssm)
Main function to compute CD-based PSSM's frequency ratios.
_PSIInternalPssmData * _PSIInternalPssmDataNew(Uint4 query_length, Uint4 alphabet_size)
Allocates a new _PSIInternalPssmData structure.
_PSIAlignedBlock * _PSIAlignedBlockNew(Uint4 query_length)
Allocates and initializes the _PSIAlignedBlock structure.
unsigned int _PSIPackedMsaGetNumberOfAlignedSeqs(const _PSIPackedMsa *msa)
Retrieve the number of aligned sequences in the compact multiple sequence alignment.
int _PSIComputeSequenceWeights(const _PSIMsa *msa, const _PSIAlignedBlock *aligned_blocks, Boolean nsg_compatibility_mode, _PSISequenceWeights *seq_weights)
Main function to calculate the sequence weights.
int _PSISaveCDDiagnostics(const PSICdMsa *msa, const _PSISequenceWeights *seq_weights, const _PSIInternalPssmData *internal_pssm, PSIDiagnosticsResponse *diagnostics)
Collects diagnostic information from the process of creating the CDD-based PSSM.
int _PSISaveDiagnostics(const _PSIMsa *msa, const _PSIAlignedBlock *aligned_block, const _PSISequenceWeights *seq_weights, const _PSIInternalPssmData *internal_pssm, PSIDiagnosticsResponse *diagnostics)
Collects diagnostic information from the process of creating the PSSM.
double * _PSICalculateInformationContentFromScoreMatrix(Int4 **score_mat, const double *std_prob, const Uint1 *query, Uint4 query_length, Uint4 alphabet_sz, double lambda)
Calculates the information content from the scoring matrix.
Blast_ScoreFreq * _PSIComputeScoreProbabilities(const int **pssm, const Uint1 *query, Uint4 query_length, const double *std_probs, const BlastScoreBlk *sbp)
Compute the probabilities for each score in the PSSM.
struct _PSISequenceWeights _PSISequenceWeights
Internal data structure to keep computed sequence weights.
int _PSIValidateMSA(const _PSIMsa *msa, Boolean ignored_unaligned_positions)
Main validation function for multiple sequence alignment structure.
int _PSIPurgeBiasedSegments(_PSIPackedMsa *msa)
Main function for keeping only those selected sequences for PSSM construction (stage 2).
_PSIInternalPssmData * _PSIInternalPssmDataFree(_PSIInternalPssmData *pssm)
Deallocates the _PSIInternalPssmData structure.
int _PSIComputeFrequenciesFromCDs(const PSICdMsa *cd_msa, BlastScoreBlk *sbp, const PSIBlastOptions *options, _PSISequenceWeights *seq_weights)
Main function to calculate CD weights and combine weighted residue counts from matched CDs.
const int kPSIScaleFactor
Successor to POSIT_SCALE_FACTOR.
const double kPSINearIdentical
Percent identity threshold for discarding near-identical matches.
const Uint4 kPositScalingNumIterations
Constant used in scaling PSSM routines: Successor to POSIT_NUM_ITERATIONS.
_PSISequenceWeights * _PSISequenceWeightsFree(_PSISequenceWeights *seq_weights)
Deallocates the _PSISequenceWeights structure.
void ** _PSIDeallocateMatrix(void **matrix, unsigned int ncols)
Generic 2 dimensional matrix deallocator.
void _PSICopyMatrix_int(int **dest, int **src, unsigned int ncols, unsigned int nrows)
Copies src matrix into dest matrix, both of which must be int matrices with dimensions ncols by nrows...
Uint4 _PSISequenceLengthWithoutX(const Uint1 *seq, Uint4 length)
Calculates the length of the sequence without including any 'X' residues.
struct _PSIAlignedBlock _PSIAlignedBlock
This structure keeps track of the regions aligned between the query sequence and those that were not ...
void _PSIUpdateLambdaK(const int **pssm, const Uint1 *query, Uint4 query_length, const double *std_probs, BlastScoreBlk *sbp)
Updates the Karlin-Altschul parameters based on the query sequence and PSSM's score frequencies.
struct _PSIMsa _PSIMsa
Internal multiple alignment data structure used by the PSSM engine.
const unsigned int kQueryIndex
Index into multiple sequence alignment structure for the query sequence.
struct _PSIInternalPssmData _PSIInternalPssmData
Internal representation of a PSSM in various stages of its creation and its dimensions.
_PSISequenceWeights * _PSISequenceWeightsNew(const PSIMsaDimensions *dims, const BlastScoreBlk *sbp)
Allocates and initializes the _PSISequenceWeights structure.
void _PSICopyMatrix_double(double **dest, double **src, unsigned int ncols, unsigned int nrows)
Copies src matrix into dest matrix, both of which must be double matrices with dimensions ncols by nr...
int _PSIValidateMSA_StructureGroup(const _PSIMsa *msa)
Structure group validation function for multiple sequence alignment structure.
int _PSIScaleMatrix(const Uint1 *query, const double *std_probs, _PSIInternalPssmData *internal_pssm, BlastScoreBlk *sbp)
Scales the PSSM (stage 7).
int _PSIPurgeAlignedRegion(_PSIPackedMsa *msa, unsigned int seq_index, unsigned int start, unsigned int stop)
Marks the (start, stop] region corresponding to sequence seq_index in alignment so that it is not fur...
_PSIMsa * _PSIMsaFree(_PSIMsa *msa)
Deallocates the _PSIMsa data structure.
_PSIAlignedBlock * _PSIAlignedBlockFree(_PSIAlignedBlock *aligned_blocks)
Deallocates the _PSIAlignedBlock structure.
const double kEpsilon
Small constant to test against 0.
double * _PSICalculateInformationContentFromFreqRatios(double **freq_ratios, const double *std_prob, Uint4 query_length, Uint4 alphabet_sz)
Calculates the information content from the residue frequencies calculated in stage 5 of the PSSM cre...
struct _PSIMsaCell _PSIMsaCell
Internal data structure to represent a position in the multiple sequence alignment data structure.
int _PSIValidateCdMSA(const PSICdMsa *cd_msa, Uint4 alphabet_size)
Validation of multiple alignment of conserved domains structure.
void _PSIUpdatePositionCounts(_PSIMsa *msa)
Counts the number of sequences matching the query per query position (columns of the multiple alignme...
const double kPSIIdentical
Percent identity threshold for discarding identical matches.
_PSIPackedMsa * _PSIPackedMsaNew(const PSIMsa *msa)
Allocates and initializes the compact version of the PSIMsa structure (makes a deep copy) for interna...
struct _PSIPackedMsa _PSIPackedMsa
Compact version of PSIMsa structure.
const double kPositScalingPercent
Constant used in scaling PSSM routines: Successor to POSIT_PERCENT.
int _IMPALAScaleMatrix(const Uint1 *query, const double *std_probs, _PSIInternalPssmData *internal_pssm, BlastScoreBlk *sbp, double scaling_factor)
Provides a similar function to _PSIScaleMatrix but it performs the scaling as IMPALA did,...
_PSIMsa * _PSIMsaNew(const _PSIPackedMsa *packed_msa, Uint4 alphabet_size)
Allocates and initializes the internal version of the PSIMsa structure (makes a deep copy) for intern...
_PSIPackedMsa * _PSIPackedMsaFree(_PSIPackedMsa *msa)
Deallocates the _PSIPackedMsa data structure.
Definitions and prototypes used by blast_stat.c to calculate BLAST statistics.
uint8_t Uint1
1-byte (8-bit) unsigned integer
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
Interface to retrieve the frequency ratios for various scoring matrices.
Type and macro definitions from C toolkit that are not defined in C++ toolkit.
Uint1 Boolean
bool replacement for C
double lambda(size_t dimMatrix_, const Int4 *const *scoreMatrix_, const double *q_)
Structure used for scoring calculations.
Holds score frequencies used in calculation of Karlin-Altschul parameters for an ungapped search.
Options used in protein BLAST only (PSI, PHI, RPS and translated BLAST). Some of these possibly should...
Data structure representing multiple alignment of CDs and query sequence along with data needed for P...
This structure contains the diagnostics information requested using the PSIDiagnosticsRequest structu...
Structure representing the dimensions of the multiple sequence alignment data structure.
Multiple sequence alignment (msa) data structure containing the raw data needed by the PSSM engine to...
A structure containing two integers, used e.g.
This structure keeps track of the regions aligned between the query sequence and those that were not ...
SSeqRange * pos_extnt
Dynamically allocated array of size query_length to keep track of the extents of each aligned positio...
Uint4 * size
Dynamically allocated array of size query_length that contains the size of the intervals in the array...
Internal representation of a PSSM in various stages of its creation and its dimensions.
int ** scaled_pssm
scaled PSSM (scores)
Uint4 nrows
number of rows (alphabet_size)
double * pseudocounts
pseudocount constant for each column
Uint4 ncols
number of columns (query_length)
double ** freq_ratios
frequency ratios
Internal data structure to represent a position in the multiple sequence alignment data structure.
unsigned int letter
Preferred letter at this position.
SSeqRange extents
Extents of this aligned position.
unsigned int is_aligned
Is this letter part of the alignment?
Internal multiple alignment data structure used by the PSSM engine.
Uint4 * num_matching_seqs
number of sequences aligned at a given position in the multiple sequence alignment (length: query_len...
Uint1 * query
query sequence (length: query_length)
Uint4 ** residue_counts
matrix to keep track of the raw residue counts at each position of the multiple sequence alignment (d...
PSIMsaDimensions * dimensions
dimensions of field below
Uint4 alphabet_size
number of elements in alphabet
_PSIMsaCell ** cell
multiple sequence alignment matrix (dimensions: query_length x num_seqs + 1)
Compact version of the PSIMsaCell structure.
unsigned int letter
Preferred letter at this position, in ncbistdaa encoding.
unsigned int is_aligned
Is this letter part of the alignment?
Compact version of PSIMsa structure.
PSIMsaDimensions * dimensions
dimensions of the msa
Boolean * use_sequence
used to indicate whether a sequence should be used for further processing by the engine (length: num_...
_PSIPackedMsaCell ** data
actual data, dimensions are (dimensions->num_seqs+1) by (dimensions->query_length)
Internal data structure to keep computed sequence weights.
double ** match_weights
weighted observed residue frequencies (f_i in 2001 paper).
Uint4 posDistinctDistrib_size
Kept to deallocate field above.
double * std_prob
standard amino acid probabilities
double * norm_seq_weights
Stores the normalized sequence weights (length: num_seqs + 1)
int ** posDistinctDistrib
For position i, how many positions in its block have j distinct letters.
double * independent_observations
Number of independent sequences per column.
int * posNumParticipating
number of sequences at each position.
double * sigma
array of length query_length
double * gapless_column_weights
FIXME.
Uint4 match_weights_size
kept to help deallocate the field above
double * row_sigma
array of length num_seqs + 1