90 double impala_scaling_factor);
119 if ( !msap || !options || !sbp || !pssm ) {
130 seq_weights, internal_pssm);
142 if ( !msa || ! aligned_block || !seq_weights || !internal_pssm || !*pssm ) {
144 seq_weights, internal_pssm);
159 seq_weights, internal_pssm);
166 seq_weights, internal_pssm);
175 seq_weights, internal_pssm);
185 seq_weights, internal_pssm);
194 seq_weights, internal_pssm);
202 if (request && diagnostics) {
206 if ( !*diagnostics ) {
210 seq_weights, internal_pssm);
214 internal_pssm, *diagnostics);
218 seq_weights, internal_pssm);
240 if ( !cd_msa || !options || !sbp || !pssm ) {
253 if ( !seq_weights || !internal_pssm || !*pssm ) {
296 if (request && diagnostics) {
302 if ( !*diagnostics ) {
347 double** freq_ratios,
348 double impala_scaling_factor,
352 double* std_prob =
NULL;
359 if ( !std_prob || !*pssm || !internal_pssm ) {
371 impala_scaling_factor);
390 double impala_scaling_factor)
409 impala_scaling_factor);
486 if ( !retval->
data ) {
493 for (s = 0; s < dimensions->
num_seqs + 1; s++) {
501 #ifdef DEBUG_PSSM_ENGINE
504 if ( !retval->seqinfo ) {
529 #ifdef DEBUG_PSSM_ENGINE
530 if ( msa->seqinfo ) {
549 retval->
ncols = query_length;
550 retval->
nrows = alphabet_size;
554 if ( !(retval->
pssm) ) {
599 if (save_ascii_pssm) {
613 sfree(diags_request);
640 calloc(query_length,
sizeof(
double));
677 calloc(query_length,
sizeof(
double));
684 retval->
sigma = (
double*)
calloc(query_length,
sizeof(
double));
685 if ( !retval->
sigma ) {
707 (
double*)
calloc(query_length,
sizeof(
double));
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
Declarations of static arrays used to define some NCBI encodings to be used in a toolkit independent ...
const double kPSSM_NoImpalaScaling
Value used to indicate that no IMPALA-style scaling should be performed when scaling a PSSM.
PSIDiagnosticsResponse * PSIDiagnosticsResponseFree(PSIDiagnosticsResponse *diags)
Deallocates the PSIDiagnosticsResponse structure passed in.
int PSICreatePssmFromCDD(const PSICdMsa *cd_msa, const PSIBlastOptions *options, BlastScoreBlk *sbp, const PSIDiagnosticsRequest *request, PSIMatrix **pssm, PSIDiagnosticsResponse **diagnostics)
Main entry point to core PSSM engine for computing CDD-based PSSMs.
PSIMatrix * PSIMatrixFree(PSIMatrix *matrix)
Deallocates the PSIMatrix structure passed in.
PSIDiagnosticsResponse * PSIDiagnosticsResponseNew(Uint4 query_length, Uint4 alphabet_size, const PSIDiagnosticsRequest *wants)
Allocates a new PSI-BLAST diagnostics structure based on which fields of the PSIDiagnosticsRequest st...
PSIMsa * PSIMsaFree(PSIMsa *msa)
Deallocates the PSIMsa structure.
static void s_PSICreatePssmCleanUp(PSIMatrix **pssm, _PSIPackedMsa *packed_msa, _PSIMsa *msa, _PSIAlignedBlock *aligned_block, _PSISequenceWeights *seq_weights, _PSIInternalPssmData *internal_pssm)
Convenience function to deallocate data structures allocated in PSICreatePssmWithDiagnostics.
PSIDiagnosticsRequest * PSIDiagnosticsRequestNew(void)
Allocates a PSIDiagnosticsRequest structure, setting all fields to false.
static int _PSICreateAndScalePssmFromFrequencyRatios(_PSIInternalPssmData *internal_pssm, const Uint1 *query, Uint4 query_length, double *std_prob, BlastScoreBlk *sbp, double impala_scaling_factor)
Private function which performs the last 2 stages of the PSSM creation: conversion of PSSM frequency r...
int PSICreatePssmFromFrequencyRatios(const Uint1 *query, Uint4 query_length, BlastScoreBlk *sbp, double **freq_ratios, double impala_scaling_factor, PSIMatrix **pssm)
Top-level function to create a PSSM given a matrix of frequency ratios and perform scaling on the res...
PSIDiagnosticsRequest * PSIDiagnosticsRequestNewEx(Boolean save_ascii_pssm)
Allocates a PSIDiagnosticsRequest structure, setting fields to their default values for their use in ...
PSIMatrix * PSIMatrixNew(Uint4 query_length, Uint4 alphabet_size)
Allocates a new PSIMatrix structure.
static void s_PSICreatePssmFromFrequencyRatiosCleanUp(PSIMatrix **pssm, _PSIInternalPssmData *internal_pssm, double *std_prob)
Convenience function to deallocate data structures allocated in PSICreatePssmFromFrequencyRatios.
static void s_PSISavePssm(const _PSIInternalPssmData *internal_pssm, const BlastScoreBlk *sbp, PSIMatrix *pssm)
Copies pssm data from internal_pssm and sbp into pssm.
int PSICreatePssm(const PSIMsa *msap, const PSIBlastOptions *options, BlastScoreBlk *sbp, PSIMatrix **pssm)
Main entry point to core PSSM engine to calculate the PSSM.
PSIMsa * PSIMsaNew(const PSIMsaDimensions *dimensions)
Allocates and initializes the multiple sequence alignment data structure for use as input to the PSSM...
int PSICreatePssmWithDiagnostics(const PSIMsa *msap, const PSIBlastOptions *options, BlastScoreBlk *sbp, const PSIDiagnosticsRequest *request, PSIMatrix **pssm, PSIDiagnosticsResponse **diagnostics)
Main entry point to core PSSM engine which allows requesting diagnostics information.
PSIDiagnosticsRequest * PSIDiagnosticsRequestFree(PSIDiagnosticsRequest *diags_request)
Deallocates the PSIDiagnosticsRequest structure passed in.
int _PSIComputeAlignmentBlocks(const _PSIMsa *msa, _PSIAlignedBlock *aligned_blocks)
Main function to compute aligned blocks' properties for each position within multiple alignment (stag...
int _PSIConvertFreqRatiosToPSSM(_PSIInternalPssmData *internal_pssm, const Uint1 *query, const BlastScoreBlk *sbp, const double *std_probs)
Converts the PSSM's frequency ratios obtained in the previous stage to a PSSM of scores.
int _PSIComputeFreqRatios(const _PSIMsa *msa, const _PSISequenceWeights *seq_weights, const BlastScoreBlk *sbp, const _PSIAlignedBlock *aligned_blocks, Int4 pseudo_count, Boolean nsg_compatibility_mode, _PSIInternalPssmData *internal_pssm)
Main function to compute the PSSM's frequency ratios (stage 5).
void ** _PSIAllocateMatrix(unsigned int ncols, unsigned int nrows, unsigned int data_type_sz)
Generic 2 dimensional matrix allocator.
void _PSIStructureGroupCustomization(_PSIMsa *msa)
Enable NCBI structure group customization to discard the query sequence, as this really isn't the res...
int _PSIComputeFreqRatiosFromCDs(const PSICdMsa *cd_msa, const _PSISequenceWeights *seq_weights, const BlastScoreBlk *sbp, Int4 pseudo_count, _PSIInternalPssmData *internal_pssm)
Main function to compute CD-based PSSM's frequency ratios.
_PSISequenceWeights * _PSISequenceWeightsNew(const PSIMsaDimensions *dimensions, const BlastScoreBlk *sbp)
Allocates and initializes the _PSISequenceWeights structure.
_PSIInternalPssmData * _PSIInternalPssmDataNew(Uint4 query_length, Uint4 alphabet_size)
Allocates a new _PSIInternalPssmData structure.
_PSIAlignedBlock * _PSIAlignedBlockNew(Uint4 query_length)
Allocates and initializes the _PSIAlignedBlock structure.
int _PSIComputeSequenceWeights(const _PSIMsa *msa, const _PSIAlignedBlock *aligned_blocks, Boolean nsg_compatibility_mode, _PSISequenceWeights *seq_weights)
Main function to calculate the sequence weights.
int _PSISaveDiagnostics(const _PSIMsa *msa, const _PSIAlignedBlock *aligned_block, const _PSISequenceWeights *seq_weights, const _PSIInternalPssmData *internal_pssm, PSIDiagnosticsResponse *diagnostics)
Collects diagnostic information from the process of creating the PSSM.
int _PSIPurgeBiasedSegments(_PSIPackedMsa *msa)
Main function for keeping only those selected sequences for PSSM construction (stage 2).
int _PSIComputeFrequenciesFromCDs(const PSICdMsa *cd_msa, BlastScoreBlk *sbp, const PSIBlastOptions *options, _PSISequenceWeights *seq_weights)
Main function to calculate CD weights and combine weighted residue counts from matched CDs.
_PSIMsa * _PSIMsaNew(const _PSIPackedMsa *msa, Uint4 alphabet_size)
Allocates and initializes the internal version of the PSIMsa structure (makes a deep copy) for intern...
_PSISequenceWeights * _PSISequenceWeightsFree(_PSISequenceWeights *seq_weights)
Deallocates the _PSISequenceWeights structure.
void ** _PSIDeallocateMatrix(void **matrix, unsigned int ncols)
Generic 2 dimensional matrix deallocator.
void _PSICopyMatrix_int(int **dest, int **src, unsigned int ncols, unsigned int nrows)
Copies src matrix into dest matrix, both of which must be int matrices with dimensions ncols by nrows...
_PSIInternalPssmData * _PSIInternalPssmDataFree(_PSIInternalPssmData *pssm_data)
Deallocates the _PSIInternalPssmData structure.
void _PSICopyMatrix_double(double **dest, double **src, unsigned int ncols, unsigned int nrows)
Copies src matrix into dest matrix, both of which must be double matrices with dimensions ncols by nr...
int _PSIValidateMSA_StructureGroup(const _PSIMsa *msa)
Structure group validation function for multiple sequence alignment structure.
int _PSIScaleMatrix(const Uint1 *query, const double *std_probs, _PSIInternalPssmData *internal_pssm, BlastScoreBlk *sbp)
Scales the PSSM (stage 7)
_PSIMsa * _PSIMsaFree(_PSIMsa *msa)
Deallocates the _PSIMsa data structure.
_PSIAlignedBlock * _PSIAlignedBlockFree(_PSIAlignedBlock *aligned_blocks)
Deallocates the _PSIAlignedBlock structure.
int _PSISaveCDDiagnostics(const PSICdMsa *cd_msa, const _PSISequenceWeights *seq_weights, const _PSIInternalPssmData *internal_pssm, PSIDiagnosticsResponse *diagnostics)
Collects diagnostic information from the process of creating the CDD-based PSSM.
int _PSIValidateCdMSA(const PSICdMsa *cd_msa, Uint4 alphabet_size)
Validation of multiple alignment of conserved domains structure.
_PSIPackedMsa * _PSIPackedMsaNew(const PSIMsa *msa)
Allocates and initializes the compact version of the PSIMsa structure (makes a deep copy) for interna...
int _IMPALAScaleMatrix(const Uint1 *query, const double *std_probs, _PSIInternalPssmData *internal_pssm, BlastScoreBlk *sbp, double scaling_factor)
Provides a similar function to _PSIScaleMatrix but it performs the scaling as IMPALA did,...
_PSIPackedMsa * _PSIPackedMsaFree(_PSIPackedMsa *msa)
Deallocates the _PSIPackedMsa data structure.
int _PSIValidateMSA(const _PSIMsa *msa, Boolean ignore_unaligned_positions)
Main validation function for multiple sequence alignment structure.
Private interface for Position Iterated BLAST API, contains the PSSM generation engine.
#define PSIERR_BADPARAM
Bad parameter used in function.
#define PSIERR_OUTOFMEM
Out of memory.
#define PSI_SUCCESS
Successful operation.
Various auxiliary BLAST utility functions.
double * BLAST_GetStandardAaProbabilities(void)
Get the standard amino acid probabilities.
uint8_t Uint1
1-byte (8-bit) unsigned integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
Uint1 Boolean
bool replacement for C
#define TRUE
bool replacement for C indicating true.
#define FALSE
bool replacement for C indicating false.
#define ASSERT
macro for assert.
Structure used for scoring calculations.
Blast_KarlinBlk ** kbp_psi
K-A parameters for position-based alignments.
Int2 alphabet_size
size of alphabet.
Blast_KarlinBlk ** kbp_gap_psi
K-A parameters for psi alignments.
double K
K value used in statistics.
double Lambda
Lambda value used in statistics.
double H
H value used in statistics.
Options used in protein BLAST only (PSI, PHI, RPS and translated BLAST) Some of these possibly should...
Boolean nsg_compatibility_mode
Compatibility option for the NCBI's structure group (note nsg_ prefix, stands for NCBI's structure gr...
double impala_scaling_factor
Scaling factor as used in IMPALA to do the matrix rescaling.
Boolean ignore_unaligned_positions
This turns off a validation for the multiple sequence alignment in the PSSM engine for unaligned posi...
Int4 pseudo_count
Pseudocount constant.
Data structure representing multiple alignment of CDs and query sequence along with data needed for P...
PSIMsaDimensions * dimensions
Query length and number of aligned cds.
unsigned char * query
Query sequence as Ncbistdaa.
Structure to allow requesting various diagnostics data to be collected by PSSM engine.
Boolean information_content
request information content
Boolean frequency_ratios
request frequency ratios
Boolean independent_observations
request number of independent observations
Boolean weighted_residue_frequencies
request observed weighted residue frequencies
Boolean gapless_column_weights
request gapless column weights
Boolean num_matching_seqs
request number of matching sequences
Boolean sigma
request sigma
Boolean residue_frequencies
request observed residue frequencies
Boolean interval_sizes
request interval sizes
This structure contains the diagnostics information requested using the PSIDiagnosticsRequest structu...
double * information_content
position information content (query_length elements)
Uint4 ** residue_freqs
observed residue frequencies per position of the PSSM (Dimensions are query_length by alphabet_size)
double ** weighted_residue_freqs
Weighted observed residue frequencies per position of the PSSM.
Uint4 * interval_sizes
interval sizes of aligned regions (query_length elements)
Uint4 alphabet_size
Specifies length of alphabet.
Uint4 query_length
Specifies the number of positions in the PSSM.
double * gapless_column_weights
Weights for columns without gaps (query_length elements)
double * independent_observations
Effective number of observations per column.
Uint4 * num_matching_seqs
number of matching sequences per query position (query_length elements)
double * sigma
sigma (query_length elements)
double ** frequency_ratios
PSSM's frequency ratios (Dimensions are query_length by alphabet_size)
This is the main return value from the PSSM engine.
double ung_lambda
Ungapped Lambda Karlin-Altschul parameter.
double kappa
Kappa Karlin-Altschul parameter.
int ** pssm
Position-specific score matrix.
double ung_kappa
Ungapped Kappa Karlin-Altschul parameter.
Uint4 ncols
Number of columns in PSSM (query_length)
double ung_h
Ungapped H Karlin-Altschul parameter.
double lambda
Lambda Karlin-Altschul parameter.
Uint4 nrows
Number of rows in PSSM (alphabet_size)
double h
H Karlin-Altschul parameter.
Structure to describe the characteristics of a position in the multiple sequence alignment data struc...
Boolean is_aligned
Is this letter part of the alignment?
Uint1 letter
Preferred letter at this position, in ncbistdaa encoding.
Structure representing the dimensions of the multiple sequence alignment data structure.
Uint4 num_seqs
Number of distinct sequences aligned with the query (does not include the query)
Uint4 query_length
Length of the query.
Multiple sequence alignment (msa) data structure containing the raw data needed by the PSSM engine to...
PSIMsaCell ** data
actual data, dimensions are (dimensions->num_seqs+1) by (dimensions->query_length)
PSIMsaDimensions * dimensions
dimensions of the msa
This structure keeps track of the regions aligned between the query sequence and those that were not ...
Internal representation of a PSSM in various stages of its creation and its dimensions.
Uint4 nrows
number of rows (alphabet_size)
Uint4 ncols
number of columns (query_length)
double ** freq_ratios
frequency ratios
Internal multiple alignment data structure used by the PSSM engine.
Uint1 * query
query sequence (length: query_length)
PSIMsaDimensions * dimensions
dimensions of field below
Compact version of PSIMsa structure.
Internal data structure to keep computed sequence weights.
double * std_prob
standard amino acid probabilities
voidp calloc(uInt items, uInt size)