NCBI C++ ToolKit
blast_psi.h
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blast_psi.h 52298 2011-12-14 20:08:44Z boratyng $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Christiam Camacho
27  *
28  */
29 
30 /** @file blast_psi.h
31  * High level definitions and declarations for the PSSM engine of PSI-BLAST.
32  */
33 
34 #ifndef ALGO_BLAST_CORE___BLAST_PSI__H
35 #define ALGO_BLAST_CORE___BLAST_PSI__H
36 
41 
42 #ifdef __cplusplus
43 extern "C" {
44 #endif
45 
46 /** Structure to describe the characteristics of a position in the multiple
47  * sequence alignment data structure
48  */
49 typedef struct PSIMsaCell {
50  Uint1 letter; /**< Preferred letter at this position, in
51  ncbistdaa encoding */
52  Boolean is_aligned; /**< Is this letter part of the alignment? */
54 
55 /** Structure representing the dimensions of the multiple sequence alignment
56  * data structure */
57 typedef struct PSIMsaDimensions {
58  Uint4 query_length; /**< Length of the query */
59  Uint4 num_seqs; /**< Number of distinct sequences aligned with the
60  query (does not include the query) */
62 
63 #ifdef DEBUG_PSSM_ENGINE
64 /** Sequence information structure */
65 typedef struct PSISeqInfo {
66  int gi; /**< Sequence GI */
67  double bit_score; /**< Bit score of this sequence aligned with query*/
68  double evalue; /**< E-value of this sequence aligned with query*/
69 } PSISeqInfo;
70 #endif /* DEBUG_PSSM_ENGINE */
71 
72 /** Multiple sequence alignment (msa) data structure containing the raw data
73  * needed by the PSSM engine to create a PSSM. By convention, the first row of
74  * the data field contains the query sequence */
75 typedef struct PSIMsa {
76  PSIMsaDimensions* dimensions; /**< dimensions of the msa */
77  PSIMsaCell** data; /**< actual data, dimensions are
78  (dimensions->num_seqs+1) by
79  (dimensions->query_length) */
80 #ifdef DEBUG_PSSM_ENGINE
81  PSISeqInfo* seqinfo; /** sequence information for a row of the
82  multiple sequence alignment, length is
83  dimensions->num_seqs+1 */
84 
85 #endif /* DEBUG_PSSM_ENGINE */
87 
88 /** Allocates and initializes the multiple sequence alignment data structure
89  * for use as input to the PSSM engine.
90  * @param dimensions dimensions of multiple sequence alignment data structure
91  * to allocate [in]
92  * @return allocated PSIMsa structure or NULL if out of memory.
93  */
95 PSIMsa*
96 PSIMsaNew(const PSIMsaDimensions* dimensions);
97 
98 /** Deallocates the PSIMsa structure
99  * @param msa multiple sequence alignment structure to deallocate [in]
100  * @return NULL
101  */
103 PSIMsa*
104 PSIMsaFree(PSIMsa* msa);
105 
106 /*******************************************************************
107 * Data structures for computing PSSM from using Conserved Domains
108 *
109 */
110 
111 /** Data needed for PSSM computation stored in MSA cell for single column in
112  * CD aligned to a position in the query */
113 typedef struct PSICdMsaCellData {
114 
115  double* wfreqs; /**< Frequencies for each residue in
116  CD column */
117 
118  double iobsr; /**< Effective number of independent
119  observations in a CD column */
121 
122 /** Alignment cell that represents one column of CD aligned to a position
123  * in the query*/
124 typedef struct PSICdMsaCell {
125  Uint1 is_aligned; /**< Does this cell represent column aligned
126  to a CD */
127 
128  PSICdMsaCellData* data; /**< Data needed for PSSM computation */
130 
131 
132 /** Data structure representing multiple alignemnt of CDs and query sequence
133  along with data needed for PSSM computation */
134 typedef struct PSICdMsa {
135  unsigned char* query; /**< Query sequence as Ncbistdaa */
136  PSIMsaDimensions* dimensions; /**< Query length and number of aligned cds */
137 
138  PSICdMsaCell **msa; /**< Multiple alignment of CDs */
140 
141 
142 #ifdef DEBUG_PSSM_ENGINE
144 void PrintMsa(const char* filename, const PSIMsa* msa);
146 void PrintMsaFP(FILE* fp, const PSIMsa* msa);
147 #endif /* DEBUG_PSSM_ENGINE */
148 
149 /** This is the main return value from the PSSM engine */
150 typedef struct PSIMatrix {
151  Uint4 ncols; /**< Number of columns in PSSM (query_length) */
152  Uint4 nrows; /**< Number of rows in PSSM (alphabet_size) */
153  int** pssm; /**< Position-specific score matrix */
154  double lambda; /**< Lambda Karlin-Altschul parameter */
155  double kappa; /**< Kappa Karlin-Altschul parameter */
156  double h; /**< H Karlin-Altschul parameter */
157  double ung_lambda; /**< Ungapped Lambda Karlin-Altschul parameter */
158  double ung_kappa; /**< Ungapped Kappa Karlin-Altschul parameter */
159  double ung_h; /**< Ungapped H Karlin-Altschul parameter */
161 
162 /** Allocates a new PSIMatrix structure
163  * @param query_length number of columns allocated for the PSSM [in]
164  * @param alphabet_size number of rows allocated for the PSSM [in]
165  * @return pointer to allocated PSIMatrix structure or NULL if out of memory
166  */
168 PSIMatrix*
169 PSIMatrixNew(Uint4 query_length, Uint4 alphabet_size);
170 
171 /** Deallocates the PSIMatrix structure passed in.
172  * @param matrix structure to deallocate [in]
173  * @return NULL
174  */
176 PSIMatrix*
177 PSIMatrixFree(PSIMatrix* matrix);
178 
179 /** Structure to allow requesting various diagnostics data to be collected by
180  * PSSM engine */
181 typedef struct PSIDiagnosticsRequest {
182  Boolean information_content; /**< request information content */
183  Boolean residue_frequencies; /**< request observed residue
184  frequencies */
185  Boolean weighted_residue_frequencies; /**< request observed weighted
186  residue frequencies */
187  Boolean frequency_ratios; /**< request frequency ratios */
188  Boolean gapless_column_weights; /**< request gapless column weights
189  */
190  Boolean sigma; /**< request sigma */
191  Boolean interval_sizes; /**< request interval sizes */
192  Boolean num_matching_seqs; /**< request number of matching
193  sequences */
194  Boolean independent_observations; /**< request number of independent
195  observations */
196 
198 
199 /** This structure contains the diagnostics information requested using the
200  * PSIDiagnosticsRequest structure */
201 typedef struct PSIDiagnosticsResponse {
202  double* information_content; /**< position information content
203  (query_length elements)*/
204  Uint4** residue_freqs; /**< observed residue frequencies
205  per position of the PSSM
206  (Dimensions are query_length by
207  alphabet_size) */
208  double** weighted_residue_freqs; /**< Weighted observed residue
209  frequencies per position of the
210  PSSM. (Dimensions are query_length
211  by alphabet_size) */
212  double** frequency_ratios; /**< PSSM's frequency ratios
213  (Dimensions are query_length by
214  alphabet_size) */
215  double* gapless_column_weights; /**< Weights for columns without
216  gaps (query_length elements) */
217  double* sigma; /**< sigma (query_length elements) */
218  Uint4* interval_sizes; /**< interval sizes of aligned
219  regions (query_length elements) */
220  Uint4* num_matching_seqs; /**< number of matching sequences
221  per query position (query_length
222  elements) */
223  Uint4 query_length; /**< Specifies the number of
224  positions in the PSSM */
225  Uint4 alphabet_size; /**< Specifies length of alphabet */
226 
227  double* independent_observations; /**< Effective number of
228  observations per column */
230 
231 /** Allocates a PSIDiagnosticsRequest structure, setting all fields to false
232  * @return newly allocated structure or NULL in case of memory allocation
233  * failure
234  */
238 
239 /** Allocates a PSIDiagnosticsRequest structure, setting fields to their
240  * default values for their use in the context of the PSI-BLAST application.
241  * @param save_ascii_pssm corresponds to the command line argument to save the
242  * PSSM in ASCII format [in]
243  * @return newly allocated structure or NULL in case of memory allocation
244  * failure
245  */
248 PSIDiagnosticsRequestNewEx(Boolean save_ascii_pssm);
249 
250 /** Deallocates the PSIDiagnosticsRequest structure passed in
251  * @param diags_request structure to deallocate [in]
252  * @return NULL
253  */
257 
258 /** Allocates a new PSI-BLAST diagnostics structure based on which fields of
259  * the PSIDiagnosticsRequest structure are TRUE. Note: this is declared
260  * here for consistency - this does not need to be called by client code of
261  * this API, it is called in the PSICreatePssm* functions to allocate the
262  * diagnostics response structure.
263  * @param query_length length of the query sequence [in]
264  * @param alphabet_size length of the alphabet [in]
265  * @param request diagnostics to retrieve from PSSM engine [in]
266  * @return pointer to allocated PSIDiagnosticsResponse or NULL if dimensions or
267  * request are NULL
268  */
271 PSIDiagnosticsResponseNew(Uint4 query_length, Uint4 alphabet_size,
272  const PSIDiagnosticsRequest* request);
273 
274 /** Deallocates the PSIDiagnosticsResponse structure passed in.
275  * @param diags structure to deallocate [in]
276  * @return NULL
277  */
281 
282 /****************************************************************************/
283 
284 /** Main entry point to core PSSM engine to calculate the PSSM.
285  * @param msap multiple sequence alignment data structure [in]
286  * @param options options to the PSSM engine [in]
287  * @param sbp BLAST score block structure [in|out]
288  * @param pssm PSSM and statistical information (the latter is also returned
289  * in the sbp->kbp_gap_psi[0])
290  * @return PSI_SUCCESS on success, otherwise one of the PSIERR_* constants
291  */
293 int
294 PSICreatePssm(const PSIMsa* msap,
295  const PSIBlastOptions* options,
296  BlastScoreBlk* sbp,
297  PSIMatrix** pssm);
298 
299 /** Main entry point to core PSSM engine which allows to request diagnostics
300  * information.
301  * @param msap multiple sequence alignment data structure [in]
302  * @param options options to the PSSM engine [in]
303  * @param sbp BLAST score block structure [in|out]
304  * @param request diagnostics information request [in]
305  * @param pssm PSSM and statistical information (the latter is also returned
306  * in the sbp->kbp_gap_psi[0]) [out]
307  * @param diagnostics diagnostics information response, expects a pointer to an
308  * uninitialized structure which will be populated with data requested in
309  * requests [in|out]
310  * @return PSI_SUCCESS on success, otherwise one of the PSIERR_* constants
311  */
313 int
315  const PSIBlastOptions* options,
316  BlastScoreBlk* sbp,
317  const PSIDiagnosticsRequest* request,
318  PSIMatrix** pssm,
319  PSIDiagnosticsResponse** diagnostics);
320 
321 /** Main entry point to core PSSM engine for computing CDD-based PSSMs
322  * @param cd_msa information about CDs that match to query sequence [in]
323  * @param options options to PSSM engine [in]
324  * @param sbp BLAST score block structure [in|out]
325  * @param request diagnostics information request [in]
326  * @param pssm PSSM [out]
327  * @param diagnostics diagnostics information response, expects a pointer to
328  * uninitialized structure [in|out]
329  */
330 int
331 PSICreatePssmFromCDD(const PSICdMsa* cd_msa,
332  const PSIBlastOptions* options,
333  BlastScoreBlk* sbp,
334  const PSIDiagnosticsRequest* request,
335  PSIMatrix** pssm,
336  PSIDiagnosticsResponse** diagnostics);
337 
338 
339 
340 /** Top-level function to create a PSSM given a matrix of frequency ratios
341  * and perform scaling on the resulting PSSM (i.e.: performs the last two
342  * stages of the algorithm)
343  * Note that no diagnostics can be returned as those are calculated in earlier
344  * stages of the algorithm.
345  * @param query query sequence in ncbistdaa format, no sentinels needed [in]
346  * @param query_length length of the query sequence [in]
347  * @param sbp BLAST score block structure [in|out]
348  * @param freq_ratios matrix of frequency ratios, dimensions are query_length
349  * by BLASTAA_SIZE [in]
350  * @param impala_scaling_factor scaling factor used in IMPALA-style scaling if
351  * its value is NOT kPSSM_NoImpalaScaling (otherwise it performs standard
352  * PSI-BLAST scaling) [in]
353  * @param pssm PSSM and statistical information [in|out]
354  * @return PSI_SUCCESS on success, otherwise one of the PSIERR_* constants
355  * @todo FIXME change scalePosMatrix (blast_kappa.c) to use this function
356  */
358 int
360  Uint4 query_length,
361  BlastScoreBlk* sbp,
362  double** freq_ratios,
363  double impala_scaling_factor,
364  PSIMatrix** pssm);
365 
366 #ifdef __cplusplus
367 }
368 #endif
369 
370 #endif /* !ALGO_BLAST_CORE__BLAST_PSI__H */
371 
Defines to provide correct exporting from BLAST DLL in Windows.
#define NCBI_XBLAST_EXPORT
NULL operations for other cases.
Definition: blast_export.h:65
The structures and functions in blast_options.
PSIDiagnosticsResponse * PSIDiagnosticsResponseFree(PSIDiagnosticsResponse *diags)
Deallocates the PSIDiagnosticsResponse structure passed in.
Definition: blast_psi.c:717
int PSICreatePssmFromCDD(const PSICdMsa *cd_msa, const PSIBlastOptions *options, BlastScoreBlk *sbp, const PSIDiagnosticsRequest *request, PSIMatrix **pssm, PSIDiagnosticsResponse **diagnostics)
Main entry point to core PSSM engine for computing CDD-based PSSMs.
Definition: blast_psi.c:229
PSIDiagnosticsResponse * PSIDiagnosticsResponseNew(Uint4 query_length, Uint4 alphabet_size, const PSIDiagnosticsRequest *request)
Allocates a new PSI-BLAST diagnostics structure based on which fields of the PSIDiagnosticsRequest st...
Definition: blast_psi.c:618
PSIMatrix * PSIMatrixFree(PSIMatrix *matrix)
Deallocates the PSIMatrix structure passed in.
Definition: blast_psi.c:569
PSIMsa * PSIMsaFree(PSIMsa *msa)
Deallocates the PSIMsa structure.
Definition: blast_psi.c:513
PSIDiagnosticsRequest * PSIDiagnosticsRequestNew(void)
Allocates a PSIDiagnosticsRequest structure, setting all fields to false.
Definition: blast_psi.c:585
int PSICreatePssmFromFrequencyRatios(const Uint1 *query, Uint4 query_length, BlastScoreBlk *sbp, double **freq_ratios, double impala_scaling_factor, PSIMatrix **pssm)
Top-level function to create a PSSM given a matrix of frequency ratios and perform scaling on the res...
Definition: blast_psi.c:344
struct PSIMatrix PSIMatrix
This is the main return value from the PSSM engine.
struct PSICdMsaCell PSICdMsaCell
Alignment cell that represents one column of CD aligned to a position in the query.
struct PSIMsaCell PSIMsaCell
Structure to describe the characteristics of a position in the multiple sequence alignment data struc...
PSIDiagnosticsRequest * PSIDiagnosticsRequestNewEx(Boolean save_ascii_pssm)
Allocates a PSIDiagnosticsRequest structure, setting fields to their default values for their use in ...
Definition: blast_psi.c:591
struct PSICdMsa PSICdMsa
Data structure representing multiple alignemnt of CDs and query sequence along with data needed for P...
PSIMatrix * PSIMatrixNew(Uint4 query_length, Uint4 alphabet_size)
Allocates a new PSIMatrix structure.
Definition: blast_psi.c:541
struct PSIMsa PSIMsa
Multiple sequence alignment (msa) data structure containing the raw data needed by the PSSM engine to...
struct PSICdMsaCellData PSICdMsaCellData
Data needed for PSSM computation stored in MSA cell for single column in CD aligned to a position in ...
struct PSIDiagnosticsRequest PSIDiagnosticsRequest
Structure to allow requesting various diagnostics data to be collected by PSSM engine.
int PSICreatePssm(const PSIMsa *msap, const PSIBlastOptions *options, BlastScoreBlk *sbp, PSIMatrix **pssm)
Main entry point to core PSSM engine to calculate the PSSM.
Definition: blast_psi.c:95
PSIMsa * PSIMsaNew(const PSIMsaDimensions *dimensions)
Allocates and initializes the multiple sequence alignment data structure for use as input to the PSSM...
Definition: blast_psi.c:462
int PSICreatePssmWithDiagnostics(const PSIMsa *msap, const PSIBlastOptions *options, BlastScoreBlk *sbp, const PSIDiagnosticsRequest *request, PSIMatrix **pssm, PSIDiagnosticsResponse **diagnostics)
Main entry point to core PSSM engine which allows to request diagnostics information.
Definition: blast_psi.c:105
PSIDiagnosticsRequest * PSIDiagnosticsRequestFree(PSIDiagnosticsRequest *diags_request)
Deallocates the PSIDiagnosticsRequest structure passed in.
Definition: blast_psi.c:611
struct PSIMsaDimensions PSIMsaDimensions
Structure representing the dimensions of the multiple sequence alignment data structure.
struct PSIDiagnosticsResponse PSIDiagnosticsResponse
This structure contains the diagnostics information requested using the PSIDiagnosticsRequest structu...
Definitions and prototypes used by blast_stat.c to calculate BLAST statistics.
static const char fp[]
Definition: des.c:87
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
Type and macro definitions from C toolkit that are not defined in C++ toolkit.
Uint1 Boolean
bool replacment for C
Definition: ncbi_std.h:94
Structure used for scoring calculations.
Definition: blast_stat.h:177
Options used in protein BLAST only (PSI, PHI, RPS and translated BLAST) Some of these possibly should...
Data needed for PSSM computation stored in MSA cell for single column in CD aligned to a position in ...
Definition: blast_psi.h:113
double iobsr
Effective number of independent observations in a CD column.
Definition: blast_psi.h:118
double * wfreqs
Frequencies for each residue in CD column.
Definition: blast_psi.h:115
Alignment cell that represents one column of CD aligned to a position in the query.
Definition: blast_psi.h:124
Uint1 is_aligned
Does this cell represent column aligned to a CD.
Definition: blast_psi.h:125
PSICdMsaCellData * data
Data needed for PSSM computation.
Definition: blast_psi.h:128
Data structure representing multiple alignemnt of CDs and query sequence along with data needed for P...
Definition: blast_psi.h:134
PSIMsaDimensions * dimensions
Query length and number of aligned cds.
Definition: blast_psi.h:136
unsigned char * query
Query sequence as Ncbistdaa.
Definition: blast_psi.h:135
PSICdMsaCell ** msa
Multiple alignment of CDs.
Definition: blast_psi.h:138
Structure to allow requesting various diagnostics data to be collected by PSSM engine.
Definition: blast_psi.h:181
Boolean information_content
request information content
Definition: blast_psi.h:182
Boolean frequency_ratios
request frequency ratios
Definition: blast_psi.h:187
Boolean independent_observations
request number of independent observations
Definition: blast_psi.h:194
Boolean weighted_residue_frequencies
request observed weighted residue frequencies
Definition: blast_psi.h:185
Boolean gapless_column_weights
request gapless column weights
Definition: blast_psi.h:188
Boolean num_matching_seqs
request number of matching sequences
Definition: blast_psi.h:192
Boolean sigma
request sigma
Definition: blast_psi.h:190
Boolean residue_frequencies
request observed residue frequencies
Definition: blast_psi.h:183
Boolean interval_sizes
request interval sizes
Definition: blast_psi.h:191
This structure contains the diagnostics information requested using the PSIDiagnosticsRequest structu...
Definition: blast_psi.h:201
double * information_content
position information content (query_length elements)
Definition: blast_psi.h:202
Uint4 ** residue_freqs
observed residue frequencies per position of the PSSM (Dimensions are query_length by alphabet_size)
Definition: blast_psi.h:204
double ** weighted_residue_freqs
Weighted observed residue frequencies per position of the PSSM.
Definition: blast_psi.h:208
Uint4 * interval_sizes
interval sizes of aligned regions (query_length elements)
Definition: blast_psi.h:218
Uint4 alphabet_size
Specifies length of alphabet.
Definition: blast_psi.h:225
Uint4 query_length
Specifies the number of positions in the PSSM.
Definition: blast_psi.h:223
double * gapless_column_weights
Weights for columns without gaps (query_length elements)
Definition: blast_psi.h:215
double * independent_observations
Effective number of observations per column.
Definition: blast_psi.h:227
Uint4 * num_matching_seqs
number of matching sequences per query position (query_length elements)
Definition: blast_psi.h:220
double * sigma
sigma (query_length elements)
Definition: blast_psi.h:217
double ** frequency_ratios
PSSM's frequency ratios (Dimensions are query_length by alphabet_size)
Definition: blast_psi.h:212
This is the main return value from the PSSM engine.
Definition: blast_psi.h:150
double ung_lambda
Ungapped Lambda Karlin-Altschul parameter.
Definition: blast_psi.h:157
double kappa
Kappa Karlin-Altschul parameter.
Definition: blast_psi.h:155
int ** pssm
Position-specific score matrix.
Definition: blast_psi.h:153
double ung_kappa
Ungapped Kappa Karlin-Altschul parameter.
Definition: blast_psi.h:158
Uint4 ncols
Number of columns in PSSM (query_length)
Definition: blast_psi.h:151
double ung_h
Ungapped H Karlin-Altschul parameter.
Definition: blast_psi.h:159
double lambda
Lambda Karlin-Altschul parameter.
Definition: blast_psi.h:154
Uint4 nrows
Number of rows in PSSM (alphabet_size)
Definition: blast_psi.h:152
double h
H Karlin-Altschul parameter.
Definition: blast_psi.h:156
Structure to describe the characteristics of a position in the multiple sequence alignment data struc...
Definition: blast_psi.h:49
Boolean is_aligned
Is this letter part of the alignment?
Definition: blast_psi.h:52
Uint1 letter
Preferred letter at this position, in ncbistdaa encoding.
Definition: blast_psi.h:50
Structure representing the dimensions of the multiple sequence alignment data structure.
Definition: blast_psi.h:57
Uint4 num_seqs
Number of distinct sequences aligned with the query (does not include the query)
Definition: blast_psi.h:59
Uint4 query_length
Length of the query.
Definition: blast_psi.h:58
Multiple sequence alignment (msa) data structure containing the raw data needed by the PSSM engine to...
Definition: blast_psi.h:75
PSIMsaCell ** data
actual data, dimensions are (dimensions->num_seqs+1) by (dimensions->query_length)
Definition: blast_psi.h:77
PSIMsaDimensions * dimensions
dimensions of the msa
Definition: blast_psi.h:76
static string query
Modified on Wed Sep 04 15:00:39 2024 by modify_doxy.py rev. 669887