NCBI C++ ToolKit
blast_psi_priv.h
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blast_psi_priv.h 52318 2011-12-16 13:17:19Z ivanov $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Alejandro Schaffer, ported by Christiam Camacho
27  *
28  */
29 
30 /** @file blast_psi_priv.h
31  * Private interface for Position Iterated BLAST API, contains the
32  * PSSM generation engine.
33  *
34  * <pre>
35  * Calculating PSSMs from Seq-aligns is a multi-stage process. These stages
36  * include:
37  * 1) Processing the Seq-align
38  * Examine alignment and extract information about aligned characters,
39  * performed at the API level
40  * 2) Purge biased sequences: construct M multiple sequence alignment as
41  * described in page 3395[1] - performed at the core level; custom
42  * selection of sequences should be performed at the API level.
43  * 3) Compute extents of the alignment: M sub C as described in page 3395[1]
44  * 4) Compute sequence weights
45  * 5) Compute residue frequencies
46  * 6) Convert residue frequencies to PSSM
47  * 7) Scale the resulting PSSM
48  * </pre>
49  */
50 
51 #ifndef ALGO_BLAST_CORE___BLAST_PSI_PRIV__H
52 #define ALGO_BLAST_CORE___BLAST_PSI_PRIV__H
53 
57 #include "matrix_freq_ratios.h"
58 
59 #ifdef __cplusplus
60 extern "C" {
61 #endif
62 
63 /****************************************************************************/
64 /* Extern declarations for constants (defined in blast_psi_priv.c) */
65 
66 /** Percent identity threshold for discarding near-identical matches */
68 extern const double kPSINearIdentical;
69 
70 /** Percent identity threshold for discarding identical matches */
72 extern const double kPSIIdentical;
73 
74 /** Index into multiple sequence alignment structure for the query sequence */
76 extern const unsigned int kQueryIndex;
77 
78 /** Small constant to test against 0 */
80 extern const double kEpsilon;
81 
82 /** Successor to POSIT_SCALE_FACTOR */
84 extern const int kPSIScaleFactor;
85 
86 /** Constant used in scaling PSSM routines: Successor to POSIT_PERCENT */
88 extern const double kPositScalingPercent;
89 /** Constant used in scaling PSSM routines: Successor to POSIT_NUM_ITERATIONS */
92 
93 /****************************************************************************/
94 /* Matrix utility functions */
95 
96 /** Generic 2 dimensional matrix allocator.
97  * Allocates a ncols by nrows matrix with cells of size data_type_sz. Must be
98  * freed using x_DeallocateMatrix
99  * @param ncols number of columns in matrix [in]
100  * @param nrows number of rows in matrix [in]
101  * @param data_type_sz size of the data type (in bytes) to allocate for each
102  * element in the matrix [in]
103  * @return pointer to allocated memory or NULL in case of failure
104  */
106 void**
107 _PSIAllocateMatrix(unsigned int ncols, unsigned int nrows,
108  unsigned int data_type_sz);
109 
110 /** Generic 2 dimensional matrix deallocator.
111  * Deallocates the memory allocated by x_AllocateMatrix
112  * @param matrix matrix to deallocate [in]
113  * @param ncols number of columns in the matrix [in]
114  * @return NULL
115  */
117 void**
118 _PSIDeallocateMatrix(void** matrix, unsigned int ncols);
119 
120 /** Copies src matrix into dest matrix, both of which must be int matrices with
121  * dimensions ncols by nrows
122  * @param dest Destination matrix [out]
123  * @param src Source matrix [in]
124  * @param ncols Number of columns to copy [in]
125  * @param nrows Number of rows to copy [in]
126  */
128 void
129 _PSICopyMatrix_int(int** dest, int** src,
130  unsigned int ncols, unsigned int nrows);
131 
132 /** Copies src matrix into dest matrix, both of which must be double matrices
133  * with dimensions ncols by nrows
134  * @param dest Destination matrix [out]
135  * @param src Source matrix [in]
136  * @param ncols Number of columns to copy [in]
137  * @param nrows Number of rows to copy [in]
138  */
140 void
141 _PSICopyMatrix_double(double** dest, double** src,
142  unsigned int ncols, unsigned int nrows);
143 
144 /****************************************************************************/
145 /* Structure declarations */
146 
147 /** Compact version of the PSIMsaCell structure */
148 typedef struct _PSIPackedMsaCell {
149  unsigned int letter:7; /**< Preferred letter at this position, in
150  ncbistdaa encoding */
151  unsigned int is_aligned:1; /**< Is this letter part of the alignment? */
153 
154 /** Compact version of PSIMsa structure */
155 typedef struct _PSIPackedMsa {
156  PSIMsaDimensions* dimensions; /**< dimensions of the msa */
157  _PSIPackedMsaCell** data; /**< actual data, dimensions are
158  (dimensions->num_seqs+1) by
159  (dimensions->query_length) */
160  Boolean* use_sequence; /**< used to indicate whether a
161  sequence should be used for
162  further processing by the engine
163  (length: num_seqs + 1) */
165 
166 /** Allocates and initializes the compact version of the PSIMsa structure
167  * (makes a deep copy) for internal use by the PSSM engine.
168  * @param msa multiple sequence alignment data structure provided by the user
169  * [in]
170  * @return newly allocated structure or NULL in case of memory allocation
171  * failure
172  */
175 _PSIPackedMsaNew(const PSIMsa* msa);
176 
177 /** Deallocates the _PSIMsa data structure.
178  * @param msa multiple sequence alignment data structure to deallocate [in]
179  * @return NULL
180  */
184 
185 /** Retrieve the number of aligned sequences in the compact multiple sequence
186  * alignment
187  * @param msa multiple sequence alignment data structure to deallocate [in]
188  */
190 unsigned int
192 
193 
194 /** Internal data structure to represent a position in the multiple sequence
195  * alignment data structure @sa _PSIMsa */
196 typedef struct _PSIMsaCell {
197  unsigned int letter:7; /**< Preferred letter at this position */
198  unsigned int is_aligned:1; /**< Is this letter part of the alignment? */
199  SSeqRange extents; /**< Extents of this aligned position */
201 
202 /** Internal multiple alignment data structure used by the PSSM engine */
203 typedef struct _PSIMsa {
204  PSIMsaDimensions* dimensions; /**< dimensions of field below */
205  _PSIMsaCell** cell; /**< multiple sequence alignment
206  matrix (dimensions: query_length
207  x num_seqs + 1) */
208  Uint1* query; /**< query sequence (length:
209  query_length) */
210  Uint4** residue_counts; /**< matrix to keep track of the
211  raw residue counts at each
212  position of the multiple sequence
213  alignment (dimensions:
214  query_length x alphabet_size) */
215  Uint4 alphabet_size; /**< number of elements in
216  alphabet */
217  Uint4* num_matching_seqs; /**< number of sequences aligned at
218  a given position in the multiple
219  sequence alignment (length:
220  query_length). Corresponds to
221  posit.h:posSearch->posCount */
223 
224 /** Allocates and initializes the internal version of the PSIMsa structure
225  * (makes a deep copy) for internal use by the PSSM engine.
226  * @param packed_msa compact multiple sequence alignment data structure [in]
227  * @param alphabet_size number of elements in the alphabet that makes up the
228  * aligned characters in the multiple sequence alignment [in]
229  * @return newly allocated structure or NULL in case of memory allocation
230  * failure
231  */
233 _PSIMsa*
234 _PSIMsaNew(const _PSIPackedMsa* packed_msa, Uint4 alphabet_size);
235 
236 /** Deallocates the _PSIMsa data structure.
237  * @param msa multiple sequence alignment data structure to deallocate [in]
238  * @return NULL
239  */
241 _PSIMsa*
242 _PSIMsaFree(_PSIMsa* msa);
243 
244 /** Internal representation of a PSSM in various stages of its creation and its
245  * dimensions. */
246 typedef struct _PSIInternalPssmData {
247  Uint4 ncols; /**< number of columns (query_length) */
248  Uint4 nrows; /**< number of rows (alphabet_size) */
249  int** pssm; /**< PSSM (scores) */
250  int** scaled_pssm; /**< scaled PSSM (scores) */
251  double** freq_ratios; /**< frequency ratios */
252  double* pseudocounts; /**< pseudocount constant for each column */
254 
255 /** Allocates a new _PSIInternalPssmData structure.
256  * @param query_length number of columns for the PSSM [in]
257  * @param alphabet_size number of rows for the PSSM [in]
258  * @return newly allocated structure or NULL in case of memory allocation
259  * failure
260  */
263 _PSIInternalPssmDataNew(Uint4 query_length, Uint4 alphabet_size);
264 
265 /** Deallocates the _PSIInternalPssmData structure.
266  * @param pssm data structure to deallocate [in]
267  * @return NULL
268  */
272 
273 /** This structure keeps track of the regions aligned between the query
274  * sequence and those that were not purged. It is used when calculating the
275  * sequence weights (replaces posExtents in old code) */
276 typedef struct _PSIAlignedBlock {
277  SSeqRange* pos_extnt; /**< Dynamically allocated array of size
278  query_length to keep track of the extents
279  of each aligned position */
280 
281  Uint4* size; /**< Dynamically allocated array of size
282  query_length that contains the size of the
283  intervals in the array above */
285 
286 /** Allocates and initializes the _PSIAlignedBlock structure.
287  * @param query_length length of the query sequence of the multiple
288  * sequence alignment [in]
289  * @return newly allocated structure or NULL in case of memory allocation
290  * failure
291  */
294 _PSIAlignedBlockNew(Uint4 query_length);
295 
296 /** Deallocates the _PSIAlignedBlock structure.
297  * @param aligned_blocks data structure to deallocate [in]
298  * @return NULL
299  */
302 _PSIAlignedBlockFree(_PSIAlignedBlock* aligned_blocks);
303 
304 /** Internal data structure to keep computed sequence weights */
305 typedef struct _PSISequenceWeights {
306  /* Some notes on how elements of this structures corresponds to elements
307  of structures defined in posit.h:
308 
309  _PSISequenceWeights->match_weights is the same as posSearch->posMatchWeights
310  _PSISequenceWeights->gapless_column_weights is the same as
311  posSearch->posGaplessColumnWeights
312  _PSISequenceWeights->norm_seq_weights is the same as posSearch->posA
313  _PSISequenceWeights->row_sigma is the same as posSearch->posRowSigma
314  */
315  double** match_weights; /**< weighted observed residue frequencies (f_i
316  in 2001 paper). (dimensions: query_length
317  x BlastScoreBlk's alphabet_size) */
318  Uint4 match_weights_size; /**< kept for help deallocate the field above */
319 
320  double* norm_seq_weights; /**< Stores the normalized sequence weights
321  (length: num_seqs + 1) */
322  double* row_sigma; /**< array of length num_seqs + 1 */
323  /* Sigma: number of different characters occurring in matches within a
324  * multi-alignment block - FIXME: why is it a double? */
325  double* sigma; /**< array of length query_length */
326 
327  double* std_prob; /**< standard amino acid probabilities */
328 
329  /* This fields is required for important diagnostic output, they are
330  * copied into diagnostics structure */
331  double* gapless_column_weights; /**< FIXME */
332  int** posDistinctDistrib; /**< For position i, how many positions in its block
333  have j distinct letters. Copied from
334  posit.h:posSearchItems.posDistinctDistrib */
335  Uint4 posDistinctDistrib_size; /**< Kept to deallocate field above. */
336  int* posNumParticipating; /**< number of sequences at each position. Copied from
337  posit.h:posSearchItems.posNumParticipating */
338  double* independent_observations; /**< Number of independent sequences
339  per column */
340 
342 
343 /** Allocates and initializes the _PSISequenceWeights structure.
344  * @param dims structure containing the multiple sequence alignment dimensions
345  * [in]
346  * @param sbp score block structure initialized for the scoring system used
347  * with the query sequence [in]
348  * @return newly allocated structure or NULL in case of memory allocation
349  * failure
350  */
353 _PSISequenceWeightsNew(const PSIMsaDimensions* dims, const BlastScoreBlk* sbp);
354 
355 /** Deallocates the _PSISequenceWeights structure.
356  * @param seq_weights data structure to deallocate [in]
357  * @return NULL
358  */
362 
363 /* Return values for internal PSI-BLAST functions */
364 
365 /** Successful operation */
366 #define PSI_SUCCESS (0)
367 /** Bad parameter used in function */
368 #define PSIERR_BADPARAM (-1)
369 /** Out of memory */
370 #define PSIERR_OUTOFMEM (-2)
371 /** Sequence weights do not add to 1 */
372 #define PSIERR_BADSEQWEIGHTS (-3)
373 /** No frequency ratios were found for the given scoring matrix */
374 #define PSIERR_NOFREQRATIOS (-4)
375 /** Positive average score found when scaling matrix */
376 #define PSIERR_POSITIVEAVGSCORE (-5)
377 /** After purge stage of PSSM creation, no sequences are left */
378 #define PSIERR_NOALIGNEDSEQS (-6)
379 /** GAP residue found in query sequence */
380 #define PSIERR_GAPINQUERY (-7)
381 /** Found an entire column with no participating sequences */
382 #define PSIERR_UNALIGNEDCOLUMN (-8)
383 /** Found an entire column full of GAP residues */
384 #define PSIERR_COLUMNOFGAPS (-9)
385 /** Found flanking gap at start of alignment */
386 #define PSIERR_STARTINGGAP (-10)
387 /** Found flanking gap at end of alignment */
388 #define PSIERR_ENDINGGAP (-11)
389 /** Errors in conserved domain profile */
390 #define PSIERR_BADPROFILE (-12)
391 /** Unknown error */
392 #define PSIERR_UNKNOWN (-255)
393 
394 /****************************************************************************/
395 /* Function prototypes for the various stages of the PSSM generation engine */
396 
397 /** Main function for keeping only those selected sequences for PSSM
398  * construction (stage 2). After this function the multiple sequence alignment
399  * data will not be modified.
400  * @sa implementation of PSICreatePssmWithDiagnostics
401  * @param msa multiple sequence alignment data structure [in]
402  * @return PSIERR_BADPARAM if alignment is NULL; PSI_SUCCESS otherwise
403  */
405 int
407 
408 /** Main validation function for multiple sequence alignment structure. Should
409  * be called after _PSIPurgeBiasedSegments.
410  * @param msa multiple sequence alignment data structure [in]
411  * @param ignored_unaligned_positions determines whether the unaligned
412  * positions test should be performend or not [in]
413  * @return One of the errors defined above if validation fails or bad
414  * parameter is passed in, else PSI_SUCCESS
415  */
417 int
418 _PSIValidateMSA(const _PSIMsa* msa, Boolean ignored_unaligned_positions);
419 
420 /** Main function to compute aligned blocks' properties for each position
421  * within multiple alignment (stage 3)
422  * Corresponds to posit.c:posComputeExtents
423  * @param msa multiple sequence alignment data structure [in]
424  * @param aligned_block data structure describing the aligned blocks'
425  * properties for each position of the multiple sequence alignment [out]
426  * @return PSIERR_BADPARAM if arguments are NULL
427  * PSI_SUCCESS otherwise
428  */
430 int
432  _PSIAlignedBlock* aligned_block);
433 
434 /** Main function to calculate the sequence weights. Should be called with the
435  * return value of PSIComputeAlignmentBlocks (stage 4)
436  * Corresponds to posit.c:posComputeSequenceWeights
437  * @param msa multiple sequence alignment data structure [in]
438  * @param aligned_blocks data structure describing the aligned blocks'
439  * properties for each position of the multiple sequence alignment [in]
440  * @param nsg_compatibility_mode set to true to emulate the structure group's
441  * use of PSSM engine in the cddumper application. By default should be FALSE
442  * [in]
443  * @param seq_weights data structure containing the data needed to compute the
444  * sequence weights [out]
445  * @return PSIERR_BADPARAM if arguments are NULL, PSIERR_OUTOFMEM in case of
446  * memory allocation failure, PSIERR_BADSEQWEIGHTS if the sequence weights fail
447  * to add up to 1.0, PSI_SUCCESS otherwise
448  */
450 int
452  const _PSIAlignedBlock* aligned_blocks,
453  Boolean nsg_compatibility_mode,
454  _PSISequenceWeights* seq_weights);
455 
456 /** Main function to calculate CD weights and combine weighted residue counts
457  * from matched CDs
458  * @param cd_msa multiple alignment of conserved domains data structure [in]
459  * @param sbp BLAST score block [in]
460  * @param options CDD-related options [in]
461  * @param seq_weights data structure with CD frequencies [out]
462  */
464 int
466  BlastScoreBlk* sbp,
467  const PSIBlastOptions* options,
468  _PSISequenceWeights* seq_weights);
469 
470 
471 /** Main function to compute the PSSM's frequency ratios (stage 5).
472  * Implements formula 2 in Nucleic Acids Research, 2001, Vol 29, No 14.
473  * Corresponds to posit.c:posComputePseudoFreqs
474  * @param msa multiple sequence alignment data structure [in]
475  * @param seq_weights data structure containing the data needed to compute the
476  * sequence weights [in]
477  * @param sbp score block structure initialized for the scoring system used
478  * with the query sequence [in]
479  * @param aligned_blocks data structure describing the aligned blocks'
480  * properties for each position of the multiple sequence alignment [in]
481  * @param pseudo_count pseudo count constant [in]
482  * @param nsg_compatibility_mode set to true to emulate the structure group's
483  * use of PSSM engine in the cddumper application. By default should be FALSE
484  * @param internal_pssm PSSM being computed [out]
485  * @return PSIERR_BADPARAM if arguments are NULL, PSI_SUCCESS otherwise
486  */
488 int
489 _PSIComputeFreqRatios(const _PSIMsa* msa,
490  const _PSISequenceWeights* seq_weights,
491  const BlastScoreBlk* sbp,
492  const _PSIAlignedBlock* aligned_blocks,
493  Int4 pseudo_count,
494  Boolean nsg_compatibility_mode,
495  _PSIInternalPssmData* internal_pssm);
496 
497 /** Main function to compute CD-based PSSM's frequency ratios
498  * @param cd_msa multiple alignment of CDs [in]
499  * @param seq_weights contains weighted residue frequencies and effective number
500  * of observations [in]
501  * @param sbp initialized score block data structure [in]
502  * @param pseudo_count pseudo count constant [in]
503  * @param internal_pssm PSSM [out]
504  * @return status
505  */
507 int
509  const _PSISequenceWeights* seq_weights,
510  const BlastScoreBlk* sbp,
511  Int4 pseudo_count,
512  _PSIInternalPssmData* internal_pssm);
513 
514 
515 /** Converts the PSSM's frequency ratios obtained in the previous stage to a
516  * PSSM of scores. (stage 6)
517  * @param internal_pssm PSSM being computed [in|out]
518  * @param query query sequence in ncbistdaa encoding. The length of this
519  * sequence is read from internal_pssm->ncols [in]
520  * @param sbp score block structure initialized for the scoring system used
521  * with the query sequence [in]
522  * @param std_probs array containing the standard residue probabilities [in]
523  * @return PSIERR_BADPARAM if arguments are NULL, PSI_SUCCESS otherwise
524  */
526 int
528  const Uint1* query,
529  const BlastScoreBlk* sbp,
530  const double* std_probs);
531 
532 /** Scales the PSSM (stage 7)
533  * @param query query sequence in ncbistdaa encoding. The length of this
534  * sequence is read from internal_pssm->ncols [in]
535  * @param std_probs array containing the standard background residue
536  * probabilities [in]
537  * @param internal_pssm PSSM being computed [in|out]
538  * @param sbp score block structure initialized for the scoring system used
539  * with the query sequence [in|out]
540  * @return PSIERR_BADPARAM if arguments are NULL, PSIERR_POSITIVEAVGSCORE if
541  * the average score of the generated PSSM is positive, PSI_SUCCESS otherwise
542  */
544 int
546  const double* std_probs,
547  _PSIInternalPssmData* internal_pssm,
548  BlastScoreBlk* sbp);
549 
550 /****************************************************************************/
551 /* Function prototypes for auxiliary functions for the stages above */
552 
553 
554 /** Updates the Karlin-Altschul parameters based on the query sequence and
555  * PSSM's score frequencies. Port of blastool.c's updateLambdaK
556  * @param pssm PSSM [in]
557  * @param query query sequence in ncbistdaa encoding [in]
558  * @param query_length length of the query sequence above [in]
559  * @param std_probs array containing the standard background residue
560  * probabilities [in]
561  * @param sbp Score block structure where the calculated lambda and K will be
562  * returned [in|out]
563  */
565 void
566 _PSIUpdateLambdaK(const int** pssm,
567  const Uint1* query,
568  Uint4 query_length,
569  const double* std_probs,
570  BlastScoreBlk* sbp);
571 
572 /** Provides a similar function to _PSIScaleMatrix but it performs the scaling
573  * as IMPALA did, i.e.: allowing the specification of a scaling factor and when
574  * calculating the score probabilities, the query length includes 'X' residues.
575  * @todo Ideally all scaling code should be refactored so that it is
576  * consolidated, eliminating the need for blast_posit.[hc]. Please note that
577  * blast_kappa.c's scalePosMatrix also does something very similar.
578  * @todo remove std_probs as it's not used
579  */
581 int
582 _IMPALAScaleMatrix(const Uint1* query, const double* std_probs,
583  _PSIInternalPssmData* internal_pssm,
584  BlastScoreBlk* sbp,
585  double scaling_factor);
586 
587 /** Marks the (start, stop] region corresponding to sequence seq_index in
588  * alignment so that it is not further considered for PSSM calculation.
589  * Note that the query sequence cannot be purged.
590  * @param msa multiple sequence alignment data [in|out]
591  * @param seq_index index of the sequence of interested in alignment [in]
592  * @param start start of the region to remove [in]
593  * @param stop stop of the region to remove [in]
594  * @return PSIERR_BADPARAM if no alignment is given, or if seq_index or stop
595  * are invalid, PSI_SUCCESS otherwise
596  */
598 int
600  unsigned int seq_index,
601  unsigned int start,
602  unsigned int stop);
603 
604 /** Counts the number of sequences matching the query per query position
605  * (columns of the multiple alignment) as well as the number of residues
606  * present in each position of the query.
607  * Should be called after multiple alignment data has been purged from biased
608  * sequences.
609  * @param msa multiple sequence alignment structure [in|out]
610  */
612 void
614 
615 /** Calculates the length of the sequence without including any 'X' residues.
616  * used in kappa.c
617  * @param seq sequence to examine [in]
618  * @param length length of the sequence above [in]
619  * @return number of non-X residues in the sequence
620  */
622 Uint4
623 _PSISequenceLengthWithoutX(const Uint1* seq, Uint4 length);
624 
625 /** Compute the probabilities for each score in the PSSM.
626  * This is only valid for protein sequences.
627  * FIXME: Should this be moved to blast_stat.[hc]?
628  * used in kappa.c in notposfillSfp()
629  * @param pssm PSSM for which to compute the score probabilities [in]
630  * @param query query sequence for the PSSM above in ncbistdaa encoding [in]
631  * @param query_length length of the query sequence above [in]
632  * @param std_probs array containing the standard background residue
633  * probabilities [in]
634  * @param sbp score block structure initialized for the scoring system used
635  * with the query sequence [in]
636  * @return structure containing the score frequencies, or NULL in case of error
637  */
640 _PSIComputeScoreProbabilities(const int** pssm,
641  const Uint1* query,
642  Uint4 query_length,
643  const double* std_probs,
644  const BlastScoreBlk* sbp);
645 
646 /** Collects diagnostic information from the process of creating the PSSM
647  * @param msa multiple sequence alignment data structure [in]
648  * @param aligned_block aligned regions' extents [in]
649  * @param seq_weights sequence weights data structure [in]
650  * @param internal_pssm structure containing PSSM's frequency ratios [in]
651  * @param diagnostics output parameter [out]
652  * @return PSI_SUCCESS on success, PSIERR_OUTOFMEM if memory allocation fails
653  * or PSIERR_BADPARAM if any of its arguments is NULL
654  */
656 int
657 _PSISaveDiagnostics(const _PSIMsa* msa,
658  const _PSIAlignedBlock* aligned_block,
659  const _PSISequenceWeights* seq_weights,
660  const _PSIInternalPssmData* internal_pssm,
661  PSIDiagnosticsResponse* diagnostics);
662 
663 /** Collects diagnostic information from the process of creating the CDD-based
664  * PSSM
665  * @param cd_msa multiple alignment of CDs data structure [in]
666  * @param seq_weights sequence weights data structure [in]
667  * @param internal_pssm structure containing PSSM's frequency ratios [in]
668  * @param diagnostics output parameter [out]
669  * @return PSI_SUCCESS on success, PSIERR_OUTOFMEM if memory allocation fails
670  * or PSIERR_BADPARAM if any of its arguments is NULL
671  */
672 int
674  const _PSISequenceWeights* seq_weights,
675  const _PSIInternalPssmData* internal_pssm,
676  PSIDiagnosticsResponse* diagnostics);
677 
678 
679 /** Calculates the information content from the scoring matrix
680  * @param score_mat alphabet by alphabet_sz matrix of scores (const) [in]
681  * @param std_prob standard residue probabilities [in]
682  * @param query query sequence [in]
683  * @param query_length length of the query [in]
684  * @param alphabet_sz length of the alphabet used by the query [in]
685  * @param lambda lambda parameter [in] FIXME documentation
686  * @return array of length query_length containing the information content per
687  * query position or NULL on error (e.g.: out-of-memory or NULL parameters)
688  */
690 double*
692  Int4** score_mat,
693  const double* std_prob,
694  const Uint1* query,
695  Uint4 query_length,
696  Uint4 alphabet_sz,
697  double lambda);
698 
699 /** Calculates the information content from the residue frequencies calculated
700  * in stage 5 of the PSSM creation algorithm
701  * Corresponds to posit.c:posFreqsToInformation
702  * @sa _PSIComputeFreqRatios: stage 5
703  * @param freq_ratios matrix of frequency ratios (dimensions: query_length x
704  * alphabet_sz) (const) [in]
705  * @param std_prob standard residue probabilities [in]
706  * @param query_length length of the query [in]
707  * @param alphabet_sz length of the alphabet used by the query [in]
708  * @return array of length query_length containing the information content per
709  * query position or NULL on error (e.g.: out-of-memory or NULL parameters)
710  */
712 double*
714  double** freq_ratios,
715  const double* std_prob,
716  Uint4 query_length,
717  Uint4 alphabet_sz);
718 
719 #ifdef DEBUG_PSSM_ENGINE
720 void __printMsa(const char* filename, const _PSIPackedMsa* msa);
721 void __printMsaFP(FILE* fp, const _PSIPackedMsa* msa);
722 #endif /* DEBUG_PSSM_ENGINE */
723 
724 /** Enable NCBI structure group customization to discard the query sequence,
725  * as this really isn't the result of a PSI-BLAST iteration, but rather an
726  * artificial consensus sequence of the multiple sequence alignment
727  * constructed by them. This should be called after _PSIPurgeBiasedSegments.
728  */
730 void
732 
733 /** Structure group validation function for multiple sequence alignment
734  * structure. Should be called after _PSIStructureGroupCustomization.
735  * @param msa multiple sequence alignment data structure [in]
736  * @return One of the errors defined above if validation fails or bad
737  * parameter is passed in, else PSI_SUCCESS
738  */
740 int
742 
743 /** Validation of multiple alignment of conserved domains structure
744  * @param cd_msa multiple alignment of CDs [in]
745  * @param alphabet_size alphabet size [in]
746  * @return One of the errors defined above if validation fails or bad
747  * parameter is passed in, else PSI_SUCCESS
748  */
750 int
751 _PSIValidateCdMSA(const PSICdMsa* cd_msa, Uint4 alphabet_size);
752 
753 #ifdef __cplusplus
754 }
755 #endif
756 
757 #endif /* !ALGO_BLAST_CORE__BLAST_PSI_PRIV__H */
#define NCBI_XBLAST_EXPORT
NULL operations for other cases.
Definition: blast_export.h:65
High level definitions and declarations for the PSSM engine of PSI-BLAST.
int _PSIConvertFreqRatiosToPSSM(_PSIInternalPssmData *internal_pssm, const Uint1 *query, const BlastScoreBlk *sbp, const double *std_probs)
Converts the PSSM's frequency ratios obtained in the previous stage to a PSSM of scores.
int _PSIComputeAlignmentBlocks(const _PSIMsa *msa, _PSIAlignedBlock *aligned_block)
Main function to compute aligned blocks' properties for each position within multiple alignment (stag...
int _PSIComputeFreqRatios(const _PSIMsa *msa, const _PSISequenceWeights *seq_weights, const BlastScoreBlk *sbp, const _PSIAlignedBlock *aligned_blocks, Int4 pseudo_count, Boolean nsg_compatibility_mode, _PSIInternalPssmData *internal_pssm)
Main function to compute the PSSM's frequency ratios (stage 5).
void ** _PSIAllocateMatrix(unsigned int ncols, unsigned int nrows, unsigned int data_type_sz)
Generic 2 dimensional matrix allocator.
void _PSIStructureGroupCustomization(_PSIMsa *msa)
Enable NCBI structure group customization to discard the query sequence, as this really isn't the res...
struct _PSIPackedMsaCell _PSIPackedMsaCell
Compact version of the PSIMsaCell structure.
int _PSIComputeFreqRatiosFromCDs(const PSICdMsa *cd_msa, const _PSISequenceWeights *seq_weights, const BlastScoreBlk *sbp, Int4 pseudo_count, _PSIInternalPssmData *internal_pssm)
Main function to compute CD-based PSSM's frequency ratios.
_PSIInternalPssmData * _PSIInternalPssmDataNew(Uint4 query_length, Uint4 alphabet_size)
Allocates a new _PSIInternalPssmData structure.
_PSIAlignedBlock * _PSIAlignedBlockNew(Uint4 query_length)
Allocates and initializes the _PSIAlignedBlock structure.
unsigned int _PSIPackedMsaGetNumberOfAlignedSeqs(const _PSIPackedMsa *msa)
Retrieve the number of aligned sequences in the compact multiple sequence alignment.
int _PSIComputeSequenceWeights(const _PSIMsa *msa, const _PSIAlignedBlock *aligned_blocks, Boolean nsg_compatibility_mode, _PSISequenceWeights *seq_weights)
Main function to calculate the sequence weights.
int _PSISaveCDDiagnostics(const PSICdMsa *msa, const _PSISequenceWeights *seq_weights, const _PSIInternalPssmData *internal_pssm, PSIDiagnosticsResponse *diagnostics)
Collects diagnostic information from the process of creating the CDD-based PSSM.
int _PSISaveDiagnostics(const _PSIMsa *msa, const _PSIAlignedBlock *aligned_block, const _PSISequenceWeights *seq_weights, const _PSIInternalPssmData *internal_pssm, PSIDiagnosticsResponse *diagnostics)
Collects diagnostic information from the process of creating the PSSM.
double * _PSICalculateInformationContentFromScoreMatrix(Int4 **score_mat, const double *std_prob, const Uint1 *query, Uint4 query_length, Uint4 alphabet_sz, double lambda)
Calculates the information content from the scoring matrix.
Blast_ScoreFreq * _PSIComputeScoreProbabilities(const int **pssm, const Uint1 *query, Uint4 query_length, const double *std_probs, const BlastScoreBlk *sbp)
Compute the probabilities for each score in the PSSM.
struct _PSISequenceWeights _PSISequenceWeights
Internal data structure to keep computed sequence weights.
int _PSIValidateMSA(const _PSIMsa *msa, Boolean ignored_unaligned_positions)
Main validation function for multiple sequence alignment structure.
int _PSIPurgeBiasedSegments(_PSIPackedMsa *msa)
Main function for keeping only those selected sequences for PSSM construction (stage 2).
_PSIInternalPssmData * _PSIInternalPssmDataFree(_PSIInternalPssmData *pssm)
Deallocates the _PSIInternalPssmData structure.
int _PSIComputeFrequenciesFromCDs(const PSICdMsa *cd_msa, BlastScoreBlk *sbp, const PSIBlastOptions *options, _PSISequenceWeights *seq_weights)
Main function to calculate CD weights and combine weighted residue counts from matched CDs.
const int kPSIScaleFactor
Successor to POSIT_SCALE_FACTOR.
const double kPSINearIdentical
Percent identity threshold for discarding near-identical matches.
const Uint4 kPositScalingNumIterations
Constant used in scaling PSSM routines: Successor to POSIT_NUM_ITERATIONS.
_PSISequenceWeights * _PSISequenceWeightsFree(_PSISequenceWeights *seq_weights)
Deallocates the _PSISequenceWeights structure.
void ** _PSIDeallocateMatrix(void **matrix, unsigned int ncols)
Generic 2 dimensional matrix deallocator.
void _PSICopyMatrix_int(int **dest, int **src, unsigned int ncols, unsigned int nrows)
Copies src matrix into dest matrix, both of which must be int matrices with dimensions ncols by nrows...
Uint4 _PSISequenceLengthWithoutX(const Uint1 *seq, Uint4 length)
Calculates the length of the sequence without including any 'X' residues.
struct _PSIAlignedBlock _PSIAlignedBlock
This structure keeps track of the regions aligned between the query sequence and those that were not ...
void _PSIUpdateLambdaK(const int **pssm, const Uint1 *query, Uint4 query_length, const double *std_probs, BlastScoreBlk *sbp)
Updates the Karlin-Altschul parameters based on the query sequence and PSSM's score frequencies.
struct _PSIMsa _PSIMsa
Internal multiple alignment data structure used by the PSSM engine.
const unsigned int kQueryIndex
Index into multiple sequence alignment structure for the query sequence.
struct _PSIInternalPssmData _PSIInternalPssmData
Internal representation of a PSSM in various stages of its creation and its dimensions.
_PSISequenceWeights * _PSISequenceWeightsNew(const PSIMsaDimensions *dims, const BlastScoreBlk *sbp)
Allocates and initializes the _PSISequenceWeights structure.
void _PSICopyMatrix_double(double **dest, double **src, unsigned int ncols, unsigned int nrows)
Copies src matrix into dest matrix, both of which must be double matrices with dimensions ncols by nr...
int _PSIValidateMSA_StructureGroup(const _PSIMsa *msa)
Structure group validation function for multiple sequence alignment structure.
int _PSIScaleMatrix(const Uint1 *query, const double *std_probs, _PSIInternalPssmData *internal_pssm, BlastScoreBlk *sbp)
Scales the PSSM (stage 7)
int _PSIPurgeAlignedRegion(_PSIPackedMsa *msa, unsigned int seq_index, unsigned int start, unsigned int stop)
Marks the (start, stop] region corresponding to sequence seq_index in alignment so that it is not fur...
_PSIMsa * _PSIMsaFree(_PSIMsa *msa)
Deallocates the _PSIMsa data structure.
_PSIAlignedBlock * _PSIAlignedBlockFree(_PSIAlignedBlock *aligned_blocks)
Deallocates the _PSIAlignedBlock structure.
const double kEpsilon
Small constant to test against 0.
double * _PSICalculateInformationContentFromFreqRatios(double **freq_ratios, const double *std_prob, Uint4 query_length, Uint4 alphabet_sz)
Calculates the information content from the residue frequencies calculated in stage 5 of the PSSM cre...
struct _PSIMsaCell _PSIMsaCell
Internal data structure to represent a position in the multiple sequence alignment data structure.
int _PSIValidateCdMSA(const PSICdMsa *cd_msa, Uint4 alphabet_size)
Validation of multiple alignment of conserved domains structure.
void _PSIUpdatePositionCounts(_PSIMsa *msa)
Counts the number of sequences matching the query per query position (columns of the multiple alignme...
const double kPSIIdentical
Percent identity threshold for discarding identical matches.
_PSIPackedMsa * _PSIPackedMsaNew(const PSIMsa *msa)
Allocates and initializes the compact version of the PSIMsa structure (makes a deep copy) for interna...
struct _PSIPackedMsa _PSIPackedMsa
Compact version of PSIMsa structure.
const double kPositScalingPercent
Constant used in scaling PSSM routines: Successor to POSIT_PERCENT.
int _IMPALAScaleMatrix(const Uint1 *query, const double *std_probs, _PSIInternalPssmData *internal_pssm, BlastScoreBlk *sbp, double scaling_factor)
Provides a similar function to _PSIScaleMatrix but it performs the scaling as IMPALA did,...
_PSIMsa * _PSIMsaNew(const _PSIPackedMsa *packed_msa, Uint4 alphabet_size)
Allocates and initializes the internal version of the PSIMsa structure (makes a deep copy) for intern...
_PSIPackedMsa * _PSIPackedMsaFree(_PSIPackedMsa *msa)
Deallocates the _PSIMsa data structure.
Definitions and prototypes used by blast_stat.c to calculate BLAST statistics.
static const char fp[]
Definition: des.c:87
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
Interface to retrieve the frequency ratios for various scoring matrices.
Type and macro definitions from C toolkit that are not defined in C++ toolkit.
Uint1 Boolean
bool replacment for C
Definition: ncbi_std.h:94
double lambda(size_t dimMatrix_, const Int4 *const *scoreMatrix_, const double *q_)
Structure used for scoring calculations.
Definition: blast_stat.h:177
Holds score frequencies used in calculation of Karlin-Altschul parameters for an ungapped search.
Definition: blast_stat.h:128
Options used in protein BLAST only (PSI, PHI, RPS and translated BLAST) Some of these possibly should...
Data structure representing multiple alignemnt of CDs and query sequence along with data needed for P...
Definition: blast_psi.h:134
This structure contains the diagnostics information requested using the PSIDiagnosticsRequest structu...
Definition: blast_psi.h:201
Structure representing the dimensions of the multiple sequence alignment data structure.
Definition: blast_psi.h:57
Multiple sequence alignment (msa) data structure containing the raw data needed by the PSSM engine to...
Definition: blast_psi.h:75
A structure containing two integers, used e.g.
Definition: blast_def.h:155
This structure keeps track of the regions aligned between the query sequence and those that were not ...
SSeqRange * pos_extnt
Dynamically allocated array of size query_length to keep track of the extents of each aligned positio...
Uint4 * size
Dynamically allocated array of size query_length that contains the size of the intervals in the array...
Internal representation of a PSSM in various stages of its creation and its dimensions.
int ** scaled_pssm
scaled PSSM (scores)
Uint4 nrows
number of rows (alphabet_size)
double * pseudocounts
pseudocount constant for each column
int ** pssm
PSSM (scores)
Uint4 ncols
number of columns (query_length)
double ** freq_ratios
frequency ratios
Internal data structure to represent a position in the multiple sequence alignment data structure.
unsigned int letter
Preferred letter at this position.
SSeqRange extents
Extents of this aligned position.
unsigned int is_aligned
Is this letter part of the alignment?
Internal multiple alignment data structure used by the PSSM engine.
Uint4 * num_matching_seqs
number of sequences aligned at a given position in the multiple sequence alignment (length: query_len...
Uint1 * query
query sequence (length: query_length)
Uint4 ** residue_counts
matrix to keep track of the raw residue counts at each position of the multiple sequence alignment (d...
PSIMsaDimensions * dimensions
dimensions of field below
Uint4 alphabet_size
number of elements in alphabet
_PSIMsaCell ** cell
multiple sequence alignment matrix (dimensions: query_length x num_seqs + 1)
Compact version of the PSIMsaCell structure.
unsigned int letter
Preferred letter at this position, in ncbistdaa encoding.
unsigned int is_aligned
Is this letter part of the alignment?
Compact version of PSIMsa structure.
PSIMsaDimensions * dimensions
dimensions of the msa
Boolean * use_sequence
used to indicate whether a sequence should be used for further processing by the engine (length: num_...
_PSIPackedMsaCell ** data
actual data, dimensions are (dimensions->num_seqs+1) by (dimensions->query_length)
Internal data structure to keep computed sequence weights.
double ** match_weights
weighted observed residue frequencies (f_i in 2001 paper).
Uint4 posDistinctDistrib_size
Kept to deallocate field above.
double * std_prob
standard amino acid probabilities
double * norm_seq_weights
Stores the normalized sequence weights (length: num_seqs + 1)
int ** posDistinctDistrib
For position i, how many positions in its block have j distinct letters.
double * independent_observations
Number of independent sequences per column.
int * posNumParticipating
number of sequences at each position.
double * sigma
array of length query_length
double * gapless_column_weights
FIXME.
Uint4 match_weights_size
kept for help deallocate the field above
double * row_sigma
array of length num_seqs + 1
static string query
Modified on Wed Sep 04 15:01:55 2024 by modify_doxy.py rev. 669887