NCBI C++ ToolKit
blast_psi_priv.c
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* ===========================================================================
2  *
3  * PUBLIC DOMAIN NOTICE
4  * National Center for Biotechnology Information
5  *
6  * This software/database is a "United States Government Work" under the
7  * terms of the United States Copyright Act. It was written as part of
8  * the author's official duties as a United States Government employee and
9  * thus cannot be copyrighted. This software/database is freely available
10  * to the public for use. The National Library of Medicine and the U.S.
11  * Government have not placed any restriction on its use or reproduction.
12  *
13  * Although all reasonable efforts have been taken to ensure the accuracy
14  * and reliability of the software and data, the NLM and the U.S.
15  * Government do not and cannot warrant the performance or results that
16  * may be obtained by using this software or data. The NLM and the U.S.
17  * Government disclaim all warranties, express or implied, including
18  * warranties of performance, merchantability or fitness for any particular
19  * purpose.
20  *
21  * Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  * Author: Alejandro Schaffer, ported by Christiam Camacho
26  *
27  */
28 
29 /** @file blast_psi_priv.c
30  * Defintions for functions in private interface for Position Iterated BLAST
31  * API.
32  */
33 
34 #include "blast_psi_priv.h"
35 #include "blast_posit.h"
39 #include "blast_dynarray.h"
40 
42 
43 /****************************************************************************/
44 /* Use the following #define's to enable/disable functionality */
45 
46 /* Taking gaps into account when constructing a PSSM was introduced in the
47  * 2001 paper "Improving the accuracy of PSI-BLAST protein database searches
48  * with composition based-statistics and other refinements". This feature
49  * can be disabled by defining the PSI_IGNORE_GAPS_IN_COLUMNS symbol below */
50 /* #define PSI_IGNORE_GAPS_IN_COLUMNS */
51 /****************************************************************************/
52 
53 /****************************************************************************/
54 /* Constants */
55 const double kPSINearIdentical = 0.94;
56 const double kPSIIdentical = 1.0;
57 const unsigned int kQueryIndex = 0;
58 const double kEpsilon = 0.0001;
59 const int kPSIScaleFactor = 200;
60 const double kPositScalingPercent = 0.05;
62 
63 /****************************************************************************/
64 
65 void**
66 _PSIAllocateMatrix(unsigned int ncols, unsigned int nrows,
67  unsigned int data_type_sz)
68 {
69  void** retval = NULL;
70  unsigned int i = 0;
71 
72  retval = (void**) malloc(sizeof(void*) * ncols);
73  if ( !retval ) {
74  return NULL;
75  }
76 
77  for (i = 0; i < ncols; i++) {
78  retval[i] = (void*) calloc(nrows, data_type_sz);
79  if ( !retval[i] ) {
80  retval = _PSIDeallocateMatrix(retval, i);
81  break;
82  }
83  }
84  return retval;
85 }
86 
87 void**
88 _PSIDeallocateMatrix(void** matrix, unsigned int ncols)
89 {
90  unsigned int i = 0;
91 
92  if (!matrix)
93  return NULL;
94 
95  for (i = 0; i < ncols; i++) {
96  sfree(matrix[i]);
97  }
98  sfree(matrix);
99  return NULL;
100 }
101 
102 /** Implements the generic copy matrix functions. Prototypes must be defined
103  * in the header file manually following the naming convention for
104  * _PSICopyMatrix_int
105  */
106 #define DEFINE_COPY_MATRIX_FUNCTION(type) \
107 void _PSICopyMatrix_##type(type** dest, type** src, \
108  unsigned int ncols, unsigned int nrows) \
109 { \
110  unsigned int i = 0; \
111  unsigned int j = 0; \
112  \
113  ASSERT(dest); \
114  ASSERT(src); \
115  \
116  for (i = 0; i < ncols; i++) { \
117  for (j = 0; j < nrows; j++) { \
118  dest[i][j] = src[i][j]; \
119  } \
120  } \
121 } \
122 
125 
126 /****************************************************************************/
127 
130 {
131  _PSIPackedMsa* retval = NULL; /* the return value */
132  Uint4 s = 0; /* index on sequences */
133  Uint4 p = 0; /* index on positions */
134 
135  if ( !msa || !msa->dimensions || !msa->data ) {
136  return NULL;
137  }
138 
139  retval = (_PSIPackedMsa*) calloc(1, sizeof(_PSIPackedMsa));
140  if ( !retval ) {
141  return _PSIPackedMsaFree(retval);
142  }
143 
144  retval->dimensions = (PSIMsaDimensions*) malloc(sizeof(PSIMsaDimensions));
145  if ( !retval->dimensions ) {
146  return _PSIPackedMsaFree(retval);
147  }
148  memcpy((void*) retval->dimensions,
149  (void*) msa->dimensions,
150  sizeof(PSIMsaDimensions));
151 
152  retval->data = (_PSIPackedMsaCell**)
153  _PSIAllocateMatrix(msa->dimensions->num_seqs + 1,
154  msa->dimensions->query_length,
155  sizeof(_PSIPackedMsaCell));
156  if ( !retval->data ) {
157  return _PSIPackedMsaFree(retval);
158  }
159  /* Copy the multiple sequence alignment data */
160  for (s = 0; s < msa->dimensions->num_seqs + 1; s++) {
161  for (p = 0; p < msa->dimensions->query_length; p++) {
162  ASSERT(msa->data[s][p].letter <= BLASTAA_SIZE);
163  retval->data[s][p].letter = msa->data[s][p].letter;
164  retval->data[s][p].is_aligned = msa->data[s][p].is_aligned;
165  }
166  }
167 
168  retval->use_sequence = (Boolean*) malloc((msa->dimensions->num_seqs + 1) *
169  sizeof(Boolean));
170  if ( !retval->use_sequence ) {
171  return _PSIPackedMsaFree(retval);
172  }
173  /* All sequences are valid candidates for taking part in PSSM construction
174  * by default */
175  for (s = 0; s < msa->dimensions->num_seqs + 1; s++) {
176  retval->use_sequence[s] = TRUE;
177  }
178 
179  return retval;
180 }
181 
184 {
185  if ( !msa ) {
186  return NULL;
187  }
188 
189  if ( msa->data && msa->dimensions ) {
190  _PSIDeallocateMatrix((void**) msa->data,
191  msa->dimensions->num_seqs + 1);
192  msa->data = NULL;
193  }
194 
195  if ( msa->dimensions ) {
196  sfree(msa->dimensions);
197  }
198 
199  if ( msa->use_sequence ) {
200  sfree(msa->use_sequence);
201  }
202 
203  sfree(msa);
204 
205  return NULL;
206 }
207 
208 unsigned int
210 {
211  unsigned int retval = 0;
212  Uint4 i = 0;
213 
214  if ( !msa ) {
215  return retval;
216  }
217 
218  for (i = 0; i < msa->dimensions->num_seqs + 1; i++) {
219  if (msa->use_sequence[i]) {
220  retval++;
221  }
222  }
223 
224  return retval;
225 }
226 
227 /****************************************************************************/
228 
229 #ifdef DEBUG_PSSM_ENGINE
230 char GetResidue(char input)
231 {
233 }
234 
235 void
236 PrintMsa(const char* filename, const PSIMsa* msa)
237 {
238  FILE* fp = NULL;
239  ASSERT(msa);
240  ASSERT(filename);
241  fp = fopen(filename, "w");
242  PrintMsaFP(fp, msa);
243  fclose(fp);
244 }
245 
246 void
247 PrintMsaFP(FILE* fp, const PSIMsa* msa)
248 {
249  Uint4 i, j;
250 
251  ASSERT(msa);
252 
253  for (i = 0; i < msa->dimensions->num_seqs + 1; i++) {
254  fprintf(fp, "%3d\tGI=%10d\tEvalue=%2.0le\tBitScore=%4.1f\t", i,
255  msa->seqinfo[i].gi,
256  msa->seqinfo[i].evalue,
257  msa->seqinfo[i].bit_score);
258  for (j = 0; j < msa->dimensions->query_length; j++) {
259  if (msa->data[i][j].is_aligned) {
260  fprintf(fp, "%c", GetResidue(msa->data[i][j].letter));
261  } else {
262  fprintf(fp, ".");
263  }
264  }
265  fprintf(fp, "\n");
266  }
267 }
268 
269 void
270 __printPackedMsaFP(FILE* fp, const _PSIPackedMsa* msa)
271 {
272  Uint4 i, j;
273 
274  ASSERT(msa);
275  ASSERT(fp);
276 
277  for (i = 0; i < msa->dimensions->num_seqs + 1; i++) {
278  fprintf(fp, "%3d\t", i);
279  if ( !msa->use_sequence[i] ) {
280  fprintf(fp, "NOT USED\n");
281  continue;
282  }
283 
284  for (j = 0; j < msa->dimensions->query_length; j++) {
285  if (msa->data[i][j].is_aligned) {
286  fprintf(fp, "%c", GetResidue(msa->data[i][j].letter));
287  } else {
288  fprintf(fp, ".");
289  }
290  }
291  fprintf(fp, "\n");
292  }
293 }
294 
295 void
296 __printPackedMsa(const char* filename, const _PSIPackedMsa* msa)
297 {
298  FILE* fp = NULL;
299  ASSERT(msa);
300  ASSERT(filename);
301  fp = fopen(filename, "w");
302  __printPackedMsaFP(fp, msa);
303  fclose(fp);
304 }
305 #endif /* DEBUG_PSSM_ENGINE */
306 
307 _PSIMsa*
308 _PSIMsaNew(const _PSIPackedMsa* msa, Uint4 alphabet_size)
309 {
310  _PSIMsa* retval = NULL; /* the return value */
311  Uint4 s = 0; /* index on sequences */
312  Uint4 p = 0; /* index on positions */
313 
314  if ( !msa || !msa->dimensions || !msa->data ) {
315  return NULL;
316  }
317 
318  retval = (_PSIMsa*) calloc(1, sizeof(_PSIMsa));
319  if ( !retval ) {
320  return _PSIMsaFree(retval);
321  }
322 
323  retval->alphabet_size = alphabet_size;
324  retval->dimensions = (PSIMsaDimensions*) malloc(sizeof(PSIMsaDimensions));
325  if ( !retval->dimensions ) {
326  return _PSIMsaFree(retval);
327  }
330 
331  retval->cell = (_PSIMsaCell**)
333  retval->dimensions->query_length,
334  sizeof(_PSIMsaCell));
335  if ( !retval->cell ) {
336  return _PSIMsaFree(retval);
337  }
338  /* Copy the multiple sequence alignment data */
339  {
340  Uint4 ss = 0; /* column index into retval's _PSIMsaCell matrix */
341  for (s = 0; s < msa->dimensions->num_seqs + 1; s++) {
342 
343  if ( !msa->use_sequence[s] ) {
344  continue;
345  }
346 
347  for (p = 0; p < retval->dimensions->query_length; p++) {
348  retval->cell[ss][p].letter = msa->data[s][p].letter;
349  retval->cell[ss][p].is_aligned = msa->data[s][p].is_aligned;
350  retval->cell[ss][p].extents.left = -1;
351  retval->cell[ss][p].extents.right =
352  msa->dimensions->query_length;
353  }
354  ss++;
355  }
356  }
357 
358  retval->query = (Uint1*) malloc(retval->dimensions->query_length *
359  sizeof(Uint1));
360  if ( !retval->query ) {
361  return _PSIMsaFree(retval);
362  }
363  /* Initialize the query according to convention that first sequence in msa
364  * data structure corresponds to the query */
365  for (p = 0; p < retval->dimensions->query_length; p++) {
367  retval->query[p] = msa->data[kQueryIndex][p].letter;
368  }
369 
370  retval->residue_counts = (Uint4**)
372  alphabet_size,
373  sizeof(Uint4));
374  if ( !retval->residue_counts ) {
375  return _PSIMsaFree(retval);
376  }
377 
378  retval->num_matching_seqs = (Uint4*)
379  calloc(retval->dimensions->query_length, sizeof(Uint4));
380  if ( !retval->num_matching_seqs ) {
381  return _PSIMsaFree(retval);
382  }
383 
384  _PSIUpdatePositionCounts(retval);
385  return retval;
386 }
387 
388 _PSIMsa*
390 {
391  if ( !msa ) {
392  return NULL;
393  }
394 
395  if ( msa->cell && msa->dimensions ) {
396  _PSIDeallocateMatrix((void**) msa->cell,
397  msa->dimensions->num_seqs + 1);
398  msa->cell = NULL;
399  }
400 
401  if ( msa->query ) {
402  sfree(msa->query);
403  }
404 
405  if ( msa->residue_counts && msa->dimensions ) {
406  _PSIDeallocateMatrix((void**) msa->residue_counts,
407  msa->dimensions->query_length);
408  msa->residue_counts = NULL;
409  }
410 
411  if ( msa->num_matching_seqs ) {
412  sfree(msa->num_matching_seqs);
413  }
414 
415  if ( msa->dimensions ) {
416  sfree(msa->dimensions);
417  }
418 
419  sfree(msa);
420 
421  return NULL;
422 }
423 
425 _PSIInternalPssmDataNew(Uint4 query_length, Uint4 alphabet_size)
426 {
427  _PSIInternalPssmData* retval = NULL;
428 
429  retval = (_PSIInternalPssmData*) calloc(1, sizeof(_PSIInternalPssmData));
430  if ( !retval ) {
431  return NULL;
432  }
433 
434  retval->ncols = query_length;
435  retval->nrows = alphabet_size;
436 
437  retval->pssm = (int**) _PSIAllocateMatrix(retval->ncols,
438  retval->nrows,
439  sizeof(int));
440  if ( !retval->pssm ) {
441  return _PSIInternalPssmDataFree(retval);
442  }
443 
444  retval->scaled_pssm = (int**) _PSIAllocateMatrix(retval->ncols,
445  retval->nrows,
446  sizeof(int));
447  if ( !retval->scaled_pssm ) {
448  return _PSIInternalPssmDataFree(retval);
449  }
450 
451  retval->freq_ratios = (double**) _PSIAllocateMatrix(retval->ncols,
452  retval->nrows,
453  sizeof(double));
454  if ( !retval->freq_ratios ) {
455  return _PSIInternalPssmDataFree(retval);
456  }
457 
458  retval->pseudocounts = (double*) calloc(query_length, sizeof(double));
459 
460  if ( !retval->pseudocounts ) {
461  return _PSIInternalPssmDataFree(retval);
462  }
463 
464  return retval;
465 }
466 
469 {
470  if ( !pssm_data ) {
471  return NULL;
472  }
473 
474  if (pssm_data->pssm) {
475  pssm_data->pssm = (int**)
476  _PSIDeallocateMatrix((void**) pssm_data->pssm, pssm_data->ncols);
477  }
478 
479  if (pssm_data->scaled_pssm) {
480  pssm_data->scaled_pssm = (int**)
481  _PSIDeallocateMatrix((void**) pssm_data->scaled_pssm,
482  pssm_data->ncols);
483  }
484 
485  if (pssm_data->freq_ratios) {
486  pssm_data->freq_ratios = (double**)
487  _PSIDeallocateMatrix((void**) pssm_data->freq_ratios,
488  pssm_data->ncols);
489  }
490 
491  if (pssm_data->pseudocounts) {
492  sfree(pssm_data->pseudocounts);
493  }
494 
495  sfree(pssm_data);
496 
497  return NULL;
498 }
499 
502 {
503  _PSIAlignedBlock* retval = NULL;
504  Uint4 i = 0;
505 
506  retval = (_PSIAlignedBlock*) calloc(1, sizeof(_PSIAlignedBlock));
507  if ( !retval ) {
508  return NULL;
509  }
510 
511  retval->size = (Uint4*) calloc(query_length, sizeof(Uint4));
512  if ( !retval->size ) {
513  return _PSIAlignedBlockFree(retval);
514  }
515 
516  retval->pos_extnt = (SSeqRange*) malloc(query_length * sizeof(SSeqRange));
517  if ( !retval->pos_extnt ) {
518  return _PSIAlignedBlockFree(retval);
519  }
520 
521  /* N.B.: these initial values are deliberate so that the retval->size[i]
522  * field is initialized with a value that exceeds query_length in
523  * _PSIComputeAlignedRegionLengths and can be used as a sanity check */
524  for (i = 0; i < query_length; i++) {
525  retval->pos_extnt[i].left = -1;
526  retval->pos_extnt[i].right = query_length;
527  }
528  return retval;
529 }
530 
533 {
534  if ( !aligned_blocks ) {
535  return NULL;
536  }
537 
538  if (aligned_blocks->size) {
539  sfree(aligned_blocks->size);
540  }
541 
542  if (aligned_blocks->pos_extnt) {
543  sfree(aligned_blocks->pos_extnt);
544  }
545 
546  sfree(aligned_blocks);
547  return NULL;
548 }
549 
550 #define EFFECTIVE_ALPHABET 20 /**< size of alphabet used for pseudocounts calculations */
553  const BlastScoreBlk* sbp)
554 {
555  _PSISequenceWeights* retval = NULL;
556 
557  ASSERT(dimensions);
558  ASSERT(sbp);
559 
560  retval = (_PSISequenceWeights*) calloc(1, sizeof(_PSISequenceWeights));
561  if ( !retval ) {
562  return NULL;
563  }
564 
565  retval->row_sigma = (double*) calloc(dimensions->num_seqs + 1,
566  sizeof(double));
567  if ( !retval->row_sigma ) {
568  return _PSISequenceWeightsFree(retval);
569  }
570 
571  retval->norm_seq_weights = (double*) calloc(dimensions->num_seqs + 1,
572  sizeof(double));
573  if ( !retval->norm_seq_weights ) {
574  return _PSISequenceWeightsFree(retval);
575  }
576 
577  retval->sigma = (double*) calloc(dimensions->query_length, sizeof(double));
578  if ( !retval->sigma ) {
579  return _PSISequenceWeightsFree(retval);
580  }
581 
582  retval->match_weights = (double**)
583  _PSIAllocateMatrix(dimensions->query_length,
584  sbp->alphabet_size,
585  sizeof(double));
586  retval->match_weights_size = dimensions->query_length;
587  if ( !retval->match_weights ) {
588  return _PSISequenceWeightsFree(retval);
589  }
590 
592  if ( !retval->std_prob ) {
593  return _PSISequenceWeightsFree(retval);
594  }
595 
596  retval->gapless_column_weights = (double*)
597  calloc(dimensions->query_length, sizeof(double));
598  if ( !retval->gapless_column_weights ) {
599  return _PSISequenceWeightsFree(retval);
600  }
601 
602  retval->posDistinctDistrib = (int**)
603  _PSIAllocateMatrix(1+dimensions->query_length,
605  sizeof(int));
606  retval->posDistinctDistrib_size = 1+dimensions->query_length;
607  if ( !retval->posDistinctDistrib ) {
608  return _PSISequenceWeightsFree(retval);
609  }
610 
611  retval->posNumParticipating = (int*) calloc(1+dimensions->query_length, sizeof(int));
612  if ( !retval->posNumParticipating ) {
613  return _PSISequenceWeightsFree(retval);
614  }
615 
617  = (double*)calloc(1+dimensions->query_length, sizeof(double));
618  if ( !retval->independent_observations ) {
619  return _PSISequenceWeightsFree(retval);
620  }
621 
622 
623  return retval;
624 }
625 
628 {
629  if ( !seq_weights ) {
630  return NULL;
631  }
632 
633  if (seq_weights->row_sigma) {
634  sfree(seq_weights->row_sigma);
635  }
636 
637  if (seq_weights->norm_seq_weights) {
638  sfree(seq_weights->norm_seq_weights);
639  }
640 
641  if (seq_weights->sigma) {
642  sfree(seq_weights->sigma);
643  }
644 
645  if (seq_weights->match_weights) {
646  _PSIDeallocateMatrix((void**) seq_weights->match_weights,
647  seq_weights->match_weights_size);
648  }
649 
650  if (seq_weights->std_prob) {
651  sfree(seq_weights->std_prob);
652  }
653 
654  if (seq_weights->gapless_column_weights) {
655  sfree(seq_weights->gapless_column_weights);
656  }
657 
658  if (seq_weights->posDistinctDistrib) {
659  _PSIDeallocateMatrix((void**) seq_weights->posDistinctDistrib,
660  seq_weights->posDistinctDistrib_size);
661  }
662 
663  if (seq_weights->posNumParticipating) {
664  sfree(seq_weights->posNumParticipating);
665  }
666 
667  if (seq_weights->independent_observations) {
668  sfree(seq_weights->independent_observations);
669  }
670 
671  sfree(seq_weights);
672 
673  return NULL;
674 }
675 
676 /************** Validation routines *****************************************/
677 
678 /** Validate that there are no gaps in the query sequence
679  * @param msa multiple sequence alignment data structure [in]
680  * @return PSIERR_GAPINQUERY if validation fails, else PSI_SUCCESS
681  */
682 static int
684 {
685  const Uint1 kGapResidue = AMINOACID_TO_NCBISTDAA['-'];
686  Uint4 p = 0; /* index on positions */
687  ASSERT(msa);
688 
689  for (p = 0; p < msa->dimensions->query_length; p++) {
690  if (msa->cell[kQueryIndex][p].letter == kGapResidue ||
691  msa->query[p] == kGapResidue) {
692  return PSIERR_GAPINQUERY;
693  }
694  }
695  return PSI_SUCCESS;
696 }
697 
698 /** Validate that there are no flanking gaps in the multiple sequence alignment
699  * @param msa multiple sequence alignment data structure [in]
700  * @return PSIERR_STARTINGGAP or PSIERR_ENDINGGAP if validation fails,
701  * else PSI_SUCCESS
702  */
703 static int
705 {
706  const Uint1 kGapResidue = AMINOACID_TO_NCBISTDAA['-'];
707  Uint4 s = 0; /* index on sequences */
708  Int4 p = 0; /* index on positions */
709  ASSERT(msa);
710 
711  /* Look for starting gaps in alignments */
712  for (s = 0; s < msa->dimensions->num_seqs + 1; s++) {
713 
714  /* find the first aligned residue */
715  for (p = 0; p < (Int4) msa->dimensions->query_length; p++) {
716  if (msa->cell[s][p].is_aligned) {
717  if (msa->cell[s][p].letter == kGapResidue) {
718  return PSIERR_STARTINGGAP;
719  } else {
720  break;
721  }
722  }
723  }
724  }
725 
726  /* Look for ending gaps in alignments */
727  for (s = 0; s < msa->dimensions->num_seqs + 1; s++) {
728 
729  /* find the first aligned residue */
730  for (p = msa->dimensions->query_length - 1; p >= 0; p--) {
731  if (msa->cell[s][p].is_aligned) {
732  if (msa->cell[s][p].letter == kGapResidue) {
733  return PSIERR_ENDINGGAP;
734  } else {
735  break;
736  }
737  }
738  }
739  }
740 
741  return PSI_SUCCESS;
742 }
743 
744 /** Validate that there are no unaligned columns or columns which only contain
745  * gaps in the multiple sequence alignment. Note that this test is a bit
746  * redundant with s_PSIValidateNoGapsInQuery(), but it is left in here just in
747  * case the query sequence is manually disabled (normally it shouldn't, but we
748  * have seen cases where this is done).
749  * @param msa multiple sequence alignment data structure [in]
750  * @return PSIERR_UNALIGNEDCOLUMN or PSIERR_COLUMNOFGAPS if validation fails,
751  * else PSI_SUCCESS
752  */
753 static int
755 {
756  const Uint1 kGapResidue = AMINOACID_TO_NCBISTDAA['-'];
757  Uint4 s = 0; /* index on sequences */
758  Uint4 p = 0; /* index on positions */
759  ASSERT(msa);
760 
761  for (p = 0; p < msa->dimensions->query_length; p++) {
762 
763  Boolean found_aligned_sequence = FALSE;
764  Boolean found_non_gap_residue = FALSE;
765 
766  for (s = kQueryIndex; s < msa->dimensions->num_seqs + 1; s++) {
767 
768  if (msa->cell[s][p].is_aligned) {
769  found_aligned_sequence = TRUE;
770  if (msa->cell[s][p].letter != kGapResidue) {
771  found_non_gap_residue = TRUE;
772  break;
773  }
774  }
775  }
776  if ( !found_aligned_sequence ) {
777  return PSIERR_UNALIGNEDCOLUMN;
778  }
779  if ( !found_non_gap_residue ) {
780  return PSIERR_COLUMNOFGAPS;
781  }
782  }
783 
784  return PSI_SUCCESS;
785 }
786 
787 /** Verify that after purging biased sequences in multiple sequence alignment
788  * there are still sequences participating in the multiple sequences alignment
789  * @param msa multiple sequence alignment structure [in]
790  * @return PSIERR_NOALIGNEDSEQS if validation fails, else PSI_SUCCESS
791  */
792 static int
794 {
795  ASSERT(msa);
797 }
798 
799 void
801 {
802  Uint4 i;
803  for (i = 0; i < msa->dimensions->query_length; i++) {
804  msa->cell[kQueryIndex][i].letter = 0;
805  msa->cell[kQueryIndex][i].is_aligned = FALSE;
806  }
808 }
809 
810 int
812 {
813  int retval = PSI_SUCCESS;
814 
815  if ( !msa ) {
816  return PSIERR_BADPARAM;
817  }
818 
820  if (retval != PSI_SUCCESS) {
821  return retval;
822  }
823 
824  return retval;
825 }
826 
827 int
828 _PSIValidateMSA(const _PSIMsa* msa, Boolean ignore_unaligned_positions)
829 {
830  int retval = PSI_SUCCESS;
831 
832  if ( !msa ) {
833  return PSIERR_BADPARAM;
834  }
835 
836  retval = s_PSIValidateNoFlankingGaps(msa);
837  if (retval != PSI_SUCCESS) {
838  return retval;
839  }
840 
841  if ( !ignore_unaligned_positions ) {
842  retval = s_PSIValidateAlignedColumns(msa);
843  if (retval != PSI_SUCCESS) {
844  return retval;
845  }
846  }
847 
848  retval = s_PSIValidateNoGapsInQuery(msa);
849  if (retval != PSI_SUCCESS) {
850  return retval;
851  }
852 
854  if (retval != PSI_SUCCESS) {
855  return retval;
856  }
857 
858  return retval;
859 }
860 
861 int
862 _PSIValidateCdMSA(const PSICdMsa* cd_msa, Uint4 alphabet_size)
863 {
864  int retval = PSI_SUCCESS;
865  const Uint1 kGapResidue = AMINOACID_TO_NCBISTDAA['-'];
866  Uint4 p = 0; /* index on query position */
867  Uint4 s = 0; /* index on domains */
868 
869  if ( !cd_msa || !cd_msa->dimensions) {
870  return PSIERR_BADPARAM;
871  }
872 
873  // validate no gaps in query
874  for (p = 0; p < cd_msa->dimensions->query_length; p++) {
875  if (cd_msa->query[p] == kGapResidue) {
876  return PSIERR_GAPINQUERY;
877  }
878  }
879 
880  // validate profile data
881  for (s = 0; s < cd_msa->dimensions->num_seqs; s++) {
882  for (p = 0; p < cd_msa->dimensions->query_length; p++) {
883  double sum = 0.0;
884  Uint4 k = 0;
885 
886  if (cd_msa->msa[s][p].is_aligned) {
887  if (!cd_msa->msa[s][p].data) {
888  return PSIERR_BADPROFILE;
889  }
890  if (!cd_msa->msa[s][p].data->wfreqs
891  || cd_msa->msa[s][p].data->iobsr < kEpsilon) {
892  return PSIERR_BADPROFILE;
893  }
894 
895  for (k = 0; k < alphabet_size; k++) {
896  if (cd_msa->msa[s][p].data->wfreqs[k] < 0.0) {
897  return PSIERR_BADPROFILE;
898  }
899  sum += cd_msa->msa[s][p].data->wfreqs[k];
900  }
901  if (fabs(sum - 1.0) > kEpsilon) {
902  return PSIERR_BADPROFILE;
903  }
904  }
905  }
906  }
907 
908  return retval;
909 }
910 
911 /****************************************************************************/
912 /* Function prototypes */
913 
914 /** Remove those sequences which are identical to the query sequence
915  * @param msa multiple sequence alignment data structure [in]
916  */
917 static void
919 
920 /** Keeps only one copy of any aligned sequences which are >kPSINearIdentical%
921  * identical to one another
922  * @param msa multiple sequence alignment data structure [in]
923  */
924 static void
926 
927 /** This function compares the sequences in the msa->cell
928  * structure indexed by sequence_index1 and seq_index2. If it finds aligned
929  * regions that have a greater percent identity than max_percent_identity,
930  * it removes the sequence identified by seq_index2.
931  * FIXME: needs more descriptive name
932  * @param msa multiple sequence alignment data structure [in]
933  * @param seq_index1 index of the sequence of interest [in]
934  * @param seq_index2 index of the sequence of interest [in]
935  * @param max_percent_identity percent identity needed to drop sequence
936  * identified by seq_index2 from the multiple sequence alignment data structure
937  * [in]
938  */
939 static void
941  Uint4 seq_index1,
942  Uint4 seq_index2,
943  double max_percent_identity);
944 /****************************************************************************/
945 
946 /**************** PurgeMatches stage of PSSM creation ***********************/
947 int
949 {
950  if ( !msa ) {
951  return PSIERR_BADPARAM;
952  }
953 
954  s_PSIPurgeSelfHits(msa);
956 
957  return PSI_SUCCESS;
958 }
959 
960 static void
962 {
963  Uint4 s = 0; /* index on sequences */
964 
965  ASSERT(msa);
966 
967  for (s = kQueryIndex + 1; s < msa->dimensions->num_seqs + 1; s++) {
969  }
970 }
971 
972 static void
974 {
975  Uint4 i = 0;
976  Uint4 j = 0;
977 
978  ASSERT(msa);
979 
980  for (i = 1; i < msa->dimensions->num_seqs + 1; i++) {
981  for (j = 1; (i + j) < msa->dimensions->num_seqs + 1; j++) {
982  /* N.B.: The order of comparison of sequence pairs is deliberate,
983  * tests on real data indicated that this approach allowed more
984  * sequences to be purged */
986  }
987  }
988 }
989 
990 void
992 {
993  Uint4 kQueryLength = 0; /* length of the query sequence */
994  Uint4 kNumberOfSeqs = 0; /* number of aligned sequences + 1 */
995  Uint4 s = 0; /* index on aligned sequences */
996  Uint4 p = 0; /* index on positions */
997 
998  ASSERT(msa);
999 
1000  kQueryLength = msa->dimensions->query_length;
1001  kNumberOfSeqs = msa->dimensions->num_seqs + 1;
1002 
1003  /* Reset the data in case this function is being called multiple times
1004  * after the initial counts were done (i.e.: structure group's
1005  * compatibility mode) */
1006  memset((void*) msa->num_matching_seqs, 0, sizeof(Uint4)*kQueryLength);
1007  for (p = 0; p < kQueryLength; p++) {
1008  memset((void*) msa->residue_counts[p], 0,
1009  sizeof(Uint4)*msa->alphabet_size);
1010  }
1011 
1012  for (s = 0; s < kNumberOfSeqs; s++) {
1013  _PSIMsaCell* pos = msa->cell[s]; /* entry in MSA matrix */
1014 
1015  for (p = 0; p < kQueryLength; p++, pos++) {
1016  if (pos->is_aligned) {
1017  const Uint1 res = pos->letter;
1018  if (res >= msa->alphabet_size) {
1019  continue;
1020  }
1021  msa->residue_counts[p][res]++;
1022  msa->num_matching_seqs[p]++;
1023  }
1024  }
1025  }
1026 }
1027 
1028 /** Defines the states of the finite state machine used in
1029  * s_PSIPurgeSimilarAlignments. Successor to posit.c's POS_COUNTING and
1030  * POS_RESTING */
1031 typedef enum _EPSIPurgeFsmState {
1033  eResting
1035 
1036 /** Auxiliary structure to maintain information about two aligned regions
1037  * between the query and a subject sequence. It is used to store the data
1038  * manipulated by the finite state machine used in s_PSIPurgeSimilarAlignments.
1039  */
1040 typedef struct _PSIAlignmentTraits {
1041  Uint4 start; /**< starting offset of alignment w.r.t. query */
1042  Uint4 effective_length; /**< length of alignment not including Xs */
1043  Uint4 n_x_residues; /**< number of X residues in alignment */
1044  Uint4 n_identical; /**< number of identical residues in alignment */
1046 
1047 #ifdef DEBUG
1048 static
1049 void DEBUG_printTraits(_PSIAlignmentTraits* traits,
1050  _EPSIPurgeFsmState state, Uint4 position)
1051 {
1052  fprintf(stderr, "Position: %d - State: %s\n", position,
1053  state == eCounting ? "eCounting" : "eResting");
1054  fprintf(stderr, "\tstart: %d\n", traits->start);
1055  fprintf(stderr, "\teffective_length: %d\n", traits->effective_length);
1056  fprintf(stderr, "\tn_x_residues: %d\n", traits->n_x_residues);
1057  fprintf(stderr, "\tn_identical: %d\n", traits->n_identical);
1058 }
1059 #endif
1060 
1061 /** Resets the traits structure to restart finite state machine
1062  * @param traits structure to reset [in|out]
1063  * @param position position in the multiple sequence alignment to which the
1064  * traits structure is initialized [in]
1065  */
1066 static NCBI_INLINE void
1068 {
1069  ASSERT(traits);
1070  memset((void*) traits, 0, sizeof(_PSIAlignmentTraits));
1071  traits->start = position;
1072 }
1073 
1074 /** Handles neither is aligned event FIXME: document better */
1075 static NCBI_INLINE void
1078  _PSIPackedMsa* msa, Uint4 seq_index,
1079  double max_percent_identity)
1080 {
1081  ASSERT(traits);
1082  ASSERT(state);
1083 
1084  switch (*state) {
1085  case eCounting:
1086  /* Purge aligned region if max_percent_identity is exceeded */
1087  {
1088  if (traits->effective_length > 0) {
1089  const double percent_identity =
1090  ((double)traits->n_identical) / traits->effective_length;
1091  if (percent_identity >= max_percent_identity) {
1092  const unsigned int align_stop =
1093  traits->start + traits->effective_length +
1094  traits->n_x_residues;
1095  int rv = _PSIPurgeAlignedRegion(msa, seq_index,
1096  traits->start, align_stop);
1097  ASSERT(rv == PSI_SUCCESS);
1098  rv += 0; /* dummy code to avoid warning in release mode */
1099  }
1100  }
1101  }
1102 
1103  *state = eResting;
1104  break;
1105 
1106  case eResting:
1107  /* No-op */
1108  break;
1109 
1110  default:
1111  abort();
1112  }
1113 }
1114 
1115 /** Handle event when both positions are aligned, using the same residue, but
1116  * this residue is not X */
1117 static NCBI_INLINE void
1120 {
1121  ASSERT(traits);
1122  ASSERT(state);
1123 
1124  switch (*state) {
1125  case eCounting:
1126  traits->n_identical++;
1127  break;
1128 
1129  case eResting:
1130  /* No-op */
1131  break;
1132 
1133  default:
1134  abort();
1135  }
1136 }
1137 
1138 /** Handle the event when either position is aligned and either is X */
1139 static NCBI_INLINE void
1142 {
1143  ASSERT(traits);
1144  ASSERT(state);
1145 
1146  switch (*state) {
1147  case eCounting:
1148  traits->n_x_residues++;
1149  break;
1150 
1151  case eResting:
1152  /* No-op */
1153  break;
1154 
1155  default:
1156  abort();
1157  }
1158 }
1159 
1160 /** Handle the event when either position is aligned and neither is X */
1161 static NCBI_INLINE void
1164  Uint4 position)
1165 {
1166  ASSERT(traits);
1167  ASSERT(state);
1168 
1169  switch (*state) {
1170  case eCounting:
1171  traits->effective_length++;
1172  break;
1173 
1174  case eResting:
1175  _PSIResetAlignmentTraits(traits, position);
1176  traits->effective_length = 1; /* count this residue */
1177  *state = eCounting;
1178  break;
1179 
1180  default:
1181  abort();
1182  }
1183 }
1184 
1185 static void
1187  Uint4 seq_index1,
1188  Uint4 seq_index2,
1189  double max_percent_identity)
1190 {
1191 
1192  const Uint1 kXResidue = AMINOACID_TO_NCBISTDAA['X'];
1193  _EPSIPurgeFsmState state = eCounting; /* initial state of the fsm */
1194  _PSIAlignmentTraits traits;
1195  const Uint4 kQueryLength = msa->dimensions->query_length;
1196  _PSIPackedMsaCell* seq1 = 0; /* array of cells for sequence 1 in MSA */
1197  _PSIPackedMsaCell* seq2 = 0; /* array of cells for sequence 2 in MSA */
1198  Uint4 p = 0; /* position on alignment */
1199 
1200  /* Nothing to do if sequences are the same or not selected for further
1201  processing */
1202  if ( seq_index1 == seq_index2 ||
1203  !msa->use_sequence[seq_index1] ||
1204  !msa->use_sequence[seq_index2] ) {
1205  return;
1206  }
1207 
1208  _PSIResetAlignmentTraits(&traits, p);
1209  seq1 = msa->data[seq_index1];
1210  seq2 = msa->data[seq_index2];
1211 
1212  /* Examine each position of the aligned sequences and use the fsm to
1213  * determine if a region of the alignment should be purged */
1214  for (p = 0; p < kQueryLength; p++, seq1++, seq2++) {
1215 
1216  /* Indicates if the position in seq_index1 currently being examined is
1217  * aligned. In the special case for seq_index1 == kQueryIndex, this
1218  * variable is set to FALSE to force the other sequence's position to
1219  * be used to proceed with processing. */
1220  Boolean kPos1Aligned = (seq_index1 == kQueryIndex ?
1221  FALSE : seq1->is_aligned);
1222  /* Indicates if the position in seq_index2 currently being examined is
1223  * aligned. */
1224  Boolean kPos2Aligned = seq2->is_aligned;
1225 
1226  /* Look for events interesting to the finite state machine */
1227 
1228  /* if neither position is aligned */
1229  if (!kPos1Aligned && !kPos2Aligned) {
1230  _handleNeitherAligned(&traits, &state, msa, seq_index2,
1231  max_percent_identity);
1232  } else {
1233 
1234  /* Define vars for events interesting to the finite state machine */
1235 
1236  Boolean neither_is_X =
1237  seq1->letter != kXResidue && seq2->letter != kXResidue;
1238 
1239  /* Either one of the is aligned*/
1240  if (neither_is_X) {
1241  _handleEitherAlignedNeitherX(&traits, &state, p);
1242  } else { /* either is X */
1244  }
1245 
1246  if (neither_is_X && (kPos2Aligned && seq1->is_aligned) &&
1247  (seq1->letter == seq2->letter)) {
1249  }
1250  }
1251  }
1252  _handleNeitherAligned(&traits, &state, msa, seq_index2,
1253  max_percent_identity);
1254 }
1255 
1256 /****************************************************************************/
1257 /* Function prototypes */
1258 
1259 /** Computes the left extents for the sequence identified by seq_index
1260  * @param msa multiple sequence alignment data structure [in]
1261  * @param seq_index index of the sequence of interest [in]
1262  */
1263 static void
1264 _PSIGetLeftExtents(const _PSIMsa* msa, Uint4 seq_index);
1265 
1266 /** Computes the right extents for the sequence identified by seq_index
1267  * @param msa multiple sequence alignment data structure [in]
1268  * @param seq_index index of the sequence of interest [in]
1269  */
1270 static void
1271 _PSIGetRightExtents(const _PSIMsa* msa, Uint4 seq_index);
1272 
1273 /** Computes the aligned blocks extents' for each position for the sequence
1274  * identified by seq_index
1275  * @param msa multiple sequence alignment data structure [in]
1276  * @param seq_index index of the sequence of interest [in]
1277  * @param aligned_blocks aligned regions' extents [out]
1278  */
1279 static void
1281  Uint4 seq_index,
1282  _PSIAlignedBlock* aligned_blocks);
1283 
1284 /** Calculates the aligned blocks lengths in the multiple sequence alignment
1285  * data structure.
1286  * @param msa multiple sequence alignment data structure [in]
1287  * @param aligned_blocks aligned regions' extents [in|out]
1288  */
1289 static void
1291  _PSIAlignedBlock* aligned_blocks);
1292 
1293 /****************************************************************************/
1294 /******* Compute alignment extents stage of PSSM creation *******************/
1295 /* posComputeExtents in posit.c */
1296 int
1297 _PSIComputeAlignmentBlocks(const _PSIMsa* msa, /* [in] */
1298  _PSIAlignedBlock* aligned_blocks) /* [out] */
1299 {
1300  Uint4 s = 0; /* index on aligned sequences */
1301 
1302  if ( !msa || !aligned_blocks ) {
1303  return PSIERR_BADPARAM;
1304  }
1305 
1306  /* no need to compute extents for query sequence */
1307  for (s = kQueryIndex + 1; s < msa->dimensions->num_seqs + 1; s++) {
1308  _PSIGetLeftExtents(msa, s);
1309  _PSIGetRightExtents(msa, s);
1310  _PSIComputePositionExtents(msa, s, aligned_blocks);
1311  }
1312 
1313  _PSIComputeAlignedRegionLengths(msa, aligned_blocks);
1314 
1315  return PSI_SUCCESS;
1316 }
1317 
1318 static void
1319 _PSIGetLeftExtents(const _PSIMsa* msa, Uint4 seq_index)
1320 {
1321  const Uint1 kGapResidue = AMINOACID_TO_NCBISTDAA['-'];
1322  _PSIMsaCell* sequence_position = NULL;
1323  Uint4 prev = 0; /* index for the first and previous position */
1324  Uint4 curr = 0; /* index for the current position */
1325 
1326  ASSERT(msa);
1327  ASSERT(seq_index < msa->dimensions->num_seqs + 1);
1328 
1329  sequence_position = msa->cell[seq_index];
1330 
1331  if (sequence_position[prev].is_aligned &&
1332  sequence_position[prev].letter != kGapResidue) {
1333  sequence_position[prev].extents.left = prev;
1334  }
1335 
1336  for (curr = prev + 1; curr < msa->dimensions->query_length;
1337  curr++, prev++) {
1338 
1339  if ( !sequence_position[curr].is_aligned ) {
1340  continue;
1341  }
1342 
1343  if (sequence_position[prev].is_aligned) {
1344  sequence_position[curr].extents.left =
1345  sequence_position[prev].extents.left;
1346  } else {
1347  sequence_position[curr].extents.left = curr;
1348  }
1349  }
1350 }
1351 
1352 static void
1353 _PSIGetRightExtents(const _PSIMsa* msa, Uint4 seq_index)
1354 {
1355  const Uint1 kGapResidue = AMINOACID_TO_NCBISTDAA['-'];
1356  _PSIMsaCell* sequence_position = NULL;
1357  Uint4 last = 0; /* index for the last position */
1358  Int4 curr = 0; /* index for the current position */
1359 
1360  ASSERT(msa);
1361  ASSERT(seq_index < msa->dimensions->num_seqs + 1);
1362 
1363  sequence_position = msa->cell[seq_index];
1364  last = msa->dimensions->query_length - 1;
1365 
1366  if (sequence_position[last].is_aligned &&
1367  sequence_position[last].letter != kGapResidue) {
1368  sequence_position[last].extents.right = last;
1369  }
1370 
1371  for (curr = last - 1; curr >= 0; curr--, last--) {
1372 
1373  if ( !sequence_position[curr].is_aligned ) {
1374  continue;
1375  }
1376 
1377  if (sequence_position[last].is_aligned) {
1378  sequence_position[curr].extents.right =
1379  sequence_position[last].extents.right;
1380  } else {
1381  sequence_position[curr].extents.right = curr;
1382  }
1383  }
1384 }
1385 
1386 static void
1388  Uint4 seq_index,
1389  _PSIAlignedBlock* aligned_blocks)
1390 {
1391 #ifdef PSI_IGNORE_GAPS_IN_COLUMNS
1392  const Uint1 kGapResidue = AMINOACID_TO_NCBISTDAA['-'];
1393 #endif
1394  _PSIMsaCell* sequence_position = NULL;
1395  Uint4 i = 0;
1396 
1397  ASSERT(aligned_blocks);
1398  ASSERT(msa);
1399  ASSERT(seq_index < msa->dimensions->num_seqs + 1);
1400 
1401  sequence_position = msa->cell[seq_index];
1402 
1403  for (i = 0; i < msa->dimensions->query_length; i++) {
1404 #ifdef PSI_IGNORE_GAPS_IN_COLUMNS
1405  if (sequence_position[i].is_aligned &&
1406  sequence_position[i].letter != kGapResidue) {
1407 #else
1408  if (sequence_position[i].is_aligned) {
1409 #endif
1410  aligned_blocks->pos_extnt[i].left =
1411  MAX(aligned_blocks->pos_extnt[i].left,
1412  sequence_position[i].extents.left);
1413  aligned_blocks->pos_extnt[i].right =
1414  MIN(aligned_blocks->pos_extnt[i].right,
1415  sequence_position[i].extents.right);
1416  }
1417  }
1418 }
1419 
1420 static void
1422  _PSIAlignedBlock* aligned_blocks)
1423 {
1424  const Uint1 kXResidue = AMINOACID_TO_NCBISTDAA['X'];
1425  Uint4 kQueryLength = 0; /* length of the query */
1426  Uint4 i = 0;
1427 
1428  ASSERT(msa);
1429  ASSERT(aligned_blocks);
1430  kQueryLength = msa->dimensions->query_length;
1431 
1432  for (i = 0; i < kQueryLength; i++) {
1433  aligned_blocks->size[i] = aligned_blocks->pos_extnt[i].right -
1434  aligned_blocks->pos_extnt[i].left + 1;
1435  /* Sanity check: if aligned_blocks->pos_extnt[i].{right,left} was
1436  not modified after initialization, this assertion will fail
1437  N.B.: This is allowed in the old code, i.e.: the size field will be
1438  query_length + 2, because of this the assertion below was removed.
1439  ASSERT(aligned_blocks->size[i] <= msa->dimensions->query_length);
1440  */
1441  }
1442 
1443  /* Do not include X's in aligned region lengths */
1444  for (i = 0; i < kQueryLength; i++) {
1445 
1446  if (msa->query[i] == kXResidue) {
1447 
1448  Uint4 idx = 0;
1449  for (idx = 0; idx < i; idx++) {
1450  if ((Uint4)aligned_blocks->pos_extnt[idx].right >= i &&
1451  msa->query[idx] != kXResidue) {
1452  aligned_blocks->size[idx]--;
1453  }
1454  }
1455  for (idx = msa->dimensions->query_length - 1; idx > i; idx--) {
1456  if ((Uint4)aligned_blocks->pos_extnt[idx].left <= i &&
1457  msa->query[idx] != kXResidue) {
1458  aligned_blocks->size[idx]--;
1459  }
1460  }
1461  }
1462 
1463  }
1464 }
1465 
1466 /****************************************************************************/
1467 
1468 /** Populates the array aligned_sequences with the indices of the sequences
1469  * which are part of the multiple sequence alignment at the request position
1470  * @param msa multiple sequence alignment data structure [in]
1471  * @param position position of interest [in]
1472  * @param aligned_sequences array which will contain the indices of the
1473  * sequences aligned at the requested position. This array must have size
1474  * greater than or equal to the number of sequences + 1 in multiple alignment
1475  * data structure (alignment->dimensions->num_seqs + 1) [out]
1476  */
1477 static void
1479  const _PSIMsa* msa,
1480  Uint4 position,
1481  SDynamicUint4Array* aligned_sequences);
1482 
1483 /** Calculates the position based weights using a modified version of the
1484  * Henikoff's algorithm presented in "Position-based sequence weights".
1485  * Skipped optimization about identical previous sets.
1486  * @param msa multiple sequence alignment data structure [in]
1487  * @param aligned_blocks aligned regions' extents [in]
1488  * @param position position of the query to calculate the sequence weights for
1489  * [in]
1490  * @param aligned_seqs array containing the indices of the sequences
1491  * participating in the multiple sequence alignment at the requested
1492  * position [in]
1493  * @param seq_weights sequence weights data structure [out]
1494  */
1495 static void
1497  const _PSIMsa* msa,
1498  const _PSIAlignedBlock* aligned_blocks,
1499  Uint4 position,
1500  const SDynamicUint4Array* aligned_seqs,
1501  _PSISequenceWeights* seq_weights);
1502 
1503 /** Calculate the weighted observed sequence weights
1504  * @param msa multiple sequence alignment data structure [in]
1505  * @param position position of the query to calculate the sequence weights for
1506  * [in]
1507  * @param aligned_seqs array containing the indices of the sequences
1508  * participating in the multiple sequence alignment at the requested
1509  * position [in]
1510  * @param seq_weights sequence weights data structure [in|out]
1511  */
1512 static void
1514  const _PSIMsa* msa,
1515  Uint4 position,
1516  const SDynamicUint4Array* aligned_seqs,
1517  _PSISequenceWeights* seq_weights);
1518 
1519 /** Uses disperse method of spreading the gap weights
1520  * @param msa multiple sequence alignment data structure [in]
1521  * @param seq_weights sequence weights data structure [in|out]
1522  * @param nsg_compatibility_mode set to true to emulate the structure group's
1523  * use of PSSM engine in the cddumper application. By default should be FALSE
1524  * [in]
1525  */
1526 static void
1527 _PSISpreadGapWeights(const _PSIMsa* msa,
1528  _PSISequenceWeights* seq_weights,
1529  Boolean nsg_compatibility_mode);
1530 
1531 /** Verifies that the sequence weights for each column of the PSSM add up to
1532  * 1.0.
1533  * @param msa multiple sequence alignment data structure [in]
1534  * @param seq_weights sequence weights data structure [in]
1535  * @param nsg_compatibility_mode set to true to emulate the structure group's
1536  * use of PSSM engine in the cddumper application. By default should be FALSE
1537  * [in]
1538  * @return PSIERR_BADSEQWEIGHTS in case of failure, PSI_SUCCESS otherwise
1539  */
1540 static int
1542  const _PSIMsa* msa,
1543  const _PSISequenceWeights* seq_weights,
1544  Boolean nsg_compatibility_mode);
1545 
1546 /****************************************************************************/
1547 /******* Calculate sequence weights stage of PSSM creation ******************/
1548 /* Needs the _PSIAlignedBlock structure calculated in previous stage as well
1549  * as PSIAlignmentData structure */
1550 
1551 int
1552 _PSIComputeSequenceWeights(const _PSIMsa* msa, /* [in] */
1553  const _PSIAlignedBlock* aligned_blocks, /* [in] */
1554  Boolean nsg_compatibility_mode, /* [in] */
1555  _PSISequenceWeights* seq_weights) /* [out] */
1556 {
1557  SDynamicUint4Array* aligned_seqs = 0; /* list of indices of sequences
1558  which participate in an
1559  aligned position */
1560  SDynamicUint4Array* prev_pos_aligned_seqs = 0; /* list of indices of
1561  sequences for the previous position in
1562  the query (i.e.: column in MSA). */
1563  Uint4 kQueryLength = 0; /* length of the query */
1564  Uint4 pos = 0; /* position index */
1565  int retval = PSI_SUCCESS; /* return value */
1566  const Uint4 kExpectedNumMatchingSeqs = nsg_compatibility_mode ? 0 : 1;
1567  Uint4 last_calc_pos = 0;
1568 
1569  if ( !msa || !aligned_blocks || !seq_weights ) {
1570  return PSIERR_BADPARAM;
1571  }
1572 
1573  aligned_seqs = DynamicUint4ArrayNewEx(msa->dimensions->num_seqs + 1);
1574  prev_pos_aligned_seqs = DynamicUint4Array_Dup(aligned_seqs);
1575  if ( !aligned_seqs || !prev_pos_aligned_seqs ) {
1576  return PSIERR_OUTOFMEM;
1577  }
1578  kQueryLength = msa->dimensions->query_length;
1579 
1580  for (pos = 0; pos < kQueryLength; pos++) {
1581 
1582  /* ignore positions of no interest */
1583  if (aligned_blocks->size[pos] == 0 ||
1584  msa->num_matching_seqs[pos] <= kExpectedNumMatchingSeqs) {
1585  continue;
1586  }
1587 
1588  DynamicUint4Array_Copy(prev_pos_aligned_seqs, aligned_seqs);
1589  _PSIGetAlignedSequencesForPosition(msa, pos, aligned_seqs);
1590  ASSERT(msa->num_matching_seqs[pos] == aligned_seqs->num_used);
1591  if (aligned_seqs->num_used <= kExpectedNumMatchingSeqs) {
1592  continue;
1593  }
1594  last_calc_pos = pos;
1595 
1596  if (last_calc_pos != pos - 1 ||
1597  !DynamicUint4Array_AreEqual(aligned_seqs, prev_pos_aligned_seqs)) {
1598 
1599  memset((void*)seq_weights->norm_seq_weights, 0,
1600  sizeof(double)*(msa->dimensions->num_seqs+1));
1601  memset((void*)seq_weights->row_sigma, 0,
1602  sizeof(double)*(msa->dimensions->num_seqs+1));
1603 
1604  _PSICalculateNormalizedSequenceWeights(msa, aligned_blocks, pos,
1605  aligned_seqs, seq_weights);
1606  } else {
1607  int index;
1608  seq_weights->sigma[pos] = seq_weights->sigma[pos-1];
1609  for (index = 0; index <= EFFECTIVE_ALPHABET; index++) {
1610  seq_weights->posDistinctDistrib[pos][index] = seq_weights->posDistinctDistrib[pos-1][index];
1611  }
1612  /* seq_weights->norm_seq_weights are unchanged from the previous
1613  * iteration, leaving them ready to be used in
1614  * _PSICalculateMatchWeights */
1615  }
1616  seq_weights->posNumParticipating[pos] = aligned_seqs->num_used;
1617 
1618  /* Uses seq_weights->norm_seq_weights to populate match_weights */
1619  _PSICalculateMatchWeights(msa, pos, aligned_seqs, seq_weights);
1620  }
1621 
1622  DynamicUint4ArrayFree(aligned_seqs);
1623  DynamicUint4ArrayFree(prev_pos_aligned_seqs);
1624 
1625  /* Check that the sequence weights add up to 1 in each column */
1626  retval = _PSICheckSequenceWeights(msa, seq_weights,
1627  nsg_compatibility_mode);
1628  if (retval != PSI_SUCCESS) {
1629  return retval;
1630  }
1631 
1632 #ifndef PSI_IGNORE_GAPS_IN_COLUMNS
1633  _PSISpreadGapWeights(msa, seq_weights, nsg_compatibility_mode);
1634  retval = _PSICheckSequenceWeights(msa, seq_weights,
1635  nsg_compatibility_mode);
1636 #endif
1637 
1638  /* Return seq_weights->match_weigths, should free others? FIXME: need to
1639  * keep sequence weights for diagnostics for structure group */
1640  return retval;
1641 }
1642 
1643 static void s_PSIComputeFrequenciesFromCDsCleanup(double* sum_weights)
1644 {
1645  if (sum_weights) {
1646  sfree(sum_weights);
1647  }
1648 }
1649 
1650 int
1651 _PSIComputeFrequenciesFromCDs(const PSICdMsa* cd_msa, /* [in] */
1652  BlastScoreBlk* sbp, /* [in] */
1653  const PSIBlastOptions* options, /* [in] */
1654  _PSISequenceWeights* seq_weights) /* [out] */
1655 {
1656  Uint4 kQueryLength = 0; /* length of the query */
1657  Uint4 pos = 0; /* position index */
1658  int retval = PSI_SUCCESS; /* return value */
1659  double* sum_weights = NULL;
1660 
1661  const Uint1 kXResidue = AMINOACID_TO_NCBISTDAA['X'];
1662 
1663  if ( !cd_msa || !seq_weights || !sbp || !options) {
1664  return PSIERR_BADPARAM;
1665  }
1666 
1667  /* quit of no CDs aligned to the query */
1668  if (cd_msa->dimensions->num_seqs == 0) {
1669  return retval;
1670  }
1671 
1672  sum_weights = (double*)malloc(sbp->alphabet_size * sizeof(double));
1673 
1674  if ( !sum_weights) {
1676 
1677  return PSIERR_OUTOFMEM;
1678  }
1679 
1680  kQueryLength = cd_msa->dimensions->query_length;
1681 
1682  /* for each position in query */
1683  for (pos = 0; pos < kQueryLength; pos++) {
1684  double total_observations = 0.0; /* total number of observations */
1685  Uint4 msa_index; /* index of aligned CDs in msa */
1686  Uint4 residue; /* residue index */
1687  Uint4 query_residue = cd_msa->query[pos]; /* query residue */
1688 
1689  memset(sum_weights, 0, sbp->alphabet_size * sizeof(double));
1690 
1691  /* for each matching CD */
1692  for (msa_index = 0; msa_index < cd_msa->dimensions->num_seqs;
1693  msa_index++) {
1694 
1695  /* disregard CDs not alined to this position */
1696  if (!cd_msa->msa[msa_index][pos].is_aligned) {
1697  continue;
1698  }
1699 
1700  ASSERT(cd_msa->msa[msa_index][pos].data);
1701 
1702  /* add number of independent observations */
1703  total_observations +=
1704  cd_msa->msa[msa_index][pos].data->iobsr;
1705 
1706 
1707  /* for each residue add weighted residue counts weighted by
1708  number of independent observations for the CD */
1709  for (residue = 0; residue < sbp->alphabet_size; residue++) {
1710  sum_weights[residue] +=
1711  cd_msa->msa[msa_index][pos].data->wfreqs[residue]
1712  * cd_msa->msa[msa_index][pos].data->iobsr;
1713  }
1714  }
1715 
1716  /* Include query residue unless it is already observed in a matching
1717  domain */
1718  if (total_observations > 0.0 && query_residue != kXResidue
1719  && sum_weights[query_residue] == 0.0) {
1720 
1721  sum_weights[query_residue] = 1.0;
1722  total_observations += 1.0;
1723  }
1724 
1725  /* normalize the summed weighted counts */
1726  if (total_observations > 0.0) {
1727  double sum = 0.0;
1728  for (residue = 0; residue < sbp->alphabet_size; residue++) {
1729  seq_weights->match_weights[pos][residue] =
1730  sum_weights[residue] / total_observations;
1731  sum += seq_weights->match_weights[pos][residue];
1732  }
1733  ASSERT(fabs(sum - 1.0) < 1e-5);
1734  }
1735 
1736  /* the procedure that estimates frequency ratios (and adds pseudo
1737  counts) was developed for estimation of the effective number of
1738  independent observations from the 2009 paper, where 400 is the
1739  maximum estimate */
1740  seq_weights->independent_observations[pos] = MIN(400.0,
1741  total_observations);
1742  }
1743 
1745 
1746  return retval;
1747 }
1748 
1749 static void
1751  const _PSIMsa* msa,
1752  const _PSIAlignedBlock* aligned_blocks, /* [in] */
1753  Uint4 position, /* [in] */
1754  const SDynamicUint4Array* aligned_seqs, /* [in] */
1755  _PSISequenceWeights* seq_weights) /* [out] norm_seq_weights, sigma */
1756 {
1757  const Uint1 kGapResidue = AMINOACID_TO_NCBISTDAA['-'];
1758  const Uint1 kXResidue = AMINOACID_TO_NCBISTDAA['X'];
1759  /* Index into aligned block for requested position */
1760  Uint4 i = 0;
1761 
1762  /* This flag will be true if more than one different type of residue is
1763  * found in a column in the extent of the alignment that corresponds to
1764  * the position being examined. (replaces Sigma in old code) */
1765  Boolean distinct_residues_found = FALSE;
1766 
1767  /* Number of different characters occurring in matches within a
1768  * multi-alignment block including identical columns (replaces
1769  * intervalSigma in old code)
1770  * FIXME: alternate description
1771  * number of distinct residues in all columns in the extent of the
1772  * alignment corresponding to a position
1773  */
1774  Uint4 sigma = 0;
1775 
1776  /* Index into array of aligned sequences */
1777  Uint4 asi = 0;
1778 
1779  ASSERT(msa);
1780  ASSERT(aligned_blocks);
1781  ASSERT(seq_weights);
1782  ASSERT(aligned_seqs && aligned_seqs->num_used);
1783  ASSERT(position < msa->dimensions->query_length);
1784 
1785  for (i = (Uint4)aligned_blocks->pos_extnt[position].left;
1786  i <= (Uint4)aligned_blocks->pos_extnt[position].right; i++) {
1787 
1788  /* Keeps track of how many occurrences of each residue are found in a
1789  * column of the alignment extent corresponding to a query position */
1790  Uint4 residue_counts_for_column[BLASTAA_SIZE] = { 0 };
1791 
1792  /* number of distinct residues found in a column of the alignment
1793  * extent correspoding to a query position */
1794  Uint4 num_distinct_residues_for_column = 0;
1795  Uint4 num_local_std_letters = 0;
1796 
1797  /* Assert that the alignment extents have sane values */
1798  ASSERT(i < msa->dimensions->query_length);
1799 
1800  /* Count number of residues in column i of the alignment extent
1801  * corresponding to position */
1802  for (asi = 0; asi < aligned_seqs->num_used; asi++) {
1803  const Uint4 kSeqIdx = aligned_seqs->data[asi];
1804  const Uint1 kResidue = msa->cell[kSeqIdx][i].letter;
1805 
1806  if (residue_counts_for_column[kResidue] == 0) {
1807  num_distinct_residues_for_column++;
1808  if (kResidue != kGapResidue && kResidue != kXResidue)
1809  num_local_std_letters++;
1810  }
1811  residue_counts_for_column[kResidue]++;
1812  }
1813 
1814  sigma += num_distinct_residues_for_column;
1815  num_local_std_letters = MIN(num_local_std_letters,EFFECTIVE_ALPHABET);
1816  seq_weights->posDistinctDistrib[position][num_local_std_letters]++;
1817  if (num_distinct_residues_for_column > 1) {
1818  /* num_distinct_residues_for_column == 1 means that all residues in
1819  * that column of the alignment extent are the same residue */
1820  distinct_residues_found = TRUE;
1821  }
1822 
1823  /* Calculate row_sigma, an intermediate value to calculate the
1824  * normalized sequence weights */
1825  for (asi = 0; asi < aligned_seqs->num_used; asi++) {
1826  const Uint4 seq_idx = aligned_seqs->data[asi];
1827  const Uint1 residue = msa->cell[seq_idx][i].letter;
1828 
1829  /* This is a modified version of the Henikoff's idea in
1830  * "Position-based sequence weights" paper. The modification
1831  * consists in using the alignment extents. */
1832  seq_weights->row_sigma[seq_idx] +=
1833  (1.0 / (double)
1834  (residue_counts_for_column[residue] *
1835  num_distinct_residues_for_column) );
1836  }
1837  }
1838 
1839  /* Save sigma for this position */
1840  seq_weights->sigma[position] = sigma;
1841 
1842  if (distinct_residues_found) {
1843  double weight_sum = 0.0;
1844 
1845  for (asi = 0; asi < aligned_seqs->num_used; asi++) {
1846  const Uint4 seq_idx = aligned_seqs->data[asi];
1847  seq_weights->norm_seq_weights[seq_idx] =
1848  seq_weights->row_sigma[seq_idx] /
1849  (aligned_blocks->pos_extnt[position].right -
1850  aligned_blocks->pos_extnt[position].left + 1);
1851  weight_sum += seq_weights->norm_seq_weights[seq_idx];
1852  }
1853 
1854  /* Normalize */
1855  for (asi = 0; asi < aligned_seqs->num_used; asi++) {
1856  const Uint4 seq_idx = aligned_seqs->data[asi];
1857  seq_weights->norm_seq_weights[seq_idx] /= weight_sum;
1858  }
1859 
1860  } else {
1861  /* All residues in the extent of this position's alignment are the same
1862  * residue, therefore we distribute the sequence weight equally among
1863  * all participating sequences */
1864  for (asi = 0; asi < aligned_seqs->num_used; asi++) {
1865  const Uint4 seq_idx = aligned_seqs->data[asi];
1866  seq_weights->norm_seq_weights[seq_idx] =
1867  (1.0/(double) aligned_seqs->num_used);
1868  }
1869  }
1870 
1871  return;
1872 }
1873 
1874 static void
1876  const _PSIMsa* msa, /* [in] */
1877  Uint4 position, /* [in] */
1878  const SDynamicUint4Array* aligned_seqs, /* [in] */
1879  _PSISequenceWeights* seq_weights) /* [out] */
1880 {
1881  const Uint1 kGapResidue = AMINOACID_TO_NCBISTDAA['-'];
1882  Uint4 asi = 0; /* index into array of aligned sequences */
1883 
1884  ASSERT(msa);
1885  ASSERT(aligned_seqs && aligned_seqs->num_used);
1886  ASSERT(seq_weights);
1887 
1888  for (asi = 0; asi < aligned_seqs->num_used; asi++) {
1889  const Uint4 seq_idx = aligned_seqs->data[asi];
1890  const Uint1 residue = msa->cell[seq_idx][position].letter;
1891 
1892  seq_weights->match_weights[position][residue] +=
1893  seq_weights->norm_seq_weights[seq_idx];
1894 
1895  /* Collected for diagnostics information, not used elsewhere */
1896  if (residue != kGapResidue) {
1897  seq_weights->gapless_column_weights[position] +=
1898  seq_weights->norm_seq_weights[seq_idx];
1899  }
1900  }
1901 }
1902 
1903 static void
1905  Uint4 position,
1906  SDynamicUint4Array* aligned_sequences)
1907 {
1908 #ifdef PSI_IGNORE_GAPS_IN_COLUMNS
1909  const Uint1 kGapResidue = AMINOACID_TO_NCBISTDAA['-'];
1910 #endif
1911  Uint4 i = 0;
1912 
1913  ASSERT(msa);
1914  ASSERT(position < msa->dimensions->query_length);
1915  ASSERT(aligned_sequences && aligned_sequences->num_allocated);
1916  aligned_sequences->num_used = 0; /* reset the array */
1917 
1918  for (i = 0; i < msa->dimensions->num_seqs + 1; i++) {
1919 
1920 #ifdef PSI_IGNORE_GAPS_IN_COLUMNS
1921  if (msa->cell[i][position].is_aligned &&
1922  msa->cell[i][position].letter != kGapResidue) {
1923 #else
1924  if (msa->cell[i][position].is_aligned) {
1925 #endif
1926  DynamicUint4Array_Append(aligned_sequences, i);
1927  }
1928  }
1929 }
1930 
1931 static void
1933  _PSISequenceWeights* seq_weights,
1934  Boolean nsg_compatibility_mode)
1935 {
1936  const Uint1 kGapResidue = AMINOACID_TO_NCBISTDAA['-'];
1937  const Uint1 kXResidue = AMINOACID_TO_NCBISTDAA['X'];
1938  Uint4 pos = 0; /* residue position (ie: column number) */
1939  Uint4 res = 0; /* residue */
1940  const Uint4 kExpectedNumMatchingSeqs = nsg_compatibility_mode ? 0 : 1;
1941 
1942  ASSERT(msa);
1943  ASSERT(seq_weights);
1944 
1945  for (pos = 0; pos < msa->dimensions->query_length; pos++) {
1946 
1947  if (msa->num_matching_seqs[pos] <= kExpectedNumMatchingSeqs ||
1948  msa->cell[kQueryIndex][pos].letter == kXResidue) {
1949  continue;
1950  }
1951 
1952  /* Disperse method of spreading the gap weights */
1953  for (res = 0; res < msa->alphabet_size; res++) {
1954  if (seq_weights->std_prob[res] > kEpsilon) {
1955  seq_weights->match_weights[pos][res] +=
1956  (seq_weights->match_weights[pos][kGapResidue] *
1957  seq_weights->std_prob[res]);
1958  }
1959  }
1960  seq_weights->match_weights[pos][kGapResidue] = 0.0;
1961 
1962  }
1963 }
1964 
1965 /** The following define enables/disables the _PSICheckSequenceWeights
1966  * function's abort statement in the case when the sequence weights are not
1967  * being checked. When this is enabled, abort() will be invoked if none of the
1968  * sequence weights are checked to be in the proper range. The C toolkit code
1969  * silently ignores this situation, so it's implemented that way here for
1970  * backwards compatibility.
1971  */
1972 #define SEQUENCE_WEIGHTS_CHECK__ABORT_ON_FAILURE 0
1973 
1974 /* Verifies that each column of the match_weights field in the seq_weights
1975  * structure adds up to 1. */
1976 static int
1978  const _PSISequenceWeights* seq_weights,
1979  Boolean nsg_compatibility_mode)
1980 {
1981  const Uint1 kXResidue = AMINOACID_TO_NCBISTDAA['X'];
1982  Uint4 pos = 0; /* residue position (ie: column number) */
1983  const Uint4 kExpectedNumMatchingSeqs = nsg_compatibility_mode ? 0 : 1;
1984 
1985 #if SEQUENCE_WEIGHTS_CHECK__ABORT_ON_FAILURE
1986  Boolean check_performed = FALSE; /* were there any sequences checked? */
1987 #endif
1988 
1989  ASSERT(msa);
1990  ASSERT(seq_weights);
1991 
1992  for (pos = 0; pos < msa->dimensions->query_length; pos++) {
1993 
1994  double running_total = 0.0;
1995  Uint4 residue = 0;
1996 
1997  if (msa->num_matching_seqs[pos] <= kExpectedNumMatchingSeqs ||
1998  msa->cell[kQueryIndex][pos].letter == kXResidue) {
1999  /* N.B.: the following statement allows for the sequence weights to
2000  * go unchecked. To allow more strict checking, enable the
2001  * SEQUENCE_WEIGHTS_CHECK__ABORT_ON_FAILURE #define above */
2002  continue;
2003  }
2004 
2005  for (residue = 0; residue < msa->alphabet_size; residue++) {
2006  running_total += seq_weights->match_weights[pos][residue];
2007  }
2008 
2009  if (running_total < 0.99 || running_total > 1.01) {
2010  return PSIERR_BADSEQWEIGHTS;
2011  }
2012 #if SEQUENCE_WEIGHTS_CHECK__ABORT_ON_FAILURE
2013  check_performed = TRUE;
2014 #endif
2015  }
2016 
2017 #if SEQUENCE_WEIGHTS_CHECK__ABORT_ON_FAILURE
2018  /* This condition should never happen because it means that no sequences
2019  * were selected to calculate the sequence weights! */
2020  if ( !check_performed &&
2021  !nsg_compatibility_mode ) { /* old code didn't check for this... */
2022  assert(!"Did not perform sequence weights check");
2023  }
2024 #endif
2025 
2026  return PSI_SUCCESS;
2027 }
2028 
2029 /** initialize the expected number of observations
2030  use background probabilities for this matrix
2031  Calculate exp. # of distinct aa's as a function of independent trials
2032  copy of posit.c:initializeExpNumObservations
2033 
2034  @param expno table of expectations [out]
2035  @param backgroundProbabilities residue background probs [in]
2036 */
2037 static void s_initializeExpNumObservations(double *expno, const double *backgroundProbabilities);
2038 
2039 /** A method to estimate the effetive number of observations
2040  in the interval for the specified columnNumber
2041  copy of posit.c:effectiveObservations
2042  @param align_blk data structure describing the aligned blocks [in]
2043  @param seq_weights data structure of sequence weights [in]
2044  @param columnNumber column in the PSSM [in]
2045  @param queryLength length of the query sequence
2046  @param expno table of expectations [in]
2047 */
2048 static double s_effectiveObservations(const _PSIAlignedBlock *align_blk,
2049  const _PSISequenceWeights* seq_weights,
2050  int columnNumber, int queryLength,
2051  const double *expno);
2052 
2053 
2054 /** copy of posit.c:columnSpecificPseudocounts
2055  @param posSearch data structure of sequence weights [in]
2056  @param columnNumber column in the PSSM [in]
2057  @param backgroundProbabilities residue background probs [in]
2058  @param observations for each column an estimate of observed residues [in]
2059 */
2060 static double s_columnSpecificPseudocounts(const _PSISequenceWeights *posSearch,
2061  int columnNumber,
2062  const double *backgroundProbabilities,
2063  const double observations);
2064 
2065 #define MAX_IND_OBSERVATIONS 400 /**< max number of independent observation for pseudocount calculation */
2066 #define PSEUDO_MAX 1000000 /**< effective infinity */
2067 /****************************************************************************/
2068 /******* Compute residue frequencies stage of PSSM creation *****************/
2069 
2070 int
2072  const _PSISequenceWeights* seq_weights,
2073  const BlastScoreBlk* sbp,
2074  const _PSIAlignedBlock* aligned_blocks,
2075  Int4 pseudo_count,
2076  Boolean nsg_compatibility_mode,
2077  _PSIInternalPssmData* internal_pssm)
2078 {
2079  /* Subscripts are indicated as follows: N_i, where i is a subscript of N */
2080  const Uint1 kXResidue = AMINOACID_TO_NCBISTDAA['X'];
2081  SFreqRatios* freq_ratios = NULL;/* matrix-specific frequency ratios */
2082  Uint4 p = 0; /* index on positions */
2083  Uint4 r = 0; /* index on residues */
2084  const double kZeroObsPseudo = 30.0; /*arbitrary constant to use for columns with
2085  zero observations in actual data (ZERO_OBS_PSEUDO in posit.c) */
2086  double expno[MAX_IND_OBSERVATIONS+1]; /*table of expectations*/
2087  const double* backgroundProbabilities = Blast_GetMatrixBackgroundFreq(sbp->name);
2088 
2089  if ( !msa || !seq_weights || !sbp || !aligned_blocks || !internal_pssm ) {
2090  return PSIERR_BADPARAM;
2091  }
2092  ASSERT(((Uint4)sbp->alphabet_size) == msa->alphabet_size);
2093 
2094  freq_ratios = _PSIMatrixFrequencyRatiosNew(sbp->name);
2095 
2096  s_initializeExpNumObservations(&(expno[0]), backgroundProbabilities);
2097 
2098  for (p = 0; p < msa->dimensions->query_length; p++) {
2099  double columnCounts = 0.0; /*column-specific pseudocounts*/
2100  double observations = 0.0;
2101  double pseudoWeight; /*multiplier for pseudocounts term*/
2102  if (msa->cell[kQueryIndex][p].letter != kXResidue)
2103  {
2104  observations = s_effectiveObservations(aligned_blocks, seq_weights, p, msa->dimensions->query_length, expno);
2105 
2106  // this is done so that effective observations can be reported in
2107  // diagnostics
2108  seq_weights->independent_observations[p] = observations;
2109 
2110  if (0 == pseudo_count)
2111  columnCounts = s_columnSpecificPseudocounts(seq_weights, p, backgroundProbabilities, observations);
2112  else
2113  columnCounts = pseudo_count;
2114  }
2115  if (columnCounts >= PSEUDO_MAX) {
2116  pseudoWeight = kZeroObsPseudo;
2117  observations = 0;
2118  }
2119  else {
2120  pseudoWeight = columnCounts;
2121  }
2122 
2123  for (r = 0; r < msa->alphabet_size; r++) {
2124 
2125  /* If there is an 'X' in the query sequence at position p
2126  or the standard probability of residue r is close to 0 */
2127  if (msa->cell[kQueryIndex][p].letter == kXResidue ||
2128  seq_weights->std_prob[r] <= kEpsilon) {
2129  internal_pssm->freq_ratios[p][r] = 0.0;
2130  } else {
2131 
2132  Uint4 i = 0; /* loop index */
2133 
2134  /* beta( Sum_j(f_j r_ij) ) in formula 2 */
2135  double pseudo = 0.0;
2136  /* Renamed to match the formula in the paper */
2137  const double kBeta = pseudoWeight;
2138  double numerator = 0.0; /* intermediate term */
2139  double denominator = 0.0; /* intermediate term */
2140  double qOverPEstimate = 0.0; /* intermediate term */
2141 
2142  /* so that it can be saved in diagnostics */
2143  internal_pssm->pseudocounts[p] = kBeta;
2144 
2145  /* As specified in 2001 paper, underlying matrix frequency
2146  ratios are used here */
2147  for (i = 0; i < msa->alphabet_size; i++) {
2148  if (sbp->matrix->data[r][i] != BLAST_SCORE_MIN) {
2149  pseudo += (seq_weights->match_weights[p][i] *
2150  freq_ratios->data[r][i]);
2151  }
2152  }
2153  pseudo *= kBeta;
2154 
2155  numerator =
2156  (observations * seq_weights->match_weights[p][r] /
2157  seq_weights->std_prob[r])
2158  + pseudo;
2159 
2160  denominator = observations + kBeta;
2161 
2162  if (nsg_compatibility_mode && denominator == 0.0) {
2163  return PSIERR_UNKNOWN;
2164  } else {
2165  ASSERT(denominator != 0.0);
2166  }
2167  qOverPEstimate = numerator/denominator;
2168 
2169  /* Note artificial multiplication by standard probability
2170  * to normalize */
2171  internal_pssm->freq_ratios[p][r] = qOverPEstimate *
2172  seq_weights->std_prob[r];
2173 
2174  }
2175  }
2176  }
2177 
2178  freq_ratios = _PSIMatrixFrequencyRatiosFree(freq_ratios);
2179 
2180  return PSI_SUCCESS;
2181 }
2182 
2183 
2184 int
2186  const _PSISequenceWeights* seq_weights,
2187  const BlastScoreBlk* sbp,
2188  Int4 pseudo_count,
2189  _PSIInternalPssmData* internal_pssm)
2190 {
2191  const Uint1 kXResidue = AMINOACID_TO_NCBISTDAA['X'];
2192  SFreqRatios* freq_ratios = NULL;/* matrix-specific frequency ratios */
2193  Uint4 p = 0; /* index on positions */
2194  Uint4 r = 0; /* index on residues */
2195  const double kZeroObsPseudo = 30.0; /*arbitrary constant to use for columns with
2196  zero observations in actual data (ZERO_OBS_PSEUDO in posit.c) */
2197 
2198  const double* backgroundProbabilities = NULL;
2199 
2200  if ( !cd_msa || !seq_weights || !sbp || !internal_pssm || pseudo_count < 0) {
2201  return PSIERR_BADPARAM;
2202  }
2203 
2204  freq_ratios = _PSIMatrixFrequencyRatiosNew(sbp->name);
2205  if ( !freq_ratios ) {
2206  return PSIERR_OUTOFMEM;
2207  }
2208 
2209  backgroundProbabilities = Blast_GetMatrixBackgroundFreq(sbp->name);
2210  if ( !backgroundProbabilities ) {
2211  return PSIERR_OUTOFMEM;
2212  }
2213 
2214 
2215  for (p = 0; p < cd_msa->dimensions->query_length; p++) {
2216  double columnCounts = 0.0; /*column-specific pseudocounts*/
2217  double observations = 0.0;
2218  double pseudoWeight; /*multiplier for pseudocounts term*/
2219 
2220  if (cd_msa->query[p] != kXResidue)
2221  {
2222  /* observations will be used as alpha in equation 2 in the 2001
2223  paper */
2224  observations = MAX(0.0,
2225  seq_weights->independent_observations[p] - 1.0);
2226 
2227  if (0 == pseudo_count)
2228  columnCounts = s_columnSpecificPseudocounts(seq_weights, p, backgroundProbabilities, observations);
2229  else
2230  columnCounts = pseudo_count;
2231  }
2232 
2233  if (columnCounts >= PSEUDO_MAX) {
2234  pseudoWeight = kZeroObsPseudo;
2235  observations = 0;
2236  }
2237  else {
2238  pseudoWeight = columnCounts;
2239  }
2240 
2241  for (r = 0; r < sbp->alphabet_size; r++) {
2242 
2243  /* If there is an 'X' in the query sequence at position p
2244  or the standard probability of residue r is close to 0 */
2245  if (cd_msa->query[p] == kXResidue ||
2246  seq_weights->std_prob[r] <= kEpsilon) {
2247  internal_pssm->freq_ratios[p][r] = 0.0;
2248  } else {
2249 
2250  Uint4 i = 0; /* loop index */
2251 
2252  /* beta( Sum_j(f_j r_ij) ) in formula 2 */
2253  double pseudo = 0.0;
2254  /* Renamed to match the formula in the paper */
2255  const double kBeta = pseudoWeight;
2256  double numerator = 0.0; /* intermediate term */
2257  double denominator = 0.0; /* intermediate term */
2258  double qOverPEstimate = 0.0; /* intermediate term */
2259 
2260  /* As specified in 2001 paper, underlying matrix frequency
2261  ratios are used here */
2262  for (i = 0; i < sbp->alphabet_size; i++) {
2263  if (sbp->matrix->data[r][i] != BLAST_SCORE_MIN) {
2264  pseudo += (seq_weights->match_weights[p][i] *
2265  freq_ratios->data[r][i]);
2266  }
2267  }
2268  pseudo *= kBeta;
2269 
2270  numerator =
2271  (observations * seq_weights->match_weights[p][r] /
2272  seq_weights->std_prob[r])
2273  + pseudo;
2274 
2275  denominator = observations + kBeta;
2276 
2277  ASSERT(denominator != 0.0);
2278 
2279  qOverPEstimate = numerator/denominator;
2280 
2281  /* Note artificial multiplication by standard probability
2282  * to normalize */
2283  internal_pssm->freq_ratios[p][r] = qOverPEstimate *
2284  seq_weights->std_prob[r];
2285 
2286  }
2287  }
2288  }
2289 
2290  freq_ratios = _PSIMatrixFrequencyRatiosFree(freq_ratios);
2291 
2292  return PSI_SUCCESS;
2293 }
2294 
2295 
2296 
2297 /* port of posInitializeInformation */
2298 double*
2300  Int4** score_mat,
2301  const double* std_prob,
2302  const Uint1* query,
2303  Uint4 query_length,
2304  Uint4 alphabet_sz,
2305  double lambda)
2306 {
2307  double* retval = NULL; /* the return value */
2308  Uint4 p = 0; /* index on positions */
2309  Uint4 r = 0; /* index on residues */
2310 
2311  if ( !std_prob || !score_mat ) {
2312  return NULL;
2313  }
2314 
2315  retval = (double*) calloc(query_length, sizeof(double));
2316  if ( !retval ) {
2317  return NULL;
2318  }
2319 
2320  for (p = 0; p < query_length; p++) {
2321 
2322  double info_sum = 0.0;
2323 
2324  for (r = 0; r < alphabet_sz; r++) {
2325 
2326  if (std_prob[r] > kEpsilon) {
2327  Int4 score = score_mat[query[p]][r];
2328  double exponent = exp(score * lambda);
2329  double tmp = std_prob[r] * exponent;
2330  info_sum += tmp * log(tmp/std_prob[r])/ NCBIMATH_LN2;
2331  }
2332 
2333  }
2334  retval[p] = info_sum;
2335  }
2336 
2337  return retval;
2338 }
2339 
2340 double*
2342  double** freq_ratios,
2343  const double* std_prob,
2344  Uint4 query_length,
2345  Uint4 alphabet_sz)
2346 {
2347  double* retval = NULL; /* the return value */
2348  Uint4 p = 0; /* index on positions */
2349  Uint4 r = 0; /* index on residues */
2350 
2351  if ( !std_prob || !freq_ratios ) {
2352  return NULL;
2353  }
2354 
2355  retval = (double*) calloc(query_length, sizeof(double));
2356  if ( !retval ) {
2357  return NULL;
2358  }
2359 
2360  for (p = 0; p < query_length; p++) {
2361 
2362  double info_sum = 0.0;
2363 
2364  for (r = 0; r < alphabet_sz; r++) {
2365 
2366  if (std_prob[r] > kEpsilon) {
2367  /* Division compensates for multiplication in
2368  * _PSIComputeFreqRatios */
2369  double qOverPEstimate = freq_ratios[p][r] / std_prob[r];
2370  if (qOverPEstimate > kEpsilon) {
2371  info_sum +=
2372  freq_ratios[p][r] * log(qOverPEstimate) / NCBIMATH_LN2;
2373  }
2374  }
2375 
2376  }
2377  retval[p] = info_sum;
2378  }
2379 
2380  return retval;
2381 }
2382 
2383 
2384 /****************************************************************************/
2385 /**************** Convert residue frequencies to PSSM stage *****************/
2386 
2387 /* FIXME: Answer questions
2388  FIXME: need ideal_labmda, regular scoring matrix, length of query
2389  port of posFreqsToMatrix
2390 */
2391 int
2393  const Uint1* query,
2394  const BlastScoreBlk* sbp,
2395  const double* std_probs)
2396 {
2397  const Uint4 kXResidue = AMINOACID_TO_NCBISTDAA['X'];
2398  const Uint4 kStarResidue = AMINOACID_TO_NCBISTDAA['*'];
2399  Uint4 i = 0;
2400  Uint4 j = 0;
2401  SFreqRatios* freq_ratios = NULL; /* only needed when there are not
2402  residue frequencies for a given
2403  column */
2404  double ideal_lambda = 0.0;
2405 
2406  if ( !internal_pssm || !sbp || !std_probs )
2407  return PSIERR_BADPARAM;
2408 
2409  ideal_lambda = sbp->kbp_ideal->Lambda;
2410  freq_ratios = _PSIMatrixFrequencyRatiosNew(sbp->name);
2411 
2412  /* Each column is a position in the query */
2413  for (i = 0; i < internal_pssm->ncols; i++) {
2414 
2415  /* True if all frequencies in column i are zero */
2416  Boolean is_unaligned_column = TRUE;
2417  const Uint4 kResidue = query[i];
2418 
2419  for (j = 0; j < (Uint4) sbp->alphabet_size; j++) {
2420 
2421  double qOverPEstimate = 0.0;
2422 
2423  /* Division compensates for multiplication in
2424  * _PSIComputeFreqRatios */
2425  if (std_probs[j] > kEpsilon) {
2426  qOverPEstimate =
2427  internal_pssm->freq_ratios[i][j] / std_probs[j];
2428  }
2429 
2430  if (is_unaligned_column && qOverPEstimate != 0.0) {
2431  is_unaligned_column = FALSE;
2432  }
2433 
2434  /* Populate scaled matrix */
2435  if (qOverPEstimate == 0.0 || std_probs[j] < kEpsilon) {
2436  internal_pssm->scaled_pssm[i][j] = BLAST_SCORE_MIN;
2437  } else {
2438  double tmp = log(qOverPEstimate)/ideal_lambda;
2439  internal_pssm->scaled_pssm[i][j] = (int)
2441  }
2442 
2443  if ( (j == kXResidue || j == kStarResidue) &&
2444  (sbp->matrix->data[kResidue][kXResidue] != BLAST_SCORE_MIN) ) {
2445  internal_pssm->scaled_pssm[i][j] =
2446  sbp->matrix->data[kResidue][j] * kPSIScaleFactor;
2447  }
2448  }
2449 
2450  if (is_unaligned_column) {
2451  for (j = 0; j < (Uint4) sbp->alphabet_size; j++) {
2452 
2453  internal_pssm->pssm[i][j] = sbp->matrix->data[kResidue][j];
2454 
2455  if (freq_ratios->data[kResidue][j] != 0.0) {
2456  double tmp =
2457  kPSIScaleFactor * freq_ratios->bit_scale_factor *
2458  log(freq_ratios->data[kResidue][j])/NCBIMATH_LN2;
2459 
2460  internal_pssm->scaled_pssm[i][j] = (int)BLAST_Nint(tmp);
2461  } else {
2462  internal_pssm->scaled_pssm[i][j] = BLAST_SCORE_MIN;
2463  }
2464  }
2465  }
2466  }
2467 
2468  freq_ratios = _PSIMatrixFrequencyRatiosFree(freq_ratios);
2469 
2470  return PSI_SUCCESS;
2471 }
2472 
2473 /****************************************************************************/
2474 /************************* Scaling of PSSM stage ****************************/
2475 
2476 /* FIXME: change so that only lambda is calculated inside the loop that scales
2477  the matrix and kappa is calculated before returning from this function.
2478  Scaling factor should be optional argument to accomodate kappa.c's needs?
2479 */
2480 int
2482  const double* std_probs,
2483  _PSIInternalPssmData* internal_pssm,
2484  BlastScoreBlk* sbp)
2485 {
2486  Boolean first_time = TRUE;
2487  Uint4 index = 0; /* loop index */
2488  int** scaled_pssm = NULL;
2489  int** pssm = NULL;
2490  double factor;
2491  double factor_low = 1.0;
2492  double factor_high = 1.0;
2493  double ideal_lambda = 0.0; /* ideal value of ungapped lambda for
2494  underlying scoring matrix */
2495  double new_lambda = 0.0; /* Karlin-Altschul parameter calculated
2496  from scaled matrix*/
2497 
2498  Uint4 query_length = 0;
2499  Boolean too_high = TRUE;
2500 
2501  if ( !internal_pssm || !sbp || !query || !std_probs )
2502  return PSIERR_BADPARAM;
2503 
2504  ASSERT(sbp->kbp_psi[0]);
2505  ASSERT(sbp->kbp_ideal);
2506 
2507  scaled_pssm = internal_pssm->scaled_pssm;
2508  pssm = internal_pssm->pssm;
2509  ideal_lambda = sbp->kbp_ideal->Lambda;
2510  query_length = internal_pssm->ncols;
2511 
2512  factor = 1.0;
2513  for ( ; ; ) {
2514  Uint4 i = 0;
2515  Uint4 j = 0;
2516 
2517  for (i = 0; i < internal_pssm->ncols; i++) {
2518  for (j = 0; j < internal_pssm->nrows; j++) {
2519  if (scaled_pssm[i][j] != BLAST_SCORE_MIN) {
2520  pssm[i][j] =
2521  (int)BLAST_Nint(factor*scaled_pssm[i][j]/kPSIScaleFactor);
2522  } else {
2523  pssm[i][j] = BLAST_SCORE_MIN;
2524  }
2525  }
2526  }
2527  _PSIUpdateLambdaK((const int**)pssm, query, query_length,
2528  std_probs, sbp);
2529 
2530  new_lambda = sbp->kbp_psi[0]->Lambda;
2531 
2532  if (new_lambda > ideal_lambda) {
2533  if (first_time) {
2534  factor_high = 1.0 + kPositScalingPercent;
2535  factor = factor_high;
2536  factor_low = 1.0;
2537  too_high = TRUE;
2538  first_time = FALSE;
2539  } else {
2540  if (too_high == FALSE) {
2541  break;
2542  }
2543  factor_high += (factor_high - 1.0);
2544  factor = factor_high;
2545  }
2546  } else if (new_lambda > 0) {
2547  if (first_time) {
2548  factor_high = 1.0;
2549  factor_low = 1.0 - kPositScalingPercent;
2550  factor = factor_low;
2551  too_high = FALSE;
2552  first_time = FALSE;
2553  } else {
2554  if (too_high == TRUE) {
2555  break;
2556  }
2557  factor_low += (factor_low - 1.0);
2558  factor = factor_low;
2559  }
2560  } else {
2561  return PSIERR_POSITIVEAVGSCORE;
2562  }
2563  }
2564 
2565  /* Binary search for kPositScalingNumIterations times */
2566  for (index = 0; index < kPositScalingNumIterations; index++) {
2567  Uint4 i = 0;
2568  Uint4 j = 0;
2569 
2570  factor = (factor_high + factor_low)/2;
2571 
2572  for (i = 0; i < internal_pssm->ncols; i++) {
2573  for (j = 0; j < internal_pssm->nrows; j++) {
2574  if (scaled_pssm[i][j] != BLAST_SCORE_MIN) {
2575  pssm[i][j] =
2576  (int)BLAST_Nint(factor*scaled_pssm[i][j]/kPSIScaleFactor);
2577  } else {
2578  pssm[i][j] = BLAST_SCORE_MIN;
2579  }
2580  }
2581  }
2582 
2583  _PSIUpdateLambdaK((const int**)pssm, query, query_length,
2584  std_probs, sbp);
2585 
2586  new_lambda = sbp->kbp_psi[0]->Lambda;
2587 
2588  if (new_lambda > ideal_lambda) {
2589  factor_low = factor;
2590  } else {
2591  factor_high = factor;
2592  }
2593  }
2594 
2595  return PSI_SUCCESS;
2596 }
2597 
2598 int
2599 _IMPALAScaleMatrix(const Uint1* query, const double* std_probs,
2600  _PSIInternalPssmData* internal_pssm,
2601  BlastScoreBlk* sbp,
2602  double scaling_factor)
2603 {
2604  Kappa_posSearchItems *posSearch = NULL;
2605  Kappa_compactSearchItems* compactSearch = NULL;
2606  int retval = PSI_SUCCESS;
2607 
2608  posSearch = Kappa_posSearchItemsNew(internal_pssm->ncols,
2609  sbp->name,
2610  internal_pssm->scaled_pssm,
2611  internal_pssm->freq_ratios);
2612  compactSearch = Kappa_compactSearchItemsNew(query, internal_pssm->ncols,
2613  sbp);
2614 
2615  retval = Kappa_impalaScaling(posSearch, compactSearch,
2616  scaling_factor, TRUE, sbp);
2617 
2618  /* Overwrite unscaled PSSM with scaled PSSM */
2619  _PSICopyMatrix_int(internal_pssm->pssm, internal_pssm->scaled_pssm,
2620  internal_pssm->ncols, internal_pssm->nrows);
2621 
2622  posSearch = Kappa_posSearchItemsFree(posSearch);
2623  compactSearch = Kappa_compactSearchItemsFree(compactSearch);
2624 
2625  return retval;
2626 }
2627 
2628 Uint4
2630 {
2631  const Uint1 kXResidue = AMINOACID_TO_NCBISTDAA['X'];
2632  Uint4 retval = 0; /* the return value */
2633  Uint4 i = 0; /* loop index */
2634 
2635  ASSERT(seq);
2636 
2637  for(i = 0; i < length; i++) {
2638  if (seq[i] != kXResidue) {
2639  retval++;
2640  }
2641  }
2642 
2643  return retval;
2644 }
2645 
2647 _PSIComputeScoreProbabilities(const int** pssm, /* [in] */
2648  const Uint1* query, /* [in] */
2649  Uint4 query_length, /* [in] */
2650  const double* std_probs, /* [in] */
2651  const BlastScoreBlk* sbp) /* [in] */
2652 {
2653  const Uint1 kXResidue = AMINOACID_TO_NCBISTDAA['X'];
2654  Uint1 aa_alphabet[BLASTAA_SIZE]; /* ncbistdaa alphabet */
2655  Uint4 alphabet_size = 0; /* number of elements populated
2656  in array above */
2657  Uint4 effective_length = 0; /* length of query w/o Xs */
2658  Uint4 p = 0; /* index on positions */
2659  Uint4 r = 0; /* index on residues */
2660  int s = 0; /* index on scores */
2661  int min_score = BLAST_SCORE_MAX; /* minimum score in pssm */
2662  int max_score = BLAST_SCORE_MIN; /* maximum score in pssm */
2663  Blast_ScoreFreq* score_freqs = NULL; /* score frequencies */
2664 
2665  ASSERT(pssm);
2666  ASSERT(query);
2667  ASSERT(std_probs);
2668  ASSERT(sbp);
2670 
2671  alphabet_size = (Uint4) Blast_GetStdAlphabet(sbp->alphabet_code,
2672  aa_alphabet, BLASTAA_SIZE);
2673  if (alphabet_size <= 0) {
2674  return NULL;
2675  }
2676  ASSERT(alphabet_size < BLASTAA_SIZE);
2677 
2678  effective_length = _PSISequenceLengthWithoutX(query, query_length);
2679 
2680  /* Get the minimum and maximum scores */
2681  for (p = 0; p < query_length; p++) {
2682  if (query[p] == kXResidue) {
2683  continue;
2684  }
2685  for (r = 0; r < alphabet_size; r++) {
2686  const int kScore = pssm[p][aa_alphabet[r]];
2687 
2688  if (kScore <= BLAST_SCORE_MIN || kScore >= BLAST_SCORE_MAX) {
2689  continue;
2690  }
2691  max_score = MAX(kScore, max_score);
2692  min_score = MIN(kScore, min_score);
2693  }
2694  }
2695  ASSERT(min_score != BLAST_SCORE_MAX);
2696  ASSERT(max_score != BLAST_SCORE_MIN);
2697 
2698  score_freqs = Blast_ScoreFreqNew(min_score, max_score);
2699  if ( !score_freqs ) {
2700  return NULL;
2701  }
2702 
2703  score_freqs->obs_min = min_score;
2704  score_freqs->obs_max = max_score;
2705  for (p = 0; p < query_length; p++) {
2706  if (query[p] == kXResidue) {
2707  continue;
2708  }
2709 
2710  for (r = 0; r < alphabet_size; r++) {
2711  const int kScore = pssm[p][aa_alphabet[r]];
2712 
2713  if (kScore <= BLAST_SCORE_MIN || kScore >= BLAST_SCORE_MAX) {
2714  continue;
2715  }
2716 
2717  /* Increment the weight for the score in position p, residue r */
2718  score_freqs->sprob[kScore] +=
2719  (std_probs[aa_alphabet[r]]/effective_length);
2720  }
2721  }
2722 
2723  ASSERT(score_freqs->score_avg == 0.0);
2724  for (s = min_score; s <= max_score; s++) {
2725  score_freqs->score_avg += (s * score_freqs->sprob[s]);
2726  }
2727 
2728  return score_freqs;
2729 }
2730 
2731 void
2732 _PSIUpdateLambdaK(const int** pssm, /* [in] */
2733  const Uint1* query, /* [in] */
2734  Uint4 query_length, /* [in] */
2735  const double* std_probs, /* [in] */
2736  BlastScoreBlk* sbp) /* [in|out] */
2737 {
2738  Blast_ScoreFreq* score_freqs =
2739  _PSIComputeScoreProbabilities(pssm, query, query_length,
2740  std_probs, sbp);
2741 
2742  /* Calculate lambda and K */
2743  Blast_KarlinBlkUngappedCalc(sbp->kbp_psi[0], score_freqs);
2744 
2745  ASSERT(sbp->kbp_ideal);
2746  ASSERT(sbp->kbp_psi[0]);
2747  ASSERT(sbp->kbp_gap_std[0]);
2748  ASSERT(sbp->kbp_gap_psi[0]);
2749 
2750  sbp->kbp_gap_psi[0]->K =
2751  sbp->kbp_psi[0]->K * sbp->kbp_gap_std[0]->K / sbp->kbp_ideal->K;
2752  sbp->kbp_gap_psi[0]->logK = log(sbp->kbp_gap_psi[0]->K);
2753 
2754  score_freqs = Blast_ScoreFreqFree(score_freqs);
2755 }
2756 
2757 
2758 /****************************************************************************/
2759 /* Function definitions for auxiliary functions for the stages above */
2760 
2761 /** Check if we still need this sequence */
2762 static void
2763 s_PSIDiscardIfUnused(_PSIPackedMsa* msa, unsigned int seq_index)
2764 {
2765  Boolean contains_aligned_regions = FALSE;
2766  unsigned int i = 0;
2767 
2768  for (i = 0; i < msa->dimensions->query_length; i++) {
2769  if (msa->data[seq_index][i].is_aligned) {
2770  contains_aligned_regions = TRUE;
2771  break;
2772  }
2773  }
2774 
2775  if ( !contains_aligned_regions ) {
2776  msa->use_sequence[seq_index] = FALSE;
2777  }
2778 }
2779 
2780 int
2782  unsigned int seq_index,
2783  unsigned int start,
2784  unsigned int stop)
2785 {
2786  _PSIPackedMsaCell* sequence_position = NULL;
2787  unsigned int i = 0;
2788 
2789  if ( !msa ||
2790  seq_index == 0 ||
2791  seq_index > msa->dimensions->num_seqs + 1 ||
2792  stop > msa->dimensions->query_length) {
2793  return PSIERR_BADPARAM;
2794  }
2795 
2796  sequence_position = msa->data[seq_index];
2797  for (i = start; i < stop; i++) {
2798  sequence_position[i].letter = 0;
2799  sequence_position[i].is_aligned = FALSE;
2800  }
2801 
2802  s_PSIDiscardIfUnused(msa, seq_index);
2803 
2804  return PSI_SUCCESS;
2805 }
2806 
2807 /****************************************************************************/
2808 int
2810  const _PSIAlignedBlock* aligned_block,
2811  const _PSISequenceWeights* seq_weights,
2812  const _PSIInternalPssmData* internal_pssm,
2813  PSIDiagnosticsResponse* diagnostics)
2814 {
2815  Uint4 p = 0; /* index on positions */
2816  Uint4 r = 0; /* index on residues */
2817  const Uint1 kXResidue = AMINOACID_TO_NCBISTDAA['X'];
2818 
2819  if ( !diagnostics || !msa || !aligned_block || !seq_weights ||
2820  !internal_pssm || !internal_pssm->freq_ratios ) {
2821  return PSIERR_BADPARAM;
2822  }
2823 
2824  ASSERT(msa->dimensions->query_length == diagnostics->query_length);
2825 
2826  if (diagnostics->information_content) {
2828  internal_pssm->freq_ratios, seq_weights->std_prob,
2829  diagnostics->query_length,
2830  diagnostics->alphabet_size);
2831  if ( !info ) {
2832  return PSIERR_OUTOFMEM;
2833  }
2834  for (p = 0; p < diagnostics->query_length; p++) {
2835  diagnostics->information_content[p] = info[p];
2836  }
2837  sfree(info);
2838  }
2839 
2840  if (diagnostics->residue_freqs) {
2841  for (p = 0; p < diagnostics->query_length; p++) {
2842  for (r = 0; r < diagnostics->alphabet_size; r++) {
2843  diagnostics->residue_freqs[p][r] = msa->residue_counts[p][r];
2844  }
2845  }
2846  }
2847 
2848  if (diagnostics->weighted_residue_freqs) {
2849  for (p = 0; p < diagnostics->query_length; p++) {
2850  for (r = 0; r < diagnostics->alphabet_size; r++) {
2851  diagnostics->weighted_residue_freqs[p][r] =
2852  seq_weights->match_weights[p][r];
2853  }
2854  }
2855  }
2856 
2857  if (diagnostics->frequency_ratios) {
2858  for (p = 0; p < diagnostics->query_length; p++) {
2859  for (r = 0; r < diagnostics->alphabet_size; r++) {
2860  diagnostics->frequency_ratios[p][r] =
2861  internal_pssm->freq_ratios[p][r];
2862  }
2863  }
2864  }
2865 
2866  if (diagnostics->gapless_column_weights) {
2867  for (p = 0; p < diagnostics->query_length; p++) {
2868  if (msa->num_matching_seqs[p] > 1
2869  && msa->cell[0][p].letter != kXResidue) {
2870 
2871  diagnostics->gapless_column_weights[p] =
2872  seq_weights->gapless_column_weights[p]
2873  / internal_pssm->pseudocounts[p];
2874 
2875  diagnostics->gapless_column_weights[p] *=
2876  (seq_weights->sigma[p] / aligned_block->size[p] - 1);
2877  }
2878  else {
2879  diagnostics->gapless_column_weights[p] = 0.0;
2880  }
2881  }
2882  }
2883 
2884  if (diagnostics->sigma) {
2885  for (p = 0; p < diagnostics->query_length; p++) {
2886  diagnostics->sigma[p] = seq_weights->sigma[p];
2887  }
2888  }
2889 
2890  if (diagnostics->interval_sizes) {
2891  for (p = 0; p < diagnostics->query_length; p++) {
2892  diagnostics->interval_sizes[p] = aligned_block->size[p];
2893  }
2894  }
2895 
2896  if (diagnostics->num_matching_seqs) {
2897  for (p = 0; p < diagnostics->query_length; p++) {
2898  diagnostics->num_matching_seqs[p] = msa->num_matching_seqs[p];
2899  }
2900  }
2901 
2902  if (diagnostics->independent_observations) {
2903  for (p = 0; p < diagnostics->query_length; p++) {
2904  diagnostics->independent_observations[p] =
2905  seq_weights->independent_observations[p];
2906  }
2907  }
2908  return PSI_SUCCESS;
2909 }
2910 
2911 int
2913  const _PSISequenceWeights* seq_weights,
2914  const _PSIInternalPssmData* internal_pssm,
2915  PSIDiagnosticsResponse* diagnostics)
2916 {
2917  Uint4 p = 0; /* index on positions */
2918  Uint4 r = 0; /* index on residues */
2919 
2920  if ( !diagnostics || !cd_msa || !seq_weights ||
2921  !internal_pssm || !internal_pssm->freq_ratios ) {
2922  return PSIERR_BADPARAM;
2923  }
2924 
2925  ASSERT(cd_msa->dimensions->query_length == diagnostics->query_length);
2926 
2927  if (diagnostics->information_content) {
2929  internal_pssm->freq_ratios, seq_weights->std_prob,
2930  diagnostics->query_length,
2931  diagnostics->alphabet_size);
2932  if ( !info ) {
2933  return PSIERR_OUTOFMEM;
2934  }
2935  for (p = 0; p < diagnostics->query_length; p++) {
2936  diagnostics->information_content[p] = info[p];
2937  }
2938  sfree(info);
2939  }
2940 
2941  if (diagnostics->weighted_residue_freqs) {
2942  for (p = 0; p < diagnostics->query_length; p++) {
2943  for (r = 0; r < diagnostics->alphabet_size; r++) {
2944  diagnostics->weighted_residue_freqs[p][r] =
2945  seq_weights->match_weights[p][r];
2946  }
2947  }
2948  }
2949 
2950  if (diagnostics->frequency_ratios) {
2951  for (p = 0; p < diagnostics->query_length; p++) {
2952  for (r = 0; r < diagnostics->alphabet_size; r++) {
2953  diagnostics->frequency_ratios[p][r] =
2954  internal_pssm->freq_ratios[p][r];
2955  }
2956  }
2957  }
2958 
2959  if (diagnostics->independent_observations) {
2960  for (p = 0; p < diagnostics->query_length; p++) {
2961  diagnostics->independent_observations[p] =
2962  seq_weights->independent_observations[p];
2963  }
2964  }
2965  return PSI_SUCCESS;
2966 }
2967 
2968 
2969 /** Reorders in the same manner as returned by Blast_GetMatrixBackgroundFreq
2970  this function is a copy of posit.c:fillColumnProbabilities
2971 */
2972 static void s_fillColumnProbabilities(double *probabilities,
2973  const _PSISequenceWeights *posSearch,
2974  Int4 columnNumber)
2975 {
2976  int charOrder[EFFECTIVE_ALPHABET]; /*standard order of letters according to S. Altschul*/
2977  int c; /*loop index*/
2978 
2979  charOrder[0] = 1; /*A*/
2980  charOrder[1] = 16; /*R*/
2981  charOrder[2] = 13; /*N*/
2982  charOrder[3] = 4; /*D*/
2983  charOrder[4] = 3; /*C*/
2984  charOrder[5] = 15; /*Q*/
2985  charOrder[6] = 5; /*E*/
2986  charOrder[7] = 7; /*G*/
2987  charOrder[8] = 8; /*H*/
2988  charOrder[9] = 9; /*I*/
2989  charOrder[10] = 11; /*L*/
2990  charOrder[11] = 10; /*K*/
2991  charOrder[12] = 12; /*M*/
2992  charOrder[13] = 6; /*F*/
2993  charOrder[14] = 14; /*P*/
2994  charOrder[15] = 17; /*S*/
2995  charOrder[16] = 18; /*T*/
2996  charOrder[17] = 20; /*W*/
2997  charOrder[18] = 22; /*Y*/
2998  charOrder[19] = 19; /*V*/
2999 
3000  for(c = 0; c < EFFECTIVE_ALPHABET; c++)
3001  probabilities[c] = posSearch->match_weights[columnNumber][charOrder[c]];
3002 }
3003 
3004 /** adjust the probabilities by assigning observations weight
3005  to initialProbabilities and standardWeight to standardProbabilities
3006  copy of posit.c:adjustColumnProbabilities
3007  @param initialProbabilities starting probabilities [in]
3008  @param probabilitiesToReturn return value [out]
3009  @param standardWeight small number of pseudocounts to
3010  avoid 0 probabilities [in]
3011  @param standardProbabilities background probabilities [in]
3012  @param observations expected number of observations [in]
3013 */
3014 static void s_adjustColumnProbabilities(double *initialProbabilities,
3015  double *probabilitiesToReturn,
3016  double standardWeight,
3017  const double *standardProbabilities,
3018  double observations)
3019 {
3020  double intermediateSums[EFFECTIVE_ALPHABET]; /*weighted sums for each letter*/
3021  double overallSum; /*overall sum of weightedSums*/
3022  int c; /*loop index*/
3023 
3024  overallSum = 0.0;
3025  for(c = 0; c < EFFECTIVE_ALPHABET; c++) {
3026  intermediateSums[c] =
3027  (initialProbabilities[c] * observations) +
3028  (standardProbabilities[c] * standardWeight);
3029  overallSum += intermediateSums[c];
3030  }
3031  for(c = 0; c < EFFECTIVE_ALPHABET; c++)
3032  probabilitiesToReturn[c] = intermediateSums[c]/overallSum;
3033 }
3034 
3035 /*compute relative entropy of first distribution to second distribution
3036  copy of posit.c:computeRelativeEntropy */
3037 
3038 const double kPosEpsilon = 0.0001; /**< minimum return value of s_computeRelativeEntropy */
3039 
3040 /** compute relative entropy of first distribution to second distribution
3041  A copy of posit.c:computeRelativeEntropy
3042  @param newDistribution working set [in]
3043  @param backgroundProbabilities standard set [in]
3044 */
3045 static double s_computeRelativeEntropy(const double *newDistribution,
3046  const double *backgroundProbabilities)
3047 {
3048  Int4 c; /*loop index*/
3049  double returnValue; /*value to return*/
3050 
3051  returnValue = 0.0;
3052  for(c = 0; c < EFFECTIVE_ALPHABET; c++) {
3053  if (newDistribution[c] > kPosEpsilon)
3054  returnValue += (newDistribution[c] *
3055  log (newDistribution[c]/backgroundProbabilities[c]));
3056  }
3057  if (returnValue < kPosEpsilon)
3058  returnValue = kPosEpsilon;
3059  return(returnValue);
3060 }
3061 
3062 
3063 /*initialize the expected number of observations
3064  use background probabilities for this matrix
3065  Calculate exp. # of distinct aa's as a function of independent trials
3066  copy of posit.c:initializeExpNumObservations
3067 */
3068 static void s_initializeExpNumObservations(double *expno,
3069  const double *backgroundProbabilities)
3070 
3071 {
3072 int j,k ; /*loop indices*/
3073 double weighted_sum; /*20 - this is how many distinct
3074  amino acids are expected*/
3075 
3076  expno[0] = 0;
3077  for (j=1;j<MAX_IND_OBSERVATIONS;++j) {
3078  weighted_sum = 0;
3079  for (k=0;k<EFFECTIVE_ALPHABET;++k)
3080  weighted_sum += exp(j*log(1.0-backgroundProbabilities[k]));
3081  expno[j] = EFFECTIVE_ALPHABET-weighted_sum;
3082  }
3083 }
3084 
3085 /* copy of posit.c:columnSpecificPseudocounts */
3086 static double s_columnSpecificPseudocounts(const _PSISequenceWeights *posSearch,
3087  int columnNumber,
3088  const double *backgroundProbabilities,
3089  const double observations)
3090 {
3091  double columnProbabilitiesInitial[EFFECTIVE_ALPHABET];
3092  double columnProbabilitiesAdjusted[EFFECTIVE_ALPHABET];
3093  double relativeEntropy; /*relative entropy of this column to background probs.*/
3094  double alpha; /*intermediate term*/
3095  double pseudoDenominator; /*intermediate term*/
3096  double returnValue;
3097  /* Constant values, were #defines in posit.c */
3098  const double kPseudoMult = 500.0; /* Was PSEUDO_MULTIPLIER */
3099  const double kPseudoNumerator = 0.0457; /*numerator of entropy-based method, was PSEUDO_NUMERATOR */
3100  const double kPseudoExponent = 0.8; /*exponent of denominator, was PSEUDO_EXPONENT */
3101  const double kPseudoSmallInitial = 5.5; /*small number of pseudocounts to
3102  avoid 0 probabilities in entropy-based method, was PSEUDO_SMALL_INITIAL */
3103 
3104  s_fillColumnProbabilities(&(columnProbabilitiesInitial[0]), posSearch, columnNumber);
3105  s_adjustColumnProbabilities(&(columnProbabilitiesInitial[0]),
3106  &(columnProbabilitiesAdjusted[0]),
3107  kPseudoSmallInitial,
3108  backgroundProbabilities, observations);
3109  relativeEntropy = s_computeRelativeEntropy(&(columnProbabilitiesAdjusted[0]),
3110  backgroundProbabilities);
3111  pseudoDenominator = pow(relativeEntropy, kPseudoExponent);
3112  alpha = kPseudoNumerator/pseudoDenominator;
3113  if (alpha < (1.0 - kPosEpsilon))
3114  returnValue = kPseudoMult * alpha/ (1- alpha);
3115  else
3116  returnValue = PSEUDO_MAX;
3117 
3118  return(returnValue);
3119 }
3120 
3121 static double s_effectiveObservations(const _PSIAlignedBlock *align_blk,
3122  const _PSISequenceWeights* seq_weights,
3123  int columnNumber, int queryLength,
3124  const double *expno)
3125 {
3126 int i,k; /*loop indices*/
3127 double indep; /*number of independent observations to return*/
3128 int halfNumColumns; /*half the number of columns in the interval, rounded
3129  down*/
3130 int totalDistinctCounts; /*total number of distinct letters in columns
3131  used*/
3132 double aveDistinctAA; /*average number of distinct letters in columns used*/
3133 int columnsAccountedFor; /*how many of the columns had their
3134  distinct count totaled so far*/
3135 
3136 
3137  if (align_blk->pos_extnt[columnNumber].left < 0)
3138  return(0);
3139  if (align_blk->pos_extnt[columnNumber].right >= queryLength)
3140  return(0);
3141 
3142 /* Calculate the average number of distinct amino acids in the half of the
3143  columns within the block in question with the most distinct amino acids;
3144  +2 in the parentheses is for rounding up.*/
3145 
3146  halfNumColumns = MAX(1,(align_blk->pos_extnt[columnNumber].right -
3147  align_blk->pos_extnt[columnNumber].left+2)/2);
3148  k = EFFECTIVE_ALPHABET;
3149  columnsAccountedFor = 0;
3150  totalDistinctCounts = 0;
3151  while (columnsAccountedFor < halfNumColumns) {
3152  ASSERT(k >= 0);
3153  totalDistinctCounts += (seq_weights->posDistinctDistrib[columnNumber][k] *k);
3154  columnsAccountedFor += seq_weights->posDistinctDistrib[columnNumber][k];
3155  if (columnsAccountedFor > halfNumColumns) {
3156  totalDistinctCounts -=
3157  ((columnsAccountedFor - halfNumColumns) * k);
3158  columnsAccountedFor = halfNumColumns;
3159  }
3160  k--;
3161  }
3162  aveDistinctAA = ((double) totalDistinctCounts)/
3163  ((double) columnsAccountedFor);
3164 
3165 /* Then use the following code to calculate the number of
3166  independent observations corresponding to
3167  aveDistinctAA.
3168 */
3169 
3170  for (i=1;i<MAX_IND_OBSERVATIONS && expno[i]<=aveDistinctAA;++i);
3171  indep = (i==MAX_IND_OBSERVATIONS) ? i :
3172  i-(expno[i]-aveDistinctAA)/(expno[i]-expno[i-1]);
3173  indep = MIN(indep, seq_weights->posNumParticipating[columnNumber]);
3174  indep = MAX(0,indep - 1);
3175  return(indep);
3176 }
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
Definition: blast_def.h:112
Int2 DynamicUint4Array_Append(SDynamicUint4Array *arr, Uint4 element)
Append a Uint4 to the dynamic array structure.
SDynamicUint4Array * DynamicUint4Array_Dup(const SDynamicUint4Array *src)
Make a deep copy of the src dynamic array.
Int4 DynamicUint4Array_Copy(SDynamicUint4Array *dest, const SDynamicUint4Array *src)
Make a shallow copy of the src dynamic array into the dest dynamic array.
SDynamicUint4Array * DynamicUint4ArrayNewEx(Uint4 init_num_elements)
Allocate a dynamic array of Uint4 with an initial size of specified as an argument to the function.
Boolean DynamicUint4Array_AreEqual(const SDynamicUint4Array *a, const SDynamicUint4Array *b)
Compares dynamic arrays a and b for equality of its contents.
SDynamicUint4Array * DynamicUint4ArrayFree(SDynamicUint4Array *arr)
Deallocates a dynamic array structure.
Declarations for various dynamic array types.
int Kappa_impalaScaling(Kappa_posSearchItems *posSearch, Kappa_compactSearchItems *compactSearch, double scalingFactor, Boolean doBinarySearch, BlastScoreBlk *sbp)
Copied from posit2.c.
Definition: blast_posit.c:393
Kappa_compactSearchItems * Kappa_compactSearchItemsNew(const Uint1 *query, unsigned int queryLength, BlastScoreBlk *sbp)
Creates a new Kappa_compactSearchItems structure.
Definition: blast_posit.c:101
Kappa_posSearchItems * Kappa_posSearchItemsFree(Kappa_posSearchItems *posSearch)
Deallocates the Kappa_posSearchItems structure.
Definition: blast_posit.c:75
Kappa_compactSearchItems * Kappa_compactSearchItemsFree(Kappa_compactSearchItems *compactSearch)
Deallocates the Kappa_compactSearchItems structure.
Definition: blast_posit.c:140
Kappa_posSearchItems * Kappa_posSearchItemsNew(unsigned int queryLength, const char *matrix_name, int **posPrivateMatrix, double **posFreqs)
Allocates a new Kappa_posSearchItems structure.
Definition: blast_posit.c:44
Port of posit.h structures and impalaScaling for implementing composition based statistics for PSI-BL...
static int s_PSIValidateAlignedColumns(const _PSIMsa *msa)
Validate that there are no unaligned columns or columns which only contain gaps in the multiple seque...
static NCBI_INLINE void _handleEitherAlignedNeitherX(_PSIAlignmentTraits *traits, _EPSIPurgeFsmState *state, Uint4 position)
Handle the event when either position is aligned and neither is X.
static NCBI_INLINE void _handleEitherAlignedEitherX(_PSIAlignmentTraits *traits, _EPSIPurgeFsmState *state)
Handle the event when either position is aligned and either is X.
static NCBI_INLINE void _handleNeitherAligned(_PSIAlignmentTraits *traits, _EPSIPurgeFsmState *state, _PSIPackedMsa *msa, Uint4 seq_index, double max_percent_identity)
Handles neither is aligned event FIXME: document better.
int _PSIComputeAlignmentBlocks(const _PSIMsa *msa, _PSIAlignedBlock *aligned_blocks)
Main function to compute aligned blocks' properties for each position within multiple alignment (stag...
static void _PSIGetLeftExtents(const _PSIMsa *msa, Uint4 seq_index)
Computes the left extents for the sequence identified by seq_index.
int _PSIConvertFreqRatiosToPSSM(_PSIInternalPssmData *internal_pssm, const Uint1 *query, const BlastScoreBlk *sbp, const double *std_probs)
Converts the PSSM's frequency ratios obtained in the previous stage to a PSSM of scores.
static void _PSICalculateMatchWeights(const _PSIMsa *msa, Uint4 position, const SDynamicUint4Array *aligned_seqs, _PSISequenceWeights *seq_weights)
Calculate the weighted observed sequence weights.
int _PSIComputeFreqRatios(const _PSIMsa *msa, const _PSISequenceWeights *seq_weights, const BlastScoreBlk *sbp, const _PSIAlignedBlock *aligned_blocks, Int4 pseudo_count, Boolean nsg_compatibility_mode, _PSIInternalPssmData *internal_pssm)
Main function to compute the PSSM's frequency ratios (stage 5).
void ** _PSIAllocateMatrix(unsigned int ncols, unsigned int nrows, unsigned int data_type_sz)
Generic 2 dimensional matrix allocator.
static int s_PSIValidateNoGapsInQuery(const _PSIMsa *msa)
Validate that there are no gaps in the query sequence.
void _PSIStructureGroupCustomization(_PSIMsa *msa)
Enable NCBI structure group customization to discard the query sequence, as this really isn't the res...
int _PSIComputeFreqRatiosFromCDs(const PSICdMsa *cd_msa, const _PSISequenceWeights *seq_weights, const BlastScoreBlk *sbp, Int4 pseudo_count, _PSIInternalPssmData *internal_pssm)
Main function to compute CD-based PSSM's frequency ratios.
_PSISequenceWeights * _PSISequenceWeightsNew(const PSIMsaDimensions *dimensions, const BlastScoreBlk *sbp)
Allocates and initializes the _PSISequenceWeights structure.
static void _PSISpreadGapWeights(const _PSIMsa *msa, _PSISequenceWeights *seq_weights, Boolean nsg_compatibility_mode)
Uses disperse method of spreading the gap weights.
_PSIInternalPssmData * _PSIInternalPssmDataNew(Uint4 query_length, Uint4 alphabet_size)
Allocates a new _PSIInternalPssmData structure.
static void _PSIGetAlignedSequencesForPosition(const _PSIMsa *msa, Uint4 position, SDynamicUint4Array *aligned_sequences)
Populates the array aligned_sequences with the indices of the sequences which are part of the multipl...
_PSIAlignedBlock * _PSIAlignedBlockNew(Uint4 query_length)
Allocates and initializes the _PSIAlignedBlock structure.
unsigned int _PSIPackedMsaGetNumberOfAlignedSeqs(const _PSIPackedMsa *msa)
Retrieve the number of aligned sequences in the compact multiple sequence alignment.
int _PSIComputeSequenceWeights(const _PSIMsa *msa, const _PSIAlignedBlock *aligned_blocks, Boolean nsg_compatibility_mode, _PSISequenceWeights *seq_weights)
Main function to calculate the sequence weights.
int _PSISaveDiagnostics(const _PSIMsa *msa, const _PSIAlignedBlock *aligned_block, const _PSISequenceWeights *seq_weights, const _PSIInternalPssmData *internal_pssm, PSIDiagnosticsResponse *diagnostics)
Collects diagnostic information from the process of creating the PSSM.
static int s_PSIValidateParticipatingSequences(const _PSIMsa *msa)
Verify that after purging biased sequences in multiple sequence alignment there are still sequences p...
static void s_fillColumnProbabilities(double *probabilities, const _PSISequenceWeights *posSearch, Int4 columnNumber)
Reorders in the same manner as returned by Blast_GetMatrixBackgroundFreq this function is a copy of p...
double * _PSICalculateInformationContentFromScoreMatrix(Int4 **score_mat, const double *std_prob, const Uint1 *query, Uint4 query_length, Uint4 alphabet_sz, double lambda)
Calculates the information content from the scoring matrix.
static int _PSICheckSequenceWeights(const _PSIMsa *msa, const _PSISequenceWeights *seq_weights, Boolean nsg_compatibility_mode)
Verifies that the sequence weights for each column of the PSSM add up to 1.0.
const double kPosEpsilon
minimum return value of s_computeRelativeEntropy
Blast_ScoreFreq * _PSIComputeScoreProbabilities(const int **pssm, const Uint1 *query, Uint4 query_length, const double *std_probs, const BlastScoreBlk *sbp)
Compute the probabilities for each score in the PSSM.
static double s_effectiveObservations(const _PSIAlignedBlock *align_blk, const _PSISequenceWeights *seq_weights, int columnNumber, int queryLength, const double *expno)
A method to estimate the effetive number of observations in the interval for the specified columnNumb...
static double s_columnSpecificPseudocounts(const _PSISequenceWeights *posSearch, int columnNumber, const double *backgroundProbabilities, const double observations)
copy of posit.c:columnSpecificPseudocounts
int _PSIPurgeBiasedSegments(_PSIPackedMsa *msa)
Main function for keeping only those selected sequences for PSSM construction (stage 2).
static void s_PSIDiscardIfUnused(_PSIPackedMsa *msa, unsigned int seq_index)
Check if we still need this sequence.
int _PSIComputeFrequenciesFromCDs(const PSICdMsa *cd_msa, BlastScoreBlk *sbp, const PSIBlastOptions *options, _PSISequenceWeights *seq_weights)
Main function to calculate CD weights and combine weighted residue counts from matched CDs.
static int s_PSIValidateNoFlankingGaps(const _PSIMsa *msa)
Validate that there are no flanking gaps in the multiple sequence alignment.
static void s_initializeExpNumObservations(double *expno, const double *backgroundProbabilities)
initialize the expected number of observations use background probabilities for this matrix Calculate...
_PSIMsa * _PSIMsaNew(const _PSIPackedMsa *msa, Uint4 alphabet_size)
Allocates and initializes the internal version of the PSIMsa structure (makes a deep copy) for intern...
_EPSIPurgeFsmState
Defines the states of the finite state machine used in s_PSIPurgeSimilarAlignments.
@ eResting
@ eCounting
static void _PSIComputeAlignedRegionLengths(const _PSIMsa *msa, _PSIAlignedBlock *aligned_blocks)
Calculates the aligned blocks lengths in the multiple sequence alignment data structure.
const int kPSIScaleFactor
Successor to POSIT_SCALE_FACTOR.
const double kPSINearIdentical
Percent identity threshold for discarding near-identical matches.
const Uint4 kPositScalingNumIterations
Constant used in scaling PSSM routines: Successor to POSIT_NUM_ITERATIONS.
_PSISequenceWeights * _PSISequenceWeightsFree(_PSISequenceWeights *seq_weights)
Deallocates the _PSISequenceWeights structure.
void ** _PSIDeallocateMatrix(void **matrix, unsigned int ncols)
Generic 2 dimensional matrix deallocator.
void _PSICopyMatrix_int(int **dest, int **src, unsigned int ncols, unsigned int nrows)
Copies src matrix into dest matrix, both of which must be int matrices with dimensions ncols by nrows...
#define MAX_IND_OBSERVATIONS
max number of independent observation for pseudocount calculation
static void s_PSIComputeFrequenciesFromCDsCleanup(double *sum_weights)
static NCBI_INLINE void _handleBothAlignedSameResidueNoX(_PSIAlignmentTraits *traits, _EPSIPurgeFsmState *state)
Handle event when both positions are aligned, using the same residue, but this residue is not X.
Uint4 _PSISequenceLengthWithoutX(const Uint1 *seq, Uint4 length)
Calculates the length of the sequence without including any 'X' residues.
static void s_PSIPurgeSimilarAlignments(_PSIPackedMsa *msa, Uint4 seq_index1, Uint4 seq_index2, double max_percent_identity)
This function compares the sequences in the msa->cell structure indexed by sequence_index1 and seq_in...
static void _PSIGetRightExtents(const _PSIMsa *msa, Uint4 seq_index)
Computes the right extents for the sequence identified by seq_index.
void _PSIUpdateLambdaK(const int **pssm, const Uint1 *query, Uint4 query_length, const double *std_probs, BlastScoreBlk *sbp)
Updates the Karlin-Altschul parameters based on the query sequence and PSSM's score frequencies.
static void _PSIComputePositionExtents(const _PSIMsa *msa, Uint4 seq_index, _PSIAlignedBlock *aligned_blocks)
Computes the aligned blocks extents' for each position for the sequence identified by seq_index.
#define PSEUDO_MAX
effective infinity
static double s_computeRelativeEntropy(const double *newDistribution, const double *backgroundProbabilities)
compute relative entropy of first distribution to second distribution A copy of posit....
static void _PSICalculateNormalizedSequenceWeights(const _PSIMsa *msa, const _PSIAlignedBlock *aligned_blocks, Uint4 position, const SDynamicUint4Array *aligned_seqs, _PSISequenceWeights *seq_weights)
Calculates the position based weights using a modified version of the Henikoff's algorithm presented ...
const unsigned int kQueryIndex
Index into multiple sequence alignment structure for the query sequence.
_PSIInternalPssmData * _PSIInternalPssmDataFree(_PSIInternalPssmData *pssm_data)
Deallocates the _PSIInternalPssmData structure.
int _PSIValidateMSA_StructureGroup(const _PSIMsa *msa)
Structure group validation function for multiple sequence alignment structure.
int _PSIScaleMatrix(const Uint1 *query, const double *std_probs, _PSIInternalPssmData *internal_pssm, BlastScoreBlk *sbp)
Scales the PSSM (stage 7)
int _PSIPurgeAlignedRegion(_PSIPackedMsa *msa, unsigned int seq_index, unsigned int start, unsigned int stop)
Marks the (start, stop] region corresponding to sequence seq_index in alignment so that it is not fur...
_PSIMsa * _PSIMsaFree(_PSIMsa *msa)
Deallocates the _PSIMsa data structure.
#define DEFINE_COPY_MATRIX_FUNCTION(type)
Implements the generic copy matrix functions.
_PSIAlignedBlock * _PSIAlignedBlockFree(_PSIAlignedBlock *aligned_blocks)
Deallocates the _PSIAlignedBlock structure.
const double kEpsilon
Small constant to test against 0.
double * _PSICalculateInformationContentFromFreqRatios(double **freq_ratios, const double *std_prob, Uint4 query_length, Uint4 alphabet_sz)
Calculates the information content from the residue frequencies calculated in stage 5 of the PSSM cre...
static void s_PSIPurgeSelfHits(_PSIPackedMsa *msa)
Remove those sequences which are identical to the query sequence.
#define EFFECTIVE_ALPHABET
size of alphabet used for pseudocounts calculations
int _PSISaveCDDiagnostics(const PSICdMsa *cd_msa, const _PSISequenceWeights *seq_weights, const _PSIInternalPssmData *internal_pssm, PSIDiagnosticsResponse *diagnostics)
Collects diagnostic information from the process of creating the CDD-based PSSM.
static void s_PSIPurgeNearIdenticalAlignments(_PSIPackedMsa *msa)
Keeps only one copy of any aligned sequences which are >kPSINearIdentical% identical to one another.
struct _PSIAlignmentTraits _PSIAlignmentTraits
Auxiliary structure to maintain information about two aligned regions between the query and a subject...
int _PSIValidateCdMSA(const PSICdMsa *cd_msa, Uint4 alphabet_size)
Validation of multiple alignment of conserved domains structure.
void _PSIUpdatePositionCounts(_PSIMsa *msa)
Counts the number of sequences matching the query per query position (columns of the multiple alignme...
const double kPSIIdentical
Percent identity threshold for discarding identical matches.
_PSIPackedMsa * _PSIPackedMsaNew(const PSIMsa *msa)
Allocates and initializes the compact version of the PSIMsa structure (makes a deep copy) for interna...
static void s_adjustColumnProbabilities(double *initialProbabilities, double *probabilitiesToReturn, double standardWeight, const double *standardProbabilities, double observations)
adjust the probabilities by assigning observations weight to initialProbabilities and standardWeight ...
const double kPositScalingPercent
Constant used in scaling PSSM routines: Successor to POSIT_PERCENT.
int _IMPALAScaleMatrix(const Uint1 *query, const double *std_probs, _PSIInternalPssmData *internal_pssm, BlastScoreBlk *sbp, double scaling_factor)
Provides a similar function to _PSIScaleMatrix but it performs the scaling as IMPALA did,...
static NCBI_INLINE void _PSIResetAlignmentTraits(_PSIAlignmentTraits *traits, Uint4 position)
Resets the traits structure to restart finite state machine.
_PSIPackedMsa * _PSIPackedMsaFree(_PSIPackedMsa *msa)
Deallocates the _PSIMsa data structure.
int _PSIValidateMSA(const _PSIMsa *msa, Boolean ignore_unaligned_positions)
Main validation function for multiple sequence alignment structure.
Private interface for Position Iterated BLAST API, contains the PSSM generation engine.
#define PSIERR_BADPARAM
Bad parameter used in function.
#define PSIERR_ENDINGGAP
Found flanking gap at end of alignment.
#define PSIERR_UNKNOWN
Unknown error.
#define PSIERR_COLUMNOFGAPS
Found an entire column full of GAP residues.
#define PSIERR_OUTOFMEM
Out of memory.
#define PSIERR_BADPROFILE
Errors in conserved domain profile.
#define PSIERR_POSITIVEAVGSCORE
Positive average score found when scaling matrix.
#define PSIERR_NOALIGNEDSEQS
After purge stage of PSSM creation, no sequences are left.
#define PSIERR_STARTINGGAP
Found flanking gap at start of alignment.
#define PSIERR_BADSEQWEIGHTS
Sequence weights do not add to 1.
#define PSI_SUCCESS
Successful operation.
#define PSIERR_UNALIGNEDCOLUMN
Found an entire column with no participating sequences.
#define PSIERR_GAPINQUERY
GAP residue found in query sequence.
#define BLAST_SCORE_MIN
minimum allowed score (for one letter comparison).
Definition: blast_stat.h:121
Int2 Blast_GetStdAlphabet(Uint1 alphabet_code, Uint1 *residues, Uint4 residue_size)
Fills a buffer with the 'standard' alphabet (given by STD_AMINO_ACID_FREQS[index]....
Definition: blast_stat.c:1863
Int2 Blast_KarlinBlkUngappedCalc(Blast_KarlinBlk *kbp, Blast_ScoreFreq *sfp)
Computes the parameters lambda, H K for use in calculating the statistical significance of high-scori...
Definition: blast_stat.c:2699
Blast_ScoreFreq * Blast_ScoreFreqFree(Blast_ScoreFreq *sfp)
Deallocates the score frequencies structure.
Definition: blast_stat.c:941
#define BLAST_SCORE_MAX
maximum allowed score (for one letter comparison).
Definition: blast_stat.h:122
Blast_ScoreFreq * Blast_ScoreFreqNew(Int4 score_min, Int4 score_max)
Creates a new structure to keep track of score frequencies for a scoring system.
Definition: blast_stat.c:2113
Various auxiliary BLAST utility functions.
#define IS_residue(x)
Does character encode a residue?
Definition: blast_util.h:48
double * BLAST_GetStandardAaProbabilities(void)
Get the standard amino acid probabilities.
Definition: blast_util.c:1323
Constants used in compositional score matrix adjustment.
static const char fp[]
Definition: des.c:87
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
static char tmp[3200]
Definition: utf8.c:42
#define BLASTAA_SIZE
Size of aminoacid alphabet.
#define BLASTAA_SEQ_CODE
== Seq_code_ncbistdaa
const Uint1 AMINOACID_TO_NCBISTDAA[]
Translates between ncbieaa and ncbistdaa.
const char NCBISTDAA_TO_AMINOACID[]
Translates between ncbieaa and ncbistdaa.
#define NULL
Definition: ncbistd.hpp:225
#define Boolean
Definition: ncbistd.hpp:136
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
static const char * GetResidue(TokenStatBlkPtr stoken)
Definition: indx_blk.cpp:235
static int input()
int i
SFreqRatios * _PSIMatrixFrequencyRatiosFree(SFreqRatios *freq_ratios)
Deallocate the frequency ratios structure.
SFreqRatios * _PSIMatrixFrequencyRatiosNew(const char *matrix_name)
Retrive the matrix's frequency ratios.
Definitions used to get joint probabilities for a scoring matrix.
const double * Blast_GetMatrixBackgroundFreq(const char *matrix_name)
Return true if frequency data is available for the given matrix name.
static MDB_envinfo info
Definition: mdb_load.c:37
bool is_aligned(T *p) noexcept
Check pointer alignment.
Definition: bmutil.h:637
#define fabs(v)
Definition: ncbi_dispd.c:46
Prototypes for portable math library (ported from C Toolkit)
#define NCBIMATH_LN2
Natural log(2)
Definition: ncbi_math.h:161
long BLAST_Nint(double x)
Nearest integer.
Definition: ncbi_math.c:437
#define MIN(a, b)
returns smaller of a and b.
Definition: ncbi_std.h:112
#define NCBI_INLINE
"inline" seems to work on our remaining in-house compilers (WorkShop, Compaq, ICC,...
Definition: ncbi_std.h:81
Uint1 Boolean
bool replacment for C
Definition: ncbi_std.h:94
#define TRUE
bool replacment for C indicating true.
Definition: ncbi_std.h:97
#define FALSE
bool replacment for C indicating false.
Definition: ncbi_std.h:101
#define ASSERT
macro for assert.
Definition: ncbi_std.h:107
#define MAX(a, b)
returns larger of a and b.
Definition: ncbi_std.h:117
void abort()
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
double lambda(size_t dimMatrix_, const Int4 *const *scoreMatrix_, const double *q_)
static const char * kScore
Definition: showdefline.cpp:77
#define assert(x)
Definition: srv_diag.hpp:58
Structure used for scoring calculations.
Definition: blast_stat.h:177
Blast_KarlinBlk ** kbp_psi
K-A parameters for position-based alignments.
Definition: blast_stat.h:213
char * name
name of scoring matrix.
Definition: blast_stat.h:183
Int2 alphabet_size
size of alphabet.
Definition: blast_stat.h:181
Uint1 alphabet_code
NCBI alphabet code.
Definition: blast_stat.h:180
SBlastScoreMatrix * matrix
scoring matrix data
Definition: blast_stat.h:185
Blast_KarlinBlk * kbp_ideal
Ideal values (for query with average database composition).
Definition: blast_stat.h:216
Blast_KarlinBlk ** kbp_gap_std
K-A parameters for std (not position-based) alignments.
Definition: blast_stat.h:214
Blast_KarlinBlk ** kbp_gap_psi
K-A parameters for psi alignments.
Definition: blast_stat.h:215
double K
K value used in statistics.
Definition: blast_stat.h:68
double Lambda
Lambda value used in statistics.
Definition: blast_stat.h:67
double logK
natural log of K value used in statistics
Definition: blast_stat.h:69
Holds score frequencies used in calculation of Karlin-Altschul parameters for an ungapped search.
Definition: blast_stat.h:128
double score_avg
average score, must be negative for local alignment.
Definition: blast_stat.h:133
Int4 obs_min
lowest observed (actual) scores
Definition: blast_stat.h:131
double * sprob
arrays for frequency of given score, shifted down by score_min.
Definition: blast_stat.h:135
Int4 obs_max
highest observed (actual) scores
Definition: blast_stat.h:132
Structure used to pass data into the scaling routines.
Definition: blast_posit.h:73
Structure used to pass data into the scaling routines.
Definition: blast_posit.h:56
Options used in protein BLAST only (PSI, PHI, RPS and translated BLAST) Some of these possibly should...
double iobsr
Effective number of independent observations in a CD column.
Definition: blast_psi.h:118
double * wfreqs
Frequencies for each residue in CD column.
Definition: blast_psi.h:115
Uint1 is_aligned
Does this cell represent column aligned to a CD.
Definition: blast_psi.h:125
PSICdMsaCellData * data
Data needed for PSSM computation.
Definition: blast_psi.h:128
Data structure representing multiple alignemnt of CDs and query sequence along with data needed for P...
Definition: blast_psi.h:134
PSIMsaDimensions * dimensions
Query length and number of aligned cds.
Definition: blast_psi.h:136
unsigned char * query
Query sequence as Ncbistdaa.
Definition: blast_psi.h:135
PSICdMsaCell ** msa
Multiple alignment of CDs.
Definition: blast_psi.h:138
This structure contains the diagnostics information requested using the PSIDiagnosticsRequest structu...
Definition: blast_psi.h:201
double * information_content
position information content (query_length elements)
Definition: blast_psi.h:202
Uint4 ** residue_freqs
observed residue frequencies per position of the PSSM (Dimensions are query_length by alphabet_size)
Definition: blast_psi.h:204
double ** weighted_residue_freqs
Weighted observed residue frequencies per position of the PSSM.
Definition: blast_psi.h:208
Uint4 * interval_sizes
interval sizes of aligned regions (query_length elements)
Definition: blast_psi.h:218
Uint4 alphabet_size
Specifies length of alphabet.
Definition: blast_psi.h:225
Uint4 query_length
Specifies the number of positions in the PSSM.
Definition: blast_psi.h:223
double * gapless_column_weights
Weights for columns without gaps (query_length elements)
Definition: blast_psi.h:215
double * independent_observations
Effective number of observations per column.
Definition: blast_psi.h:227
Uint4 * num_matching_seqs
number of matching sequences per query position (query_length elements)
Definition: blast_psi.h:220
double * sigma
sigma (query_length elements)
Definition: blast_psi.h:217
double ** frequency_ratios
PSSM's frequency ratios (Dimensions are query_length by alphabet_size)
Definition: blast_psi.h:212
Boolean is_aligned
Is this letter part of the alignment?
Definition: blast_psi.h:52
Uint1 letter
Preferred letter at this position, in ncbistdaa encoding.
Definition: blast_psi.h:50
Structure representing the dimensions of the multiple sequence alignment data structure.
Definition: blast_psi.h:57
Uint4 num_seqs
Number of distinct sequences aligned with the query (does not include the query)
Definition: blast_psi.h:59
Uint4 query_length
Length of the query.
Definition: blast_psi.h:58
Multiple sequence alignment (msa) data structure containing the raw data needed by the PSSM engine to...
Definition: blast_psi.h:75
PSIMsaCell ** data
actual data, dimensions are (dimensions->num_seqs+1) by (dimensions->query_length)
Definition: blast_psi.h:77
PSIMsaDimensions * dimensions
dimensions of the msa
Definition: blast_psi.h:76
int ** data
actual scoring matrix data, stored in row-major form
Definition: blast_stat.h:140
Data structure to maintain a dynamically allocated array of Uint4.
Uint4 * data
array of Uint4
Uint4 num_used
number of elements used in this array
Uint4 num_allocated
size of array below
Stores the frequency ratios along with their bit scale factor.
double ** data
The actual frequency ratios.
int bit_scale_factor
Used to multiply the values in the above matrix to obtain scores in bit units.
A structure containing two integers, used e.g.
Definition: blast_def.h:155
Int4 left
left endpoint of range (zero based)
Definition: blast_def.h:156
Int4 right
right endpoint of range (zero based)
Definition: blast_def.h:157
This structure keeps track of the regions aligned between the query sequence and those that were not ...
SSeqRange * pos_extnt
Dynamically allocated array of size query_length to keep track of the extents of each aligned positio...
Uint4 * size
Dynamically allocated array of size query_length that contains the size of the intervals in the array...
Auxiliary structure to maintain information about two aligned regions between the query and a subject...
Uint4 n_x_residues
number of X residues in alignment
Uint4 n_identical
number of identical residues in alignment
Uint4 start
starting offset of alignment w.r.t.
Uint4 effective_length
length of alignment not including Xs
Internal representation of a PSSM in various stages of its creation and its dimensions.
int ** scaled_pssm
scaled PSSM (scores)
Uint4 nrows
number of rows (alphabet_size)
double * pseudocounts
pseudocount constant for each column
int ** pssm
PSSM (scores)
Uint4 ncols
number of columns (query_length)
double ** freq_ratios
frequency ratios
Internal data structure to represent a position in the multiple sequence alignment data structure.
unsigned int letter
Preferred letter at this position.
SSeqRange extents
Extents of this aligned position.
unsigned int is_aligned
Is this letter part of the alignment?
Internal multiple alignment data structure used by the PSSM engine.
Uint4 * num_matching_seqs
number of sequences aligned at a given position in the multiple sequence alignment (length: query_len...
Uint1 * query
query sequence (length: query_length)
Uint4 ** residue_counts
matrix to keep track of the raw residue counts at each position of the multiple sequence alignment (d...
PSIMsaDimensions * dimensions
dimensions of field below
Uint4 alphabet_size
number of elements in alphabet
_PSIMsaCell ** cell
multiple sequence alignment matrix (dimensions: query_length x num_seqs + 1)
Compact version of the PSIMsaCell structure.
unsigned int letter
Preferred letter at this position, in ncbistdaa encoding.
unsigned int is_aligned
Is this letter part of the alignment?
Compact version of PSIMsa structure.
PSIMsaDimensions * dimensions
dimensions of the msa
Boolean * use_sequence
used to indicate whether a sequence should be used for further processing by the engine (length: num_...
_PSIPackedMsaCell ** data
actual data, dimensions are (dimensions->num_seqs+1) by (dimensions->query_length)
Internal data structure to keep computed sequence weights.
double ** match_weights
weighted observed residue frequencies (f_i in 2001 paper).
Uint4 posDistinctDistrib_size
Kept to deallocate field above.
double * std_prob
standard amino acid probabilities
double * norm_seq_weights
Stores the normalized sequence weights (length: num_seqs + 1)
int ** posDistinctDistrib
For position i, how many positions in its block have j distinct letters.
double * independent_observations
Number of independent sequences per column.
int * posNumParticipating
number of sequences at each position.
double * sigma
array of length query_length
double * gapless_column_weights
FIXME.
Uint4 match_weights_size
kept for help deallocate the field above
double * row_sigma
array of length num_seqs + 1
static string query
static Uint4 letter(char c)
#define const
Definition: zconf.h:232
voidp malloc(uInt size)
voidp calloc(uInt items, uInt size)
Modified on Fri Sep 20 14:58:33 2024 by modify_doxy.py rev. 669887