67 unsigned int data_type_sz)
72 retval = (
void**)
malloc(
sizeof(
void*) * ncols);
77 for (
i = 0;
i < ncols;
i++) {
78 retval[
i] = (
void*)
calloc(nrows, data_type_sz);
95 for (
i = 0;
i < ncols;
i++) {
106 #define DEFINE_COPY_MATRIX_FUNCTION(type) \
107 void _PSICopyMatrix_##type(type** dest, type** src, \
108 unsigned int ncols, unsigned int nrows) \
110 unsigned int i = 0; \
111 unsigned int j = 0; \
116 for (i = 0; i < ncols; i++) { \
117 for (j = 0; j < nrows; j++) { \
118 dest[i][j] = src[i][j]; \
135 if ( !msa || !msa->dimensions || !msa->data ) {
149 (
void*) msa->dimensions,
154 msa->dimensions->query_length,
156 if ( !retval->
data ) {
160 for (s = 0; s < msa->dimensions->num_seqs + 1; s++) {
161 for (p = 0; p < msa->dimensions->query_length; p++) {
163 retval->
data[s][p].
letter = msa->data[s][p].letter;
175 for (s = 0; s < msa->dimensions->num_seqs + 1; s++) {
211 unsigned int retval = 0;
229 #ifdef DEBUG_PSSM_ENGINE
236 PrintMsa(
const char* filename,
const PSIMsa* msa)
241 fp = fopen(filename,
"w");
247 PrintMsaFP(FILE*
fp,
const PSIMsa* msa)
254 fprintf(
fp,
"%3d\tGI=%10d\tEvalue=%2.0le\tBitScore=%4.1f\t",
i,
256 msa->seqinfo[
i].evalue,
257 msa->seqinfo[
i].bit_score);
278 fprintf(
fp,
"%3d\t",
i);
280 fprintf(
fp,
"NOT USED\n");
296 __printPackedMsa(
const char* filename,
const _PSIPackedMsa* msa)
301 fp = fopen(filename,
"w");
302 __printPackedMsaFP(
fp, msa);
335 if ( !retval->
cell ) {
360 if ( !retval->
query ) {
434 retval->
ncols = query_length;
435 retval->
nrows = alphabet_size;
440 if ( !retval->
pssm ) {
474 if (pssm_data->
pssm) {
475 pssm_data->
pssm = (
int**)
512 if ( !retval->
size ) {
524 for (
i = 0;
i < query_length;
i++) {
534 if ( !aligned_blocks ) {
538 if (aligned_blocks->
size) {
546 sfree(aligned_blocks);
550 #define EFFECTIVE_ALPHABET 20
578 if ( !retval->
sigma ) {
629 if ( !seq_weights ) {
641 if (seq_weights->
sigma) {
691 msa->
query[p] == kGapResidue) {
769 found_aligned_sequence =
TRUE;
771 found_non_gap_residue =
TRUE;
776 if ( !found_aligned_sequence ) {
779 if ( !found_non_gap_residue ) {
841 if ( !ignore_unaligned_positions ) {
875 if (cd_msa->
query[p] == kGapResidue) {
895 for (k = 0; k < alphabet_size; k++) {
943 double max_percent_identity);
993 Uint4 kQueryLength = 0;
994 Uint4 kNumberOfSeqs = 0;
1007 for (p = 0; p < kQueryLength; p++) {
1012 for (s = 0; s < kNumberOfSeqs; s++) {
1015 for (p = 0; p < kQueryLength; p++, pos++) {
1052 fprintf(stderr,
"Position: %d - State: %s\n", position,
1054 fprintf(stderr,
"\tstart: %d\n", traits->
start);
1056 fprintf(stderr,
"\tn_x_residues: %d\n", traits->
n_x_residues);
1057 fprintf(stderr,
"\tn_identical: %d\n", traits->
n_identical);
1071 traits->
start = position;
1079 double max_percent_identity)
1089 const double percent_identity =
1091 if (percent_identity >= max_percent_identity) {
1092 const unsigned int align_stop =
1096 traits->
start, align_stop);
1189 double max_percent_identity)
1202 if ( seq_index1 == seq_index2 ||
1209 seq1 = msa->
data[seq_index1];
1210 seq2 = msa->
data[seq_index2];
1214 for (p = 0; p < kQueryLength; p++, seq1++, seq2++) {
1229 if (!kPos1Aligned && !kPos2Aligned) {
1231 max_percent_identity);
1237 seq1->
letter != kXResidue && seq2->
letter != kXResidue;
1246 if (neither_is_X && (kPos2Aligned && seq1->
is_aligned) &&
1253 max_percent_identity);
1302 if ( !msa || !aligned_blocks ) {
1327 ASSERT(seq_index < msa->dimensions->num_seqs + 1);
1329 sequence_position = msa->
cell[seq_index];
1332 sequence_position[
prev].
letter != kGapResidue) {
1361 ASSERT(seq_index < msa->dimensions->num_seqs + 1);
1363 sequence_position = msa->
cell[seq_index];
1367 sequence_position[
last].
letter != kGapResidue) {
1371 for (curr =
last - 1; curr >= 0; curr--,
last--) {
1391 #ifdef PSI_IGNORE_GAPS_IN_COLUMNS
1399 ASSERT(seq_index < msa->dimensions->num_seqs + 1);
1401 sequence_position = msa->
cell[seq_index];
1404 #ifdef PSI_IGNORE_GAPS_IN_COLUMNS
1406 sequence_position[
i].
letter != kGapResidue) {
1425 Uint4 kQueryLength = 0;
1432 for (
i = 0;
i < kQueryLength;
i++) {
1444 for (
i = 0;
i < kQueryLength;
i++) {
1446 if (msa->
query[
i] == kXResidue) {
1449 for (idx = 0; idx <
i; idx++) {
1451 msa->
query[idx] != kXResidue) {
1452 aligned_blocks->
size[idx]--;
1457 msa->
query[idx] != kXResidue) {
1458 aligned_blocks->
size[idx]--;
1529 Boolean nsg_compatibility_mode);
1544 Boolean nsg_compatibility_mode);
1554 Boolean nsg_compatibility_mode,
1563 Uint4 kQueryLength = 0;
1566 const Uint4 kExpectedNumMatchingSeqs = nsg_compatibility_mode ? 0 : 1;
1567 Uint4 last_calc_pos = 0;
1569 if ( !msa || !aligned_blocks || !seq_weights ) {
1575 if ( !aligned_seqs || !prev_pos_aligned_seqs ) {
1580 for (pos = 0; pos < kQueryLength; pos++) {
1583 if (aligned_blocks->
size[pos] == 0 ||
1591 if (aligned_seqs->
num_used <= kExpectedNumMatchingSeqs) {
1594 last_calc_pos = pos;
1596 if (last_calc_pos != pos - 1 ||
1601 memset((
void*)seq_weights->
row_sigma, 0,
1605 aligned_seqs, seq_weights);
1608 seq_weights->
sigma[pos] = seq_weights->
sigma[pos-1];
1627 nsg_compatibility_mode);
1632 #ifndef PSI_IGNORE_GAPS_IN_COLUMNS
1635 nsg_compatibility_mode);
1656 Uint4 kQueryLength = 0;
1659 double* sum_weights =
NULL;
1663 if ( !cd_msa || !seq_weights || !sbp || !options) {
1674 if ( !sum_weights) {
1683 for (pos = 0; pos < kQueryLength; pos++) {
1684 double total_observations = 0.0;
1689 memset(sum_weights, 0, sbp->
alphabet_size *
sizeof(
double));
1703 total_observations +=
1709 for (residue = 0; residue < sbp->
alphabet_size; residue++) {
1710 sum_weights[residue] +=
1718 if (total_observations > 0.0 && query_residue != kXResidue
1719 && sum_weights[query_residue] == 0.0) {
1721 sum_weights[query_residue] = 1.0;
1722 total_observations += 1.0;
1726 if (total_observations > 0.0) {
1728 for (residue = 0; residue < sbp->
alphabet_size; residue++) {
1730 sum_weights[residue] / total_observations;
1741 total_observations);
1783 ASSERT(position < msa->dimensions->query_length);
1794 Uint4 num_distinct_residues_for_column = 0;
1795 Uint4 num_local_std_letters = 0;
1798 ASSERT(i < msa->dimensions->query_length);
1802 for (asi = 0; asi < aligned_seqs->
num_used; asi++) {
1803 const Uint4 kSeqIdx = aligned_seqs->
data[asi];
1806 if (residue_counts_for_column[kResidue] == 0) {
1807 num_distinct_residues_for_column++;
1808 if (kResidue != kGapResidue && kResidue != kXResidue)
1809 num_local_std_letters++;
1811 residue_counts_for_column[kResidue]++;
1814 sigma += num_distinct_residues_for_column;
1817 if (num_distinct_residues_for_column > 1) {
1820 distinct_residues_found =
TRUE;
1825 for (asi = 0; asi < aligned_seqs->
num_used; asi++) {
1826 const Uint4 seq_idx = aligned_seqs->
data[asi];
1834 (residue_counts_for_column[residue] *
1835 num_distinct_residues_for_column) );
1840 seq_weights->
sigma[position] = sigma;
1842 if (distinct_residues_found) {
1843 double weight_sum = 0.0;
1845 for (asi = 0; asi < aligned_seqs->
num_used; asi++) {
1846 const Uint4 seq_idx = aligned_seqs->
data[asi];
1855 for (asi = 0; asi < aligned_seqs->
num_used; asi++) {
1856 const Uint4 seq_idx = aligned_seqs->
data[asi];
1864 for (asi = 0; asi < aligned_seqs->
num_used; asi++) {
1865 const Uint4 seq_idx = aligned_seqs->
data[asi];
1867 (1.0/(double) aligned_seqs->
num_used);
1888 for (asi = 0; asi < aligned_seqs->
num_used; asi++) {
1889 const Uint4 seq_idx = aligned_seqs->
data[asi];
1896 if (residue != kGapResidue) {
1908 #ifdef PSI_IGNORE_GAPS_IN_COLUMNS
1914 ASSERT(position < msa->dimensions->query_length);
1920 #ifdef PSI_IGNORE_GAPS_IN_COLUMNS
1934 Boolean nsg_compatibility_mode)
1940 const Uint4 kExpectedNumMatchingSeqs = nsg_compatibility_mode ? 0 : 1;
1972 #define SEQUENCE_WEIGHTS_CHECK__ABORT_ON_FAILURE 0
1979 Boolean nsg_compatibility_mode)
1983 const Uint4 kExpectedNumMatchingSeqs = nsg_compatibility_mode ? 0 : 1;
1985 #if SEQUENCE_WEIGHTS_CHECK__ABORT_ON_FAILURE
1994 double running_total = 0.0;
2005 for (residue = 0; residue < msa->
alphabet_size; residue++) {
2009 if (running_total < 0.99 || running_total > 1.01) {
2012 #if SEQUENCE_WEIGHTS_CHECK__ABORT_ON_FAILURE
2013 check_performed =
TRUE;
2017 #if SEQUENCE_WEIGHTS_CHECK__ABORT_ON_FAILURE
2020 if ( !check_performed &&
2021 !nsg_compatibility_mode ) {
2022 assert(!
"Did not perform sequence weights check");
2050 int columnNumber,
int queryLength,
2051 const double *expno);
2062 const double *backgroundProbabilities,
2063 const double observations);
2065 #define MAX_IND_OBSERVATIONS 400
2066 #define PSEUDO_MAX 1000000
2076 Boolean nsg_compatibility_mode,
2084 const double kZeroObsPseudo = 30.0;
2089 if ( !msa || !seq_weights || !sbp || !aligned_blocks || !internal_pssm ) {
2099 double columnCounts = 0.0;
2100 double observations = 0.0;
2101 double pseudoWeight;
2110 if (0 == pseudo_count)
2113 columnCounts = pseudo_count;
2116 pseudoWeight = kZeroObsPseudo;
2120 pseudoWeight = columnCounts;
2135 double pseudo = 0.0;
2137 const double kBeta = pseudoWeight;
2138 double numerator = 0.0;
2139 double denominator = 0.0;
2140 double qOverPEstimate = 0.0;
2150 freq_ratios->
data[
r][
i]);
2160 denominator = observations + kBeta;
2162 if (nsg_compatibility_mode && denominator == 0.0) {
2165 ASSERT(denominator != 0.0);
2167 qOverPEstimate = numerator/denominator;
2195 const double kZeroObsPseudo = 30.0;
2198 const double* backgroundProbabilities =
NULL;
2200 if ( !cd_msa || !seq_weights || !sbp || !internal_pssm || pseudo_count < 0) {
2205 if ( !freq_ratios ) {
2210 if ( !backgroundProbabilities ) {
2216 double columnCounts = 0.0;
2217 double observations = 0.0;
2218 double pseudoWeight;
2220 if (cd_msa->
query[p] != kXResidue)
2224 observations =
MAX(0.0,
2227 if (0 == pseudo_count)
2230 columnCounts = pseudo_count;
2234 pseudoWeight = kZeroObsPseudo;
2238 pseudoWeight = columnCounts;
2245 if (cd_msa->
query[p] == kXResidue ||
2253 double pseudo = 0.0;
2255 const double kBeta = pseudoWeight;
2256 double numerator = 0.0;
2257 double denominator = 0.0;
2258 double qOverPEstimate = 0.0;
2265 freq_ratios->
data[
r][
i]);
2275 denominator = observations + kBeta;
2277 ASSERT(denominator != 0.0);
2279 qOverPEstimate = numerator/denominator;
2301 const double* std_prob,
2307 double* retval =
NULL;
2311 if ( !std_prob || !score_mat ) {
2315 retval = (
double*)
calloc(query_length,
sizeof(
double));
2320 for (p = 0; p < query_length; p++) {
2322 double info_sum = 0.0;
2324 for (
r = 0;
r < alphabet_sz;
r++) {
2328 double exponent = exp(score *
lambda);
2329 double tmp = std_prob[
r] * exponent;
2334 retval[p] = info_sum;
2342 double** freq_ratios,
2343 const double* std_prob,
2347 double* retval =
NULL;
2351 if ( !std_prob || !freq_ratios ) {
2355 retval = (
double*)
calloc(query_length,
sizeof(
double));
2360 for (p = 0; p < query_length; p++) {
2362 double info_sum = 0.0;
2364 for (
r = 0;
r < alphabet_sz;
r++) {
2369 double qOverPEstimate = freq_ratios[p][
r] / std_prob[
r];
2377 retval[p] = info_sum;
2395 const double* std_probs)
2404 double ideal_lambda = 0.0;
2406 if ( !internal_pssm || !sbp || !std_probs )
2413 for (
i = 0;
i < internal_pssm->
ncols;
i++) {
2421 double qOverPEstimate = 0.0;
2430 if (is_unaligned_column && qOverPEstimate != 0.0) {
2431 is_unaligned_column =
FALSE;
2435 if (qOverPEstimate == 0.0 || std_probs[j] <
kEpsilon) {
2438 double tmp =
log(qOverPEstimate)/ideal_lambda;
2443 if ( (j == kXResidue || j == kStarResidue) &&
2450 if (is_unaligned_column) {
2455 if (freq_ratios->
data[kResidue][j] != 0.0) {
2482 const double* std_probs,
2488 int** scaled_pssm =
NULL;
2491 double factor_low = 1.0;
2492 double factor_high = 1.0;
2493 double ideal_lambda = 0.0;
2495 double new_lambda = 0.0;
2498 Uint4 query_length = 0;
2501 if ( !internal_pssm || !sbp || !
query || !std_probs )
2508 pssm = internal_pssm->
pssm;
2510 query_length = internal_pssm->
ncols;
2517 for (
i = 0;
i < internal_pssm->
ncols;
i++) {
2518 for (j = 0; j < internal_pssm->
nrows; j++) {
2532 if (new_lambda > ideal_lambda) {
2535 factor = factor_high;
2540 if (too_high ==
FALSE) {
2543 factor_high += (factor_high - 1.0);
2544 factor = factor_high;
2546 }
else if (new_lambda > 0) {
2550 factor = factor_low;
2554 if (too_high ==
TRUE) {
2557 factor_low += (factor_low - 1.0);
2558 factor = factor_low;
2570 factor = (factor_high + factor_low)/2;
2572 for (
i = 0;
i < internal_pssm->
ncols;
i++) {
2573 for (j = 0; j < internal_pssm->
nrows; j++) {
2588 if (new_lambda > ideal_lambda) {
2589 factor_low = factor;
2591 factor_high = factor;
2602 double scaling_factor)
2616 scaling_factor,
TRUE, sbp);
2637 for(
i = 0;
i < length;
i++) {
2638 if (seq[
i] != kXResidue) {
2650 const double* std_probs,
2655 Uint4 alphabet_size = 0;
2657 Uint4 effective_length = 0;
2673 if (alphabet_size <= 0) {
2681 for (p = 0; p < query_length; p++) {
2682 if (
query[p] == kXResidue) {
2685 for (
r = 0;
r < alphabet_size;
r++) {
2686 const int kScore = pssm[p][aa_alphabet[
r]];
2699 if ( !score_freqs ) {
2703 score_freqs->
obs_min = min_score;
2704 score_freqs->
obs_max = max_score;
2705 for (p = 0; p < query_length; p++) {
2706 if (
query[p] == kXResidue) {
2710 for (
r = 0;
r < alphabet_size;
r++) {
2711 const int kScore = pssm[p][aa_alphabet[
r]];
2719 (std_probs[aa_alphabet[
r]]/effective_length);
2724 for (s = min_score; s <= max_score; s++) {
2735 const double* std_probs,
2770 contains_aligned_regions =
TRUE;
2775 if ( !contains_aligned_regions ) {
2782 unsigned int seq_index,
2796 sequence_position = msa->
data[seq_index];
2797 for (
i = start;
i < stop;
i++) {
2798 sequence_position[
i].
letter = 0;
2819 if ( !diagnostics || !msa || !aligned_block || !seq_weights ||
2876 (seq_weights->
sigma[p] / aligned_block->
size[p] - 1);
2884 if (diagnostics->
sigma) {
2886 diagnostics->
sigma[p] = seq_weights->
sigma[p];
2920 if ( !diagnostics || !cd_msa || !seq_weights ||
3001 probabilities[c] = posSearch->
match_weights[columnNumber][charOrder[c]];
3015 double *probabilitiesToReturn,
3016 double standardWeight,
3017 const double *standardProbabilities,
3018 double observations)
3026 intermediateSums[c] =
3027 (initialProbabilities[c] * observations) +
3028 (standardProbabilities[c] * standardWeight);
3029 overallSum += intermediateSums[c];
3032 probabilitiesToReturn[c] = intermediateSums[c]/overallSum;
3046 const double *backgroundProbabilities)
3054 returnValue += (newDistribution[c] *
3055 log (newDistribution[c]/backgroundProbabilities[c]));
3059 return(returnValue);
3069 const double *backgroundProbabilities)
3073 double weighted_sum;
3080 weighted_sum += exp(j*
log(1.0-backgroundProbabilities[k]));
3088 const double *backgroundProbabilities,
3089 const double observations)
3093 double relativeEntropy;
3095 double pseudoDenominator;
3098 const double kPseudoMult = 500.0;
3099 const double kPseudoNumerator = 0.0457;
3100 const double kPseudoExponent = 0.8;
3101 const double kPseudoSmallInitial = 5.5;
3106 &(columnProbabilitiesAdjusted[0]),
3107 kPseudoSmallInitial,
3108 backgroundProbabilities, observations);
3110 backgroundProbabilities);
3111 pseudoDenominator = pow(relativeEntropy, kPseudoExponent);
3112 alpha = kPseudoNumerator/pseudoDenominator;
3114 returnValue = kPseudoMult * alpha/ (1- alpha);
3118 return(returnValue);
3123 int columnNumber,
int queryLength,
3124 const double *expno)
3130 int totalDistinctCounts;
3132 double aveDistinctAA;
3133 int columnsAccountedFor;
3149 columnsAccountedFor = 0;
3150 totalDistinctCounts = 0;
3151 while (columnsAccountedFor < halfNumColumns) {
3155 if (columnsAccountedFor > halfNumColumns) {
3156 totalDistinctCounts -=
3157 ((columnsAccountedFor - halfNumColumns) * k);
3158 columnsAccountedFor = halfNumColumns;
3162 aveDistinctAA = ((double) totalDistinctCounts)/
3163 ((double) columnsAccountedFor);
3172 i-(expno[
i]-aveDistinctAA)/(expno[
i]-expno[
i-1]);
3174 indep =
MAX(0,indep - 1);
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
Int2 DynamicUint4Array_Append(SDynamicUint4Array *arr, Uint4 element)
Append a Uint4 to the dynamic array structure.
SDynamicUint4Array * DynamicUint4Array_Dup(const SDynamicUint4Array *src)
Make a deep copy of the src dynamic array.
Int4 DynamicUint4Array_Copy(SDynamicUint4Array *dest, const SDynamicUint4Array *src)
Make a shallow copy of the src dynamic array into the dest dynamic array.
SDynamicUint4Array * DynamicUint4ArrayNewEx(Uint4 init_num_elements)
Allocate a dynamic array of Uint4 with an initial size of specified as an argument to the function.
Boolean DynamicUint4Array_AreEqual(const SDynamicUint4Array *a, const SDynamicUint4Array *b)
Compares dynamic arrays a and b for equality of its contents.
SDynamicUint4Array * DynamicUint4ArrayFree(SDynamicUint4Array *arr)
Deallocates a dynamic array structure.
Declarations for various dynamic array types.
int Kappa_impalaScaling(Kappa_posSearchItems *posSearch, Kappa_compactSearchItems *compactSearch, double scalingFactor, Boolean doBinarySearch, BlastScoreBlk *sbp)
Copied from posit2.c.
Kappa_compactSearchItems * Kappa_compactSearchItemsNew(const Uint1 *query, unsigned int queryLength, BlastScoreBlk *sbp)
Creates a new Kappa_compactSearchItems structure.
Kappa_posSearchItems * Kappa_posSearchItemsFree(Kappa_posSearchItems *posSearch)
Deallocates the Kappa_posSearchItems structure.
Kappa_compactSearchItems * Kappa_compactSearchItemsFree(Kappa_compactSearchItems *compactSearch)
Deallocates the Kappa_compactSearchItems structure.
Kappa_posSearchItems * Kappa_posSearchItemsNew(unsigned int queryLength, const char *matrix_name, int **posPrivateMatrix, double **posFreqs)
Allocates a new Kappa_posSearchItems structure.
Port of posit.h structures and impalaScaling for implementing composition based statistics for PSI-BL...
static int s_PSIValidateAlignedColumns(const _PSIMsa *msa)
Validate that there are no unaligned columns or columns which only contain gaps in the multiple seque...
static NCBI_INLINE void _handleEitherAlignedNeitherX(_PSIAlignmentTraits *traits, _EPSIPurgeFsmState *state, Uint4 position)
Handle the event when either position is aligned and neither is X.
static NCBI_INLINE void _handleEitherAlignedEitherX(_PSIAlignmentTraits *traits, _EPSIPurgeFsmState *state)
Handle the event when either position is aligned and either is X.
static NCBI_INLINE void _handleNeitherAligned(_PSIAlignmentTraits *traits, _EPSIPurgeFsmState *state, _PSIPackedMsa *msa, Uint4 seq_index, double max_percent_identity)
Handles neither is aligned event FIXME: document better.
int _PSIComputeAlignmentBlocks(const _PSIMsa *msa, _PSIAlignedBlock *aligned_blocks)
Main function to compute aligned blocks' properties for each position within multiple alignment (stag...
static void _PSIGetLeftExtents(const _PSIMsa *msa, Uint4 seq_index)
Computes the left extents for the sequence identified by seq_index.
int _PSIConvertFreqRatiosToPSSM(_PSIInternalPssmData *internal_pssm, const Uint1 *query, const BlastScoreBlk *sbp, const double *std_probs)
Converts the PSSM's frequency ratios obtained in the previous stage to a PSSM of scores.
static void _PSICalculateMatchWeights(const _PSIMsa *msa, Uint4 position, const SDynamicUint4Array *aligned_seqs, _PSISequenceWeights *seq_weights)
Calculate the weighted observed sequence weights.
int _PSIComputeFreqRatios(const _PSIMsa *msa, const _PSISequenceWeights *seq_weights, const BlastScoreBlk *sbp, const _PSIAlignedBlock *aligned_blocks, Int4 pseudo_count, Boolean nsg_compatibility_mode, _PSIInternalPssmData *internal_pssm)
Main function to compute the PSSM's frequency ratios (stage 5).
void ** _PSIAllocateMatrix(unsigned int ncols, unsigned int nrows, unsigned int data_type_sz)
Generic 2 dimensional matrix allocator.
static int s_PSIValidateNoGapsInQuery(const _PSIMsa *msa)
Validate that there are no gaps in the query sequence.
void _PSIStructureGroupCustomization(_PSIMsa *msa)
Enable NCBI structure group customization to discard the query sequence, as this really isn't the res...
int _PSIComputeFreqRatiosFromCDs(const PSICdMsa *cd_msa, const _PSISequenceWeights *seq_weights, const BlastScoreBlk *sbp, Int4 pseudo_count, _PSIInternalPssmData *internal_pssm)
Main function to compute CD-based PSSM's frequency ratios.
_PSISequenceWeights * _PSISequenceWeightsNew(const PSIMsaDimensions *dimensions, const BlastScoreBlk *sbp)
Allocates and initializes the _PSISequenceWeights structure.
static void _PSISpreadGapWeights(const _PSIMsa *msa, _PSISequenceWeights *seq_weights, Boolean nsg_compatibility_mode)
Uses disperse method of spreading the gap weights.
_PSIInternalPssmData * _PSIInternalPssmDataNew(Uint4 query_length, Uint4 alphabet_size)
Allocates a new _PSIInternalPssmData structure.
static void _PSIGetAlignedSequencesForPosition(const _PSIMsa *msa, Uint4 position, SDynamicUint4Array *aligned_sequences)
Populates the array aligned_sequences with the indices of the sequences which are part of the multipl...
_PSIAlignedBlock * _PSIAlignedBlockNew(Uint4 query_length)
Allocates and initializes the _PSIAlignedBlock structure.
unsigned int _PSIPackedMsaGetNumberOfAlignedSeqs(const _PSIPackedMsa *msa)
Retrieve the number of aligned sequences in the compact multiple sequence alignment.
int _PSIComputeSequenceWeights(const _PSIMsa *msa, const _PSIAlignedBlock *aligned_blocks, Boolean nsg_compatibility_mode, _PSISequenceWeights *seq_weights)
Main function to calculate the sequence weights.
int _PSISaveDiagnostics(const _PSIMsa *msa, const _PSIAlignedBlock *aligned_block, const _PSISequenceWeights *seq_weights, const _PSIInternalPssmData *internal_pssm, PSIDiagnosticsResponse *diagnostics)
Collects diagnostic information from the process of creating the PSSM.
static int s_PSIValidateParticipatingSequences(const _PSIMsa *msa)
Verify that after purging biased sequences in multiple sequence alignment there are still sequences p...
static void s_fillColumnProbabilities(double *probabilities, const _PSISequenceWeights *posSearch, Int4 columnNumber)
Reorders in the same manner as returned by Blast_GetMatrixBackgroundFreq this function is a copy of p...
double * _PSICalculateInformationContentFromScoreMatrix(Int4 **score_mat, const double *std_prob, const Uint1 *query, Uint4 query_length, Uint4 alphabet_sz, double lambda)
Calculates the information content from the scoring matrix.
static int _PSICheckSequenceWeights(const _PSIMsa *msa, const _PSISequenceWeights *seq_weights, Boolean nsg_compatibility_mode)
Verifies that the sequence weights for each column of the PSSM add up to 1.0.
const double kPosEpsilon
minimum return value of s_computeRelativeEntropy
Blast_ScoreFreq * _PSIComputeScoreProbabilities(const int **pssm, const Uint1 *query, Uint4 query_length, const double *std_probs, const BlastScoreBlk *sbp)
Compute the probabilities for each score in the PSSM.
static double s_effectiveObservations(const _PSIAlignedBlock *align_blk, const _PSISequenceWeights *seq_weights, int columnNumber, int queryLength, const double *expno)
A method to estimate the effetive number of observations in the interval for the specified columnNumb...
static double s_columnSpecificPseudocounts(const _PSISequenceWeights *posSearch, int columnNumber, const double *backgroundProbabilities, const double observations)
copy of posit.c:columnSpecificPseudocounts
int _PSIPurgeBiasedSegments(_PSIPackedMsa *msa)
Main function for keeping only those selected sequences for PSSM construction (stage 2).
static void s_PSIDiscardIfUnused(_PSIPackedMsa *msa, unsigned int seq_index)
Check if we still need this sequence.
int _PSIComputeFrequenciesFromCDs(const PSICdMsa *cd_msa, BlastScoreBlk *sbp, const PSIBlastOptions *options, _PSISequenceWeights *seq_weights)
Main function to calculate CD weights and combine weighted residue counts from matched CDs.
static int s_PSIValidateNoFlankingGaps(const _PSIMsa *msa)
Validate that there are no flanking gaps in the multiple sequence alignment.
static void s_initializeExpNumObservations(double *expno, const double *backgroundProbabilities)
initialize the expected number of observations use background probabilities for this matrix Calculate...
_PSIMsa * _PSIMsaNew(const _PSIPackedMsa *msa, Uint4 alphabet_size)
Allocates and initializes the internal version of the PSIMsa structure (makes a deep copy) for intern...
_EPSIPurgeFsmState
Defines the states of the finite state machine used in s_PSIPurgeSimilarAlignments.
static void _PSIComputeAlignedRegionLengths(const _PSIMsa *msa, _PSIAlignedBlock *aligned_blocks)
Calculates the aligned blocks lengths in the multiple sequence alignment data structure.
const int kPSIScaleFactor
Successor to POSIT_SCALE_FACTOR.
const double kPSINearIdentical
Percent identity threshold for discarding near-identical matches.
const Uint4 kPositScalingNumIterations
Constant used in scaling PSSM routines: Successor to POSIT_NUM_ITERATIONS.
_PSISequenceWeights * _PSISequenceWeightsFree(_PSISequenceWeights *seq_weights)
Deallocates the _PSISequenceWeights structure.
void ** _PSIDeallocateMatrix(void **matrix, unsigned int ncols)
Generic 2 dimensional matrix deallocator.
void _PSICopyMatrix_int(int **dest, int **src, unsigned int ncols, unsigned int nrows)
Copies src matrix into dest matrix, both of which must be int matrices with dimensions ncols by nrows...
#define MAX_IND_OBSERVATIONS
max number of independent observation for pseudocount calculation
static void s_PSIComputeFrequenciesFromCDsCleanup(double *sum_weights)
static NCBI_INLINE void _handleBothAlignedSameResidueNoX(_PSIAlignmentTraits *traits, _EPSIPurgeFsmState *state)
Handle event when both positions are aligned, using the same residue, but this residue is not X.
Uint4 _PSISequenceLengthWithoutX(const Uint1 *seq, Uint4 length)
Calculates the length of the sequence without including any 'X' residues.
static void s_PSIPurgeSimilarAlignments(_PSIPackedMsa *msa, Uint4 seq_index1, Uint4 seq_index2, double max_percent_identity)
This function compares the sequences in the msa->cell structure indexed by sequence_index1 and seq_in...
static void _PSIGetRightExtents(const _PSIMsa *msa, Uint4 seq_index)
Computes the right extents for the sequence identified by seq_index.
void _PSIUpdateLambdaK(const int **pssm, const Uint1 *query, Uint4 query_length, const double *std_probs, BlastScoreBlk *sbp)
Updates the Karlin-Altschul parameters based on the query sequence and PSSM's score frequencies.
static void _PSIComputePositionExtents(const _PSIMsa *msa, Uint4 seq_index, _PSIAlignedBlock *aligned_blocks)
Computes the aligned blocks extents' for each position for the sequence identified by seq_index.
#define PSEUDO_MAX
effective infinity
static double s_computeRelativeEntropy(const double *newDistribution, const double *backgroundProbabilities)
compute relative entropy of first distribution to second distribution A copy of posit....
static void _PSICalculateNormalizedSequenceWeights(const _PSIMsa *msa, const _PSIAlignedBlock *aligned_blocks, Uint4 position, const SDynamicUint4Array *aligned_seqs, _PSISequenceWeights *seq_weights)
Calculates the position based weights using a modified version of the Henikoff's algorithm presented ...
const unsigned int kQueryIndex
Index into multiple sequence alignment structure for the query sequence.
_PSIInternalPssmData * _PSIInternalPssmDataFree(_PSIInternalPssmData *pssm_data)
Deallocates the _PSIInternalPssmData structure.
int _PSIValidateMSA_StructureGroup(const _PSIMsa *msa)
Structure group validation function for multiple sequence alignment structure.
int _PSIScaleMatrix(const Uint1 *query, const double *std_probs, _PSIInternalPssmData *internal_pssm, BlastScoreBlk *sbp)
Scales the PSSM (stage 7)
int _PSIPurgeAlignedRegion(_PSIPackedMsa *msa, unsigned int seq_index, unsigned int start, unsigned int stop)
Marks the (start, stop] region corresponding to sequence seq_index in alignment so that it is not fur...
_PSIMsa * _PSIMsaFree(_PSIMsa *msa)
Deallocates the _PSIMsa data structure.
#define DEFINE_COPY_MATRIX_FUNCTION(type)
Implements the generic copy matrix functions.
_PSIAlignedBlock * _PSIAlignedBlockFree(_PSIAlignedBlock *aligned_blocks)
Deallocates the _PSIAlignedBlock structure.
const double kEpsilon
Small constant to test against 0.
double * _PSICalculateInformationContentFromFreqRatios(double **freq_ratios, const double *std_prob, Uint4 query_length, Uint4 alphabet_sz)
Calculates the information content from the residue frequencies calculated in stage 5 of the PSSM cre...
static void s_PSIPurgeSelfHits(_PSIPackedMsa *msa)
Remove those sequences which are identical to the query sequence.
#define EFFECTIVE_ALPHABET
size of alphabet used for pseudocounts calculations
int _PSISaveCDDiagnostics(const PSICdMsa *cd_msa, const _PSISequenceWeights *seq_weights, const _PSIInternalPssmData *internal_pssm, PSIDiagnosticsResponse *diagnostics)
Collects diagnostic information from the process of creating the CDD-based PSSM.
static void s_PSIPurgeNearIdenticalAlignments(_PSIPackedMsa *msa)
Keeps only one copy of any aligned sequences which are >kPSINearIdentical% identical to one another.
struct _PSIAlignmentTraits _PSIAlignmentTraits
Auxiliary structure to maintain information about two aligned regions between the query and a subject...
int _PSIValidateCdMSA(const PSICdMsa *cd_msa, Uint4 alphabet_size)
Validation of multiple alignment of conserved domains structure.
void _PSIUpdatePositionCounts(_PSIMsa *msa)
Counts the number of sequences matching the query per query position (columns of the multiple alignme...
const double kPSIIdentical
Percent identity threshold for discarding identical matches.
_PSIPackedMsa * _PSIPackedMsaNew(const PSIMsa *msa)
Allocates and initializes the compact version of the PSIMsa structure (makes a deep copy) for interna...
static void s_adjustColumnProbabilities(double *initialProbabilities, double *probabilitiesToReturn, double standardWeight, const double *standardProbabilities, double observations)
adjust the probabilities by assigning observations weight to initialProbabilities and standardWeight ...
const double kPositScalingPercent
Constant used in scaling PSSM routines: Successor to POSIT_PERCENT.
int _IMPALAScaleMatrix(const Uint1 *query, const double *std_probs, _PSIInternalPssmData *internal_pssm, BlastScoreBlk *sbp, double scaling_factor)
Provides a similar function to _PSIScaleMatrix but it performs the scaling as IMPALA did,...
static NCBI_INLINE void _PSIResetAlignmentTraits(_PSIAlignmentTraits *traits, Uint4 position)
Resets the traits structure to restart finite state machine.
_PSIPackedMsa * _PSIPackedMsaFree(_PSIPackedMsa *msa)
Deallocates the _PSIMsa data structure.
int _PSIValidateMSA(const _PSIMsa *msa, Boolean ignore_unaligned_positions)
Main validation function for multiple sequence alignment structure.
Private interface for Position Iterated BLAST API, contains the PSSM generation engine.
#define PSIERR_BADPARAM
Bad parameter used in function.
#define PSIERR_ENDINGGAP
Found flanking gap at end of alignment.
#define PSIERR_UNKNOWN
Unknown error.
#define PSIERR_COLUMNOFGAPS
Found an entire column full of GAP residues.
#define PSIERR_OUTOFMEM
Out of memory.
#define PSIERR_BADPROFILE
Errors in conserved domain profile.
#define PSIERR_POSITIVEAVGSCORE
Positive average score found when scaling matrix.
#define PSIERR_NOALIGNEDSEQS
After purge stage of PSSM creation, no sequences are left.
#define PSIERR_STARTINGGAP
Found flanking gap at start of alignment.
#define PSIERR_BADSEQWEIGHTS
Sequence weights do not add to 1.
#define PSI_SUCCESS
Successful operation.
#define PSIERR_UNALIGNEDCOLUMN
Found an entire column with no participating sequences.
#define PSIERR_GAPINQUERY
GAP residue found in query sequence.
#define BLAST_SCORE_MIN
minimum allowed score (for one letter comparison).
Int2 Blast_GetStdAlphabet(Uint1 alphabet_code, Uint1 *residues, Uint4 residue_size)
Fills a buffer with the 'standard' alphabet (given by STD_AMINO_ACID_FREQS[index]....
Int2 Blast_KarlinBlkUngappedCalc(Blast_KarlinBlk *kbp, Blast_ScoreFreq *sfp)
Computes the parameters lambda, H K for use in calculating the statistical significance of high-scori...
Blast_ScoreFreq * Blast_ScoreFreqFree(Blast_ScoreFreq *sfp)
Deallocates the score frequencies structure.
#define BLAST_SCORE_MAX
maximum allowed score (for one letter comparison).
Blast_ScoreFreq * Blast_ScoreFreqNew(Int4 score_min, Int4 score_max)
Creates a new structure to keep track of score frequencies for a scoring system.
Various auxiliary BLAST utility functions.
#define IS_residue(x)
Does character encode a residue?
double * BLAST_GetStandardAaProbabilities(void)
Get the standard amino acid probabilities.
Constants used in compositional score matrix adjustment.
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
#define BLASTAA_SIZE
Size of aminoacid alphabet.
#define BLASTAA_SEQ_CODE
== Seq_code_ncbistdaa
const Uint1 AMINOACID_TO_NCBISTDAA[]
Translates between ncbieaa and ncbistdaa.
const char NCBISTDAA_TO_AMINOACID[]
Translates between ncbieaa and ncbistdaa.
uint8_t Uint1
1-byte (8-bit) unsigned integer
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
unsigned int
A callback function used to compare two keys in a database.
static const char * GetResidue(TokenStatBlkPtr stoken)
SFreqRatios * _PSIMatrixFrequencyRatiosFree(SFreqRatios *freq_ratios)
Deallocate the frequency ratios structure.
SFreqRatios * _PSIMatrixFrequencyRatiosNew(const char *matrix_name)
Retrive the matrix's frequency ratios.
Definitions used to get joint probabilities for a scoring matrix.
const double * Blast_GetMatrixBackgroundFreq(const char *matrix_name)
Return true if frequency data is available for the given matrix name.
bool is_aligned(T *p) noexcept
Check pointer alignment.
Prototypes for portable math library (ported from C Toolkit)
#define NCBIMATH_LN2
Natural log(2)
long BLAST_Nint(double x)
Nearest integer.
#define MIN(a, b)
returns smaller of a and b.
#define NCBI_INLINE
"inline" seems to work on our remaining in-house compilers (WorkShop, Compaq, ICC,...
Uint1 Boolean
bool replacment for C
#define TRUE
bool replacment for C indicating true.
#define FALSE
bool replacment for C indicating false.
#define ASSERT
macro for assert.
#define MAX(a, b)
returns larger of a and b.
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
double lambda(size_t dimMatrix_, const Int4 *const *scoreMatrix_, const double *q_)
static const char * kScore
Structure used for scoring calculations.
Blast_KarlinBlk ** kbp_psi
K-A parameters for position-based alignments.
char * name
name of scoring matrix.
Int2 alphabet_size
size of alphabet.
Uint1 alphabet_code
NCBI alphabet code.
SBlastScoreMatrix * matrix
scoring matrix data
Blast_KarlinBlk * kbp_ideal
Ideal values (for query with average database composition).
Blast_KarlinBlk ** kbp_gap_std
K-A parameters for std (not position-based) alignments.
Blast_KarlinBlk ** kbp_gap_psi
K-A parameters for psi alignments.
double K
K value used in statistics.
double Lambda
Lambda value used in statistics.
double logK
natural log of K value used in statistics
Holds score frequencies used in calculation of Karlin-Altschul parameters for an ungapped search.
double score_avg
average score, must be negative for local alignment.
Int4 obs_min
lowest observed (actual) scores
double * sprob
arrays for frequency of given score, shifted down by score_min.
Int4 obs_max
highest observed (actual) scores
Structure used to pass data into the scaling routines.
Structure used to pass data into the scaling routines.
Options used in protein BLAST only (PSI, PHI, RPS and translated BLAST) Some of these possibly should...
double iobsr
Effective number of independent observations in a CD column.
double * wfreqs
Frequencies for each residue in CD column.
Uint1 is_aligned
Does this cell represent column aligned to a CD.
PSICdMsaCellData * data
Data needed for PSSM computation.
Data structure representing multiple alignemnt of CDs and query sequence along with data needed for P...
PSIMsaDimensions * dimensions
Query length and number of aligned cds.
unsigned char * query
Query sequence as Ncbistdaa.
PSICdMsaCell ** msa
Multiple alignment of CDs.
This structure contains the diagnostics information requested using the PSIDiagnosticsRequest structu...
double * information_content
position information content (query_length elements)
Uint4 ** residue_freqs
observed residue frequencies per position of the PSSM (Dimensions are query_length by alphabet_size)
double ** weighted_residue_freqs
Weighted observed residue frequencies per position of the PSSM.
Uint4 * interval_sizes
interval sizes of aligned regions (query_length elements)
Uint4 alphabet_size
Specifies length of alphabet.
Uint4 query_length
Specifies the number of positions in the PSSM.
double * gapless_column_weights
Weights for columns without gaps (query_length elements)
double * independent_observations
Effective number of observations per column.
Uint4 * num_matching_seqs
number of matching sequences per query position (query_length elements)
double * sigma
sigma (query_length elements)
double ** frequency_ratios
PSSM's frequency ratios (Dimensions are query_length by alphabet_size)
Boolean is_aligned
Is this letter part of the alignment?
Uint1 letter
Preferred letter at this position, in ncbistdaa encoding.
Structure representing the dimensions of the multiple sequence alignment data structure.
Uint4 num_seqs
Number of distinct sequences aligned with the query (does not include the query)
Uint4 query_length
Length of the query.
Multiple sequence alignment (msa) data structure containing the raw data needed by the PSSM engine to...
PSIMsaCell ** data
actual data, dimensions are (dimensions->num_seqs+1) by (dimensions->query_length)
PSIMsaDimensions * dimensions
dimensions of the msa
int ** data
actual scoring matrix data, stored in row-major form
Data structure to maintain a dynamically allocated array of Uint4.
Uint4 * data
array of Uint4
Uint4 num_used
number of elements used in this array
Uint4 num_allocated
size of array below
Stores the frequency ratios along with their bit scale factor.
double ** data
The actual frequency ratios.
int bit_scale_factor
Used to multiply the values in the above matrix to obtain scores in bit units.
A structure containing two integers, used e.g.
Int4 left
left endpoint of range (zero based)
Int4 right
right endpoint of range (zero based)
This structure keeps track of the regions aligned between the query sequence and those that were not ...
SSeqRange * pos_extnt
Dynamically allocated array of size query_length to keep track of the extents of each aligned positio...
Uint4 * size
Dynamically allocated array of size query_length that contains the size of the intervals in the array...
Auxiliary structure to maintain information about two aligned regions between the query and a subject...
Uint4 n_x_residues
number of X residues in alignment
Uint4 n_identical
number of identical residues in alignment
Uint4 start
starting offset of alignment w.r.t.
Uint4 effective_length
length of alignment not including Xs
Internal representation of a PSSM in various stages of its creation and its dimensions.
int ** scaled_pssm
scaled PSSM (scores)
Uint4 nrows
number of rows (alphabet_size)
double * pseudocounts
pseudocount constant for each column
Uint4 ncols
number of columns (query_length)
double ** freq_ratios
frequency ratios
Internal data structure to represent a position in the multiple sequence alignment data structure.
unsigned int letter
Preferred letter at this position.
SSeqRange extents
Extents of this aligned position.
unsigned int is_aligned
Is this letter part of the alignment?
Internal multiple alignment data structure used by the PSSM engine.
Uint4 * num_matching_seqs
number of sequences aligned at a given position in the multiple sequence alignment (length: query_len...
Uint1 * query
query sequence (length: query_length)
Uint4 ** residue_counts
matrix to keep track of the raw residue counts at each position of the multiple sequence alignment (d...
PSIMsaDimensions * dimensions
dimensions of field below
Uint4 alphabet_size
number of elements in alphabet
_PSIMsaCell ** cell
multiple sequence alignment matrix (dimensions: query_length x num_seqs + 1)
Compact version of the PSIMsaCell structure.
unsigned int letter
Preferred letter at this position, in ncbistdaa encoding.
unsigned int is_aligned
Is this letter part of the alignment?
Compact version of PSIMsa structure.
PSIMsaDimensions * dimensions
dimensions of the msa
Boolean * use_sequence
used to indicate whether a sequence should be used for further processing by the engine (length: num_...
_PSIPackedMsaCell ** data
actual data, dimensions are (dimensions->num_seqs+1) by (dimensions->query_length)
Internal data structure to keep computed sequence weights.
double ** match_weights
weighted observed residue frequencies (f_i in 2001 paper).
Uint4 posDistinctDistrib_size
Kept to deallocate field above.
double * std_prob
standard amino acid probabilities
double * norm_seq_weights
Stores the normalized sequence weights (length: num_seqs + 1)
int ** posDistinctDistrib
For position i, how many positions in its block have j distinct letters.
double * independent_observations
Number of independent sequences per column.
int * posNumParticipating
number of sequences at each position.
double * sigma
array of length query_length
double * gapless_column_weights
FIXME.
Uint4 match_weights_size
kept for help deallocate the field above
double * row_sigma
array of length num_seqs + 1
static Uint4 letter(char c)
voidp calloc(uInt items, uInt size)