NCBI C++ ToolKit
blast_aalookup.c
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blast_aalookup.c 98954 2023-01-26 13:38:46Z fongah2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  */
26 
27 /** @file blast_aalookup.c
28  * Functions interacting with the protein BLAST lookup table.
29  */
33 
34 /** Structure containing information needed for adding neighboring words.
35  */
36 typedef struct NeighborInfo {
37  BlastAaLookupTable *lookup; /**< Lookup table */
38  Uint1 *query_word; /**< the word whose neighbors we are computing */
39  Uint1 *subject_word; /**< the computed neighboring word */
40  Int4 alphabet_size; /**< number of letters in the alphabet */
41  Int4 wordsize; /**< number of residues in a word */
42  Int4 charsize; /**< number of bits in a residue */
43  Int4 **matrix; /**< the substitution matrix */
44  Int4 *row_max; /**< maximum possible score for each row of the matrix */
45  Int4 *offset_list; /**< list of offsets where the word occurs in the query */
46  Int4 threshold; /**< the score threshold for neighboring words */
47  Int4 query_bias; /**< bias all stored offsets for multiple queries */
49 
50 /**
51  * Index a query sequence; i.e. fill a lookup table with the offsets
52  * of query words
53  *
54  * @param lookup the lookup table [in/modified]
55  * @param matrix the substitution matrix [in]
56  * @param query the query sequence [in]
57  * @param query_bias number added to each offset put into lookup table
58  * (ordinarily 0; a nonzero value allows a succession of
59  * query sequences to update the same lookup table)
60  * @param location the list of ranges of query offsets to examine
61  * for indexing [in]
62  */
63 static void s_AddNeighboringWords(BlastAaLookupTable * lookup, Int4 ** matrix,
64  BLAST_SequenceBlk * query, Int4 query_bias,
66 
67 /**
68  * A position-specific version of AddNeighboringWords. Note that
69  * only the score matrix matters for indexing purposes, so an
70  * actual query sequence is unneccessary
71  *
72  * @param lookup the lookup table [in/modified]
73  * @param matrix the substitution matrix [in]
74  * @param query_bias number added to each offset put into lookup table
75  * (ordinarily 0; a nonzero value allows a succession of
76  * query sequences to update the same lookup table)
77  * @param location the list of ranges of query offsets to examine for indexing
78  */
80  Int4 ** matrix, Int4 query_bias,
82 
83 /** Add neighboring words to the lookup table.
84  * @param lookup Pointer to the lookup table.
85  * @param matrix Pointer to the substitution matrix.
86  * @param query Pointer to the query sequence.
87  * @param offset_list list of offsets where the word occurs in the query
88  * @param query_bias bias all stored offsets for multiple queries
89  * @param row_max maximum possible score for each row of the matrix
90  */
92  Int4 ** matrix, Uint1 * query,
93  Int4 * offset_list, Int4 query_bias,
94  Int4 * row_max);
95 
96 /** Add neighboring words to the lookup table using NeighborInfo structure.
97  * @param info Pointer to the NeighborInfo structure.
98  * @param score The partial sum of the score.
99  * @param current_pos The current offset.
100  */
101 static void s_AddWordHitsCore(NeighborInfo * info, Int4 score,
102  Int4 current_pos);
103 
104 /** Add neighboring words to the lookup table in case of a position-specific
105  * matrix.
106  * @param lookup Pointer to the lookup table.
107  * @param matrix The position-specific matrix.
108  * @param query_bias bias all stored offsets for multiple queries
109  * @param row_max maximum possible score for each row of the matrix
110  */
112  Int4 ** matrix, Int4 query_bias, Int4 * row_max);
113 
114 /** Add neighboring words to the lookup table in case of a position-specific
115  * matrix, using NeighborInfo structure.
116  * @param info Pointer to the NeighborInfo structure.
117  * @param score The partial sum of the score.
118  * @param current_pos The current offset.
119  */
121  Int4 score, Int4 current_pos);
122 
123 
125 {
126  Int4 i;
127  BlastRPSLookupFileHeader *lookup_header;
128  BlastRPSProfileHeader *profile_header;
129  BlastRPSLookupTable *lookup = *lut =
131  Int4 *pssm_start;
132  Int4 num_pssm_rows;
133  PV_ARRAY_TYPE *pv;
134 
135  ASSERT(info != NULL);
136 
137  /* Fill in the lookup table information. */
138 
139  lookup_header = info->lookup_header;
140  if (lookup_header->magic_number != RPS_MAGIC_NUM &&
141  lookup_header->magic_number != RPS_MAGIC_NUM_28)
142  return -1;
143 
144  /* set the alphabet size. Use hardwired numbers, since we cannot rely on
145  #define'd constants matching up to the sizes implicit in disk files */
146  if (lookup_header->magic_number == RPS_MAGIC_NUM)
147  lookup->alphabet_size = 26;
148  else
149  lookup->alphabet_size = 28;
150 
151  lookup->wordsize = BLAST_WORDSIZE_PROT;
152  lookup->charsize = ilog2(lookup->alphabet_size) + 1;
153  lookup->backbone_size = 1 << (lookup->wordsize * lookup->charsize);
154  lookup->mask = lookup->backbone_size - 1;
155  lookup->rps_backbone = (RPSBackboneCell *) ((Uint1 *) lookup_header +
156  lookup_header->
157  start_of_backbone);
158  lookup->overflow =
159  (Int4 *) ((Uint1 *) lookup_header + lookup_header->start_of_backbone +
160  (lookup->backbone_size + 1) * sizeof(RPSBackboneCell));
161  lookup->overflow_size = lookup_header->overflow_hits;
162 
163  /* fill in the pv_array */
164 
165  pv = lookup->pv = (PV_ARRAY_TYPE *) calloc(
166  (lookup->backbone_size >> PV_ARRAY_BTS),
167  sizeof(PV_ARRAY_TYPE));
168 
169  for (i = 0; i < lookup->backbone_size; i++) {
170  if (lookup->rps_backbone[i].num_used > 0) {
171  PV_SET(pv, i, PV_ARRAY_BTS);
172  }
173  }
174 
175  /* Fill in the PSSM information */
176 
177  profile_header = info->profile_header;
178  if (profile_header->magic_number != RPS_MAGIC_NUM &&
179  profile_header->magic_number != RPS_MAGIC_NUM_28)
180  return -2;
181 
182  lookup->rps_seq_offsets = profile_header->start_offsets;
183  lookup->num_profiles = profile_header->num_profiles;
184  num_pssm_rows = lookup->rps_seq_offsets[lookup->num_profiles];
185  lookup->rps_pssm = (Int4 **) malloc((num_pssm_rows + 1) * sizeof(Int4 *));
186  pssm_start = profile_header->start_offsets + lookup->num_profiles + 1;
187 
188  for (i = 0; i < num_pssm_rows + 1; i++) {
189  lookup->rps_pssm[i] = pssm_start;
190  pssm_start += lookup->alphabet_size;
191  }
192 
193  /* divide the concatenated database into regions of size RPS_BUCKET_SIZE.
194  bucket_array will then be used to organize offsets retrieved from the
195  lookup table in order to increase cache reuse */
196 
197  lookup->num_buckets = num_pssm_rows / RPS_BUCKET_SIZE + 1;
198  lookup->bucket_array = (RPSBucket *) malloc(lookup->num_buckets *
199  sizeof(RPSBucket));
200  for (i = 0; i < lookup->num_buckets; i++) {
201  RPSBucket *bucket = lookup->bucket_array + i;
202  bucket->num_filled = 0;
203  bucket->num_alloc = 1000;
204  bucket->offset_pairs = (BlastOffsetPair *) malloc(bucket->num_alloc *
205  sizeof
206  (BlastOffsetPair));
207  }
208 
209  return 0;
210 }
211 
213 {
214  /* The following will only free memory that was allocated by
215  RPSLookupTableNew. */
216  Int4 i;
217  for (i = 0; i < lookup->num_buckets; i++)
218  sfree(lookup->bucket_array[i].offset_pairs);
219  sfree(lookup->bucket_array);
220 
221  sfree(lookup->rps_pssm);
222  sfree(lookup->pv);
223  sfree(lookup);
224  return NULL;
225 }
226 
228  BlastAaLookupTable * *lut)
229 {
230  Int4 i;
231  BlastAaLookupTable *lookup = *lut =
233 
234  ASSERT(lookup != NULL);
235 
236  lookup->charsize = ilog2(BLASTAA_SIZE) + 1;
237  lookup->word_length = opt->word_size;
238 
239  for (i = 0; i < lookup->word_length; i++)
240  lookup->backbone_size |= (BLASTAA_SIZE - 1) << (i * lookup->charsize);
241  lookup->backbone_size++;
242 
243  lookup->mask = (1 << (opt->word_size * lookup->charsize)) - 1;
244  lookup->alphabet_size = BLASTAA_SIZE;
245  lookup->threshold = (Int4)opt->threshold;
246  lookup->thin_backbone =
247  (Int4 **) calloc(lookup->backbone_size, sizeof(Int4 *));
248  ASSERT(lookup->thin_backbone != NULL);
249 
250  lookup->thick_backbone = NULL;
251  lookup->overflow = NULL;
252  lookup->pv = NULL;
253  return 0;
254 }
255 
256 
258 {
259  sfree(lookup->thick_backbone);
260  sfree(lookup->overflow);
261  sfree(lookup->pv);
262  sfree(lookup);
263  return NULL;
264 }
265 
266 
268 {
269  Int4 i,j;
270  Int4 overflow_cells_needed = 0;
271  Int4 overflow_cursor = 0;
272  Int4 longest_chain = 0;
273  PV_ARRAY_TYPE *pv;
276 #ifdef LOOKUP_VERBOSE
277  Int4 backbone_occupancy = 0;
278  Int4 thick_backbone_occupancy = 0;
279  Int4 num_overflows = 0;
280 #endif
281 
282  /* find out how many cells need the overflow array */
283  for (i = 0; i < lookup->backbone_size; i++) {
284  if (lookup->thin_backbone[i]) {
285 #ifdef LOOKUP_VERBOSE
286  backbone_occupancy++;
287 #endif
288  if (lookup->thin_backbone[i][1] > AA_HITS_PER_CELL){
289 #ifdef LOOKUP_VERBOSE
290  ++num_overflows;
291 #endif
292  overflow_cells_needed += lookup->thin_backbone[i][1];
293  }
294  if (lookup->thin_backbone[i][1] > longest_chain)
295  longest_chain = lookup->thin_backbone[i][1];
296  }
297  }
298  lookup->overflow_size = overflow_cells_needed;
299  lookup->longest_chain = longest_chain;
300 
301 #ifdef LOOKUP_VERBOSE
302  thick_backbone_occupancy = backbone_occupancy - num_overflows;
303  printf("backbone size: %d\n", lookup->backbone_size);
304  printf("backbone occupancy: %d (%f%%)\n", backbone_occupancy,
305  100.0 * backbone_occupancy / lookup->backbone_size);
306  printf("thick_backbone occupancy: %d (%f%%)\n",
307  thick_backbone_occupancy,
308  100.0 * thick_backbone_occupancy / lookup->backbone_size);
309  printf("num_overflows: %d\n", num_overflows);
310  printf("overflow size: %d\n", overflow_cells_needed);
311  printf("longest chain: %d\n", longest_chain);
312  printf("exact matches: %d\n", lookup->exact_matches);
313  printf("neighbor matches: %d\n", lookup->neighbor_matches);
314 #endif
315 
316  /* bone-dependent lookup table filling-up */
317  lookup->bone_type = bone_type;
318 
319  /* backbone using Int4 as storage unit */
320  if(bone_type==eBackbone){
321  /* allocate new lookup table */
322  lookup->thick_backbone =
323  calloc(lookup->backbone_size, sizeof(AaLookupBackboneCell));
324  ASSERT(lookup->thick_backbone != NULL);
325  bbc = (AaLookupBackboneCell *) lookup->thick_backbone;
326  /* allocate the pv_array */
327  pv = lookup->pv = (PV_ARRAY_TYPE *) calloc(
328  (lookup->backbone_size >> PV_ARRAY_BTS) + 1,
329  sizeof(PV_ARRAY_TYPE));
330  ASSERT(pv != NULL);
331  /* allocate the overflow array */
332  if (overflow_cells_needed > 0) {
333  lookup->overflow =
334  calloc(overflow_cells_needed, sizeof(Int4));
335  ASSERT(lookup->overflow != NULL);
336  }
337  /* fill the lookup table */
338  for (i = 0; i < lookup->backbone_size; i++) {
339  /* if there are hits there, */
340  if (lookup->thin_backbone[i] ) {
341  Int4 * dest = NULL;
342  /* set the corresponding bit in the pv_array */
343  PV_SET(pv, i, PV_ARRAY_BTS);
344  bbc[i].num_used = lookup->thin_backbone[i][1];
345  /* if there are three or fewer hits, */
346  if (lookup->thin_backbone[i][1] <= AA_HITS_PER_CELL)
347  /* copy them into the thick_backbone cell */
348  dest = bbc[i].payload.entries;
349  else /* more than three hits; copy to overflow array */
350  {
351  bbc[i].payload.overflow_cursor = overflow_cursor;
352  dest = (Int4 *) lookup->overflow;
353  dest += overflow_cursor;
354  overflow_cursor += lookup->thin_backbone[i][1];
355  }
356  for (j=0; j <lookup->thin_backbone[i][1]; j++)
357  dest[j] = lookup->thin_backbone[i][j + 2];
358  /* done with this chain- free it */
359  sfree(lookup->thin_backbone[i]);
360  lookup->thin_backbone[i] = NULL;
361  }
362  else /* no hits here */
363  bbc[i].num_used = 0;
364  } /* end for */
365  } /* end of original scheme*/
366  /* Smallbone, using Uint2 as storage unit */
367  else{
368  /* allocate new lookup table */
369  lookup->thick_backbone =
370  calloc(lookup->backbone_size, sizeof(AaLookupSmallboneCell));
371  ASSERT(lookup->thick_backbone != NULL);
372  sbc = (AaLookupSmallboneCell *) lookup->thick_backbone;
373  /* allocate the pv_array */
374  pv = lookup->pv = (PV_ARRAY_TYPE *) calloc(
375  (lookup->backbone_size >> PV_ARRAY_BTS) + 1,
376  sizeof(PV_ARRAY_TYPE));
377  ASSERT(pv != NULL);
378  /* allocate the overflow array */
379  if (overflow_cells_needed > 0) {
380  lookup->overflow =
381  calloc(overflow_cells_needed, sizeof(Uint2));
382  ASSERT(lookup->overflow != NULL);
383  }
384  /* fill the lookup table */
385  for (i = 0; i < lookup->backbone_size; i++) {
386  if (lookup->thin_backbone[i] ) {
387  Uint2 * dest = NULL;
388  PV_SET(pv, i, PV_ARRAY_BTS);
389  sbc[i].num_used = lookup->thin_backbone[i][1];
390  if ((lookup->thin_backbone[i])[1] <= AA_HITS_PER_CELL)
391  dest=sbc[i].payload.entries;
392  else{
393  sbc[i].payload.overflow_cursor = overflow_cursor;
394  dest=((Uint2 *) (lookup->overflow))+overflow_cursor;
395  overflow_cursor += lookup->thin_backbone[i][1];
396  }
397  for (j=0; j <lookup->thin_backbone[i][1]; j++)
398  dest[j] = lookup->thin_backbone[i][j + 2];
399  sfree(lookup->thin_backbone[i]);
400  lookup->thin_backbone[i] = NULL;
401  }
402  else sbc[i].num_used = 0;
403  } /* end for */
404  } /* end of the small backbone */
405 
406  /* done copying hit info- free the backbone */
407  sfree(lookup->thin_backbone);
408  lookup->thin_backbone = NULL;
409 
410  return 0;
411 }
412 
414  Int4 ** matrix,
417  Int4 query_bias)
418 {
419  if (lookup->use_pssm) {
420  s_AddPSSMNeighboringWords(lookup, matrix, query_bias, location);
421  }
422  else {
423  ASSERT(query != NULL);
424  s_AddNeighboringWords(lookup, matrix, query, query_bias, location);
425  }
426 }
427 
429  BLAST_SequenceBlk * query, Int4 query_bias,
431 {
432  Int4 i, j;
433  Int4 **exact_backbone;
434  Int4 row_max[BLASTAA_SIZE];
435 
436  ASSERT(lookup->alphabet_size <= BLASTAA_SIZE);
437 
438  /* Determine the maximum possible score for each row of the score matrix */
439 
440  for (i = 0; i < lookup->alphabet_size; i++) {
441  row_max[i] = matrix[i][0];
442  for (j = 1; j < lookup->alphabet_size; j++)
443  row_max[i] = MAX(row_max[i], matrix[i][j]);
444  }
445 
446  /* create an empty backbone */
447 
448  exact_backbone = (Int4 **) calloc(lookup->backbone_size, sizeof(Int4 *));
449 
450  /* find all the exact matches, grouping together all offsets of identical
451  query words. The query bias is not used here, since the next stage
452  will need real offsets into the query sequence */
453 
454  BlastLookupIndexQueryExactMatches(exact_backbone, lookup->word_length,
455  lookup->charsize, lookup->word_length,
456  query, location);
457 
458  /* walk though the list of exact matches previously computed. Find
459  neighboring words for entire lists at a time */
460 
461  for (i = 0; i < lookup->backbone_size; i++) {
462  if (exact_backbone[i] != NULL) {
463  s_AddWordHits(lookup, matrix, query->sequence,
464  exact_backbone[i], query_bias, row_max);
465  sfree(exact_backbone[i]);
466  }
467  }
468 
469  sfree(exact_backbone);
470 }
471 
472 static void s_AddWordHits(BlastAaLookupTable * lookup, Int4 ** matrix,
473  Uint1 * query, Int4 * offset_list,
474  Int4 query_bias, Int4 * row_max)
475 {
476  Uint1 *w;
477  Uint1 s[32]; /* larger than any possible wordsize */
478  Int4 score;
479  Int4 i;
481 
482 #ifdef LOOKUP_VERBOSE
483  lookup->exact_matches += offset_list[1];
484 #endif
485 
486  /* All of the offsets in the list refer to the same query word. Thus,
487  neighboring words only have to be found for the first offset in the
488  list (since all other offsets would have the same neighbors) */
489 
490  w = query + offset_list[2];
491 
492  /* Compute the self-score of this word */
493 
494  score = matrix[w[0]][w[0]];
495  for (i = 1; i < lookup->word_length; i++)
496  score += matrix[w[i]][w[i]];
497 
498  /* If the self-score is above the threshold, then the neighboring
499  computation will automatically add the word to the lookup table.
500  Otherwise, either the score is too low or neighboring is not done at
501  all, so that all of these exact matches must be explicitly added to
502  the lookup table */
503 
504  if (lookup->threshold == 0 || score < lookup->threshold) {
505  for (i = 0; i < offset_list[1]; i++) {
506  BlastLookupAddWordHit(lookup->thin_backbone, lookup->word_length,
507  lookup->charsize, w,
508  query_bias + offset_list[i + 2]);
509  }
510  } else {
511 #ifdef LOOKUP_VERBOSE
512  lookup->neighbor_matches -= offset_list[1];
513 #endif
514  }
515 
516  /* check if neighboring words need to be found */
517 
518  if (lookup->threshold == 0)
519  return;
520 
521  /* Set up the structure of information to be used during the recursion */
522 
523  info.lookup = lookup;
524  info.query_word = w;
525  info.subject_word = s;
526  info.alphabet_size = lookup->alphabet_size;
527  info.wordsize = lookup->word_length;
528  info.charsize = lookup->charsize;
529  info.matrix = matrix;
530  info.row_max = row_max;
531  info.offset_list = offset_list;
532  info.threshold = lookup->threshold;
533  info.query_bias = query_bias;
534 
535  /* compute the largest possible score that any neighboring word can have;
536  this maximum will gradually be replaced by exact scores as subject
537  words are built up */
538 
539  score = row_max[w[0]];
540  for (i = 1; i < lookup->word_length; i++)
541  score += row_max[w[i]];
542 
543  s_AddWordHitsCore(&info, score, 0);
544 }
545 
546 static void s_AddWordHitsCore(NeighborInfo * info, Int4 score,
547  Int4 current_pos)
548 {
549  Int4 alphabet_size = info->alphabet_size;
550  Int4 threshold = info->threshold;
551  Uint1 *query_word = info->query_word;
552  Uint1 *subject_word = info->subject_word;
553  Int4 *row;
554  Int4 i;
555 
556  /* remove the maximum score of letters that align with the query letter
557  at position 'current_pos'. Later code will align the entire alphabet
558  with this letter, and compute the exact score each time. Also point to
559  the row of the score matrix corresponding to the query letter at
560  current_pos */
561 
562  score -= info->row_max[query_word[current_pos]];
563  row = info->matrix[query_word[current_pos]];
564 
565  if (current_pos == info->wordsize - 1) {
566 
567  /* The recursion has bottomed out, and we can produce complete
568  subject words. Pass the entire alphabet through the last position
569  in the subject word, then save the list of query offsets in all
570  positions corresponding to subject words that yield a high enough
571  score */
572 
573  Int4 *offset_list = info->offset_list;
574  Int4 query_bias = info->query_bias;
575  Int4 wordsize = info->wordsize;
576  Int4 charsize = info->charsize;
577  BlastAaLookupTable *lookup = info->lookup;
578  Int4 j;
579 
580  for (i = 0; i < alphabet_size; i++) {
581  if (score + row[i] >= threshold) {
582  subject_word[current_pos] = i;
583  for (j = 0; j < offset_list[1]; j++) {
584  BlastLookupAddWordHit(lookup->thin_backbone, wordsize,
585  charsize, subject_word,
586  query_bias + offset_list[j + 2]);
587  }
588 #ifdef LOOKUP_VERBOSE
589  lookup->neighbor_matches += offset_list[1];
590 #endif
591  }
592  }
593  return;
594  }
595 
596  /* Otherwise, pass the entire alphabet through position current_pos of
597  the subject word, and recurse on all words that could possibly exceed
598  the threshold later */
599 
600  for (i = 0; i < alphabet_size; i++) {
601  if (score + row[i] >= threshold) {
602  subject_word[current_pos] = i;
603  s_AddWordHitsCore(info, score + row[i], current_pos + 1);
604  }
605  }
606 }
607 
609  Int4 ** matrix, Int4 query_bias,
611 {
612  Int4 offset;
613  Int4 i, j;
614  BlastSeqLoc *loc;
615  Int4 *row_max;
616  Int4 wordsize = lookup->word_length;
617 
618  /* for PSSMs, we only have to track the maximum score of 'wordsize'
619  matrix columns */
620 
621  row_max = (Int4 *) malloc(lookup->word_length * sizeof(Int4));
622  ASSERT(row_max != NULL);
623 
624  for (loc = location; loc; loc = loc->next) {
625  Int4 from = loc->ssr->left;
626  Int4 to = loc->ssr->right - wordsize + 1;
627  Int4 **row = matrix + from;
628 
629  /* prepare to start another run of adjacent query words. Find the
630  maximum possible score for the first wordsize-1 rows of the PSSM */
631 
632  if (to >= from) {
633  for (i = 0; i < wordsize - 1; i++) {
634  row_max[i] = row[i][0];
635  for (j = 1; j < lookup->alphabet_size; j++)
636  row_max[i] = MAX(row_max[i], row[i][j]);
637  }
638  }
639 
640  for (offset = from; offset <= to; offset++, row++) {
641  /* find the maximum score of the next PSSM row */
642 
643  row_max[wordsize - 1] = row[wordsize - 1][0];
644  for (i = 1; i < lookup->alphabet_size; i++)
645  row_max[wordsize - 1] = MAX(row_max[wordsize - 1],
646  row[wordsize - 1][i]);
647 
648  /* find all neighboring words */
649 
650  s_AddPSSMWordHits(lookup, row, offset + query_bias, row_max);
651 
652  /* shift the list of maximum scores over by one, to make room for
653  the next maximum in the next loop iteration */
654 
655  for (i = 0; i < wordsize - 1; i++)
656  row_max[i] = row_max[i + 1];
657  }
658  }
659 
660  sfree(row_max);
661 }
662 
664  Int4 offset, Int4 * row_max)
665 {
666  Uint1 s[32]; /* larger than any possible wordsize */
667  Int4 score;
668  Int4 i;
670 
671  /* Set up the structure of information to be used during the recursion */
672 
673  info.lookup = lookup;
674  info.query_word = NULL;
675  info.subject_word = s;
676  info.alphabet_size = lookup->alphabet_size;
677  info.wordsize = lookup->word_length;
678  info.charsize = lookup->charsize;
679  info.matrix = matrix;
680  info.row_max = row_max;
681  info.offset_list = NULL;
682  info.threshold = lookup->threshold;
683  info.query_bias = offset;
684 
685  /* compute the largest possible score that any neighboring word can have;
686  this maximum will gradually be replaced by exact scores as subject
687  words are built up */
688 
689  score = row_max[0];
690  for (i = 1; i < lookup->word_length; i++)
691  score += row_max[i];
692 
693  s_AddPSSMWordHitsCore(&info, score, 0);
694 }
695 
697  Int4 current_pos)
698 {
699  Int4 alphabet_size = info->alphabet_size;
700  Int4 threshold = info->threshold;
701  Uint1 *subject_word = info->subject_word;
702  Int4 *row;
703  Int4 i;
704 
705  /* remove the maximum score of letters that align with the query letter
706  at position 'current_pos'. Later code will align the entire alphabet
707  with this letter, and compute the exact score each time. Also point to
708  the row of the score matrix corresponding to the query letter at
709  current_pos */
710 
711  score -= info->row_max[current_pos];
712  row = info->matrix[current_pos];
713 
714  if (current_pos == info->wordsize - 1) {
715 
716  /* The recursion has bottomed out, and we can produce complete
717  subject words. Pass the entire alphabet through the last position
718  in the subject word, then save the query offset in all lookup
719  table positions corresponding to subject words that yield a high
720  enough score */
721 
722  Int4 offset = info->query_bias;
723  Int4 wordsize = info->wordsize;
724  Int4 charsize = info->charsize;
725  BlastAaLookupTable *lookup = info->lookup;
726 
727  for (i = 0; i < alphabet_size; i++) {
728  if (score + row[i] >= threshold) {
729  subject_word[current_pos] = i;
730  BlastLookupAddWordHit(lookup->thin_backbone, wordsize,
731  charsize, subject_word, offset);
732 #ifdef LOOKUP_VERBOSE
733  lookup->neighbor_matches++;
734 #endif
735  }
736  }
737  return;
738  }
739 
740  /* Otherwise, pass the entire alphabet through position current_pos of
741  the subject word, and recurse on all words that could possibly exceed
742  the threshold later */
743 
744  for (i = 0; i < alphabet_size; i++) {
745  if (score + row[i] >= threshold) {
746  subject_word[current_pos] = i;
747  s_AddPSSMWordHitsCore(info, score + row[i], current_pos + 1);
748  }
749  }
750 }
751 
752 /** Fetch next vacant cell from a bank.
753  * @param[in] lookup compressed protein lookup table
754  * @return pointer to reserved cell
755  */
756 static CompressedOverflowCell*
758 {
759  if (lookup->curr_overflow_cell ==
761  /* need a new bank */
762  Int4 bank_idx = lookup->curr_overflow_bank + 1;
763  lookup->overflow_banks[bank_idx] = (CompressedOverflowCell*) malloc(
765  sizeof(CompressedOverflowCell));
767  ASSERT(lookup->overflow_banks[bank_idx]);
768  lookup->curr_overflow_bank++;
769  lookup->curr_overflow_cell = 0;
770  }
771 
772  return lookup->overflow_banks[lookup->curr_overflow_bank] +
773  lookup->curr_overflow_cell++;
774 }
775 
776 /** Add a single query offset to the compressed
777  * alphabet protein lookup table
778  * @param lookup The lookup table [in]
779  * @param index The hashtable index into which the query offset goes [in]
780  * @param query_offset Query offset to add [in]
781  */
784  Int4 index,
785  Int4 query_offset)
786 {
787  CompressedLookupBackboneCell *backbone_cell = lookup->backbone + index;
788  Int4 num_entries = backbone_cell->num_used;
789 
790  switch (num_entries) {
791  case 0:
792  backbone_cell->query_offset = query_offset;
793  break;
794  case 1:
795  case 2:
796  case 3:
797  case 4:
798  backbone_cell->payload.query_offsets[num_entries-1] = query_offset;
799  break;
800  case 5:
801  {
802  Int4 tmp_offset_0 = backbone_cell->payload.query_offsets[0];
803  Int4 tmp_offset_1 = backbone_cell->payload.query_offsets[1];
804 
805  /* fetch next vacant cell */
807 
808  /* this cell is always the end of the list */
809  new_cell->next = NULL;
810 
811  /* store the last element of the original backbone cell */
812  new_cell->query_offsets[0] = backbone_cell->payload.query_offsets[2];
813 
814  new_cell->query_offsets[1] = backbone_cell->payload.query_offsets[3];
815  /* store this new offset too */
816  new_cell->query_offsets[2] = query_offset;
817 
818  backbone_cell->payload.overflow_list.query_offsets[0] = tmp_offset_0;
819  backbone_cell->payload.overflow_list.query_offsets[1] = tmp_offset_1;
820 
821  /* make backbone point to this new, one-cell long list */
822  backbone_cell->payload.overflow_list.head = new_cell;
823  }
824  break;
825  default:
826  {
827  /* continue with existing overflow list */
828  /* find the index into the current overflow cell; we
829  do not store the current index in every cell, to
830  save space */
831  Int4 cell_index = (num_entries -3) & COMPRESSED_HITS_CELL_MASK ;
832 
833  if (cell_index == 0 ) { /* can't be empty => it's full */
834 
835  /* fetch next vacant cell */
837 
838  /* shuffle the pointers */
839  new_cell->next = backbone_cell->payload.overflow_list.head;
840  backbone_cell->payload.overflow_list.head = new_cell;
841  }
842 
843  /* head always points to a cell with free space */
844  backbone_cell->payload.overflow_list.head->query_offsets[cell_index] = query_offset;
845  }
846  break;
847  }
848 
849  backbone_cell->num_used++;
850 }
851 
852 /** Add a single query offset to the compressed lookup table.
853  * The index is computed using the letters in w[], which is
854  * assumed to already be converted to the compressed alphabet
855  * @param lookup Pointer to the lookup table. [in][out]
856  * @param w Word to add [in]
857  * @param query_offset The offset in the query where the word occurs [in]
858  */
861  Uint1* w,
862  Int4 query_offset)
863 {
864  Int4 index;
865 
866  static const Int4 W7p1[] = { 0, 10, 20, 30, 40, 50, 60, 70, 80, 90};
867  static const Int4 W7p2[] = { 0, 100, 200, 300, 400, 500, 600, 700, 800,
868  900};
869  static const Int4 W7p3[] = { 0, 1000, 2000, 3000, 4000, 5000, 6000,
870  7000, 8000, 9000};
871  static const Int4 W7p4[] = { 0, 10000, 20000, 30000, 40000, 50000, 60000,
872  70000, 80000, 90000};
873  static const Int4 W7p5[] = { 0, 100000, 200000, 300000, 400000, 500000,
874  600000, 700000, 800000, 900000};
875  static const Int4 W7p6[] = { 0, 1000000, 2000000, 3000000, 4000000,
876  5000000, 6000000, 7000000, 8000000, 9000000};
877 
878  static const Int4 W6p1[] = { 0, 15, 30, 45, 60, 75, 90, 105, 120, 135,
879  150, 165, 180, 195, 210};
880  static const Int4 W6p2[] = { 0, 225, 450, 675, 900, 1125, 1350, 1575,
881  1800, 2025, 2250, 2475, 2700, 2925, 3150};
882  static const Int4 W6p3[] = { 0, 3375, 6750, 10125, 13500, 16875, 20250,
883  23625, 27000, 30375, 33750, 37125, 40500,
884  43875, 47250};
885  static const Int4 W6p4[] = { 0, 50625, 101250, 151875, 202500, 253125,
886  303750, 354375, 405000, 455625, 506250,
887  556875, 607500, 658125, 708750};
888  static const Int4 W6p5[] = { 0, 759375, 1518750, 2278125, 3037500,
889  3796875, 4556250, 5315625, 6075000, 6834375,
890  7593750, 8353125, 9112500, 9871875, 10631250};
891 
892 
893 
894  switch (lookup->word_length) {
895  case 5:
896  index = w[0] + W6p1[w[1]] + W6p2[w[2]] + W6p3[w[3]] + W6p4[w[4]];
897  break;
898  case 6:
899  index = w[0] + W6p1[w[1]] + W6p2[w[2]] + W6p3[w[3]] + W6p4[w[4]] + W6p5[w[5]];
900  break;
901  case 7:
902  index = w[0] + W7p1[w[1]] + W7p2[w[2]] + W7p3[w[3]] + W7p4[w[4]] + W7p5[w[5]] + W7p6[w[6]];
903  break;
904  default:
905  index = 0;
906  break;
907  }
908 
909  s_CompressedLookupAddWordHit(lookup, index, query_offset);
910 }
911 
912 /** Add a single query offset to the compressed lookup table.
913  * The index is computed using the letters in w[], which is
914  * assumed to be in the standard alphabet (i.e. not compressed)
915  * @param lookup Pointer to the lookup table. [in][out]
916  * @param w word to add [in]
917  * @param query_offset the offset in the query where the word occurs [in]
918  */
921  Uint1* w,
922  Int4 query_offset)
923 {
924  Int4 skip = 0;
925  Int4 index = s_ComputeCompressedIndex(lookup->word_length, w,
926  lookup->compressed_alphabet_size,
927  &skip, lookup);
928  if (skip == 0)
929  s_CompressedLookupAddWordHit(lookup, index, query_offset);
930 }
931 
932 /** Structure containing information needed for adding neighboring words
933  * (specific to compressed lookup table)
934  */
935 typedef struct CompressedNeighborInfo {
936  BlastCompressedAaLookupTable *lookup; /**< Lookup table */
937  Uint1 *query_word; /**< the word whose neighbors we are computing */
938  Uint1 *subject_word; /**< the computed neighboring word */
939  Int4 compressed_alphabet_size; /**< for use with compressed alphabet */
940  Int4 wordsize; /**< number of residues in a word */
941  Int4 **matrix; /**< the substitution matrix */
942  Int4 row_max[BLASTAA_SIZE]; /**< maximum possible score for each
943  row of the matrix */
944  Int4 query_offset; /**< a single query offset to index */
945  Int4 threshold; /**< the score threshold for neighboring words */
946  Int4 matrixSorted[BLASTAA_SIZE][BLASTAA_SIZE]; /**< version of substitution
947  matrix whose rows are
948  sorted by score */
950  the letters permuted identically
951  to that of matrixSorted */
953 
954 /** Structure used as a helper for sorting matrix according to substitution
955  * score
956  */
958  Int4 diff; /**< score difference from row maximum */
959  Uint1 letter; /**< given protein letter */
961 
962 /** callback for the "sort" */
963 static int ScoreDifferenceSort(const void * a, const void *b ){
964  return (((LetterAndScoreDifferencePair*)a)->diff -
965  ((LetterAndScoreDifferencePair*)b)->diff);
966 }
967 
968 /** Prepare "score sorted" version of the substitution matrix"
969  * @param info Pointer to the NeighborInfo structure.
970  */
972 
974  Int4 i;
975  Int4 longChar, shortChar;
976 
977  for (longChar = 0; longChar < BLASTAA_SIZE; longChar++) {
978  for (shortChar = 0; shortChar <
979  info->compressed_alphabet_size; shortChar++) {
980 
981  sortTable[shortChar].diff = info->row_max[longChar] -
982  info->matrix[longChar][shortChar];
983  sortTable[shortChar].letter = shortChar;
984  }
985 
986  qsort(sortTable, info->compressed_alphabet_size,
988 
989  for (i = 0; i < info->compressed_alphabet_size; i++) {
990  Uint1 letter = sortTable[i].letter;
991 
992  info->matrixSorted[longChar][i] = info->matrix[longChar][letter];
993  info->matrixSortedChar[longChar][i] = letter;
994  }
995  }
996 }
997 
998 /** Very similar to s_AddWordHitsCore
999  * @param info Pointer to the NeighborInfo structure.
1000  * @param score The partial sum of the score.
1001  * @param current_pos The current offset.
1002  */
1004  Int4 score, Int4 current_pos)
1005 {
1006  Uint1 *query_word = info->query_word;
1007  Uint1 *subject_word = info->subject_word;
1008  Int4 i;
1009  Int4 *rowSorted;
1010  Uint1 *charSorted;
1011  Int4 currQueryChar = query_word[current_pos];
1012 
1013  /* remove the maximum score of letters that align with the query letter
1014  at position 'current_pos'. Later code will align the entire alphabet
1015  with this letter, and compute the exact score each time. Also point to
1016  the row of the score matrix corresponding to the query letter at
1017  current_pos */
1018 
1019  score -= info->row_max[currQueryChar];
1020  rowSorted = info->matrixSorted[currQueryChar];
1021  charSorted = info->matrixSortedChar[currQueryChar];
1022 
1023  if (current_pos == info->wordsize - 1) {
1024 
1025  /* The recursion has bottomed out, and we can produce complete
1026  subject words. Pass (a portion of) the alphabet through the
1027  last position in the subject word, then saving the query offset
1028  in the lookup table position corresponding to subject word i */
1029 
1031  Int4 query_offset = info->query_offset;
1032 
1033  for (i = 0; i < info->compressed_alphabet_size &&
1034  (score + rowSorted[i] >= info->threshold); i++) {
1035  subject_word[current_pos] = charSorted[i];
1036  s_CompressedLookupAddEncoded(lookup, subject_word,
1037  query_offset);
1038 #ifdef LOOKUP_VERBOSE
1039  lookup->neighbor_matches++;
1040 #endif
1041  }
1042  return;
1043  }
1044 
1045  /* Otherwise, pass (a portion of) the alphabet through position
1046  current_pos of the subject word, and recurse on all words that
1047  could possibly exceed the threshold later */
1048 
1049  for (i = 0; i < info->compressed_alphabet_size &&
1050  (score + rowSorted[i] >= info->threshold); i++) {
1051  subject_word[current_pos] = charSorted[i];
1052  s_CompressedAddWordHitsCore(info, score + rowSorted[i],
1053  current_pos + 1);
1054  }
1055 }
1056 
1057 /** Add neighboring words to the lookup table (compressed alphabet).
1058  * @param info Pointer to the NeighborInfo structure.
1059  * @param query Pointer to the query sequence.
1060  * @param query_offset offset where the word occurs in the query
1061  */
1063  Uint1 * query, Int4 query_offset)
1064 {
1065  Uint1 *w = query + query_offset;
1066  Uint1 s[32]; /* larger than any possible wordsize */
1067  Int4 score;
1068  Int4 i;
1070 
1071 #ifdef LOOKUP_VERBOSE
1072  lookup->exact_matches++;
1073 #endif
1074 
1075  /* Compute the self-score of the query word */
1076 
1077  score = 0;
1078  for (i = 0; i < lookup->word_length; i++) {
1079  int c = lookup->compress_table[w[i]];
1080 
1081  if (c >= lookup->compressed_alphabet_size) /* "non-20 aa": skip it*/
1082  return;
1083 
1084  score += info->matrix[w[i]][c];
1085  }
1086 
1087  /* If the self-score is above the threshold, then the neighboring
1088  computation will automatically add the word to the lookup table.
1089  Otherwise, either the score is too low or neighboring is not done at
1090  all, so that all of these exact matches must be explicitly added to
1091  the lookup table */
1092 
1093  if (lookup->threshold == 0 || score < lookup->threshold) {
1094  s_CompressedLookupAddUnencoded(lookup, w, query_offset);
1095  }
1096  else {
1097 #ifdef LOOKUP_VERBOSE
1098  lookup->neighbor_matches--;
1099 #endif
1100  }
1101 
1102  /* check if neighboring words need to be found */
1103 
1104  if (lookup->threshold == 0)
1105  return;
1106 
1107  /* Set up the structure of information to be used during the recursion */
1108 
1109  info->query_word = w;
1110  info->subject_word = s;
1111  info->query_offset = query_offset;
1112 
1113  /* compute the largest possible score that any neighboring word can have;
1114  this maximum will gradually be replaced by exact scores as subject
1115  words are built up */
1116 
1117  score = info->row_max[w[0]];
1118  for (i = 1; i < lookup->word_length; i++)
1119  score += info->row_max[w[i]];
1120 
1121  s_CompressedAddWordHitsCore(info, score, 0);
1122 }
1123 
1124 /**
1125  * Index a query sequence; i.e. fill a lookup table with the offsets
1126  * of query words
1127  *
1128  * @param lookup The lookup table [in/modified]
1129  * @param compressed_matrix The substitution matrix [in]
1130  * @param query The query sequence [in]
1131  * @param location List of ranges of query offsets to examine
1132  * for indexing [in]
1133  */
1136  Int4 ** compressed_matrix,
1139 {
1140  Int4 i, j;
1142  BlastSeqLoc *loc;
1143  Int4 offset;
1144 
1145  ASSERT(lookup->alphabet_size <= BLASTAA_SIZE);
1146 
1147  /* Determine the maximum possible score for each
1148  row of the score matrix */
1149 
1150  for (i = 0; i < lookup->alphabet_size; i++) {
1151  info.row_max[i] = compressed_matrix[i][0];
1152  for (j = 1; j < lookup->compressed_alphabet_size; j++)
1153  info.row_max[i] = MAX(info.row_max[i], compressed_matrix[i][j]);
1154  }
1155 
1156  /* Set up the structure of information to be used during the recursion */
1157  info.lookup = lookup;
1158  info.compressed_alphabet_size = lookup->compressed_alphabet_size;
1159  info.wordsize = lookup->word_length;
1160  info.matrix = compressed_matrix;
1161  info.threshold = lookup->threshold;
1162 
1164 
1165 
1166  /* Walk through the query and index all the words */
1167 
1168  for (loc = location; loc; loc = loc->next){
1169  Int4 from = loc->ssr->left;
1170  Int4 to = loc->ssr->right - lookup->word_length + 1;
1171 
1172  for (offset = from; offset <= to; offset++){
1173  s_CompressedAddWordHits(&info, query->sequence, offset);
1174  }
1175  }
1176 }
1177 
1178 /** Complete the construction of a compressed protein lookup table
1179  * @param lookup The lookup table [in][out]
1180  * @return Always 0
1181  */
1183 {
1184  Int4 i;
1185  Int4 longest_chain = 0;
1186  Int4 count;
1187  PV_ARRAY_TYPE *pv;
1188  Int4 pv_array_bts;
1189  const Int4 kTargetPVBytes = 262144;
1190 #ifdef LOOKUP_VERBOSE
1191 #define HISTSIZE 30
1192  Int4 histogram[HISTSIZE] = {0};
1193  Int4 backbone_occupancy = 0;
1194  Int4 num_overflows = 0;
1195 #endif
1196 
1197  /* count the number of nonempty cells in the backbone */
1198 
1199  for (i = count = 0; i < lookup->backbone_size; i++) {
1200  if (lookup->backbone[i].num_used)
1201  count++;
1202  }
1203 
1204 /* fprintf(stderr, "SIZE: %ld %ld\n", (long) count, (long) (0.05 * lookup->backbone_size));
1205  */
1206 
1207  /* Compress the PV array if it would be large. Compress it
1208  more if the backbone is sparsely populated. Do not compress
1209  if the PV array is small enough already or the backbone is
1210  mostly full */
1211 
1212  pv_array_bts = PV_ARRAY_BTS;
1213  if (count <= 0.01 * lookup->backbone_size) {
1214  pv_array_bts += ilog2(lookup->backbone_size / (8 * kTargetPVBytes));
1215  }
1216 
1217  pv = lookup->pv = (PV_ARRAY_TYPE *)calloc(
1218  (lookup->backbone_size >> pv_array_bts) + 1,
1219  sizeof(PV_ARRAY_TYPE));
1220  lookup->pv_array_bts = pv_array_bts;
1221  ASSERT(pv != NULL);
1222 
1223  /* compute the longest chain size and initialize the PV array */
1224  for (i = 0; i < lookup->backbone_size; i++) {
1225  count = lookup->backbone[i].num_used;
1226 
1227  if (count > 0) {
1228  /* set the corresponding bit in the pv_array */
1229  PV_SET(pv, i, pv_array_bts);
1230  longest_chain = MAX(count, longest_chain);
1231 
1232 #ifdef LOOKUP_VERBOSE
1234  num_overflows++;
1235  }
1236  if (count >= HISTSIZE)
1237  count = HISTSIZE-1;
1238 #endif
1239  }
1240 
1241 #ifdef LOOKUP_VERBOSE
1242  histogram[count]++;
1243 #endif
1244  }
1245 
1246  lookup->longest_chain = longest_chain;
1247 
1248 #ifdef LOOKUP_VERBOSE
1249  backbone_occupancy = lookup->backbone_size - histogram[0];
1250 
1251  printf("backbone size: %d\n", lookup->backbone_size);
1252  printf("backbone occupancy: %d (%f%%)\n", backbone_occupancy,
1253  100.0 * backbone_occupancy / lookup->backbone_size);
1254  printf("num_overflows: %d\n", num_overflows);
1255  printf("longest chain: %d\n", longest_chain);
1256  printf("exact matches: %d\n", lookup->exact_matches);
1257  printf("neighbor matches: %d\n", lookup->neighbor_matches);
1258  printf("banks allocated: %d\n", lookup->curr_overflow_bank + 1);
1259  printf("PV array: %d entries per bit\n", 1 << (lookup->pv_array_bts -
1260  PV_ARRAY_BTS));
1261  printf("Lookup table histogram:\n");
1262  for (i = 0; i < HISTSIZE; i++) {
1263  printf("%d\t%d\n", i, histogram[i]);
1264  }
1265 #endif
1266 
1267  return 0;
1268 }
1269 
1271  BlastSeqLoc* locations,
1273  const LookupTableOptions * opt,
1274  BlastScoreBlk *sbp)
1275 {
1276  Int4 i;
1277  SCompressedAlphabet* new_alphabet;
1278  const double kMatrixScale = 100.0;
1279  Int4 word_size = opt->word_size;
1280  Int4 table_scale;
1284 
1285  ASSERT(lookup != NULL);
1286  ASSERT(word_size == 5 || word_size == 6 || word_size == 7);
1287 
1288  /* set word size and threshold information. The reciprocals
1289  below are 2^32 / (compressed alphabet size) */
1290 
1291  lookup->word_length = word_size;
1292  lookup->threshold = (Int4)(kMatrixScale * opt->threshold);
1293  lookup->alphabet_size = BLASTAA_SIZE;
1294  if (word_size == 6 || word_size == 5) {
1295  lookup->compressed_alphabet_size = 15;
1296  lookup->reciprocal_alphabet_size = 286331154;
1297  }
1298  else {
1299  lookup->compressed_alphabet_size = 10;
1300  lookup->reciprocal_alphabet_size = 429496730;
1301  }
1302 
1303  /* compute a custom score matrix, for use only
1304  with lookup table creation. The matrix dimensions
1305  are BLASTAA_SIZE x compressed_alphabet_size, and
1306  the score entries are scaled up by kMatrixScale */
1307 
1308  new_alphabet = SCompressedAlphabetNew(sbp,
1309  lookup->compressed_alphabet_size,
1310  kMatrixScale);
1311  if (new_alphabet == NULL)
1312  return -1;
1313 
1314  /* allocate the backbone and overflow array */
1315 
1316  lookup->backbone_size = (Int4)pow(lookup->compressed_alphabet_size,
1317  word_size) + 1;
1319  lookup->backbone_size,
1321  lookup->overflow_banks = (CompressedOverflowCell **) calloc(
1323  sizeof(CompressedOverflowCell *));
1324  ASSERT(lookup->backbone != NULL);
1325  ASSERT(lookup->overflow_banks != NULL);
1326  /* there is no 'current overflow cell' that was previously
1327  allocated, so configure the allocator to start allocations
1328  immediately */
1329  lookup->curr_overflow_cell = COMPRESSED_OVERFLOW_CELLS_IN_BANK;
1330  lookup->curr_overflow_bank = -1;
1331 
1332  /* copy the mapping from protein to compressed
1333  representation; also save a scaled version of the
1334  mapping, for use in the scanning phase */
1335 
1336  lookup->compress_table = (Uint1 *)malloc(BLASTAA_SIZE * sizeof(Uint1));
1337  lookup->scaled_compress_table = (Int4 *)malloc(
1338  BLASTAA_SIZE * sizeof(Int4));
1339  table_scale = iexp(lookup->compressed_alphabet_size, word_size - 1);
1340  for (i = 0; i < BLASTAA_SIZE; i++) {
1341  Uint1 letter = new_alphabet->compress_table[i];
1342  lookup->compress_table[i] = letter;
1343 
1344  if (letter >= lookup->compressed_alphabet_size)
1345  lookup->scaled_compress_table[i] = -1;
1346  else
1347  lookup->scaled_compress_table[i] = table_scale * letter;
1348  }
1349 
1350  /* index the query and finish up */
1351 
1353  query, locations);
1355  SCompressedAlphabetFree(new_alphabet);
1356  return 0;
1357 }
1358 
1361 {
1362  Int4 i;
1363 
1364  for (i = 0; i <= lookup->curr_overflow_bank; i++) {
1365  free(lookup->overflow_banks[i]);
1366  }
1367 
1368  sfree(lookup->compress_table);
1369  sfree(lookup->scaled_compress_table);
1370  sfree(lookup->backbone);
1371  sfree(lookup->overflow_banks);
1372  sfree(lookup->pv);
1373  sfree(lookup);
1374  return NULL;
1375 }
static void s_CompressedLookupAddEncoded(BlastCompressedAaLookupTable *lookup, Uint1 *w, Int4 query_offset)
Add a single query offset to the compressed lookup table.
static void s_CompressedAddNeighboringWords(BlastCompressedAaLookupTable *lookup, Int4 **compressed_matrix, BLAST_SequenceBlk *query, BlastSeqLoc *location)
Index a query sequence; i.e.
static Int4 s_CompressedLookupFinalize(BlastCompressedAaLookupTable *lookup)
Complete the construction of a compressed protein lookup table.
struct LetterAndScoreDifferencePair LetterAndScoreDifferencePair
Structure used as a helper for sorting matrix according to substitution score.
static void s_AddPSSMWordHitsCore(NeighborInfo *info, Int4 score, Int4 current_pos)
Add neighboring words to the lookup table in case of a position-specific matrix, using NeighborInfo s...
Int4 BlastCompressedAaLookupTableNew(BLAST_SequenceBlk *query, BlastSeqLoc *locations, BlastCompressedAaLookupTable **lut, const LookupTableOptions *opt, BlastScoreBlk *sbp)
Create a new compressed protein lookup table.
static void s_CompressedAddWordHits(CompressedNeighborInfo *info, Uint1 *query, Int4 query_offset)
Add neighboring words to the lookup table (compressed alphabet).
static void s_AddWordHits(BlastAaLookupTable *lookup, Int4 **matrix, Uint1 *query, Int4 *offset_list, Int4 query_bias, Int4 *row_max)
Add neighboring words to the lookup table.
BlastCompressedAaLookupTable * BlastCompressedAaLookupTableDestruct(BlastCompressedAaLookupTable *lookup)
Free the compressed lookup table.
BlastAaLookupTable * BlastAaLookupTableDestruct(BlastAaLookupTable *lookup)
Free the lookup table.
static CompressedOverflowCell * s_CompressedListGetNewCell(BlastCompressedAaLookupTable *lookup)
Fetch next vacant cell from a bank.
static void s_CompressedLookupAddWordHit(BlastCompressedAaLookupTable *lookup, Int4 index, Int4 query_offset)
Add a single query offset to the compressed alphabet protein lookup table.
BlastRPSLookupTable * RPSLookupTableDestruct(BlastRPSLookupTable *lookup)
Free the lookup table.
static void s_AddPSSMNeighboringWords(BlastAaLookupTable *lookup, Int4 **matrix, Int4 query_bias, BlastSeqLoc *location)
A position-specific version of AddNeighboringWords.
Int2 RPSLookupTableNew(const BlastRPSInfo *info, BlastRPSLookupTable **lut)
Create a new RPS blast lookup table.
struct CompressedNeighborInfo CompressedNeighborInfo
Structure containing information needed for adding neighboring words (specific to compressed lookup t...
static void s_CompressedLookupAddUnencoded(BlastCompressedAaLookupTable *lookup, Uint1 *w, Int4 query_offset)
Add a single query offset to the compressed lookup table.
static void s_AddWordHitsCore(NeighborInfo *info, Int4 score, Int4 current_pos)
Add neighboring words to the lookup table using NeighborInfo structure.
static void s_CompressedAddWordHitsCore(CompressedNeighborInfo *info, Int4 score, Int4 current_pos)
Very similar to s_AddWordHitsCore.
void BlastAaLookupIndexQuery(BlastAaLookupTable *lookup, Int4 **matrix, BLAST_SequenceBlk *query, BlastSeqLoc *location, Int4 query_bias)
Index a protein query.
Int4 BlastAaLookupFinalize(BlastAaLookupTable *lookup, EBoneType bone_type)
Pack the data structures comprising a protein lookup table into their final form.
static void s_AddPSSMWordHits(BlastAaLookupTable *lookup, Int4 **matrix, Int4 query_bias, Int4 *row_max)
Add neighboring words to the lookup table in case of a position-specific matrix.
static void s_loadSortedMatrix(CompressedNeighborInfo *info)
Prepare "score sorted" version of the substitution matrix".
static int ScoreDifferenceSort(const void *a, const void *b)
callback for the "sort"
Int4 BlastAaLookupTableNew(const LookupTableOptions *opt, BlastAaLookupTable **lut)
Create a new protein lookup table.
static void s_AddNeighboringWords(BlastAaLookupTable *lookup, Int4 **matrix, BLAST_SequenceBlk *query, Int4 query_bias, BlastSeqLoc *location)
Index a query sequence; i.e.
struct NeighborInfo NeighborInfo
Structure containing information needed for adding neighboring words.
Routines for creating protein BLAST lookup tables.
#define COMPRESSED_OVERFLOW_MAX_BANKS
The maximum number of banks (usually less than 10 are needed; memory will run out before this is insu...
EBoneType
types of cells
@ eBackbone
#define COMPRESSED_HITS_PER_BACKBONE_CELL
number of query offsets to store in a backbone cell
#define COMPRESSED_HITS_CELL_MASK
#define RPS_BUCKET_SIZE
The number of regions into which the concatenated RPS blast database is split via bucket sorting.
#define AA_HITS_PER_CELL
maximum number of hits in one lookup table cell
#define COMPRESSED_OVERFLOW_CELLS_IN_BANK
number of cells in one bank of cells
static NCBI_INLINE Int4 s_ComputeCompressedIndex(Int4 wordsize, const Uint1 *word, Int4 compressed_alphabet_size, Int4 *skip, BlastCompressedAaLookupTable *lookup)
Convert a word to use a compressed alphabet.
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
Definition: blast_def.h:112
Declarations of static arrays used to define some NCBI encodings to be used in a toolkit independent ...
void BlastLookupAddWordHit(Int4 **backbone, Int4 wordsize, Int4 charsize, Uint1 *seq, Int4 query_offset)
Add a single query offset to a generic lookup table.
Definition: blast_lookup.c:33
#define PV_ARRAY_BTS
bits-to-shift from lookup_index to pv_array index.
Definition: blast_lookup.h:43
void BlastLookupIndexQueryExactMatches(Int4 **backbone, Int4 word_length, Int4 charsize, Int4 lut_word_length, BLAST_SequenceBlk *query, BlastSeqLoc *locations)
Add all applicable query offsets to a generic lookup table.
Definition: blast_lookup.c:79
#define PV_SET(lookup, index, shift)
Set the bit at position 'index' in the PV array bitfield within 'lookup'.
Definition: blast_lookup.h:49
#define PV_ARRAY_TYPE
The pv_array 'native' type.
Definition: blast_lookup.h:41
#define BLAST_WORDSIZE_PROT
length of word to trigger an extension.
Definition: blast_options.h:66
#define RPS_MAGIC_NUM_28
Version number for 28-letter alphabet.
Definition: blast_rps.h:44
#define RPS_MAGIC_NUM
RPS blast version number.
Definition: blast_rps.h:43
SCompressedAlphabet * SCompressedAlphabetFree(SCompressedAlphabet *alphabet)
Free a compressed alphabet and score matrix.
Definition: blast_stat.c:4975
SCompressedAlphabet * SCompressedAlphabetNew(BlastScoreBlk *sbp, Int4 compressed_alphabet_size, double scale_factor)
Allocate a new compressed alphabet and score matrix.
Definition: blast_stat.c:4939
static int lookup(const char *name, const struct lookup_int *table)
Definition: attributes.c:50
int offset
Definition: replacements.h:160
static const char location[]
Definition: config.c:97
#define BLASTAA_SIZE
Size of aminoacid alphabet.
#define NULL
Definition: ncbistd.hpp:225
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint16_t Uint2
2-byte (16-bit) unsigned integer
Definition: ncbitype.h:101
int i
Utility functions for lookup table generation.
Int4 iexp(Int4 x, Int4 n)
Integer exponentiation using right to left binary algorithm.
Definition: lookup_util.c:47
Int4 ilog2(Int8 x)
Integer base two logarithm.
Definition: lookup_util.c:71
static MDB_envinfo info
Definition: mdb_load.c:37
unsigned int a
Definition: ncbi_localip.c:102
#define ASSERT
macro for assert.
Definition: ncbi_std.h:107
#define MAX(a, b)
returns larger of a and b.
Definition: ncbi_std.h:117
#define count
#define row(bind, expected)
Definition: string_bind.c:73
structure defining one cell of the compacted lookup table
union AaLookupBackboneCell::@3 payload
union that specifies either entries stored right on the backbone if fewer than AA_HITS_PER_CELL are p...
Int4 entries[3]
if the number of hits for this cell is AA_HITS_PER_CELL or less, the hits are all stored directly in ...
Int4 overflow_cursor
integer offset into the overflow array where the list of hits for this cell begins
Int4 num_used
number of hits stored for this cell
structure defining one cell of the small (i.e., use short) lookup table
union AaLookupSmallboneCell::@4 payload
union that specifies either entries stored right on the backbone if fewer than AA_HITS_PER_CELL are p...
Uint2 num_used
number of hits stored for this cell
Int4 overflow_cursor
integer offset into the overflow array where the list of hits for this cell begins
Uint2 entries[3]
if the number of hits for this cell is AA_HITS_PER_CELL or less, the hits are all stored directly in ...
Structure to hold a sequence.
Definition: blast_def.h:242
The basic lookup table structure for blastp searches.
The lookup table structure for protein searches using a compressed alphabet.
The RPS engine uses this structure to access all of the RPS blast related data (assumed to be collect...
Definition: blast_rps.h:120
header of RPS blast '.loo' file
Definition: blast_rps.h:49
Int4 magic_number
value should be RPS_MAGIC_NUM
Definition: blast_rps.h:50
Int4 start_of_backbone
byte offset of start of backbone
Definition: blast_rps.h:56
Int4 overflow_hits
number of hits in overflow array
Definition: blast_rps.h:54
The basic lookup table structure for RPS blast searches.
header of RPS blast '.rps' file
Definition: blast_rps.h:62
Int4 magic_number
value should be RPS_MAGIC_NUM
Definition: blast_rps.h:63
Int4 num_profiles
number of PSSMs in the file
Definition: blast_rps.h:64
Int4 start_offsets[1]
start of an Int4 array that gives the starting byte offset of each RPS DB sequence.
Definition: blast_rps.h:65
Structure used for scoring calculations.
Definition: blast_stat.h:177
Used to hold a set of positions, mostly used for filtering.
Definition: blast_def.h:204
SSeqRange * ssr
location data on the sequence.
Definition: blast_def.h:206
struct BlastSeqLoc * next
next in linked list
Definition: blast_def.h:205
structure for hashtable of indexed query offsets
Int4 query_offsets[4]
storage for query offsets local to the backbone cell
CompressedMixedOffsets overflow_list
storage for remote query offsets
union CompressedLookupBackboneCell::@5 payload
structure for holding the list of query offsets
Int4 query_offsets[4 -2]
the query offsets stored locally
CompressedOverflowCell * head
head of linked list of cells of query offsets stored off the backbone
Structure containing information needed for adding neighboring words (specific to compressed lookup t...
Int4 matrixSorted[BLASTAA_SIZE][BLASTAA_SIZE]
version of substitution matrix whose rows are sorted by score
BlastCompressedAaLookupTable * lookup
Lookup table.
Int4 compressed_alphabet_size
for use with compressed alphabet
Int4 wordsize
number of residues in a word
Uint1 * subject_word
the computed neighboring word
Int4 ** matrix
the substitution matrix
Int4 query_offset
a single query offset to index
Uint1 matrixSortedChar[BLASTAA_SIZE][BLASTAA_SIZE]
matrix with the letters permuted identically to that of matrixSorted
Int4 threshold
the score threshold for neighboring words
Int4 row_max[BLASTAA_SIZE]
maximum possible score for each row of the matrix
Uint1 * query_word
the word whose neighbors we are computing
cell in list for holding query offsets
struct CompressedOverflowCell * next
pointer to next cell
Int4 query_offsets[4]
the query offsets stored in the cell
Structure used as a helper for sorting matrix according to substitution score.
Int4 diff
score difference from row maximum
Uint1 letter
given protein letter
Options needed to construct a lookup table Also needed: query sequence and query length.
Int4 word_size
Determines the size of the lookup table.
double threshold
Score threshold for putting words in a lookup table (fractional values are allowed,...
Structure containing information needed for adding neighboring words.
Int4 ** matrix
the substitution matrix
Uint1 * subject_word
the computed neighboring word
Int4 alphabet_size
number of letters in the alphabet
Uint1 * query_word
the word whose neighbors we are computing
Int4 threshold
the score threshold for neighboring words
Int4 charsize
number of bits in a residue
Int4 query_bias
bias all stored offsets for multiple queries
BlastAaLookupTable * lookup
Lookup table.
Int4 * row_max
maximum possible score for each row of the matrix
Int4 * offset_list
list of offsets where the word occurs in the query
Int4 wordsize
number of residues in a word
structure defining one cell of the RPS lookup table
structure used for bucket sorting offsets retrieved from the RPS blast lookup table.
Int4 num_filled
number of offset pairs currently in bucket
Int4 num_alloc
max number of offset pairs bucket can hold
BlastOffsetPair * offset_pairs
list of offset pairs
int ** data
actual scoring matrix data, stored in row-major form
Definition: blast_stat.h:140
Scoring matrix data used for compressed protein alphabets.
Definition: blast_stat.h:225
Uint1 * compress_table
translation table (AA->compressed)
Definition: blast_stat.h:228
SBlastScoreMatrix * matrix
score matrix
Definition: blast_stat.h:227
Int4 left
left endpoint of range (zero based)
Definition: blast_def.h:156
Int4 right
right endpoint of range (zero based)
Definition: blast_def.h:157
static string query
This symbol enables the verbose option in makeblastdb and other BLAST+ search command line applicatio...
Definition: blast_def.h:141
static Uint4 letter(char c)
void free(voidpf ptr)
voidp malloc(uInt size)
voidp calloc(uInt items, uInt size)
Modified on Fri Sep 20 14:58:15 2024 by modify_doxy.py rev. 669887