NCBI C++ ToolKit
blast_nalookup.c
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blast_nalookup.c 100164 2023-06-28 13:36:01Z merezhuk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  */
26 
27 /** @file blast_nalookup.c
28  * Functions for constructing nucleotide blast lookup tables
29  */
30 
36 
37 /** bitfield used to detect ambiguities in uncompressed
38  * nucleotide letters
39  */
40 #define BLAST2NA_MASK 0xfc
41 
42 /** number of bits in a compressed nucleotide letter */
43 #define BITS_PER_NUC 2
44 
47  Int4 approx_table_entries, Int4 max_q_off,
48  Int4 *lut_width)
49 {
50  ELookupTableType lut_type;
51 
52  /* Choose the width of the lookup table. The width may be any number
53  <= the word size, but the most efficient width is a compromise
54  between small values (which have better cache performance and
55  allow a larger scanning stride) and large values (which have fewer
56  accesses and allow fewer word extensions) The number of entries
57  where one table width becomes better than another is probably
58  machine-dependent */
59 
60  ASSERT(lookup_options->word_size >= 4);
61 
62  /* Discontiguous megablast must always use a megablast table */
63 
64  if (lookup_options->mb_template_length > 0) {
65  *lut_width = lookup_options->word_size;
66  return eMBLookupTable;
67  }
68 
69  /* always use megablast lookup table and word size 16 for mapping */
70  if (Blast_ProgramIsMapping(lookup_options->program_number) &&
71  lookup_options->word_size >= 16 && lookup_options->db_filter) {
72 
73  *lut_width = 16;
74  return eNaHashLookupTable;
75  }
76 
77  switch(lookup_options->word_size) {
78  case 4:
79  case 5:
80  case 6:
81  lut_type = eSmallNaLookupTable;
82  *lut_width = lookup_options->word_size;
83  break;
84 
85  case 7:
86  lut_type = eSmallNaLookupTable;
87  if (approx_table_entries < 250)
88  *lut_width = 6;
89  else
90  *lut_width = 7;
91  break;
92 
93  case 8:
94  lut_type = eSmallNaLookupTable;
95  if (approx_table_entries < 8500)
96  *lut_width = 7;
97  else
98  *lut_width = 8;
99  break;
100 
101  case 9:
102  if (approx_table_entries < 1250) {
103  *lut_width = 7;
104  lut_type = eSmallNaLookupTable;
105  } else if (approx_table_entries < 21000) {
106  *lut_width = 8;
107  lut_type = eSmallNaLookupTable;
108  } else {
109  *lut_width = 9;
110  lut_type = eMBLookupTable;
111  }
112  break;
113 
114  case 10:
115  if (approx_table_entries < 1250) {
116  *lut_width = 7;
117  lut_type = eSmallNaLookupTable;
118  } else if (approx_table_entries < 8500) {
119  *lut_width = 8;
120  lut_type = eSmallNaLookupTable;
121  } else if (approx_table_entries < 18000) {
122  *lut_width = 9;
123  lut_type = eMBLookupTable;
124  } else {
125  *lut_width = 10;
126  lut_type = eMBLookupTable;
127  }
128  break;
129 
130  case 11:
131  if (approx_table_entries < 12000) {
132  *lut_width = 8;
133  lut_type = eSmallNaLookupTable;
134  } else if (approx_table_entries < 180000) {
135  *lut_width = 10;
136  lut_type = eMBLookupTable;
137  } else {
138  *lut_width = 11;
139  lut_type = eMBLookupTable;
140  }
141  break;
142 
143  case 12:
144  if (approx_table_entries < 8500) {
145  *lut_width = 8;
146  lut_type = eSmallNaLookupTable;
147  } else if (approx_table_entries < 18000) {
148  *lut_width = 9;
149  lut_type = eMBLookupTable;
150  } else if (approx_table_entries < 60000) {
151  *lut_width = 10;
152  lut_type = eMBLookupTable;
153  } else if (approx_table_entries < 900000) {
154  *lut_width = 11;
155  lut_type = eMBLookupTable;
156  } else {
157  *lut_width = 12;
158  lut_type = eMBLookupTable;
159  }
160  break;
161 
162  default:
163  if (approx_table_entries < 8500) {
164  *lut_width = 8;
165  lut_type = eSmallNaLookupTable;
166  } else if (approx_table_entries < 300000) {
167  *lut_width = 11;
168  lut_type = eMBLookupTable;
169  } else {
170  *lut_width = 12;
171  lut_type = eMBLookupTable;
172  }
173  break;
174  }
175 
176  /* we only use the ordinary blastn table for cases where
177  the number of words to index, or the maximum query offset,
178  exceeds the range of the 15-bit values used in the
179  small blastn lookup table */
180 
181  if (lut_type == eSmallNaLookupTable &&
182  (approx_table_entries >= 32767 || max_q_off >= 32768)) {
183  lut_type = eNaLookupTable;
184  }
185  return lut_type;
186 }
187 
188 /*--------------------- Small nucleotide table ----------------------*/
189 
190 /** Pack the data structures comprising a small nucleotide lookup table
191  * into their final form
192  * @param thin_backbone structure containing indexed query offsets [in][out]
193  * @param lookup the lookup table [in]
194  * @param query the query sequence [in][out]
195  * @return zero if packing process succeeded
196  */
197 static Int4 s_BlastSmallNaLookupFinalize(Int4 **thin_backbone,
200 {
201  Int4 i;
202  Int4 overflow_cells_needed = 2;
203  Int4 overflow_cursor = 2;
204  Int4 longest_chain = 0;
205 #ifdef LOOKUP_VERBOSE
206  Int4 backbone_occupancy = 0;
207  Int4 thick_backbone_occupancy = 0;
208  Int4 num_overflows = 0;
209 #endif
210 
211  /* find out how many cells need the overflow array. The
212  backbone holds at most one hit per cell, so any cells
213  that need more than that must go into the overflow array
214  (along with a trailing null). */
215 
216  for (i = 0; i < lookup->backbone_size; i++) {
217  if (thin_backbone[i] != NULL) {
218  Int4 num_hits = thin_backbone[i][1];
219  if (num_hits > 1)
220  overflow_cells_needed += num_hits + 1;
221  longest_chain = MAX(longest_chain, num_hits);
222  }
223  }
224 
225  /* there is a hard limit to the number of query offsets
226  allowed in the overflow array. Although unlikely, it
227  is technically possible to index a query sequence that
228  has so many trailing nulls in the overflow array that
229  the limit gets exceeded */
230 
231  if (overflow_cells_needed >= 32768) {
232  for (i = 0; i < lookup->backbone_size; i++)
233  sfree(thin_backbone[i]);
234  return -1;
235  }
236 
237  /* compute a compressed representation of the query, used
238  for computing ungapped extensions */
239 
241 
242  /* allocate the new lookup table */
243  lookup->final_backbone = (Int2 *)malloc(
244  lookup->backbone_size * sizeof(Int2));
245  ASSERT(lookup->final_backbone != NULL);
246 
247  lookup->longest_chain = longest_chain;
248 
249  /* allocate the overflow array */
250  if (overflow_cells_needed > 0) {
251  lookup->overflow = (Int2 *) malloc(overflow_cells_needed *
252  sizeof(Int2));
253  ASSERT(lookup->overflow != NULL);
254  }
255 
256  /* for each position in the lookup table backbone, */
257  for (i = 0; i < lookup->backbone_size; i++) {
258 
259  Int4 j, num_hits;
260 
261  /* skip if there are no hits in cell i */
262  if (thin_backbone[i] == NULL) {
263  lookup->final_backbone[i] = -1;
264  continue;
265  }
266 
267 #ifdef LOOKUP_VERBOSE
268  backbone_occupancy++;
269 #endif
270  num_hits = thin_backbone[i][1];
271 
272  if (num_hits == 1) {
273 
274  /* if there is only one hit, it goes into the backbone */
275 
276 #ifdef LOOKUP_VERBOSE
277  thick_backbone_occupancy++;
278 #endif
279  lookup->final_backbone[i] = thin_backbone[i][2];
280  }
281  else {
282 #ifdef LOOKUP_VERBOSE
283  num_overflows++;
284 #endif
285  /* for more than one hit, the backbone stores
286  -(overflow offset where hits occur). Since a
287  cell value of -1 is reserved to mean 'empty cell',
288  the value stored begins at -2 */
289  lookup->final_backbone[i] = -overflow_cursor;
290  for (j = 0; j < num_hits; j++) {
291  lookup->overflow[overflow_cursor++] =
292  thin_backbone[i][j + 2];
293  }
294 
295  /* we don't have the room to store the number of hits,
296  so append a null to the end of the list to signal
297  that the current chain is finished */
298  lookup->overflow[overflow_cursor++] = -1;
299  }
300 
301  /* done with this chain */
302  sfree(thin_backbone[i]);
303  }
304 
305  lookup->overflow_size = overflow_cursor;
306 
307 #ifdef LOOKUP_VERBOSE
308  printf("SmallNa\n");
309  printf("backbone size: %d\n", lookup->backbone_size);
310  printf("backbone occupancy: %d (%f%%)\n", backbone_occupancy,
311  100.0 * backbone_occupancy / lookup->backbone_size);
312  printf("thick_backbone occupancy: %d (%f%%)\n",
313  thick_backbone_occupancy,
314  100.0 * thick_backbone_occupancy / lookup->backbone_size);
315  printf("num_overflows: %d\n", num_overflows);
316  printf("overflow size: %d\n", overflow_cells_needed);
317  printf("longest chain: %d\n", longest_chain);
318 #endif
319 
320  return 0;
321 }
322 
323 /** Changes the list of locations into a list of
324  the intervals between locations (the inverse).
325  @param locations input list [in]
326  @param length (query) sequence length [in]
327  @return inverted BlastSeqLoc
328 */
329 
330 static BlastSeqLoc* s_SeqLocListInvert(const BlastSeqLoc* locations, Int4 length)
331 {
332  BlastSeqLoc* retval = NULL;
333  BlastSeqLoc* tail = NULL; /* Tail of the list. */
334  Int4 start, stop;
335 
336  ASSERT(locations);
337 
338  start = 0;
339  stop = MAX( 0, locations->ssr->left-1);
340 
341  if (stop - start > 2)
342  tail = BlastSeqLocNew(&retval, start, stop);
343 
344  while (locations)
345  {
346  start = locations->ssr->right+1;
347  locations = locations->next;
348 
349  if (locations)
350  stop = locations->ssr->left-1;
351  else
352  stop = length-1;
353 
354  if (retval == NULL)
355  tail = BlastSeqLocNew(&retval, start, stop);
356  else
357  tail = BlastSeqLocNew(&tail, start, stop);
358  }
359  return retval;
360 }
361 
362 /** Determine whether mask at hash is enabled from the QuerySetUpOptions */
364 {
365  if ( !query_options ) {
366  return FALSE;
367  }
368  if (SBlastFilterOptionsMaskAtHash(query_options->filtering_options)) {
369  return TRUE;
370  }
371  if (query_options->filter_string &&
372  strstr(query_options->filter_string, "m")) {
373  return TRUE;
374  }
375  return FALSE;
376 }
377 
379  BlastSeqLoc* locations,
381  const LookupTableOptions * opt,
382  const QuerySetUpOptions* query_options,
383  Int4 lut_width)
384 {
385  Int4 status = 0;
386  Int4 **thin_backbone;
389 
390  ASSERT(lookup != NULL);
391 
392  lookup->word_length = opt->word_size;
393  lookup->lut_word_length = lut_width;
394  lookup->backbone_size = 1 << (BITS_PER_NUC * lookup->lut_word_length);
395  lookup->mask = lookup->backbone_size - 1;
396  lookup->overflow = NULL;
397  lookup->scan_step = lookup->word_length - lookup->lut_word_length + 1;
398 
399  thin_backbone = (Int4 **) calloc(lookup->backbone_size, sizeof(Int4 *));
400  ASSERT(thin_backbone != NULL);
401 
402  BlastLookupIndexQueryExactMatches(thin_backbone,
403  lookup->word_length,
404  BITS_PER_NUC,
405  lookup->lut_word_length,
406  query, locations);
407  if (locations &&
408  lookup->word_length > lookup->lut_word_length ) {
409  /* because we use compressed query, we must always check masked location*/
410  lookup->masked_locations = s_SeqLocListInvert(locations, query->length);
411  }
412 
413  status = s_BlastSmallNaLookupFinalize(thin_backbone, lookup, query);
414  if (status != 0) {
416  }
417 
418  sfree(thin_backbone);
419  *lut = lookup;
420  return status;
421 }
422 
425 {
426  sfree(lookup->final_backbone);
427  sfree(lookup->overflow);
428  if (lookup->masked_locations)
429  lookup->masked_locations = BlastSeqLocFree(lookup->masked_locations);
430  sfree(lookup);
431  return NULL;
432 }
433 
434 
435 /*--------------------- Standard nucleotide table ----------------------*/
436 
437 /** Pack the data structures comprising a nucleotide lookup table
438  * into their final form
439  * @param thin_backbone structure containing indexed query offsets [in][out]
440  * @param lookup the lookup table [in]
441  */
442 static void s_BlastNaLookupFinalize(Int4 **thin_backbone,
444 {
445  Int4 i;
446  Int4 overflow_cells_needed = 0;
447  Int4 overflow_cursor = 0;
448  Int4 longest_chain = 0;
449  PV_ARRAY_TYPE *pv;
450 #ifdef LOOKUP_VERBOSE
451  Int4 backbone_occupancy = 0;
452  Int4 thick_backbone_occupancy = 0;
453  Int4 num_overflows = 0;
454 #endif
455 
456  /* allocate the new lookup table */
457  lookup->thick_backbone = (NaLookupBackboneCell *)calloc(
458  lookup->backbone_size,
459  sizeof(NaLookupBackboneCell));
460  ASSERT(lookup->thick_backbone != NULL);
461 
462  /* allocate the pv_array */
463  pv = lookup->pv = (PV_ARRAY_TYPE *)calloc(
464  (lookup->backbone_size >> PV_ARRAY_BTS) + 1,
465  sizeof(PV_ARRAY_TYPE));
466  ASSERT(pv != NULL);
467 
468  /* find out how many cells need the overflow array */
469  for (i = 0; i < lookup->backbone_size; i++) {
470  if (thin_backbone[i] != NULL) {
471  Int4 num_hits = thin_backbone[i][1];
472  if (num_hits > NA_HITS_PER_CELL)
473  overflow_cells_needed += num_hits;
474  longest_chain = MAX(longest_chain, num_hits);
475  }
476  }
477 
478  lookup->longest_chain = longest_chain;
479 
480  /* allocate the overflow array */
481  if (overflow_cells_needed > 0) {
482  lookup->overflow = (Int4 *) calloc(overflow_cells_needed, sizeof(Int4));
483  ASSERT(lookup->overflow != NULL);
484  }
485 
486  /* for each position in the lookup table backbone, */
487  for (i = 0; i < lookup->backbone_size; i++) {
488 
489  Int4 j, num_hits;
490 
491  /* skip if there are no hits in cell i */
492  if (thin_backbone[i] == NULL)
493  continue;
494 
495 #ifdef LOOKUP_VERBOSE
496  backbone_occupancy++;
497 #endif
498  num_hits = thin_backbone[i][1];
499  lookup->thick_backbone[i].num_used = num_hits;
500 
501  PV_SET(pv, i, PV_ARRAY_BTS);
502 
503  /* if there are few enough hits, copy them into
504  the thick_backbone cell; otherwise copy all
505  hits to the overflow array */
506 
507  if (num_hits <= NA_HITS_PER_CELL) {
508 #ifdef LOOKUP_VERBOSE
509  thick_backbone_occupancy++;
510 #endif
511  for (j = 0; j < num_hits; j++) {
512  lookup->thick_backbone[i].payload.entries[j] =
513  thin_backbone[i][j + 2];
514  }
515  }
516  else {
517 #ifdef LOOKUP_VERBOSE
518  num_overflows++;
519 #endif
520  lookup->thick_backbone[i].payload.overflow_cursor =
521  overflow_cursor;
522  for (j = 0; j < num_hits; j++) {
523  lookup->overflow[overflow_cursor] =
524  thin_backbone[i][j + 2];
525  overflow_cursor++;
526  }
527  }
528 
529  /* done with this chain */
530  sfree(thin_backbone[i]);
531  }
532 
533  lookup->overflow_size = overflow_cursor;
534 
535 #ifdef LOOKUP_VERBOSE
536  printf("backbone size: %d\n", lookup->backbone_size);
537  printf("backbone occupancy: %d (%f%%)\n", backbone_occupancy,
538  100.0 * backbone_occupancy / lookup->backbone_size);
539  printf("thick_backbone occupancy: %d (%f%%)\n",
540  thick_backbone_occupancy,
541  100.0 * thick_backbone_occupancy / lookup->backbone_size);
542  printf("num_overflows: %d\n", num_overflows);
543  printf("overflow size: %d\n", overflow_cells_needed);
544  printf("longest chain: %d\n", longest_chain);
545 #endif
546 }
547 
549  BlastSeqLoc* locations,
550  BlastNaLookupTable * *lut,
551  const LookupTableOptions * opt,
552  const QuerySetUpOptions* query_options,
553  Int4 lut_width)
554 {
555  Int4 **thin_backbone;
556  BlastNaLookupTable *lookup = *lut =
558 
559  ASSERT(lookup != NULL);
560 
561  lookup->word_length = opt->word_size;
562  lookup->lut_word_length = lut_width;
563  lookup->backbone_size = 1 << (BITS_PER_NUC * lookup->lut_word_length);
564  lookup->mask = lookup->backbone_size - 1;
565  lookup->overflow = NULL;
566  lookup->scan_step = lookup->word_length - lookup->lut_word_length + 1;
567 
568  thin_backbone = (Int4 **) calloc(lookup->backbone_size, sizeof(Int4 *));
569  ASSERT(thin_backbone != NULL);
570 
571  BlastLookupIndexQueryExactMatches(thin_backbone,
572  lookup->word_length,
573  BITS_PER_NUC,
574  lookup->lut_word_length,
575  query, locations);
576  if (locations &&
577  lookup->word_length > lookup->lut_word_length &&
578  s_HasMaskAtHashEnabled(query_options)) {
579  lookup->masked_locations = s_SeqLocListInvert(locations, query->length);
580  }
581  s_BlastNaLookupFinalize(thin_backbone, lookup);
582  sfree(thin_backbone);
583  return 0;
584 }
585 
587 {
588  sfree(lookup->thick_backbone);
589  sfree(lookup->overflow);
590  if (lookup->masked_locations)
591  lookup->masked_locations = BlastSeqLocFree(lookup->masked_locations);
592  sfree(lookup->pv);
593  sfree(lookup);
594  return NULL;
595 }
596 
597 
598 /*--------------------- Megablast table ---------------------------*/
599 
600 /** Convert weight, template length and template type from input options into
601  an MBTemplateType enum
602 */
603 static EDiscTemplateType
606 {
607  if (weight == 11) {
608  if (length == 16) {
611  else if (type == eMBWordOptimal)
613  } else if (length == 18) {
616  else if (type == eMBWordOptimal)
618  } else if (length == 21) {
621  else if (type == eMBWordOptimal)
623  }
624  } else if (weight == 12) {
625  if (length == 16) {
628  else if (type == eMBWordOptimal)
630  } else if (length == 18) {
633  else if (type == eMBWordOptimal)
635  } else if (length == 21) {
638  else if (type == eMBWordOptimal)
640  }
641  }
642  return eDiscTemplateContiguous; /* All unsupported cases default to 0 */
643 }
644 
645 /** Fills in the hashtable and next_pos fields of BlastMBLookupTable*
646  * for the discontiguous case.
647  *
648  * @param query the query sequence [in]
649  * @param location locations on the query to be indexed in table [in]
650  * @param mb_lt the (already allocated) megablast lookup
651  * table structure [in|out]
652  * @param lookup_options specifies the word_size and template options [in]
653  * @return zero on success, negative number on failure.
654  */
655 
656 static Int2
658  BlastMBLookupTable* mb_lt,
659  const LookupTableOptions* lookup_options)
660 
661 {
662  BlastSeqLoc* loc;
663  EDiscTemplateType template_type;
664  EDiscTemplateType second_template_type = eDiscTemplateContiguous;
665  const Boolean kTwoTemplates =
666  (lookup_options->mb_template_type == eMBWordTwoTemplates);
667  PV_ARRAY_TYPE *pv_array=NULL;
668  Int4 pv_array_bts;
669  Int4 index;
670  Int4 template_length;
671  /* The calculation of the longest chain can be cpu intensive for
672  long queries or sets of queries. So we use a helper_array to
673  keep track of this, but compress it by kCompressionFactor so
674  it stays in cache. Hence we only end up with a conservative
675  (high) estimate for longest_chain, but this does not seem to
676  affect the overall performance of the rest of the program. */
677  Uint4 longest_chain;
678  Uint4* helper_array = NULL; /* Helps to estimate longest chain. */
679  Uint4* helper_array2 = NULL; /* Helps to estimate longest chain. */
680  const Int4 kCompressionFactor=2048; /* compress helper_array by this much */
681 
682  ASSERT(mb_lt);
683  ASSERT(lookup_options->mb_template_length > 0);
684 
685  mb_lt->next_pos = (Int4 *)calloc(query->length + 1, sizeof(Int4));
686  helper_array = (Uint4*) calloc(mb_lt->hashsize/kCompressionFactor,
687  sizeof(Uint4));
688  if (mb_lt->next_pos == NULL || helper_array == NULL)
689  return -1;
690 
691  template_type = s_GetDiscTemplateType(lookup_options->word_size,
692  lookup_options->mb_template_length,
693  (EDiscWordType)lookup_options->mb_template_type);
694 
695  ASSERT(template_type != eDiscTemplateContiguous);
696 
697  mb_lt->template_type = template_type;
698  mb_lt->two_templates = kTwoTemplates;
699  /* For now leave only one possibility for the second template.
700  Note that the intention here is to select both the coding
701  and the optimal templates for one combination of word size
702  and template length. */
703  if (kTwoTemplates) {
704  /* Use the temporaray to avoid annoying ICC warning. */
705  int temp_int = template_type + 1;
706  second_template_type =
707  mb_lt->second_template_type = (EDiscTemplateType) temp_int;
708 
709  mb_lt->hashtable2 = (Int4*)calloc(mb_lt->hashsize, sizeof(Int4));
710  mb_lt->next_pos2 = (Int4*)calloc(query->length + 1, sizeof(Int4));
711  helper_array2 = (Uint4*) calloc(mb_lt->hashsize/kCompressionFactor,
712  sizeof(Uint4));
713  if (mb_lt->hashtable2 == NULL ||
714  mb_lt->next_pos2 == NULL ||
715  helper_array2 == NULL)
716  return -1;
717  }
718 
719  mb_lt->discontiguous = TRUE;
720  mb_lt->template_length = lookup_options->mb_template_length;
721  template_length = lookup_options->mb_template_length;
722  pv_array = mb_lt->pv_array;
723  pv_array_bts = mb_lt->pv_array_bts;
724 
725  for (loc = location; loc; loc = loc->next) {
726  Int4 from;
727  Int4 to;
728  Uint8 accum = 0;
729  Int4 ecode1 = 0;
730  Int4 ecode2 = 0;
731  Uint1* pos;
732  Uint1* seq;
733  Uint1 val;
734 
735  /* A word is added to the table after the last base
736  in the word is read in. At that point, the start
737  offset of the word is (template_length-1) positions
738  behind. This index is also incremented, because
739  lookup table indices are 1-based (offset 0 is reserved). */
740 
741  from = loc->ssr->left - (template_length - 2);
742  to = loc->ssr->right - (template_length - 2);
743  seq = query->sequence_start + loc->ssr->left;
744  pos = seq + template_length;
745 
746  for (index = from; index <= to; index++) {
747  val = *++seq;
748  /* if an ambiguity is encountered, do not add
749  any words that would contain it */
750  if ((val & BLAST2NA_MASK) != 0) {
751  accum = 0;
752  pos = seq + template_length;
753  continue;
754  }
755 
756  /* get next base */
757  accum = (accum << BITS_PER_NUC) | val;
758  if (seq < pos)
759  continue;
760 
761 #ifdef LOOKUP_VERBOSE
762  mb_lt->num_words_added++;
763 #endif
764  /* compute the hashtable index for the first template
765  and add 'index' at that position */
766 
767  ecode1 = ComputeDiscontiguousIndex(accum, template_type);
768  if (mb_lt->hashtable[ecode1] == 0) {
769 #ifdef LOOKUP_VERBOSE
770  mb_lt->num_unique_pos_added++;
771 #endif
772  PV_SET(pv_array, ecode1, pv_array_bts);
773  }
774  else {
775  helper_array[ecode1/kCompressionFactor]++;
776  }
777  mb_lt->next_pos[index] = mb_lt->hashtable[ecode1];
778  mb_lt->hashtable[ecode1] = index;
779 
780  if (!kTwoTemplates)
781  continue;
782 
783  /* repeat for the second template, if applicable */
784 
785  ecode2 = ComputeDiscontiguousIndex(accum, second_template_type);
786  if (mb_lt->hashtable2[ecode2] == 0) {
787 #ifdef LOOKUP_VERBOSE
788  mb_lt->num_unique_pos_added++;
789 #endif
790  PV_SET(pv_array, ecode2, pv_array_bts);
791  }
792  else {
793  helper_array2[ecode2/kCompressionFactor]++;
794  }
795  mb_lt->next_pos2[index] = mb_lt->hashtable2[ecode2];
796  mb_lt->hashtable2[ecode2] = index;
797  }
798  }
799 
800  longest_chain = 2;
801  for (index = 0; index < mb_lt->hashsize / kCompressionFactor; index++)
802  longest_chain = MAX(longest_chain, helper_array[index]);
803  mb_lt->longest_chain = longest_chain;
804  sfree(helper_array);
805 
806  if (kTwoTemplates) {
807  longest_chain = 2;
808  for (index = 0; index < mb_lt->hashsize / kCompressionFactor; index++)
809  longest_chain = MAX(longest_chain, helper_array2[index]);
810  mb_lt->longest_chain += longest_chain;
811  sfree(helper_array2);
812  }
813  return 0;
814 }
815 
816 static Int2
819  BlastMBLookupTable* mb_lt,
820  const LookupTableOptions* lookup_options)
821 
822 {
823  BlastSeqLoc* loc;
824  /* 12-mers (or perhaps 8-mers) are used to build the lookup table
825  and this is what kLutWordLength specifies. */
826  const Int4 kLutWordLength = mb_lt->lut_word_length;
827  const Int8 kLutMask = mb_lt->hashsize - 1;
828  /* The user probably specified a much larger word size (like 28)
829  and this is what full_word_size is. */
830  Int4 full_word_size = mb_lt->word_length;
831  Int4 index;
832  PV_ARRAY_TYPE *pv_array;
833  Int4 pv_array_bts;
834 
835  ASSERT(mb_lt);
836 
837  pv_array = mb_lt->pv_array;
838  pv_array_bts = mb_lt->pv_array_bts;
839 
840  for (loc = location; loc; loc = loc->next) {
841  /* We want index to be always pointing to the start of the word.
842  Since sequence pointer points to the end of the word, subtract
843  word length from the loop boundaries. */
844  Int4 from = loc->ssr->left;
845  Int4 to = loc->ssr->right - kLutWordLength;
846  Int8 ecode = 0;
847  Int4 last_offset;
848  Uint1* pos;
849  Uint1* seq;
850  Uint1 val;
851 // int counter = 1; /* collect this many adjacent words */
852 
853  /* case of unmasked region >= kLutWordLength but < full_word_size,
854  so no hits should be generated. */
855  if (full_word_size > (loc->ssr->right - loc->ssr->left + 1))
856  continue;
857 
858  seq = query->sequence_start + from;
859  pos = seq + kLutWordLength;
860 
861  /* Also add 1 to all indices, because lookup table indices count
862  from 1. */
863  from -= kLutWordLength - 2;
864  last_offset = to + 2;
865 
866  for (index = from; index <= last_offset; index++) {
867  val = *++seq;
868  /* if an ambiguity is encountered, do not add
869  any words that would contain it */
870  if ((val & BLAST2NA_MASK) != 0) {
871  ecode = 0;
872  pos = seq + kLutWordLength;
873  continue;
874  }
875 
876  /* get next base */
877  ecode = ((ecode << BITS_PER_NUC) & kLutMask) + val;
878  if (seq < pos)
879  continue;
880 
881  PV_SET(pv_array, ecode, pv_array_bts);
882  }
883  }
884 
885  return 0;
886 }
887 
888 
889 /* Remove words that appear in polyA tails from the lookup table: string of As,
890  string of Ts, and As and Ts with one error. */
892 {
893  Int4 word_size = mb_lt->lut_word_length;
894  Int8 word;
895  Int4 i, k;
896 
897  /* remove As and Ts */
898  mb_lt->hashtable[0] = 0;
899  mb_lt->hashtable[(Int8)((1 << (2 * word_size)) - 1)] = 0;
900 
901  if (word_size < 16) {
902  return 0;
903  }
904 
905  ASSERT(word_size == 16);
906 
907  /* remove As with a single error */
908  for (i = 1;i < 4;i++) {
909  word = i;
910  for (k = 0;k < word_size;k++) {
911  mb_lt->hashtable[word << (k * 2)] = 0;
912  }
913  }
914 
915  /* remove Ts with a single error */
916  for (i = 0;i < 3;i++) {
917  for (k = 0;k < word_size;k++) {
918  word = ((0xffffffff ^ (3 << k*2)) | (i << k*2)) & 0xffffffff;
919  mb_lt->hashtable[word] = 0;
920  }
921  }
922 
923  return 0;
924 }
925 
926 
927 /** Fills in the hashtable and next_pos fields of BlastMBLookupTable*
928  * for the contiguous case.
929  *
930  * @param query the query sequence [in]
931  * @param location locations on the query to be indexed in table [in]
932  * @param mb_lt the (already allocated) megablast lookup table structure [in|out]
933  * @return zero on success, negative number on failure.
934  */
935 
936 static Int2
939  BlastMBLookupTable* mb_lt,
940  const LookupTableOptions* lookup_options,
941  Uint1* counts)
942 {
943  BlastSeqLoc* loc;
944  /* 12-mers (or perhaps 8-mers) are used to build the lookup table
945  and this is what kLutWordLength specifies. */
946  const Int4 kLutWordLength = mb_lt->lut_word_length;
947  const Int8 kLutMask = mb_lt->hashsize - 1;
948  /* The user probably specified a much larger word size (like 28)
949  and this is what full_word_size is. */
950  Int4 full_word_size = mb_lt->word_length;
951  Int4 index;
952  PV_ARRAY_TYPE *pv_array;
953  Int4 pv_array_bts;
954  /* The calculation of the longest chain can be cpu intensive for
955  long queries or sets of queries. So we use a helper_array to
956  keep track of this, but compress it by kCompressionFactor so
957  it stays in cache. Hence we only end up with a conservative
958  (high) estimate for longest_chain, but this does not seem to
959  affect the overall performance of the rest of the program. */
960  const Int4 kCompressionFactor=2048; /* compress helper_array by this much */
961  Uint4 longest_chain;
962  Uint4* helper_array;
963  const Boolean kDbFilter = lookup_options->db_filter;
964 
965  ASSERT(mb_lt);
966 
967  mb_lt->next_pos = (Int4 *)calloc(query->length + 1, sizeof(Int4));
968  if (mb_lt->next_pos == NULL)
969  return -1;
970 
971  pv_array = mb_lt->pv_array;
972  pv_array_bts = mb_lt->pv_array_bts;
973 
974  helper_array = (Uint4*) calloc(mb_lt->hashsize/kCompressionFactor,
975  sizeof(Uint4));
976  if (helper_array == NULL)
977  return -1;
978 
979  /* if filtering by database word counts, then reset the pv array to avoid
980  too many bits set for database scanning */
981  if (kDbFilter) {
982  memset(pv_array, 0,
983  (mb_lt->hashsize >> mb_lt->pv_array_bts) * PV_ARRAY_BYTES);
984  }
985 
986  for (loc = location; loc; loc = loc->next) {
987  /* We want index to be always pointing to the start of the word.
988  Since sequence pointer points to the end of the word, subtract
989  word length from the loop boundaries. */
990  Int4 from = loc->ssr->left;
991  Int4 to = loc->ssr->right - kLutWordLength;
992  Int8 ecode = 0;
993  Int4 last_offset;
994  Uint1* pos;
995  Uint1* seq;
996  Uint1 val;
997  Uint1 max_word_count = lookup_options->max_db_word_count;
998 // int counter = 1; /* collect this many adjacent words */
999  int shift = 0;
1000  int pos_shift = 0;
1001  if (lookup_options->stride > 0) {
1002  shift = lookup_options->stride - 1;
1003  pos_shift = kLutWordLength + 1;
1004  }
1005 
1006 
1007  /* case of unmasked region >= kLutWordLength but < full_word_size,
1008  so no hits should be generated. */
1009  if (full_word_size > (loc->ssr->right - loc->ssr->left + 1))
1010  continue;
1011 
1012  seq = query->sequence_start + from;
1013  pos = seq + kLutWordLength;
1014 
1015  /* Also add 1 to all indices, because lookup table indices count
1016  from 1. */
1017  from -= kLutWordLength - 2;
1018  last_offset = to + 2;
1019 
1020  for (index = from; index <= last_offset; index++) {
1021  val = *++seq;
1022  /* if an ambiguity is encountered, do not add
1023  any words that would contain it */
1024  if ((val & BLAST2NA_MASK) != 0) {
1025  ecode = 0;
1026  pos = seq + kLutWordLength;
1027  continue;
1028  }
1029 
1030  /* get next base */
1031  ecode = ((ecode << BITS_PER_NUC) & kLutMask) + val;
1032  if (seq < pos)
1033  continue;
1034 
1035  /* if filtering by database word count, then do not add words
1036  with too many counts */
1037  if (kDbFilter) {
1038  if (!(ecode & 1)) {
1039  if ((counts[ecode / 2] >> 4) >= max_word_count) {
1040  continue;
1041  }
1042  }
1043  else {
1044  if ((counts[ecode / 2] & 0xf) >= max_word_count) {
1045  continue;
1046  }
1047  }
1048  }
1049 
1050 
1051  /* collect 1 word and skip lookup_options->stride */
1052 
1053  /*
1054  if (!counter) {
1055  pos = seq + lookup_options->stride;
1056  counter = 1;
1057 
1058  continue;
1059  }
1060  if (lookup_options->stride) {
1061  counter--;
1062  }
1063  */
1064 
1065 #ifdef LOOKUP_VERBOSE
1066  mb_lt->num_words_added++;
1067 #endif
1068 
1069  if (mb_lt->hashtable[ecode] == 0) {
1070 #ifdef LOOKUP_VERBOSE
1071  mb_lt->num_unique_pos_added++;
1072 #endif
1073  PV_SET(pv_array, ecode, pv_array_bts);
1074  }
1075  else {
1076  helper_array[ecode/kCompressionFactor]++;
1077  }
1078  mb_lt->next_pos[index] = mb_lt->hashtable[ecode];
1079  mb_lt->hashtable[ecode] = index;
1080 
1081  /* skip shift words */
1082  index += shift;
1083  seq += shift;
1084  pos = seq + pos_shift;
1085  }
1086  }
1087 
1088  if (Blast_ProgramIsMapping(lookup_options->program_number)) {
1089  s_RemovePolyAWords(mb_lt);
1090  }
1091 
1092  longest_chain = 2;
1093  for (index = 0; index < mb_lt->hashsize / kCompressionFactor; index++)
1094  longest_chain = MAX(longest_chain, helper_array[index]);
1095 
1096  mb_lt->longest_chain = longest_chain;
1097  sfree(helper_array);
1098  return 0;
1099 }
1100 
1101 
/** Scan a subject sequence and update word counters, for 16-base words with
 * scan step of 1. The counters are 4-bit and saturate at max_word_count.
 *
 * @param sequence Subject sequence [in]
 * @param mb_lt Megablast lookup table [in|out]
 * @param counts Word counters, two 4-bit counters per byte [in|out]
 * @param max_word_count Value at which a counter stops increasing [in]
 */
static Int2
/* NOTE(review): the line naming this function (s_MBCountWordsInSubject_16_1,
   per the call in s_ScanSubjectForWordCounts below) and declaring its first
   parameter (const BLAST_SequenceBlk* sequence) was stripped during
   extraction of this copy of the file. */
                             BlastMBLookupTable* mb_lt,
                             Uint1* counts,
                             Uint1 max_word_count)
{
    Uint1 *s;                         /* compressed subject bytes, 4 bases/byte */
    Int4 i;
    Int8 mask = mb_lt->hashsize - 1;  /* keeps the low 2*lut_word_length bits */
    Int8 word, index, w;
    /* number of 16-base words that fit in the sequence */
    const Int4 kNumWords
        = sequence->length - mb_lt->lut_word_length;

    PV_ARRAY_TYPE* pv = mb_lt->pv_array;
    Int4 pv_array_bts = mb_lt->pv_array_bts;
    Int4 shift;

    if (!sequence || !counts || !mb_lt || !pv) {
        return -1;
    }

    ASSERT(mb_lt->lut_word_length == 16);

    /* scan the words in the sequence: w is a 5-byte (20-base) sliding
       window; (w >> shift) & mask extracts the 32-bit word that starts
       at base offset i */
    shift = 8;
    s = sequence->sequence;
    w = (Int8)s[0] << 24 | (Int8)s[1] << 16 | (Int8)s[2] << 8 | s[3];
    for (i = 0;i < kNumWords;i++) {

        if (i % COMPRESSION_RATIO == 0) {
            /* word starts on a byte boundary: pull in the next byte */
            shift = 8;
            w = (w << 8) | (Int8)s[i / COMPRESSION_RATIO + 4];
        }
        else {
            /* word starts mid-byte: select it by shifting 2 bits less */
            shift -= 2;
            ASSERT(shift > 0);
        }

        word = (w >> shift) & mask;

        /* skip words that do not appear in the query */
        if (!PV_TEST(pv, word, pv_array_bts)) {
            continue;
        }

        /* update the counter; two 4-bit counters are packed per byte:
           odd words use the low nibble, even words the high nibble,
           and both saturate at max_word_count */
        index = word / 2;
        if (word & 1) {
            if ((counts[index] & 0xf) < max_word_count) {
                counts[index]++;
            }
        }
        else {
            if ((counts[index] >> 4) < max_word_count) {
                counts[index] += 1 << 4;
            }
        }
    }

    return 0;
}
1170 
/** Scan database sequences and count query words that appear in the database.
 * Word counters saturate at max_word_count occurrences.
 *
 * @param seq_src Source for subject sequences [in]
 * @param mb_lt Megablast lookup table [in|out]
 * @param counts 4-bit word counters, two per byte [in|out]
 * @param max_word_count Saturation value for the counters [in]
 */
static Int2
/* NOTE(review): the line naming this function (s_ScanSubjectForWordCounts)
   and declaring its first parameter (BlastSeqSrc* seq_src) was stripped
   during extraction of this copy of the file. */
                           BlastMBLookupTable* mb_lt,
                           Uint1* counts,
                           Uint1 max_word_count)
{
    BlastSeqSrcIterator* itr;
    BlastSeqSrcGetSeqArg seq_arg;
    PV_ARRAY_TYPE* pv = mb_lt->pv_array;

    if (!seq_src || !pv || !counts) {
        return -1;
    }

    memset(&seq_arg, 0, sizeof(seq_arg));
    /* NOTE(review): presumably the "protein" encoding here requests the
       subject sequence as stored, without ambiguity expansion -- confirm
       against BlastSeqSrcGetSequence semantics */
    seq_arg.encoding = eBlastEncodingProtein;

    /* scan subject sequences and update the counters for each */
    /* NOTE(review): a stripped line here likely reset the seq_src chunk
       iterator before scanning */
    itr = BlastSeqSrcIteratorNewEx(MAX(BlastSeqSrcGetNumSeqs(seq_src)/100,1));
    while ((seq_arg.oid = BlastSeqSrcIteratorNext(seq_src, itr))
           != BLAST_SEQSRC_EOF) {

        BlastSeqSrcGetSequence(seq_src, &seq_arg);
        s_MBCountWordsInSubject_16_1(seq_arg.seq, mb_lt, counts,
                                     max_word_count);
        BlastSeqSrcReleaseSequence(seq_src, &seq_arg);
    }

    BlastSequenceBlkFree(seq_arg.seq);
    /* NOTE(review): a stripped line here likely freed itr
       (BlastSeqSrcIteratorFree) */

    return 0;
}
1212 
1213 
/* Documentation in mb_lookup.h */
/* NOTE(review): the first line of this function's signature
   (Int2 BlastMBLookupTableNew(BLAST_SequenceBlk* query, BlastSeqLoc* location,)
   was stripped during extraction, as were the error-path cleanup calls that
   originally preceded several of the "return -1" statements below. */
                         BlastMBLookupTable** mb_lt_ptr,
                         const LookupTableOptions* lookup_options,
                         const QuerySetUpOptions* query_options,
                         Int4 approx_table_entries,
                         Int4 lut_width,
                         BlastSeqSrc* seqsrc)
{
    Int4 pv_size;
    Int2 status = 0;
    BlastMBLookupTable* mb_lt;
    const Int4 kTargetPVSize = 131072;    /* PV array bytes that fit in cache */
    const Int4 kSmallQueryCutoff = 15000; /* below this, shrink the PV array */
    const Int4 kLargeQueryCutoff = 800000;/* above this, shrink the PV array */
    Uint1* counts = NULL; /* array of word counts */

    *mb_lt_ptr = NULL;

    if (!location || !query) {
        /* Empty sequence location provided */
        return -1;
    }

    mb_lt = (BlastMBLookupTable*)calloc(1, sizeof(BlastMBLookupTable));
    if (mb_lt == NULL) {
        return -1;
    }

    ASSERT(lut_width >= 9);
    mb_lt->word_length = lookup_options->word_size;
/* mb_lt->skip = lookup_options->skip; */
    /* stride is stored as a boolean flag: nonzero option enables striding */
    mb_lt->stride = lookup_options->stride > 0;
    mb_lt->lut_word_length = lut_width;
    /* one hashtable entry per possible lut word (2 bits per base) */
    mb_lt->hashsize = 1ULL << (BITS_PER_NUC * mb_lt->lut_word_length);

    mb_lt->hashtable = (Int4*)calloc(mb_lt->hashsize, sizeof(Int4));
    if (mb_lt->hashtable == NULL) {
        /* NOTE(review): a stripped line here originally released mb_lt */
        return -1;
    }

    if (location &&
        mb_lt->word_length > mb_lt->lut_word_length &&
        s_HasMaskAtHashEnabled(query_options)) {
        /* NOTE(review): a stripped line here likely recorded the inverted
           mask locations (cf. the analogous branch in
           BlastNaHashLookupTableNew) */
    }

    /* Allocate the PV array. To fit in the external cache of
       latter-day microprocessors, the PV array cannot have one
       bit for for every lookup table entry. Instead we choose
       a size that should fit in cache and make a single bit
       of the PV array handle multiple hashtable entries if
       necessary.

       If the query is too small or too large, the compression
       should be higher. Small queries don't reuse the PV array,
       and large queries saturate it. In either case, cache
       is better used on something else */

    if (mb_lt->lut_word_length <= 12) {
        if (mb_lt->hashsize <= 8 * kTargetPVSize)
            pv_size = (Int4)(mb_lt->hashsize >> PV_ARRAY_BTS);
        else
            pv_size = kTargetPVSize / PV_ARRAY_BYTES;
    }
    else {
        /* use 8M-byte pv array for large lut word (only size 16 implemented
           currently) */
        pv_size = kTargetPVSize * 64 / PV_ARRAY_BYTES;
    }

    if(!lookup_options->db_filter &&
       (approx_table_entries <= kSmallQueryCutoff ||
        approx_table_entries >= kLargeQueryCutoff)) {
        pv_size = pv_size / 2;
    }

    mb_lt->pv_array_bts = ilog2(mb_lt->hashsize / pv_size);
    mb_lt->pv_array = calloc(PV_ARRAY_BYTES, pv_size);
    if (mb_lt->pv_array == NULL) {
        /* NOTE(review): a stripped line here originally released mb_lt */
        return -1;
    }


    /* allocate word counters, to save memory we are using 4 bits per word */
    if (lookup_options->db_filter) {
        counts = (Uint1*)calloc(mb_lt->hashsize / 2, sizeof(Uint1));
        if (counts == NULL) {
            /* NOTE(review): a stripped line here originally released mb_lt */
            return -1;
        }
    }

    if (lookup_options->db_filter) {
        /* set PV bits for query words, then count their occurrences in
           the database so that over-represented words can be skipped */
        s_FillPV(query, location, mb_lt, lookup_options);
        s_ScanSubjectForWordCounts(seqsrc, mb_lt, counts,
                                   lookup_options->max_db_word_count);
    }

    if (lookup_options->mb_template_length > 0) {
        /* discontiguous megablast */
        mb_lt->scan_step = 1;
        status = s_FillDiscMBTable(query, location, mb_lt, lookup_options);
    }
    else {
        /* contiguous megablast */
        mb_lt->scan_step = mb_lt->word_length - mb_lt->lut_word_length + 1;
        status = s_FillContigMBTable(query, location, mb_lt, lookup_options,
                                     counts);

        if (status) {
            /* NOTE(review): a stripped line here originally released mb_lt */
            return -1;
        }
    }

    if (lookup_options->db_filter && counts) {
        free(counts);
    }


    if (status > 0) {
        /* NOTE(review): a stripped line here originally released mb_lt */
        return status;
    }

    *mb_lt_ptr = mb_lt;

#ifdef LOOKUP_VERBOSE
    printf("lookup table size: %ld (%d letters)\n", mb_lt->hashsize,
           mb_lt->lut_word_length);
    printf("words in table: %d\n", mb_lt->num_words_added);
    printf("filled entries: %d (%f%%)\n", mb_lt->num_unique_pos_added,
           100.0 * mb_lt->num_unique_pos_added / mb_lt->hashsize);
    printf("PV array size: %d bytes (%ld table entries/bit)\n",
           pv_size * PV_ARRAY_BYTES,
           mb_lt->hashsize / (pv_size << PV_ARRAY_BTS));
    printf("longest chain: %d\n", mb_lt->longest_chain);
#endif
    return 0;
}
1357 
1358 
/* NOTE(review): the signature line
   (BlastMBLookupTable* BlastMBLookupTableDestruct(BlastMBLookupTable* mb_lt))
   was stripped during extraction. Frees all table storage; returns NULL. */
{
    if (!mb_lt)
        return NULL;

    sfree(mb_lt->hashtable);
    sfree(mb_lt->next_pos);
    sfree(mb_lt->hashtable2);
    sfree(mb_lt->next_pos2);
    sfree(mb_lt->pv_array);
    if (mb_lt->masked_locations)
    /* NOTE(review): the original body of this `if` -- presumably a
       BlastSeqLocFree call -- was stripped during extraction; as shown,
       the `if` incorrectly guards the sfree below */
    sfree(mb_lt);
    /* sfree NULLs its argument, so this returns NULL */
    return mb_lt;
}
1374 
1375 
/* Hash function: Fowler-Noll-Vo (FNV) hash
   http://www.isthe.com/chongo/tech/comp/fnv/index.html
   NOTE(review): the signature line (presumably
   Uint4 FNV_hash(Uint1* seq, Uint4 mask)) was stripped during extraction.
   This is the FNV-1 variant (multiply, then xor), folding exactly 4 bytes
   of seq and masking the result down to the table size. */
{
    const Uint4 fnv_prime = 16777619u;
    const Uint4 fnv_offset_basis = 2166136261u;
    Int4 i;
    Uint4 hash;

    hash = fnv_offset_basis;
    for (i = 0;i < 4;i++) {
        hash *= fnv_prime;
        hash ^= seq[i];
    }

    /* mask selects the low bits so the hash indexes the backbone */
    return hash & mask;
}
1393 
1394 
static Int2
/* NOTE(review): two signature lines were stripped during extraction:
   the line naming this function (s_NaHashLookupFillPV, per the call in
   BlastNaHashLookupTableNew) with its first parameter
   (BLAST_SequenceBlk* query) and the line declaring the last parameter
   (BlastNaHashLookupTable* lookup).
   Sets a PV-array bit for every 16-base ambiguity-free word of the query
   within the given locations. */
                     BlastSeqLoc* locations,
{
    BlastSeqLoc *loc;
    Int4 word_length;
    Int4 lut_word_length;
    PV_ARRAY_TYPE* pv = NULL;
    const Int4 pv_array_bts = lookup->pv_array_bts;

    ASSERT(lookup);

    word_length = lookup->word_length;
    lut_word_length = lookup->lut_word_length;
    pv = lookup->pv;
    ASSERT(pv);

    for (loc = locations; loc; loc = loc->next) {
        /* We want index to be always pointing to the start of the word.
           Since sequence pointer points to the end of the word, subtract
           word length from the loop boundaries. */
        Int4 from = loc->ssr->left;
        Int4 to = loc->ssr->right;
        Uint4 ecode = 0;   /* rolling 2-bit-per-base word; Uint4 width
                              naturally truncates to 16 bases */
        Uint1* pos;        /* first position at which a full word is ready */
        Uint1* seq;
        Uint1* end;
        Uint1 base;

        /* case of unmasked region >= kLutWordLength but < full_word_size,
           so no hits should be generated. */
        if (word_length > (loc->ssr->right - loc->ssr->left + 1)) {
            continue;
        }

        seq = query->sequence + from;
        pos = seq + lut_word_length - 1;
        end = query->sequence + to + 1;

        for (; seq < end; seq++) {

            base = *seq;
            /* if an ambiguity is encountered, do not add
               any words that would contain it */
            if ((base & BLAST2NA_MASK) != 0) {
                ecode = 0;
                pos = seq + lut_word_length;
                continue;
            }

            /* get next base */
            ecode = (ecode << BITS_PER_NUC) | base;
            if (seq < pos) {
                continue;
            }

            PV_SET(pv, (Int8)ecode, pv_array_bts);
        }
    }

    return 0;
}
1458 
/* Get number of set bits (adapted
   from http://graphics.stanford.edu/~seander/bithacks.html)
   @param v Bit vector [in]
   @return Number of set bits
   NOTE(review): the signature line (presumably
   static Int4 s_Popcount(Uint4 v)) was stripped during extraction. */
{
    if (v==0) return 0; // early bailout for sparse vectors
    /* parallel bit count: pair sums, nibble sums, then one multiply to
       accumulate all byte counts into the top byte */
    v = v - ((v >> 1) & 0x55555555);
    v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
    v = ((v + (v >> 4)) & 0xF0F0F0F);
    v = v * 0x1010101;

    return v >> 24; // count
}
1474 
/** Sparse array of Uint1 implemented with a bitfield. The implementation
    assumes that indices present are known beforehand and the array is used
    only to access values with certain indices.
    NOTE(review): the enclosing `typedef struct ... { ... } BlastSparseUint1Array;`
    open/close lines were stripped during extraction. */
{
    Uint4* bitfield;    /**< bitfield with bits set for present indices;
                             borrowed, not owned (not freed by the dtor) */
    Uint1* values;      /**< array of values for present indices, in bit order */
    Int4* counts;       /**< counts[i] = cumulative number of bits set in
                             bitfield[0..i] */
    Uint4 num_elements; /**< number of values present in the array */
    Uint4 length;       /**< length of the bitfield, in Uint4 cells */
1486 
1487 
static BlastSparseUint1Array*
/* NOTE(review): the line naming this function (BlastSparseUint1ArrayFree,
   per the calls in BlastSparseUint1ArrayNew) and declaring its parameter
   (BlastSparseUint1Array* array) was stripped during extraction.
   Frees the owned members and the struct itself; array->bitfield is
   deliberately NOT freed because it is borrowed (e.g. lookup->pv).
   Always returns NULL so callers can reset their pointer. */
{
    if (!array) {
        return NULL;
    }

    if (array->values) {
        free(array->values);
    }

    if (array->counts) {
        free(array->counts);
    }

    free(array);

    return NULL;
}
1507 
static BlastSparseUint1Array*
/* NOTE(review): the line naming this function (BlastSparseUint1ArrayNew)
   and declaring its parameters (per the call site: a Uint4* bitfield and
   an Int8 len giving the number of bits) was stripped during extraction.
   Builds the cumulative popcount index over the caller's bitfield and
   allocates one Uint1 value per set bit. */
{
    Int4 i;
    BlastSparseUint1Array* retval = calloc(1, sizeof(BlastSparseUint1Array));

    /* NOTE(review): if calloc succeeded but bitfield is NULL, retval is
       leaked here -- the bitfield check should precede the allocation */
    if (!retval || !bitfield) {
        return NULL;
    }

    retval->bitfield = bitfield;
    /* number of Uint4 cells in the bitfield */
    retval->length = (Uint4)(len >> PV_ARRAY_BTS);
    retval->counts = calloc(retval->length, sizeof(Int4));
    if (!retval->counts) {
        BlastSparseUint1ArrayFree(retval);
        return NULL;
    }

    /* prefix sums of per-cell popcounts; counts[i] is the number of set
       bits in cells 0..i, used for O(1) index translation */
    retval->counts[0] = s_Popcount(retval->bitfield[0]);
    for (i = 1;i < retval->length;i++) {
        retval->counts[i] = retval->counts[i - 1] +
            s_Popcount(retval->bitfield[i]);
    }

    Int4 num_elements = retval->counts[retval->length - 1];
    retval->num_elements = num_elements;
    retval->values = calloc(num_elements, sizeof(Uint1));
    if (!retval->values) {
        BlastSparseUint1ArrayFree(retval);
        return NULL;
    }

    return retval;
}
1542 
/* Get index into array->values for a given vector index.
   Precondition: the bit at `index` must be set in the bitfield (asserted).
   Returns -1 when array is NULL or index is out of range.
   NOTE(review): the line naming this function (BlastSparseUint1ArrayGetIndex,
   per the call in BlastSparseUint1ArrayGetElement) and declaring its
   parameters was stripped during extraction. */
static Int4
{
    /* index into bitfield */
    Int4 idx = (Int4)(index >> PV_ARRAY_BTS);

    /* bit number within a bitfield cell (mod 32) */
    Int4 bit_number = index & PV_ARRAY_MASK;

    /* number of bits set before a specified bit */
    Int4 bit_count = 0;

    /* NOTE(review): idx (Int4) vs array->length (Uint4) is a mixed-sign
       comparison; a negative idx would pass as "large" -- callers are
       expected to supply non-negative indices */
    if (!array || idx >= array->length) {
        return -1;
    }

    /* get number of bits set up to idx */
    bit_count = (idx > 0) ? array->counts[idx - 1] : 0;
    ASSERT(array->bitfield[idx] & (1 << bit_number));

    /* add number of bits set up to bit number in the cell */
    bit_count += s_Popcount(array->bitfield[idx] & ((1 << bit_number) - 1));
    bit_count++;


    ASSERT(bit_count > 0);
    /* convert 1-based rank to 0-based values index */
    return bit_count - 1;
}
1572 
1573 /* Get a pointer to a non zero element in the sparse vector */
1574 static Uint1*
1576 {
1577  Int4 sparse_index;
1578 
1579  if (!array) {
1580  return NULL;
1581  }
1582 
1583  sparse_index = BlastSparseUint1ArrayGetIndex(array, index);
1584  ASSERT(sparse_index < array->num_elements);
1585  if (sparse_index < 0 || sparse_index > array->num_elements) {
1586  return NULL;
1587  }
1588 
1589  return array->values + sparse_index;
1590 }
1591 
1592 
/** Scan a subject sequence and update word counters, for 16-base words with
 * scan step of 1. Counters are bytes in a sparse array and saturate at
 * max_word_count.
 *
 * @param sequence Subject sequence [in]
 * @param lookup Hashed lookup table [in|out]
 * @param counts Word counters [in|out]
 * @param max_word_count Value at which a counter stops increasing [in]
 */
static Int2
/* NOTE(review): two signature lines were stripped during extraction; per
   the prototype in this file's cross-reference index the full signature is
   s_NaHashLookupCountWordsInSubject_16_1(const BLAST_SequenceBlk *sequence,
   BlastNaHashLookupTable *lookup, BlastSparseUint1Array *counts,
   Uint1 max_word_count). */
                                   BlastSparseUint1Array* counts,
                                   Uint1 max_word_count)
{
    Uint1 *s;                                    /* compressed subject bytes */
    Int4 i;
    Int8 mask = (1ULL << (16 * BITS_PER_NUC)) - 1; /* low 32 bits = 16 bases */
    Int8 word, w;
    /* number of 16-base words that fit in the sequence */
    const Int4 kNumWords
        = sequence->length - lookup->lut_word_length;

    PV_ARRAY_TYPE* pv = lookup->pv;
    Int4 pv_array_bts = lookup->pv_array_bts;
    Int4 shift;
    Uint1* pelem;

    if (!sequence || !counts || !lookup || !pv) {
        return -1;
    }

    ASSERT(lookup->lut_word_length == 16);
    if (sequence->length < lookup->lut_word_length) {
        return -1;
    }

    /* scan the words in the sequence: w is a 5-byte (20-base) sliding
       window; (w >> shift) & mask extracts the word starting at base i */
    shift = 8;
    s = sequence->sequence;
    w = (Int8)s[0] << 24 | (Int8)s[1] << 16 | (Int8)s[2] << 8 | s[3];
    for (i = 0;i < kNumWords;i++) {

        if (i % COMPRESSION_RATIO == 0) {
            /* word starts on a byte boundary: pull in the next byte */
            shift = 8;
            w = (w << 8) | (Int8)s[i / COMPRESSION_RATIO + 4];
        }
        else {
            shift -= 2;
            ASSERT(shift > 0);
        }

        word = (w >> shift) & mask;

        /* skip words that do not appear in the query */
        if (!PV_TEST(pv, word, pv_array_bts)) {
            continue;
        }

        /* update the counter; the PV_TEST above guarantees the word's bit
           is set, so the sparse lookup succeeds */
        pelem = BlastSparseUint1ArrayGetElement(counts, word);
        if (*pelem < max_word_count) {
            (*pelem)++;
        }
    }

    return 0;
}
1658 
1659 
1660 /* Thread local data for database word counting phase of lookup table
1661  generation (for Magic-BLAST) */
1663 {
1670 
1671 
/* NOTE(review): the signature lines (presumably
   static NaHashLookupThreadData*
   NaHashLookupThreadDataFree(NaHashLookupThreadData* th))
   were stripped during extraction.
   Releases all per-thread resources and the container; returns NULL. */
{
    if (!th) {
        return NULL;
    }

    if (th->seq_arg) {
        Int4 i;
        for (i = 0;i < th->num_threads;i++) {
            BlastSequenceBlkFree(th->seq_arg[i].seq);
        }
        free(th->seq_arg);
    }

    if (th->itr) {
        Int4 i;
        for (i = 0;i < th->num_threads;i++) {
            BlastSeqSrcIteratorFree(th->itr[i]);
        }
        free(th->itr);
    }

    if (th->seq_src) {
        Int4 i;
        for (i = 0;i < th->num_threads;i++) {
            BlastSeqSrcFree(th->seq_src[i]);
        }
        free(th->seq_src);
    }

    if (th->word_counts) {
        Int4 i;
        /* entries 1..n-1 are shallow copies sharing bitfield and counts
           with entry 0, so only their private values arrays are freed */
        for (i = 1;i < th->num_threads;i++) {
            if (th->word_counts[i]) {
                if (th->word_counts[i]->values) {
                    free(th->word_counts[i]->values);
                }
                free(th->word_counts[i]);
            }


        }
        /* entry 0 owns counts/values and is freed via the full dtor */
        BlastSparseUint1ArrayFree(th->word_counts[0]);
        free(th->word_counts);
    }

    free(th);

    return NULL;
}
1723 
1724 
/* NOTE(review): the signature lines (presumably
   static NaHashLookupThreadData*
   NaHashLookupThreadDataNew(Int4 num_threads, BlastNaHashLookupTable* lookup,)
   were stripped during extraction, as were: the line allocating retval, the
   NaHashLookupThreadDataFree(retval) cleanup calls on each error path, and
   the BlastSparseUint1ArrayNew(lookup->pv, ...) call for thread 0. */
                          BlastSeqSrc* seq_src)
{
    Int4 i;

    if (num_threads < 1 || !lookup || !seq_src) {
        return NULL;
    }

    /* NOTE(review): retval's allocation line was stripped here */
    if (!retval) {
        return NULL;
    }

    retval->seq_arg = calloc(num_threads, sizeof(BlastSeqSrcGetSeqArg));
    if (!retval->seq_arg) {
        return NULL;
    }

    retval->itr = calloc(num_threads, sizeof(BlastSeqSrcIterator*));
    if (!retval->itr) {
        return NULL;
    }

    retval->seq_src = calloc(num_threads, sizeof(BlastSeqSrc*));
    if (!retval->seq_src) {
        return NULL;
    }

    retval->word_counts = calloc(num_threads, sizeof(BlastSparseUint1Array*));
    if (!retval->word_counts) {
        return NULL;
    }

    for (i = 0;i < num_threads;i++) {

        /* NOTE(review): a stripped line here likely initialized
           retval->seq_arg[i] (e.g. its encoding) */

        /* each thread gets its own view of the sequence source */
        retval->seq_src[i] = BlastSeqSrcCopy(seq_src);
        if (!retval->seq_src[i]) {
            return NULL;
        }

        /* each thread must have its own iterator, the small batch seems to
           work better for work balancing between threads */
        retval->itr[i] = BlastSeqSrcIteratorNewEx(1);
        if (!retval->itr[i]) {
            return NULL;
        }

        if (i == 0) {
            /* NOTE(review): stripped call, likely
               retval->word_counts[i] = BlastSparseUint1ArrayNew(lookup->pv, */
                                  1LL << (2 * lookup->lut_word_length));

            if (!retval->word_counts[i]) {
                return NULL;
            }
        }
        else {
            /* Make shallow copies of the counts array. We do not copy data
               that are read only to save memory. */
            retval->word_counts[i] = malloc(sizeof(BlastSparseUint1Array));
            if (!retval->word_counts[i]) {
                return NULL;
            }
            memcpy(retval->word_counts[i], retval->word_counts[0],
                   sizeof(BlastSparseUint1Array));

            /* each thread accumulates into its own private values array */
            retval->word_counts[i]->values = calloc(
                                         retval->word_counts[i]->num_elements,
                                         sizeof(Uint1));

            if (!retval->word_counts[i]->values) {
                return NULL;
            }
        }
    }

    retval->num_threads = num_threads;

    return retval;
}
1817 
/** Scan database sequences and count query words that appear in the database.
 * Then reset PV array bits that correspond to words that do not appear
 * in the database, or that appear max_word_count or more times.
 *
 * @param seq_src Source for subject sequences [in]
 * @param lookup Hashed lookup table [in|out]
 * @param in_num_threads Number of threads to use [in]
 * @param max_word_count Occurrence threshold at which words are dropped [in]
 */
static Int2
/* NOTE(review): the line naming this function
   (s_NaHashLookupScanSubjectForWordCounts, per the call in
   BlastNaHashLookupTableNew) and declaring its first parameter
   (BlastSeqSrc* seq_src) was stripped during extraction. */
                                       Uint4 in_num_threads,
                                       Uint1 max_word_count)
{
    Int8 i;
    Int4 k, b;
    Int4 num_db_seqs, th_batch;
    NaHashLookupThreadData* th_data = NULL;
    Uint4 num_threads;

    if (!seq_src || !lookup || !lookup->pv) {
        return -1;
    }

    ASSERT(lookup->lut_word_length == 16);

    /* pv array must be one bit per word */
    ASSERT(lookup->pv_array_bts == 5);

    num_db_seqs = BlastSeqSrcGetNumSeqs(seq_src);
    num_threads = MIN(in_num_threads, num_db_seqs);
    /* NOTE(review): if num_db_seqs is 0, num_threads is 0 and this
       division faults -- TODO confirm callers guarantee a non-empty db */
    th_batch = BlastSeqSrcGetNumSeqs(seq_src) / num_threads;

    th_data = NaHashLookupThreadDataNew(num_threads, lookup, seq_src);
    if (!th_data) {
        return -1;
    }

    /* reset database iterator */
    /* NOTE(review): the reset call itself (a stripped line) is missing
       from this copy of the file */

    /* scan subject sequences and update the counters for each */
#pragma omp parallel for if (num_threads > 1) num_threads(num_threads) \
    default(none) shared(num_threads, th_data, lookup, \
                         th_batch, max_word_count) private(i) \
    schedule(dynamic, 1)

    for (i = 0;i < num_threads;i++) {
        Int4 j;
        for (j = 0;j < th_batch;j++) {

            /* sequence retrieval is serialized; counting runs in parallel */
#pragma omp critical (get_sequence_for_word_counts)
            {
                th_data->seq_arg[i].oid = BlastSeqSrcIteratorNext(
                                                  th_data->seq_src[i],
                                                  th_data->itr[i]);

                if (th_data->seq_arg[i].oid != BLAST_SEQSRC_EOF) {
                    BlastSeqSrcGetSequence(th_data->seq_src[i],
                                           &th_data->seq_arg[i]);
                }
            }

            if (th_data->seq_arg[i].oid != BLAST_SEQSRC_EOF) {

                /* NOTE(review): stripped call lines here, presumably
                   s_NaHashLookupCountWordsInSubject_16_1(th_data->seq_arg[i].seq,
                   and BlastSeqSrcReleaseSequence(th_data->seq_src[i], */
                                                  lookup,
                                                  th_data->word_counts[i],
                                                  max_word_count);
                                           &th_data->seq_arg[i]);
            }
        }
    }

    /* scan the last sequences (those beyond num_threads * th_batch) */
    while ((th_data->seq_arg[0].oid = BlastSeqSrcIteratorNext(seq_src,
                                                      th_data->itr[0]))
           != BLAST_SEQSRC_EOF) {

        BlastSeqSrcGetSequence(seq_src, &th_data->seq_arg[0]);
        /* NOTE(review): a stripped call line here, presumably
           s_NaHashLookupCountWordsInSubject_16_1(th_data->seq_arg[0].seq, lookup, */
                                            th_data->word_counts[0],
                                            max_word_count);
        BlastSeqSrcReleaseSequence(seq_src, &th_data->seq_arg[0]);
    }

    /* aggregate counts: fold each thread's private values into thread 0,
       saturating at max_word_count */
    for (i = 0;i < th_data->word_counts[0]->num_elements;i++) {
        for (k = 1;k < num_threads;k++) {
            th_data->word_counts[0]->values[i] =
                MIN(th_data->word_counts[0]->values[i] +
                    th_data->word_counts[k]->values[i],
                    max_word_count);
        }
    }

    /* iterate over word counts and clear bits for words that appear too
       often or not at all; i walks bitfield cells, b is the current bit
       mask within a cell, k is the dense index into the values array */
    i = 0;
    b = 1;
    k = 0;
    while (i < th_data->word_counts[0]->length) {

        /* skip bit field array elements with all bits cleared */
        if (th_data->word_counts[0]->bitfield[i] == 0) {
            i++;
            b = 1;
            continue;
        }

        if (th_data->word_counts[0]->bitfield[i] & b) {
            ASSERT(k < th_data->word_counts[0]->num_elements);

            /* clear bit if word count is too low or too large */
            if (th_data->word_counts[0]->values[k] == 0 ||
                th_data->word_counts[0]->values[k] >= max_word_count) {

                th_data->word_counts[0]->bitfield[i] &= ~b;
            }
            k++;
        }
        /* NOTE(review): b is a signed Int4; shifting the bit out of
           position 31 to make b == 0 relies on wraparound of a signed
           left shift, which is not well defined C */
        b <<= 1;
        if (b == 0) {
            i++;
            b = 1;
        }
    }

    NaHashLookupThreadDataFree(th_data);

    return 0;
}
1951 
1952 
1954 {
1955  Int8 word;
1956  Int4 word_size;
1957  Int4 i, k;
1958  PV_ARRAY_TYPE* pv = NULL;
1959  Int4 pv_array_bts;
1960 
1961  if (!lookup) {
1962  return -1;
1963  }
1964 
1965  ASSERT(lookup->lut_word_length == 16);
1966 
1967  /* a bit must represent a single word */
1968  ASSERT(lookup->pv_array_bts == 5);
1969 
1970  pv = lookup->pv;
1971  pv_array_bts = lookup->pv_array_bts;
1972  word_size = lookup->lut_word_length;
1973 
1974  /* remove As and Ts */
1975  pv[0] &= ~(PV_ARRAY_TYPE)1;
1976  pv[0xffffffff >> pv_array_bts] &=
1977  ~((PV_ARRAY_TYPE)1 << (0xffffffff & PV_ARRAY_MASK));
1978 
1979  /* remove As with a single error */
1980  for (i = 1;i < 4;i++) {
1981  word = i;
1982  for (k = 0;k < word_size;k++) {
1983  pv[word >> pv_array_bts] &=
1984  ~((PV_ARRAY_TYPE)1 << (word & PV_ARRAY_MASK));
1985  }
1986  }
1987 
1988  /* remove Ts with a single error */
1989  for (i = 0;i < 3;i++) {
1990  for (k = 0;k < word_size;k++) {
1991  word = ((0xffffffff ^ (3 << k*2)) | (i << k*2)) & 0xffffffff;
1992 
1993  pv[word >> pv_array_bts] &=
1994  ~((PV_ARRAY_TYPE)1 << (word & PV_ARRAY_MASK));
1995  }
1996  }
1997 
1998  return 0;
1999 }
2000 
2001 
/** Pack the data structures comprising a nucleotide lookup table
 * into their final form
 * @param thin_backbone structure containing indexed query offsets [in][out]
 * @param offsets linked lists of 1-based query offsets chained through
 *        this array [in]
 * @param lookup the lookup table to populate [in][out]
 */
static void s_BlastNaHashLookupFinalize(BackboneCell* thin_backbone,
                                        Int4* offsets,
/* NOTE(review): the final parameter line
   (BlastNaHashLookupTable* lookup) was stripped during extraction. */
{
    Int4 i;
    Int4 overflow_cells_needed = 0;
    Int4 overflow_cursor = 0;
    Int4 longest_chain = 0;
    PV_ARRAY_TYPE *pv;
    const Int4 pv_array_bts = lookup->pv_array_bts;
    const Int8 kNumWords = 1LL << (2 * lookup->lut_word_length);
#ifdef LOOKUP_VERBOSE
    Int4 backbone_occupancy = 0;
    Int4 thick_backbone_occupancy = 0;
    Int4 num_overflows = 0;
    Int4 words_per_hash[5] = {0,};
#endif

    ASSERT(lookup->lut_word_length == 16);

    if (!lookup->pv) {

        lookup->pv = (PV_ARRAY_TYPE*)calloc(kNumWords >> lookup->pv_array_bts,
                                            sizeof(PV_ARRAY_TYPE));
        ASSERT(lookup->pv);
    }
    else {
        /* reset PV array, it might have been set earlier to count database
           words, and a few bits may need to be reset */
        memset(lookup->pv, 0, (kNumWords >> lookup->pv_array_bts) *
               sizeof(PV_ARRAY_TYPE));
    }
    pv = lookup->pv;
    ASSERT(pv != NULL);

    /* allocate the new lookup table */
    lookup->thick_backbone = (NaHashLookupBackboneCell *)calloc(
                                           lookup->backbone_size,
                                           sizeof(NaHashLookupBackboneCell));
    ASSERT(lookup->thick_backbone != NULL);

    /* find out how many cells are needed for the overflow array */
    for (i = 0; i < lookup->backbone_size; i++) {
        BackboneCell* b = &thin_backbone[i];
        Int4 num_hits = 0;
        Int4 num_words = 0;
        if (b->num_offsets > 0) {
            /* walk the collision chain: each node is a distinct word that
               hashed to this cell */
            for (; b; b = b->next) {
                num_hits += b->num_offsets;
                num_words++;
            }
        }

        if (num_words > NA_WORDS_PER_HASH || num_hits > NA_OFFSETS_PER_HASH) {
            /* +1 because we store unhashed word to resolve hash collisions
               +1 for number of offsets */
            overflow_cells_needed += num_hits + (num_words * 2);
        }
        longest_chain = MAX(longest_chain, num_hits);
    }

    lookup->longest_chain = longest_chain;

    /* allocate the overflow array */
    if (overflow_cells_needed > 0) {
        lookup->overflow = (Int4*)calloc(overflow_cells_needed, sizeof(Int4));
        ASSERT(lookup->overflow != NULL);
    }

    /* for each position in the lookup table backbone, */
    for (i = 0; i < lookup->backbone_size; i++) {

        Int4 num_words = 0;
        Int4 num_offsets = 0;
        NaHashLookupBackboneCell* cell = lookup->thick_backbone + i;
        BackboneCell* head = &thin_backbone[i];
        BackboneCell* b = NULL;
        Boolean is_overflow = FALSE;

        if (head->num_offsets == 0) {
            continue;
        }

#ifdef LOOKUP_VERBOSE
        thick_backbone_occupancy++;
#endif

        /* for each cell with the same hash value in the thin backbone
           count number of words and offsets stored */
        for (b = head; b; b = b->next) {
            num_words++;
            num_offsets += b->num_offsets;

#ifdef LOOKUP_VERBOSE
            backbone_occupancy++;
#endif
        }
        cell->num_words = num_words;

#ifdef LOOKUP_VERBOSE
        words_per_hash[((num_words < 6) ? num_words : 5) - 1]++;
#endif

        /* if the thin cell stores at most 3 words and 9 offsets, store them
           all in the thick backbone */
        if (num_words <= NA_WORDS_PER_HASH &&
            num_offsets <= NA_OFFSETS_PER_HASH) {
            Int4 k = 0;
            Int4 n = 0;

            for (b = head; b; b = b->next, k++) {
                Int4 j;
                cell->words[k] = b->word;
                cell->num_offsets[k] = b->num_offsets;

                PV_SET(pv, (Int8)b->word, pv_array_bts);

                /* walk the linked list of this word's query offsets,
                   chained through the offsets array */
                j = b->offset;
                while (j != 0) {
                    /* NOTE(review): a stripped line here was likely an
                       ASSERT bounding n */
                    /* offsets array stores 1-based offsets */
                    cell->offsets[n++] = j - 1;
                    j = offsets[j];
                }
            }
        }
        /* otherwise, store them in the overflow array */
        else if (num_words <= NA_WORDS_PER_HASH) {
            Int4 k = 0;
            for (b = head; b; b = b->next, k++) {
                cell->words[k] = b->word;
            }
            is_overflow = TRUE;
        }
        else {
            is_overflow = TRUE;
        }

        /* add words and offsets to overflow array: word, number of offsets,
           offsets */
        if (is_overflow) {
#ifdef LOOKUP_VERBOSE
            num_overflows++;
#endif
            cell->offsets[0] = overflow_cursor;
            for (b = head; b; b = b->next) {
                Int4 j;
                lookup->overflow[overflow_cursor++] = *(Int4*)(&b->word);
                lookup->overflow[overflow_cursor++] = b->num_offsets;

                j = b->offset;
                while (j != 0) {
                    /* offsets array stores 1-based offsets */
                    lookup->overflow[overflow_cursor++] = j - 1;
                    j = offsets[j];
                }

                ASSERT(overflow_cursor <= overflow_cells_needed);
                PV_SET(pv, (Int8)b->word, pv_array_bts);
            }
        }

        /* done with this chain */
        BackboneCellFree(thin_backbone[i].next);
    }
    lookup->offsets_size = overflow_cursor;

#ifdef LOOKUP_VERBOSE
    printf("backbone size: %d\n", lookup->backbone_size);
    printf("backbone occupancy: %d (%f%%)\n", backbone_occupancy,
           100.0 * backbone_occupancy / lookup->backbone_size);
    printf("thick_backbone occupancy: %d (%f%%)\n",
           thick_backbone_occupancy,
           100.0 * thick_backbone_occupancy / lookup->backbone_size);
    printf("num_overflows: %d\n", num_overflows);
    printf("\tnumber of words per hash\tcount\n");
    {
        Int4 ii;
        for (ii = 0;ii < 5;ii++) {
            printf("\t%d\t%d\n", ii + 1, words_per_hash[ii]);
        }
    }
    printf("overflow size: %d\n", overflow_cells_needed);
    printf("longest chain: %d\n", longest_chain);
#endif
}
2193 
2194 
/* NOTE(review): the signature lines (presumably
   BlastNaHashLookupTable*
   BlastNaHashLookupTableDestruct(BlastNaHashLookupTable* lookup))
   were stripped during extraction. Frees all table storage; returns NULL. */
{
    sfree(lookup->thick_backbone);
    sfree(lookup->overflow);
    if (lookup->masked_locations)
        lookup->masked_locations = BlastSeqLocFree(lookup->masked_locations);
    /* the guard is redundant: sfree tolerates NULL */
    if (lookup->pv)
        sfree(lookup->pv);
    sfree(lookup);

    return NULL;
}
2208 
2209 
/* NOTE(review): the first line of this function's signature (presumably
   Int2 BlastNaHashLookupTableNew(BLAST_SequenceBlk* query,) was stripped
   during extraction, as were a few call/allocation lines flagged below. */
                         BlastSeqLoc* locations,
                         BlastNaHashLookupTable** lut,
                         const LookupTableOptions* opt,
                         const QuerySetUpOptions* query_options,
                         BlastSeqSrc* seqsrc,
                         Uint4 num_threads)
{
    BackboneCell *thin_backbone = NULL;
    Int4* offsets = NULL;
    BlastNaHashLookupTable *lookup = *lut =
    /* NOTE(review): the stripped line above completed this statement,
       presumably with a calloc of one BlastNaHashLookupTable */
    /* Number of possible 16-base words */
    const Int8 kNumWords = (1ULL << 32);
    Int4 num_hash_bits = 8;
    Int4 i, num_unique_words = 0;

    ASSERT(lookup != NULL);

    /* database word filtering requires a database to scan */
    if (opt->db_filter && !seqsrc) {
        return -1;
    }

    lookup->word_length = opt->word_size;
    lookup->lut_word_length = 16;
    lookup->overflow = NULL;
    lookup->hash_callback = FNV_hash;

    if (opt->db_filter) {
        /* with database filtering some query words are not put in the lookup
           table and neighboring query words would be missed with larger scan
           step */
        lookup->scan_step = 1;
    }
    else {
        lookup->scan_step = lookup->word_length - lookup->lut_word_length + 1;
    }

    /* PV array does not use hashing */
    lookup->pv_array_bts = PV_ARRAY_BTS;
    lookup->pv = (PV_ARRAY_TYPE*)calloc(kNumWords >> lookup->pv_array_bts,
                                        sizeof(PV_ARRAY_TYPE));
    if (!lookup->pv) {
        return BLASTERR_MEMORY;
    }

    s_NaHashLookupFillPV(query, locations, lookup);
    /* NOTE(review): a stripped call line here, likely
       s_NaHashLookupRemovePolyAWords(lookup) */

    /* count words in the database */
    if (opt->db_filter) {
        s_NaHashLookupScanSubjectForWordCounts(seqsrc, lookup, num_threads,
                                               opt->max_db_word_count);
    }

    /* find number of unique query words */
    for (i = 0;i < kNumWords >> lookup->pv_array_bts; i++) {
        num_unique_words += s_Popcount(lookup->pv[i]);
    }

    /* find number of bits to use for hash function */
    while (num_hash_bits < 32 &&
           (1LL << num_hash_bits) < num_unique_words) {
        num_hash_bits++;
    }

    lookup->backbone_size = 1 << num_hash_bits;
    lookup->mask = lookup->backbone_size - 1;

    thin_backbone = calloc(lookup->backbone_size, sizeof(BackboneCell));
    if (!thin_backbone) {
        return BLASTERR_MEMORY;
    }

    /* it will store 1-based offsets, hence length + 1 */
    offsets = calloc(query->length + 1, sizeof(Int4));
    if (!offsets) {
        /* NOTE(review): thin_backbone is leaked on this path */
        return BLASTERR_MEMORY;
    }

    /* NOTE(review): a stripped call line here, presumably
       BlastHashLookupIndexQueryExactMatches(thin_backbone, */
                                          offsets,
                                          lookup->word_length,
                                          BITS_PER_NUC,
                                          lookup->lut_word_length,
                                          query, locations,
                                          lookup->hash_callback,
                                          lookup->mask,
                                          lookup->pv);

    if (locations &&
        lookup->word_length > lookup->lut_word_length &&
        s_HasMaskAtHashEnabled(query_options)) {
        lookup->masked_locations = s_SeqLocListInvert(locations, query->length);
    }
    s_BlastNaHashLookupFinalize(thin_backbone, offsets, lookup);
    sfree(thin_backbone);
    sfree(offsets);

    return 0;
}
2311 
#define COMPRESSION_RATIO
Compression ratio of nucleotide bases (4 bases in 1 byte)
Definition: blast_def.h:83
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
Definition: blast_def.h:112
Declarations of static arrays used to define some NCBI encodings to be used in a toolkit independent ...
BLAST filtering functions.
BlastSeqLoc * BlastSeqLocFree(BlastSeqLoc *loc)
Deallocate all BlastSeqLoc objects in a chain.
Definition: blast_filter.c:737
BlastSeqLoc * BlastSeqLocNew(BlastSeqLoc **head, Int4 from, Int4 to)
Create and initialize a new sequence interval.
Definition: blast_filter.c:608
#define PV_ARRAY_BTS
bits-to-shift from lookup_index to pv_array index.
Definition: blast_lookup.h:43
BackboneCell * BackboneCellFree(BackboneCell *cell)
Definition: blast_lookup.c:135
#define PV_TEST(lookup, index, shift)
Test the bit at position 'index' in the PV array bitfield within 'lookup'.
Definition: blast_lookup.h:55
#define PV_ARRAY_MASK
amount to mask off.
Definition: blast_lookup.h:44
void BlastLookupIndexQueryExactMatches(Int4 **backbone, Int4 word_length, Int4 charsize, Int4 lut_word_length, BLAST_SequenceBlk *query, BlastSeqLoc *locations)
Add all applicable query offsets to a generic lookup table.
Definition: blast_lookup.c:79
void BlastHashLookupIndexQueryExactMatches(BackboneCell *backbone, Int4 *offsets, Int4 word_length, Int4 charsize, Int4 lut_word_length, BLAST_SequenceBlk *query, BlastSeqLoc *locations, TNaLookupHashFunction hash_func, Uint4 mask, Uint4 *pv_array)
Add all applicable query offsets to a hashed lookup table.
#define PV_ARRAY_BYTES
number of BYTES in 'native' type.
Definition: blast_lookup.h:42
#define PV_SET(lookup, index, shift)
Set the bit at position 'index' in the PV array bitfield within 'lookup'.
Definition: blast_lookup.h:49
#define PV_ARRAY_TYPE
The pv_array 'native' type.
Definition: blast_lookup.h:41
#define BLASTERR_MEMORY
System error: out of memory condition.
static Int2 s_NaHashLookupCountWordsInSubject_16_1(const BLAST_SequenceBlk *sequence, BlastNaHashLookupTable *lookup, BlastSparseUint1Array *counts, Uint1 max_word_count)
Scan a subject sequence and update word counters, for 16-base words with scan step of 1.
BlastSmallNaLookupTable * BlastSmallNaLookupTableDestruct(BlastSmallNaLookupTable *lookup)
Free a small nucleotide lookup table.
Int4 BlastNaLookupTableNew(BLAST_SequenceBlk *query, BlastSeqLoc *locations, BlastNaLookupTable **lut, const LookupTableOptions *opt, const QuerySetUpOptions *query_options, Int4 lut_width)
Create a new nucleotide lookup table.
static Int4 s_BlastSmallNaLookupFinalize(Int4 **thin_backbone, BlastSmallNaLookupTable *lookup, BLAST_SequenceBlk *query)
Pack the data structures comprising a small nucleotide lookup table into their final form.
static Int2 s_FillContigMBTable(BLAST_SequenceBlk *query, BlastSeqLoc *location, BlastMBLookupTable *mb_lt, const LookupTableOptions *lookup_options, Uint1 *counts)
Fills in the hashtable and next_pos fields of BlastMBLookupTable* for the contiguous case.
static Boolean s_HasMaskAtHashEnabled(const QuerySetUpOptions *query_options)
Determine whether mask at hash is enabled from the QuerySetUpOptions.
static NaHashLookupThreadData * NaHashLookupThreadDataFree(NaHashLookupThreadData *th)
BlastMBLookupTable * BlastMBLookupTableDestruct(BlastMBLookupTable *mb_lt)
Deallocate memory used by the Mega BLAST lookup table.
static BlastSeqLoc * s_SeqLocListInvert(const BlastSeqLoc *locations, Int4 length)
Changes the list of locations into a list of the intervals between locations (the inverse).
static Int2 s_ScanSubjectForWordCounts(BlastSeqSrc *seq_src, BlastMBLookupTable *mb_lt, Uint1 *counts, Uint1 max_word_count)
Scan database sequences and count query words that appear in the database.
BlastNaHashLookupTable * BlastNaHashLookupTableDestruct(BlastNaHashLookupTable *lookup)
Free a nucleotide lookup table.
static BlastSparseUint1Array * BlastSparseUint1ArrayFree(BlastSparseUint1Array *array)
static EDiscTemplateType s_GetDiscTemplateType(Int4 weight, Uint1 length, EDiscWordType type)
Convert weight, template length and template type from input options into an MBTemplateType enum.
ELookupTableType BlastChooseNaLookupTable(const LookupTableOptions *lookup_options, Int4 approx_table_entries, Int4 max_q_off, Int4 *lut_width)
choose the type of nucleotide lookup table to be used for a blast search
Int2 BlastMBLookupTableNew(BLAST_SequenceBlk *query, BlastSeqLoc *location, BlastMBLookupTable **mb_lt_ptr, const LookupTableOptions *lookup_options, const QuerySetUpOptions *query_options, Int4 approx_table_entries, Int4 lut_width, BlastSeqSrc *seqsrc)
Create the lookup table for Mega BLAST.
Int4 BlastNaHashLookupTableNew(BLAST_SequenceBlk *query, BlastSeqLoc *locations, BlastNaHashLookupTable **lut, const LookupTableOptions *opt, const QuerySetUpOptions *query_options, BlastSeqSrc *seqsrc, Uint4 num_threads)
struct NaHashLookupThreadData NaHashLookupThreadData
static BlastSparseUint1Array * BlastSparseUint1ArrayNew(Uint4 *bitfield, Int8 len)
static Int2 s_NaHashLookupScanSubjectForWordCounts(BlastSeqSrc *seq_src, BlastNaHashLookupTable *lookup, Uint4 in_num_threads, Uint1 max_word_count)
Scan database sequences and count query words that appear in the database.
static void s_BlastNaHashLookupFinalize(BackboneCell *thin_backbone, Int4 *offsets, BlastNaHashLookupTable *lookup)
Pack the data structures comprising a nucleotide lookup table into their final form.
static Int2 s_NaHashLookupFillPV(BLAST_SequenceBlk *query, BlastSeqLoc *locations, BlastNaHashLookupTable *lookup)
static Uint4 s_Popcount(Uint4 v)
static Uint1 * BlastSparseUint1ArrayGetElement(BlastSparseUint1Array *array, Int8 index)
static Int4 BlastSparseUint1ArrayGetIndex(BlastSparseUint1Array *array, Int8 index)
#define BLAST2NA_MASK
bitfield used to detect ambiguities in uncompressed nucleotide letters
static Int2 s_FillPV(BLAST_SequenceBlk *query, BlastSeqLoc *location, BlastMBLookupTable *mb_lt, const LookupTableOptions *lookup_options)
static void s_BlastNaLookupFinalize(Int4 **thin_backbone, BlastNaLookupTable *lookup)
Pack the data structures comprising a nucleotide lookup table into their final form.
static Int2 s_NaHashLookupRemovePolyAWords(BlastNaHashLookupTable *lookup)
static Int2 s_MBCountWordsInSubject_16_1(const BLAST_SequenceBlk *sequence, BlastMBLookupTable *mb_lt, Uint1 *counts, Uint1 max_word_count)
Scan a subject sequence and update word counters, for 16-base words with scan step of 1.
#define BITS_PER_NUC
number of bits in a compressed nucleotide letter
static Int2 s_FillDiscMBTable(BLAST_SequenceBlk *query, BlastSeqLoc *location, BlastMBLookupTable *mb_lt, const LookupTableOptions *lookup_options)
Fills in the hashtable and next_pos fields of BlastMBLookupTable* for the discontiguous case.
static NaHashLookupThreadData * NaHashLookupThreadDataNew(Int4 num_threads, BlastNaHashLookupTable *lookup, BlastSeqSrc *seq_src)
struct BlastSparseUint1Array BlastSparseUint1Array
Sparse array of Uint1 implemented with a bitfield.
static Int2 s_RemovePolyAWords(BlastMBLookupTable *mb_lt)
Int4 BlastSmallNaLookupTableNew(BLAST_SequenceBlk *query, BlastSeqLoc *locations, BlastSmallNaLookupTable **lut, const LookupTableOptions *opt, const QuerySetUpOptions *query_options, Int4 lut_width)
Create a new small nucleotide lookup table.
static Uint4 FNV_hash(Uint1 *seq, Uint4 mask)
BlastNaLookupTable * BlastNaLookupTableDestruct(BlastNaLookupTable *lookup)
Free a nucleotide lookup table.
Routines for creating nucleotide BLAST lookup tables.
#define NA_OFFSETS_PER_HASH
EDiscWordType
General types of discontiguous word templates.
@ eMBWordOptimal
@ eMBWordCoding
@ eMBWordTwoTemplates
#define NA_WORDS_PER_HASH
static NCBI_INLINE Int4 ComputeDiscontiguousIndex(Uint8 accum, EDiscTemplateType template_type)
Given an accumulator containing packed bases, compute the discontiguous word index specified by templ...
EDiscTemplateType
Enumeration of all discontiguous word templates; the enumerated values encode the weight,...
@ eDiscTemplate_12_18_Optimal
@ eDiscTemplate_11_18_Optimal
@ eDiscTemplateContiguous
@ eDiscTemplate_12_16_Optimal
@ eDiscTemplate_12_16_Coding
@ eDiscTemplate_11_21_Coding
@ eDiscTemplate_11_18_Coding
@ eDiscTemplate_12_21_Coding
@ eDiscTemplate_11_16_Optimal
@ eDiscTemplate_11_21_Optimal
@ eDiscTemplate_12_21_Optimal
@ eDiscTemplate_12_18_Coding
@ eDiscTemplate_11_16_Coding
#define NA_HITS_PER_CELL
maximum number of hits in one lookup table cell
Boolean SBlastFilterOptionsMaskAtHash(const SBlastFilterOptions *filter_options)
Queries whether masking should be done only for the lookup table or for the entire search.
ELookupTableType
Types of the lookup table.
@ eSmallNaLookupTable
lookup table for blastn with small query
@ eNaLookupTable
blastn lookup table
@ eMBLookupTable
megablast lookup table (includes both contiguous and discontiguous megablast)
@ eNaHashLookupTable
used for 16-base words
Boolean Blast_ProgramIsMapping(EBlastProgramType p)
Definition: blast_program.c:76
Int4 BlastSeqSrcIteratorNext(const BlastSeqSrc *seq_src, BlastSeqSrcIterator *itr)
Increments the BlastSeqSrcIterator.
Definition: blast_seqsrc.c:425
BlastSeqSrcIterator * BlastSeqSrcIteratorFree(BlastSeqSrcIterator *itr)
Frees the BlastSeqSrcIterator structure.
Definition: blast_seqsrc.c:412
BlastSeqSrcIterator * BlastSeqSrcIteratorNewEx(unsigned int chunk_sz)
Allocate and initialize an iterator over a BlastSeqSrc.
Definition: blast_seqsrc.c:387
void BlastSeqSrcReleaseSequence(const BlastSeqSrc *seq_src, BlastSeqSrcGetSeqArg *getseq_arg)
Deallocate individual sequence.
Definition: blast_seqsrc.c:289
BlastSeqSrc * BlastSeqSrcCopy(const BlastSeqSrc *seq_src)
Copy function: needed to guarantee thread safety.
Definition: blast_seqsrc.c:138
Int4 BlastSeqSrcGetNumSeqs(const BlastSeqSrc *seq_src)
Get the number of sequences contained in the sequence source.
Definition: blast_seqsrc.c:177
BlastSeqSrc * BlastSeqSrcFree(BlastSeqSrc *seq_src)
Frees the BlastSeqSrc structure by invoking the destructor function set by the user-defined construct...
Definition: blast_seqsrc.c:112
Int2 BlastSeqSrcGetSequence(const BlastSeqSrc *seq_src, BlastSeqSrcGetSeqArg *getseq_arg)
Retrieve an individual sequence.
Definition: blast_seqsrc.c:271
#define BLAST_SEQSRC_EOF
No more sequences available.
Definition: blast_seqsrc.h:292
void BlastSeqSrcResetChunkIterator(BlastSeqSrc *seq_src)
Reset the internal "bookmark" of the last chunk for iteration provided by this object.
Definition: blast_seqsrc.c:436
Various auxiliary BLAST utility functions.
BLAST_SequenceBlk * BlastSequenceBlkFree(BLAST_SequenceBlk *seq_blk)
Deallocate memory for a sequence block.
Definition: blast_util.c:245
Int2 BlastCompressBlastnaSequence(BLAST_SequenceBlk *seq_blk)
Adds a specialized representation of sequence data to a sequence block.
Definition: blast_util.c:459
ncbi::TMaskedQueryRegions mask
#define head
Definition: ct_nlmzip_i.h:138
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
static int lookup(const char *name, const struct lookup_int *table)
Definition: attributes.c:50
static const char location[]
Definition: config.c:97
@ eBlastEncodingProtein
NCBIstdaa.
#define NULL
Definition: ncbistd.hpp:225
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
<!DOCTYPE HTML >< html > n< header > n< title > PubSeq Gateway Help Page</title > n< style > n th
n font weight
int i
yy_size_t n
int len
Utility functions for lookup table generation.
Int4 ilog2(Int8 x)
Integer base two logarithm.
Definition: lookup_util.c:71
#define MIN(a, b)
returns smaller of a and b.
Definition: ncbi_std.h:112
Uint1 Boolean
bool replacement for C
Definition: ncbi_std.h:94
#define TRUE
bool replacement for C indicating true.
Definition: ncbi_std.h:97
#define FALSE
bool replacement for C indicating false.
Definition: ncbi_std.h:101
#define ASSERT
macro for assert.
Definition: ncbi_std.h:107
#define MAX(a, b)
returns larger of a and b.
Definition: ncbi_std.h:117
Structure to hold a sequence.
Definition: blast_def.h:242
Int4 length
Length of sequence.
Definition: blast_def.h:246
Uint1 * sequence
Sequence used for search (could be translation).
Definition: blast_def.h:243
Thin backbone cell for nucleotide lookup table with hashed words.
Definition: blast_lookup.h:133
The lookup table structure used for Mega BLAST.
Int4 num_words_added
Number of words added to the l.t.
Int4 lut_word_length
number of letters in a lookup table word
Int4 pv_array_bts
The exponent of 2 by which pv_array is smaller than the backbone.
BlastSeqLoc * masked_locations
masked locations, only non-NULL for soft-masking.
Int4 * next_pos2
Extra positions for the second template.
Int4 * hashtable2
Array of positions for second template.
Int4 * hashtable
Array of positions.
Int4 num_unique_pos_added
Number of positions added to the l.t.
PV_ARRAY_TYPE * pv_array
Presence vector, used for quick presence check.
Boolean stride
is lookup table created with a stride
Int8 hashsize
= 4^(lut_word_length)
EDiscTemplateType template_type
Type of the discontiguous word template.
Int4 scan_step
Step size for scanning the database.
Int4 longest_chain
Largest number of query positions for a given word.
Int4 word_length
number of exact letter matches that will trigger an ungapped extension
Boolean discontiguous
Are discontiguous words used?
Int4 * next_pos
Extra positions stored here.
Boolean two_templates
Use two templates simultaneously.
EDiscTemplateType second_template_type
Type of the second discontiguous word template.
Int4 template_length
Length of the discontiguous word template.
The basic lookup table structure for blastn searches.
Used to hold a set of positions, mostly used for filtering.
Definition: blast_def.h:204
SSeqRange * ssr
location data on the sequence.
Definition: blast_def.h:206
struct BlastSeqLoc * next
next in linked list
Definition: blast_def.h:205
Structure used as the second argument to functions satisfying the GetSeqBlkFnPtr signature,...
Definition: blast_seqsrc.h:257
Int4 oid
Oid in BLAST database, index in an array of sequences, etc [in].
Definition: blast_seqsrc.h:259
EBlastEncoding encoding
Encoding of sequence, i.e.
Definition: blast_seqsrc.h:263
BLAST_SequenceBlk * seq
Sequence to return, if NULL, it should allocated by GetSeqBlkFnPtr (using BlastSeqBlkNew or BlastSetU...
Definition: blast_seqsrc.h:284
Complete type definition of Blast Sequence Source Iterator.
Complete type definition of Blast Sequence Source ADT.
Definition: blast_seqsrc.c:43
Lookup table structure for blastn searches with small queries.
Sparse array of Uint1 implemented with a bitfield.
Uint1 * values
array of values for present indices
Uint4 num_elements
number of values present in the array
Int4 * counts
cumulative number of bits set
Uint4 length
length of the bitfield
Uint4 * bitfield
bitfield with bits set for present indices
Options needed to construct a lookup table Also needed: query sequence and query length.
Int4 word_size
Determines the size of the lookup table.
Uint1 max_db_word_count
words with larger frequency in the database will be masked in the lookup table, if the db_filter opto...
Boolean db_filter
scan the database and include only words that appear in the database between 1 and 9 times (currently...
EBlastProgramType program_number
indicates blastn, blastp, etc.
Int4 mb_template_type
Type of a discontiguous word template.
Uint4 stride
number of words to skip after collecting each word
Int4 mb_template_length
Length of the discontiguous words.
Structure defining one cell of the compacted lookup table.
Int1 num_offsets[3]
number of offsets for each word if there are fewer than 3
Uint4 words[3]
words stored under this hash value
Int4 offsets[9]
offset locations for each word
Int1 num_words
number of words stored under the same hash value
BlastSeqSrcIterator ** itr
BlastSparseUint1Array ** word_counts
BlastSeqSrcGetSeqArg * seq_arg
structure defining one cell of the compacted lookup table
Options required for setting up the query sequence.
char * filter_string
DEPRECATED, filtering options above.
SBlastFilterOptions * filtering_options
structured options for all filtering offered from algo/blast/core for BLAST.
Int4 left
left endpoint of range (zero based)
Definition: blast_def.h:156
Int4 right
right endpoint of range (zero based)
Definition: blast_def.h:157
static string query
Definition: _hash_fun.h:40
Definition: type.c:6
void free(voidpf ptr)
voidp malloc(uInt size)
voidp calloc(uInt items, uInt size)
Modified on Thu Jun 13 17:33:12 2024 by modify_doxy.py rev. 669887