40 #define BLAST2NA_MASK 0xfc
43 #define BITS_PER_NUC 2
47 Int4 approx_table_entries,
Int4 max_q_off,
87 if (approx_table_entries < 250)
95 if (approx_table_entries < 8500)
102 if (approx_table_entries < 1250) {
105 }
else if (approx_table_entries < 21000) {
115 if (approx_table_entries < 1250) {
118 }
else if (approx_table_entries < 8500) {
121 }
else if (approx_table_entries < 18000) {
131 if (approx_table_entries < 12000) {
134 }
else if (approx_table_entries < 180000) {
144 if (approx_table_entries < 8500) {
147 }
else if (approx_table_entries < 18000) {
150 }
else if (approx_table_entries < 60000) {
153 }
else if (approx_table_entries < 900000) {
163 if (approx_table_entries < 8500) {
166 }
else if (approx_table_entries < 300000) {
182 (approx_table_entries >= 32767 || max_q_off >= 32768)) {
202 Int4 overflow_cells_needed = 2;
203 Int4 overflow_cursor = 2;
204 Int4 longest_chain = 0;
205 #ifdef LOOKUP_VERBOSE
206 Int4 backbone_occupancy = 0;
207 Int4 thick_backbone_occupancy = 0;
208 Int4 num_overflows = 0;
216 for (
i = 0;
i <
lookup->backbone_size;
i++) {
217 if (thin_backbone[
i] !=
NULL) {
218 Int4 num_hits = thin_backbone[
i][1];
220 overflow_cells_needed += num_hits + 1;
221 longest_chain =
MAX(longest_chain, num_hits);
231 if (overflow_cells_needed >= 32768) {
232 for (
i = 0;
i <
lookup->backbone_size;
i++)
247 lookup->longest_chain = longest_chain;
250 if (overflow_cells_needed > 0) {
257 for (
i = 0;
i <
lookup->backbone_size;
i++) {
262 if (thin_backbone[
i] ==
NULL) {
263 lookup->final_backbone[
i] = -1;
267 #ifdef LOOKUP_VERBOSE
268 backbone_occupancy++;
270 num_hits = thin_backbone[
i][1];
276 #ifdef LOOKUP_VERBOSE
277 thick_backbone_occupancy++;
279 lookup->final_backbone[
i] = thin_backbone[
i][2];
282 #ifdef LOOKUP_VERBOSE
289 lookup->final_backbone[
i] = -overflow_cursor;
290 for (j = 0; j < num_hits; j++) {
291 lookup->overflow[overflow_cursor++] =
292 thin_backbone[
i][j + 2];
298 lookup->overflow[overflow_cursor++] = -1;
305 lookup->overflow_size = overflow_cursor;
307 #ifdef LOOKUP_VERBOSE
309 printf(
"backbone size: %d\n",
lookup->backbone_size);
310 printf(
"backbone occupancy: %d (%f%%)\n", backbone_occupancy,
311 100.0 * backbone_occupancy /
lookup->backbone_size);
312 printf(
"thick_backbone occupancy: %d (%f%%)\n",
313 thick_backbone_occupancy,
314 100.0 * thick_backbone_occupancy /
lookup->backbone_size);
315 printf(
"num_overflows: %d\n", num_overflows);
316 printf(
"overflow size: %d\n", overflow_cells_needed);
317 printf(
"longest chain: %d\n", longest_chain);
341 if (stop - start > 2)
347 locations = locations->
next;
365 if ( !query_options ) {
386 Int4 **thin_backbone;
393 lookup->lut_word_length = lut_width;
418 sfree(thin_backbone);
428 if (
lookup->masked_locations)
446 Int4 overflow_cells_needed = 0;
447 Int4 overflow_cursor = 0;
448 Int4 longest_chain = 0;
450 #ifdef LOOKUP_VERBOSE
451 Int4 backbone_occupancy = 0;
452 Int4 thick_backbone_occupancy = 0;
453 Int4 num_overflows = 0;
469 for (
i = 0;
i <
lookup->backbone_size;
i++) {
470 if (thin_backbone[
i] !=
NULL) {
471 Int4 num_hits = thin_backbone[
i][1];
473 overflow_cells_needed += num_hits;
474 longest_chain =
MAX(longest_chain, num_hits);
478 lookup->longest_chain = longest_chain;
481 if (overflow_cells_needed > 0) {
487 for (
i = 0;
i <
lookup->backbone_size;
i++) {
492 if (thin_backbone[
i] ==
NULL)
495 #ifdef LOOKUP_VERBOSE
496 backbone_occupancy++;
498 num_hits = thin_backbone[
i][1];
499 lookup->thick_backbone[
i].num_used = num_hits;
508 #ifdef LOOKUP_VERBOSE
509 thick_backbone_occupancy++;
511 for (j = 0; j < num_hits; j++) {
512 lookup->thick_backbone[
i].payload.entries[j] =
513 thin_backbone[
i][j + 2];
517 #ifdef LOOKUP_VERBOSE
520 lookup->thick_backbone[
i].payload.overflow_cursor =
522 for (j = 0; j < num_hits; j++) {
523 lookup->overflow[overflow_cursor] =
524 thin_backbone[
i][j + 2];
533 lookup->overflow_size = overflow_cursor;
535 #ifdef LOOKUP_VERBOSE
536 printf(
"backbone size: %d\n",
lookup->backbone_size);
537 printf(
"backbone occupancy: %d (%f%%)\n", backbone_occupancy,
538 100.0 * backbone_occupancy /
lookup->backbone_size);
539 printf(
"thick_backbone occupancy: %d (%f%%)\n",
540 thick_backbone_occupancy,
541 100.0 * thick_backbone_occupancy /
lookup->backbone_size);
542 printf(
"num_overflows: %d\n", num_overflows);
543 printf(
"overflow size: %d\n", overflow_cells_needed);
544 printf(
"longest chain: %d\n", longest_chain);
555 Int4 **thin_backbone;
562 lookup->lut_word_length = lut_width;
582 sfree(thin_backbone);
590 if (
lookup->masked_locations)
613 }
else if (length == 18) {
618 }
else if (length == 21) {
624 }
else if (
weight == 12) {
630 }
else if (length == 18) {
635 }
else if (length == 21) {
670 Int4 template_length;
680 const Int4 kCompressionFactor=2048;
705 int temp_int = template_type + 1;
706 second_template_type =
715 helper_array2 ==
NULL)
741 from = loc->
ssr->
left - (template_length - 2);
742 to = loc->
ssr->
right - (template_length - 2);
744 pos = seq + template_length;
746 for (index = from; index <= to; index++) {
752 pos = seq + template_length;
761 #ifdef LOOKUP_VERBOSE
769 #ifdef LOOKUP_VERBOSE
772 PV_SET(pv_array, ecode1, pv_array_bts);
775 helper_array[ecode1/kCompressionFactor]++;
787 #ifdef LOOKUP_VERBOSE
790 PV_SET(pv_array, ecode2, pv_array_bts);
793 helper_array2[ecode2/kCompressionFactor]++;
801 for (index = 0; index < mb_lt->
hashsize / kCompressionFactor; index++)
802 longest_chain =
MAX(longest_chain, helper_array[index]);
808 for (index = 0; index < mb_lt->
hashsize / kCompressionFactor; index++)
809 longest_chain =
MAX(longest_chain, helper_array2[index]);
811 sfree(helper_array2);
858 seq =
query->sequence_start + from;
859 pos = seq + kLutWordLength;
863 from -= kLutWordLength - 2;
864 last_offset = to + 2;
866 for (index = from; index <= last_offset; index++) {
872 pos = seq + kLutWordLength;
881 PV_SET(pv_array, ecode, pv_array_bts);
901 if (word_size < 16) {
908 for (
i = 1;
i < 4;
i++) {
910 for (k = 0;k < word_size;k++) {
916 for (
i = 0;
i < 3;
i++) {
917 for (k = 0;k < word_size;k++) {
918 word = ((0xffffffff ^ (3 << k*2)) | (
i << k*2)) & 0xffffffff;
960 const Int4 kCompressionFactor=2048;
976 if (helper_array ==
NULL)
1001 if (lookup_options->
stride > 0) {
1002 shift = lookup_options->
stride - 1;
1003 pos_shift = kLutWordLength + 1;
1012 seq =
query->sequence_start + from;
1013 pos = seq + kLutWordLength;
1017 from -= kLutWordLength - 2;
1018 last_offset = to + 2;
1020 for (index = from; index <= last_offset; index++) {
1026 pos = seq + kLutWordLength;
1039 if ((counts[ecode / 2] >> 4) >= max_word_count) {
1044 if ((counts[ecode / 2] & 0xf) >= max_word_count) {
1065 #ifdef LOOKUP_VERBOSE
1070 #ifdef LOOKUP_VERBOSE
1073 PV_SET(pv_array, ecode, pv_array_bts);
1076 helper_array[ecode/kCompressionFactor]++;
1084 pos = seq + pos_shift;
1093 for (index = 0; index < mb_lt->
hashsize / kCompressionFactor; index++)
1094 longest_chain =
MAX(longest_chain, helper_array[index]);
1097 sfree(helper_array);
1113 Uint1 max_word_count)
1118 Int8 word, index, w;
1119 const Int4 kNumWords
1126 if (!sequence || !counts || !mb_lt || !pv) {
1135 w = (
Int8)s[0] << 24 | (
Int8)s[1] << 16 | (
Int8)s[2] << 8 | s[3];
1136 for (
i = 0;
i < kNumWords;
i++) {
1147 word = (w >> shift) &
mask;
1150 if (!
PV_TEST(pv, word, pv_array_bts)) {
1157 if ((counts[index] & 0xf) < max_word_count) {
1162 if ((counts[index] >> 4) < max_word_count) {
1163 counts[index] += 1 << 4;
1182 Uint1 max_word_count)
1188 if (!seq_src || !pv || !counts) {
1192 memset(&seq_arg, 0,
sizeof(seq_arg));
1219 Int4 approx_table_entries,
1226 const Int4 kTargetPVSize = 131072;
1227 const Int4 kSmallQueryCutoff = 15000;
1228 const Int4 kLargeQueryCutoff = 800000;
1239 if (mb_lt ==
NULL) {
1275 if (mb_lt->
hashsize <= 8 * kTargetPVSize)
1287 (approx_table_entries <= kSmallQueryCutoff ||
1288 approx_table_entries >= kLargeQueryCutoff)) {
1289 pv_size = pv_size / 2;
1303 if (counts ==
NULL) {
1332 if (lookup_options->
db_filter && counts) {
1344 #ifdef LOOKUP_VERBOSE
1345 printf(
"lookup table size: %ld (%d letters)\n", mb_lt->
hashsize,
1350 printf(
"PV array size: %d bytes (%ld table entries/bit)\n",
1380 const Uint4 fnv_prime = 16777619u;
1381 const Uint4 fnv_offset_basis = 2166136261u;
1385 hash = fnv_offset_basis;
1386 for (
i = 0;
i < 4;
i++) {
1402 Int4 lut_word_length;
1404 const Int4 pv_array_bts =
lookup->pv_array_bts;
1408 word_length =
lookup->word_length;
1409 lut_word_length =
lookup->lut_word_length;
1413 for (loc = locations; loc; loc = loc->
next) {
1431 seq =
query->sequence + from;
1432 pos = seq + lut_word_length - 1;
1433 end =
query->sequence + to + 1;
1435 for (; seq < end; seq++) {
1442 pos = seq + lut_word_length;
1467 v = v - ((v >> 1) & 0x55555555);
1468 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
1469 v = ((v + (v >> 4)) & 0xF0F0F0F);
1495 if (
array->values) {
1499 if (
array->counts) {
1514 if (!retval || !bitfield) {
1561 bit_count = (idx > 0) ?
array->counts[idx - 1] : 0;
1565 bit_count +=
s_Popcount(
array->bitfield[idx] & ((1 << bit_number) - 1));
1570 return bit_count - 1;
1584 ASSERT(sparse_index < array->num_elements);
1585 if (sparse_index < 0 || sparse_index >
array->num_elements) {
1589 return array->values + sparse_index;
1604 Uint1 max_word_count)
1610 const Int4 kNumWords
1618 if (!sequence || !counts || !
lookup || !pv) {
1630 w = (
Int8)s[0] << 24 | (
Int8)s[1] << 16 | (
Int8)s[2] << 8 | s[3];
1631 for (
i = 0;
i < kNumWords;
i++) {
1642 word = (w >> shift) &
mask;
1645 if (!
PV_TEST(pv, word, pv_array_bts)) {
1651 if (*pelem < max_word_count) {
1681 for (
i = 0;
i <
th->num_threads;
i++) {
1689 for (
i = 0;
i <
th->num_threads;
i++) {
1697 for (
i = 0;
i <
th->num_threads;
i++) {
1703 if (
th->word_counts) {
1705 for (
i = 1;
i <
th->num_threads;
i++) {
1706 if (
th->word_counts[
i]) {
1707 if (
th->word_counts[
i]->values) {
1708 free(
th->word_counts[
i]->values);
1731 if (num_threads < 1 || !
lookup || !seq_src) {
1764 for (
i = 0;
i < num_threads;
i++) {
1777 if (!retval->
itr[
i]) {
1784 1LL << (2 *
lookup->lut_word_length));
1829 Uint4 in_num_threads,
1830 Uint1 max_word_count)
1834 Int4 num_db_seqs, th_batch;
1848 num_threads =
MIN(in_num_threads, num_db_seqs);
1860 #pragma omp parallel for if (num_threads > 1) num_threads(num_threads) \
1861 default(none) shared(num_threads, th_data, lookup, \
1862 th_batch, max_word_count) private(i) \
1863 schedule(dynamic, 1)
1865 for (
i = 0;
i < num_threads;
i++) {
1867 for (j = 0;j < th_batch;j++) {
1869 #pragma omp critical (get_sequence_for_word_counts)
1907 for (k = 1;k < num_threads;k++) {
1920 while (i < th_data->word_counts[0]->length) {
1930 ASSERT(k < th_data->word_counts[0]->num_elements);
1971 pv_array_bts =
lookup->pv_array_bts;
1972 word_size =
lookup->lut_word_length;
1976 pv[0xffffffff >> pv_array_bts] &=
1980 for (
i = 1;
i < 4;
i++) {
1982 for (k = 0;k < word_size;k++) {
1983 pv[word >> pv_array_bts] &=
1989 for (
i = 0;
i < 3;
i++) {
1990 for (k = 0;k < word_size;k++) {
1991 word = ((0xffffffff ^ (3 << k*2)) | (
i << k*2)) & 0xffffffff;
1993 pv[word >> pv_array_bts] &=
2012 Int4 overflow_cells_needed = 0;
2013 Int4 overflow_cursor = 0;
2014 Int4 longest_chain = 0;
2016 const Int4 pv_array_bts =
lookup->pv_array_bts;
2017 const Int8 kNumWords = 1LL << (2 *
lookup->lut_word_length);
2018 #ifdef LOOKUP_VERBOSE
2019 Int4 backbone_occupancy = 0;
2020 Int4 thick_backbone_occupancy = 0;
2021 Int4 num_overflows = 0;
2022 Int4 words_per_hash[5] = {0,};
2036 memset(
lookup->pv, 0, (kNumWords >>
lookup->pv_array_bts) *
2049 for (
i = 0;
i <
lookup->backbone_size;
i++) {
2053 if (
b->num_offsets > 0) {
2054 for (;
b;
b =
b->next) {
2055 num_hits +=
b->num_offsets;
2063 overflow_cells_needed += num_hits + (num_words * 2);
2065 longest_chain =
MAX(longest_chain, num_hits);
2068 lookup->longest_chain = longest_chain;
2071 if (overflow_cells_needed > 0) {
2077 for (
i = 0;
i <
lookup->backbone_size;
i++) {
2080 Int4 num_offsets = 0;
2086 if (
head->num_offsets == 0) {
2090 #ifdef LOOKUP_VERBOSE
2091 thick_backbone_occupancy++;
2098 num_offsets +=
b->num_offsets;
2100 #ifdef LOOKUP_VERBOSE
2101 backbone_occupancy++;
2106 #ifdef LOOKUP_VERBOSE
2107 words_per_hash[((num_words < 6) ? num_words : 5) - 1]++;
2117 for (
b =
head;
b;
b =
b->next, k++) {
2119 cell->
words[k] =
b->word;
2136 for (
b =
head;
b;
b =
b->next, k++) {
2137 cell->
words[k] =
b->word;
2148 #ifdef LOOKUP_VERBOSE
2151 cell->
offsets[0] = overflow_cursor;
2154 lookup->overflow[overflow_cursor++] = *(
Int4*)(&
b->word);
2155 lookup->overflow[overflow_cursor++] =
b->num_offsets;
2160 lookup->overflow[overflow_cursor++] = j - 1;
2164 ASSERT(overflow_cursor <= overflow_cells_needed);
2172 lookup->offsets_size = overflow_cursor;
2174 #ifdef LOOKUP_VERBOSE
2175 printf(
"backbone size: %d\n",
lookup->backbone_size);
2176 printf(
"backbone occupancy: %d (%f%%)\n", backbone_occupancy,
2177 100.0 * backbone_occupancy /
lookup->backbone_size);
2178 printf(
"thick_backbone occupancy: %d (%f%%)\n",
2179 thick_backbone_occupancy,
2180 100.0 * thick_backbone_occupancy /
lookup->backbone_size);
2181 printf(
"num_overflows: %d\n", num_overflows);
2182 printf(
"\tnumber of words per hash\tcount\n");
2185 for (ii = 0;ii < 5;ii++) {
2186 printf(
"\t%d\t%d\n", ii + 1, words_per_hash[ii]);
2189 printf(
"overflow size: %d\n", overflow_cells_needed);
2190 printf(
"longest chain: %d\n", longest_chain);
2200 if (
lookup->masked_locations)
2223 const Int8 kNumWords = (1ULL << 32);
2224 Int4 num_hash_bits = 8;
2225 Int4 i, num_unique_words = 0;
2234 lookup->lut_word_length = 16;
2266 for (
i = 0;i < kNumWords >>
lookup->pv_array_bts;
i++) {
2271 while (num_hash_bits < 32 &&
2272 (1LL << num_hash_bits) < num_unique_words) {
2276 lookup->backbone_size = 1 << num_hash_bits;
2280 if (!thin_backbone) {
2306 sfree(thin_backbone);
static int lookup(const char *name, const struct lookup_int *table)
#define COMPRESSION_RATIO
Compression ratio of nucleotide bases (4 bases in 1 byte)
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
Declarations of static arrays used to define some NCBI encodings to be used in a toolkit independent ...
BLAST filtering functions.
BlastSeqLoc * BlastSeqLocFree(BlastSeqLoc *loc)
Deallocate all BlastSeqLoc objects in a chain.
BlastSeqLoc * BlastSeqLocNew(BlastSeqLoc **head, Int4 from, Int4 to)
Create and initialize a new sequence interval.
#define PV_ARRAY_BTS
bits-to-shift from lookup_index to pv_array index.
BackboneCell * BackboneCellFree(BackboneCell *cell)
#define PV_TEST(lookup, index, shift)
Test the bit at position 'index' in the PV array bitfield within 'lookup'.
#define PV_ARRAY_MASK
amount to mask off.
void BlastLookupIndexQueryExactMatches(Int4 **backbone, Int4 word_length, Int4 charsize, Int4 lut_word_length, BLAST_SequenceBlk *query, BlastSeqLoc *locations)
Add all applicable query offsets to a generic lookup table.
void BlastHashLookupIndexQueryExactMatches(BackboneCell *backbone, Int4 *offsets, Int4 word_length, Int4 charsize, Int4 lut_word_length, BLAST_SequenceBlk *query, BlastSeqLoc *locations, TNaLookupHashFunction hash_func, Uint4 mask, Uint4 *pv_array)
Add all applicable query offsets to a hashed lookup table.
#define PV_ARRAY_BYTES
number of BYTES in 'native' type.
#define PV_SET(lookup, index, shift)
Set the bit at position 'index' in the PV array bitfield within 'lookup'.
#define PV_ARRAY_TYPE
The pv_array 'native' type.
#define BLASTERR_MEMORY
System error: out of memory condition.
static Int2 s_NaHashLookupCountWordsInSubject_16_1(const BLAST_SequenceBlk *sequence, BlastNaHashLookupTable *lookup, BlastSparseUint1Array *counts, Uint1 max_word_count)
Scan a subject sequecne and update words counters, for 16-base words with scan step of 1.
BlastSmallNaLookupTable * BlastSmallNaLookupTableDestruct(BlastSmallNaLookupTable *lookup)
Free a small nucleotide lookup table.
Int4 BlastNaLookupTableNew(BLAST_SequenceBlk *query, BlastSeqLoc *locations, BlastNaLookupTable **lut, const LookupTableOptions *opt, const QuerySetUpOptions *query_options, Int4 lut_width)
Create a new nucleotide lookup table.
static Int4 s_BlastSmallNaLookupFinalize(Int4 **thin_backbone, BlastSmallNaLookupTable *lookup, BLAST_SequenceBlk *query)
Pack the data structures comprising a small nucleotide lookup table into their final form.
static Int2 s_FillContigMBTable(BLAST_SequenceBlk *query, BlastSeqLoc *location, BlastMBLookupTable *mb_lt, const LookupTableOptions *lookup_options, Uint1 *counts)
Fills in the hashtable and next_pos fields of BlastMBLookupTable* for the contiguous case.
static Boolean s_HasMaskAtHashEnabled(const QuerySetUpOptions *query_options)
Determine whether mask at hash is enabled from the QuerySetUpOptions.
static NaHashLookupThreadData * NaHashLookupThreadDataFree(NaHashLookupThreadData *th)
BlastMBLookupTable * BlastMBLookupTableDestruct(BlastMBLookupTable *mb_lt)
Deallocate memory used by the Mega BLAST lookup table.
static BlastSeqLoc * s_SeqLocListInvert(const BlastSeqLoc *locations, Int4 length)
Changes the list of locations into a list of the intervals between locations (the inverse).
static Int2 s_ScanSubjectForWordCounts(BlastSeqSrc *seq_src, BlastMBLookupTable *mb_lt, Uint1 *counts, Uint1 max_word_count)
Scan database sequences and count query words that appear in the database.
BlastNaHashLookupTable * BlastNaHashLookupTableDestruct(BlastNaHashLookupTable *lookup)
Free a nucleotide lookup table.
static BlastSparseUint1Array * BlastSparseUint1ArrayFree(BlastSparseUint1Array *array)
static EDiscTemplateType s_GetDiscTemplateType(Int4 weight, Uint1 length, EDiscWordType type)
Convert weight, template length and template type from input options into an MBTemplateType enum.
ELookupTableType BlastChooseNaLookupTable(const LookupTableOptions *lookup_options, Int4 approx_table_entries, Int4 max_q_off, Int4 *lut_width)
choose the type of nucleotide lookup table to be used for a blast search
Int2 BlastMBLookupTableNew(BLAST_SequenceBlk *query, BlastSeqLoc *location, BlastMBLookupTable **mb_lt_ptr, const LookupTableOptions *lookup_options, const QuerySetUpOptions *query_options, Int4 approx_table_entries, Int4 lut_width, BlastSeqSrc *seqsrc)
Create the lookup table for Mega BLAST.
Int4 BlastNaHashLookupTableNew(BLAST_SequenceBlk *query, BlastSeqLoc *locations, BlastNaHashLookupTable **lut, const LookupTableOptions *opt, const QuerySetUpOptions *query_options, BlastSeqSrc *seqsrc, Uint4 num_threads)
struct NaHashLookupThreadData NaHashLookupThreadData
static BlastSparseUint1Array * BlastSparseUint1ArrayNew(Uint4 *bitfield, Int8 len)
static Int2 s_NaHashLookupScanSubjectForWordCounts(BlastSeqSrc *seq_src, BlastNaHashLookupTable *lookup, Uint4 in_num_threads, Uint1 max_word_count)
Scan database sequences and count query words that appear in the database.
static void s_BlastNaHashLookupFinalize(BackboneCell *thin_backbone, Int4 *offsets, BlastNaHashLookupTable *lookup)
Pack the data structures comprising a nucleotide lookup table into their final form.
static Int2 s_NaHashLookupFillPV(BLAST_SequenceBlk *query, BlastSeqLoc *locations, BlastNaHashLookupTable *lookup)
static Uint4 s_Popcount(Uint4 v)
static Uint1 * BlastSparseUint1ArrayGetElement(BlastSparseUint1Array *array, Int8 index)
static Int4 BlastSparseUint1ArrayGetIndex(BlastSparseUint1Array *array, Int8 index)
#define BLAST2NA_MASK
bitfield used to detect ambiguities in uncompressed nucleotide letters
static Int2 s_FillPV(BLAST_SequenceBlk *query, BlastSeqLoc *location, BlastMBLookupTable *mb_lt, const LookupTableOptions *lookup_options)
static void s_BlastNaLookupFinalize(Int4 **thin_backbone, BlastNaLookupTable *lookup)
Pack the data structures comprising a nucleotide lookup table into their final form.
static Int2 s_NaHashLookupRemovePolyAWords(BlastNaHashLookupTable *lookup)
static Int2 s_MBCountWordsInSubject_16_1(const BLAST_SequenceBlk *sequence, BlastMBLookupTable *mb_lt, Uint1 *counts, Uint1 max_word_count)
Scan a subject sequecne and update words counters, for 16-base words with scan step of 1.
#define BITS_PER_NUC
number of bits in a compressed nucleotide letter
static Int2 s_FillDiscMBTable(BLAST_SequenceBlk *query, BlastSeqLoc *location, BlastMBLookupTable *mb_lt, const LookupTableOptions *lookup_options)
Fills in the hashtable and next_pos fields of BlastMBLookupTable* for the discontiguous case.
static NaHashLookupThreadData * NaHashLookupThreadDataNew(Int4 num_threads, BlastNaHashLookupTable *lookup, BlastSeqSrc *seq_src)
struct BlastSparseUint1Array BlastSparseUint1Array
Sparse array of Uint1 implemented with a bitfield.
static Int2 s_RemovePolyAWords(BlastMBLookupTable *mb_lt)
Int4 BlastSmallNaLookupTableNew(BLAST_SequenceBlk *query, BlastSeqLoc *locations, BlastSmallNaLookupTable **lut, const LookupTableOptions *opt, const QuerySetUpOptions *query_options, Int4 lut_width)
Create a new small nucleotide lookup table.
static Uint4 FNV_hash(Uint1 *seq, Uint4 mask)
BlastNaLookupTable * BlastNaLookupTableDestruct(BlastNaLookupTable *lookup)
Free a nucleotide lookup table.
Routines for creating nucleotide BLAST lookup tables.
#define NA_OFFSETS_PER_HASH
EDiscWordType
General types of discontiguous word templates.
#define NA_WORDS_PER_HASH
static NCBI_INLINE Int4 ComputeDiscontiguousIndex(Uint8 accum, EDiscTemplateType template_type)
Given an accumulator containing packed bases, compute the discontiguous word index specified by templ...
EDiscTemplateType
Enumeration of all discontiguous word templates; the enumerated values encode the weight,...
@ eDiscTemplate_12_18_Optimal
@ eDiscTemplate_11_18_Optimal
@ eDiscTemplateContiguous
@ eDiscTemplate_12_16_Optimal
@ eDiscTemplate_12_16_Coding
@ eDiscTemplate_11_21_Coding
@ eDiscTemplate_11_18_Coding
@ eDiscTemplate_12_21_Coding
@ eDiscTemplate_11_16_Optimal
@ eDiscTemplate_11_21_Optimal
@ eDiscTemplate_12_21_Optimal
@ eDiscTemplate_12_18_Coding
@ eDiscTemplate_11_16_Coding
#define NA_HITS_PER_CELL
maximum number of hits in one lookup table cell
Boolean SBlastFilterOptionsMaskAtHash(const SBlastFilterOptions *filter_options)
Queries whether masking should be done only for the lookup table or for the entire search.
ELookupTableType
Types of the lookup table.
@ eSmallNaLookupTable
lookup table for blastn with small query
@ eNaLookupTable
blastn lookup table
@ eMBLookupTable
megablast lookup table (includes both contiguous and discontiguous megablast)
@ eNaHashLookupTable
used for 16-base words
Boolean Blast_ProgramIsMapping(EBlastProgramType p)
Int4 BlastSeqSrcIteratorNext(const BlastSeqSrc *seq_src, BlastSeqSrcIterator *itr)
Increments the BlastSeqSrcIterator.
BlastSeqSrcIterator * BlastSeqSrcIteratorFree(BlastSeqSrcIterator *itr)
Frees the BlastSeqSrcIterator structure.
BlastSeqSrcIterator * BlastSeqSrcIteratorNewEx(unsigned int chunk_sz)
Allocate and initialize an iterator over a BlastSeqSrc.
void BlastSeqSrcReleaseSequence(const BlastSeqSrc *seq_src, BlastSeqSrcGetSeqArg *getseq_arg)
Deallocate individual sequence.
BlastSeqSrc * BlastSeqSrcCopy(const BlastSeqSrc *seq_src)
Copy function: needed to guarantee thread safety.
Int4 BlastSeqSrcGetNumSeqs(const BlastSeqSrc *seq_src)
Get the number of sequences contained in the sequence source.
BlastSeqSrc * BlastSeqSrcFree(BlastSeqSrc *seq_src)
Frees the BlastSeqSrc structure by invoking the destructor function set by the user-defined construct...
Int2 BlastSeqSrcGetSequence(const BlastSeqSrc *seq_src, BlastSeqSrcGetSeqArg *getseq_arg)
Retrieve an individual sequence.
#define BLAST_SEQSRC_EOF
No more sequences available.
void BlastSeqSrcResetChunkIterator(BlastSeqSrc *seq_src)
Reset the internal "bookmark" of the last chunk for iteration provided by this object.
Various auxiliary BLAST utility functions.
BLAST_SequenceBlk * BlastSequenceBlkFree(BLAST_SequenceBlk *seq_blk)
Deallocate memory for a sequence block.
Int2 BlastCompressBlastnaSequence(BLAST_SequenceBlk *seq_blk)
Adds a specialized representation of sequence data to a sequence block.
static const char location[]
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
@ eBlastEncodingProtein
NCBIstdaa.
uint8_t Uint1
1-byte (8-bit) unsigned integer
int16_t Int2
2-byte (16-bit) signed integer
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
int64_t Int8
8-byte (64-bit) signed integer
uint64_t Uint8
8-byte (64-bit) unsigned integer
<!DOCTYPE HTML >< html > n< header > n< title > PubSeq Gateway Help Page</title > n< style > n th
Utility functions for lookup table generation.
Int4 ilog2(Int8 x)
Integer base two logarithm.
#define MIN(a, b)
returns smaller of a and b.
Uint1 Boolean
bool replacment for C
#define ASSERT
macro for assert.
#define MAX(a, b)
returns larger of a and b.
Structure to hold a sequence.
Int4 length
Length of sequence.
Uint1 * sequence
Sequence used for search (could be translation).
Thin backbone cell for nucleotide lookup table with hashed words.
The lookup table structure used for Mega BLAST.
Int4 num_words_added
Number of words added to the l.t.
Int4 lut_word_length
number of letters in a lookup table word
Int4 pv_array_bts
The exponent of 2 by which pv_array is smaller than the backbone.
BlastSeqLoc * masked_locations
masked locations, only non-NULL for soft-masking.
Int4 * next_pos2
Extra positions for the second template.
Int4 * hashtable2
Array of positions for second template.
Int4 * hashtable
Array of positions.
Int4 num_unique_pos_added
Number of positions added to the l.t.
PV_ARRAY_TYPE * pv_array
Presence vector, used for quick presence check.
Boolean stride
is lookup table created with a stride
Int8 hashsize
= 4^(lut_word_length)
EDiscTemplateType template_type
Type of the discontiguous word template.
Int4 scan_step
Step size for scanning the database.
Int4 longest_chain
Largest number of query positions for a given word.
Int4 word_length
number of exact letter matches that will trigger an ungapped extension
Boolean discontiguous
Are discontiguous words used?
Int4 * next_pos
Extra positions stored here.
Boolean two_templates
Use two templates simultaneously.
EDiscTemplateType second_template_type
Type of the second discontiguous word template.
Int4 template_length
Length of the discontiguous word template.
The basic lookup table structure for blastn searches.
Used to hold a set of positions, mostly used for filtering.
SSeqRange * ssr
location data on the sequence.
struct BlastSeqLoc * next
next in linked list
Structure used as the second argument to functions satisfying the GetSeqBlkFnPtr signature,...
Int4 oid
Oid in BLAST database, index in an array of sequences, etc [in].
EBlastEncoding encoding
Encoding of sequence, i.e.
BLAST_SequenceBlk * seq
Sequence to return, if NULL, it should allocated by GetSeqBlkFnPtr (using BlastSeqBlkNew or BlastSetU...
Complete type definition of Blast Sequence Source Iterator.
Complete type definition of Blast Sequence Source ADT.
Lookup table structure for blastn searches with small queries.
Sparse array of Uint1 implemented with a bitfield.
Uint1 * values
array of values for present indices
Uint4 num_elements
number of values present in the array
Int4 * counts
cumulative number of bits set
Uint4 length
length of the bitfield
Uint4 * bitfield
bitfield with bits set for present indices
Options needed to construct a lookup table Also needed: query sequence and query length.
Int4 word_size
Determines the size of the lookup table.
Uint1 max_db_word_count
words with larger frequency in the database will be masked in the lookup table, if the db_filter opto...
Boolean db_filter
scan the database and include only words that appear in the database between 1 and 9 times (currently...
EBlastProgramType program_number
indicates blastn, blastp, etc.
Int4 mb_template_type
Type of a discontiguous word template.
Uint4 stride
number of words to skip after collecting each word
Int4 mb_template_length
Length of the discontiguous words.
Structure defining one cell of the compacted lookup table.
Int1 num_offsets[3]
number of offsets for each word if there are fewer than 3
Uint4 words[3]
words stored under this hash value
Int4 offsets[9]
offset locations for each word
Int1 num_words
number of words stored under the same hash value
BlastSeqSrcIterator ** itr
BlastSparseUint1Array ** word_counts
BlastSeqSrcGetSeqArg * seq_arg
structure defining one cell of the compacted lookup table
Options required for setting up the query sequence.
char * filter_string
DEPRECATED, filtering options above.
SBlastFilterOptions * filtering_options
structured options for all filtering offered from algo/blast/core for BLAST.
Int4 left
left endpoint of range (zero based)
Int4 right
right endpoint of range (zero based)
voidp calloc(uInt items, uInt size)