NCBI C++ ToolKit
na_ungapped.c
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: na_ungapped.c 100164 2023-06-28 13:36:01Z merezhuk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * thus cannot be copyrighted. This software/database is freely available
10  * to the public for use. The National Library of Medicine and the U.S.
11  * Government have not placed any restriction on its use or reproduction.
12  *
13  * Although all reasonable efforts have been taken to ensure the accuracy
14  * and reliability of the software and data, the NLM and the U.S.
15  * Government do not and cannot warrant the performance or results that
16  * may be obtained by using this software or data. The NLM and the U.S.
17  * Government disclaim all warranties, express or implied, including
18  * warranties of performance, merchantability or fitness for any particular
19  * purpose.
20  *
21  * Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  */
26 
27 /** @file na_ungapped.c
28  * Nucleotide ungapped extension routines
29  */
30 
35 #include <algo/blast/core/blast_util.h> /* for NCBI2NA_UNPACK_BASE macros */
36 
37 #include "index_ungapped.h"
38 #include "masksubj.inl"
39 #include "jumper.h"
40 
41 /** Check to see if an index->q_pos pair exists in MB lookup table
42  * @param lookup_wrap The lookup table wrap structure [in]
43  * @param index The index to the lookup table [in]
44  * @param q_pos The offset of query to be looked up [in]
45  * @return TRUE if the pair is found to exist in the lookup table
46  */
47 static Boolean
48 s_MBLookup(const LookupTableWrap *lookup_wrap,
49  Int4 index,
50  Int4 q_pos)
51 {
52  BlastMBLookupTable* mb_lt = (BlastMBLookupTable *) lookup_wrap->lut;
53  PV_ARRAY_TYPE *pv = mb_lt->pv_array;
54  Int4 q_off;
55 
56  index &= (mb_lt->hashsize-1);
57  ++q_pos;
58 
59  if (! PV_TEST(pv, index, mb_lt->pv_array_bts)) {
60  return FALSE;
61  }
62 
63  q_off = mb_lt->hashtable[index];
64  while (q_off) {
65  if (q_off == q_pos) return TRUE;
66  q_off = mb_lt->next_pos[q_off];
67  }
68 
69  return FALSE;
70 }
71 
72 /** Check to see if an index->q_pos pair exists in SmallNa lookup table
73  * @param lookup_wrap The lookup table wrap structure [in]
74  * @param index The index to the lookup table [in]
75  * @param q_pos The offset of query to be looked up [in]
76  * @return TRUE if the pair is found to exist in the lookup table
77  */
78 static Boolean
79 s_SmallNaLookup(const LookupTableWrap *lookup_wrap,
80  Int4 index,
81  Int4 q_pos)
82 {
84  (BlastSmallNaLookupTable *) lookup_wrap->lut;
85  Int2 *overflow = lookup->overflow;
86  Int4 src_off;
87 
88  index = lookup->final_backbone[index & lookup->mask];
89 
90  if (index == q_pos) return TRUE;
91  if (index == -1 || index >= 0) return FALSE;
92 
93  src_off = -index;
94  index = overflow[src_off++];
95  do {
96  if (index == q_pos) return TRUE;
97  index = overflow[src_off++];
98  } while (index >=0);
99 
100  return FALSE;
101 }
102 
103 /** Check to see if an index->q_pos pair exists in Na lookup table
104  * @param lookup_wrap The lookup table wrap structure [in]
105  * @param index The index to the lookup table [in]
106  * @param q_pos The offset of query to be looked up [in]
107  * @return TRUE if the pair is found to exist in the lookup table
108  */
109 static Boolean
110 s_NaLookup(const LookupTableWrap *lookup_wrap,
111  Int4 index,
112  Int4 q_pos)
113 {
114  BlastNaLookupTable* lookup = (BlastNaLookupTable *) lookup_wrap->lut;
115  PV_ARRAY_TYPE *pv = lookup->pv;
116  Int4 num_hits, i;
117  Int4 *lookup_pos;
118 
119  index &= (lookup->mask);
120 
121  if (! PV_TEST(pv, index, PV_ARRAY_BTS)) {
122  return FALSE;
123  }
124 
125  num_hits = lookup->thick_backbone[index].num_used;
126 
127  lookup_pos = (num_hits <= NA_HITS_PER_CELL) ?
128  lookup->thick_backbone[index].payload.entries :
129  lookup->overflow + lookup->thick_backbone[index].payload.overflow_cursor;
130 
131  for (i=0; i<num_hits; ++i) {
132  if (lookup_pos[i] == q_pos) return TRUE;
133  }
134 
135  return FALSE;
136 }
137 
138 /** Perform ungapped extension of a word hit, using a score
139  * matrix and extending one base at a time
140  * @param query The query sequence [in]
141  * @param subject The subject sequence [in]
142  * @param matrix The scoring matrix [in]
143  * @param q_off The offset of a word in query [in]
144  * @param s_off The offset of a word in subject [in]
145  * @param X The drop-off parameter for the ungapped extension [in]
146  * @param ungapped_data The ungapped extension information [out]
147  */
148 static void
150  BLAST_SequenceBlk * subject, Int4 ** matrix,
151  Int4 q_off, Int4 s_off, Int4 X,
152  BlastUngappedData * ungapped_data)
153 {
154  Uint1 *q;
155  Int4 sum, score;
156  Uint1 ch;
157  Uint1 *subject0, *sf, *q_beg, *q_end, *s, *start;
158  Int2 remainder, base;
159  Int4 q_avail, s_avail;
160 
161  base = 3 - (s_off % 4);
162 
163  subject0 = subject->sequence;
164  q_avail = query->length - q_off;
165  s_avail = subject->length - s_off;
166 
167  q = q_beg = q_end = query->sequence + q_off;
168  s = subject0 + s_off / COMPRESSION_RATIO;
169  if (q_off < s_off) {
170  start = subject0 + (s_off - q_off) / COMPRESSION_RATIO;
171  remainder = 3 - ((s_off - q_off) % COMPRESSION_RATIO);
172  } else {
173  start = subject0;
174  remainder = 3;
175  }
176 
177  score = 0;
178  sum = 0;
179 
180 /*
181  There is a trick in the loop below that you can't see from the code.
182  X is negative and when sum becomes more negative than X we break out of
183  the loop. The reason that X is guaranteed to become negative is because
184  there is a sentinel at the beginning of the query sequence, so if you hit
185  that you get a big negative value.
186 */
187 
188  /* extend to the left */
189  while ((s > start) || (s == start && base < remainder)) {
190  if (base == 3) {
191  s--;
192  base = 0;
193  } else {
194  ++base;
195  }
196  ch = *s;
197  if ((sum += matrix[*--q][NCBI2NA_UNPACK_BASE(ch, base)]) > 0) {
198  q_beg = q;
199  score += sum;
200  sum = 0;
201  } else if (sum < X) {
202  break;
203  }
204  }
205 
206  ungapped_data->q_start = (Int4)(q_beg - query->sequence);
207  ungapped_data->s_start = s_off - (q_off - ungapped_data->q_start);
208 
209  if (q_avail < s_avail) {
210  sf = subject0 + (s_off + q_avail) / COMPRESSION_RATIO;
211  remainder = 3 - ((s_off + q_avail) % COMPRESSION_RATIO);
212  } else {
213  sf = subject0 + (subject->length) / COMPRESSION_RATIO;
214  remainder = 3 - ((subject->length) % COMPRESSION_RATIO);
215  }
216 
217  /* extend to the right */
218  q = query->sequence + q_off;
219  s = subject0 + s_off / COMPRESSION_RATIO;
220  sum = 0;
221  base = 3 - (s_off % COMPRESSION_RATIO);
222 
223  /* X_current is used to break out of loop if score goes negative */
224  Int4 X_current=X;
225  while (s < sf || (s == sf && base > remainder)) {
226  ch = *s;
227  if ((sum += matrix[*q++][NCBI2NA_UNPACK_BASE(ch, base)]) > 0) {
228  q_end = q;
229  score += sum;
230  X_current = (-score > X) ? -score : X;
231  sum = 0;
232  } else if (sum < X_current)
233  break;
234  if (base == 0) {
235  base = 3;
236  s++;
237  } else
238  base--;
239  }
240 
241  ungapped_data->length = (Int4)(q_end - q_beg);
242  ungapped_data->score = score;
243 }
244 
245 /** Perform ungapped extension of a word hit. Use an approximate method
246  * and revert to rigorous ungapped alignment if the approximate score
247  * is high enough
248  * @param query The query sequence [in]
249  * @param subject The subject sequence [in]
250  * @param matrix The scoring matrix [in]
251  * @param q_off The offset of a word in query [in]
252  * @param s_match_end The first offset in the subject sequence that
253  * is not known to exactly match the query sequence [in]
254  * @param s_off The offset of a word in subject [in]
255  * @param X The drop-off parameter for the ungapped extension [in]
256  * @param ungapped_data The ungapped extension information [out]
257  * @param score_table Array containing precompute sums of
258  * match and mismatch scores [in]
259  * @param reduced_cutoff Score beyond which a rigorous extension is used [in]
260  */
261 static void
263  BLAST_SequenceBlk * subject, Int4 ** matrix,
264  Int4 q_off, Int4 s_match_end, Int4 s_off,
265  Int4 X, BlastUngappedData * ungapped_data,
266  const Int4 * score_table, Int4 reduced_cutoff)
267 {
268  Uint1 *q_start = query->sequence;
269  Uint1 *s_start = subject->sequence;
270  Uint1 *q;
271  Uint1 *s;
272  Int4 sum, score;
273  Int4 i, len;
274  Uint1 *new_q;
275  Int4 q_ext, s_ext;
276 
277  /* The left extension begins behind (q_ext,s_ext); this is the first
278  4-base boundary after s_off. */
279 
280  len = (COMPRESSION_RATIO - (s_off % COMPRESSION_RATIO)) %
282  q_ext = q_off + len;
283  s_ext = s_off + len;
284  q = q_start + q_ext;
285  s = s_start + s_ext / COMPRESSION_RATIO;
286  len = MIN(q_ext, s_ext) / COMPRESSION_RATIO;
287 
288  score = 0;
289  sum = 0;
290  new_q = q;
291 
292  for (i = 0; i < len; s--, q -= 4, i++) {
293  Uint1 s_byte = s[-1];
294  Uint1 q_byte = (q[-4] << 6) | (q[-3] << 4) | (q[-2] << 2) | q[-1];
295 
296  sum += score_table[q_byte ^ s_byte];
297  if (sum > 0) {
298  new_q = q - 4;
299  score += sum;
300  sum = 0;
301  }
302  if (sum < X) {
303  break;
304  }
305  }
306 
307  /* record the start point of the extension */
308 
309  ungapped_data->q_start = (Int4)(new_q - q_start);
310  ungapped_data->s_start = s_ext - (q_ext - ungapped_data->q_start);
311 
312  /* the right extension begins at the first bases not examined by the
313  previous loop */
314 
315  q = q_start + q_ext;
316  s = s_start + s_ext / COMPRESSION_RATIO;
317  len = MIN(query->length - q_ext, subject->length - s_ext) /
319  sum = 0;
320  new_q = q;
321 
322  for (i = 0; i < len; s++, q += 4, i++) {
323  Uint1 s_byte = s[0];
324  Uint1 q_byte = (q[0] << 6) | (q[1] << 4) | (q[2] << 2) | q[3];
325 
326  sum += score_table[q_byte ^ s_byte];
327  if (sum > 0) {
328  new_q = q + 3;
329  score += sum;
330  sum = 0;
331  }
332  if (sum < X) {
333  break;
334  }
335  }
336 
337  if (score >= reduced_cutoff) {
338  /* the current score is high enough; throw away the alignment and
339  recompute it rigorously */
340  s_NuclUngappedExtendExact(query, subject, matrix, q_off,
341  s_off, X, ungapped_data);
342  } else {
343  /* record the length and score of the extension. Make sure the
344  alignment extends at least to s_match_end */
345  ungapped_data->score = score;
346  ungapped_data->length = (Int4)( MAX(s_match_end - ungapped_data->s_start,
347  (new_q - q_start) -
348  ungapped_data->q_start + 1) );
349  }
350 }
351 
352 /**
353  * Attempt to retrieve information associated with diagonal diag.
354  * @param table The hash table [in]
355  * @param diag The diagonal to be retrieved [in]
356  * @param level The offset of the last hit on the specified diagonal [out]
357  * @param hit_saved Whether or not the last hit on the specified diagonal was saved [out]
358  * @param hit_length length of the last hit on the specified diagonal [out]
359  * @return 1 if successful, 0 if no hit was found on the specified diagonal.
360  */
362  Int4 diag, Int4 * level,
363  Int4 * hit_len,
364  Int4 * hit_saved)
365 {
366  /* see http://lxr.linux.no/source/include/linux/hash.h */
367  /* mod operator will be strength-reduced to an and by the compiler */
368  Uint4 bucket = ((Uint4) diag * 0x9E370001) % DIAGHASH_NUM_BUCKETS;
369  Uint4 index = table->backbone[bucket];
370 
371  while (index) {
372  if (table->chain[index].diag == diag) {
373  *level = table->chain[index].level;
374  *hit_len = table->chain[index].hit_len;
375  *hit_saved = table->chain[index].hit_saved;
376  return 1;
377  }
378 
379  index = table->chain[index].next;
380  }
381  return 0;
382 }
383 
384 /**
385  * Attempt to store information associated with diagonal diag.
386  * Cleans up defunct entries along the way or allocates more space if necessary.
387  * @param table The hash table [in]
388  * @param diag The diagonal to be stored [in]
389  * @param level The offset of the hit to be stored [in]
390  * @param len The length of the hit to be stored [in]
391  * @param hit_saved Whether or not this hit was stored [in]
392  * @param s_end Needed to clean up defunct entries [in]
393  * @param window_size Needed to clean up defunct entries [in]
394  * @return 1 if successful, 0 if memory allocation failed.
395  */
397  Int4 diag, Int4 level,
398  Int4 len,
399  Int4 hit_saved,
400  Int4 s_off,
402 {
403  Uint4 bucket = ((Uint4) diag * 0x9E370001) % DIAGHASH_NUM_BUCKETS;
404  Uint4 index = table->backbone[bucket];
405  DiagHashCell *cell = NULL;
406 
407  while (index) {
408  /* if we find what we're looking for, save into it */
409  if (table->chain[index].diag == diag) {
410  table->chain[index].level = level;
411  table->chain[index].hit_len = len;
412  table->chain[index].hit_saved = hit_saved;
413  return 1;
414  }
415  /* otherwise, if this hit is stale, save into it. */
416  else {
417  /* if this hit is stale, save into it. */
418  if ( s_off - table->chain[index].level > window_size) {
419  table->chain[index].diag = diag;
420  table->chain[index].level = level;
421  table->chain[index].hit_len = len;
422  table->chain[index].hit_saved = hit_saved;
423  return 1;
424  }
425  }
426  index = table->chain[index].next;
427  }
428 
429  /* if we got this far, we were unable to replace any existing entries. */
430 
431  /* if there's no more room, allocate more */
432  if (table->occupancy == table->capacity) {
433  table->capacity *= 2;
434  table->chain =
435  realloc(table->chain, table->capacity * sizeof(DiagHashCell));
436  if (table->chain == NULL)
437  return 0;
438  }
439 
440  cell = table->chain + table->occupancy;
441  cell->diag = diag;
442  cell->level = level;
443  cell->hit_len = len;
444  cell->hit_saved = hit_saved;
445  cell->next = table->backbone[bucket];
446  table->backbone[bucket] = table->occupancy;
447  table->occupancy++;
448 
449  return 1;
450 }
451 
452 /** Test to see if seed->q_off exists in lookup table
453  * @param lookup_wrap The lookup table wrap structure [in]
454  * @param subject Subject sequence data [in]
455  * @s_off The starting offset of the seed [in]
456  * @lut_word_length The length of the lookup word [in]
457  */
458 static NCBI_INLINE Boolean
459 s_IsSeedMasked(const LookupTableWrap * lookup_wrap,
460  const BLAST_SequenceBlk * subject,
461  Int4 s_off,
462  Int4 lut_word_length,
463  Int4 q_pos)
464 {
465  Uint1 *s = subject->sequence + s_off / COMPRESSION_RATIO;
466  Int4 shift = 2* (16 - s_off % COMPRESSION_RATIO - lut_word_length);
467  Int4 index;
468  switch (shift) {
469  case 8:
470  case 10:
471  case 12:
472  case 14:
473  index = (s[0] << 24 | s[1] << 16 | s[2] << 8) >> shift;
474  break;
475  case 16:
476  case 18:
477  case 20:
478  case 22:
479  index = (s[0] << 24 | s[1] << 16 ) >> shift;
480  break;
481  case 24:
482  index = s[0];
483  break;
484  default:
485  index = (s[0] << 24 | s[1] << 16 | s[2] << 8 | s[3]) >> shift;
486  break;
487  }
488  return !(((T_Lookup_Callback)(lookup_wrap->lookup_callback))
489  (lookup_wrap, index, q_pos));
490 }
491 
492 /** Check the mini-extended word against masked query regions, and do right
493  * extension if necessary.
494  * @param query Query sequence data [in]
495  * @param subject Subject sequence data [in]
496  * @param q_off Query offset [in][out]
497  * @param s_off Subject offset [in][out]
498  * @param locations of the masked query regions [in]
499  * @param query_info of the masked query regions [in]
500  * @param s_range the open bound of subject region [in]
501  * @param word_length length of word [in]
502  * @param lut_word_length length of lookup table word [in]
503  * @param check_double check to see if it is a double word [in]
504  * @param extended if successful, the actual bases extended [out]
505  * @return 0,1,2 for non-word, single word, and double word
506  */
507 static Int4
510  Int4 *q_off, Int4 *s_off,
511  BlastSeqLoc* locations,
512  BlastQueryInfo * query_info,
513  Uint4 s_range,
514  Uint4 word_length,
515  Uint4 lut_word_length,
516  const LookupTableWrap* lookup_wrap,
517  Boolean check_double,
518  Int4 * extended)
519 {
520  Int4 context, q_range;
521  Int4 ext_to, ext_max;
522  Int4 q_end = *q_off + word_length;
523  Int4 s_end = *s_off + word_length;
524  Int4 s_pos, q_pos;
525 
526  *extended = 0;
527 
528  /* No need to check if mini-extension is not performed.
529  It turns out that we may skip checking for double-hit in 2-hit
530  algo case as well -- they will come in as 2 hits naturally*/
531  if (word_length == lut_word_length) return 1;
532 
533  /* Find the query context boundary */
534  context = BSearchContextInfo(q_end, query_info);
535  q_range = query_info->contexts[context].query_offset
536  + query_info->contexts[context].query_length;
537 
538  /* check query mask at two ends */
539  if (locations) {
540  /* check for right end first */
541  if (s_IsSeedMasked(lookup_wrap, subject,
542  s_end - lut_word_length,
543  lut_word_length,
544  q_end - lut_word_length)) return 0;
545 
546  /* search for valid left end and reposition q_off */
547  for (; TRUE; ++(*s_off), ++(*q_off)) {
548  if (!s_IsSeedMasked(lookup_wrap, subject,
549  *s_off, lut_word_length, *q_off)) break;
550  }
551  }
552 
553  ext_to = word_length - (q_end - (*q_off));
554  ext_max = MIN(q_range - q_end, s_range - s_end);
555 
556  /* shift to the right, and check query mask inside */
557  if (ext_to || locations) {
558 
559  if (ext_to > ext_max) return 0;
560  q_end += ext_to;
561  s_end += ext_to;
562 
563  for (s_pos = s_end - lut_word_length,
564  q_pos = q_end - lut_word_length;
565  s_pos > *s_off;
566  s_pos -= lut_word_length,
567  q_pos -= lut_word_length) {
568  if (s_IsSeedMasked(lookup_wrap, subject,
569  s_pos, lut_word_length, q_pos)) return 0;
570  }
571 
572  (*extended) = ext_to;
573  }
574 
575  /* if we get here, single word check is passed */
576  if (!check_double) return 1;
577 
578  /* do right extension to double word
579  Note: unlike the single word check, here we need to determine the
580  precise value of maximal possible *extend */
581  ext_to += word_length;
582  ext_max = MIN(ext_max, ext_to);
583 
584  /* try seed by seed */
585  for (s_pos = s_end, q_pos = q_end;
586  *extended + lut_word_length <= ext_max;
587  s_pos += lut_word_length,
588  q_pos += lut_word_length,
589  (*extended) += lut_word_length) {
590  if (s_IsSeedMasked(lookup_wrap, subject, s_pos,
591  lut_word_length, q_pos)) break;
592  }
593 
594  /* try base by base */
595  s_pos -= (lut_word_length - 1);
596  q_pos -= (lut_word_length - 1);
597  while (*extended < ext_max) {
598  if (s_IsSeedMasked(lookup_wrap, subject, s_pos,
599  lut_word_length, q_pos)) return 1;
600  (*extended)++;
601  ++s_pos;
602  ++q_pos;
603  }
604 
605  return ((ext_max == ext_to) ? 2 : 1);
606 }
607 
608 /** Perform ungapped extension given an offset pair, and save the initial
609  * hit information if the hit qualifies. This function assumes that the
610  * exact match has already been extended to the word size parameter. It also
611  * supports a two-hit version, where extension is performed only after two hits
612  * are detected within a window.
613  * @param query The query sequence [in]
614  * @param subject The subject sequence [in]
615  * @param q_off The offset in the query sequence [in]
616  * @param s_off The offset in the subject sequence [in]
617  * @param query_mask Structure containing query mask ranges [in]
618  * @param query_info Structure containing query context ranges [in]
619  * @param s_range Subject range [in]
620  * @param word_length The length of the hit [in]
621  * @param word_lut_length The length of the lookup table word [in]
622  * @param word_params The parameters related to initial word extension [in]
623  * @param matrix the substitution matrix for ungapped extension [in]
624  * @param diag_table Structure containing diagonal table with word extension
625  * information [in]
626  * @param init_hitlist The structure containing information about all
627  * initial hits [in] [out]
628  * @return 1 if hit was extended, 0 if not
629  */
630 static Int4
633  Int4 q_off, Int4 s_off,
634  BlastSeqLoc* query_mask,
635  BlastQueryInfo * query_info,
636  Int4 s_range,
637  Int4 word_length, Int4 lut_word_length,
638  const LookupTableWrap * lut,
639  const BlastInitialWordParameters * word_params,
640  Int4 ** matrix,
641  BLAST_DiagTable * diag_table,
642  BlastInitHitList * init_hitlist,
643  Boolean check_masks)
644 {
645  Int4 diag, real_diag;
646  Int4 s_end, s_off_pos, s_end_pos;
647  Int4 word_type = 0;
648  Int4 extended = 0;
649  BlastUngappedData *ungapped_data;
650  BlastUngappedData dummy_ungapped_data;
651  Int4 window_size = word_params->options->window_size;
652  Int4 hit_ready = 1;
653  Int4 last_hit, hit_saved;
654  DiagStruct *hit_level_array;
655  BlastUngappedCutoffs *cutoffs = NULL;
656  Boolean two_hits = (window_size > 0);
657  Boolean off_found = FALSE;
658  Int4 Delta = MIN(word_params->options->scan_range, window_size - word_length);
659 
660  hit_level_array = diag_table->hit_level_array;
661  ASSERT(hit_level_array);
662 
663  diag = s_off + diag_table->diag_array_length - q_off;
664  real_diag = diag & diag_table->diag_mask;
665  last_hit = hit_level_array[real_diag].last_hit;
666  hit_saved = hit_level_array[real_diag].flag;
667  s_end = s_off + word_length;
668  s_off_pos = s_off + diag_table->offset;
669  s_end_pos = s_end + diag_table->offset;
670 
671  /* hit within the explored area should be rejected*/
672  if (s_off_pos < last_hit) return 0;
673 
674  if (two_hits && (hit_saved || s_end_pos > last_hit + window_size )) {
675  /* check the masks for the word; also check to see if this
676  first hit qualifies for a double-word */
677  word_type = s_TypeOfWord(query, subject, &q_off, &s_off,
678  query_mask, query_info, s_range,
679  word_length, lut_word_length, lut, TRUE, &extended);
680  if (!word_type) return 0;
681  /* update the right end*/
682  s_end += extended;
683  s_end_pos += extended;
684 
685  /* for single word, also try off diagonals */
686  if (word_type == 1) {
687  /* try off-diagonals */
688  Int4 orig_diag = real_diag + diag_table->diag_array_length;
689  Int4 s_a = s_off_pos + word_length - window_size;
690  Int4 s_b = s_end_pos - 2 * word_length;
691  Int4 delta;
692  if (Delta < 0) Delta = 0;
693  for (delta = 1; delta <= Delta ; ++delta) {
694  Int4 off_diag = (orig_diag + delta) & diag_table->diag_mask;
695  Int4 off_s_end = hit_level_array[off_diag].last_hit;
696  Int4 off_s_l = diag_table->hit_len_array[off_diag];
697  if ( off_s_l
698  && off_s_end - delta >= s_a
699  && off_s_end - off_s_l <= s_b) {
700  off_found = TRUE;
701  break;
702  }
703  off_diag = (orig_diag - delta) & diag_table->diag_mask;
704  off_s_end = hit_level_array[off_diag].last_hit;
705  off_s_l = diag_table->hit_len_array[off_diag];
706  if ( off_s_l
707  && off_s_end >= s_a
708  && off_s_end - off_s_l + delta <= s_b) {
709  off_found = TRUE;
710  break;
711  }
712  }
713  if (!off_found) {
714  /* This is a new hit */
715  hit_ready = 0;
716  }
717  }
718  } else if (check_masks) {
719  /* check the masks for the word */
720  if(!s_TypeOfWord(query, subject, &q_off, &s_off,
721  query_mask, query_info, s_range,
722  word_length, lut_word_length, lut, FALSE, &extended)) return 0;
723  /* update the right end*/
724  s_end += extended;
725  s_end_pos += extended;
726  }
727 
728  if (hit_ready) {
729  if (word_params->ungapped_extension) {
730  Int4 context = BSearchContextInfo(q_off, query_info);
731  cutoffs = word_params->cutoffs + context;
732  ungapped_data = &dummy_ungapped_data;
733 
734  /*
735  * Skip use of the scoring table and go straight to the matrix
736  * based extension if matrix_only_scoring is set. Used by
737  * app rmblastn.
738  * -RMH-
739  */
740  if ( word_params->options->program_number == eBlastTypeBlastn &&
741  (word_params->matrix_only_scoring || word_length < 11))
742  {
743  s_NuclUngappedExtendExact(query, subject, matrix, q_off,
744  s_off, -(cutoffs->x_dropoff), ungapped_data);
745  }else {
746  s_NuclUngappedExtend(query, subject, matrix, q_off, s_end, s_off,
747  -(cutoffs->x_dropoff), ungapped_data,
748  word_params->nucl_score_table,
749  cutoffs->reduced_nucl_cutoff_score);
750  }
751 
752  if (off_found || ungapped_data->score >= cutoffs->cutoff_score) {
753  BlastUngappedData *final_data =
755  *final_data = *ungapped_data;
756  BLAST_SaveInitialHit(init_hitlist, q_off, s_off, final_data);
757  s_end_pos = ungapped_data->length + ungapped_data->s_start
758  + diag_table->offset;
759  } else {
760  hit_ready = 0;
761  }
762  } else {
763  ungapped_data = NULL;
764  BLAST_SaveInitialHit(init_hitlist, q_off, s_off, ungapped_data);
765  }
766  }
767 
768  hit_level_array[real_diag].last_hit = s_end_pos;
769  hit_level_array[real_diag].flag = hit_ready;
770  if (two_hits) {
771  diag_table->hit_len_array[real_diag] = (hit_ready) ? 0 : s_end_pos - s_off_pos;
772  }
773 
774  return hit_ready;
775 }
776 
777 /** Perform ungapped extension given an offset pair, and save the initial
778  * hit information if the hit qualifies. This function assumes that the
779  * exact match has already been extended to the word size parameter. It also
780  * supports a two-hit version, where extension is performed only after two hits
781  * are detected within a window.
782  * @param query The query sequence [in]
783  * @param subject The subject sequence [in]
784  * @param q_off The offset in the query sequence [in]
785  * @param s_off The offset in the subject sequence [in]
786  * @param query_mask Structure containing query mask ranges [in]
787  * @param query_info Structure containing query context ranges [in]
788  * @param s_range Subject range [in]
789  * @param word_length The length of the hit [in]
790  * @param word_lut_length The length of the lookup table word [in]
791  * @param word_params The parameters related to initial word extension [in]
792  * @param matrix the substitution matrix for ungapped extension [in]
793  * @param hash_table Structure containing initial hits [in] [out]
794  * @param init_hitlist The structure containing information about all
795  * initial hits [in] [out]
796  * @return 1 if hit was extended, 0 if not
797  */
798 static Int4
801  Int4 q_off, Int4 s_off,
802  BlastSeqLoc* query_mask,
803  BlastQueryInfo * query_info,
804  Int4 s_range,
805  Int4 word_length, Int4 lut_word_length,
806  const LookupTableWrap * lut,
807  const BlastInitialWordParameters * word_params,
808  Int4 ** matrix,
809  BLAST_DiagHash * hash_table,
810  BlastInitHitList * init_hitlist,
811  Boolean check_masks)
812 {
813  Int4 diag;
814  Int4 s_end, s_off_pos, s_end_pos, s_l;
815  Int4 word_type = 0;
816  Int4 extended = 0;
817  BlastUngappedData *ungapped_data;
818  BlastUngappedData dummy_ungapped_data;
819  Int4 window_size = word_params->options->window_size;
820  Int4 hit_ready = 1;
821  Int4 last_hit, hit_saved = 0;
822  BlastUngappedCutoffs *cutoffs = NULL;
823  Boolean two_hits = (window_size > 0);
824  Boolean off_found = FALSE;
825  Int4 Delta = MIN(word_params->options->scan_range, window_size - word_length);
826  Int4 rc;
827 
828  diag = s_off - q_off;
829  s_end = s_off + word_length;
830  s_off_pos = s_off + hash_table->offset;
831  s_end_pos = s_end + hash_table->offset;
832 
833  rc = s_BlastDiagHashRetrieve(hash_table, diag, &last_hit, &s_l, &hit_saved);
834 
835  /* if there is no record in hashtable, we set last_hit to be a very negative number */
836  if(!rc) last_hit = 0;
837 
838  /* hit within the explored area should be rejected*/
839  if (s_off_pos < last_hit) return 0;
840 
841  if (two_hits && (hit_saved || s_end_pos > last_hit + window_size )) {
842  /* check the masks for the word; also check to see if this
843  first hit qualifies for a double-hit */
844  word_type = s_TypeOfWord(query, subject, &q_off, &s_off,
845  query_mask, query_info, s_range,
846  word_length, lut_word_length, lut, TRUE, &extended);
847  if (!word_type) return 0;
848  /* update the right end*/
849  s_end += extended;
850  s_end_pos += extended;
851 
852  /* for single word, also try off diagonals */
853  if (word_type == 1) {
854  /* try off-diagonal */
855  Int4 s_a = s_off_pos + word_length - window_size;
856  Int4 s_b = s_end_pos - 2 * word_length;
857  Int4 delta;
858  if (Delta < 0) Delta = 0;
859  for (delta = 1; delta <= Delta; ++delta) {
860  Int4 off_s_end = 0;
861  Int4 off_s_l = 0;
862  Int4 off_hit_saved = 0;
863  Int4 off_rc = s_BlastDiagHashRetrieve(hash_table, diag + delta,
864  &off_s_end, &off_s_l, &off_hit_saved);
865  if ( off_rc
866  && off_s_l
867  && off_s_end - delta >= s_a
868  && off_s_end - off_s_l <= s_b) {
869  off_found = TRUE;
870  break;
871  }
872  off_rc = s_BlastDiagHashRetrieve(hash_table, diag - delta,
873  &off_s_end, &off_s_l, &off_hit_saved);
874  if ( off_rc
875  && off_s_l
876  && off_s_end >= s_a
877  && off_s_end - off_s_l + delta <= s_b) {
878  off_found = TRUE;
879  break;
880  }
881  }
882  if (!off_found) {
883  /* This is a new hit */
884  hit_ready = 0;
885  }
886  }
887  } else if (check_masks) {
888  /* check the masks for the word */
889  if (!s_TypeOfWord(query, subject, &q_off, &s_off,
890  query_mask, query_info, s_range,
891  word_length, lut_word_length, lut, FALSE, &extended)) return 0;
892  /* update the right end*/
893  s_end += extended;
894  s_end_pos += extended;
895  }
896 
897  if (hit_ready) {
898  if (word_params->ungapped_extension) {
899  /* Perform ungapped extension */
900  Int4 context = BSearchContextInfo(q_off, query_info);
901  cutoffs = word_params->cutoffs + context;
902  ungapped_data = &dummy_ungapped_data;
903 
904  /*
905  * Skip use of the scoring table and go straight to the matrix
906  * based extension if matrix_only_scoring is set. Used by
907  * app rmblastn.
908  * -RMH-
909  */
910  if ( word_params->options->program_number == eBlastTypeBlastn &&
911  (word_params->matrix_only_scoring || word_length < 11))
912  {
913  s_NuclUngappedExtendExact(query, subject, matrix, q_off,
914  s_off, -(cutoffs->x_dropoff), ungapped_data);
915  }else {
916  s_NuclUngappedExtend(query, subject, matrix, q_off, s_end,
917  s_off, -(cutoffs->x_dropoff),
918  ungapped_data,
919  word_params->nucl_score_table,
920  cutoffs->reduced_nucl_cutoff_score);
921  }
922 
923  if (off_found || ungapped_data->score >= cutoffs->cutoff_score) {
924  BlastUngappedData *final_data =
926  *final_data = *ungapped_data;
927  BLAST_SaveInitialHit(init_hitlist, q_off, s_off, final_data);
928  s_end_pos = ungapped_data->length + ungapped_data->s_start
929  + hash_table->offset;
930  } else {
931  hit_ready = 0;
932  }
933  } else {
934  ungapped_data = NULL;
935  BLAST_SaveInitialHit(init_hitlist, q_off, s_off, ungapped_data);
936  }
937  }
938 
939  s_BlastDiagHashInsert(hash_table, diag, s_end_pos,
940  (hit_ready) ? 0 : s_end_pos - s_off_pos,
941  hit_ready, s_off_pos, window_size + Delta + 1);
942 
943  return hit_ready;
944 }
945 
946 /** Perform ungapped extensions on the hits retrieved from
947  * blastn/megablast lookup tables, skipping the mini-extension process
948  * @param offset_pairs Array of query and subject offsets. [in]
949  * @param num_hits Size of the above arrays [in]
950  * @param word_params Parameters for word extension [in]
951  * @param lookup_wrap Lookup table wrapper structure [in]
952  * @param query Query sequence data [in]
953  * @param subject Subject sequence data [in]
954  * @param matrix Scoring matrix for ungapped extension [in]
955  * @param query_info Structure containing query context ranges [in]
956  * @param ewp Word extension structure containing information about the
957  * extent of already processed hits on each diagonal [in]
958  * @param init_hitlist Structure to keep the extended hits.
959  * Must be allocated outside of this function [in] [out]
960  * @param s_range The subject range [in]
961  * @return Number of hits extended.
962  */
963 static Int4
964 s_BlastNaExtendDirect(const BlastOffsetPair * offset_pairs, Int4 num_hits,
965  const BlastInitialWordParameters * word_params,
966  LookupTableWrap * lookup_wrap,
968  BLAST_SequenceBlk * subject, Int4 ** matrix,
969  BlastQueryInfo * query_info,
970  Blast_ExtendWord * ewp,
971  BlastInitHitList * init_hitlist,
972  Uint4 s_range)
973 {
974  Int4 index = 0;
975  Int4 hits_extended = 0;
976  Int4 word_length;
977  Boolean check_masks = TRUE;
978 
979  if (lookup_wrap->lut_type == eMBLookupTable) {
980  BlastMBLookupTable *lut = (BlastMBLookupTable *) lookup_wrap->lut;
981  word_length = (lut->discontiguous) ? lut->template_length : lut->word_length;
982  ASSERT(word_length == lut->lut_word_length || lut->discontiguous);
983  check_masks = !lut->stride;
984  }
985  else if (lookup_wrap->lut_type == eSmallNaLookupTable) {
987  (BlastSmallNaLookupTable *) lookup_wrap->lut;
988  word_length = lut->word_length;
989  }
990  else {
991  BlastNaLookupTable *lut = (BlastNaLookupTable *) lookup_wrap->lut;
992  word_length = lut->word_length;
993  }
994 
995  if (word_params->container_type == eDiagHash) {
996  for (; index < num_hits; ++index) {
997  Int4 s_offset = offset_pairs[index].qs_offsets.s_off;
998  Int4 q_offset = offset_pairs[index].qs_offsets.q_off;
999 
1000  hits_extended += s_BlastnDiagHashExtendInitialHit(query, subject,
1001  q_offset, s_offset,
1002  NULL,
1003  query_info, s_range,
1004  word_length, word_length,
1005  lookup_wrap,
1006  word_params, matrix,
1007  ewp->hash_table,
1008  init_hitlist,
1009  check_masks);
1010  }
1011  }
1012  else {
1013  for (; index < num_hits; ++index) {
1014  Int4 s_offset = offset_pairs[index].qs_offsets.s_off;
1015  Int4 q_offset = offset_pairs[index].qs_offsets.q_off;
1016 
1017  hits_extended += s_BlastnDiagTableExtendInitialHit(query, subject,
1018  q_offset, s_offset,
1019  NULL,
1020  query_info, s_range,
1021  word_length, word_length,
1022  lookup_wrap,
1023  word_params, matrix,
1024  ewp->diag_table,
1025  init_hitlist,
1026  check_masks);
1027  }
1028  }
1029  return hits_extended;
1030 }
1031 
1032 /** Perform exact match extensions on the hits retrieved from
1033  * blastn/megablast lookup tables, assuming an arbitrary number of bases
1034  * in a lookup and arbitrary start offset of each hit. Also
1035  * update the diagonal structure.
1036  * @param offset_pairs Array of query and subject offsets [in]
1037  * @param num_hits Size of the above arrays [in]
1038  * @param word_params Parameters for word extension [in]
1039  * @param lookup_wrap Lookup table wrapper structure [in]
1040  * @param query Query sequence data [in]
1041  * @param subject Subject sequence data [in]
1042  * @param matrix Scoring matrix for ungapped extension [in]
1043  * @param query_info Structure containing query context ranges [in]
1044  * @param ewp Word extension structure containing information about the
1045  * extent of already processed hits on each diagonal [in]
1046  * @param init_hitlist Structure to keep the extended hits.
1047  * Must be allocated outside of this function [in] [out]
1048  * @param s_range The subject range [in]
1049  * @return Number of hits extended.
1050  */
1051 static Int4
1052 s_BlastNaExtend(const BlastOffsetPair * offset_pairs, Int4 num_hits,
1053  const BlastInitialWordParameters * word_params,
1054  LookupTableWrap * lookup_wrap,
1056  BLAST_SequenceBlk * subject, Int4 ** matrix,
1057  BlastQueryInfo * query_info,
1058  Blast_ExtendWord * ewp,
1059  BlastInitHitList * init_hitlist,
1060  Uint4 s_range)
1061 {
1062  Int4 index = 0;
1063  Int4 hits_extended = 0;
1064  Int4 word_length, lut_word_length, ext_to;
1065  BlastSeqLoc* masked_locations = NULL;
1066  Boolean check_masks = TRUE;
1067 
1068  if (lookup_wrap->lut_type == eMBLookupTable) {
1069  BlastMBLookupTable *lut = (BlastMBLookupTable *) lookup_wrap->lut;
1070  word_length = lut->word_length;
1071  lut_word_length = lut->lut_word_length;
1072  masked_locations = lut->masked_locations;
1073  check_masks = !lut->stride;
1074  }
1075  else {
1076  BlastNaLookupTable *lut = (BlastNaLookupTable *) lookup_wrap->lut;
1077  word_length = lut->word_length;
1078  lut_word_length = lut->lut_word_length;
1079  masked_locations = lut->masked_locations;
1080  }
1081  ext_to = word_length - lut_word_length;
1082 
1083  /* We trust that the bases of the hit itself are exact matches,
1084  and look only for exact matches before and after the hit.
1085 
1086  Most of the time, the lookup table width is close to the word size
1087  so only a few bases need examining. Also, most of the time (for
1088  random sequences) extensions will fail almost immediately (the
1089  first base examined will not match about 3/4 of the time). Thus it
1090  is critical to reduce as much as possible all work that is not the
1091  actual examination of sequence data */
1092 
1093  for (; index < num_hits; ++index) {
1094  Int4 s_offset = offset_pairs[index].qs_offsets.s_off;
1095  Int4 q_offset = offset_pairs[index].qs_offsets.q_off;
1096 
1097  /* begin with the left extension; the initialization is slightly
1098  faster. Point to the first base of the lookup table hit and
1099  work backwards */
1100 
1101  Int4 ext_left = 0;
1102  Int4 s_off = s_offset;
1103  Uint1 *q = query->sequence + q_offset;
1104  Uint1 *s = subject->sequence + s_off / COMPRESSION_RATIO;
1105 
1106  for (; ext_left < MIN(ext_to, s_offset); ++ext_left) {
1107  s_off--;
1108  q--;
1109  if (s_off % COMPRESSION_RATIO == 3)
1110  s--;
1111  if (((Uint1) (*s << (2 * (s_off % COMPRESSION_RATIO))) >> 6)
1112  != *q)
1113  break;
1114  }
1115 
1116  /* do the right extension if the left extension did not find all
1117  the bases required. Begin at the first base beyond the lookup
1118  table hit and move forwards */
1119 
1120  if (ext_left < ext_to) {
1121  Int4 ext_right = 0;
1122  s_off = s_offset + lut_word_length;
1123  if (s_off + ext_to - ext_left > s_range)
1124  continue;
1125  q = query->sequence + q_offset + lut_word_length;
1126  s = subject->sequence + s_off / COMPRESSION_RATIO;
1127 
1128  for (; ext_right < ext_to - ext_left; ++ext_right) {
1129  if (((Uint1) (*s << (2 * (s_off % COMPRESSION_RATIO))) >>
1130  6) != *q)
1131  break;
1132  s_off++;
1133  q++;
1134  if (s_off % COMPRESSION_RATIO == 0)
1135  s++;
1136  }
1137 
1138  /* check if enough extra matches were found */
1139  if (ext_left + ext_right < ext_to)
1140  continue;
1141  }
1142 
1143  q_offset -= ext_left;
1144  s_offset -= ext_left;
1145  /* check the diagonal on which the hit lies. The boundaries
1146  extend from the first match of the hit to one beyond the last
1147  match */
1148 
1149  if (word_params->container_type == eDiagHash) {
1150  hits_extended += s_BlastnDiagHashExtendInitialHit(query, subject,
1151  q_offset, s_offset,
1152  masked_locations,
1153  query_info, s_range,
1154  word_length, lut_word_length,
1155  lookup_wrap,
1156  word_params, matrix,
1157  ewp->hash_table,
1158  init_hitlist,
1159  check_masks);
1160  } else {
1161  hits_extended += s_BlastnDiagTableExtendInitialHit(query, subject,
1162  q_offset, s_offset,
1163  masked_locations,
1164  query_info, s_range,
1165  word_length, lut_word_length,
1166  lookup_wrap,
1167  word_params, matrix,
1168  ewp->diag_table,
1169  init_hitlist,
1170  check_masks);
1171  }
1172  }
1173  return hits_extended;
1174 }
1175 
1176 /** Perform exact match extensions on the hits retrieved from
1177  * blastn/megablast lookup tables, assuming the number of bases in a lookup
1178  * table word, and the start offset of each hit, is a multiple
1179  * of 4. Also update the diagonal structure.
1180  * @param offset_pairs Array of query and subject offsets. [in]
1181  * @param num_hits Size of the above arrays [in]
1182  * @param word_params Parameters for word extension [in]
1183  * @param lookup_wrap Lookup table wrapper structure [in]
1184  * @param query Query sequence data [in]
1185  * @param subject Subject sequence data [in]
1186  * @param matrix Scoring matrix for ungapped extension [in]
1187  * @param query_info Structure containing query context ranges [in]
1188  * @param ewp Word extension structure containing information about the
1189  * extent of already processed hits on each diagonal [in]
1190  * @param init_hitlist Structure to keep the extended hits.
1191  * Must be allocated outside of this function [in] [out]
1192  * @param s_range The subject range [in]
1193  * @return Number of hits extended.
1194  */
1195 static Int4
1196 s_BlastNaExtendAligned(const BlastOffsetPair * offset_pairs, Int4 num_hits,
1197  const BlastInitialWordParameters * word_params,
1198  LookupTableWrap * lookup_wrap,
1200  Int4 ** matrix, BlastQueryInfo * query_info,
1201  Blast_ExtendWord * ewp,
1202  BlastInitHitList * init_hitlist,
1203  Uint4 s_range)
1204 {
1205  Int4 index = 0;
1206  Int4 hits_extended = 0;
1207  Int4 word_length, lut_word_length, ext_to;
1208  BlastSeqLoc* masked_locations = NULL;
1209  Boolean check_masks = TRUE;
1210 
1211  if (lookup_wrap->lut_type == eMBLookupTable) {
1212  BlastMBLookupTable *lut = (BlastMBLookupTable *) lookup_wrap->lut;
1213  word_length = lut->word_length;
1214  lut_word_length = lut->lut_word_length;
1215  masked_locations = lut->masked_locations;
1216  check_masks = !lut->stride;
1217  }
1218  else {
1219  BlastNaLookupTable *lut = (BlastNaLookupTable *) lookup_wrap->lut;
1220  word_length = lut->word_length;
1221  lut_word_length = lut->lut_word_length;
1222  masked_locations = lut->masked_locations;
1223  }
1224  ext_to = word_length - lut_word_length;
1225 
1226  /* We trust that the bases of the hit itself are exact matches,
1227  and look only for exact matches before and after the hit.
1228 
1229  Most of the time, the lookup table width is close to the word size
1230  so only a few bases need examining. Also, most of the time (for
1231  random sequences) extensions will fail almost immediately (the
1232  first base examined will not match about 3/4 of the time). Thus it
1233  is critical to reduce as much as possible all work that is not the
1234  actual examination of sequence data */
1235 
1236  for (; index < num_hits; ++index) {
1237  Int4 s_offset = offset_pairs[index].qs_offsets.s_off;
1238  Int4 q_offset = offset_pairs[index].qs_offsets.q_off;
1239 
1240  /* begin with the left extension; the initialization is slightly
1241  faster. q below points to the first base of the lookup table hit
1242  and s points to the first four bases of the hit (which is
1243  guaranteed to be aligned on a byte boundary) */
1244 
1245  Int4 ext_left = 0;
1246  Int4 ext_max = MIN(ext_to, s_offset);
1247  Uint1 *q = query->sequence + q_offset;
1248  Uint1 *s = subject->sequence + s_offset / COMPRESSION_RATIO;
1249 
1250  for (; ext_left < ext_max; s--, q -= 4, ++ext_left) {
1251  Uint1 byte = s[-1];
1252 
1253  if ((byte & 3) != q[-1] || ++ext_left == ext_max)
1254  break;
1255  if (((byte >> 2) & 3) != q[-2] || ++ext_left == ext_max)
1256  break;
1257  if (((byte >> 4) & 3) != q[-3] || ++ext_left == ext_max)
1258  break;
1259  if ((byte >> 6) != q[-4])
1260  break;
1261  }
1262 
1263  /* do the right extension if the left extension did not find all the
1264  bases required. Begin at the first base past the lookup table hit
1265  and move forwards */
1266 
1267  if (ext_left < ext_to) {
1268  Int4 ext_right = 0;
1269  ext_max = ext_to -ext_left;
1270  if (s_offset + lut_word_length + ext_max > s_range)
1271  continue;
1272  q = query->sequence + q_offset + lut_word_length;
1273  s = subject->sequence + (s_offset + lut_word_length) / COMPRESSION_RATIO;
1274 
1275  for (; ext_right < ext_max; s++, q += 4, ++ext_right) {
1276  Uint1 byte = s[0];
1277 
1278  if ((byte >> 6) != q[0] || ++ext_right == ext_max)
1279  break;
1280  if (((byte >> 4) & 3) != q[1] || ++ext_right == ext_max)
1281  break;
1282  if (((byte >> 2) & 3) != q[2] || ++ext_right == ext_max)
1283  break;
1284  if ((byte & 3) != q[3])
1285  break;
1286  }
1287 
1288  /* check if enough extra matches were found */
1289  if (ext_left + ext_right < ext_to)
1290  continue;
1291  }
1292 
1293  q_offset -= ext_left;
1294  s_offset -= ext_left;
1295 
1296  /* check the diagonal on which the hit lies. The boundaries extend
1297  from the first match of the hit to one beyond the last match */
1298 
1299  if (word_params->container_type == eDiagHash) {
1300  hits_extended += s_BlastnDiagHashExtendInitialHit(query, subject,
1301  q_offset, s_offset,
1302  masked_locations,
1303  query_info, s_range,
1304  word_length, lut_word_length,
1305  lookup_wrap,
1306  word_params, matrix,
1307  ewp->hash_table,
1308  init_hitlist,
1309  check_masks);
1310  } else {
1311  hits_extended += s_BlastnDiagTableExtendInitialHit(query, subject,
1312  q_offset, s_offset,
1313  masked_locations,
1314  query_info, s_range,
1315  word_length, lut_word_length,
1316  lookup_wrap,
1317  word_params, matrix,
1318  ewp->diag_table,
1319  init_hitlist,
1320  check_masks);
1321  }
1322  }
1323  return hits_extended;
1324 }
1325 
1326 /** Entry i of this list gives the number of pairs of
1327  * bits that are zero in the bit pattern of i, looking
1328  * from right to left
1329  */
1330 static const Uint1 s_ExactMatchExtendLeft[256] = {
1331 4, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
1332 2, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
1333 2, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
1334 2, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
1335 3, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
1336 2, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
1337 2, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
1338 2, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
1339 3, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
1340 2, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
1341 2, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
1342 2, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
1343 3, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
1344 2, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
1345 2, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
1346 2, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
1347 };
1348 
1349 /** Entry i of this list gives the number of pairs of
1350  * bits that are zero in the bit pattern of i, looking
1351  * from left to right
1352  */
1353 static const Uint1 s_ExactMatchExtendRight[256] = {
1354 4, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1355 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1356 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1357 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1358 };
1359 
1360 /** Perform exact match extensions on the hits retrieved from
1361  * small-query lookup tables. Assumes the number of bases in a lookup
1362  * table word and the start offset of each hit is a multiple
1363  * of 4, and also that the word size is within 4 bases of the lookup
1364  * table width
1365  * @param offset_pairs Array of query and subject offsets. [in]
1366  * @param num_hits Size of the above arrays [in]
1367  * @param word_params Parameters for word extension [in]
1368  * @param lookup_wrap Lookup table wrapper structure [in]
1369  * @param query Query sequence data [in]
1370  * @param subject Subject sequence data [in]
1371  * @param matrix Scoring matrix for ungapped extension [in]
1372  * @param query_info Structure containing query context ranges [in]
1373  * @param ewp Word extension structure containing information about the
1374  * extent of already processed hits on each diagonal [in]
1375  * @param init_hitlist Structure to keep the extended hits.
1376  * Must be allocated outside of this function [in] [out]
1377  * @param s_range The subject range [in]
1378  * @return Number of hits extended.
1379  */
1380 static Int4
1382  Int4 num_hits,
1383  const BlastInitialWordParameters * word_params,
1384  LookupTableWrap * lookup_wrap,
1386  Int4 ** matrix, BlastQueryInfo * query_info,
1387  Blast_ExtendWord * ewp,
1388  BlastInitHitList * init_hitlist,
1389  Uint4 s_range)
1390 {
1391  Int4 index = 0;
1392  Int4 hits_extended = 0;
1393  BlastSmallNaLookupTable *lut = (BlastSmallNaLookupTable *) lookup_wrap->lut;
1394  Int4 word_length = lut->word_length;
1395  Int4 lut_word_length = lut->lut_word_length;
1396  Int4 ext_to = word_length - lut_word_length;
1397  Uint1 *q = query->compressed_nuc_seq;
1398  Uint1 *s = subject->sequence;
1399 
1400  for (; index < num_hits; ++index) {
1401  Int4 s_offset = offset_pairs[index].qs_offsets.s_off;
1402  Int4 q_offset = offset_pairs[index].qs_offsets.q_off;
1403  Int4 ext_left = 0;
1404 
1405  Int4 context = BSearchContextInfo(q_offset, query_info);
1406  Int4 q_start = query_info->contexts[context].query_offset;
1407  Int4 q_range = q_start + query_info->contexts[context].query_length;
1408 
1409  /* the seed is assumed to start on a multiple of 4 bases
1410  in the subject sequence. Look for up to 4 exact matches
1411  to the left. The index into q[] below can
1412  technically be negative, but the compressed version
1413  of the query has extra pad bytes before q[0] */
1414 
1415  if ( (s_offset > 0) && (q_offset > 0) ) {
1416  Uint1 q_byte = q[q_offset - 4];
1417  Uint1 s_byte = s[s_offset / COMPRESSION_RATIO - 1];
1418  ext_left = s_ExactMatchExtendLeft[q_byte ^ s_byte];
1419  ext_left = MIN(MIN(ext_left, ext_to), q_offset - q_start);
1420  }
1421 
1422  /* look for up to 4 exact matches to the right of the seed */
1423 
1424  if ((ext_left < ext_to) && ((q_offset + lut_word_length) < query->length)) {
1425  Uint1 q_byte = q[q_offset + lut_word_length];
1426  Uint1 s_byte = s[(s_offset + lut_word_length) / COMPRESSION_RATIO];
1427  Int4 ext_right = s_ExactMatchExtendRight[q_byte ^ s_byte];
1428  ext_right = MIN(MIN(ext_right, s_range - (s_offset + lut_word_length)),
1429  q_range - (q_offset + lut_word_length));
1430  if (ext_left + ext_right < ext_to)
1431  continue;
1432  }
1433 
1434  q_offset -= ext_left;
1435  s_offset -= ext_left;
1436 
1437  if (word_params->container_type == eDiagHash) {
1439  q_offset, s_offset,
1440  lut->masked_locations,
1441  query_info, s_range,
1442  word_length, lut_word_length,
1443  lookup_wrap,
1444  word_params, matrix,
1445  ewp->hash_table,
1446  init_hitlist,
1447  TRUE);
1448  }
1449  else {
1450  hits_extended += s_BlastnDiagTableExtendInitialHit(query, subject,
1451  q_offset, s_offset,
1452  lut->masked_locations,
1453  query_info, s_range,
1454  word_length, lut_word_length,
1455  lookup_wrap,
1456  word_params, matrix,
1457  ewp->diag_table,
1458  init_hitlist,
1459  TRUE);
1460  }
1461  }
1462  return hits_extended;
1463 }
1464 
1465 
1466 /** Perform exact match extensions on the hits retrieved from
1467  * small-query blastn lookup tables, assuming an arbitrary number of bases
1468  * in a lookup and arbitrary start offset of each hit. Also
1469  * update the diagonal structure.
1470  * @param offset_pairs Array of query and subject offsets [in]
1471  * @param num_hits Size of the above arrays [in]
1472  * @param word_params Parameters for word extension [in]
1473  * @param lookup_wrap Lookup table wrapper structure [in]
1474  * @param query Query sequence data [in]
1475  * @param subject Subject sequence data [in]
1476  * @param matrix Scoring matrix for ungapped extension [in]
1477  * @param query_info Structure containing query context ranges [in]
1478  * @param ewp Word extension structure containing information about the
1479  * extent of already processed hits on each diagonal [in]
1480  * @param init_hitlist Structure to keep the extended hits.
1481  * Must be allocated outside of this function [in] [out]
1482  * @param s_range The subject range [in]
1483  * @return Number of hits extended.
1484  */
1485 static Int4
1486 s_BlastSmallNaExtend(const BlastOffsetPair * offset_pairs, Int4 num_hits,
1487  const BlastInitialWordParameters * word_params,
1488  LookupTableWrap * lookup_wrap,
1490  BLAST_SequenceBlk * subject, Int4 ** matrix,
1491  BlastQueryInfo * query_info,
1492  Blast_ExtendWord * ewp,
1493  BlastInitHitList * init_hitlist,
1494  Uint4 s_range)
1495 {
1496  Int4 index = 0;
1497  Int4 hits_extended = 0;
1498  BlastSmallNaLookupTable *lut = (BlastSmallNaLookupTable *) lookup_wrap->lut;
1499  Int4 word_length = lut->word_length;
1500  Int4 lut_word_length = lut->lut_word_length;
1501  Uint1 *q = query->compressed_nuc_seq;
1502  Uint1 *s = subject->sequence;
1503 
1504  for (; index < num_hits; ++index) {
1505  Int4 s_offset = offset_pairs[index].qs_offsets.s_off;
1506  Int4 q_offset = offset_pairs[index].qs_offsets.q_off;
1507  Int4 s_off;
1508  Int4 q_off;
1509  Int4 ext_left = 0;
1510  Int4 ext_right = 0;
1511  Int4 context = BSearchContextInfo(q_offset, query_info);
1512  Int4 q_start = query_info->contexts[context].query_offset;
1513  Int4 q_range = q_start + query_info->contexts[context].query_length;
1514  Int4 ext_max = MIN(MIN(word_length - lut_word_length, s_offset), q_offset - q_start);
1515 
1516  /* Start the extension at the first multiple of 4 bases in
1517  the subject sequence to the right of the seed.
1518  Collect exact matches in groups of four, until a
1519  mismatch is encountered or the expected number of
1520  matches is found. The index into q[] below can
1521  technically be negative, but the compressed version
1522  of the query has extra pad bytes before q[0] */
1523 
1524  Int4 rsdl = COMPRESSION_RATIO - (s_offset % COMPRESSION_RATIO);
1525  s_offset += rsdl;
1526  q_offset += rsdl;
1527  ext_max += rsdl;
1528 
1529  s_off = s_offset;
1530  q_off = q_offset;
1531 
1532  while (ext_left < ext_max) {
1533  Uint1 q_byte = q[q_off - 4];
1534  Uint1 s_byte = s[s_off / COMPRESSION_RATIO - 1];
1535  Uint1 bases = s_ExactMatchExtendLeft[q_byte ^ s_byte];
1536  ext_left += bases;
1537  if (bases < 4)
1538  break;
1539  q_off -= 4;
1540  s_off -= 4;
1541  }
1542  ext_left = MIN(ext_left, ext_max);
1543 
1544  /* extend to the right. The extension begins at the first
1545  base not examined by the left extension */
1546 
1547  s_off = s_offset;
1548  q_off = q_offset;
1549  ext_max = MIN(MIN(word_length - ext_left, s_range - s_off), q_range - q_off);
1550  while (ext_right < ext_max) {
1551  Uint1 q_byte = q[q_off];
1552  Uint1 s_byte = s[s_off / COMPRESSION_RATIO];
1553  Uint1 bases = s_ExactMatchExtendRight[q_byte ^ s_byte];
1554  ext_right += bases;
1555  if (bases < 4)
1556  break;
1557  q_off += 4;
1558  s_off += 4;
1559  }
1560  ext_right = MIN(ext_right, ext_max);
1561 
1562  if (ext_left + ext_right < word_length)
1563  continue;
1564 
1565  q_offset -= ext_left;
1566  s_offset -= ext_left;
1567 
1568  if (word_params->container_type == eDiagHash) {
1569  hits_extended += s_BlastnDiagHashExtendInitialHit(query, subject,
1570  q_offset, s_offset,
1571  lut->masked_locations,
1572  query_info, s_range,
1573  word_length, lut_word_length,
1574  lookup_wrap,
1575  word_params, matrix,
1576  ewp->hash_table,
1577  init_hitlist,
1578  TRUE);
1579  } else {
1580  hits_extended += s_BlastnDiagTableExtendInitialHit(query, subject,
1581  q_offset, s_offset,
1582  lut->masked_locations,
1583  query_info, s_range,
1584  word_length, lut_word_length,
1585  lookup_wrap,
1586  word_params, matrix,
1587  ewp->diag_table,
1588  init_hitlist,
1589  TRUE);
1590  }
1591  }
1592  return hits_extended;
1593 }
1594 
1595 /* Description in na_ungapped.h */
1596 
1599  BlastQueryInfo * query_info,
1600  LookupTableWrap * lookup_wrap,
1601  Int4 ** matrix,
1602  const BlastInitialWordParameters * word_params,
1603  Blast_ExtendWord * ewp,
1604  BlastOffsetPair * offset_pairs,
1605  Int4 max_hits,
1606  BlastInitHitList * init_hitlist,
1607  BlastUngappedStats * ungapped_stats)
1608 {
1609  Int4 hitsfound, total_hits = 0;
1610  Int4 hits_extended = 0;
1611  TNaScanSubjectFunction scansub = NULL;
1612  TNaExtendFunction extend = NULL;
1613  Int4 scan_range[3];
1614  Int4 word_length;
1615  Int4 lut_word_length;
1616 
1617  if (lookup_wrap->lut_type == eSmallNaLookupTable) {
1619  (BlastSmallNaLookupTable *) lookup_wrap->lut;
1620  word_length = lookup->word_length;
1621  lut_word_length = lookup->lut_word_length;
1622  scansub = (TNaScanSubjectFunction)lookup->scansub_callback;
1623  extend = (TNaExtendFunction)lookup->extend_callback;
1624  }
1625  else if (lookup_wrap->lut_type == eMBLookupTable) {
1627  (BlastMBLookupTable *) lookup_wrap->lut;
1628  if (lookup->discontiguous) {
1629  word_length = lookup->template_length;
1630  lut_word_length = lookup->template_length;
1631  } else {
1632  word_length = lookup->word_length;
1633  lut_word_length = lookup->lut_word_length;
1634  }
1635  scansub = (TNaScanSubjectFunction)lookup->scansub_callback;
1636  extend = (TNaExtendFunction)lookup->extend_callback;
1637  }
1638  else {
1640  (BlastNaLookupTable *) lookup_wrap->lut;
1641  word_length = lookup->word_length;
1642  lut_word_length = lookup->lut_word_length;
1643  scansub = (TNaScanSubjectFunction)lookup->scansub_callback;
1644  extend = (TNaExtendFunction)lookup->extend_callback;
1645  }
1646 
1647  scan_range[0] = 0; /* subject seq mask index */
1648  scan_range[1] = 0; /* start pos of scan */
1649  scan_range[2] = subject->length - lut_word_length; /*end pos (inclusive) of scan*/
1650 
1651  /* if sequence is masked, fall back to generic scanner and extender */
1652  if (subject->mask_type != eNoSubjMasking) {
1653  if (lookup_wrap->lut_type == eMBLookupTable &&
1654  ((BlastMBLookupTable *) lookup_wrap->lut)->discontiguous) {
1655  /* discontiguous scan subs assumes any (non-aligned starting offset */
1656  } else {
1657  scansub = (TNaScanSubjectFunction)
1659  if (extend != (TNaExtendFunction)s_BlastNaExtendDirect) {
1660  extend = (lookup_wrap->lut_type == eSmallNaLookupTable)
1663  }
1664  }
1665  /* generic scanner permits any (non-aligned) starting offset */
1666  scan_range[1] = subject->seq_ranges[0].left + word_length - lut_word_length;
1667  scan_range[2] = subject->seq_ranges[0].right - lut_word_length;
1668  }
1669 
1670  ASSERT(scansub);
1671  ASSERT(extend);
1672 
1673  while(s_DetermineScanningOffsets(subject, word_length, lut_word_length, scan_range)) {
1674 
1675  hitsfound = scansub(lookup_wrap, subject, offset_pairs, max_hits, &scan_range[1]);
1676 
1677  if (hitsfound == 0)
1678  continue;
1679 
1680  total_hits += hitsfound;
1681  hits_extended += extend(offset_pairs, hitsfound, word_params,
1682  lookup_wrap, query, subject, matrix,
1683  query_info, ewp, init_hitlist, scan_range[2] + lut_word_length);
1684  }
1685 
1686  Blast_ExtendWordExit(ewp, subject->length);
1687 
1688  Blast_UngappedStatsUpdate(ungapped_stats, total_hits, hits_extended,
1689  init_hitlist->total);
1690 
1691  if (word_params->ungapped_extension)
1692  Blast_InitHitListSortByScore(init_hitlist);
1693 
1694  return 0;
1695 }
1696 
1700  BlastQueryInfo * query_info,
1701  LookupTableWrap * lookup_wrap,
1702  Int4 ** matrix,
1703  const BlastInitialWordParameters * word_params,
1704  Blast_ExtendWord * ewp,
1705  BlastOffsetPair * offset_pairs,
1706  Int4 max_hits,
1707  BlastInitHitList * init_hitlist,
1708  BlastUngappedStats * ungapped_stats)
1709 {
1710  BlastInitHSP * hsp, * new_hsp, * hsp_end;
1711  BlastUngappedData dummy_ungapped_data;
1712  BlastUngappedData * ungapped_data = 0;
1713  ir_diag_hash * hash = 0;
1714  ir_hash_entry * e = 0;
1715  Uint4 word_size;
1716  Uint4 q_off, s_off;
1717  Uint4 diag, key;
1718  Int4 oid = subject->oid;
1719  Int4 chunk = subject->chunk;
1720  Int4 context;
1721  BlastUngappedCutoffs *cutoffs;
1722  T_MB_IdbCheckOid check_oid =
1723  (T_MB_IdbCheckOid)lookup_wrap->check_index_oid;
1725  (T_MB_IdbGetResults)lookup_wrap->read_indexed_db;
1726  Int4 last_vol_idx = LAST_VOL_IDX_NULL;
1727 
1728  /* In the case oid belongs to the non-indexed part of the
1729  database, route the call to the original word finder.
1730  */
1731  if( check_oid( oid, &last_vol_idx ) == eNotIndexed ) {
1732  return BlastNaWordFinder(
1733  subject, query, query_info, lookup_wrap, matrix,word_params,
1734  ewp, offset_pairs, max_hits, init_hitlist, ungapped_stats );
1735  }
1736 
1738  word_size = (Uint4) get_results(/*lookup_wrap->lut, */oid, chunk, init_hitlist);
1739 
1740  if( word_size > 0 && word_params->ungapped_extension ) {
1741  hash = ir_hash_create();
1742  new_hsp = hsp = init_hitlist->init_hsp_array;
1743  hsp_end = hsp + init_hitlist->total;
1744 
1745  for( ; hsp < hsp_end; ++hsp ) {
1746  q_off = hsp->offsets.qs_offsets.q_off;
1747  s_off = hsp->offsets.qs_offsets.s_off;
1748  diag = IR_DIAG( q_off, s_off );
1749  key = IR_KEY( diag );
1750  e = IR_LOCATE( hash, diag, key );
1751  if( e != 0 ) {
1752  if( q_off + word_size - 1 > e->diag_data.qend ) {
1753  context = BSearchContextInfo(q_off, query_info);
1754  cutoffs = word_params->cutoffs + context;
1756  query, subject, matrix,
1757  q_off, s_off + word_size, s_off,
1758  -(cutoffs->x_dropoff), &dummy_ungapped_data,
1759  word_params->nucl_score_table,
1760  cutoffs->reduced_nucl_cutoff_score);
1761 
1762  if( dummy_ungapped_data.score >= cutoffs->cutoff_score ) {
1763  ungapped_data =
1765  *ungapped_data = dummy_ungapped_data;
1766  if( new_hsp != hsp ) *new_hsp = *hsp;
1767  new_hsp->ungapped_data = ungapped_data;
1768  ++new_hsp;
1769  }
1770 
1771  if( e->diag_data.diag != diag ) e->diag_data.diag = diag;
1772  e->diag_data.qend = dummy_ungapped_data.q_start + dummy_ungapped_data.length - 1;
1773  }
1774  }
1775  else {
1776  if( new_hsp != hsp ) *new_hsp = *hsp;
1777  ++new_hsp;
1778  }
1779  }
1780 
1781  init_hitlist->total = (Int4)(new_hsp - init_hitlist->init_hsp_array);
1782  hash = ir_hash_destroy( hash );
1783  }
1784 
1785  if (word_params->ungapped_extension)
1786  Blast_InitHitListSortByScore(init_hitlist);
1787 
1788  return 0;
1789 }
1790 
1792 {
1793  if (lookup_wrap->lut_type == eMBLookupTable) {
1794  BlastMBLookupTable *lut;
1795  lookup_wrap->lookup_callback = (void *)s_MBLookup;
1796  lut = (BlastMBLookupTable *) lookup_wrap->lut;
1797 
1798  if (lut->lut_word_length == lut->word_length || lut->discontiguous)
1799  lut->extend_callback = (void *)s_BlastNaExtendDirect;
1800  else if (lut->lut_word_length % COMPRESSION_RATIO == 0 &&
1801  lut->scan_step % COMPRESSION_RATIO == 0)
1802  lut->extend_callback = (void *)s_BlastNaExtendAligned;
1803  else
1804  lut->extend_callback = (void *)s_BlastNaExtend;
1805  }
1806  else if (lookup_wrap->lut_type == eSmallNaLookupTable) {
1808  lookup_wrap->lookup_callback = (void *)s_SmallNaLookup;
1809  lut = (BlastSmallNaLookupTable *) lookup_wrap->lut;
1810 
1811  if (lut->lut_word_length == lut->word_length)
1812  lut->extend_callback = (void *)s_BlastNaExtendDirect;
1813  else if (lut->lut_word_length % COMPRESSION_RATIO == 0 &&
1814  lut->scan_step % COMPRESSION_RATIO == 0 &&
1815  lut->word_length - lut->lut_word_length <= 4)
1817  else
1818  lut->extend_callback = (void *)s_BlastSmallNaExtend;
1819  }
1820  else if (lookup_wrap->lut_type == eNaHashLookupTable) {
1821  lookup_wrap->lookup_callback = NULL;
1822  }
1823  else {
1824  BlastNaLookupTable *lut;
1825  lookup_wrap->lookup_callback = (void *)s_NaLookup;
1826  lut = (BlastNaLookupTable *) lookup_wrap->lut;
1827 
1828  if (lut->lut_word_length == lut->word_length)
1829  lut->extend_callback = (void *)s_BlastNaExtendDirect;
1830  else if (lut->lut_word_length % COMPRESSION_RATIO == 0 &&
1831  lut->scan_step % COMPRESSION_RATIO == 0)
1832  lut->extend_callback = (void *)s_BlastNaExtendAligned;
1833  else
1834  lut->extend_callback = (void *)s_BlastNaExtend;
1835  }
1836 }
1837 
1838 
1840 {
1841  if (wh) {
1842  if (wh->pair_arrays) {
1843  if (wh->pair_arrays[0]) {
1844  sfree(wh->pair_arrays[0]);
1845  }
1846  sfree(wh->pair_arrays);
1847  }
1848 
1849  if (wh->num) {
1850  sfree(wh->num);
1851  }
1852 
1853  if (wh->last_diag) {
1854  sfree(wh->last_diag);
1855  }
1856 
1857  if (wh->last_pos) {
1858  sfree(wh->last_pos);
1859  }
1860 
1861  sfree(wh);
1862  }
1863 
1864  return NULL;
1865 }
1866 
1868  const BlastQueryInfo* query_info)
1869 {
1870  MapperWordHits* wh = NULL;
1871  Int4 num_arrays;
1872  Int4 array_size;
1873  Int4 i;
1874 
1875  num_arrays = MAX(query_info->num_queries / 100, 1);
1876  array_size = 1000;
1877 
1878  wh = calloc(1, sizeof(MapperWordHits));
1879  if (!wh) {
1880  return NULL;
1881  }
1882 
1883  wh->pair_arrays = calloc(num_arrays, sizeof(BlastOffsetPair*));
1884  if (!wh->pair_arrays) {
1885  MapperWordHitsFree(wh);
1886  return NULL;
1887  }
1888 
1889  wh->pair_arrays[0] =
1890  malloc(num_arrays * array_size * sizeof(BlastOffsetPair));
1891  if (!wh->pair_arrays[0]) {
1892  MapperWordHitsFree(wh);
1893  return NULL;
1894  }
1895 
1896  for (i = 1;i < num_arrays;i++) {
1897  wh->pair_arrays[i] = wh->pair_arrays[0] + (i * array_size);
1898  }
1899 
1900  wh->num = calloc(num_arrays, sizeof(Int4));
1901  if (!wh->num) {
1902  MapperWordHitsFree(wh);
1903  return NULL;
1904  }
1905 
1906  wh->num_arrays = num_arrays;
1907  wh->array_size = array_size;
1908  wh->divisor = query->length / wh->num_arrays + 1;
1909 
1910  wh->last_diag = calloc(query_info->last_context + 1, sizeof(Int4));
1911  if (!wh) {
1912  MapperWordHitsFree(wh);
1913  return NULL;
1914  }
1915 
1916  wh->last_pos = malloc((query_info->last_context + 1) * sizeof(Int4));
1917  if (!wh) {
1918  MapperWordHitsFree(wh);
1919  return NULL;
1920  }
1921  for (i = 0;i < query_info->num_queries;i++) {
1922  wh->last_pos[i] = INT4_MIN;
1923  }
1924 
1925  return wh;
1926 }
1927 
1928 
1929 Int2
1932  BlastQueryInfo * query_info,
1933  LookupTableWrap * lookup_wrap,
1934  const BlastInitialWordParameters * word_params,
1935  const BlastScoringParameters* score_params,
1936  const BlastHitSavingParameters* hit_params,
1937  BlastOffsetPair * offset_pairs,
1938  MapperWordHits* word_hits,
1939  Int4 max_hits,
1940  BlastGapAlignStruct* gap_align,
1941  BlastInitHitList* init_hitlist,
1942  BlastHSPList** hsp_list_ptr,
1943  BlastUngappedStats * ungapped_stats,
1944  BlastGappedStats* gapped_stats)
1945 {
1946  Int4 hitsfound, total_hits = 0;
1947  Int4 hits_extended = 0;
1948  TNaScanSubjectFunction scansub = NULL;
1949  Int4 scan_range[3];
1950  Int4 word_length;
1951  Int4 lut_word_length;
1952  BlastHSPList* hsp_list = NULL;
1953  Boolean read_is_query = query_info->max_length < subject->length;
1954  SubjectIndex* s_index = NULL;
1955  Int4 i;
1956 
1957  if (*hsp_list_ptr == NULL) {
1958  *hsp_list_ptr = hsp_list = Blast_HSPListNew(BlastHspNumMax(TRUE,
1959  hit_params->options));
1960  }
1961  else {
1962  hsp_list = *hsp_list_ptr;
1963  }
1964 
1965  if (word_hits) {
1966  memset(word_hits->num, 0, word_hits->num_arrays * sizeof(Int4));
1967  }
1968 
1969  if (lookup_wrap->lut_type == eSmallNaLookupTable) {
1971  (BlastSmallNaLookupTable *) lookup_wrap->lut;
1972  word_length = lookup->word_length;
1973  lut_word_length = lookup->lut_word_length;
1974  scansub = (TNaScanSubjectFunction)lookup->scansub_callback;
1975  }
1976  else if (lookup_wrap->lut_type == eMBLookupTable) {
1978  (BlastMBLookupTable *) lookup_wrap->lut;
1979  if (lookup->discontiguous) {
1980  word_length = lookup->template_length;
1981  lut_word_length = lookup->template_length;
1982  } else {
1983  word_length = lookup->word_length;
1984  lut_word_length = lookup->lut_word_length;
1985  }
1986  scansub = (TNaScanSubjectFunction)lookup->scansub_callback;
1987  }
1988  else if (lookup_wrap->lut_type == eNaHashLookupTable) {
1990  (BlastNaHashLookupTable *) lookup_wrap->lut;
1991 
1992  word_length = lookup->word_length;
1993  lut_word_length = lookup->lut_word_length;
1994  scansub = (TNaScanSubjectFunction)lookup->scansub_callback;
1995  }
1996  else {
1998  (BlastNaLookupTable *) lookup_wrap->lut;
1999  word_length = lookup->word_length;
2000  lut_word_length = lookup->lut_word_length;
2001  scansub = (TNaScanSubjectFunction)lookup->scansub_callback;
2002  }
2003 
2004  scan_range[0] = 0; /* subject seq mask index */
2005  scan_range[1] = 0; /* start pos of scan */
2006  scan_range[2] = subject->length - lut_word_length; /*end pos (inclusive) of scan*/
2007 
2008  /* if sequence is masked, fall back to generic scanner and extender */
2009  if (subject->mask_type != eNoSubjMasking) {
2010  if (lookup_wrap->lut_type == eMBLookupTable &&
2011  ((BlastMBLookupTable *) lookup_wrap->lut)->discontiguous) {
2012  /* discontiguous scan subs assumes any (non-aligned starting offset */
2013  } else {
2014  scansub = (TNaScanSubjectFunction)
2016  }
2017  /* generic scanner permits any (non-aligned) starting offset */
2018  scan_range[1] = subject->seq_ranges[0].left + word_length - lut_word_length;
2019  scan_range[2] = subject->seq_ranges[0].right - lut_word_length;
2020  }
2021 
2022  ASSERT(scansub);
2023 
2024  if (word_hits) {
2025  memset(word_hits->last_pos, 0,
2026  (query_info->last_context + 1) * sizeof(Int4));
2027  }
2028 
2029  if (getenv("MAPPER_USE_SMALL_WORDS")) {
2031  }
2032  while(s_DetermineScanningOffsets(subject, word_length, lut_word_length, scan_range)) {
2033 
2034 
2035  hitsfound = scansub(lookup_wrap, subject, offset_pairs, max_hits, &scan_range[1]);
2036 
2037  if (hitsfound >= 0) {
2038 
2039  /* if the extended word hits structure is allocated */
2040  if (word_hits) {
2041 
2042  /* Distribute each word hit into lists that correspond to
2043  a group of queries, ensuring that most word hits for each
2044  query are extended in the same batch. */
2045  for (i = 0; i < hitsfound;i++) {
2046  Int4 q_off = offset_pairs[i].qs_offsets.q_off;
2047  Int4 s_off = offset_pairs[i].qs_offsets.s_off;
2048  Int4 index = q_off / word_hits->divisor;
2049 
2050  Int4 context = BSearchContextInfo(q_off, query_info);
2051  Int4 diag = s_off - q_off;
2052  Int4 last_d = word_hits->last_diag[context];
2053  Int4 last_p = word_hits->last_pos[context];
2054 
2055  word_hits->last_diag[context] = diag;
2056  word_hits->last_pos[context] = s_off;
2057 
2058  /* Discard word hits on the same diagnol and with adjacent
2059  positions, allowing up to 2 mismatches, as the last one
2060  saved. All these word hits will produce the same
2061  alignment. There is another check for this situation
2062  before extension, but doing it here helps keeping
2063  numer of word hits managable. The other check is still
2064  necessary, because depending on extension more
2065  word hits can be discarded. */
2066  if (last_p != 0 && last_d == diag &&
2067  s_off - last_p < lut_word_length + 1) {
2068 
2069  continue;
2070  }
2071  ASSERT(index < word_hits->num_arrays);
2072 
2073 
2074  /* if the word hit list for the current word hit is full,
2075  then extend hits from this list */
2076  if (word_hits->num[index] >= word_hits->array_size) {
2077  hits_extended += BlastNaExtendJumper(
2078  word_hits->pair_arrays[index],
2079  word_hits->num[index],
2080  word_params, score_params,
2081  hit_params,
2082  lookup_wrap,
2083  query, subject,
2084  query_info,
2085  gap_align,
2086  hsp_list,
2087  scan_range[2] + lut_word_length,
2088  s_index);
2089 
2090  word_hits->num[index] = 0;
2091 
2092  }
2093 
2094  /* add new word hit */
2095  word_hits->pair_arrays[index][word_hits->num[index]++] =
2096  offset_pairs[i];
2097  }
2098  }
2099  else {
2100  /* otherwise extend collected word hits */
2101 
2102  total_hits += hitsfound;
2103  hits_extended += BlastNaExtendJumper(offset_pairs, hitsfound,
2104  word_params, score_params,
2105  hit_params,
2106  lookup_wrap,
2107  query, subject,
2108  query_info,
2109  gap_align,
2110  hsp_list,
2111  scan_range[2] + lut_word_length,
2112  s_index);
2113 
2114  }
2115  }
2116 
2117  if (!read_is_query) {
2118  break;
2119  }
2120  }
2121 
2122  if (word_hits) {
2123 
2124  /* extend word hits from all lists */
2125  for (i = 0;i < word_hits->num_arrays;i++) {
2126  if (word_hits->num[i] > 0) {
2127  hits_extended += BlastNaExtendJumper(word_hits->pair_arrays[i],
2128  word_hits->num[i],
2129  word_params, score_params,
2130  hit_params,
2131  lookup_wrap,
2132  query, subject,
2133  query_info,
2134  gap_align,
2135  hsp_list,
2136  scan_range[2] + lut_word_length,
2137  s_index);
2138 
2139  }
2140  word_hits->num[i] = 0;
2141  }
2142  }
2143 
2144  Blast_UngappedStatsUpdate(ungapped_stats, total_hits, 0, 0);
2145 
2146  if (gapped_stats) {
2147  gapped_stats->extensions = hits_extended;
2148  /* this is needed for adaptive batch size */
2149  ungapped_stats->good_init_extends = hits_extended;
2150  }
2151 
2152  if (s_index) {
2153  s_index = SubjectIndexFree(s_index);
2154  }
2155 
2156  return 0;
2157 
2158 }
2159 
2160 
2161 /* read is a qurey, reference is a subject */
2165  BlastQueryInfo * query_info,
2166  LookupTableWrap * lookup_wrap,
2167  const BlastInitialWordParameters * word_params,
2168  const BlastScoringParameters* score_params,
2169  const BlastHitSavingParameters* hit_params,
2170  BlastOffsetPair * offset_pairs,
2171  MapperWordHits* word_hits,
2172  Int4 max_hits,
2173  BlastGapAlignStruct* gap_align,
2174  BlastInitHitList* init_hitlist,
2175  BlastHSPList** hsp_list,
2176  BlastUngappedStats* ungapped_stats,
2177  BlastGappedStats* gapped_stats)
2178 {
2179  BlastInitHSP * hsp, /* *new_hsp,*/ * hsp_end;
2180  ir_diag_hash * hash = 0;
2181  ir_hash_entry * e = 0;
2182  Uint4 word_size;
2183  Uint4 q_off, s_off;
2184  Uint4 diag, key;
2185  Int4 oid = subject->oid;
2186  Int4 chunk = subject->chunk;
2187  T_MB_IdbCheckOid check_oid =
2188  (T_MB_IdbCheckOid)lookup_wrap->check_index_oid;
2190  (T_MB_IdbGetResults)lookup_wrap->read_indexed_db;
2191  Int4 last_vol_idx = LAST_VOL_IDX_NULL;
2192 
2193  /* In the case oid belongs to the non-indexed part of the
2194  database, route the call to the original word finder.
2195  */
2196  if( check_oid( oid, &last_vol_idx ) == eNotIndexed ) {
2197  return JumperNaWordFinder(
2198  subject, query, query_info, lookup_wrap, word_params,
2199  score_params, hit_params, offset_pairs, word_hits, max_hits,
2200  gap_align, init_hitlist, hsp_list, ungapped_stats,
2201  gapped_stats);
2202  }
2203 
2205  word_size = (Uint4)get_results(oid, chunk, init_hitlist);
2206 
2207  if (*hsp_list == NULL) {
2208  *hsp_list = Blast_HSPListNew(BlastHspNumMax(TRUE,
2209  hit_params->options));
2210  }
2211 
2212  if( word_size > 0) {
2213 
2214  hash = ir_hash_create();
2215  hsp = init_hitlist->init_hsp_array;
2216  hsp_end = hsp + init_hitlist->total;
2217 
2218  for( ; hsp < hsp_end; ++hsp ) {
2219  q_off = hsp->offsets.qs_offsets.q_off;
2220  s_off = hsp->offsets.qs_offsets.s_off;
2221 
2222 
2223  diag = IR_DIAG( q_off, s_off );
2224  key = IR_KEY( diag );
2225  e = IR_LOCATE( hash, diag, key );
2226 
2227  if( e != 0 ) {
2228 
2229 
2230  Int4 context = BSearchContextInfo(q_off, query_info);
2231  Int4 query_start = query_info->contexts[context].query_offset;
2232  Int4 query_len = query_info->contexts[context].query_length;
2233  Uint1* query_seq = query->sequence + query_start;
2234 
2235  if( q_off + word_size - 1 > e->diag_data.qend ) {
2236 
2237  Int4 num_identical = 0;
2238  Int4 right_ungapped_ext_len = 0;
2239  gapped_stats->extensions++;
2240 
2242  subject->sequence,
2243  query_len,
2244  subject->length,
2245  q_off - query_start,
2246  s_off,
2247  gap_align,
2248  score_params,
2249  &num_identical,
2250  &right_ungapped_ext_len);
2251 
2252 
2253  if (JumperGoodAlign(gap_align, hit_params, num_identical,
2254  &query_info->contexts[context])) {
2255 
2256  BlastHSP* new_hsp;
2257  int status;
2258 
2259  gap_align->edit_script =
2261  gap_align->jumper->left_prelim_block,
2262  gap_align->jumper->right_prelim_block);
2263 
2264  status = Blast_HSPInit(gap_align->query_start,
2265  gap_align->query_stop,
2266  gap_align->subject_start,
2267  gap_align->subject_stop,
2268  q_off - query_start, s_off,
2269  context,
2270  query_info->contexts[context].frame,
2271  subject->frame,
2272  gap_align->score,
2273  &(gap_align->edit_script),
2274  &new_hsp);
2275  if (!new_hsp) {
2276  return -1;
2277  }
2278  new_hsp->map_info = BlastHSPMappingInfoNew();
2279  if (!new_hsp->map_info) {
2280  return -1;
2281  }
2282  new_hsp->num_ident = num_identical;
2283  new_hsp->evalue = 0.0;
2284  new_hsp->map_info->edits = JumperFindEdits(query_seq,
2285  subject->sequence,
2286  gap_align);
2287 
2288  if (hit_params->options->splice) {
2289  JumperFindSpliceSignals(new_hsp, query_len,
2290  subject->sequence,
2291  subject->length);
2292  }
2293 
2294  status = Blast_HSPListSaveHSP(*hsp_list, new_hsp);
2295  if (status) {
2296  break;
2297  }
2298  }
2299 
2300  if (e->diag_data.diag != diag) {
2301  e->diag_data.diag = diag;
2302  }
2303  e->diag_data.qend = q_off + right_ungapped_ext_len - 1;
2304 
2305  }
2306 
2307 
2308  }
2309 /*
2310  else {
2311  if( new_hsp != hsp ) *new_hsp = *hsp;
2312  ++new_hsp;
2313  }
2314 */
2315  }
2316 
2317  hash = ir_hash_destroy( hash );
2318  }
2319 
2320  return 0;
2321 }
2322 
2323 
@ eNoSubjMasking
Definition: blast_def.h:236
#define COMPRESSION_RATIO
Compression ratio of nucleotide bases (4 bases in 1 byte)
Definition: blast_def.h:83
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
Definition: blast_def.h:112
void Blast_UngappedStatsUpdate(BlastUngappedStats *ungapped_stats, Int4 total_hits, Int4 extended_hits, Int4 saved_hits)
Fill data in the ungapped hits diagnostics structure.
Boolean BLAST_SaveInitialHit(BlastInitHitList *init_hitlist, Int4 q_off, Int4 s_off, BlastUngappedData *ungapped_data)
Save the initial hit data into the initial hit list structure.
Definition: blast_extend.c:325
void Blast_InitHitListSortByScore(BlastInitHitList *init_hitlist)
Sort array of initial HSPs by score.
Definition: blast_extend.c:306
Int2 Blast_ExtendWordExit(Blast_ExtendWord *ewp, Int4 subject_length)
Update the word extension structure after scanning of each subject sequence.
Definition: blast_extend.c:162
#define DIAGHASH_NUM_BUCKETS
Number of hash buckets in BLAST_DiagHash.
Definition: blast_extend.h:51
Int2 Blast_HSPInit(Int4 query_start, Int4 query_end, Int4 subject_start, Int4 subject_end, Int4 query_gapped_start, Int4 subject_gapped_start, Int4 query_context, Int2 query_frame, Int2 subject_frame, Int4 score, GapEditScript **gap_edit, BlastHSP **ret_hsp)
Allocates BlastHSP and inits with information from input.
Definition: blast_hits.c:151
Int4 BlastHspNumMax(Boolean gapped_calculation, const BlastHitSavingOptions *options)
Calculated the number of HSPs that should be saved.
Definition: blast_hits.c:213
BlastHSPList * Blast_HSPListNew(Int4 hsp_max)
Creates HSP list structure with a default size HSP array.
Definition: blast_hits.c:1558
BlastHSPMappingInfo * BlastHSPMappingInfoNew(void)
Allocate memory for an HSP's additional data structure.
Definition: blast_hits.c:207
Int2 Blast_HSPListSaveHSP(BlastHSPList *hsp_list, BlastHSP *hsp)
Saves HSP information into a BlastHSPList structure.
Definition: blast_hits.c:1754
#define PV_ARRAY_BTS
bits-to-shift from lookup_index to pv_array index.
Definition: blast_lookup.h:43
#define PV_TEST(lookup, index, shift)
Test the bit at position 'index' in the PV array bitfield within 'lookup'.
Definition: blast_lookup.h:55
#define PV_ARRAY_TYPE
The pv_array 'native' type.
Definition: blast_lookup.h:41
Routines for creating nucleotide BLAST lookup tables.
#define NA_HITS_PER_CELL
maximum number of hits in one lookup table cell
Routines for scanning nucleotide BLAST lookup tables.
void * BlastChooseNucleotideScanSubjectAny(LookupTableWrap *lookup_wrap)
Return the most generic function to scan through nucleotide subject sequences.
Int4(* TNaScanSubjectFunction)(const LookupTableWrap *lookup_wrap, const BLAST_SequenceBlk *subject, BlastOffsetPair *offset_pairs, Int4 max_hits, Int4 *scan_range)
Generic prototype for nucleotide subject scanning routines.
Definition: blast_nascan.h:43
@ eSmallNaLookupTable
lookup table for blastn with small query
@ eMBLookupTable
megablast lookup table (includes both contiguous and discontiguous megablast)
@ eNaHashLookupTable
used for 16-base words
@ eDiagHash
use hash table (blastn only)
@ eBlastTypeBlastn
Definition: blast_program.h:74
Int4 BSearchContextInfo(Int4 n, const BlastQueryInfo *A)
Search BlastContextInfo structures for the specified offset.
Various auxiliary BLAST utility functions.
#define NCBI2NA_UNPACK_BASE(x, N)
Macro to extract base N from a byte x (N >= 0, N < 4)
Definition: blast_util.h:55
static ulg window_size
static void get_results(DBPROCESS *dbproc, int start)
Definition: t0006.c:17
static int lookup(const char *name, const struct lookup_int *table)
Definition: attributes.c:50
#define NULL
Definition: ncbistd.hpp:225
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
ir_diag_hash * ir_hash_create(void)
Hash table constructor.
ir_diag_hash * ir_hash_destroy(ir_diag_hash *hash)
Hash table destructor.
Declarations of structures needed to implement diagonal hash to support ungapped extensions for index...
#define IR_LOCATE(hash, diag, key)
Find a hash table entry for the given diagonal.
#define IR_KEY(diag)
Compute the hash key from a diagonal identifier.
#define IR_DIAG(qoff, soff)
Compute diagonal identifier from subject and query offsets.
<!DOCTYPE HTML >< html > n< header > n< title > PubSeq Gateway Help Page</title > n< style > n table
Int4 BlastNaExtendJumper(BlastOffsetPair *offset_pairs, Int4 num_hits, const BlastInitialWordParameters *word_params, const BlastScoringParameters *score_params, const BlastHitSavingParameters *hit_params, LookupTableWrap *lookup_wrap, BLAST_SequenceBlk *query, BLAST_SequenceBlk *subject, BlastQueryInfo *query_info, BlastGapAlignStruct *gap_align, BlastHSPList *hsp_list, Uint4 s_range, SubjectIndex *s_index)
Extend a list of word hits.
Definition: jumper.c:3253
int JumperGappedAlignmentCompressedWithTraceback(const Uint1 *query, const Uint1 *subject, Int4 query_length, Int4 subject_length, Int4 query_start, Int4 subject_start, BlastGapAlignStruct *gap_align, const BlastScoringParameters *score_params, Int4 *num_identical, Int4 *right_ungapped_ext_len)
Jumper gapped alignment with traceback; 1 base per byte in query, 4 bases per byte in subject.
Definition: jumper.c:2512
int JumperFindSpliceSignals(BlastHSP *hsp, Int4 query_len, const Uint1 *subject, Int4 subject_len)
Find splice signals at the edges of an HSP and save them in the HSP.
Definition: jumper.c:2947
Boolean JumperGoodAlign(const BlastGapAlignStruct *gap_align, const BlastHitSavingParameters *hit_params, Int4 num_identical, BlastContextInfo *context_info)
Test whether an HSP should be saved.
Definition: jumper.c:2650
SubjectIndex * SubjectIndexNew(BLAST_SequenceBlk *subject, Int4 width, Int4 word_size)
Index a sequence, used for indexing compressed nucleotide subject sequence.
Definition: jumper.c:3874
SubjectIndex * SubjectIndexFree(SubjectIndex *sindex)
Free subject index structure.
Definition: jumper.c:3822
JumperEditsBlock * JumperFindEdits(const Uint1 *query, const Uint1 *subject, BlastGapAlignStruct *gap_align)
Definition: jumper.c:2755
GapEditScript * JumperPrelimEditBlockToGapEditScript(JumperPrelimEditBlock *rev_prelim_block, JumperPrelimEditBlock *fwd_prelim_block)
Convert Jumper's preliminary edit script to GapEditScript.
Definition: jumper.c:610
#define SUBJECT_INDEX_WORD_LENGTH
Definition: jumper.h:148
for(len=0;yy_str[len];++len)
int i
if(yy_accept[yy_current_state])
int len
Boolean(* T_Lookup_Callback)(const LookupTableWrap *, Int4, Int4)
Function pointer type to check the presence of index->q_off pair.
Definition: lookup_wrap.h:66
static NCBI_INLINE Boolean s_DetermineScanningOffsets(const BLAST_SequenceBlk *subject, Int4 word_length, Int4 lut_word_length, Int4 *range)
Determines the scanner's offsets taking the database masking restrictions into account (if any).
Definition: masksubj.inl:43
Declarations for functions that extract hits from indexed blast databases (specialized for megablast)
#define LAST_VOL_IDX_NULL
int(* T_MB_IdbCheckOid)(Int4 oid, Int4 *last_vol_id)
Function pointer type to check index seeds availability for oid.
unsigned long(* T_MB_IdbGetResults)(Int4 oid, Int4 chunk, BlastInitHitList *init_hitlist)
Function pointer type to retrieve hits from an indexed database.
@ eNotIndexed
static NCBI_INLINE Int4 s_BlastDiagHashRetrieve(BLAST_DiagHash *table, Int4 diag, Int4 *level, Int4 *hit_len, Int4 *hit_saved)
Attempt to retrieve information associated with diagonal diag.
Definition: na_ungapped.c:361
static Boolean s_MBLookup(const LookupTableWrap *lookup_wrap, Int4 index, Int4 q_pos)
Check to see if an index->q_pos pair exists in MB lookup table.
Definition: na_ungapped.c:48
static NCBI_INLINE Int4 s_BlastDiagHashInsert(BLAST_DiagHash *table, Int4 diag, Int4 level, Int4 len, Int4 hit_saved, Int4 s_off, Int4 window_size)
Attempt to store information associated with diagonal diag.
Definition: na_ungapped.c:396
static Int4 s_BlastSmallNaExtendAlignedOneByte(const BlastOffsetPair *offset_pairs, Int4 num_hits, const BlastInitialWordParameters *word_params, LookupTableWrap *lookup_wrap, BLAST_SequenceBlk *query, BLAST_SequenceBlk *subject, Int4 **matrix, BlastQueryInfo *query_info, Blast_ExtendWord *ewp, BlastInitHitList *init_hitlist, Uint4 s_range)
Perform exact match extensions on the hits retrieved from small-query lookup tables.
Definition: na_ungapped.c:1381
static Int4 s_BlastnDiagTableExtendInitialHit(BLAST_SequenceBlk *query, BLAST_SequenceBlk *subject, Int4 q_off, Int4 s_off, BlastSeqLoc *query_mask, BlastQueryInfo *query_info, Int4 s_range, Int4 word_length, Int4 lut_word_length, const LookupTableWrap *lut, const BlastInitialWordParameters *word_params, Int4 **matrix, BLAST_DiagTable *diag_table, BlastInitHitList *init_hitlist, Boolean check_masks)
Perform ungapped extension given an offset pair, and save the initial hit information if the hit qual...
Definition: na_ungapped.c:631
void BlastChooseNaExtend(LookupTableWrap *lookup_wrap)
Choose the best routine to use for creating ungapped alignments.
Definition: na_ungapped.c:1791
static NCBI_INLINE Boolean s_IsSeedMasked(const LookupTableWrap *lookup_wrap, const BLAST_SequenceBlk *subject, Int4 s_off, Int4 lut_word_length, Int4 q_pos)
Test to see if seed->q_off exists in lookup table.
Definition: na_ungapped.c:459
static Int4 s_BlastNaExtendAligned(const BlastOffsetPair *offset_pairs, Int4 num_hits, const BlastInitialWordParameters *word_params, LookupTableWrap *lookup_wrap, BLAST_SequenceBlk *query, BLAST_SequenceBlk *subject, Int4 **matrix, BlastQueryInfo *query_info, Blast_ExtendWord *ewp, BlastInitHitList *init_hitlist, Uint4 s_range)
Perform exact match extensions on the hits retrieved from blastn/megablast lookup tables,...
Definition: na_ungapped.c:1196
static Int4 s_BlastNaExtend(const BlastOffsetPair *offset_pairs, Int4 num_hits, const BlastInitialWordParameters *word_params, LookupTableWrap *lookup_wrap, BLAST_SequenceBlk *query, BLAST_SequenceBlk *subject, Int4 **matrix, BlastQueryInfo *query_info, Blast_ExtendWord *ewp, BlastInitHitList *init_hitlist, Uint4 s_range)
Perform exact match extensions on the hits retrieved from blastn/megablast lookup tables,...
Definition: na_ungapped.c:1052
static Int4 s_BlastSmallNaExtend(const BlastOffsetPair *offset_pairs, Int4 num_hits, const BlastInitialWordParameters *word_params, LookupTableWrap *lookup_wrap, BLAST_SequenceBlk *query, BLAST_SequenceBlk *subject, Int4 **matrix, BlastQueryInfo *query_info, Blast_ExtendWord *ewp, BlastInitHitList *init_hitlist, Uint4 s_range)
Perform exact match extensions on the hits retrieved from small-query blastn lookup tables,...
Definition: na_ungapped.c:1486
static void s_NuclUngappedExtend(BLAST_SequenceBlk *query, BLAST_SequenceBlk *subject, Int4 **matrix, Int4 q_off, Int4 s_match_end, Int4 s_off, Int4 X, BlastUngappedData *ungapped_data, const Int4 *score_table, Int4 reduced_cutoff)
Perform ungapped extension of a word hit.
Definition: na_ungapped.c:262
Int2 BlastNaWordFinder(BLAST_SequenceBlk *subject, BLAST_SequenceBlk *query, BlastQueryInfo *query_info, LookupTableWrap *lookup_wrap, Int4 **matrix, const BlastInitialWordParameters *word_params, Blast_ExtendWord *ewp, BlastOffsetPair *offset_pairs, Int4 max_hits, BlastInitHitList *init_hitlist, BlastUngappedStats *ungapped_stats)
Find all words for a given subject sequence and perform ungapped extensions, assuming ordinary blastn...
Definition: na_ungapped.c:1597
static Boolean s_NaLookup(const LookupTableWrap *lookup_wrap, Int4 index, Int4 q_pos)
Check to see if an index->q_pos pair exists in Na lookup table.
Definition: na_ungapped.c:110
static void s_NuclUngappedExtendExact(BLAST_SequenceBlk *query, BLAST_SequenceBlk *subject, Int4 **matrix, Int4 q_off, Int4 s_off, Int4 X, BlastUngappedData *ungapped_data)
Perform ungapped extension of a word hit, using a score matrix and extending one base at a time.
Definition: na_ungapped.c:149
Int2 JumperNaWordFinder(BLAST_SequenceBlk *subject, BLAST_SequenceBlk *query, BlastQueryInfo *query_info, LookupTableWrap *lookup_wrap, const BlastInitialWordParameters *word_params, const BlastScoringParameters *score_params, const BlastHitSavingParameters *hit_params, BlastOffsetPair *offset_pairs, MapperWordHits *word_hits, Int4 max_hits, BlastGapAlignStruct *gap_align, BlastInitHitList *init_hitlist, BlastHSPList **hsp_list_ptr, BlastUngappedStats *ungapped_stats, BlastGappedStats *gapped_stats)
Definition: na_ungapped.c:1930
MapperWordHits * MapperWordHitsFree(MapperWordHits *wh)
Definition: na_ungapped.c:1839
static const Uint1 s_ExactMatchExtendRight[256]
Entry i of this list gives the number of pairs of bits that are zero in the bit pattern of i,...
Definition: na_ungapped.c:1353
static Int4 s_BlastNaExtendDirect(const BlastOffsetPair *offset_pairs, Int4 num_hits, const BlastInitialWordParameters *word_params, LookupTableWrap *lookup_wrap, BLAST_SequenceBlk *query, BLAST_SequenceBlk *subject, Int4 **matrix, BlastQueryInfo *query_info, Blast_ExtendWord *ewp, BlastInitHitList *init_hitlist, Uint4 s_range)
Perform ungapped extensions on the hits retrieved from blastn/megablast lookup tables,...
Definition: na_ungapped.c:964
static Int4 s_TypeOfWord(BLAST_SequenceBlk *query, BLAST_SequenceBlk *subject, Int4 *q_off, Int4 *s_off, BlastSeqLoc *locations, BlastQueryInfo *query_info, Uint4 s_range, Uint4 word_length, Uint4 lut_word_length, const LookupTableWrap *lookup_wrap, Boolean check_double, Int4 *extended)
Check the mini-extended word against masked query regions, and do right extension if necessary.
Definition: na_ungapped.c:508
MapperWordHits * MapperWordHitsNew(const BLAST_SequenceBlk *query, const BlastQueryInfo *query_info)
Definition: na_ungapped.c:1867
Int2 ShortRead_IndexedWordFinder(BLAST_SequenceBlk *subject, BLAST_SequenceBlk *query, BlastQueryInfo *query_info, LookupTableWrap *lookup_wrap, const BlastInitialWordParameters *word_params, const BlastScoringParameters *score_params, const BlastHitSavingParameters *hit_params, BlastOffsetPair *offset_pairs, MapperWordHits *word_hits, Int4 max_hits, BlastGapAlignStruct *gap_align, BlastInitHitList *init_hitlist, BlastHSPList **hsp_list, BlastUngappedStats *ungapped_stats, BlastGappedStats *gapped_stats)
Definition: na_ungapped.c:2162
static const Uint1 s_ExactMatchExtendLeft[256]
Entry i of this list gives the number of pairs of bits that are zero in the bit pattern of i,...
Definition: na_ungapped.c:1330
static Boolean s_SmallNaLookup(const LookupTableWrap *lookup_wrap, Int4 index, Int4 q_pos)
Check to see if an index->q_pos pair exists in SmallNa lookup table.
Definition: na_ungapped.c:79
Int2 MB_IndexedWordFinder(BLAST_SequenceBlk *subject, BLAST_SequenceBlk *query, BlastQueryInfo *query_info, LookupTableWrap *lookup_wrap, Int4 **matrix, const BlastInitialWordParameters *word_params, Blast_ExtendWord *ewp, BlastOffsetPair *offset_pairs, Int4 max_hits, BlastInitHitList *init_hitlist, BlastUngappedStats *ungapped_stats)
Finds all runs of a specified number of exact matches between two nucleotide sequences.
Definition: na_ungapped.c:1697
static Int4 s_BlastnDiagHashExtendInitialHit(BLAST_SequenceBlk *query, BLAST_SequenceBlk *subject, Int4 q_off, Int4 s_off, BlastSeqLoc *query_mask, BlastQueryInfo *query_info, Int4 s_range, Int4 word_length, Int4 lut_word_length, const LookupTableWrap *lut, const BlastInitialWordParameters *word_params, Int4 **matrix, BLAST_DiagHash *hash_table, BlastInitHitList *init_hitlist, Boolean check_masks)
Perform ungapped extension given an offset pair, and save the initial hit information if the hit qual...
Definition: na_ungapped.c:799
Nucleotide ungapped extension code.
Int4(* TNaExtendFunction)(const BlastOffsetPair *offset_pairs, Int4 num_hits, const BlastInitialWordParameters *word_params, LookupTableWrap *lookup_wrap, BLAST_SequenceBlk *query, BLAST_SequenceBlk *subject, Int4 **matrix, BlastQueryInfo *query_info, Blast_ExtendWord *ewp, BlastInitHitList *init_hitlist, Int4 range)
Signature of function used to compute ungapped alignments.
Definition: na_ungapped.h:53
const struct ncbi::grid::netcache::search::fields::KEY key
#define MIN(a, b)
returns smaller of a and b.
Definition: ncbi_std.h:112
#define NCBI_INLINE
"inline" seems to work on our remaining in-house compilers (WorkShop, Compaq, ICC,...
Definition: ncbi_std.h:81
Uint1 Boolean
bool replacment for C
Definition: ncbi_std.h:94
#define TRUE
bool replacment for C indicating true.
Definition: ncbi_std.h:97
#define FALSE
bool replacment for C indicating false.
Definition: ncbi_std.h:101
#define ASSERT
macro for assert.
Definition: ncbi_std.h:107
#define INT4_MIN
Smallest (most negative) number represented by signed int.
Definition: ncbi_std.h:146
#define MAX(a, b)
returns larger of a and b.
Definition: ncbi_std.h:117
Int4 delta(size_t dimension_, const Int4 *score_)
Track initial word matches using hashing with chaining.
Definition: blast_extend.h:98
Int4 offset
"offset" added to query and subject position so that "last_hit" doesn't have to be zeroed out every t...
Definition: blast_extend.h:104
Structure containing parameters needed for initial word extension.
Definition: blast_extend.h:77
DiagStruct * hit_level_array
Array to hold latest hits and their lengths for all diagonals.
Definition: blast_extend.h:78
Int4 diag_array_length
Smallest power of 2 longer than query length.
Definition: blast_extend.h:81
Int4 diag_mask
Used to mask off everything above min_diag_length (mask = min_diag_length-1).
Definition: blast_extend.h:82
Uint1 * hit_len_array
Array to hold the lengthof the latest hit.
Definition: blast_extend.h:80
Int4 offset
"offset" added to query and subject position so that "last_hit" doesn't have to be zeroed out every t...
Definition: blast_extend.h:84
Structure to hold a sequence.
Definition: blast_def.h:242
Int4 query_length
Length of this query, strand or frame.
Int4 query_offset
Offset of this query, strand or frame in the concatenated super-query.
Int1 frame
Frame number (-1, -2, -3, 0, 1, 2, or 3)
Structure supporting the gapped alignment.
Int4 query_stop
query end offseet of current alignment
Int4 subject_start
subject start offset current alignment
Int4 query_start
query start offset of current alignment
Int4 subject_stop
subject end offset of current alignment
JumperGapAlign * jumper
data for jumper alignment
Int4 score
Return value: alignment score.
GapEditScript * edit_script
The traceback (gap) information.
Structure containing hit counts from the gapped stage of a BLAST search.
Int4 extensions
Total number of gapped extensions performed.
The structure to hold all HSPs for a given sequence after the gapped alignment.
Definition: blast_hits.h:153
JumperEditsBlock * edits
Information about mismatches and gaps, used for mapping short reads.
Definition: blast_hits.h:114
Structure holding all information about an HSP.
Definition: blast_hits.h:126
double evalue
This HSP's e-value.
Definition: blast_hits.h:130
Int4 num_ident
Number of identical base pairs in this HSP.
Definition: blast_hits.h:128
BlastHSPMappingInfo * map_info
Definition: blast_hits.h:146
Parameter block that contains a pointer to BlastHitSavingOptions and the values derived from it.
BlastHitSavingOptions * options
The original (unparsed) options.
Structure to hold the initial HSP information.
Definition: blast_extend.h:150
BlastUngappedData * ungapped_data
Pointer to a structure holding ungapped alignment information.
Definition: blast_extend.h:153
BlastOffsetPair offsets
Offsets in query and subject, or, in PHI BLAST, start and end of pattern in subject.
Definition: blast_extend.h:151
Structure to hold all initial HSPs for a given subject sequence.
Definition: blast_extend.h:158
Int4 total
Total number of hits currently saved.
Definition: blast_extend.h:159
BlastInitHSP * init_hsp_array
Array of offset pairs, possibly with scores.
Definition: blast_extend.h:161
EBlastProgramType program_number
indicates blastn, blastp, etc.
Int4 window_size
Maximal allowed distance between 2 hits in case 2 hits are required to trigger the extension.
Int4 scan_range
Maximal number of gaps allowed between 2 hits.
Parameter block that contains a pointer to BlastInitialWordOptions and the values derived from it.
BlastUngappedCutoffs * cutoffs
cutoff values (one per context)
Boolean ungapped_extension
Should an ungapped extension be performed?
BlastInitialWordOptions * options
The original (unparsed) options.
Boolean matrix_only_scoring
Use the scoring matrix ( not table ) to score ungapped and gapped alignments -RMH-.
ESeedContainerType container_type
How to store offset pairs for initial seeds?
Int4 nucl_score_table[256]
the combined score of all match/mismatch combinations for aligning four bases
The lookup table structure used for Mega BLAST.
Int4 lut_word_length
number of letters in a lookup table word
Int4 pv_array_bts
The exponent of 2 by which pv_array is smaller than the backbone.
BlastSeqLoc * masked_locations
masked locations, only non-NULL for soft-masking.
Int4 * hashtable
Array of positions.
PV_ARRAY_TYPE * pv_array
Presence vector, used for quick presence check.
Boolean stride
is lookup table created with a stride
Int8 hashsize
= 4^(lut_word_length)
Int4 scan_step
Step size for scanning the database.
Int4 word_length
number of exact letter matches that will trigger an ungapped extension
Boolean discontiguous
Are discontiguous words used?
Int4 * next_pos
Extra positions stored here.
void * extend_callback
function for extending hits
Int4 template_length
Length of the discontiguous word template.
The basic lookup table structure for blastn searches.
Int4 scan_step
number of bases between successive words
BlastSeqLoc * masked_locations
masked locations, only non-NULL for soft-masking.
Int4 lut_word_length
Length in bases of a word indexed by the lookup table.
Int4 word_length
Length in bases of the full word match required to trigger extension.
void * extend_callback
function for extending hits
The query related information.
BlastContextInfo * contexts
Information per context.
int num_queries
Number of query sequences.
Int4 last_context
Index of the last element of the context array.
Uint4 max_length
Length of the longest among the concatenated queries.
Scoring parameters block Contains scoring-related information that is actually used for the blast sea...
Used to hold a set of positions, mostly used for filtering.
Definition: blast_def.h:204
Lookup table structure for blastn searches with small queries.
Int4 scan_step
number of bases between successive words
Int4 word_length
Length in bases of the full word match required to trigger extension.
void * extend_callback
function for extending hits
BlastSeqLoc * masked_locations
masked locations, only non-NULL for soft-masking.
Int4 lut_word_length
Length in bases of a word indexed by the lookup table.
All the ungapped cutoff values that can change from context to context.
Int4 reduced_nucl_cutoff_score
for blastn, a reduced cutoff score for use with approximate ungapped alignments
Int4 cutoff_score
Cutoff score for saving ungapped hits.
Int4 x_dropoff
Raw X-dropoff value used in the ungapped extension.
Structure to hold ungapped alignment information.
Definition: blast_extend.h:142
Int4 score
Score of the ungapped alignment.
Definition: blast_extend.h:146
Int4 length
Length of the ungapped alignment.
Definition: blast_extend.h:145
Int4 q_start
Start of the ungapped alignment in query.
Definition: blast_extend.h:143
Int4 s_start
Start of the ungapped alignment in subject.
Definition: blast_extend.h:144
Structure containing hit counts from the ungapped stage of a BLAST search.
Int4 good_init_extends
Number of successful initial extensions, i.e.
Structure for keeping initial word extension information.
Definition: blast_extend.h:109
BLAST_DiagHash * hash_table
Hash table and related parameters.
Definition: blast_extend.h:111
BLAST_DiagTable * diag_table
Diagonal array and related parameters.
Definition: blast_extend.h:110
Structure for keeping last hit information for a diagonal in a hash table, when eRight or eRightAndLe...
Definition: blast_extend.h:65
Int4 hit_len
The length of last hit.
Definition: blast_extend.h:69
signed int level
This hit's offset in the subject sequence.
Definition: blast_extend.h:67
unsigned int hit_saved
Whether or not this hit has been saved.
Definition: blast_extend.h:68
Uint4 next
Offset of next element in the chain.
Definition: blast_extend.h:70
Int4 diag
This hit's diagonal.
Definition: blast_extend.h:66
Structure for keeping last hit information for a diagonal.
Definition: blast_extend.h:57
signed int last_hit
Offset of the last hit.
Definition: blast_extend.h:58
unsigned int flag
Reset the next extension?
Definition: blast_extend.h:59
JumperPrelimEditBlock * left_prelim_block
Definition: jumper.h:73
JumperPrelimEditBlock * right_prelim_block
Definition: jumper.h:74
Wrapper structure for different types of BLAST lookup tables.
Definition: lookup_wrap.h:50
void * lookup_callback
function used to look up an index->q_off pair
Definition: lookup_wrap.h:61
void * lut
Pointer to the actual lookup table structure.
Definition: lookup_wrap.h:52
void * check_index_oid
function used to check if seeds for a given oid are present
Definition: lookup_wrap.h:55
ELookupTableType lut_type
What kind of a lookup table it is?
Definition: lookup_wrap.h:51
void * read_indexed_db
function used to retrieve hits from an indexed database
Definition: lookup_wrap.h:53
Int4 num_arrays
number of pair_arrays
Definition: na_ungapped.h:106
BlastOffsetPair ** pair_arrays
lists of word hits
Definition: na_ungapped.h:104
Int4 * last_diag
diagnal for the last word hit for each query context
Definition: na_ungapped.h:108
Int4 array_size
size of each array
Definition: na_ungapped.h:107
Int4 * last_pos
subject position for the last word hit for each query context
Definition: na_ungapped.h:110
Int4 * num
number of hits in the list
Definition: na_ungapped.h:105
Int4 divisor
divisor used to find pair_arrays index based on query offset
Definition: na_ungapped.h:113
Index for a chunk of a subject sequence.
Definition: jumper.h:152
static string subject
static string query
Definition: _hash_fun.h:40
Uint4 qend
Right end (in the query) of the last seen seed on the diagonal.
Uint4 diag
Diagonal identifier.
Hash table structure.
Hash table entry.
ir_diag_data diag_data
This symbol enables the verbose option in makeblastdb and other BLAST+ search command line applicatio...
Definition: blast_def.h:141
Uint4 q_off
Query offset.
Definition: blast_def.h:143
Uint4 s_off
Subject offset.
Definition: blast_def.h:144
struct BlastOffsetPair::@6 qs_offsets
Query/subject offset pair.
static CS_CONTEXT * context
Definition: will_convert.c:21
voidp malloc(uInt size)
voidp calloc(uInt items, uInt size)
Modified on Fri May 24 14:55:11 2024 by modify_doxy.py rev. 669887