NCBI C++ ToolKit
blast_lookup.c
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blast_lookup.c 75981 2017-01-04 16:58:25Z boratyng $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  */
26 
27 /** @file blast_lookup.c
28  * Functions that provide generic services for BLAST lookup tables
29  */
30 
32 
33 void BlastLookupAddWordHit(Int4 **backbone, Int4 wordsize,
34  Int4 charsize, Uint1* seq,
35  Int4 query_offset)
36 {
37  Int4 index;
38  Int4 *chain = NULL;
39  Int4 chain_size = 0; /* total number of elements in the chain */
40  Int4 hits_in_chain = 0; /* number of occupied elements in the chain,
41  not including the zeroth and first
42  positions */
43 
44  /* compute the backbone cell to update */
45 
46  index = ComputeTableIndex(wordsize, charsize, seq);
47 
48  /* if backbone cell is null, initialize a new chain */
49  if (backbone[index] == NULL) {
50  chain_size = 8;
51  hits_in_chain = 0;
52  chain = (Int4 *) malloc(chain_size * sizeof(Int4));
53  ASSERT(chain != NULL);
54  chain[0] = chain_size;
55  chain[1] = hits_in_chain;
56  backbone[index] = chain;
57  } else {
58  /* otherwise, use the existing chain */
59  chain = backbone[index];
60  chain_size = chain[0];
61  hits_in_chain = chain[1];
62  }
63 
64  /* if the chain is full, allocate more room */
65  if ((hits_in_chain + 2) == chain_size) {
66  chain_size = chain_size * 2;
67  chain = (Int4 *) realloc(chain, chain_size * sizeof(Int4));
68  ASSERT(chain != NULL);
69 
70  backbone[index] = chain;
71  chain[0] = chain_size;
72  }
73 
74  /* add the hit */
75  chain[chain[1] + 2] = query_offset;
76  chain[1]++;
77 }
78 
80  Int4 word_length,
81  Int4 charsize,
82  Int4 lut_word_length,
84  BlastSeqLoc * locations)
85 {
86  BlastSeqLoc *loc;
87  Int4 offset;
88  Uint1 *seq;
89  Uint1 *word_target;
90  Uint1 invalid_mask = 0xff << charsize;
91 
92  for (loc = locations; loc; loc = loc->next) {
93  Int4 from = loc->ssr->left;
94  Int4 to = loc->ssr->right;
95 
96  /* if this location is too small to fit a complete word, skip the
97  location */
98 
99  if (word_length > to - from + 1)
100  continue;
101 
102  /* Indexing proceeds from the start point to the last offset
103  such that a full lookup table word can be created. word_target
104  points to the letter beyond which indexing is allowed */
105  seq = query->sequence + from;
106  word_target = seq + lut_word_length;
107 
108  for (offset = from; offset <= to; offset++, seq++) {
109 
110  if (seq >= word_target) {
111  BlastLookupAddWordHit(backbone,
112  lut_word_length, charsize,
113  seq - lut_word_length,
114  offset - lut_word_length);
115  }
116 
117  /* if the current word contains an ambiguity, skip all the
118  words that would contain that ambiguity */
119  if (*seq & invalid_mask)
120  word_target = seq + lut_word_length + 1;
121  }
122 
123  /* handle the last word, without loading *seq */
124  if (seq >= word_target) {
125  BlastLookupAddWordHit(backbone,
126  lut_word_length, charsize,
127  seq - lut_word_length,
128  offset - lut_word_length);
129  }
130 
131  }
132 }
133 
134 
136 {
137  BackboneCell* b = cell;
138  while (b) {
139  BackboneCell* next = b->next;
140  sfree(b);
141  b = next;
142  }
143 
144  return NULL;
145 }
146 
147 
149 {
150  BackboneCell* cell = calloc(1, sizeof(BackboneCell));
151  if (!cell) {
152  BackboneCellFree(cell);
153  return NULL;
154  }
155 
156  BackboneCellInit(cell, word, offset);
157  return cell;
158 }
159 
160 
162 {
163  if (!cell) {
164  return -1;
165  }
166 
167  cell->word = word;
168  cell->offset = offset;
169  cell->num_offsets = 1;
170 
171  return 0;
172 }
173 
174 static Int2 s_AddWordHit(BackboneCell* backbone, Int4* offsets, Int4 wordsize,
175  Int4 charsize, Uint1* seq, Int4 offset,
177  PV_ARRAY_TYPE* pv_array)
178 {
179  Uint4 large_index;
180  Int8 index;
181  Int4 i;
182 
183  /* convert a sequence from 4NA to 2NA */
184  large_index = 0;
185  for (i = 0;i < wordsize;i++) {
186  large_index = (large_index << charsize) | seq[i];
187  }
188 
189  /* if filtering by database word count, then do not add words
190  that do not appear in the database or appear to many times */
191  if (pv_array && !PV_TEST(pv_array, large_index, PV_ARRAY_BTS)) {
192  return 0;
193  }
194 
195  index = (Int8)hash_func((Uint1*)&large_index, mask);
196 
197  /* offset zero indicates end of offset list, so we are storing offset + 1
198  values */
199  offset++;
200 
201  /* if the hash table entry is emtpy, initialize a new cell */
202  if (backbone[index].num_offsets == 0) {
203  BackboneCellInit(&backbone[index], large_index, offset);
204  }
205  else {
206  /* otherwise check if the word was already added */
207  BackboneCell* b = &backbone[index];
208  while (b->next && b->word != large_index) {
209  b = b->next;
210  }
211 
212  /* if word was already added, add the new offset to an existing cell */
213  if (b->word == large_index) {
214  /* word offsets are all sored in a linear array where next offset =
215  offsets[current offset] until offset is zero (similarily to
216  next_pos array in MBLookupTable) */
217 
218  ASSERT(offsets[offset] == 0);
219  offsets[offset] = b->offset;
220  b->offset = offset;
221  b->num_offsets++;
222  }
223  else {
224  /* otherwise creare a new cell */
225  ASSERT(!b->next);
226  b->next = BackboneCellNew(large_index, offset);
227  if (!b->next) {
228  return -1;
229  }
230  }
231  }
232 
233  return 0;
234 }
235 
236 
238  Int4* offsets,
239  Int4 word_length,
240  Int4 charsize,
241  Int4 lut_word_length,
243  BlastSeqLoc* locations,
245  Uint4 mask,
246  PV_ARRAY_TYPE* pv_array)
247 {
248  BlastSeqLoc *loc;
249  Int4 offset;
250  Uint1 *seq;
251  Uint1 *word_target;
252  Uint1 invalid_mask = 0xff << charsize;
253 
254  for (loc = locations; loc; loc = loc->next) {
255  Int4 from = loc->ssr->left;
256  Int4 to = loc->ssr->right;
257 
258  /* if this location is too small to fit a complete word, skip the
259  location */
260 
261  if (word_length > to - from + 1)
262  continue;
263 
264  /* Indexing proceeds from the start point to the last offset
265  such that a full lookup table word can be created. word_target
266  points to the letter beyond which indexing is allowed */
267  seq = query->sequence + from;
268  word_target = seq + lut_word_length;
269 
270  for (offset = from; offset <= to; offset++, seq++) {
271 
272  if (seq >= word_target) {
273  s_AddWordHit(backbone,
274  offsets,
275  lut_word_length, charsize,
276  seq - lut_word_length,
277  offset - lut_word_length,
278  hash_func, mask,
279  pv_array);
280  }
281 
282  /* if the current word contains an ambiguity, skip all the
283  words that would contain that ambiguity */
284  if (*seq & invalid_mask)
285  word_target = seq + lut_word_length + 1;
286  }
287 
288  /* handle the last word, without loading *seq */
289  if (seq >= word_target) {
290  s_AddWordHit(backbone,
291  offsets,
292  lut_word_length, charsize,
293  seq - lut_word_length,
294  offset - lut_word_length,
295  hash_func, mask,
296  pv_array);
297  }
298 
299  }
300 }
301 
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
Definition: blast_def.h:112
void BlastLookupAddWordHit(Int4 **backbone, Int4 wordsize, Int4 charsize, Uint1 *seq, Int4 query_offset)
Add a single query offset to a generic lookup table.
Definition: blast_lookup.c:33
static Int2 s_AddWordHit(BackboneCell *backbone, Int4 *offsets, Int4 wordsize, Int4 charsize, Uint1 *seq, Int4 offset, TNaLookupHashFunction hash_func, Uint4 mask, PV_ARRAY_TYPE *pv_array)
Definition: blast_lookup.c:174
BackboneCell * BackboneCellFree(BackboneCell *cell)
Definition: blast_lookup.c:135
void BlastHashLookupIndexQueryExactMatches(BackboneCell *backbone, Int4 *offsets, Int4 word_length, Int4 charsize, Int4 lut_word_length, BLAST_SequenceBlk *query, BlastSeqLoc *locations, TNaLookupHashFunction hash_func, Uint4 mask, PV_ARRAY_TYPE *pv_array)
Definition: blast_lookup.c:237
BackboneCell * BackboneCellNew(Uint4 word, Int4 offset)
Create a new cell for a given word and offset.
Definition: blast_lookup.c:148
void BlastLookupIndexQueryExactMatches(Int4 **backbone, Int4 word_length, Int4 charsize, Int4 lut_word_length, BLAST_SequenceBlk *query, BlastSeqLoc *locations)
Add all applicable query offsets to a generic lookup table.
Definition: blast_lookup.c:79
Int4 BackboneCellInit(BackboneCell *cell, Uint4 word, Int4 offset)
Initialize an olready allocated cell for a given word and offset.
Definition: blast_lookup.c:161
Common definitions for protein and nucleotide lookup tables.
#define PV_ARRAY_BTS
bits-to-shift from lookup_index to pv_array index.
Definition: blast_lookup.h:43
#define PV_TEST(lookup, index, shift)
Test the bit at position 'index' in the PV array bitfield within 'lookup'.
Definition: blast_lookup.h:55
static NCBI_INLINE Int4 ComputeTableIndex(Int4 wordsize, Int4 charsize, const Uint1 *word)
Given a word, compute its index value from scratch.
Definition: blast_lookup.h:96
Uint4(* TNaLookupHashFunction)(Uint1 *, Uint4)
Hash function type for the lookup table.
Definition: blast_lookup.h:142
#define PV_ARRAY_TYPE
The pv_array 'native' type.
Definition: blast_lookup.h:41
ncbi::TMaskedQueryRegions mask
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
int offset
Definition: replacements.h:160
#define NULL
Definition: ncbistd.hpp:225
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
#define hash_func
int i
#define ASSERT
macro for assert.
Definition: ncbi_std.h:107
static PCRE2_SIZE * offsets
Definition: pcre2grep.c:266
Structure to hold a sequence.
Definition: blast_def.h:242
Thin backbone cell for nucleotide lookup table with hashed words.
Definition: blast_lookup.h:133
Used to hold a set of positions, mostly used for filtering.
Definition: blast_def.h:204
SSeqRange * ssr
location data on the sequence.
Definition: blast_def.h:206
struct BlastSeqLoc * next
next in linked list
Definition: blast_def.h:205
Int4 left
left endpoint of range (zero based)
Definition: blast_def.h:156
Int4 right
right endpoint of range (zero based)
Definition: blast_def.h:157
static string query
voidp malloc(uInt size)
voidp calloc(uInt items, uInt size)
Modified on Fri Sep 20 14:58:29 2024 by modify_doxy.py rev. 669887