NCBI C++ ToolKit
blast_extend.c
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blast_extend.c 72378 2016-05-04 14:59:01Z camacho $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * thus cannot be copyrighted. This software/database is freely available
10  * to the public for use. The National Library of Medicine and the U.S.
11  * Government have not placed any restriction on its use or reproduction.
12  *
13  * Although all reasonable efforts have been taken to ensure the accuracy
14  * and reliability of the software and data, the NLM and the U.S.
15  * Government do not and cannot warrant the performance or results that
16  * may be obtained by using this software or data. The NLM and the U.S.
17  * Government disclaim all warranties, express or implied, including
18  * warranties of performance, merchantability or fitness for any particular
19  * purpose.
20  *
21  * Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  */
26 
27 /** @file blast_extend.c
28  * Functions to initialize structures used for BLAST extension
29  */
30 
33 
34 /** Allocates memory for the BLAST_DiagTable*. This function also
35  * sets many of the parametes such as diag_array_length etc.
36  * @param qlen Length of the query [in]
37  * @param multiple_hits Specifies whether multiple hits method is used [in]
38  * @param window_size The max. distance between two hits that are extended [in]
39  * @return The allocated BLAST_DiagTable structure
40 */
41 static BLAST_DiagTable*
43 
44 {
45  BLAST_DiagTable* diag_table;
46  Int4 diag_array_length;
47 
48  diag_table= (BLAST_DiagTable*) calloc(1, sizeof(BLAST_DiagTable));
49 
50  if (diag_table)
51  {
52  diag_array_length = 1;
53  /* What power of 2 is just longer than the query? */
54  while (diag_array_length < (qlen+window_size))
55  {
56  diag_array_length = diag_array_length << 1;
57  }
58  /* These are used in the word finders to shift and mask
59  rather than dividing and taking the remainder. */
60  diag_table->diag_array_length = diag_array_length;
61  diag_table->diag_mask = diag_array_length-1;
62  diag_table->multiple_hits = multiple_hits;
63  diag_table->offset = window_size;
64  diag_table->window = window_size;
65  }
66  return diag_table;
67 }
68 
69 /** Deallocate memory for the diagonal table structure
70  * @param diag_table the object to be freed [in]
71  * @return NULL
72 */
73 static BLAST_DiagTable*
75 {
76  if (diag_table) {
77  sfree(diag_table->hit_level_array);
78  sfree(diag_table->hit_len_array);
79  sfree(diag_table);
80  }
81  return NULL;
82 }
83 
84 /** Reset the diagonal array structure. Used when offset has wrapped around.
85  * @param diag pointer to the diagonal array structure [in]
86  */
88 {
89  Int4 i, n;
90  DiagStruct *diag_struct_array;
91 
92  if (diag == NULL)
93  return 0;
94 
95  n = diag->diag_array_length;
96 
97  diag->offset = diag->window;
98 
99  diag_struct_array = diag->hit_level_array;
100 
101  for (i = 0; i < n; i++) {
102  diag_struct_array[i].flag = 0;
103  diag_struct_array[i].last_hit = -diag->window;
104  if (diag->hit_len_array) diag->hit_len_array[i] = 0;
105  }
106  return 0;
107 }
108 
109 /* Description in blast_extend.h */
111  const BlastInitialWordParameters * word_params,
112  Blast_ExtendWord ** ewp_ptr)
113 {
114  Blast_ExtendWord *ewp;
115 
116  *ewp_ptr = ewp = (Blast_ExtendWord *) calloc(1, sizeof(Blast_ExtendWord));
117 
118  if (!ewp) {
119  return -1;
120  }
121 
122  if (word_params->container_type == eDiagHash) {
123  ewp->hash_table =
124  (BLAST_DiagHash *) calloc(1, sizeof(BLAST_DiagHash));
125 
127  ewp->hash_table->backbone =
128  calloc(ewp->hash_table->num_buckets, sizeof(Uint4));
130  ewp->hash_table->chain =
131  calloc(ewp->hash_table->capacity, sizeof(DiagHashCell));
132  ewp->hash_table->occupancy = 1;
133  ewp->hash_table->window = word_params->options->window_size;
134  ewp->hash_table->offset = word_params->options->window_size;
135  } else { /* container_type == eDiagArray */
136 
137  Boolean multiple_hits = (word_params->options->window_size > 0);
138  BLAST_DiagTable *diag_table;
139 
140  ewp->diag_table = diag_table =
141  s_BlastDiagTableNew(query_length, multiple_hits,
142  word_params->options->window_size);
143  /* Allocate the buffer to be used for diagonal array. */
144 
145  diag_table->hit_level_array = (DiagStruct *)
146  calloc(diag_table->diag_array_length, sizeof(DiagStruct));
147  if (word_params->options->window_size) {
148  diag_table->hit_len_array = (Uint1 *)
149  calloc(diag_table->diag_array_length, sizeof(Uint1));
150  }
151  if (!diag_table->hit_level_array) {
152  sfree(ewp);
153  return -1;
154  }
155  }
156  *ewp_ptr = ewp;
157 
158  return 0;
159 }
160 
161 Int2
163 {
164  if (!ewp)
165  return -1;
166 
167  if (ewp->diag_table) {
168  if (ewp->diag_table->offset >= INT4_MAX / 4) {
169  ewp->diag_table->offset = ewp->diag_table->window;
171  } else {
172  ewp->diag_table->offset += subject_length + ewp->diag_table->window;
173  }
174  } else if (ewp->hash_table) {
175  if (ewp->hash_table->offset >= INT4_MAX / 4) {
176  ewp->hash_table->occupancy = 1;
177  ewp->hash_table->offset = ewp->hash_table->window;
178  memset(ewp->hash_table->backbone, 0,
179  ewp->hash_table->num_buckets * sizeof(Int4));
180  } else {
181  ewp->hash_table->offset += subject_length + ewp->hash_table->window;
182  }
183  }
184  return 0;
185 }
186 
187 /** Deallocate memory for the hash table structure.
188  * @param hash_table The hash table structure to free. [in]
189  * @return NULL.
190  */
192 {
193  if (!hash_table)
194  return NULL;
195 
196  sfree(hash_table->backbone);
197  sfree(hash_table->chain);
198  sfree(hash_table);
199 
200  return NULL;
201 }
202 
204 {
205 
206  if (ewp == NULL)
207  return NULL;
208 
211  sfree(ewp);
212  return NULL;
213 }
214 
215 
217 {
218  BlastInitHitList *init_hitlist = (BlastInitHitList *)
219  calloc(1, sizeof(BlastInitHitList));
220 
221  init_hitlist->allocated = MIN_INIT_HITLIST_SIZE;
222 
223  init_hitlist->init_hsp_array = (BlastInitHSP *)
225 
226  return init_hitlist;
227 }
228 
230 {
231  Int4 index;
232 
233  for (index = 0; index < init_hitlist->total; ++index)
234  sfree(init_hitlist->init_hsp_array[index].ungapped_data);
235  init_hitlist->total = 0;
236 }
237 
238 
239 /** empty an init hitlist but do not deallocate the base structure
240  * @param hi list of initial hits to clean [in][out]
241  */
243 {
245  sfree(hi->init_hsp_array);
246 }
247 
249  BlastInitHitList * src)
250 {
251  ASSERT(dst != 0);
252  ASSERT(src != 0);
253  ASSERT(!dst->do_not_reallocate);
254 
256  memmove((void *)dst, (const void *)src, sizeof(BlastInitHitList));
257  src->total = src->allocated = 0;
258  src->init_hsp_array = 0;
259 }
260 
262 {
263  if (init_hitlist == NULL)
264  return NULL;
265 
266  s_BlastInitHitListClean(init_hitlist);
267  sfree(init_hitlist);
268  return NULL;
269 }
270 
271 /** Callback for sorting an array of initial HSP structures (not pointers to
272  * structures!) by score.
273  */
274 static int score_compare_match(const void *v1, const void *v2)
275 {
276  BlastInitHSP *h1, *h2;
277  int result = 0;
278 
279  h1 = (BlastInitHSP *) v1;
280  h2 = (BlastInitHSP *) v2;
281 
282  /* Check if ungapped_data substructures are initialized. If not, move
283  those array elements to the end. In reality this should never happen. */
284  if (h1->ungapped_data == NULL && h2->ungapped_data == NULL)
285  return 0;
286  else if (h1->ungapped_data == NULL)
287  return 1;
288  else if (h2->ungapped_data == NULL)
289  return -1;
290 
291  if (0 == (result = BLAST_CMP(h2->ungapped_data->score,
292  h1->ungapped_data->score)) &&
293  0 == (result = BLAST_CMP(h1->ungapped_data->s_start,
294  h2->ungapped_data->s_start)) &&
295  0 == (result = BLAST_CMP(h2->ungapped_data->length,
296  h1->ungapped_data->length)) &&
297  0 == (result = BLAST_CMP(h1->ungapped_data->q_start,
298  h2->ungapped_data->q_start))) {
300  h1->ungapped_data->length);
301  }
302 
303  return result;
304 }
305 
307 {
308  qsort(init_hitlist->init_hsp_array, init_hitlist->total,
310 }
311 
313 {
314  Int4 index;
315  BlastInitHSP *init_hsp_array = init_hitlist->init_hsp_array;
316 
317  for (index = 0; index < init_hitlist->total - 1; ++index) {
318  if (score_compare_match(&init_hsp_array[index],
319  &init_hsp_array[index + 1]) > 0)
320  return FALSE;
321  }
322  return TRUE;
323 }
324 
326  Int4 q_off, Int4 s_off,
327  BlastUngappedData * ungapped_data)
328 {
329  BlastInitHSP *match_array;
330  Int4 num, num_avail;
331 
332  num = init_hitlist->total;
333  num_avail = init_hitlist->allocated;
334 
335  match_array = init_hitlist->init_hsp_array;
336  if (num >= num_avail) {
337  if (init_hitlist->do_not_reallocate)
338  return FALSE;
339  num_avail *= 2;
340  match_array = (BlastInitHSP *)
341  realloc(match_array, num_avail * sizeof(BlastInitHSP));
342  if (!match_array) {
343  init_hitlist->do_not_reallocate = TRUE;
344  return FALSE;
345  } else {
346  init_hitlist->allocated = num_avail;
347  init_hitlist->init_hsp_array = match_array;
348  }
349  }
350 
351  match_array[num].offsets.qs_offsets.q_off = q_off;
352  match_array[num].offsets.qs_offsets.s_off = s_off;
353  match_array[num].ungapped_data = ungapped_data;
354 
355  init_hitlist->total++;
356  return TRUE;
357 }
358 
359 void
360 BlastSaveInitHsp(BlastInitHitList * ungapped_hsps, Int4 q_start, Int4 s_start,
361  Int4 q_off, Int4 s_off, Int4 len, Int4 score)
362 {
363  BlastUngappedData *ungapped_data = NULL;
364 
365  ungapped_data = (BlastUngappedData *) malloc(sizeof(BlastUngappedData));
366 
367  ungapped_data->q_start = q_start;
368  ungapped_data->s_start = s_start;
369  ungapped_data->length = len;
370  ungapped_data->score = score;
371 
372  BLAST_SaveInitialHit(ungapped_hsps, q_off, s_off, ungapped_data);
373 
374  return;
375 }
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
Definition: blast_def.h:112
#define BLAST_CMP(a, b)
A macro expression that returns 1, 0, -1 if a is greater than, equal to or less than b,...
Definition: blast_def.h:107
static int score_compare_match(const void *v1, const void *v2)
Callback for sorting an array of initial HSP structures (not pointers to structures!...
Definition: blast_extend.c:274
void BlastInitHitListReset(BlastInitHitList *init_hitlist)
Free the ungapped data substructures and reset initial HSP count to 0.
Definition: blast_extend.c:229
BlastInitHitList * BLAST_InitHitListNew(void)
Allocate memory for the BlastInitHitList structure.
Definition: blast_extend.c:216
Boolean BLAST_SaveInitialHit(BlastInitHitList *init_hitlist, Int4 q_off, Int4 s_off, BlastUngappedData *ungapped_data)
Save the initial hit data into the initial hit list structure.
Definition: blast_extend.c:325
static BLAST_DiagHash * s_BlastDiagHashFree(BLAST_DiagHash *hash_table)
Deallocate memory for the hash table structure.
Definition: blast_extend.c:191
void Blast_InitHitListSortByScore(BlastInitHitList *init_hitlist)
Sort array of initial HSPs by score.
Definition: blast_extend.c:306
Int2 Blast_ExtendWordExit(Blast_ExtendWord *ewp, Int4 subject_length)
Update the word extension structure after scanning of each subject sequence.
Definition: blast_extend.c:162
void BlastInitHitListMove(BlastInitHitList *dst, BlastInitHitList *src)
Move the contents of a BlastInitHitList structure.
Definition: blast_extend.c:248
static BLAST_DiagTable * s_BlastDiagTableNew(Int4 qlen, Boolean multiple_hits, Int4 window_size)
Allocates memory for the BLAST_DiagTable*.
Definition: blast_extend.c:42
static BLAST_DiagTable * s_BlastDiagTableFree(BLAST_DiagTable *diag_table)
Deallocate memory for the diagonal table structure.
Definition: blast_extend.c:74
Blast_ExtendWord * BlastExtendWordFree(Blast_ExtendWord *ewp)
Deallocate memory for the word extension structure.
Definition: blast_extend.c:203
Boolean Blast_InitHitListIsSortedByScore(BlastInitHitList *init_hitlist)
Check if array of initial HSPs is sorted by score.
Definition: blast_extend.c:312
static void s_BlastInitHitListClean(BlastInitHitList *hi)
empty an init hitlist but do not deallocate the base structure
Definition: blast_extend.c:242
BlastInitHitList * BLAST_InitHitListFree(BlastInitHitList *init_hitlist)
Free memory for the BlastInitList structure.
Definition: blast_extend.c:261
void BlastSaveInitHsp(BlastInitHitList *ungapped_hsps, Int4 q_start, Int4 s_start, Int4 q_off, Int4 s_off, Int4 len, Int4 score)
Add a new initial (ungapped) HSP to an initial hit list.
Definition: blast_extend.c:360
static Int4 s_BlastDiagClear(BLAST_DiagTable *diag)
Reset the diagonal array structure.
Definition: blast_extend.c:87
Int2 BlastExtendWordNew(Uint4 query_length, const BlastInitialWordParameters *word_params, Blast_ExtendWord **ewp_ptr)
Initializes the word extension structure.
Definition: blast_extend.c:110
Ungapped extension structures that are common to nucleotide and protein extension routines.
#define DIAGHASH_CHAIN_LENGTH
Default hash chain length.
Definition: blast_extend.h:54
#define DIAGHASH_NUM_BUCKETS
Number of hash buckets in BLAST_DiagHash.
Definition: blast_extend.h:51
#define MIN_INIT_HITLIST_SIZE
Minimal size of an array of initial word hits, allocated up front.
Definition: blast_extend.h:139
The structures and functions in blast_options.
@ eDiagHash
use hash table (blastn only)
static ulg window_size
#define NULL
Definition: ncbistd.hpp:225
const CVect2< U > & v2
Definition: globals.hpp:440
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int i
yy_size_t n
int len
#define INT4_MAX
largest nubmer represented by signed int
Definition: ncbi_std.h:141
Uint1 Boolean
bool replacment for C
Definition: ncbi_std.h:94
#define TRUE
bool replacment for C indicating true.
Definition: ncbi_std.h:97
#define FALSE
bool replacment for C indicating false.
Definition: ncbi_std.h:101
#define ASSERT
macro for assert.
Definition: ncbi_std.h:107
#define memmove(a, b, c)
Track initial word matches using hashing with chaining.
Definition: blast_extend.h:98
Int4 window
The "window" size, within which two (or more) hits must be found in order to be extended.
Definition: blast_extend.h:105
Uint4 occupancy
Number of occupied elements.
Definition: blast_extend.h:100
Uint4 capacity
Total number of elements.
Definition: blast_extend.h:101
DiagHashCell * chain
Array of data cells.
Definition: blast_extend.h:103
Uint4 * backbone
Array of offsets to heads of chains.
Definition: blast_extend.h:102
Int4 offset
"offset" added to query and subject position so that "last_hit" doesn't have to be zeroed out every t...
Definition: blast_extend.h:104
Uint4 num_buckets
Number of buckets to be used for storing hit offsets.
Definition: blast_extend.h:99
Structure containing parameters needed for initial word extension.
Definition: blast_extend.h:77
DiagStruct * hit_level_array
Array to hold latest hits and their lengths for all diagonals.
Definition: blast_extend.h:78
Int4 diag_array_length
Smallest power of 2 longer than query length.
Definition: blast_extend.h:81
Int4 diag_mask
Used to mask off everything above min_diag_length (mask = min_diag_length-1).
Definition: blast_extend.h:82
Uint1 * hit_len_array
Array to hold the lengthof the latest hit.
Definition: blast_extend.h:80
Boolean multiple_hits
Used by BlastExtendWordNew to decide whether or not to prepare the structure for multiple-hit type se...
Definition: blast_extend.h:89
Int4 window
The "window" size, within which two (or more) hits must be found in order to be extended.
Definition: blast_extend.h:87
Int4 offset
"offset" added to query and subject position so that "last_hit" doesn't have to be zeroed out every t...
Definition: blast_extend.h:84
Structure to hold the initial HSP information.
Definition: blast_extend.h:150
BlastUngappedData * ungapped_data
Pointer to a structure holding ungapped alignment information.
Definition: blast_extend.h:153
BlastOffsetPair offsets
Offsets in query and subject, or, in PHI BLAST, start and end of pattern in subject.
Definition: blast_extend.h:151
Structure to hold all initial HSPs for a given subject sequence.
Definition: blast_extend.h:158
Int4 allocated
Available size of the offsets array.
Definition: blast_extend.h:160
Int4 total
Total number of hits currently saved.
Definition: blast_extend.h:159
Boolean do_not_reallocate
Can the init_hsp_array be reallocated?
Definition: blast_extend.h:163
BlastInitHSP * init_hsp_array
Array of offset pairs, possibly with scores.
Definition: blast_extend.h:161
Int4 window_size
Maximal allowed distance between 2 hits in case 2 hits are required to trigger the extension.
Parameter block that contains a pointer to BlastInitialWordOptions and the values derived from it.
BlastInitialWordOptions * options
The original (unparsed) options.
ESeedContainerType container_type
How to store offset pairs for initial seeds?
Structure to hold ungapped alignment information.
Definition: blast_extend.h:142
Int4 score
Score of the ungapped alignment.
Definition: blast_extend.h:146
Int4 length
Length of the ungapped alignment.
Definition: blast_extend.h:145
Int4 q_start
Start of the ungapped alignment in query.
Definition: blast_extend.h:143
Int4 s_start
Start of the ungapped alignment in subject.
Definition: blast_extend.h:144
Structure for keeping initial word extension information.
Definition: blast_extend.h:109
BLAST_DiagHash * hash_table
Hash table and related parameters.
Definition: blast_extend.h:111
BLAST_DiagTable * diag_table
Diagonal array and related parameters.
Definition: blast_extend.h:110
Structure for keeping last hit information for a diagonal in a hash table, when eRight or eRightAndLe...
Definition: blast_extend.h:65
Structure for keeping last hit information for a diagonal.
Definition: blast_extend.h:57
signed int last_hit
Offset of the last hit.
Definition: blast_extend.h:58
unsigned int flag
Reset the next extension?
Definition: blast_extend.h:59
else result
Definition: token2.c:20
Uint4 q_off
Query offset.
Definition: blast_def.h:143
Uint4 s_off
Subject offset.
Definition: blast_def.h:144
struct BlastOffsetPair::@6 qs_offsets
Query/subject offset pair.
voidp malloc(uInt size)
voidp calloc(uInt items, uInt size)
Modified on Tue May 28 05:52:58 2024 by modify_doxy.py rev. 669887