NCBI C++ ToolKit
blast_sw.c
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blast_sw.c 73100 2016-06-20 15:45:40Z boratyng $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Jason Papadopoulos
27  *
28  */
29 
30 /** @file blast_sw.c
31  * Smith-Waterman gapped alignment, for use with infrastructure of BLAST
32  * @sa blast_sw.h
33  */
34 
36 #include <algo/blast/core/blast_util.h> /* for NCBI2NA_UNPACK_BASE */
37 
38 /** swap (pointers to) a pair of sequences */
39 #define SWAP_SEQS(A, B) {const Uint1 *tmp = (A); (A) = (B); (B) = tmp; }
40 
41 /** swap two integers */
42 #define SWAP_INT(A, B) {Int4 tmp = (A); (A) = (B); (B) = tmp; }
43 
44 /** Compute the score of the best local alignment between
45  * two protein sequences. When using Smith-Waterman, the vast
46  * majority of the runtime is tied up in this routine.
47  * @param A The first sequence [in]
48  * @param a_size Length of the first sequence [in]
49  * @param B The second sequence [in]
50  * @param b_size Length of the second sequence [in]
51  * @param gap_open Gap open penalty [in]
52  * @param gap_extend Gap extension penalty [in]
53  * @param gap_align Auxiliary data for gapped alignment
54  * (used for score matrix info) [in]
55  * @return The score of the best local alignment between A and B
56  */
57 static Int4 s_SmithWatermanScoreOnly(const Uint1 *A, Int4 a_size,
58  const Uint1 *B, Int4 b_size,
59  Int4 gap_open, Int4 gap_extend,
60  BlastGapAlignStruct *gap_align)
61 {
62  Int4 i, j;
63  Int4 **matrix;
64  Int4 *matrix_row;
65 
66  Int4 final_best_score;
67  Int4 best_score;
68  Int4 insert_score;
69  Int4 row_score;
70  BlastGapDP *scores;
71 
72  Boolean is_pssm = gap_align->positionBased;
73  Int4 gap_open_extend = gap_open + gap_extend;
74 
75  /* choose the score matrix */
76  if (is_pssm) {
77  matrix = gap_align->sbp->psi_matrix->pssm->data;
78  }
79  else {
80  /* for square score matrices, assume the matrix
81  is symmetric. This means that A and B can be
82  switched without changing the score, and this
83  saves memory if one sequence is large but the
84  other is not */
85  if (a_size < b_size) {
86  SWAP_SEQS(A, B);
87  SWAP_INT(a_size, b_size);
88  }
89  matrix = gap_align->sbp->matrix->data;
90  }
91 
92  /* allocate space for scratch structures */
93  if (b_size + 1 > gap_align->dp_mem_alloc) {
94  gap_align->dp_mem_alloc = MAX(b_size + 100,
95  2 * gap_align->dp_mem_alloc);
96  sfree(gap_align->dp_mem);
97  gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
98  sizeof(BlastGapDP));
99  }
100  scores = gap_align->dp_mem;
101  memset(scores, 0, (b_size + 1) * sizeof(BlastGapDP));
102  final_best_score = 0;
103 
104 
105  for (i = 1; i <= a_size; i++) {
106 
107  if (is_pssm)
108  matrix_row = matrix[i-1];
109  else
110  matrix_row = matrix[A[i-1]];
111 
112  insert_score = 0;
113  row_score = 0;
114 
115  for (j = 1; j <= b_size; j++) {
116 
117  /* score of best alignment ending at (i,j) with gap in B */
118  best_score = scores[j].best_gap - gap_extend;
119  if (scores[j].best - gap_open_extend > best_score)
120  best_score = scores[j].best - gap_open_extend;
121  scores[j].best_gap = best_score;
122 
123  /* score of best alignment ending at (i,j) with gap in A */
124  best_score = insert_score - gap_extend;
125  if (row_score - gap_open_extend > best_score)
126  best_score = row_score - gap_open_extend;
127  insert_score = best_score;
128 
129  /* score of best alignment ending at (i,j) */
130  best_score = MAX(scores[j-1].best + matrix_row[B[j-1]], 0);
131  if (insert_score > best_score)
132  best_score = insert_score;
133  if (scores[j].best_gap > best_score)
134  best_score = scores[j].best_gap;
135 
136  if (best_score > final_best_score)
137  final_best_score = best_score;
138 
139  scores[j-1].best = row_score;
140  row_score = best_score;
141  }
142 
143  scores[j-1].best = row_score;
144  }
145 
146  return final_best_score;
147 }
148 
149 
150 /** Compute the score of the best local alignment between
151  * two nucleotide sequences. One of the sequences must be in
152  * packed format. For nucleotide Smith-Waterman, the vast
153  * majority of the runtime is tied up in this routine.
154  * @param B The first sequence (must be in ncbi2na format) [in]
155  * @param b_size Length of the first sequence [in]
156  * @param A The second sequence [in]
157  * @param a_size Length of the second sequence [in]
158  * @param gap_open Gap open penalty [in]
159  * @param gap_extend Gap extension penalty [in]
160  * @param gap_align Auxiliary data for gapped alignment
161  * (used for score matrix info) [in]
162  * @return The score of the best local alignment between A and B
163  */
164 static Int4 s_NuclSmithWaterman(const Uint1 *B, Int4 b_size,
165  const Uint1 *A, Int4 a_size,
166  Int4 gap_open, Int4 gap_extend,
167  BlastGapAlignStruct *gap_align)
168 {
169  Int4 i, j;
170  Int4 **matrix;
171  Int4 *matrix_row;
172 
173  Int4 final_best_score;
174  Int4 best_score;
175  Int4 insert_score;
176  Int4 row_score;
177  BlastGapDP *scores;
178 
179  Int4 gap_open_extend = gap_open + gap_extend;
180 
181  /* position-specific scoring is not allowed, because
182  the loops below assume the score matrix is symmetric */
183  matrix = gap_align->sbp->matrix->data;
184 
185  if (a_size + 1 > gap_align->dp_mem_alloc) {
186  gap_align->dp_mem_alloc = MAX(a_size + 100,
187  2 * gap_align->dp_mem_alloc);
188  sfree(gap_align->dp_mem);
189  gap_align->dp_mem = (BlastGapDP *)malloc(gap_align->dp_mem_alloc *
190  sizeof(BlastGapDP));
191  }
192  scores = gap_align->dp_mem;
193  memset(scores, 0, (a_size + 1) * sizeof(BlastGapDP));
194  final_best_score = 0;
195 
196 
197  for (i = 1; i <= b_size; i++) {
198 
199  /* The only real difference between this routine
200  and its unpacked counterpart is the choice of
201  score matrix row in the outer loop */
202  Int4 base_pair = NCBI2NA_UNPACK_BASE(B[(i-1)/4], (3-((i-1)%4)));
203  matrix_row = matrix[base_pair];
204  insert_score = 0;
205  row_score = 0;
206 
207  for (j = 1; j <= a_size; j++) {
208 
209  /* score of best alignment ending at (i,j) with gap in A */
210  best_score = scores[j].best_gap - gap_extend;
211  if (scores[j].best - gap_open_extend > best_score)
212  best_score = scores[j].best - gap_open_extend;
213  scores[j].best_gap = best_score;
214 
215  /* score of best alignment ending at (i,j) with gap in B */
216  best_score = insert_score - gap_extend;
217  if (row_score - gap_open_extend > best_score)
218  best_score = row_score - gap_open_extend;
219  insert_score = best_score;
220 
221  /* score of best alignment ending at (i,j) */
222  best_score = MAX(scores[j-1].best + matrix_row[A[j-1]], 0);
223  if (insert_score > best_score)
224  best_score = insert_score;
225  if (scores[j].best_gap > best_score)
226  best_score = scores[j].best_gap;
227 
228  if (best_score > final_best_score)
229  final_best_score = best_score;
230 
231  scores[j-1].best = row_score;
232  row_score = best_score;
233  }
234 
235  scores[j-1].best = row_score;
236  }
237 
238  return final_best_score;
239 }
240 
241 
242 /** Values for the editing script operations in traceback */
243 enum {
244  EDIT_SUB = eGapAlignSub, /**< Substitution */
245  EDIT_GAP_IN_A = eGapAlignDel, /**< Deletion */
246  EDIT_GAP_IN_B = eGapAlignIns, /**< Insertion */
247  EDIT_OP_MASK = 0x07, /**< Mask for edit script operations */
248 
249  EDIT_START_GAP_A = 0x10, /**< open a gap in A */
250  EDIT_START_GAP_B = 0x20 /**< open a gap in B */
251 };
252 
253 
254 /** Generate the traceback for a single local alignment
255  * between two (unpacked) sequences, then create an HSP
256  * for the alignment and add to a list of such HSPs
257  * @param program_number Blast program requesting traceback [in]
258  * @param trace 2-D array of edit actions, size (a_size+1) x (b_size+1) [in]
259  * @param A The first sequence [in]
260  * @param B The second sequence [in]
261  * @param b_size Length of the second sequence [in]
262  * @param gap_open Gap open penalty [in]
263  * @param gap_extend Gap extension penalty [in]
264  * @param gap_align Auxiliary data for gapped alignment
265  * (used for score matrix info) [in]
266  * @param a_end The alignment end offset on A (plus one) [in]
267  * @param b_end The alignment end offset on B (plus one) [in]
268  * @param best_score Score of the alignment [in]
269  * @param hsp_list Collection of alignments found so far [in][out]
270  * @param swapped TRUE if A and B were swapped before the alignment
271  * was found [in]
272  * @param template_hsp Placeholder alignment, used only to
273  * determine contexts and frames [in]
274  * @param score_options Structure containing gap penalties [in]
275  * @param hit_options Structure used for percent identity calculation [in]
276  * @param start_shift Bias to be applied to subject offsets [in]
277  */
278 static void s_GetTraceback(EBlastProgramType program_number,
279  Uint1 *trace, const Uint1 *A, const Uint1 *B, Int4 b_size,
280  Int4 gap_open, Int4 gap_extend,
281  BlastGapAlignStruct *gap_align,
282  Int4 a_end, Int4 b_end, Int4 best_score,
283  BlastHSPList *hsp_list, Boolean swapped,
284  BlastHSP *template_hsp,
285  const BlastScoringOptions *score_options,
286  const BlastHitSavingOptions *hit_options,
287  Int4 start_shift)
288 {
289  Int4 i, j;
290  Uint1 script;
291  Uint1 next_action;
292  Uint1 *traceback_row;
293  Int4 a_start, b_start;
294  Int4 **matrix;
295  Int4 curr_score = -best_score;
296  Boolean is_pssm = gap_align->positionBased;
297  GapPrelimEditBlock *prelim_tback = gap_align->fwd_prelim_tback;
298  GapEditScript *final_tback;
299  BlastHSP *new_hsp;
300 
301  i = a_end;
302  j = b_end;
303  traceback_row = trace + i * (b_size + 1);
304  script = traceback_row[j] & EDIT_OP_MASK;
305  GapPrelimEditBlockReset(prelim_tback);
306 
307  if (is_pssm)
308  matrix = gap_align->sbp->psi_matrix->pssm->data;
309  else
310  matrix = gap_align->sbp->matrix->data;
311 
312  /* Only the start point of the alignment is unknown, but
313  there was no bookkeeping performed to remember where the
314  start point is located. We know the list of traceback
315  actions backwards from (i,j) and know the score of the
316  alignment, so the start point will be the offset pair where
317  the score becomes zero after edit operations are applied */
318 
319  while (curr_score != 0) {
320 
321  next_action = traceback_row[j];
322  GapPrelimEditBlockAdd(prelim_tback, (EGapAlignOpType)script, 1);
323 
324  switch(script) {
325  case EDIT_SUB:
326  if (is_pssm)
327  curr_score += matrix[i-1][B[j-1]];
328  else
329  curr_score += matrix[A[i-1]][B[j-1]];
330 
331  i--; j--;
332  traceback_row -= b_size + 1;
333  script = traceback_row[j] & EDIT_OP_MASK;
334  break;
335  case EDIT_GAP_IN_A:
336  j--;
337  if (next_action & EDIT_START_GAP_A) {
338  script = traceback_row[j] & EDIT_OP_MASK;
339  curr_score -= gap_open;
340  }
341  curr_score -= gap_extend;
342  break;
343  case EDIT_GAP_IN_B:
344  i--;
345  traceback_row -= b_size + 1;
346  if (next_action & EDIT_START_GAP_B) {
347  script = traceback_row[j] & EDIT_OP_MASK;
348  curr_score -= gap_open;
349  }
350  curr_score -= gap_extend;
351  break;
352  }
353  }
354 
355  /* found the start point. Now create the final (reversed)
356  edit script; if A and B were swapped before calling
357  this routine, swap them back */
358  a_start = i;
359  b_start = j;
360  final_tback = GapEditScriptNew(prelim_tback->num_ops);
361  for (i = prelim_tback->num_ops - 1, j = 0; i >= 0; i--, j++) {
362  GapPrelimEditScript *p = prelim_tback->edit_ops + i;
363  final_tback->num[j] = p->num;
364  final_tback->op_type[j] = p->op_type;
365  if (swapped) {
366  if (p->op_type == eGapAlignIns)
367  final_tback->op_type[j] = eGapAlignDel;
368  else if (p->op_type == eGapAlignDel)
369  final_tback->op_type[j] = eGapAlignIns;
370  }
371  }
372 
373  if (swapped) {
374  SWAP_SEQS(A, B);
375  SWAP_INT(a_start, b_start);
376  SWAP_INT(a_end, b_end);
377  }
378 
379  /* construct an HSP, verify it meets length and percent
380  identity criteria, and save it */
381  Blast_HSPInit(a_start, a_end, b_start, b_end,
382  a_start, b_start, template_hsp->context,
383  template_hsp->query.frame,
384  template_hsp->subject.frame, best_score,
385  &final_tback, &new_hsp);
386 
387  if (Blast_HSPTestIdentityAndLength(program_number, new_hsp, A, B,
388  score_options, hit_options)) {
389  Blast_HSPFree(new_hsp);
390  }
391  else {
392  Blast_HSPAdjustSubjectOffset(new_hsp, start_shift);
393  Blast_HSPListSaveHSP(hsp_list, new_hsp);
394  }
395 }
396 
397 
398 /** Auxiliary structures for Smith-Waterman alignment
399  * with traceback
400  */
401 typedef struct BlastGapSW{
402  Int4 best; /**< Score of best alignment at this position */
403  Int4 best_gap; /**< Score of best alignment at this position that
404  ends in a gap */
405  Int4 path_score; /**< The highest score that the alignment at this
406  position has previously achieved */
407  Int4 path_stop_i; /**< Offset (plus one) on the first sequence where
408  path_score occurs */
409  Int4 path_stop_j; /**< Offset (plus one) on the second sequence where
410  path_score occurs */
412 
413 /* See blast_sw.h for details */
415  const Uint1 *A, Int4 a_size,
416  const Uint1 *B, Int4 b_size,
417  BlastHSP *template_hsp,
418  BlastHSPList *hsp_list,
419  const BlastScoringParameters *score_params,
420  const BlastHitSavingParameters *hit_params,
421  BlastGapAlignStruct *gap_align,
422  Int4 start_shift, Int4 cutoff)
423 {
424  Int4 i, j;
425  Int4 *matrix_row;
426  Int4 **matrix;
427  Boolean swapped = FALSE;
428 
429  Int4 best_score;
430  Int4 insert_score;
431  Int4 row_score;
432  Int4 row_path_score;
433  Int4 row_path_stop_i;
434  Int4 row_path_stop_j;
435  Int4 new_path_score;
436  Int4 new_path_stop_i;
437  Int4 new_path_stop_j;
438  BlastGapSW *scores;
439 
440  Boolean is_pssm = gap_align->positionBased;
441  Int4 gap_open = score_params->gap_open;
442  Int4 gap_extend = score_params->gap_extend;
443  Int4 gap_open_extend = gap_open + gap_extend;
444 
445  Uint1 *traceback_array;
446  Uint1 *traceback_row;
447  Uint1 script;
448 
449  /* choose the score matrix */
450  if (is_pssm) {
451  matrix = gap_align->sbp->psi_matrix->pssm->data;
452  }
453  else {
454  /* for square score matrices, assume the matrix
455  is symmetric. This means that A and B can be
456  switched without changing the score, and this
457  saves memory if one sequence is large but the
458  other is not */
459  if (a_size < b_size) {
460  swapped = TRUE;
461  SWAP_SEQS(A, B);
462  SWAP_INT(a_size, b_size);
463  }
464  matrix = gap_align->sbp->matrix->data;
465  }
466 
467  /* allocate space for scratch structures */
468  scores = (BlastGapSW *)calloc(b_size + 1, sizeof(BlastGapSW));
469  traceback_array = (Uint1 *)malloc((a_size + 1) * (b_size + 1) *
470  sizeof(Uint1));
471  traceback_row = traceback_array;
472  for (i = 0; i <= b_size; i++)
473  traceback_row[i] = EDIT_GAP_IN_A;
474  traceback_row += b_size + 1;
475 
476  for (i = 1; i <= a_size; i++) {
477 
478  if (is_pssm)
479  matrix_row = matrix[i-1];
480  else
481  matrix_row = matrix[A[i-1]];
482 
483  insert_score = 0;
484  row_score = 0;
485  row_path_stop_i = 0;
486  row_path_stop_j = 0;
487  row_path_score = 0;
488  traceback_row[0] = EDIT_GAP_IN_B;
489 
490  for (j = 1; j <= b_size; j++) {
491 
492  /* score of best alignment ending at (i,j) with gap in B */
493  best_score = scores[j].best_gap - gap_extend;
494  script = 0;
495  if (scores[j].best - gap_open_extend > best_score) {
496  script |= EDIT_START_GAP_B;
497  best_score = scores[j].best - gap_open_extend;
498  }
499  scores[j].best_gap = best_score;
500 
501  /* score of best alignment ending at (i,j) with gap in A */
502  best_score = insert_score - gap_extend;
503  if (row_score - gap_open_extend > best_score) {
504  script |= EDIT_START_GAP_A;
505  best_score = row_score - gap_open_extend;
506  }
507  insert_score = best_score;
508 
509  /* Every cell computed by Smith-Waterman lies on exactly
510  one highest-scoring path. This means that at any given
511  time there are only b_size possible paths that need
512  bookkeeping information. In addition to the 3-way
513  maximum needed to choose the highest score, we also
514  need to choose the path that (i,j) extends. Begin by
515  assuming (i,j) extends the substitution path */
516  best_score = MAX(scores[j-1].best + matrix_row[B[j-1]], 0);
517  traceback_row[j] = script | EDIT_SUB;
518  new_path_score = scores[j-1].path_score;
519  new_path_stop_i = scores[j-1].path_stop_i;
520  new_path_stop_j = scores[j-1].path_stop_j;
521 
522  /* switch to the insertion or deletion path, if one
523  of these has the highest overall score */
524  if (insert_score > best_score) {
525  best_score = insert_score;
526  traceback_row[j] = script | EDIT_GAP_IN_A;
527 
528  new_path_score = row_path_score;
529  new_path_stop_i = row_path_stop_i;
530  new_path_stop_j = row_path_stop_j;
531  }
532  if (scores[j].best_gap >= best_score) {
533  best_score = scores[j].best_gap;
534  traceback_row[j] = script | EDIT_GAP_IN_B;
535 
536  new_path_score = scores[j].path_score;
537  new_path_stop_i = scores[j].path_stop_i;
538  new_path_stop_j = scores[j].path_stop_j;
539  }
540 
541  if (best_score == 0) {
542  /* the score of the path extended by (i,j) has
543  decayed to zero, meaning this path can never
544  be extended again. Before proceeding to the
545  next cell and forgetting about this path, check
546  whether the highest score previously achieved
547  by this path exceeds the cutoff for the overall
548  search, and if so then recover the alignment for
549  the current path right now */
550  if (new_path_score >= cutoff) {
551  s_GetTraceback(program_number, traceback_array,
552  A, B, b_size,
553  gap_open, gap_extend,
554  gap_align, new_path_stop_i,
555  new_path_stop_j, new_path_score,
556  hsp_list, swapped, template_hsp,
557  score_params->options,
558  hit_params->options, start_shift);
559  }
560  new_path_score = 0;
561  }
562 
563  /* check if (i,j) is a new local maximum score
564  for this path */
565  if (best_score > new_path_score) {
566  new_path_score = best_score;
567  new_path_stop_i = i;
568  new_path_stop_j = j;
569  }
570 
571  /* save path information */
572  scores[j-1].best = row_score;
573  scores[j-1].path_score = row_path_score;
574  scores[j-1].path_stop_i = row_path_stop_i;
575  scores[j-1].path_stop_j = row_path_stop_j;
576 
577  row_score = best_score;
578  row_path_score = new_path_score;
579  row_path_stop_i = new_path_stop_i;
580  row_path_stop_j = new_path_stop_j;
581  }
582 
583  /* save path information for the last cell in row i */
584  scores[j-1].best = row_score;
585  scores[j-1].path_score = row_path_score;
586  scores[j-1].path_stop_i = row_path_stop_i;
587  scores[j-1].path_stop_j = row_path_stop_j;
588 
589  /* the last cell may not have decayed to zero, but the
590  best score of the path containing it can still exceed
591  the cutoff. Recover the alignment if this is the case;
592  it is the last chance to do so */
593  if (scores[j-1].path_score >= cutoff) {
594  s_GetTraceback(program_number, traceback_array,
595  A, B, b_size,
596  gap_open, gap_extend,
597  gap_align, scores[j-1].path_stop_i,
598  scores[j-1].path_stop_j,
599  scores[j-1].path_score,
600  hsp_list, swapped, template_hsp,
601  score_params->options,
602  hit_params->options, start_shift);
603  }
604  traceback_row += b_size + 1;
605  }
606 
607  /* finally, check the last row again for paths that
608  have not decayed to zero */
609  for (i = 0; i < b_size; i++) {
610  if (scores[i].best && scores[i].path_score >= cutoff) {
611  s_GetTraceback(program_number, traceback_array,
612  A, B, b_size,
613  gap_open, gap_extend,
614  gap_align, scores[i].path_stop_i,
615  scores[i].path_stop_j,
616  scores[i].path_score,
617  hsp_list, swapped, template_hsp,
618  score_params->options,
619  hit_params->options, start_shift);
620  }
621  }
622 
623  free(scores);
624  free(traceback_array);
625 }
626 
627 
628 /* See blast_sw.h for details */
630  BLAST_SequenceBlk* query, BlastQueryInfo* query_info,
632  BlastGapAlignStruct* gap_align,
633  const BlastScoringParameters* score_params,
634  const BlastExtensionParameters* ext_params,
635  const BlastHitSavingParameters* hit_params,
636  BlastInitHitList* init_hitlist,
637  BlastHSPList** hsp_list_ptr, BlastGappedStats* gapped_stats,
638  Boolean * fence_hit)
639 {
640  Boolean is_prot;
641  BlastHSPList* hsp_list = NULL;
642  const BlastHitSavingOptions* hit_options = hit_params->options;
643  Int4 cutoff_score = 0;
644  Int4 context;
645  Int4 **rpsblast_pssms = NULL; /* Pointer to concatenated PSSMs in
646  RPS-BLAST database */
647  const int kHspNumMax = BlastHspNumMax(TRUE, hit_options);
648 
649  if (!query || !subject || !gap_align || !score_params || !ext_params ||
650  !hit_params || !init_hitlist || !hsp_list_ptr)
651  return 1;
652 
653  is_prot = (program_number != eBlastTypeBlastn &&
654  program_number != eBlastTypePhiBlastn &&
655  program_number != eBlastTypeMapping);
656 
657  if (Blast_ProgramIsRpsBlast(program_number)) {
658  Int4 rps_context = subject->oid;
659  rpsblast_pssms = gap_align->sbp->psi_matrix->pssm->data;
660  if (program_number == eBlastTypeRpsTblastn) {
661  rps_context = rps_context * NUM_FRAMES +
662  BLAST_FrameToContext(subject->frame, program_number);
663  }
664  /* only one cutoff applies to an RPS search */
665  cutoff_score = hit_params->cutoffs[rps_context].cutoff_score;
666  }
667 
668  if (*hsp_list_ptr == NULL)
669  *hsp_list_ptr = hsp_list = Blast_HSPListNew(kHspNumMax);
670  else
671  hsp_list = *hsp_list_ptr;
672 
673  /* ignore any initial ungapped alignments; just search
674  all contexts against the subject sequence, one context
675  at a time */
676  for (context = query_info->first_context;
677  context <= query_info->last_context; context++) {
678 
679  BlastContextInfo *curr_ctx = query_info->contexts + context;
680  Int4 score;
681  BlastHSP* new_hsp;
682 
683  if (!curr_ctx->is_valid)
684  continue;
685 
686  if (rpsblast_pssms) {
687  gap_align->sbp->psi_matrix->pssm->data =
688  rpsblast_pssms + curr_ctx->query_offset;
689  }
690  else {
691  cutoff_score = hit_params->cutoffs[context].cutoff_score;
692  }
693 
694  if (is_prot) {
695  score = s_SmithWatermanScoreOnly(
696  query->sequence + curr_ctx->query_offset,
697  curr_ctx->query_length,
698  subject->sequence,
699  subject->length,
700  score_params->gap_open,
701  score_params->gap_extend,
702  gap_align);
703  }
704  else {
705  score = s_NuclSmithWaterman(subject->sequence,
706  subject->length,
707  query->sequence + curr_ctx->query_offset,
708  curr_ctx->query_length,
709  score_params->gap_open,
710  score_params->gap_extend,
711  gap_align);
712  }
713 
714  if (score >= cutoff_score) {
715  /* we know the score of the highest-scoring alignment but
716  not its boundaries. Use a single placeholder HSP to carry
717  score, context and frame information to the traceback phase */
718  Blast_HSPInit(0, curr_ctx->query_length, 0, subject->length, 0, 0,
719  context, curr_ctx->frame, subject->frame, score,
720  NULL, &new_hsp);
721  Blast_HSPListSaveHSP(hsp_list, new_hsp);
722  }
723  }
724 
725  if (rpsblast_pssms)
726  gap_align->sbp->psi_matrix->pssm->data = rpsblast_pssms;
727 
728  *hsp_list_ptr = hsp_list;
729  return 0;
730 }
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
Definition: blast_def.h:112
#define NUM_FRAMES
Number of frames to which we translate in translating searches.
Definition: blast_def.h:88
Int2 Blast_HSPInit(Int4 query_start, Int4 query_end, Int4 subject_start, Int4 subject_end, Int4 query_gapped_start, Int4 subject_gapped_start, Int4 query_context, Int2 query_frame, Int2 subject_frame, Int4 score, GapEditScript **gap_edit, BlastHSP **ret_hsp)
Allocates BlastHSP and inits with information from input.
Definition: blast_hits.c:151
Int4 BlastHspNumMax(Boolean gapped_calculation, const BlastHitSavingOptions *options)
Calculated the number of HSPs that should be saved.
Definition: blast_hits.c:213
Boolean Blast_HSPTestIdentityAndLength(EBlastProgramType program_number, BlastHSP *hsp, const Uint1 *query, const Uint1 *subject, const BlastScoringOptions *score_options, const BlastHitSavingOptions *hit_options)
Calculates number of identities and alignment lengths of an HSP via Blast_HSPGetNumIdentities and det...
Definition: blast_hits.c:1004
BlastHSPList * Blast_HSPListNew(Int4 hsp_max)
Creates HSP list structure with a default size HSP array.
Definition: blast_hits.c:1558
BlastHSP * Blast_HSPFree(BlastHSP *hsp)
Deallocate memory for an HSP structure.
Definition: blast_hits.c:130
Int2 Blast_HSPListSaveHSP(BlastHSPList *hsp_list, BlastHSP *hsp)
Saves HSP information into a BlastHSPList structure.
Definition: blast_hits.c:1754
void Blast_HSPAdjustSubjectOffset(BlastHSP *hsp, Int4 start_shift)
Adjusts offsets if partial sequence was used for extension.
Definition: blast_hits.c:1316
Boolean Blast_ProgramIsRpsBlast(EBlastProgramType p)
Returns true if program is RPS-BLAST (i.e.
Definition: blast_program.c:73
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm.
Definition: blast_program.h:72
@ eBlastTypeBlastn
Definition: blast_program.h:74
@ eBlastTypeRpsTblastn
Definition: blast_program.h:85
@ eBlastTypePhiBlastn
Definition: blast_program.h:87
@ eBlastTypeMapping
Definition: blast_program.h:88
static Int4 s_NuclSmithWaterman(const Uint1 *B, Int4 b_size, const Uint1 *A, Int4 a_size, Int4 gap_open, Int4 gap_extend, BlastGapAlignStruct *gap_align)
Compute the score of the best local alignment between two nucleotide sequences.
Definition: blast_sw.c:164
static Int4 s_SmithWatermanScoreOnly(const Uint1 *A, Int4 a_size, const Uint1 *B, Int4 b_size, Int4 gap_open, Int4 gap_extend, BlastGapAlignStruct *gap_align)
Compute the score of the best local alignment between two protein sequences.
Definition: blast_sw.c:57
@ EDIT_START_GAP_B
open a gap in B
Definition: blast_sw.c:250
@ EDIT_GAP_IN_B
Insertion.
Definition: blast_sw.c:246
@ EDIT_GAP_IN_A
Deletion.
Definition: blast_sw.c:245
@ EDIT_START_GAP_A
open a gap in A
Definition: blast_sw.c:249
@ EDIT_SUB
Substitution.
Definition: blast_sw.c:244
@ EDIT_OP_MASK
Mask for edit script operations.
Definition: blast_sw.c:247
#define SWAP_INT(A, B)
swap two integers
Definition: blast_sw.c:42
#define SWAP_SEQS(A, B)
swap (pointers to) a pair of sequences
Definition: blast_sw.c:39
struct BlastGapSW BlastGapSW
Auxiliary structures for Smith-Waterman alignment with traceback.
static void s_GetTraceback(EBlastProgramType program_number, Uint1 *trace, const Uint1 *A, const Uint1 *B, Int4 b_size, Int4 gap_open, Int4 gap_extend, BlastGapAlignStruct *gap_align, Int4 a_end, Int4 b_end, Int4 best_score, BlastHSPList *hsp_list, Boolean swapped, BlastHSP *template_hsp, const BlastScoringOptions *score_options, const BlastHitSavingOptions *hit_options, Int4 start_shift)
Generate the traceback for a single local alignment between two (unpacked) sequences,...
Definition: blast_sw.c:278
Smith-Waterman alignment for use within the infrastructure of BLAST.
Various auxiliary BLAST utility functions.
Int4 BLAST_FrameToContext(Int2 frame, EBlastProgramType program)
Convert translation frame or strand into a context number suitable for indexing into the BlastQueryIn...
Definition: blast_util.c:1211
#define NCBI2NA_UNPACK_BASE(x, N)
Macro to extract base N from a byte x (N >= 0, N < 4)
Definition: blast_util.h:55
void GapPrelimEditBlockAdd(GapPrelimEditBlock *edit_block, EGapAlignOpType op_type, Int4 num_ops)
Add a new operation to a preliminary edit block, possibly combining it with the last operation if the...
Definition: gapinfo.c:175
EGapAlignOpType
Operation types within the edit script.
Definition: gapinfo.h:44
@ eGapAlignIns
Insertion: a gap in subject.
Definition: gapinfo.h:51
@ eGapAlignSub
Substitution.
Definition: gapinfo.h:48
@ eGapAlignDel
Deletion: a gap in query.
Definition: gapinfo.h:45
GapEditScript * GapEditScriptNew(Int4 size)
Initialize the edit script structure.
Definition: gapinfo.c:56
void GapPrelimEditBlockReset(GapPrelimEditBlock *edit_block)
Reset a preliminary edit block without freeing it.
Definition: gapinfo.c:213
static bool trace
void SmithWatermanScoreWithTraceback(EBlastProgramType program_number, const Uint1 *A, Int4 a_size, const Uint1 *B, Int4 b_size, BlastHSP *template_hsp, BlastHSPList *hsp_list, const BlastScoringParameters *score_params, const BlastHitSavingParameters *hit_params, BlastGapAlignStruct *gap_align, Int4 start_shift, Int4 cutoff)
Find all local alignments between two (unpacked) sequences, using the Smith-Waterman algorithm,...
Definition: blast_sw.c:414
Int2 BLAST_SmithWatermanGetGappedScore(EBlastProgramType program_number, BLAST_SequenceBlk *query, BlastQueryInfo *query_info, BLAST_SequenceBlk *subject, BlastGapAlignStruct *gap_align, const BlastScoringParameters *score_params, const BlastExtensionParameters *ext_params, const BlastHitSavingParameters *hit_params, BlastInitHitList *init_hitlist, BlastHSPList **hsp_list_ptr, BlastGappedStats *gapped_stats, Boolean *fence_hit)
Performs score-only Smith-Waterman gapped alignment of the subject sequence with all contexts in the ...
Definition: blast_sw.c:629
#define NULL
Definition: ncbistd.hpp:225
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
int i
Uint1 Boolean
bool replacment for C
Definition: ncbi_std.h:94
#define TRUE
bool replacment for C indicating true.
Definition: ncbi_std.h:97
#define FALSE
bool replacment for C indicating false.
Definition: ncbi_std.h:101
#define MAX(a, b)
returns larger of a and b.
Definition: ncbi_std.h:117
Structure to hold a sequence.
Definition: blast_def.h:242
The context related information.
Int4 query_length
Length of this query, strand or frame.
Boolean is_valid
Determine if this context is valid or not.
Int4 query_offset
Offset of this query, strand or frame in the concatenated super-query.
Int1 frame
Frame number (-1, -2, -3, 0, 1, 2, or 3)
Computed values used as parameters for gapped alignments.
Structure supporting the gapped alignment.
GapPrelimEditBlock * fwd_prelim_tback
traceback from right extensions
Boolean positionBased
Is this PSI-BLAST?
Int4 dp_mem_alloc
current number of structures allocated
BlastScoreBlk * sbp
Pointer to the scoring information block.
BlastGapDP * dp_mem
scratch structures for dynamic programming
Auxiliary structure for dynamic programming gapped extension.
Int4 best_gap
score of best path that ends in a gap at this position
Int4 best
score of best path that ends in a match at this position
Auxiliary structures for Smith-Waterman alignment with traceback.
Definition: blast_sw.c:401
Int4 path_stop_j
Offset (plus one) on the second sequence where path_score occurs.
Definition: blast_sw.c:409
Int4 path_score
The highest score that the alignment at this position has previously achieved.
Definition: blast_sw.c:405
Int4 best_gap
Score of best alignment at this position that ends in a gap.
Definition: blast_sw.c:403
Int4 path_stop_i
Offset (plus one) on the first sequence where path_score occurs.
Definition: blast_sw.c:407
Int4 best
Score of best alignment at this position.
Definition: blast_sw.c:402
Int4 cutoff_score
Raw cutoff score corresponding to the e-value provided by the user if no sum stats,...
Structure containing hit counts from the gapped stage of a BLAST search.
The structure to hold all HSPs for a given sequence after the gapped alignment.
Definition: blast_hits.h:153
Structure holding all information about an HSP.
Definition: blast_hits.h:126
BlastSeg query
Query sequence info.
Definition: blast_hits.h:131
Int4 context
Context number of query.
Definition: blast_hits.h:133
BlastSeg subject
Subject sequence info.
Definition: blast_hits.h:132
Options used when evaluating and saving hits These include: a.
Parameter block that contains a pointer to BlastHitSavingOptions and the values derived from it.
BlastGappedCutoffs * cutoffs
per-context gapped cutoff information
BlastHitSavingOptions * options
The original (unparsed) options.
Structure to hold all initial HSPs for a given subject sequence.
Definition: blast_extend.h:158
The query related information.
Int4 first_context
Index of the first element of the context array.
BlastContextInfo * contexts
Information per context.
SPsiBlastScoreMatrix * psi_matrix
PSSM and associated data.
Definition: blast_stat.h:186
SBlastScoreMatrix * matrix
scoring matrix data
Definition: blast_stat.h:185
Scoring options block Used to produce the BlastScoreBlk structure This structure may be needed for lo...
Scoring parameters block Contains scoring-related information that is actually used for the blast sea...
Int4 gap_extend
Penalty for each gap residue (scaled version)
Int4 gap_open
Extra penalty for starting a gap (scaled version)
BlastScoringOptions * options
User-provided values for these params.
Int2 frame
Translation frame.
Definition: blast_hits.h:97
Edit script: linked list of correspondencies between two sequences.
Definition: gapinfo.h:57
Int4 * num
Array of number of operations.
Definition: gapinfo.h:59
EGapAlignOpType * op_type
Array of type of operation.
Definition: gapinfo.h:58
Preliminary version of GapEditBlock, used directly by the low- level dynamic programming routines.
Definition: gapinfo.h:73
GapPrelimEditScript * edit_ops
array of edit operations
Definition: gapinfo.h:74
Int4 num_ops
number of edit ops presently in use
Definition: gapinfo.h:76
A version of GapEditScript used to store initial results from the gapped alignment routines.
Definition: gapinfo.h:65
Int4 num
Number of operations.
Definition: gapinfo.h:67
EGapAlignOpType op_type
Type of operation.
Definition: gapinfo.h:66
int ** data
actual scoring matrix data, stored in row-major form
Definition: blast_stat.h:140
SBlastScoreMatrix * pssm
position-specific score matrix
Definition: blast_stat.h:150
static string subject
static string query
static CS_CONTEXT * context
Definition: will_convert.c:21
void free(voidpf ptr)
voidp malloc(uInt size)
voidp calloc(uInt items, uInt size)
Modified on Thu May 23 12:30:31 2024 by modify_doxy.py rev. 669887