NCBI C++ ToolKit
jumper.c
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: jumper.c 100164 2023-06-28 13:36:01Z merezhuk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Greg Boratyn
27  *
28  * Implementaion of Jumper alignment
29  *
30  */
31 
32 #include "blast_gapalign_priv.h"
33 
37 
38 #include "jumper.h"
39 
40 
42  {1, 1, 9, 0}, /* this was added for illumina */
43  {1, 0, 10, 0}, /* insert in 1 */
44  {0, 1, 10, 0}, /* deletion in 1 */
45  {2, 0, 10, 0},
46  {0, 2, 10, 0},
47  {3, 0, 13, 0},
48  {0, 3, 13, 0},
49  {2, 2, 12, 0}, /* try double mismatch */
50  {1, 0, 10, 2},
51  {0, 1, 10, 2},
52  {2, 0, 10, 2},
53  {0, 2, 10, 2},
54  {3, 0, 13, 2},
55  {0, 3, 13, 2},
56 /* removed for illumina
57  {4, 0, 15, 3},
58  {0, 4, 15, 3},
59  {5, 0, 15, 3},
60  {0, 5, 15, 3},
61  {6, 0, 15, 3},
62  {0, 6, 15, 3},
63  {7, 0, 15, 3},
64  {0, 7, 15, 3},
65  {8, 0, 15, 3},
66  {0, 8, 15, 3},
67  {9, 0, 15, 3},
68  {0, 9, 15, 3},
69  {10, 0, 15, 3},
70  {0, 10, 15, 3},
71 */
72  {1, 1, 0, 0} /* default is punctual */
73 } ;
74 
76  {1, 0, 6, 0},
77  {0, 1, 6, 0},
78  {1, 0, 2, 0},
79  {0, 1, 2, 0},
80  {1, 1, 0, 0}
81 };
82 
83 
85  {1, 1, 0, 0} /* default is punctual */
86 };
87 
88 
89 #define GET_BASE(seq, pos, compressed) (compressed ? NCBI2NA_UNPACK_BASE(seq[pos >> 2], 3 - (pos & 3)) : seq[pos])
90 
91 
92 #define UNPACK_BASE_OLD(seq, pos) ((((seq)[(pos) >> 2] << (2 * ((pos) & 3))) & 0xC0) >> 6)
93 
94 
95 #define UNPACK_BASE(seq, pos) (NCBI2NA_UNPACK_BASE((seq)[(pos) / 4], 3 - ((pos) & 3)))
96 
97 
98 #define JUMPER_EDIT_BLOCK_ADD(block, op) ((block)->edit_ops[(block)->num_ops++] = op)
99 
100 
101 /* convert jumper edit op type to BLAST op type */
102 #define JOP_TO_OP(op) (op >= 0 ? eGapAlignSub : (op == JUMPER_INSERTION ? eGapAlignIns : eGapAlignDel))
103 
104 /* get number of ops */
105 #define JOP_TO_NUM(op) (op > 0 ? op : 1)
106 
107 
109 {
110  JumperPrelimEditBlock* retval = calloc(1, sizeof(JumperPrelimEditBlock));
111  if (!retval) {
112  return NULL;
113  }
114 
115  retval->edit_ops = calloc(size, sizeof(JumperOpType));
116  if (!retval->edit_ops) {
117  free(retval);
118  return NULL;
119  }
120 
121  retval->num_allocated = size;
122  return retval;
123 }
124 
126  JumperPrelimEditBlock* block)
127 {
128  if (block) {
129  if (block->edit_ops) {
130  free(block->edit_ops);
131  }
132 
133  free(block);
134  }
135 
136  return NULL;
137 }
138 
140 {
141  if (block->num_ops >= block->num_allocated) {
142  block->edit_ops = realloc(block->edit_ops,
143  block->num_allocated * sizeof(JumperOpType) * 2);
144 
145  if (!block->edit_ops) {
146  return -1;
147  }
148  block->num_allocated *= 2;
149  }
150 
151  if (block->num_ops > 0 && block->edit_ops[block->num_ops - 1] > 0 &&
152  op > 0) {
153 
154  block->edit_ops[block->num_ops - 1] += op;
155  }
156  else {
157  block->edit_ops[block->num_ops++] = op;
158  }
159 
160  return 0;
161 }
162 
163 
164 static void s_CreateTable(Uint4* table)
165 {
166  int i, k;
167  for (i = 0; i < 256; i++) {
168  table[i] = 0;
169  for (k = 0;k < 4; k++) {
170  Uint4 cell = ((i >> (2 * k)) & 3);
171  switch (k) {
172  case 0: table[i] += cell << 3 * 8; break;
173  case 1: table[i] += cell << 2 * 8; break;
174  case 2: table[i] += cell << 1 * 8; break;
175  case 3: table[i] += cell; break;
176  default: ASSERT(0);
177  }
178  }
179  }
180 }
181 
182 
184 {
185  if (!jgap_align) {
186  return NULL;
187  }
188 
191  if (jgap_align->table) {
192  free(jgap_align->table);
193  }
194  sfree(jgap_align);
195  return NULL;
196 }
197 
198 
200 {
201  JumperGapAlign* retval = (JumperGapAlign*)calloc(1, sizeof(JumperGapAlign));
202  if (!retval) {
203  return NULL;
204  }
205 
207  if (!retval->left_prelim_block) {
208  JumperGapAlignFree(retval);
209  return NULL;
210  }
211 
213  if (!retval->right_prelim_block) {
214  JumperGapAlignFree(retval);
215  return NULL;
216  }
217 
218  retval->table = calloc(256, sizeof(Uint4));
219  if (!retval->table) {
220  JumperGapAlignFree(retval);
221  return NULL;
222  }
223  s_CreateTable(retval->table);
224 
225  return retval;
226 }
227 
228 /* Reset prelim edit blocks so that they can be used for a new alignment */
230  JumperPrelimEditBlock* right)
231 {
232  if (!left || !right) {
233  return;
234  }
235 
236  left->num_ops = 0;
237  right->num_ops = 0;
238 }
239 
240 
241 /* Get query and subject position at the edit_index element of the edit script.
242  The jumper edit script must be going forwadr (from right extension) and
243  *query_pos and *subject_pos must be initialized to start positions of
244  the HSP. The resulting positions will be in *query_pos and *subject_pos. */
245 static int s_GetSeqPositions(JumperPrelimEditBlock* edit_script,
246  Int4 edit_index,
247  Int4* query_pos, Int4* subject_pos)
248 {
249  Int4 j;
250 
251  ASSERT(edit_script);
252  ASSERT(query_pos);
253  ASSERT(subject_pos);
254  ASSERT(edit_index < edit_script->num_ops);
255 
256  if (!edit_script || !query_pos || !subject_pos) {
257  return -1;
258  }
259 
260  for (j = 0;j < edit_index;j++) {
261  ASSERT(j < edit_script->num_ops);
262 
263  if (edit_script->edit_ops[j] == JUMPER_MISMATCH) {
264  (*query_pos)++;
265  (*subject_pos)++;
266  }
267  else if (edit_script->edit_ops[j] == JUMPER_INSERTION) {
268  (*query_pos)++;
269  }
270  else if (edit_script->edit_ops[j] == JUMPER_DELETION) {
271  (*subject_pos)++;
272  }
273  else {
274  *query_pos += edit_script->edit_ops[j];
275  *subject_pos += edit_script->edit_ops[j];
276  }
277  }
278 
279  return 0;
280 }
281 
282 
283 /* Shift gaps to the right using the forward edit script (right extension).
284  Alignment score and number of matches may improve if input the alignment
285  is not optimal */
286 static int s_ShiftGapsRight(JumperPrelimEditBlock* edit_script,
287  const Uint1* query, const Uint1* subject,
288  Int4 query_offset, Int4 subject_offset,
289  Int4 query_length, Int4 subject_length,
290  Int4* score, Int4 err_score, Int4* num_identical)
291 {
292  Int4 i;
293  Int4 q_pos;
294  Int4 s_pos;
295 
296  ASSERT(score && num_identical);
297 
298  /* iterate over edit script elements */
299  for (i = 0;i < edit_script->num_ops;i++) {
300 
301  /* if insertion or deltion */
302  if (edit_script->edit_ops[i] < 0) {
303 
304  /* find the end of the gap; gap start will be marked with i and
305  k will be right after gap end */
306  Int4 k = i + 1;
307  while (k < edit_script->num_ops &&
308  edit_script->edit_ops[k] == edit_script->edit_ops[i]) {
309  k++;
310  }
311 
312  /* iterate over the next script elements and see if any can switch
313  sides with the indel */
314  for (;k < edit_script->num_ops;k++) {
315  ASSERT(edit_script->edit_ops[i] < 0);
316 
317  /* if a deletion follows insertion (or the other way around),
318  change it into a mismatch, it is one error instead of two */
319  if (edit_script->edit_ops[k] < 0 &&
320  edit_script->edit_ops[k] != edit_script->edit_ops[i]) {
321  Int4 j;
322 
323  /* remove one indel */
324  for (j = k;j < edit_script->num_ops - 1;j++) {
325  edit_script->edit_ops[j] =
326  edit_script->edit_ops[j + 1];
327  }
328  edit_script->num_ops--;
329  /* change the leftmost indel to a mismatch */
330  edit_script->edit_ops[i] = JUMPER_MISMATCH;
331  /* update poiner to the beginning of the gap */
332  i++;
333 
334  /* update score */
335  (*score) -= err_score;
336 
337  /* check if the new mismatch is a match and update
338  scores and identity if it is */
339  q_pos = query_offset;
340  s_pos = subject_offset;
341  s_GetSeqPositions(edit_script, i - 1, &q_pos, &s_pos);
342  if (query[q_pos] == UNPACK_BASE(subject, s_pos)) {
343  edit_script->edit_ops[i - 1] = 1;
344  (*score)++;
345  (*num_identical)++;
346  }
347 
348  /* if there a no indels left, search for a new gap */
349  if (i >= 0) {
350  break;
351  }
352  }
353 
354  /* if a mismatch follows an indel, move the mismatch to the
355  left; it will result in a different mismatch or a macth,
356  so alignment score will not decrease */
357  if (edit_script->edit_ops[k] == JUMPER_MISMATCH) {
358  /* shift the mismatch to the left by swapping ops */
359  JumperOpType tmp = edit_script->edit_ops[i];
360  edit_script->edit_ops[i] = edit_script->edit_ops[k];
361  edit_script->edit_ops[k] = tmp;
362  i++;
363 
364  /* check if mismatch did not change to a match and update
365  scores and identity if it did */
366  q_pos = query_offset;
367  s_pos = subject_offset;
368  s_GetSeqPositions(edit_script, i - 1, &q_pos, &s_pos);
369  if (query[q_pos] == UNPACK_BASE(subject, s_pos)) {
370  edit_script->edit_ops[i - 1] = 1;
371  (*score) -= err_score;
372  (*score)++;
373  (*num_identical)++;
374  }
375  }
376 
377  /* if matches follow an indel, check if bases to the right
378  of the indel match bases to the left and move the matches
379  if they do */
380  if (edit_script->edit_ops[k] > 0) {
381 
382  Int4 num_matches = (Int4)edit_script->edit_ops[k];
383  Int4 /*j, */ b = 0;
384 
385  q_pos = query_offset;
386  s_pos = subject_offset;
387  s_GetSeqPositions(edit_script, i, &q_pos, &s_pos);
388 
389  /* find how many bases match */
390  while (b < edit_script->edit_ops[k] &&
391  query[q_pos] == UNPACK_BASE(subject, s_pos)) {
392 
393  b++;
394  q_pos++;
395  s_pos++;
396  ASSERT(q_pos <= query_length);
397  ASSERT(s_pos <= subject_length);
398  }
399 
400  /* if none, look for a new indel */
401  if (b == 0) {
402  break;
403  }
404 
405  /* otherwise shift the gap to the right */
406  ASSERT(i > 0);
407  /* if there are no matches preceding the indel, add a new
408  script element */
409  if (edit_script->edit_ops[i - 1] <= 0) {
410  Int4 j;
411  /* make space */
413  for (j = edit_script->num_ops - 1;j > i;j--) {
414  edit_script->edit_ops[j] =
415  edit_script->edit_ops[j - 1];
416  }
417  k++;
418  i++;
419  /* new matches will be here */
420  edit_script->edit_ops[i - 1] = 0;
421  }
422  /* move matching bases to the left */
423  edit_script->edit_ops[i - 1] += b;
424  edit_script->edit_ops[k] -= b;
425 
426  /* if not all matches were moved, search for a new indel */
427  if (b < num_matches) {
428  break;
429  }
430  else {
431  /* otherwise remove the script element that held the
432  moved matches */
433  Int4 j;
434  ASSERT(edit_script->edit_ops[k] == 0);
435  for (j = k;j < edit_script->num_ops - 1;j++) {
436  edit_script->edit_ops[j] =
437  edit_script->edit_ops[j + 1];
438  }
439  edit_script->num_ops--;
440  ASSERT(edit_script->num_ops > 0);
441  k--;
442  }
443  }
444  }
445 
446  /* move indel pointer to the end of the indel, to find a new one */
447  i = k - 1;
448  }
449  }
450 
451 
452  return 0;
453 }
454 
455 
456 /* Shift gaps to the right and update scores and identities */
457 static int s_ShiftGaps(BlastGapAlignStruct* gap_align,
458  const Uint1* query, const Uint1* subject,
459  Int4 query_length, Int4 subject_length,
460  Int4 err_score, Int4* num_identical)
461 {
462  Int4 i;
463  JumperPrelimEditBlock* left = gap_align->jumper->left_prelim_block;
464  JumperPrelimEditBlock* right = gap_align->jumper->right_prelim_block;
466  right->num_allocated);
467  /* create a single forward edit script from the scripts for the left
468  and right extensions */
469  for (i = left->num_ops - 1;i >=0;i--) {
470  JUMPER_EDIT_BLOCK_ADD(combined, left->edit_ops[i]);
471  }
472 
473  for (i = 0;i < right->num_ops;i++) {
474  JUMPER_EDIT_BLOCK_ADD(combined, right->edit_ops[i]);
475  }
476 
477  for (i = 1;i < combined->num_ops;i++) {
478  if (combined->edit_ops[i - 1] > 0 && combined->edit_ops[i] > 0) {
479  Int4 k;
480  combined->edit_ops[i - 1] += combined->edit_ops[i];
481  for (k = i + 1;k < combined->num_ops;k++) {
482  combined->edit_ops[k - 1] = combined->edit_ops[k];
483  }
484  combined->num_ops--;
485  }
486  }
487 
488  /* shift gaps */
489  s_ShiftGapsRight(combined, query, subject, gap_align->query_start,
490  gap_align->subject_start, query_length,
491  subject_length, &gap_align->score,
492  err_score, num_identical);
493 
494  /* trim flanking gaps */
495  while (combined->num_ops > 0 &&
496  combined->edit_ops[combined->num_ops - 1] < 0) {
497 
498  if (combined->edit_ops[combined->num_ops - 1] == JUMPER_DELETION) {
499  gap_align->subject_stop--;
500  }
501  else {
502  gap_align->query_stop--;
503  }
504  combined->num_ops--;
505  gap_align->score -= err_score;
506  }
507 
508  /* replace edit scripts with the new ones in gap_align */
509  left->num_ops = 0;
511  gap_align->jumper->right_prelim_block = combined;
512 
513  return 0;
514 }
515 
516 
517 /* trimm the extension so that there is at least -mismatch penalty matches
518  after the last mismatch */
519 static void s_TrimExtension(JumperPrelimEditBlock* jops, int margin,
520  const Uint1** cp, Int4* cq, Int4* num_identical,
521  Boolean is_right_ext)
522 {
523 
524  Int4 num_matches = 0;
525  Int4 index = jops->num_ops - 1;
526 
527  if (jops->num_ops == 0 || margin == 0) {
528  return;
529  }
530 
531  /* no trimming if mismatches do not cost anything */
532  if (margin == 0) {
533  return;
534  }
535 
536  /* find number of local matches
537  Consequitive match operations are not guaranteed to be combined in the
538  preliminary block. This is to facilitate faster extensions */
539 
540  while (index >= 1 && jops->edit_ops[index] > 0) {
541  num_matches += jops->edit_ops[index];
542  index--;
543  }
544 
545  /* we do not remove the last op; thanks to word hit there will be enough
546  matches, but extension starting at word hit edge may not have enugh
547  matches */
548  while (jops->num_ops > 1 && num_matches < margin) {
549 
550  JumperOpType op = jops->edit_ops[jops->num_ops - 1];
551  switch (JOP_TO_OP(op)) {
552  case eGapAlignSub:
553  if (op > 0) {
554  (*cp) += (is_right_ext ? -op : op);
555  (*cq) += (is_right_ext ? -op : op);
556  *num_identical -= op;
557  }
558  else if (is_right_ext) {
559  (*cp)--;
560  (*cq)--;
561  }
562  else {
563  (*cp)++;
564  (*cq)++;
565  }
566  break;
567 
568  case eGapAlignIns:
569  if (is_right_ext) {
570  (*cp)--;
571  }
572  else {
573  (*cp)++;
574  }
575  break;
576 
577  case eGapAlignDel:
578  if (is_right_ext) {
579  (*cq)--;
580  }
581  else {
582  (*cq)++;
583  }
584  break;
585 
586  default:
587  ASSERT(0);
588  }
589  jops->num_ops--;
590 
591  if (index >= jops->num_ops) {
592  num_matches = 0;
593  index = jops->num_ops - 1;
594  while (index >= 1 && jops->edit_ops[index] > 0) {
595  num_matches += jops->edit_ops[index];
596  index--;
597  }
598  }
599  }
600  ASSERT(jops->num_ops > 0);
601  /* remove the last op (closest to the word match) only if it is different
602  than match */
603  if (jops->num_ops == 1 && jops->edit_ops[0] <= 0) {
604  jops->num_ops--;
605  }
606  ASSERT(jops->num_ops == 0 || jops->edit_ops[jops->num_ops - 1] > 0);
607 }
608 
609 /* Create BLAST edit script from jumper scripts for left and right extensions */
611  JumperPrelimEditBlock* rev_prelim_block,
612  JumperPrelimEditBlock* fwd_prelim_block)
613 {
614  Int4 size = 0;
615  Int4 i;
616  Int4 index;
617  GapEditScript* retval;
618  EGapAlignOpType last_op;
619 
620  if (rev_prelim_block->num_ops == 0 && fwd_prelim_block->num_ops == 0) {
621  return NULL;
622  }
623 
624  /* find the size of the resulting edit scrtipt */
625  size = 1;
626  last_op = rev_prelim_block->num_ops > 0 ?
627  JOP_TO_OP(rev_prelim_block->edit_ops[rev_prelim_block->num_ops - 1])
628  : JOP_TO_OP(fwd_prelim_block->edit_ops[0]);
629 
630  for (i = rev_prelim_block->num_ops - 2;i >= 0;i--) {
631  if (last_op != JOP_TO_OP(rev_prelim_block->edit_ops[i])) {
632  size++;
633  last_op = JOP_TO_OP(rev_prelim_block->edit_ops[i]);
634  }
635  }
636  for (i = 0;i < fwd_prelim_block->num_ops;i++) {
637  if (last_op != JOP_TO_OP(fwd_prelim_block->edit_ops[i])) {
638  size++;
639  last_op = JOP_TO_OP(fwd_prelim_block->edit_ops[i]);
640  }
641  }
642 
643  retval = GapEditScriptNew(size);
644 
645  /* rev_prelim_block needs to be reveresed, because left extension edit
646  operations are collected backwards */
647  index = 0;
648  if (rev_prelim_block->num_ops > 0) {
649  i = rev_prelim_block->num_ops - 1;
650  retval->op_type[index] = JOP_TO_OP(rev_prelim_block->edit_ops[i]);
651  retval->num[index] = JOP_TO_NUM(rev_prelim_block->edit_ops[i]);
652  last_op = retval->op_type[index];
653  i--;
654  for (;i >= 0;i--) {
655  EGapAlignOpType current_op = JOP_TO_OP(rev_prelim_block->edit_ops[i]);
656  if (current_op == last_op) {
657  retval->num[index] += JOP_TO_NUM(rev_prelim_block->edit_ops[i]);
658  }
659  else {
660  index++;
661  retval->op_type[index] = JOP_TO_OP(rev_prelim_block->edit_ops[i]);
662  retval->num[index] = JOP_TO_NUM(rev_prelim_block->edit_ops[i]);
663  last_op = current_op;
664  }
665  }
666  }
667 
668  i = 0;
669  if (index == 0 && retval->num[index] == 0) {
670  retval->op_type[index] = JOP_TO_OP(fwd_prelim_block->edit_ops[i]);
671  retval->num[index] = JOP_TO_NUM(fwd_prelim_block->edit_ops[i]);
672  last_op = retval->op_type[index];
673  i++;
674  }
675  for (;i < fwd_prelim_block->num_ops;i++) {
676  EGapAlignOpType current_op = JOP_TO_OP(fwd_prelim_block->edit_ops[i]);
677  if (current_op == last_op) {
678  retval->num[index] += JOP_TO_NUM(fwd_prelim_block->edit_ops[i]);
679  }
680  else {
681  index++;
682  retval->op_type[index] = JOP_TO_OP(fwd_prelim_block->edit_ops[i]);
683  retval->num[index] = JOP_TO_NUM(fwd_prelim_block->edit_ops[i]);
684  last_op = current_op;
685  }
686  }
687 
688  return retval;
689 }
690 
691 
692 /* Compute a score for an extension */
694  Int4 match_score, Int4 mismatch_score,
695  Int4 gap_open_score, Int4 gap_extend_score)
696 {
697  Int4 score = 0;
698  Int4 i;
699 
700  for (i = 0;i < edit_script->num_ops;i++) {
701  Int4 op = edit_script->edit_ops[i];
702 
703 /* indel penalty is the same as mismatch penalty
704  if (op < 0) {
705  continue;
706  }
707 */
708 
709  if (op > 0) {
710  score += op * match_score;
711  }
712  else if (op == 0) {
713  score += mismatch_score;
714  }
715  else {
716  score += gap_open_score;
717  while (i < edit_script->num_ops && edit_script->edit_ops[i] == op) {
718  score += gap_extend_score;
719  i++;
720  }
721  i--;
722  }
723  }
724 
725  return score;
726 }
727 
728 
729 
730 /* query is one base per byte, subject is 2NA, we assume that initial seed is
731  at byte edge */
732 
733 
735  int query_length, int subject_length,
736  Int4 match_score, Int4 mismatch_score,
737  Int4 gap_open_score, Int4 gap_extend_score,
738  int max_mismatches, int window, Uint4* table,
739  Int4* query_ext_len, Int4* subject_ext_len,
740  Int4* num_identical,
741  Int4* ungapped_ext_len)
742 {
743  const Uint1 *cp, *cp1, *cpmax, *cpmax4, *cpstop = NULL;
744  Int4 cq, cq1, cqmax, cqmax4, cqstop = 0;
745  int i, n;
746  JUMP *jp;
747  int num_mismatches = 0;
748  int new_matches = 0;
749  Uint4 trace = 0;
750  Uint4 trace_mask = (1 << max_mismatches) - 1;
751  Int4 score = 0, best_score = 0;
752  Boolean is_ungapped = TRUE;
753  JUMP* jumper = jumper_default;
754 
755  if (!query || ! subject) {
756  return -1;
757  }
758 
759  cp = query;
760  cpmax = cp + query_length;
761 
762  /* or assume matches up to byte edge */
763  cq = 0;
764  cqmax = subject_length;
765 
766  cpmax4 = cpmax - 4;
767  cqmax4 = cqmax - 4;
768 
769  /* skip the first pair as it is compared by the left extension */
770  cp++;
771  cq++;
772 
773  while (cp < cpmax && cq < cqmax && num_mismatches < max_mismatches) {
774 
775  if (!(cq & 3) && cp < cpmax4 && cq < cqmax4) {
776  if (table[subject[cq / 4]] == *(Uint4*)(cp)) {
777  cp += 4;
778  cq += 4;
779  new_matches += 4;
780  continue;
781  }
782  }
783 
784  if (*cp == UNPACK_BASE(subject, cq)) {
785  cp++;
786  cq++;
787  new_matches++;
788  continue;
789  }
790 
791  jp = jumper;
792  jp-- ;
793  while (jp++) { /* 1, 1, 0, 0 = last always accepted */
794 
795  if (!jp->lng) {
796  break;
797  }
798 
799  n = 0;
800  i = jp->ok;
801  cp1 = cp + jp->dcp;
802  cq1 = cq + jp->dcq;
803  while (i--) {
804  if (cp1 >= cpmax || cq1 >= cqmax
805  || *cp1++ != UNPACK_BASE(subject, cq1)) {
806 
807  goto next_jp;
808  }
809  cq1++;
810  }
811 
812  n = 0;
813  i = jp->lng;
814  cp1 = cp + jp->dcp;
815  cq1 = cq + jp->dcq;
816  if (i + cp1 >= cpmax || i + cq1 >= cqmax) {
817  continue;
818  }
819  while (i--) {
820  if (cp1 >= cpmax || cq1 >= cqmax) {
821  goto next_jp;
822  }
823  if (*cp1++ != UNPACK_BASE(subject, cq1)) {
824  if (++n > jp->ok) {
825  goto next_jp;
826  }
827  }
828  cq1++;
829  }
830  /* current jumper row works */
831  break;
832 
833 next_jp:
834  continue;
835  }
836 
837  if (new_matches) {
838  if (trace) {
839  if (new_matches < window) {
840  trace <<= new_matches;
841  }
842  else {
843  trace = 0;
844  }
845  }
846  *num_identical += new_matches;
847  score += new_matches * match_score;
848  new_matches = 0;
849  }
850 
851  /* update recent errors and score */
852  if (jp->dcp == jp->dcq) {
853  score += mismatch_score * jp->dcp;
854  if (trace & trace_mask) {
855  num_mismatches += jp->dcp;
856  trace <<= jp->dcp;
857  trace |= (1 << jp->dcp) - 1;
858  }
859  else {
860  num_mismatches = jp->dcp;
861  trace = (1 << jp->dcp) - 1;
862  }
863  }
864  /* note that we are not penalizing gaps */
865 
866  /* save the length of the ungapped extension */
867  if (is_ungapped && jp->dcp != jp->dcq) {
868  *ungapped_ext_len = (Int4)(cp - query - 1);
869  is_ungapped = FALSE;
870  }
871 
872  /* update cp and cq */
873  cp += jp->dcp;
874  cq += jp->dcq;
875 
876  /* we have already checked that these are matches */
877  if (jp->ok == 0 && jp->lng) {
878  cp += jp->lng;
879  cq += jp->lng;
880  trace <<= jp->lng;
881  *num_identical += jp->lng;
882  score += jp->lng * match_score;
883  }
884 
885  /* update optimal alignment extent */
886  if (score >= best_score) {
887  cpstop = cp;
888  cqstop = cq;
889  best_score = score;
890  }
891  }
892 
893  /* if matches all the way to the end of a sequence */
894  if (new_matches) {
895  *num_identical += new_matches;
896  score += new_matches * match_score;
897  if (score >= best_score) {
898  cpstop = cp;
899  cqstop = cq;
900  }
901  }
902 
903  *query_ext_len = (Int4)(cpstop - query);
904  *subject_ext_len = cqstop;
905 
906  if (is_ungapped) {
907  *ungapped_ext_len = *query_ext_len;
908  }
909 
910  return best_score;
911 } /* JumperExtendRightCompressed */
912 
913 
914 
916  const Uint1* query, const Uint1* subject,
917  int query_length, int subject_length,
918  Int4 match_score, Int4 mismatch_score,
919  Int4 gap_open_score, Int4 gap_extend_score,
920  int max_mismatches, int window, Uint4* table,
921  Int4* query_ext_len, Int4* subject_ext_len,
922  JumperPrelimEditBlock* edit_script,
923  Int4* num_identical,
924  Boolean left_extension,
925  Int4* ungapped_ext_len,
926  JUMP* jumper)
927 {
928  const Uint1 *cp, *cp1, *cpmax, *cpmax4;
929  Int4 cq, cq1, cqmax, cqmax4;
930  int i, n;
931  JUMP *jp;
932  int num_mismatches = 0;
933  int new_matches = 0;
934  Uint4 trace = 0;
935  Uint4 trace_mask = (1 << max_mismatches) - 1;
936  Boolean is_ungapped = TRUE;
937 
938  if (!query || ! subject) {
939  return -1;
940  }
941 
942  cp = query;
943  cpmax = cp + query_length;
944 
945  /* or assume matches up to byte edge */
946  cq = 0;
947  cqmax = subject_length;
948 
949  cpmax4 = cpmax - 4;
950  cqmax4 = cqmax - 4;
951 
952  /* skip the first pair as it is compared by the left extension */
953  cp++;
954  cq++;
955  if (!left_extension) {
956  new_matches++;
957  }
958 
959  while (cp < cpmax && cq < cqmax && num_mismatches < max_mismatches) {
960 
961  if (!(cq & 3) && cp < cpmax4 && cq < cqmax4) {
962  if (table[subject[cq / 4]] == *(Uint4*)(cp)) {
963  cp += 4;
964  cq += 4;
965  new_matches += 4;
966  continue;
967  }
968  }
969 
970  if (*cp == UNPACK_BASE(subject, cq)) {
971  cp++;
972  cq++;
973  new_matches++;
974  continue;
975  }
976 
977  jp = jumper;
978  jp-- ;
979  while (jp++) { /* 1, 1, 0, 0 = last always accepted */
980 
981  if (!jp->lng) {
982  break;
983  }
984 
985  n = 0;
986  i = jp->ok;
987  cp1 = cp + jp->dcp;
988  cq1 = cq + jp->dcq;
989  while (i--) {
990  if (cp1 >= cpmax || cq1 >= cqmax
991  || *cp1++ != UNPACK_BASE(subject, cq1)) {
992 
993  goto next_jp;
994  }
995  cq1++;
996  }
997 
998  n = 0;
999  i = jp->lng;
1000  cp1 = cp + jp->dcp;
1001  cq1 = cq + jp->dcq;
1002  if (i + cp1 >= cpmax || i + cq1 >= cqmax) {
1003  continue;
1004  }
1005  while (i--) {
1006  if (cp1 >= cpmax || cq1 >= cqmax) {
1007  goto next_jp;
1008  }
1009  if (*cp1++ != UNPACK_BASE(subject, cq1)) {
1010  if (++n > jp->ok) {
1011  goto next_jp;
1012  }
1013  }
1014  cq1++;
1015  }
1016  /* current jumper row works */
1017  break;
1018 
1019 next_jp:
1020  continue;
1021  }
1022 
1023  if (new_matches) {
1024  ASSERT(edit_script->num_ops < edit_script->num_allocated);
1025  JUMPER_EDIT_BLOCK_ADD(edit_script, new_matches);
1026  if (trace) {
1027  if (new_matches < window) {
1028  trace <<= new_matches;
1029  }
1030  else {
1031  trace = 0;
1032  }
1033  }
1034  *num_identical += new_matches;
1035  new_matches = 0;
1036  }
1037 
1038  /* update recent errors */
1039  if (jp->dcp == jp->dcq) {
1040  if (trace & trace_mask) {
1041  num_mismatches += jp->dcp;
1042  trace <<= jp->dcp;
1043  trace |= (1 << jp->dcp) - 1;
1044  }
1045  else {
1046  num_mismatches = jp->dcp;
1047  trace = (1 << jp->dcp) - 1;
1048  }
1049  for (i = 0;i < jp->dcp;i++) {
1050  ASSERT(edit_script->num_ops < edit_script->num_allocated);
1051  JUMPER_EDIT_BLOCK_ADD(edit_script, JUMPER_MISMATCH);
1052  }
1053  } else if (jp->dcp > jp->dcq) {
1054  for (i = 0;i < jp->dcp - jp->dcq;i++) {
1055  ASSERT(edit_script->num_ops < edit_script->num_allocated);
1057  }
1058  if (jp->dcq) {
1059  /* either jp->dcp r jp->dcq must be zero or jp->dcp == jp->dcq
1060  otherwise number of mismatches must be updated here */
1061  ASSERT(0);
1062  }
1063  }
1064  else {
1065  for (i = 0;i < jp->dcq - jp->dcp;i++) {
1066  ASSERT(edit_script->num_ops < edit_script->num_allocated);
1067  JUMPER_EDIT_BLOCK_ADD(edit_script, JUMPER_DELETION);
1068  }
1069  if (jp->dcp) {
1070  /* either jp->dcp r jp->dcq must be zero or jp->dcp == jp->dcq
1071  otherwise number of mismatches must be updated here */
1072  ASSERT(0);
1073  }
1074  }
1075 
1076  /* save the length of the ungapped extension */
1077  if (is_ungapped && jp->dcp != jp->dcq) {
1078  *ungapped_ext_len = (Int4)(cp - query - 1);
1079  is_ungapped = FALSE;
1080  }
1081 
1082  /* update cp and cq */
1083  cp += jp->dcp;
1084  cq += jp->dcq;
1085 
1086  /* we have already checked that these are matches */
1087  if (jp->ok == 0 && jp->lng) {
1088  cp += jp->lng;
1089  cq += jp->lng;
1090  ASSERT(edit_script->num_ops < edit_script->num_allocated);
1091  JUMPER_EDIT_BLOCK_ADD(edit_script, jp->lng);
1092  trace <<= jp->lng;
1093  *num_identical += jp->lng;
1094  /* reset number of mismatches (we assume that error window is
1095  smaller than jp->lng) */
1096  num_mismatches = 0;
1097  }
1098  }
1099 
1100  /* if matches all the way to the end of a sequence */
1101  if (new_matches) {
1102  ASSERT(edit_script->num_ops < edit_script->num_allocated);
1103  JUMPER_EDIT_BLOCK_ADD(edit_script, new_matches);
1104  *num_identical += new_matches;
1105  }
1106 
1107  /* find optimal length of the extension */
1108  s_TrimExtension(edit_script, -mismatch_score, &cp, &cq, num_identical,
1109  TRUE);
1110 
1111  *query_ext_len = (Int4)(cp - query);
1112  *subject_ext_len = cq;
1113 
1114  if (is_ungapped) {
1115  *ungapped_ext_len = *query_ext_len;
1116  }
1117 
1118  /* return extension score */
1119  return s_ComputeExtensionScore(edit_script, match_score, mismatch_score,
1120  gap_open_score, gap_extend_score);
1121 } /* JumperExtendRightCompressedWithTraceback */
1122 
1123 
1125  const Uint1* query, const Uint1* subject,
1126  int query_length, int subject_length,
1127  Int4 match_score, Int4 mismatch_score,
1128  Int4 gap_open_score, Int4 gap_extend_score,
1129  int max_mismatches, int window,
1130  Int4 x_drop, Uint4* table,
1131  Int4* query_ext_len, Int4* subject_ext_len,
1132  JumperPrelimEditBlock* edit_script,
1133  Int4* best_num_identical,
1134  Boolean left_extension,
1135  Int4* ungapped_ext_len)
1136 {
1137  const Uint1 *cp, *cp1, *cpmax, *cpmax4, *cpstop = NULL;
1138  Int4 cq, cq1, cqmax, cqmax4, cqstop = 0;
1139  int i, n;
1140  JUMP *jp;
1141  int num_mismatches = 0;
1142  int new_matches = 0;
1143  Uint4 trace = 0;
1144  Uint4 trace_mask = (1 << max_mismatches) - 1;
1145  Boolean is_ungapped = TRUE;
1146  Int4 score = 0, best_score = 0;
1147  Int4 num_ops = 0, num_identical = *best_num_identical;
1148  /* 0 - no gap, JUMPER_INSERTION, JUMPER_DELETION for open insertion or
1149  deletion */
1150  Int4 last_gap_open = 0;
1151  JUMP* jumper = jumper_default;
1152 
1153  if (!query || ! subject) {
1154  return -1;
1155  }
1156 
1157  cp = query;
1158  cpmax = cp + query_length;
1159  cp1 = cpmax;
1160 
1161  /* or assume matches up to byte edge */
1162  cq = 0;
1163  cqmax = subject_length;
1164 
1165  cpmax4 = cpmax - 4;
1166  cqmax4 = cqmax - 4;
1167 
1168  /* skip the first pair as it is compared by the left extension */
1169  cp++;
1170  cq++;
1171  if (!left_extension) {
1172  new_matches++;
1173  }
1174 
1175  while (cp < cpmax && cq < cqmax && num_mismatches < max_mismatches) {
1176 
1177  if (!(cq & 3) && cp < cpmax4 && cq < cqmax4) {
1178  if (table[subject[cq / 4]] == *(Uint4*)(cp)) {
1179  cp += 4;
1180  cq += 4;
1181  new_matches += 4;
1182  continue;
1183  }
1184  }
1185 
1186  if (*cp == UNPACK_BASE(subject, cq)) {
1187  cp++;
1188  cq++;
1189  new_matches++;
1190  continue;
1191  }
1192 
1193  jp = jumper;
1194  jp-- ;
1195  while (jp++) { /* 1, 1, 0, 0 = last always accepted */
1196 
1197  if (!jp->lng) {
1198  break;
1199  }
1200 
1201  n = 0;
1202  i = jp->ok;
1203  cp1 = cp + jp->dcp;
1204  cq1 = cq + jp->dcq;
1205  while (i--) {
1206  if (cp1 >= cpmax || cq1 >= cqmax
1207  || *cp1++ != UNPACK_BASE(subject, cq1)) {
1208 
1209  goto next_jp;
1210  }
1211  cq1++;
1212  }
1213 
1214  n = 0;
1215  i = jp->lng;
1216  cp1 = cp + jp->dcp;
1217  cq1 = cq + jp->dcq;
1218 
1219  if (cp1 >= cpmax || i + cq1 >= cqmax) {
1220  continue;
1221  }
1222  while (i--) {
1223  if (cp1 >= cpmax) {
1224  break;
1225  }
1226  if (/*cp1 >= cpmax ||*/ cq1 >= cqmax) {
1227  goto next_jp;
1228  }
1229  if (*cp1++ != UNPACK_BASE(subject, cq1)) {
1230  if (++n > jp->ok) {
1231  goto next_jp;
1232  }
1233  }
1234  cq1++;
1235  }
1236  /* current jumper row works */
1237  break;
1238 
1239 next_jp:
1240  continue;
1241  }
1242 
1243  if (new_matches) {
1244  ASSERT(edit_script->num_ops < edit_script->num_allocated);
1245  JUMPER_EDIT_BLOCK_ADD(edit_script, new_matches);
1246  if (trace) {
1247  if (new_matches < window) {
1248  trace <<= new_matches;
1249  }
1250  else {
1251  trace = 0;
1252  }
1253  }
1254  num_identical += new_matches;
1255  score += new_matches * match_score;
1256  new_matches = 0;
1257  last_gap_open = 0;
1258  }
1259 
1260  /* update optimal alignment extent */
1261  if (score >= best_score) {
1262  cpstop = cp;
1263  cqstop = cq;
1264  num_ops = edit_script->num_ops;
1265  best_score = score;
1266  *best_num_identical = num_identical;
1267  }
1268 
1269  if (best_score - score > x_drop) {
1270  break;
1271  }
1272 
1273  /* update recent errors */
1274  if (jp->dcp == jp->dcq) {
1275  score += jp->dcp * mismatch_score;
1276  if (trace & trace_mask) {
1277  num_mismatches += jp->dcp;
1278  trace <<= jp->dcp;
1279  trace |= (1 << jp->dcp) - 1;
1280  }
1281  else {
1282  num_mismatches = jp->dcp;
1283  trace = (1 << jp->dcp) - 1;
1284  }
1285  for (i = 0;i < jp->dcp;i++) {
1286  ASSERT(edit_script->num_ops < edit_script->num_allocated);
1287  JUMPER_EDIT_BLOCK_ADD(edit_script, JUMPER_MISMATCH);
1288  }
1289  } else if (jp->dcp > jp->dcq) {
1290  for (i = 0;i < jp->dcp - jp->dcq;i++) {
1291  ASSERT(edit_script->num_ops < edit_script->num_allocated);
1293  score += gap_extend_score;
1294  }
1295  if (last_gap_open != JUMPER_INSERTION) {
1296  score += gap_open_score;
1297  }
1298  last_gap_open = JUMPER_INSERTION;
1299  if (jp->dcq) {
1300  /* either jp->dcp r jp->dcq must be zero or jp->dcp == jp->dcq
1301  otherwise number of mismatches must be updated here */
1302  ASSERT(0);
1303  }
1304  }
1305  else {
1306  for (i = 0;i < jp->dcq - jp->dcp;i++) {
1307  ASSERT(edit_script->num_ops < edit_script->num_allocated);
1308  JUMPER_EDIT_BLOCK_ADD(edit_script, JUMPER_DELETION);
1309  score += gap_extend_score;
1310  }
1311  if (last_gap_open != JUMPER_DELETION) {
1312  score += gap_open_score;
1313  }
1314  last_gap_open = JUMPER_DELETION;
1315  if (jp->dcp) {
1316  /* either jp->dcp r jp->dcq must be zero or jp->dcp == jp->dcq
1317  otherwise number of mismatches must be updated here */
1318  ASSERT(0);
1319  }
1320  }
1321 
1322  /* save the length of the ungapped extension */
1323  if (is_ungapped && jp->dcp != jp->dcq) {
1324  *ungapped_ext_len = (Int4)(cp - query - 1);
1325  is_ungapped = FALSE;
1326  }
1327 
1328  /* update cp and cq */
1329  cp += jp->dcp;
1330  cq += jp->dcq;
1331 
1332  /* we have already checked that these are matches */
1333  if (cp1 < cpmax && jp->ok == 0 && jp->lng) {
1334  cp += jp->lng;
1335  cq += jp->lng;
1336  ASSERT(edit_script->num_ops < edit_script->num_allocated);
1337  JUMPER_EDIT_BLOCK_ADD(edit_script, jp->lng);
1338  trace <<= jp->lng;
1339  num_identical += jp->lng;
1340  score += jp->lng * match_score;
1341  last_gap_open = 0;
1342  }
1343 
1344  /* update optimal alignment extent */
1345  if (score >= best_score) {
1346  cpstop = cp;
1347  cqstop = cq;
1348  num_ops = edit_script->num_ops;
1349  best_score = score;
1350  *best_num_identical = num_identical;
1351  }
1352  }
1353 
1354  /* if matches all the way to the end of a sequence */
1355  if (new_matches) {
1356  ASSERT(edit_script->num_ops < edit_script->num_allocated);
1357  JUMPER_EDIT_BLOCK_ADD(edit_script, new_matches);
1358  num_identical += new_matches;
1359  score += new_matches;
1360  }
1361 
1362  /* update optimal alignment extent */
1363  if (score >= best_score) {
1364  cpstop = cp;
1365  cqstop = cq;
1366  num_ops = edit_script->num_ops;
1367  best_score = score;
1368  *best_num_identical = num_identical;
1369  }
1370 
1371  *query_ext_len = (Int4)(cpstop - query);
1372  *subject_ext_len = cqstop;
1373  edit_script->num_ops = num_ops;
1374 
1375  if (is_ungapped) {
1376  *ungapped_ext_len = *query_ext_len;
1377  }
1378 
1379  return best_score;
1380 } /* JumperExtendRightCompressedWithTracebackOptimal */
1381 
1382 
1383 
1385  int query_length, int subject_length,
1386  Int4 match_score, Int4 mismatch_score,
1387  Int4 gap_open_score, Int4 gap_extend_score,
1388  int max_mismatches, int window,
1389  int* query_ext_len, int* subject_ext_len,
1390  GapPrelimEditBlock* edit_script,
1391  Boolean left_extension)
1392 {
1393  const Uint1 *cp, *cp1, *cpmax;
1394  Int4 cq, cq1, cqmax;
1395  int i, n;
1396  JUMP *jp;
1397  int score = 0, num_mismatches = 0;
1398  int new_matches = 0;
1399  Uint4 trace = 0;
1400  Uint4 trace_mask = (1 << max_mismatches) - 1;
1401  JUMP* jumper = jumper_default;
1402 
1403  if (!query || ! subject || !edit_script) {
1404  return -1;
1405  }
1406 
1407  cp = query;
1408  cpmax = cp + query_length;
1409 
1410  /* or assume matches up to byte edge */
1411  cq = 0;
1412  cqmax = subject_length;
1413 
1414  /* skip the first pair as it is compared by the left extension */
1415  cp++;
1416  cq++;
1417  if (!left_extension) {
1418  new_matches++;
1419  }
1420 
1421  while (cp < cpmax && cq < cqmax && num_mismatches < max_mismatches) {
1422 
1423  if (*cp == subject[cq]) {
1424  score += match_score;
1425  cp++;
1426  cq++;
1427  new_matches++;
1428  continue;
1429  }
1430 
1431  /* handle ambiguous bases */
1432 
1433 /* this is only for 454
1434  jp = (cqmax - cq > 80) ? jumper : jumper_edge;
1435 */
1436  jp = jumper;
1437  jp-- ;
1438  while (jp++) { /* 1, 1, 0, 0 = last always accepted */
1439 
1440  if (!jp->lng) {
1441  break;
1442  }
1443 
1444  n = 0;
1445  i = jp->ok;
1446  cp1 = cp + jp->dcp;
1447  cq1 = cq + jp->dcq;
1448  while (i--) {
1449  if (cp1 >= cpmax || cq1 >= cqmax
1450  || *cp1++ != subject[cq1]) {
1451 
1452  goto next_jp;
1453  }
1454  cq1++;
1455  }
1456 
1457  n = 0;
1458  i = jp->lng;
1459  cp1 = cp + jp->dcp;
1460  cq1 = cq + jp->dcq;
1461  if (i + cp1 >= cpmax || i + cq1 >= cqmax) {
1462  continue;
1463  }
1464  while (i--) {
1465  if (cp1 >= cpmax || cq1 >= cqmax) {
1466  goto next_jp;
1467  }
1468  if (*cp1++ != subject[cq1]) {
1469  if (++n > jp->ok) {
1470  goto next_jp;
1471  }
1472  }
1473  cq1++;
1474  }
1475  /* current jumper row works */
1476  break;
1477 
1478 next_jp:
1479  continue;
1480  }
1481 
1482  if (new_matches) {
1483  GapPrelimEditBlockAdd(edit_script, eGapAlignSub, new_matches);
1484  if (trace) {
1485  if (new_matches < window) {
1486  trace <<= new_matches;
1487  }
1488  else {
1489  trace = 0;
1490  }
1491  }
1492  new_matches = 0;
1493  }
1494 
1495 /* another stop condition, ensures a positive score *--/
1496  if (!jp->lng && num_mismatches >= score + mismatch_score) {
1497  break;
1498  }
1499 */
1500 
1501  /* update score */
1502  if (jp->dcp == jp->dcq) {
1503  score += mismatch_score * jp->dcp;
1504  if (trace & trace_mask) {
1505  num_mismatches += jp->dcp;
1506  trace <<= jp->dcp;
1507  trace |= (1 << jp->dcp) - 1;
1508  }
1509  else {
1510  num_mismatches = jp->dcp;
1511  trace = (1 << jp->dcp) - 1;
1512  }
1513  GapPrelimEditBlockAdd(edit_script, eGapAlignSub, jp->dcp);
1514  } else if (jp->dcp > jp->dcq) {
1515  score += gap_open_score + gap_extend_score * (jp->dcp - jp->dcq);
1516  GapPrelimEditBlockAdd(edit_script, eGapAlignIns, jp->dcp - jp->dcq);
1517  ASSERT(!jp->dcq);
1518  }
1519  else {
1520  score += gap_open_score + gap_extend_score * (jp->dcq - jp->dcp);
1521  GapPrelimEditBlockAdd(edit_script, eGapAlignDel, jp->dcq - jp->dcp);
1522  ASSERT(!jp->dcp);
1523  }
1524 
1525  /* update cp and cq */
1526  cp += jp->dcp;
1527  cq += jp->dcq;
1528 
1529  /* we have already checked that these are matches */
1530  if (jp->ok == 0 && jp->lng) {
1531  score += match_score * jp->lng;
1532  cp += jp->lng;
1533  cq += jp->lng;
1534 
1535  GapPrelimEditBlockAdd(edit_script, eGapAlignSub, jp->lng);
1536  trace <<= jp->lng;
1537  }
1538  }
1539 
1540  /* if matches all the way to the end of a sequence */
1541  if (new_matches) {
1542  GapPrelimEditBlockAdd(edit_script, eGapAlignSub, new_matches);
1543  }
1544 
1545  *query_ext_len = (Int4)(cp - query);
1546  *subject_ext_len = cq;
1547 
1548  return score;
1549 } /* JumperExtendRight */
1550 
1551 
1553  const Uint1* query, const Uint1* subject,
1554  int query_length, int subject_length,
1555  Int4 match_score, Int4 mismatch_score,
1556  Int4 gap_open_score, Int4 gap_extend_score,
1557  int max_mismatches, int window,
1558  Int4* query_ext_len, Int4* subject_ext_len,
1559  JumperPrelimEditBlock* edit_script,
1560  Int4* num_identical,
1561  Boolean left_extension,
1562  Int4* ungapped_ext_len,
1563  JUMP* jumper)
1564 {
1565  const Uint1 *cp, *cp1, *cpmax;
1566  Int4 cq, cq1, cqmax;
1567  int i, n;
1568  JUMP *jp;
1569  int num_mismatches = 0;
1570  int new_matches = 0;
1571  Uint4 trace = 0;
1572  Uint4 trace_mask = (1 << max_mismatches) - 1;
1573  Boolean is_ungapped = TRUE;
1574 
1575  if (!query || ! subject) {
1576  return -1;
1577  }
1578 
1579  cp = query;
1580  cpmax = cp + query_length;
1581 
1582  /* or assume matches up to byte edge */
1583  cq = 0;
1584  cqmax = subject_length;
1585 
1586  /* skip the first pair as it is compared by the left extension */
1587  if (left_extension) {
1588  cp++;
1589  cq++;
1590  }
1591 
1592  while (cp < cpmax && cq < cqmax && num_mismatches < max_mismatches) {
1593 
1594  if (*cp == subject[cq]) {
1595  cp++;
1596  cq++;
1597  new_matches++;
1598  continue;
1599  }
1600 
1601  jp = jumper;
1602  jp-- ;
1603  while (jp++) { /* 1, 1, 0, 0 = last always accepted */
1604 
1605  if (!jp->lng) {
1606  break;
1607  }
1608 
1609  n = 0;
1610  i = jp->ok;
1611  cp1 = cp + jp->dcp;
1612  cq1 = cq + jp->dcq;
1613  while (i--) {
1614  if (cp1 >= cpmax || cq1 >= cqmax
1615  || *cp1++ != subject[cq1]) {
1616 
1617  goto next_jp;
1618  }
1619  cq1++;
1620  }
1621 
1622  n = 0;
1623  i = jp->lng;
1624  cp1 = cp + jp->dcp;
1625  cq1 = cq + jp->dcq;
1626  if (i + cp1 >= cpmax || i + cq1 >= cqmax) {
1627  continue;
1628  }
1629  while (i--) {
1630  if (cp1 >= cpmax || cq1 >= cqmax) {
1631  goto next_jp;
1632  }
1633  if (*cp1++ != subject[cq1]) {
1634  if (++n > jp->ok) {
1635  goto next_jp;
1636  }
1637  }
1638  cq1++;
1639  }
1640  /* current jumper row works */
1641  break;
1642 
1643 next_jp:
1644  continue;
1645  }
1646 
1647  if (new_matches) {
1648  ASSERT(edit_script->num_ops < edit_script->num_allocated);
1649  JUMPER_EDIT_BLOCK_ADD(edit_script, new_matches);
1650  if (trace) {
1651  if (new_matches < window) {
1652  trace <<= new_matches;
1653  }
1654  else {
1655  trace = 0;
1656  }
1657  }
1658  *num_identical += new_matches;
1659  new_matches = 0;
1660  }
1661 
1662  /* update recent errors */
1663  if (jp->dcp == jp->dcq) {
1664  if (trace & trace_mask) {
1665  num_mismatches += jp->dcp;
1666  trace <<= jp->dcp;
1667  trace |= (1 << jp->dcp) - 1;
1668  }
1669  else {
1670  num_mismatches = jp->dcp;
1671  trace = (1 << jp->dcp) - 1;
1672  }
1673  for (i = 0;i < jp->dcp;i++) {
1674  ASSERT(edit_script->num_ops < edit_script->num_allocated);
1675  JUMPER_EDIT_BLOCK_ADD(edit_script, JUMPER_MISMATCH);
1676  }
1677  } else if (jp->dcp > jp->dcq) {
1678  for (i = 0;i < jp->dcp - jp->dcq;i++) {
1679  ASSERT(edit_script->num_ops < edit_script->num_allocated);
1681  }
1682  if (jp->dcq) {
1683  /* either jp->dcp r jp->dcq must be zero or jp->dcp == jp->dcq
1684  otherwise number of mismatches must be updated here */
1685  ASSERT(0);
1686  }
1687  }
1688  else {
1689  for (i = 0;i < jp->dcq - jp->dcp;i++) {
1690  ASSERT(edit_script->num_ops < edit_script->num_allocated);
1691  JUMPER_EDIT_BLOCK_ADD(edit_script, JUMPER_DELETION);
1692  }
1693  if (jp->dcp) {
1694  /* either jp->dcp r jp->dcq must be zero or jp->dcp == jp->dcq
1695  otherwise number of mismatches must be updated here */
1696  ASSERT(0);
1697  }
1698  }
1699 
1700  /* save the length of the ungapped extension */
1701  if (is_ungapped && jp->dcp != jp->dcq) {
1702  *ungapped_ext_len = (Int4)(cp - query - 1);
1703  is_ungapped = FALSE;
1704  }
1705 
1706  /* update cp and cq */
1707  cp += jp->dcp;
1708  cq += jp->dcq;
1709 
1710  /* we have already checked that these are matches */
1711  if (jp->ok == 0 && jp->lng) {
1712  cp += jp->lng;
1713  cq += jp->lng;
1714  ASSERT(edit_script->num_ops < edit_script->num_allocated);
1715  JUMPER_EDIT_BLOCK_ADD(edit_script, jp->lng);
1716  trace <<= jp->lng;
1717  *num_identical += jp->lng;
1718  /* reset number of mismatches (we assume that error window is
1719  smaller than jp->lng) */
1720  num_mismatches = 0;
1721  }
1722  }
1723 
1724  /* if matches all the way to the end of a sequence */
1725  if (new_matches) {
1726  ASSERT(edit_script->num_ops < edit_script->num_allocated);
1727  JUMPER_EDIT_BLOCK_ADD(edit_script, new_matches);
1728  *num_identical += new_matches;
1729  }
1730 
1731  /* find optimal length of the extension */
1732  s_TrimExtension(edit_script, -mismatch_score, &cp, &cq, num_identical,
1733  TRUE);
1734 
1735  *query_ext_len = (Int4)(cp - query);
1736  *subject_ext_len = cq;
1737 
1738  if (is_ungapped) {
1739  *ungapped_ext_len = *query_ext_len;
1740  }
1741 
1742  /* return extension score */
1743  return s_ComputeExtensionScore(edit_script, match_score, mismatch_score,
1744  gap_open_score, gap_extend_score);
1745 } /* JumperExtendRightWithTraceback */
1746 
1747 
1748 
1750  Int4 query_offset, Int4 subject_offset,
1751  Int4 match_score, Int4 mismatch_score,
1752  Int4 gap_open_score, Int4 gap_extend_score,
1753  int max_mismatches, int window, Uint4* table,
1754  Int4* query_ext_len, Int4* subject_ext_len,
1755  Int4* num_identical)
1756 {
1757  const Uint1 *cp, *cp1, *cpmin, *cpmin4, *cpstop = NULL;
1758  Int4 cq, cq1, cqmin, cqmin4, cqstop = 0;
1759  int i, n;
1760  JUMP *jp;
1761  int num_mismatches = 0;
1762  int new_matches = 0;
1763  Uint4 trace = 0;
1764  Uint4 trace_mask = (1 << max_mismatches) - 1;
1765  Int4 score = 0, best_score = 0;
1766  JUMP* jumper = jumper_default;
1767 
1768  if (!query || ! subject) {
1769  return -1;
1770  }
1771 
1772  cp = query + query_offset;
1773  cpmin = query;
1774 
1775  /* or assume matches up to byte edge */
1776  cq = subject_offset;
1777  cqmin = 0;
1778 
1779  cpmin4 = query + 4;
1780  cqmin4 = 4;
1781 
1782 
1783  while (cp >= cpmin && cq >= cqmin && num_mismatches < max_mismatches) {
1784 
1785  if ((cq & 3) == 3 && cp >= cpmin4 && cq >= cqmin4) {
1786  if (table[subject[cq / 4]] == *(Uint4*)(cp - 3)) {
1787  cp -= 4;
1788  cq -= 4;
1789  new_matches += 4;
1790  continue;
1791  }
1792  }
1793 
1794  if (*cp == UNPACK_BASE(subject, cq)) {
1795  cp--;
1796  cq--;
1797  new_matches++;
1798  continue;
1799  }
1800 
1801  /* handle ambiguous bases */
1802 
1803 
1804  jp = jumper;
1805  jp-- ;
1806  while (jp++) { /* 1, 1, 0, 0 = last always accepted */
1807 
1808  if (!jp->lng) {
1809  break;
1810  }
1811 
1812  n = 0;
1813  i = jp->ok;
1814  cp1 = cp - jp->dcp;
1815  cq1 = cq - jp->dcq;
1816  while (i--) {
1817  if (cp1 < cpmin || cq1 < cqmin
1818  || *cp1-- != UNPACK_BASE(subject, cq1)) {
1819 
1820  goto next_jp;
1821  }
1822  cq1--;
1823  }
1824 
1825  n = 0;
1826  i = jp->lng;
1827  cp1 = cp - jp->dcp;
1828  cq1 = cq - jp->dcq;
1829  if (cp1 - i < cpmin || cq1 - i < cqmin) {
1830  continue;
1831  }
1832  while (i--) {
1833  if (cp1 < cpmin || cq1 < cqmin) {
1834  goto next_jp;
1835  }
1836  if (*cp1-- != UNPACK_BASE(subject, cq1)) {
1837  if (++n > jp->ok) {
1838  goto next_jp;
1839  }
1840  }
1841  cq1--;
1842  }
1843  /* current jumper row works */
1844  break;
1845 
1846 next_jp:
1847  continue;
1848  }
1849 
1850  if (new_matches) {
1851  if (trace) {
1852  if (new_matches < window) {
1853  trace <<= new_matches;
1854  }
1855  else {
1856  trace = 0;
1857  }
1858  }
1859  *num_identical += new_matches;
1860  score = new_matches * match_score;
1861  new_matches = 0;
1862  }
1863 
1864  /* update recent errors and score */
1865  if (jp->dcp == jp->dcq) {
1866  score += mismatch_score * jp->dcp;
1867  if (trace & trace_mask) {
1868  num_mismatches += jp->dcp;
1869  trace <<= jp->dcp;
1870  trace |= (1 << jp->dcp) - 1;
1871  }
1872  else {
1873  num_mismatches = jp->dcp;
1874  trace = (1 << jp->dcp) - 1;
1875  }
1876  }
1877  /* note that we are not penalizing gaps */
1878 
1879  /* update cp and cq */
1880  cp -= jp->dcp;
1881  cq -= jp->dcq;
1882 
1883  /* we have already checked that these are matches */
1884  if (!jp->ok && jp->lng) {
1885  cp -= jp->lng;
1886  cq -= jp->lng;
1887  trace <<= jp->lng;
1888  *num_identical += jp->lng;
1889  score += jp->lng * match_score;
1890  }
1891 
1892  /* update optimal alignment extent */
1893  if (score >= best_score) {
1894  cpstop = cp;
1895  cqstop = cq;
1896  best_score = score;
1897  }
1898  }
1899 
1900  /* if matches all the way to the end of a sequence */
1901  if (new_matches) {
1902  *num_identical += new_matches;
1903  score += new_matches * match_score;
1904  if (score >= best_score) {
1905  cpstop = cp;
1906  cqstop = cq;
1907  }
1908  }
1909 
1910  *query_ext_len = (Int4)(query + query_offset - cpstop);
1911  *subject_ext_len = subject_offset - cqstop;
1912 
1913  return best_score;
1914 } /* JumperExtendLeftCompressed */
1915 
1916 
1918  const Uint1* query, const Uint1* subject,
1919  Int4 query_offset, Int4 subject_offset,
1920  Int4 match_score, Int4 mismatch_score,
1921  Int4 gap_open_score, Int4 gap_extend_score,
1922  int max_mismatches, int window, Uint4* table,
1923  Int4* query_ext_len, Int4* subject_ext_len,
1924  JumperPrelimEditBlock* edit_script,
1925  Int4* num_identical,
1926  JUMP* jumper)
1927 {
1928  const Uint1 *cp, *cp1, *cpmin, *cpmin4;
1929  Int4 cq, cq1, cqmin, cqmin4;
1930  int i, n;
1931  JUMP *jp;
1932  int num_mismatches = 0;
1933  int new_matches = 0;
1934  Uint4 trace = 0;
1935  Uint4 trace_mask = (1 << max_mismatches) - 1;
1936 
1937  if (!query || ! subject) {
1938  return -1;
1939  }
1940 
1941  cp = query + query_offset;
1942  cpmin = query;
1943 
1944  /* or assume matches up to byte edge */
1945  cq = subject_offset;
1946  cqmin = 0;
1947 
1948  cpmin4 = query + 4;
1949  cqmin4 = 4;
1950 
1951 
1952  while (cp >= cpmin && cq >= cqmin && num_mismatches < max_mismatches) {
1953 
1954  if ((cq & 3) == 3 && cp >= cpmin4 && cq >= cqmin4) {
1955  if (table[subject[cq / 4]] == *(Uint4*)(cp - 3)) {
1956  cp -= 4;
1957  cq -= 4;
1958  new_matches += 4;
1959  continue;
1960  }
1961  }
1962 
1963  if (*cp == UNPACK_BASE(subject, cq)) {
1964  cp--;
1965  cq--;
1966  new_matches++;
1967  continue;
1968  }
1969 
1970  /* handle ambiguous bases */
1971 
1972 
1973  jp = jumper;
1974  jp-- ;
1975  while (jp++) { /* 1, 1, 0, 0 = last always accepted */
1976 
1977  if (!jp->lng) {
1978  break;
1979  }
1980 
1981  n = 0;
1982  i = jp->ok;
1983  cp1 = cp - jp->dcp;
1984  cq1 = cq - jp->dcq;
1985  while (i--) {
1986  if (cp1 < cpmin || cq1 < cqmin
1987  || *cp1-- != UNPACK_BASE(subject, cq1)) {
1988 
1989  goto next_jp;
1990  }
1991  cq1--;
1992  }
1993 
1994  n = 0;
1995  i = jp->lng;
1996  cp1 = cp - jp->dcp;
1997  cq1 = cq - jp->dcq;
1998  if (cp1 - i < cpmin || cq1 - i < cqmin) {
1999  continue;
2000  }
2001  while (i--) {
2002  if (cp1 < cpmin || cq1 < cqmin) {
2003  goto next_jp;
2004  }
2005  if (*cp1-- != UNPACK_BASE(subject, cq1)) {
2006  if (++n > jp->ok) {
2007  goto next_jp;
2008  }
2009  }
2010  cq1--;
2011  }
2012  /* current jumper row works */
2013  break;
2014 
2015 next_jp:
2016  continue;
2017  }
2018 
2019  if (new_matches) {
2020  ASSERT(edit_script->num_ops < edit_script->num_allocated);
2021  JUMPER_EDIT_BLOCK_ADD(edit_script, new_matches);
2022  if (trace) {
2023  if (new_matches < window) {
2024  trace <<= new_matches;
2025  }
2026  else {
2027  trace = 0;
2028  }
2029  }
2030  *num_identical += new_matches;
2031  new_matches = 0;
2032  }
2033 
2034  /* update recent errors */
2035  if (jp->dcp == jp->dcq) {
2036  if (trace & trace_mask) {
2037  num_mismatches += jp->dcp;
2038  trace <<= jp->dcp;
2039  trace |= (1 << jp->dcp) - 1;
2040  }
2041  else {
2042  num_mismatches = jp->dcp;
2043  trace = (1 << jp->dcp) - 1;
2044  }
2045  for (i = 0;i < jp->dcp;i++) {
2046  ASSERT(edit_script->num_ops < edit_script->num_allocated);
2047  JUMPER_EDIT_BLOCK_ADD(edit_script, JUMPER_MISMATCH);
2048  }
2049  } else if (jp->dcp > jp->dcq) {
2050  for (i = 0;i < jp->dcp - jp->dcq;i++) {
2051  ASSERT(edit_script->num_ops < edit_script->num_allocated);
2053  }
2054  if (jp->dcq) {
2055  /* either jp->dcp r jp->dcq must be zero or jp->dcp == jp->dcq
2056  otherwise number of mismatches must be updated here */
2057  ASSERT(0);
2058  }
2059  }
2060  else {
2061  for (i = 0;i < jp->dcq - jp->dcp;i++) {
2062  ASSERT(edit_script->num_ops < edit_script->num_allocated);
2063  JUMPER_EDIT_BLOCK_ADD(edit_script, JUMPER_DELETION);
2064  }
2065  if (jp->dcp) {
2066  /* either jp->dcp r jp->dcq must be zero or jp->dcp == jp->dcq
2067  otherwise number of mismatches must be updated here */
2068  ASSERT(0);
2069  }
2070  }
2071 
2072  /* update cp and cq */
2073  cp -= jp->dcp;
2074  cq -= jp->dcq;
2075 
2076  /* we have already checked that these are matches */
2077  if (!jp->ok && jp->lng) {
2078  cp -= jp->lng;
2079  cq -= jp->lng;
2080  ASSERT(edit_script->num_ops < edit_script->num_allocated);
2081  JUMPER_EDIT_BLOCK_ADD(edit_script, jp->lng);
2082  trace <<= jp->lng;
2083  *num_identical += jp->lng;
2084  /* reset number of mismatches (we assume that error window is
2085  smaller than jp->lng) */
2086  num_mismatches = 0;
2087  }
2088  }
2089 
2090  /* if matches all the way to the end of a sequence */
2091  if (new_matches) {
2092  ASSERT(edit_script->num_ops < edit_script->num_allocated);
2093  JUMPER_EDIT_BLOCK_ADD(edit_script, new_matches);
2094  *num_identical += new_matches;
2095  }
2096 
2097  /* find optimal extension length */
2098  s_TrimExtension(edit_script, -mismatch_score, &cp, &cq, num_identical,
2099  FALSE);
2100 
2101  *query_ext_len = (Int4)(query + query_offset - cp);
2102  *subject_ext_len = subject_offset - cq;
2103 
2104  /* return extension score */
2105  return s_ComputeExtensionScore(edit_script, match_score, mismatch_score,
2106  gap_open_score, gap_extend_score);
2107 } /* JumperExtendLeftCompressedWithTraceback */
2108 
2109 
2111  const Uint1* query, const Uint1* subject,
2112  Int4 query_offset, Int4 subject_offset,
2113  Int4 match_score, Int4 mismatch_score,
2114  Int4 gap_open_score, Int4 gap_extend_score,
2115  int max_mismatches, int window,
2116  Int4 x_drop, Uint4* table,
2117  Int4* query_ext_len, Int4* subject_ext_len,
2118  JumperPrelimEditBlock* edit_script,
2119  Int4* best_num_identical)
2120 {
2121  const Uint1 *cp, *cp1, *cpmin, *cpmin4, *cpstop = NULL;
2122  Int4 cq, cq1, cqmin, cqmin4, cqstop = 0;
2123  int i, n;
2124  JUMP *jp;
2125  int num_mismatches = 0;
2126  int new_matches = 0;
2127  Uint4 trace = 0;
2128  Uint4 trace_mask = (1 << max_mismatches) - 1;
2129  Int4 score = 0, best_score = 0;
2130  Int4 num_ops = 0, num_identical = *best_num_identical;
2131  /* 0 - no gap, JUMPER_INSERTION, JUMPER_DELETION for open insertion or
2132  deletion */
2133  Int4 last_gap_open = 0;
2134  JUMP* jumper = jumper_default;
2135 
2136  if (!query || ! subject) {
2137  return -1;
2138  }
2139 
2140  cp = query + query_offset;
2141  cpmin = query;
2142  cp1 = cpmin;
2143 
2144  /* or assume matches up to byte edge */
2145  cq = subject_offset;
2146  cqmin = 0;
2147 
2148  cpmin4 = query + 4;
2149  cqmin4 = 4;
2150 
2151 
2152  while (cp >= cpmin && cq >= cqmin && num_mismatches < max_mismatches) {
2153 
2154  if ((cq & 3) == 3 && cp >= cpmin4 && cq >= cqmin4) {
2155  if (table[subject[cq / 4]] == *(Uint4*)(cp - 3)) {
2156  cp -= 4;
2157  cq -= 4;
2158  new_matches += 4;
2159  continue;
2160  }
2161  }
2162 
2163  if (*cp == UNPACK_BASE(subject, cq)) {
2164  cp--;
2165  cq--;
2166  new_matches++;
2167  continue;
2168  }
2169 
2170  /* handle ambiguous bases */
2171 
2172 
2173  jp = jumper;
2174  jp-- ;
2175  while (jp++) { /* 1, 1, 0, 0 = last always accepted */
2176 
2177  if (!jp->lng) {
2178  break;
2179  }
2180 
2181  n = 0;
2182  i = jp->ok;
2183  cp1 = cp - jp->dcp;
2184  cq1 = cq - jp->dcq;
2185  while (i--) {
2186  if (cp1 < cpmin || cq1 < cqmin
2187  || *cp1-- != UNPACK_BASE(subject, cq1)) {
2188 
2189  goto next_jp;
2190  }
2191  cq1--;
2192  }
2193 
2194  n = 0;
2195  i = jp->lng;
2196  cp1 = cp - jp->dcp;
2197  cq1 = cq - jp->dcq;
2198 
2199  if (cp1 <= cpmin || cq1 - i < cqmin) {
2200  continue;
2201  }
2202  while (i--) {
2203  if (cp1 < cpmin) {
2204  break;
2205  }
2206  if (/*cp1 < cpmin ||*/ cq1 < cqmin) {
2207  goto next_jp;
2208  }
2209  if (*cp1-- != UNPACK_BASE(subject, cq1)) {
2210  if (++n > jp->ok) {
2211  goto next_jp;
2212  }
2213  }
2214  cq1--;
2215  }
2216  /* current jumper row works */
2217  break;
2218 
2219 next_jp:
2220  continue;
2221  }
2222 
2223  if (new_matches) {
2224  ASSERT(edit_script->num_ops < edit_script->num_allocated);
2225  JUMPER_EDIT_BLOCK_ADD(edit_script, new_matches);
2226  if (trace) {
2227  if (new_matches < window) {
2228  trace <<= new_matches;
2229  }
2230  else {
2231  trace = 0;
2232  }
2233  }
2234  num_identical += new_matches;
2235  score += new_matches * match_score;
2236  new_matches = 0;
2237  last_gap_open = 0;
2238  }
2239 
2240  /* update optimal alignment extent */
2241  if (score >= best_score) {
2242  cpstop = cp;
2243  cqstop = cq;
2244  best_score = score;
2245  num_ops = edit_script->num_ops;
2246  *best_num_identical = num_identical;
2247  }
2248 
2249  if (best_score - score > x_drop) {
2250  break;
2251  }
2252 
2253  /* update recent errors */
2254  if (jp->dcp == jp->dcq) {
2255  score += jp->dcp * mismatch_score;
2256  if (trace & trace_mask) {
2257  num_mismatches += jp->dcp;
2258  trace <<= jp->dcp;
2259  trace |= (1 << jp->dcp) - 1;
2260  }
2261  else {
2262  num_mismatches = jp->dcp;
2263  trace = (1 << jp->dcp) - 1;
2264  }
2265  for (i = 0;i < jp->dcp;i++) {
2266  ASSERT(edit_script->num_ops < edit_script->num_allocated);
2267  JUMPER_EDIT_BLOCK_ADD(edit_script, JUMPER_MISMATCH);
2268  }
2269  } else if (jp->dcp > jp->dcq) {
2270  for (i = 0;i < jp->dcp - jp->dcq;i++) {
2271  ASSERT(edit_script->num_ops < edit_script->num_allocated);
2273  score += gap_extend_score;
2274  }
2275  if (last_gap_open != JUMPER_INSERTION) {
2276  score += gap_open_score;
2277  }
2278  last_gap_open = JUMPER_INSERTION;
2279  if (jp->dcq) {
2280  /* either jp->dcp r jp->dcq must be zero or jp->dcp == jp->dcq
2281  otherwise number of mismatches must be updated here */
2282  ASSERT(0);
2283  }
2284  }
2285  else {
2286  for (i = 0;i < jp->dcq - jp->dcp;i++) {
2287  ASSERT(edit_script->num_ops < edit_script->num_allocated);
2288  JUMPER_EDIT_BLOCK_ADD(edit_script, JUMPER_DELETION);
2289  score += gap_extend_score;
2290  }
2291  if (last_gap_open != JUMPER_DELETION) {
2292  score += gap_open_score;
2293  }
2294  last_gap_open = JUMPER_DELETION;
2295  if (jp->dcp) {
2296  /* either jp->dcp r jp->dcq must be zero or jp->dcp == jp->dcq
2297  otherwise number of mismatches must be updated here */
2298  ASSERT(0);
2299  }
2300  }
2301 
2302  /* update cp and cq */
2303  cp -= jp->dcp;
2304  cq -= jp->dcq;
2305 
2306  /* we have already checked that these are matches */
2307  if (cp1 > cpmin && !jp->ok && jp->lng) {
2308  cp -= jp->lng;
2309  cq -= jp->lng;
2310  ASSERT(edit_script->num_ops < edit_script->num_allocated);
2311  JUMPER_EDIT_BLOCK_ADD(edit_script, jp->lng);
2312  trace <<= jp->lng;
2313  num_identical += jp->lng;
2314  score += jp->lng * match_score;
2315  last_gap_open = 0;
2316  }
2317 
2318  /* update optimal alignment extent */
2319  if (score >= best_score) {
2320  cpstop = cp;
2321  cqstop = cq;
2322  best_score = score;
2323  num_ops = edit_script->num_ops;
2324  *best_num_identical = num_identical;
2325  }
2326  }
2327 
2328  /* if matches all the way to the end of a sequence */
2329  if (new_matches) {
2330  ASSERT(edit_script->num_ops < edit_script->num_allocated);
2331  JUMPER_EDIT_BLOCK_ADD(edit_script, new_matches);
2332  num_identical += new_matches;
2333  score += new_matches * match_score;
2334  }
2335 
2336  /* update optimal alignment extent */
2337  if (score >= best_score) {
2338  cpstop = cp;
2339  cqstop = cq;
2340  best_score = score;
2341  num_ops = edit_script->num_ops;
2342  *best_num_identical = num_identical;
2343  }
2344 
2345  *query_ext_len = (Int4)(query + query_offset - cpstop);
2346  *subject_ext_len = subject_offset - cqstop;
2347  edit_script->num_ops = num_ops;
2348 
2349  /* return extension score */
2350  return best_score;
2351 } /* JumperExtendLeftCompressedWithTracebackOptimal */
2352 
2353 
2355  Int4 query_offset, Int4 subject_offset,
2356  Int4 match_score, Int4 mismatch_score,
2357  Int4 gap_open_score, Int4 gap_extend_score,
2358  int max_mismatches, int window,
2359  int* query_ext_len, int* subject_ext_len,
2360  GapPrelimEditBlock* edit_script)
2361 {
2362  const Uint1 *cp, *cp1, *cpmin;
2363  Int4 cq, cq1, cqmin;
2364  int i, n;
2365  JUMP *jp;
2366  int score = 0, num_mismatches = 0;
2367  int new_matches = 0;
2368  Uint4 trace = 0;
2369  Uint4 trace_mask = (1 << max_mismatches) - 1;
2370  JUMP* jumper = jumper_default;
2371 
2372  if (!query || ! subject || !edit_script) {
2373  return -1;
2374  }
2375 
2376  cp = query + query_offset;
2377  cpmin = query;
2378 
2379  /* or assume matches up to byte edge */
2380  cq = subject_offset;
2381  cqmin = 0;
2382 
2383  while (cp >= cpmin && cq >= cqmin && num_mismatches < max_mismatches) {
2384 
2385  if (*cp == subject[cq]) {
2386  score += match_score;
2387  cp--;
2388  cq--;
2389  new_matches++;
2390  continue;
2391  }
2392 
2393  /* handle ambiguous bases */
2394 
2395 
2396  jp = jumper;
2397  jp-- ;
2398  while (jp++) { /* 1, 1, 0, 0 = last always accepted */
2399 
2400  if (!jp->lng) {
2401  break;
2402  }
2403 
2404  n = 0;
2405  i = jp->ok;
2406  cp1 = cp - jp->dcp;
2407  cq1 = cq - jp->dcq;
2408  while (i--) {
2409  if (cp1 < cpmin || cq1 < cqmin
2410  || *cp1-- != subject[cq1]) {
2411 
2412  goto next_jp;
2413  }
2414  cq1--;
2415  }
2416 
2417  n = 0;
2418  i = jp->lng;
2419  cp1 = cp - jp->dcp;
2420  cq1 = cq - jp->dcq;
2421  if (cp1 - i < cpmin || cq1 - i < cqmin) {
2422  continue;
2423  }
2424  while (i--) {
2425  if (cp1 < cpmin || cq1 < cqmin) {
2426  goto next_jp;
2427  }
2428  if (*cp1-- != subject[cq1]) {
2429  if (++n > jp->ok) {
2430  goto next_jp;
2431  }
2432  }
2433  cq1--;
2434  }
2435  /* current jumper row works */
2436  break;
2437 
2438 next_jp:
2439  continue;
2440  }
2441 
2442  if (new_matches) {
2443  GapPrelimEditBlockAdd(edit_script, eGapAlignSub, new_matches);
2444  if (trace) {
2445  if (new_matches < window) {
2446  trace <<= new_matches;
2447  }
2448  else {
2449  trace = 0;
2450  }
2451  }
2452  new_matches = 0;
2453  }
2454 
2455 /* another stop condition, ensures a positive score *--/
2456  if (!jp->lng && num_mismatches >= score + mismatch_score) {
2457  break;
2458  }
2459 */
2460 
2461  /* update score */
2462  if (jp->dcp == jp->dcq) {
2463  score += mismatch_score * jp->dcp;
2464  if (trace & trace_mask) {
2465  num_mismatches += jp->dcp;
2466  trace <<= jp->dcp;
2467  trace |= (1 << jp->dcp) - 1;
2468  }
2469  else {
2470  num_mismatches = jp->dcp;
2471  trace = (1 << jp->dcp) - 1;
2472  }
2473  GapPrelimEditBlockAdd(edit_script, eGapAlignSub, jp->dcp);
2474  } else if (jp->dcp > jp->dcq) {
2475  score += gap_open_score + gap_extend_score * (jp->dcp - jp->dcq);
2476  GapPrelimEditBlockAdd(edit_script, eGapAlignIns, jp->dcp - jp->dcq);
2477  ASSERT(!jp->dcq);
2478  }
2479  else {
2480  score += gap_open_score + gap_extend_score * (jp->dcq - jp->dcp);
2481  GapPrelimEditBlockAdd(edit_script, eGapAlignDel, jp->dcq - jp->dcp);
2482  ASSERT(!jp->dcp);
2483  }
2484 
2485  /* update cp and cq */
2486  cp -= jp->dcp;
2487  cq -= jp->dcq;
2488 
2489  /* we have already checked that these are matches */
2490  if (!jp->ok && jp->lng) {
2491  score += match_score * jp->lng;
2492  cp -= jp->lng;
2493  cq -= jp->lng;
2494 
2495  GapPrelimEditBlockAdd(edit_script, eGapAlignSub, jp->lng);
2496  trace <<= jp->lng;
2497  }
2498  }
2499 
2500  /* if matches all the way to the end of a sequence */
2501  if (new_matches) {
2502  GapPrelimEditBlockAdd(edit_script, eGapAlignSub, new_matches);
2503  }
2504 
2505  *query_ext_len = (int)(query + query_offset - cp);
2506  *subject_ext_len = subject_offset - cq;
2507 
2508  return score;
2509 } /* JumperExtendLeft */
2510 
2511 
2513  const Uint1* subject,
2514  Int4 query_length, Int4 subject_length,
2515  Int4 query_start, Int4 subject_start,
2516  BlastGapAlignStruct* gap_align,
2517  const BlastScoringParameters* score_params,
2518  Int4* num_identical,
2519  Int4* right_ungapped_ext_len)
2520 {
2521  Int4 score_left = 0, score_right = 0;
2522  Int4 q_ext_len, s_ext_len;
2523  Int4 q_length, s_length;
2524  Int4 offset_adjustment;
2525  Boolean left_ext_done = FALSE;
2526  const Uint1 kBaseN = 14;
2527  Int4 i;
2528 
2529 
2530  ASSERT(gap_align->jumper);
2531 
2532  *num_identical = 0;
2533 
2534  JumperPrelimEditBlock** rev_prelim_block =
2535  &gap_align->jumper->left_prelim_block;
2536 
2537  JumperPrelimEditBlock** fwd_prelim_block =
2538  &gap_align->jumper->right_prelim_block;
2539 
2540  /* reallocate gapped preliminary edit blocks if necessary;
2541  edit scripts should not be longer than 2 times the length of the shorter
2542  sequence, so memory reallocation will not be necessary */
2543  if (!*rev_prelim_block || !*fwd_prelim_block ||
2544  (*rev_prelim_block)->num_allocated <
2545  2 * MIN(query_length, subject_length)) {
2546 
2547  JumperPrelimEditBlockFree(*rev_prelim_block);
2548  *rev_prelim_block =
2549  JumperPrelimEditBlockNew(2 * MIN(query_length, subject_length));
2550 
2551  JumperPrelimEditBlockFree(*fwd_prelim_block);
2552  *fwd_prelim_block =
2553  JumperPrelimEditBlockNew(2 * MIN(query_length, subject_length));
2554  }
2555  s_ResetJumperPrelimEditBlocks(*rev_prelim_block, *fwd_prelim_block);
2556 
2557  /* this is currently needed for right extension */
2558  offset_adjustment = COMPRESSION_RATIO - (subject_start % COMPRESSION_RATIO);
2559 
2560  q_length = query_start + offset_adjustment;
2561  s_length = subject_start + offset_adjustment;
2562 
2563 
2564  if (query_start > 0 && subject_start > 0) {
2565 
2567  query, subject,
2568  q_length, s_length,
2569  score_params->reward,
2570  score_params->penalty,
2571  -score_params->gap_open,
2572  -score_params->gap_extend,
2573  gap_align->max_mismatches,
2574  gap_align->mismatch_window,
2575  gap_align->gap_x_dropoff,
2576  gap_align->jumper->table,
2577  &q_ext_len, &s_ext_len,
2578  *rev_prelim_block,
2579  num_identical);
2580 
2581 
2582  gap_align->query_start = q_length - q_ext_len + 1;
2583  gap_align->subject_start = s_length - s_ext_len + 1;
2584  left_ext_done = TRUE;
2585  ASSERT(gap_align->query_start >= 0);
2586  ASSERT(gap_align->subject_start >= 0);
2587  }
2588  else {
2589  gap_align->query_start = query_start;
2590  gap_align->subject_start = subject_start;
2591  }
2592 
2593  if (query_start < query_length - 1 && subject_start < subject_length - 1) {
2594 
2596  query + q_length,
2597  subject + (s_length + 3) / 4,
2598  query_length - q_length,
2599  subject_length - s_length,
2600  score_params->reward,
2601  score_params->penalty,
2602  -score_params->gap_open,
2603  -score_params->gap_extend,
2604  gap_align->max_mismatches,
2605  gap_align->mismatch_window,
2606  gap_align->gap_x_dropoff,
2607  gap_align->jumper->table,
2608  &q_ext_len, &s_ext_len,
2609  *fwd_prelim_block,
2610  num_identical,
2611  left_ext_done,
2612  right_ungapped_ext_len);
2613 
2614  gap_align->query_stop = q_length + q_ext_len;
2615  gap_align->subject_stop = s_length + s_ext_len;
2616  }
2617  else {
2618  gap_align->query_stop = query_start;
2619  gap_align->subject_stop = subject_start;
2620  }
2621 
2622  gap_align->score = score_left + score_right;
2623 
2624  if (offset_adjustment && !left_ext_done) {
2625  ASSERT((*rev_prelim_block)->num_ops <
2626  (*rev_prelim_block)->num_allocated);
2627  JUMPER_EDIT_BLOCK_ADD(*rev_prelim_block, offset_adjustment);
2628 
2629  *num_identical += offset_adjustment;
2630  gap_align->score += offset_adjustment * score_params->reward;
2631  }
2632  if (offset_adjustment && *right_ungapped_ext_len) {
2633  *right_ungapped_ext_len += offset_adjustment;
2634  }
2635 
2636  /* Remove mismatch penalty for Ns. We assume that there are no gaps
2637  corresponfding to Ns (gaps may happen, but are unlikely). This to
2638  ensure that an alignment spanning the whole read containing a few Ns
2639  has a larger score than one covering a part. */
2640  for (i = gap_align->query_start;i < gap_align->query_stop;i++) {
2641  if (query[i] == kBaseN) {
2642  gap_align->score -= score_params->penalty;
2643  }
2644  }
2645 
2646  return 0;
2647 }
2648 
2649 
2651  const BlastHitSavingParameters* hit_params,
2652  Int4 num_identical,
2653  BlastContextInfo* context_info)
2654 {
2655  Int4 align_len;
2656  Int4 cutoff_score;
2657  Int4 edit_dist;
2658  Int4 score = gap_align->score;
2659 
2660  align_len = MAX(gap_align->query_stop - gap_align->query_start,
2661  gap_align->subject_stop - gap_align->subject_start);
2662 
2663  /* check percent identity */
2664  if (100.0 * (double)num_identical / (double)align_len
2665  < hit_params->options->percent_identity) {
2666 
2667  return FALSE;
2668  }
2669 
2670  /* for spliced alignments thresholds apply to the final spliced
2671  alignment */
2672  if (hit_params->options->splice) {
2673  return TRUE;
2674  }
2675 
2676  cutoff_score = 0;
2677  if (hit_params->options->cutoff_score_fun[1] != 0) {
2678  cutoff_score = (hit_params->options->cutoff_score_fun[0] +
2679  context_info->query_length *
2680  hit_params->options->cutoff_score_fun[1]) / 100;
2681  }
2682  else if (hit_params->options->cutoff_score == 0) {
2683  cutoff_score = GetCutoffScore(context_info->query_length);
2684  }
2685  else {
2686  cutoff_score = hit_params->options->cutoff_score;
2687  }
2688 
2689  /* for continuous alignments check score threshold here */
2690  if (score < cutoff_score) {
2691  return FALSE;
2692  }
2693 
2694  edit_dist = align_len - num_identical;
2695  if (edit_dist > hit_params->options->max_edit_distance) {
2696  return FALSE;
2697  }
2698 
2699  return TRUE;
2700 }
2701 
2702 
2703 
2705 {
2706  if (!block) {
2707  return NULL;
2708  }
2709 
2710  if (block->edits) {
2711  sfree(block->edits);
2712  }
2713  sfree(block);
2714 
2715  return NULL;
2716 }
2717 
2719 {
2720  JumperEditsBlock* retval =
2722 
2723  if (!retval) {
2724  return NULL;
2725  }
2726 
2727  retval->edits = (JumperEdit*)calloc(num, sizeof(JumperEdit));
2728  if (!retval->edits) {
2729  JumperEditsBlockFree(retval);
2730  return NULL;
2731  }
2732 
2733  retval->num_edits = 0;
2734  return retval;
2735 }
2736 
2738 {
2739  JumperEditsBlock* retval = NULL;
2740 
2741  if (!block) {
2742  return NULL;
2743  }
2744 
2745  retval = JumperEditsBlockNew(block->num_edits);
2746  if (retval) {
2747  memcpy(retval->edits, block->edits,
2748  block->num_edits * sizeof(JumperEdit));
2749  retval->num_edits = block->num_edits;
2750  }
2751 
2752  return retval;
2753 }
2754 
2756  BlastGapAlignStruct* gap_align)
2757 {
2758  const Uint1 kGap = 15;
2759  Int4 q_pos = gap_align->query_start;
2760  Int4 s_pos = gap_align->subject_start;
2761 
2762  JumperPrelimEditBlock* left_ext = gap_align->jumper->left_prelim_block;
2763  JumperPrelimEditBlock* right_ext = gap_align->jumper->right_prelim_block;
2764 
2765  JumperEditsBlock* retval = NULL;
2766  int len = left_ext->num_ops + right_ext->num_ops;
2767  int i, k;
2768 
2769  retval = JumperEditsBlockNew(len);
2770  if (!retval) {
2771  return NULL;
2772  }
2773  k = 0;
2774  for (i=left_ext->num_ops - 1;i >= 0;i--) {
2775 
2776  ASSERT(k < len);
2777 
2778  JumperEdit* edit = &retval->edits[k];
2779  JumperOpType* edit_op = &left_ext->edit_ops[i];
2780  switch(*edit_op) {
2781  case JUMPER_MISMATCH:
2782  edit->query_pos = q_pos;
2783  edit->query_base = query[q_pos];
2784  edit->subject_base = UNPACK_BASE(subject, s_pos);
2785  q_pos++;
2786  s_pos++;
2787  k++;
2788  break;
2789 
2790  case JUMPER_DELETION:
2791  edit->query_pos = q_pos;
2792  edit->query_base = kGap;
2793  edit->subject_base = UNPACK_BASE(subject, s_pos);
2794  s_pos++;
2795  k++;
2796  break;
2797 
2798  case JUMPER_INSERTION:
2799  edit->query_pos = q_pos;
2800  edit->query_base = query[q_pos];
2801  edit->subject_base = kGap;
2802  q_pos++;
2803  k++;
2804  break;
2805 
2806  default:
2807  q_pos += *edit_op;
2808  s_pos += *edit_op;
2809  }
2810  }
2811 
2812  for (i=0;i < right_ext->num_ops;i++) {
2813 
2814  ASSERT(k < len);
2815 
2816  JumperEdit* edit = &retval->edits[k];
2817  JumperOpType* edit_op = &right_ext->edit_ops[i];
2818  switch(*edit_op) {
2819  case JUMPER_MISMATCH:
2820  edit->query_pos = q_pos;
2821  edit->query_base = query[q_pos];
2822  edit->subject_base = UNPACK_BASE(subject, s_pos);
2823  q_pos++;
2824  s_pos++;
2825  k++;
2826  break;
2827 
2828  case JUMPER_DELETION:
2829  edit->query_pos = q_pos;
2830  edit->query_base = kGap;
2831  edit->subject_base = UNPACK_BASE(subject, s_pos);
2832  s_pos++;
2833  k++;
2834  break;
2835 
2836  case JUMPER_INSERTION:
2837  edit->query_pos = q_pos;
2838  edit->query_base = query[q_pos];
2839  edit->subject_base = kGap;
2840  q_pos++;
2841  k++;
2842  break;
2843 
2844  default:
2845  q_pos += *edit_op;
2846  s_pos += *edit_op;
2847  }
2848  }
2849 
2850  retval->num_edits = k;
2851 
2852  ASSERT(k <= len);
2853  ASSERT(q_pos == gap_align->query_stop);
2854  ASSERT(s_pos == gap_align->subject_stop);
2855 
2856  return retval;
2857 }
2858 
2859 
2861  JumperEditsBlock** append_ptr)
2862 {
2863  Int4 i;
2864  JumperEditsBlock* block = NULL;
2866 
2867  if (!block_ptr || !*block_ptr || !append_ptr) {
2868  return NULL;
2869  }
2870 
2871  block = *block_ptr;
2872  append = *append_ptr;
2873 
2874  if (!append || append->num_edits == 0) {
2875  *append_ptr = JumperEditsBlockFree(*append_ptr);
2876  return block;
2877  }
2878 
2879  block->edits = realloc(block->edits,
2880  (block->num_edits + append->num_edits) *
2881  sizeof(JumperEdit));
2882 
2883  if (!block->edits) {
2884  return NULL;
2885  }
2886  for (i = 0;i < append->num_edits;i++) {
2887  block->edits[block->num_edits++] = append->edits[i];
2888  }
2889 
2890  *append_ptr = JumperEditsBlockFree(*append_ptr);
2891 
2892  return block;
2893 }
2894 
2896  GapEditScript** append_ptr)
2897 {
2898  Int4 i = 0;
2899  GapEditScript* edit_script = NULL;
2901 
2902  if (!edit_script_ptr || !*edit_script_ptr || !append_ptr) {
2903  return NULL;
2904  }
2905 
2906  edit_script = *edit_script_ptr;
2907  append = *append_ptr;
2908 
2909  if (!append || append->size == 0) {
2910  *append_ptr = GapEditScriptDelete(*append_ptr);
2911  return edit_script;
2912  }
2913 
2914  edit_script->op_type = realloc(edit_script->op_type,
2915  (edit_script->size + append->size) *
2916  sizeof(EGapAlignOpType));
2917  if (!edit_script->op_type) {
2918  return NULL;
2919  }
2920  edit_script->num = realloc(edit_script->num,
2921  (edit_script->size + append->size) *
2922  sizeof(Int4));
2923  if (!edit_script->num) {
2924  return NULL;
2925  }
2926 
2927  if (edit_script->op_type[edit_script->size - 1] == append->op_type[0]) {
2928  edit_script->num[edit_script->size - 1] += append->num[0];
2929  i = 1;
2930  }
2931  for (;i < append->size;i++) {
2932  edit_script->op_type[edit_script->size] = append->op_type[i];
2933  edit_script->num[edit_script->size] = append->num[i];
2934  edit_script->size++;
2935  }
2936 
2937  *append_ptr = GapEditScriptDelete(*append_ptr);
2938 
2939  return edit_script;
2940 }
2941 
2942 
2943 #define NUM_SIGNALS 8
2944 
2945 /* Record subject bases pre and post alignment in HSP as possible splice
2946  signals and set a flag for recognized signals */
2948  const Uint1* subject, Int4 subject_len)
2949 {
2950  if (!hsp || !subject) {
2951  return -1;
2952  }
2953 
2954  if (hsp->query.offset == 0 || hsp->subject.offset < 2) {
2955  hsp->map_info->left_edge = MAPPER_EXON;
2956  }
2957  else {
2958  hsp->map_info->left_edge =
2959  (UNPACK_BASE(subject, hsp->subject.offset - 2) << 2) |
2960  UNPACK_BASE(subject, hsp->subject.offset - 1);
2961  }
2962 
2963  if (hsp->query.end == query_len || hsp->subject.end == subject_len) {
2965  }
2966  else {
2967  hsp->map_info->right_edge =
2968  (UNPACK_BASE(subject, hsp->subject.end) << 2) |
2969  UNPACK_BASE(subject, hsp->subject.end + 1);
2970  }
2971 
2972  return 0;
2973 }
2974 
2975 
2977 {
2978  if (!overhangs) {
2979  return NULL;
2980  }
2981 
2982  if (overhangs->left) {
2983  sfree(overhangs->left);
2984  }
2985 
2986  if (overhangs->right) {
2987  sfree(overhangs->right);
2988  }
2989 
2990  sfree(overhangs);
2991  return NULL;
2992 }
2993 
2994 
2996  Int4 query_len)
2997 {
2998  SequenceOverhangs* overhangs = NULL;
2999  const Int4 kMinOverhangLength = 0;
3000  const Int4 kMaxSubjectOverhang = query_len < 400 ? 30 : 60;
3001 
3002  if (hsp->query.offset < kMinOverhangLength &&
3003  query_len - hsp->query.end < kMinOverhangLength) {
3004 
3005  return 0;
3006  }
3007 
3008  overhangs = calloc(1, sizeof(SequenceOverhangs));
3009  if (!overhangs) {
3010  return -1;
3011  }
3012 
3013  if (hsp->query.offset >= kMinOverhangLength) {
3014  Int4 i;
3015  /* at least two subject bases are needed for the search for splice
3016  signals */
3017  Int4 len = MIN(MAX(hsp->query.offset, 2), kMaxSubjectOverhang);
3018  Uint1* overhang = calloc(len, sizeof(Uint1));
3019  if (!overhang) {
3020  SequenceOverhangsFree(overhangs);
3021  return -1;
3022  }
3023  if (hsp->subject.offset < len) {
3024  len = hsp->subject.offset;
3025  }
3026  for (i = 0;i < len; i++) {
3027  overhang[i] = UNPACK_BASE(subject, hsp->subject.offset - len + i);
3028  }
3029  overhangs->left = overhang;
3030  overhangs->left_len = len;
3031  }
3032 
3033  if (hsp->query.end <= query_len - kMinOverhangLength) {
3034  Int4 i;
3035  Int4 len;
3036  /* right side overhangs are used to populate SAM MD tag, so we need
3037  to save a long overhang, unless very few query bases are left and
3038  we are not expecting to find another local alignment */
3039  if (query_len - hsp->query.end + 1 < 6)
3040  len =
3041  MIN(MAX(query_len - hsp->query.end + 1, 2), kMaxSubjectOverhang);
3042  else {
3043  len = kMaxSubjectOverhang;
3044  }
3045  Uint1* overhang = calloc(len, sizeof(Uint1));
3046  if (!overhang) {
3047  SequenceOverhangsFree(overhangs);
3048  return -1;
3049  }
3050  for (i = 0;i < len;i++) {
3051  /* hsp->subject.end is an open limit */
3052  overhang[i] = UNPACK_BASE(subject, hsp->subject.end + i);
3053  }
3054  overhangs->right = overhang;
3055  overhangs->right_len = len;
3056  }
3057 
3058  hsp->map_info->subject_overhangs = overhangs;
3059 
3060  return 0;
3061 }
3062 
3063 
3064 static int s_CompareOffsetPairsByDiagQuery(const void* a, const void* b)
3065 {
3067  BlastOffsetPair* second = (BlastOffsetPair*)b;
3068 
3069  Int4 first_diag = first->qs_offsets.s_off - first->qs_offsets.q_off;
3070  Int4 second_diag = second->qs_offsets.s_off - second->qs_offsets.q_off;
3071 
3072  if (first_diag < second_diag) {
3073  return -1;
3074  }
3075  if (first_diag > second_diag) {
3076  return 1;
3077  }
3078 
3079  if (first->qs_offsets.q_off < second->qs_offsets.q_off) {
3080  return -1;
3081  }
3082  if (first->qs_offsets.q_off > second->qs_offsets.q_off) {
3083  return 1;
3084  }
3085 
3086  if (first->qs_offsets.s_off < second->qs_offsets.s_off) {
3087  return -1;
3088  }
3089  if (first->qs_offsets.s_off > second->qs_offsets.s_off) {
3090  return 1;
3091  }
3092 
3093  return 0;
3094 }
3095 
3096 
3097 /* Create an HSP from a word hit */
3098 static BlastHSP* s_CreateHSPForWordHit(Int4 q_offset, Int4 s_offset,
3099  Int4 length, Int4 context,
3100  const Uint1* query,
3101  const BlastQueryInfo* query_info,
3102  const BLAST_SequenceBlk* subject,
3103  Int4 query_len)
3104 {
3105  BlastHSP* retval = NULL;
3106  GapEditScript* edit_script = NULL;
3107  Int2 status;
3108  Int4 i;
3109  Int4 num_Ns = 0;
3110 
3111  edit_script = GapEditScriptNew(1);
3112  if (!edit_script) {
3113  return NULL;
3114  }
3115 
3116  edit_script->num[0] = length;
3117  edit_script->op_type[0] = eGapAlignSub;
3118 
3119 
3120  status = Blast_HSPInit(q_offset,
3121  q_offset + length,
3122  s_offset,
3123  s_offset + length,
3124  q_offset, s_offset,
3125  context,
3126  query_info->contexts[context].frame,
3127  subject->frame,
3128  length,
3129  &edit_script,
3130  &retval);
3131  if (status) {
3132  if (retval) {
3133  Blast_HSPFree(retval);
3134  }
3135  else {
3136  GapEditScriptDelete(edit_script);
3137  }
3138  return NULL;
3139  }
3140 
3141  retval->map_info = BlastHSPMappingInfoNew();
3142  if (!retval->map_info) {
3143  Blast_HSPFree(retval);
3144  return NULL;
3145  }
3146 
3147  /* ambiguous bases in query are allowed for extending small word matches,
3148  so there can be Ns within the word hit */
3149  for (i = 0;i < length;i++) {
3150  if ((query[q_offset + i] & 0xfc) != 0) {
3151  num_Ns++;
3152  }
3153  }
3154 
3155  retval->num_ident = length - num_Ns;
3156  retval->evalue = 0.0;
3157  retval->map_info->edits = JumperEditsBlockNew(num_Ns);
3158  if (!retval->map_info->edits) {
3159  Blast_HSPFree(retval);
3160  return NULL;
3161  }
3162 
3163  /* add edits for ambiguous bases */
3164  for (i = 0;i < length;i++) {
3165  if ((query[q_offset + i] & 0xfc) != 0) {
3166  JumperEdit* edit = retval->map_info->edits->edits +
3167  retval->map_info->edits->num_edits;
3168 
3169  ASSERT(retval->map_info->edits->num_edits < num_Ns);
3170  edit->query_pos = q_offset + i;
3171  edit->query_base = query[q_offset + i];
3172  edit->subject_base = UNPACK_BASE(subject->sequence, s_offset + i);
3173  retval->map_info->edits->num_edits++;
3174  }
3175  }
3176 
3177  /* FIXME: This is currently needed because these splice sites are used
3178  in finding splice signals for overlapping HSPs */
3179  JumperFindSpliceSignals(retval, query_len, subject->sequence,
3180  subject->length);
3181 
3182  s_SaveSubjectOverhangs(retval, subject->sequence, query_len);
3183 
3184  return retval;
3185 }
3186 
3187 
3188 static BlastHSP* s_CreateHSP(Uint1* query_seq,
3189  Int4 query_len,
3190  Int4 context,
3191  BlastQueryInfo* query_info,
3192  BlastGapAlignStruct* gap_align,
3194  const BlastScoringParameters* score_params,
3195  const BlastHitSavingParameters* hit_params)
3196 {
3197  Int4 num_identical = 0;
3198  Int4 status;
3199  BlastHSP* new_hsp = NULL;
3200 
3201  if (!getenv("MAPPER_NO_GAP_SHIFT")) {
3202  s_ShiftGaps(gap_align, query_seq, subject->sequence,
3203  query_len, subject->length,
3204  score_params->penalty, &num_identical);
3205  }
3206 
3208  gap_align->jumper->left_prelim_block,
3209  gap_align->jumper->right_prelim_block);
3210 
3211 
3212  status = Blast_HSPInit(gap_align->query_start,
3213  gap_align->query_stop,
3214  gap_align->subject_start,
3215  gap_align->subject_stop,
3216  gap_align->query_start,
3217  gap_align->subject_start,
3218  context,
3219  query_info->contexts[context].frame,
3220  subject->frame,
3221  gap_align->score,
3222  &(gap_align->edit_script),
3223  &new_hsp);
3224 
3225  if (!new_hsp || status) {
3226  return NULL;
3227  }
3228  new_hsp->map_info = BlastHSPMappingInfoNew();
3229  if (!new_hsp->map_info) {
3230  return NULL;
3231  }
3232 
3233  new_hsp->num_ident = num_identical;
3234  new_hsp->evalue = 0.0;
3235  new_hsp->map_info->edits =
3236  JumperFindEdits(query_seq, subject->sequence, gap_align);
3237 
3238  if (hit_params->options->splice) {
3239  /* FIXME: This is currently needed because these splice
3240  sites are used in finding splice signals for overlapping
3241  HSPs */
3242  JumperFindSpliceSignals(new_hsp, query_len, subject->sequence,
3243  subject->length);
3244 
3245  s_SaveSubjectOverhangs(new_hsp, subject->sequence, query_len);
3246  }
3247 
3248  return new_hsp;
3249 }
3250 
3251 /* for mapping this may only work if we hash genome and scan reads */
3252 Int4
3253 BlastNaExtendJumper(BlastOffsetPair* offset_pairs, Int4 num_hits,
3254  const BlastInitialWordParameters* word_params,
3255  const BlastScoringParameters* score_params,
3256  const BlastHitSavingParameters* hit_params,
3257  LookupTableWrap* lookup_wrap,
3260  BlastQueryInfo* query_info,
3261  BlastGapAlignStruct* gap_align,
3262  BlastHSPList* hsp_list,
3263  Uint4 s_range,
3264  SubjectIndex* s_index)
3265 {
3266  Int4 index = 0;
3267  Int4 hits_extended = 0;
3268  Int4 word_length, lut_word_length, ext_to;
3269  Int4 context;
3270  int status;
3271  Int4 num_identical = 0;
3272  /* word matches until this query position will be skipped */
3273  Int4 skip_until = 0;
3274  Int4 last_diag = 0;
3275 
3276  if (lookup_wrap->lut_type == eMBLookupTable) {
3277  BlastMBLookupTable *lut = (BlastMBLookupTable *) lookup_wrap->lut;
3278  word_length = lut->word_length;
3279  lut_word_length = lut->lut_word_length;
3280  }
3281  else {
3282  BlastNaLookupTable *lut = (BlastNaLookupTable *) lookup_wrap->lut;
3283  word_length = lut->word_length;
3284  lut_word_length = lut->lut_word_length;
3285  }
3286  ext_to = word_length - lut_word_length;
3287 
3288  /* We trust that the bases of the hit itself are exact matches,
3289  and look only for exact matches before and after the hit.
3290 
3291  Most of the time, the lookup table width is close to the word size
3292  so only a few bases need examining. Also, most of the time (for
3293  random sequences) extensions will fail almost immediately (the
3294  first base examined will not match about 3/4 of the time). Thus it
3295  is critical to reduce as much as possible all work that is not the
3296  actual examination of sequence data */
3297 
3298  /* Sort word hits by diagnal, query, and subject position */
3299  qsort(offset_pairs, num_hits, sizeof(BlastOffsetPair),
3301 
3302  for (; index < num_hits; ++index) {
3303  Int4 s_offset = offset_pairs[index].qs_offsets.s_off;
3304  Int4 q_offset = offset_pairs[index].qs_offsets.q_off;
3305  Int4 query_start;
3306  /* FIXME: this may need to be Int8 for 10M queries or long reads */
3307  Int4 diag = s_offset - q_offset;
3308 
3309  /* skip word hits from already explored part of the diagonal (word
3310  hits must be sorted by diagonal and query position) */
3311  if (diag == last_diag && q_offset < skip_until) {
3312  continue;
3313  }
3314 
3315  /* begin with the left extension; the initialization is slightly
3316  faster. Point to the first base of the lookup table hit and
3317  work backwards */
3318 
3319  Int4 ext_left = 0;
3320  Int4 s_off = s_offset;
3321  Uint1 *q = query->sequence + q_offset;
3322  Uint1 *s = subject->sequence + s_off / COMPRESSION_RATIO;
3323 
3324  for (; ext_left < MIN(ext_to, s_offset); ++ext_left) {
3325  s_off--;
3326  q--;
3327  if (s_off % COMPRESSION_RATIO == 3)
3328  s--;
3329  if (((Uint1) (*s << (2 * (s_off % COMPRESSION_RATIO))) >> 6)
3330  != *q)
3331  break;
3332  }
3333 
3334  /* do the right extension if the left extension did not find all
3335  the bases required. Begin at the first base beyond the lookup
3336  table hit and move forwards */
3337 
3338  if (ext_left < ext_to) {
3339  Int4 ext_right = 0;
3340  s_off = s_offset + lut_word_length;
3341  if (s_off + ext_to - ext_left > s_range)
3342  continue;
3343  q = query->sequence + q_offset + lut_word_length;
3344  s = subject->sequence + s_off / COMPRESSION_RATIO;
3345 
3346  for (; ext_right < ext_to - ext_left; ++ext_right) {
3347  if (((Uint1) (*s << (2 * (s_off % COMPRESSION_RATIO))) >>
3348  6) != *q)
3349  break;
3350  s_off++;
3351  q++;
3352  if (s_off % COMPRESSION_RATIO == 0)
3353  s++;
3354  }
3355 
3356  /* check if enough extra matches were found */
3357  if (ext_left + ext_right < ext_to)
3358  continue;
3359  }
3360 
3361  q_offset -= ext_left;
3362  s_offset -= ext_left;
3363 
3364  /* adjust query offset */
3365  context = BSearchContextInfo(q_offset, query_info);
3366  query_start = query_info->contexts[context].query_offset;
3367  q_offset -= query_start;
3368  ASSERT(q_offset >= 0);
3369 
3370  Int4 query_len = query_info->contexts[context].query_length;
3371  Uint1* query_seq = query->sequence + query_start;
3372  Int4 right_ungapped_ext_len = 0;
3373 
3375  subject->sequence,
3376  query_len,
3377  subject->length,
3378  q_offset,
3379  s_offset,
3380  gap_align,
3381  score_params,
3382  &num_identical,
3383  &right_ungapped_ext_len);
3384 
3385  hits_extended++;
3386  /* Word hits on the same diagonal up to this query position will be
3387  skipped. right_ungapped_ext_len gives the length of the first
3388  ungapped segment of the right extension, hence word hits on this
3389  part of the diagonal will not generate better alignment than the
3390  one just found. This is assuming that word hits are sorted by
3391  diagonal and query position. */
3392  skip_until = q_offset + query_start + right_ungapped_ext_len;
3393  last_diag = diag;
3394 
3395  if (JumperGoodAlign(gap_align, hit_params, num_identical,
3396  &query_info->contexts[context])) {
3397 
3398  BlastHSP* new_hsp;
3399  Uint1* query_seq = query->sequence + query_start;
3400  /* minimum length of the unaligned part of the subject to search
3401  for matching small words */
3402  const Int4 kMinSubjectOverhang = 100;
3403  /* maximum intron length, i.e. gap between two hsps that can be
3404  combined */
3405  const Int4 kMaxIntronLength = hit_params->options->longest_intron;
3406 
3407  /* shift gaps the right */
3408  /* When several gap positions give the same alignment score,
3409  gap positions in the alignment are arbitrary. Here we ensure
3410  that gaps will be at the same postitions for all reads
3411  that map to the same place in the genome. */
3412  if (!getenv("MAPPER_NO_GAP_SHIFT")) {
3413  s_ShiftGaps(gap_align, query_seq, subject->sequence,
3414  query_len, subject->length,
3415  score_params->penalty, &num_identical);
3416  }
3417 
3419  gap_align->jumper->left_prelim_block,
3420  gap_align->jumper->right_prelim_block);
3421 
3422 
3423  status = Blast_HSPInit(gap_align->query_start,
3424  gap_align->query_stop,
3425  gap_align->subject_start,
3426  gap_align->subject_stop,
3427  q_offset, s_offset,
3428  context,
3429  query_info->contexts[context].frame,
3430  subject->frame,
3431  gap_align->score,
3432  &(gap_align->edit_script),
3433  &new_hsp);
3434  if (!new_hsp) {
3435  return -1;
3436  }
3437  new_hsp->map_info = BlastHSPMappingInfoNew();
3438  if (!new_hsp->map_info) {
3439  return -1;
3440  }
3441  new_hsp->num_ident = num_identical;
3442  new_hsp->evalue = 0.0;
3443  new_hsp->map_info->edits =
3444  JumperFindEdits(query_seq, subject->sequence, gap_align);
3445 
3446  if (hit_params->options->splice) {
3447  /* FIXME: This is currently needed because these splice
3448  sites are used in finding splice signals for overlapping
3449  HSPs */
3450  JumperFindSpliceSignals(new_hsp, query_len, subject->sequence,
3451  subject->length);
3452 
3453  s_SaveSubjectOverhangs(new_hsp, subject->sequence, query_len);
3454  }
3455 
3456  ASSERT(new_hsp);
3457  status = Blast_HSPListSaveHSP(hsp_list, new_hsp);
3458  if (status) {
3459  break;
3460  }
3461 
3462 
3463  /* If left of right query overhang is shorter than word size, then
3464  if overhang length is smaller than mismatch penalty,
3465  first assume the alignment ended with a mismatch and try to
3466  extend it with perfect matches to the end of the query. If this
3467  does not succeed, then look for 4-base word matches from
3468  query overhangs. */
3469 
3470  /* first for the right end of the query */
3471  if (getenv("MAPPER_USE_SMALL_WORDS") &&
3472  query_len - new_hsp->query.end < 16 &&
3473  query_len - new_hsp->query.end >= SUBJECT_INDEX_WORD_LENGTH &&
3474  subject->length - new_hsp->subject.end >= kMinSubjectOverhang) {
3475 
3476  Int4 q = new_hsp->query.end;
3477  Int4 round = 0;
3478  Boolean found = FALSE;
3479  ASSERT(s_index);
3480 
3481  /* if number of unaligned query bases is smaller than
3482  mismatch penalty, then try extending the alignment past the
3483  mismatch to the end of the query */
3484  if (query_len - new_hsp->query.end < -score_params->penalty) {
3485  Int4 i;
3486  for (i = 1;i < query_len - new_hsp->query.end;i++) {
3487  if (query_seq[new_hsp->query.end + i] !=
3488  UNPACK_BASE(subject->sequence,
3489  new_hsp->subject.end + i)) {
3490  break;
3491  }
3492  }
3493 
3494  /* if extension succeeded through 4 bases or till the end
3495  of thequery, create and save an HSP */
3496  if (i > 4 || i == query_len - new_hsp->query.end) {
3497 
3499  new_hsp->query.end + 1,
3500  new_hsp->subject.end + 1,
3501  i - 1,
3502  context, query_seq,
3503  query_info, subject,
3504  query_len);
3505 
3506  ASSERT(w_hsp->query.offset >= 0 &&
3507  w_hsp->query.end <= query_len);
3508  status = Blast_HSPListSaveHSP(hsp_list, w_hsp);
3509  if (status) {
3510  return -1;
3511  }
3512  }
3513  }
3514 
3515  for (; !found && q + SUBJECT_INDEX_WORD_LENGTH <= query_len &&
3516  round < 2;q++, round++) {
3517  Int4 word;
3518  Int4 from;
3519  Int4 s_pos;
3520  SubjectIndexIterator* it = NULL;
3521  Int4 i;
3522 
3523  /* skip over ambiguous bases */
3524  while (q + SUBJECT_INDEX_WORD_LENGTH <= query_len) {
3525  for (i = 0;i < SUBJECT_INDEX_WORD_LENGTH;i++) {
3526  if ((query_seq[q + i] & 0xfc) != 0) {
3527  q = q + i + 1;
3528  break;
3529  }
3530  }
3531 
3532  if (i == SUBJECT_INDEX_WORD_LENGTH) {
3533  break;
3534  }
3535  }
3536 
3537  if (q + SUBJECT_INDEX_WORD_LENGTH - 1 >= query_len) {
3538  break;
3539  }
3540 
3541  /* this is query word */
3542  word = (query_seq[q] << 6) | (query_seq[q + 1] << 4) |
3543  (query_seq[q + 2] << 2) | query_seq[q + 3];
3544  for (i = 4; i < SUBJECT_INDEX_WORD_LENGTH; i++) {
3545  word = (word << 2) | query_seq[q + i];
3546  }
3547 
3548  /* search subject for matching words from this position */
3549  from = new_hsp->subject.end;
3550 
3551  /* create subject word iterator and iterate from the end
3552  of current HSP through the max intron length or up to
3553  the end of subject less query overhang bases (so that
3554  we can extend to the end of the query) */
3555  it = SubjectIndexIteratorNew(s_index, word, from,
3556  MIN((from + kMaxIntronLength),
3557  (subject->length - (query_len - q + 1))));
3558 
3559  /* for each word match position in the subject */
3560  for (s_pos = SubjectIndexIteratorNext(it); s_pos >= 0;
3561  s_pos = SubjectIndexIteratorNext(it)) {
3562  Int4 qt;
3563  Int4 st;
3564 
3565  /* try to extend the hit right to the end of the
3566  query accepting only matches */
3567  qt = q;
3568  st = s_pos;
3569  while (qt < query_len && st < subject->length &&
3570  (query_seq[qt] ==
3571  UNPACK_BASE(subject->sequence, st) ||
3572  /* skip over ambiguous bases */
3573  (query_seq[qt] & 0xfc) != 0)) {
3574  qt++;
3575  st++;
3576  }
3577 
3578  /* proceed only if query matches to the end and try
3579  extending left as much as possible */
3580  if (qt == query_len) {
3581  Int4 qf = q;
3582  Int4 sf = s_pos;
3583  BlastHSP* w_hsp = NULL;
3584 
3585  while (qf >= 0 && sf >= 0 &&
3586  (query_seq[qf] ==
3587  UNPACK_BASE(subject->sequence, sf) ||
3588  (query_seq[qf] & 0xfc) != 0)) {
3589 
3590  qf--;
3591  sf--;
3592  }
3593  qf++;
3594  sf++;
3595  ASSERT(qt - qf == st - sf);
3596 
3597  if (qt - qf < 5 ||
3598  qf > new_hsp->query.end + 1 ||
3599  qf <= new_hsp->query.offset) {
3600 
3601  continue;
3602  }
3603 
3604 
3605  /* trim ambiguous bases at the ends of the word
3606  match */
3607  while (qf < qt && (query_seq[qf] & 0xfc) != 0) {
3608  qf++;
3609  sf++;
3610  }
3611 
3612  while (qt > qf && (query_seq[qt - 1] & 0xfc) != 0) {
3613  qt--;
3614  st--;
3615  }
3616 
3617  if (qf >= qt) {
3618  continue;
3619  }
3620 
3621  /* create HSP */
3622  w_hsp = s_CreateHSPForWordHit(qf, sf, qt - qf,
3623  context, query_seq,
3624  query_info, subject,
3625  query_len);
3626 
3627  ASSERT(w_hsp->query.offset >= 0 &&
3628  w_hsp->query.end <= query_len);
3629 
3630  ASSERT(w_hsp);
3631  if (!w_hsp) {
3632  return -1;
3633  }
3634 
3635  /* add HSP to the list */
3636  status = Blast_HSPListSaveHSP(hsp_list, w_hsp);
3637  if (status) {
3638  return -1;
3639  }
3640  }
3641  }
3642  it = SubjectIndexIteratorFree(it);
3643  }
3644  }
3645 
3646 
3647  if (getenv("MAPPER_USE_SMALL_WORDS") &&
3648  new_hsp->query.offset < 16 &&
3649  new_hsp->query.offset >= SUBJECT_INDEX_WORD_LENGTH &&
3650  new_hsp->subject.offset >= kMinSubjectOverhang) {
3651 
3653  0);
3654  Int4 round = 0;
3655  Boolean found = FALSE;
3656  ASSERT(s_index);
3657 
3658  /* if number of unaligned query bases is smaller than
3659  mismatch penalty, then try extending the alignment past the
3660  mismatch to beginning of the query */
3661  if (new_hsp->query.offset < -score_params->penalty) {
3662  Int4 i;
3663  for (i = 2;i < new_hsp->query.offset - 1;i++) {
3664  if (query_seq[new_hsp->query.offset - i] !=
3665  UNPACK_BASE(subject->sequence,
3666  new_hsp->subject.offset - i)) {
3667  break;
3668  }
3669  }
3670 
3671  /* if the extension suceeded thrught at least 4 bases or
3672  to the beginning of the query, create and save the HSP */
3673  if (i > 4 || i == new_hsp->query.offset - 1) {
3674 
3676  new_hsp->query.offset - 1 - i,
3677  new_hsp->subject.offset - 1- i,
3678  i,
3679  context, query_seq,
3680  query_info, subject,
3681  query_len);
3682 
3683  ASSERT(w_hsp->query.offset >= 0 &&
3684  w_hsp->query.end <= query_len);
3685  status = Blast_HSPListSaveHSP(hsp_list, w_hsp);
3686  if (status) {
3687  return -1;
3688  }
3689  }
3690  }
3691 
3692 
3693  for (; !found && q >= 0 && round < 2;q--, round++) {
3694 
3695  Int4 word;
3696  Int4 from;
3697  Int4 s_pos;
3698  SubjectIndexIterator* it = NULL;
3699  Int4 i;
3700 
3701  /* skip over ambiguous bases */
3702  while (q >= 0) {
3703  for (i = 0;i < SUBJECT_INDEX_WORD_LENGTH;i++) {
3704  if ((query_seq[q + i] & 0xfc) != 0) {
3705  q = q + i - SUBJECT_INDEX_WORD_LENGTH;
3706  break;
3707  }
3708  }
3709 
3710  if (i == SUBJECT_INDEX_WORD_LENGTH) {
3711  break;
3712  }
3713  }
3714 
3715  if (q < 0) {
3716  break;
3717  }
3718 
3719 
3720  /* this is query word */
3721  word = (query_seq[q] << 6) | (query_seq[q + 1] << 4) |
3722  (query_seq[q + 2] << 2) | query_seq[q + 3];
3723  for (i = 4; i < SUBJECT_INDEX_WORD_LENGTH; i++) {
3724  word = (word << 2) | query_seq[q + i];
3725  }
3726 
3727 
3728  /* search subject for matching words from this position */
3729  from = new_hsp->subject.offset - SUBJECT_INDEX_WORD_LENGTH;
3730 
3731  it = SubjectIndexIteratorNew(s_index, word, from,
3732  MAX(from - kMaxIntronLength, q + 1));
3733 
3734  /* for each word match position in the subject */
3735  for (s_pos = SubjectIndexIteratorPrev(it); s_pos >= 0;
3736  s_pos = SubjectIndexIteratorPrev(it)) {
3737 
3738  /* try extending left */
3739  Int4 k = q;
3740  Int4 s = s_pos;
3741  while (k >= 0 &&
3742  (query_seq[k] ==
3743  UNPACK_BASE(subject->sequence, s) ||
3744  (query_seq[k] & 0xfc) != 0)) {
3745  k--;
3746  s--;
3747  }
3748  /* if query matches all the way, then extend right as
3749  far as there are matches */
3750  if (k == -1) {
3751  Int4 qt = q;
3752  Int4 st = s_pos;
3753  BlastHSP* w_hsp = NULL;
3754 
3755  k++;
3756  s++;
3757  while (qt < query_len && st < subject->length &&
3758  (query_seq[qt] ==
3759  UNPACK_BASE(subject->sequence, st) ||
3760  /* skip over ambiguous bases */
3761  (query_seq[qt] & 0xfc) != 0)) {
3762  qt++;
3763  st++;
3764  }
3765  ASSERT(k - qt == s - st);
3766 
3767 
3768  if (qt - k < 5 ||
3769  s >= new_hsp->subject.offset ||
3770  new_hsp->query.offset > qt + 1) {
3771  continue;
3772  }
3773 
3774 
3775  /* trim ambiguous bases at the ends of the word
3776  match */
3777  while (k < qt && (query_seq[k] & 0xfc) != 0) {
3778  k++;
3779  s++;
3780  }
3781 
3782  while (qt > k && (query_seq[qt] & 0xfc) != 0) {
3783  qt--;
3784  st--;
3785  }
3786 
3787  if (k >= qt) {
3788  continue;
3789  }
3790 
3791  /* create HSP */
3792  w_hsp = s_CreateHSPForWordHit(k, s, qt - k,
3793  context, query_seq,
3794  query_info, subject,
3795  query_len);
3796 
3797  ASSERT(w_hsp->query.offset >= 0 &&
3798  w_hsp->query.end <= query_len);
3799 
3800  ASSERT(w_hsp);
3801  if (!w_hsp) {
3802  return -1;
3803  }
3804 
3805  status = Blast_HSPListSaveHSP(hsp_list, w_hsp);
3806  if (status) {
3807  return -1;
3808  }
3809  }
3810  }
3811  it = SubjectIndexIteratorFree(it);
3812  }
3813  }
3814 
3815  }
3816  }
3817 
3818  return hits_extended;
3819 }
3820 
3821 
3823 {
3824  if (!sindex) {
3825  return NULL;
3826  }
3827 
3828  if (sindex->lookups) {
3829  Int4 i;
3830  for (i = 0;i < sindex->num_lookups;i++) {
3831  if (sindex->lookups[i]) {
3833  }
3834  }
3835  free(sindex->lookups);
3836  }
3837 
3838  free(sindex);
3839  return NULL;
3840 }
3841 
3842 static void
3844  LookupTableOptions* opt, QuerySetUpOptions* query_opt,
3845  SubjectIndex* sindex)
3846 {
3847  if (sequence) {
3848  if (sequence->sequence) {
3849  free(sequence->sequence);
3850  }
3851  free(sequence);
3852  }
3853 
3854  while (seqloc) {
3855  BlastSeqLoc* s = seqloc->next;
3856  if (seqloc->ssr) {
3857  free(seqloc->ssr);
3858  }
3859  free(seqloc);
3860  seqloc = s;
3861  }
3862 
3863  if (opt) {
3864  free(opt);
3865  }
3866 
3867  if (query_opt) {
3868  free(query_opt);
3869  }
3870 
3871  SubjectIndexFree(sindex);
3872 }
3873 
3875  Int4 word_size)
3876 {
3877  Int4 i;
3878  BLAST_SequenceBlk* sequence = NULL;
3879  BlastSeqLoc* seqloc = NULL;
3880  SSeqRange* ssr = NULL;
3881  LookupTableOptions* opt = NULL;
3882  QuerySetUpOptions* query_opt = NULL;
3883  SubjectIndex* retval = NULL;
3884  Int4 num_lookups = subject->length / width + 1;
3885 
3886  sequence = calloc(1, sizeof(BLAST_SequenceBlk));
3887  if (!sequence) {
3888  return NULL;
3889  }
3890 
3891  /* convert subject sequence to blastna */
3892  sequence->sequence = calloc(subject->length, sizeof(Uint1));
3893  if (!sequence->sequence) {
3894  free(sequence);
3895  return NULL;
3896  }
3897 
3898  for (i = 0;i < subject->length / 4;i++) {
3899  Int4 k;
3900  for (k = 0;k < 4;k++) {
3901  sequence->sequence[4 * i + k] =
3902  (subject->sequence[i] >> (2 * (3 - k))) & 3;
3903  }
3904  }
3905 
3906  retval = calloc(1, sizeof(SubjectIndex));
3907  if (!retval) {
3908  s_SubjectIndexNewCleanup(sequence, seqloc, opt, query_opt, retval);
3909  return NULL;
3910  }
3911 
3912  retval->lookups = calloc(num_lookups, sizeof(BlastNaLookupTable*));
3913  if (!retval->lookups) {
3914  s_SubjectIndexNewCleanup(sequence, seqloc, opt, query_opt, retval);
3915  return NULL;
3916  }
3917 
3918  ssr = malloc(sizeof(SSeqRange));
3919  if (!ssr) {
3920  s_SubjectIndexNewCleanup(sequence, seqloc, opt, query_opt, retval);
3921  return NULL;
3922  }
3923 
3924  seqloc = calloc(1, sizeof(BlastSeqLoc));
3925  if (!seqloc) {
3926  if (ssr) {
3927  free(ssr);
3928  }
3929  s_SubjectIndexNewCleanup(sequence, seqloc, opt, query_opt, retval);
3930  return NULL;
3931  }
3932 
3933  opt = calloc(1, sizeof(LookupTableOptions));
3934  if (!opt) {
3935  s_SubjectIndexNewCleanup(sequence, seqloc, opt, query_opt, retval);
3936  return NULL;
3937  }
3938  opt->word_size = 4;
3939 
3940  query_opt = calloc(1, sizeof(QuerySetUpOptions));
3941  if (!query_opt) {
3942  s_SubjectIndexNewCleanup(sequence, seqloc, opt, query_opt, retval);
3943  return NULL;
3944  }
3945 
3946 
3947  for (i = 0;i < num_lookups;i++) {
3948 
3949  ssr->left = width * i;
3950  ssr->right = MIN(ssr->left + width, subject->length - 1);
3951 
3952  seqloc->ssr = ssr;
3953 
3954  BlastNaLookupTableNew(sequence, seqloc, &(retval->lookups[i]), opt,
3955  query_opt, word_size);
3956 
3957  if (!retval->lookups[i]) {
3958  s_SubjectIndexNewCleanup(sequence, seqloc, opt, query_opt, retval);
3959  }
3960 
3961  ASSERT(i < num_lookups);
3962  }
3963  retval->num_lookups = i;
3964  retval->width = width;
3965 
3966  s_SubjectIndexNewCleanup(sequence, seqloc, opt, query_opt, NULL);
3967 
3968  return retval;
3969 }
3970 
3971 
3973 {
3974  if (it) {
3975  free(it);
3976  }
3977 
3978  return NULL;
3979 }
3980 
3981 /* Create an iterator for positions of a given word in subject sequence between
3982  positions from and to */
3984  Int4 from, Int4 to)
3985 {
3986  SubjectIndexIterator* retval = NULL;
3987 
3988  if (!s_index || !s_index->lookups[0]) {
3989  return NULL;
3990  }
3991 
3992  retval = calloc(1, sizeof(SubjectIndexIterator));
3993  if (!retval) {
3994  return NULL;
3995  }
3996 
3997  retval->subject_index = s_index;
3998  retval->to = to;
3999  retval->lookup_index = from / s_index->width;
4000  ASSERT(retval->lookup_index < s_index->num_lookups);
4001  if (retval->lookup_index >= s_index->num_lookups) {
4002  SubjectIndexIteratorFree(retval);
4003  return NULL;
4004  }
4005 
4006  /* find the first word with position at least from */
4007  while (retval->lookup_index < retval->subject_index->num_lookups) {
4008  BlastNaLookupTable* lookup = s_index->lookups[retval->lookup_index];
4009  if (!lookup) {
4010  SubjectIndexIteratorFree(retval);
4011  return NULL;
4012  }
4013 
4014  word = word & lookup->mask;
4015  retval->num_words = lookup->thick_backbone[word].num_used;
4016  if (retval->num_words <= NA_HITS_PER_CELL) {
4017  retval->lookup_pos = lookup->thick_backbone[word].payload.entries;
4018  }
4019  else {
4020  retval->lookup_pos = lookup->overflow +
4021  lookup->thick_backbone[word].payload.overflow_cursor;
4022  }
4023 
4024  retval->word = word;
4025  retval->word_index = 0;
4026 
4027  while (retval->word_index < retval->num_words &&
4028  retval->lookup_pos[retval->word_index] < from) {
4029 
4030  retval->word_index++;
4031  }
4032 
4033  if (retval->word_index >= retval->num_words) {
4034  retval->lookup_index++;
4035  }
4036  else {
4037  break;
4038  }
4039  }
4040 
4041  return retval;
4042 }
4043 
4044 /* Return current position of a word and move iterator to the next position */
4046 {
4047  Int4 pos;
4048 
4049  if (!it) {
4050  return -1;
4051  }
4052 
4053  /* if all words in the current lookup table were processed, move to the
4054  next lookup table */
4055  if (it->word_index >= it->num_words) {
4057  it->lookup_index++;
4058 
4059  /* if no more lookup table return no positon found */
4060  if (it->lookup_index >= it->subject_index->num_lookups) {
4061  return -1;
4062  }
4064  ASSERT(lookup);
4065  it->num_words = lookup->thick_backbone[it->word].num_used;
4066  if (it->num_words <= NA_HITS_PER_CELL) {
4067  it->lookup_pos = lookup->thick_backbone[it->word].payload.entries;
4068  }
4069  else {
4070  it->lookup_pos = lookup->overflow +
4071  lookup->thick_backbone[it->word].payload.overflow_cursor;
4072  }
4073  ASSERT(it->lookup_pos);
4074  it->word_index = 0;
4075  }
4076 
4077  if (!it->lookup_pos) {
4078  return -1;
4079  }
4080 
4081  pos = it->lookup_pos[it->word_index];
4082  if (pos > it->to) {
4083  return -1;
4084  }
4085 
4086  /* move iterator to the next position */
4087  it->word_index++;
4088 
4089  return pos;
4090 }
4091 
4092 /* Return current position of a word and move iterator to the previous
4093  position */
4095 {
4096  Int4 pos;
4097 
4098  if (!it) {
4099  return -1;
4100  }
4101 
4102  if (it->word_index < 0) {
4104  it->lookup_index--;
4105  if (it->lookup_index < 0) {
4106  return -1;
4107  }
4109  ASSERT(lookup);
4110  it->num_words = lookup->thick_backbone[it->word].num_used;
4111  if (it->num_words <= NA_HITS_PER_CELL) {
4112  it->lookup_pos = lookup->thick_backbone[it->word].payload.entries;
4113  }
4114  else {
4115  it->lookup_pos = lookup->overflow +
4116  lookup->thick_backbone[it->word].payload.overflow_cursor;
4117  }
4118  ASSERT(it->lookup_pos);
4119  it->word_index = it->num_words - 1;
4120  }
4121 
4122  if (!it->lookup_pos) {
4123  return -1;
4124  }
4125 
4126  pos = it->lookup_pos[it->word_index];
4127  if (pos < it->to) {
4128  return -1;
4129  }
4130 
4131  it->word_index--;
4132 
4133  return pos;
4134 }
4135 
4136 
4137 #define MAX_NUM_MATCHES 10
4138 
4139 static Int4 DoAnchoredScan(Uint1* query_seq, Int4 query_len,
4140  Int4 query_from, Int4 context,
4142  Int4 subject_from, Int4 subject_to,
4143  BlastQueryInfo* query_info,
4144  BlastGapAlignStruct* gap_align,
4145  const BlastScoringParameters* score_params,
4146  const BlastHitSavingParameters* hit_params,
4147  BlastHSPList* hsp_list)
4148 
4149 {
4150  Int4 q = query_from;
4151  BlastHSP* hsp = NULL;
4152  Int4 num = 0;
4153  const int kMaxNumMatches = MAX_NUM_MATCHES;
4154  Int4 num_extensions = 0;
4155  Int4 word_size = 12;
4156  Int4 big_word_size = 0;
4157  Int4 scan_step;
4158 
4159  Uint4 word[MAX_NUM_MATCHES];
4160  Int4 query_pos[MAX_NUM_MATCHES];
4161  Uint4 w;
4162  Int4 i;
4163 
4164  Int4 scan_from = subject_from;
4165  Int4 scan_to = MIN(subject_to, subject->length - 1);
4166 
4167  Uint4 mask = (1U << (2 * word_size)) - 1;
4168 
4169  Int4 best_score = 0;
4170  Int4 num_matches = 0;
4171  Uint1* s = NULL;
4172  Int4 last_idx = 0;
4173  Int4 num_bytes = 0;
4174  Int4 num_words = 0;
4175 
4176  Boolean is_right = subject_from < subject_to;
4177  if (is_right) {
4178  big_word_size = MIN(MAX(query_len - query_from - 5, word_size), 24);
4179  scan_step = big_word_size - word_size + 1;
4180 
4181  }
4182  else {
4183  big_word_size = MIN(MAX(query_from - 5, word_size), 24);
4184  scan_step = -(big_word_size - word_size + 1);
4185  }
4186 
4187  if ((is_right && (query_len - query_from + 1 < big_word_size ||
4188  scan_to - subject_from < big_word_size)) ||
4189  (!is_right && (query_from < big_word_size ||
4190  subject_from - scan_to < big_word_size))) {
4191 
4192  return 0;
4193  }
4194 
4195 
4196 
4197  if (is_right) {
4198  for (; q + big_word_size < query_len && num_words < MAX_NUM_MATCHES; q++) {
4199 
4200  /* skip over ambiguous bases */
4201  while (q + big_word_size <= query_len) {
4202  for (i = 0;i < big_word_size;i++) {
4203  if ((query_seq[q + i] & 0xfc) != 0) {
4204  q = q + i + 1;
4205  break;
4206  }
4207  }
4208 
4209  /* success */
4210  if (i == big_word_size) {
4211  break;
4212  }
4213 
4214  q++;
4215  }
4216 
4217  /* not enough query left */
4218  if (q + big_word_size - 1 >= query_len) {
4219  break;
4220  }
4221 
4222  /* this is query word */
4223  word[num_words] = (query_seq[q] << 6) | (query_seq[q + 1] << 4) |
4224  (query_seq[q + 2] << 2) | query_seq[q + 3];
4225  for (i = 4; i < word_size; i++) {
4226  word[num_words] = (word[num_words] << 2) | query_seq[q + i];
4227  }
4228 
4229  /* do not search for PolyA words */
4230  if (word[num_words] == 0 || word[num_words] == 0xffffff) {
4231  continue;
4232  }
4233 
4234  query_pos[num_words] = q;
4235  num_words++;
4236  }
4237  }
4238  else {
4239  q -= big_word_size;
4240  for (; q >= 0 && num_words < MAX_NUM_MATCHES; q--) {
4241 
4242  /* skip over ambiguous bases */
4243  while (q >= 0) {
4244  for (i = 0;i < big_word_size;i++) {
4245  if ((query_seq[q + i] & 0xfc) != 0) {
4246  q = q - big_word_size + i;
4247  break;
4248  }
4249  }
4250 
4251  /* success */
4252  if (i == big_word_size) {
4253  break;
4254  }
4255 
4256  q--;
4257  }
4258 
4259  /* not enough query left */
4260  if (q < 0) {
4261  break;
4262  }
4263 
4264  /* this is query word */
4265  word[num_words] = (query_seq[q] << 6) | (query_seq[q + 1] << 4) |
4266  (query_seq[q + 2] << 2) | query_seq[q + 3];
4267  for (i = 4; i < word_size; i++) {
4268  word[num_words] = (word[num_words] << 2) | query_seq[q + i];
4269  }
4270 
4271  /* do not search for PolyA words */
4272  if (word[num_words] == 0 || word[num_words] == 0xffffff) {
4273  continue;
4274  }
4275 
4276  query_pos[num_words] = q;
4277  num_words++;
4278  }
4279 
4280  }
4281 
4282  if (num_words == 0) {
4283  return 0;
4284  }
4285 
4286  for (i = scan_from; (scan_from < scan_to && i < scan_to) ||
4287  (scan_from > scan_to && i > scan_to); i += scan_step) {
4288 
4289  Int4 local_ungapped_ext;
4290  Int4 shift;
4291  /* subject word */
4292  Uint4 index;
4293 
4294  Int4 q_offset;
4295  Int4 s_offset;
4296  Int4 num_identical;
4297  Int4 k;
4298 
4299  if (num_matches > kMaxNumMatches) {
4300  break;
4301  }
4302 
4303  s = subject->sequence + i / COMPRESSION_RATIO;
4304 
4305  w = (Int4)s[0] << 16 | s[1] << 8 | s[2];
4306  last_idx = 3;
4307  num_bytes = word_size / COMPRESSION_RATIO;
4308  ASSERT(num_bytes < 9);
4309  for (; last_idx < num_bytes; last_idx++) {
4310  w = w << 8 | s[last_idx];
4311  }
4312 
4313  if (i % COMPRESSION_RATIO != 0) {
4314  w = (w << 8) | s[last_idx];
4315  shift = 2 * (COMPRESSION_RATIO - (i % COMPRESSION_RATIO));
4316  index = (w >> shift) & mask;
4317  }
4318  else {
4319  index = w & mask;
4320  }
4321 
4322 
4323  for (k = 0;k < num_words; k++) {
4324  if (index == word[k]) {
4325  break;
4326  }
4327  }
4328 
4329  if (k >= num_words) {
4330  continue;
4331  }
4332 
4333  q_offset = query_pos[k];
4334  s_offset = i;
4335 
4336  for (k = word_size;k < big_word_size;k++) {
4337  if (query_seq[q_offset + k] != UNPACK_BASE(subject->sequence, s_offset + k)) {
4338  break;
4339  }
4340  }
4341  if (k < big_word_size) {
4342  continue;
4343  }
4344 
4345 
4346  num_matches++;
4347 
4348  num_extensions++;
4349 
4350  num_identical = 0;
4352  subject->sequence,
4353  query_len,
4354  subject->length,
4355  q_offset,
4356  s_offset,
4357  gap_align,
4358  score_params,
4359  &num_identical,
4360  &local_ungapped_ext);
4361 
4362 
4363  if (gap_align->score <= best_score) {
4364  continue;
4365  }
4366 
4367  best_score = gap_align->score;
4368 
4369  if (hsp) {
4370  hsp = Blast_HSPFree(hsp);
4371  }
4372 
4373  hsp = s_CreateHSP(query_seq, query_len, context,
4374  query_info, gap_align, subject,
4375  score_params, hit_params);
4376 
4377 
4378  if (hsp->score >= query_len - query_from) {
4379  break;
4380  }
4381 
4382 
4383  }
4384 
4385  if (hsp) {
4386  Blast_HSPListSaveHSP(hsp_list, hsp);
4387  num++;
4388  }
4389 
4390  return num;
4391 }
4392 
4393 
4396  Int4 word_size,
4397  BlastQueryInfo* query_info,
4398  BlastGapAlignStruct* gap_align,
4399  const BlastScoringParameters* score_params,
4400  const BlastHitSavingParameters* hit_params,
4401  BlastHSPStream* hsp_stream)
4402 {
4403  HSPChain* chains = NULL;
4404  HSPChain* ch = NULL;
4405  BlastHSPList* hsp_list = NULL;
4406 
4407  if (!query || !subject || !query_info || !gap_align || !score_params ||
4408  !hit_params || !hsp_stream) {
4409 
4410  return -1;
4411  }
4412 
4413  hsp_list = Blast_HSPListNew(MAX(query_info->num_queries, 100));
4414  if (!hsp_list) {
4415  return BLASTERR_MEMORY;
4416  }
4417  hsp_list->oid = subject->oid;
4418 
4419  /* Collect HSPs for HSP chains with that cover queries partially */
4420  MT_LOCK_Do(hsp_stream->x_lock, eMT_Lock);
4421  chains = FindPartialyCoveredQueries(hsp_stream->writer->data,
4422  hsp_list->oid, word_size);
4423  MT_LOCK_Do(hsp_stream->x_lock, eMT_Unlock);
4424 
4425 
4426  /* Search uncovered parts of the queries */
4427  for (ch = chains; ch; ch = ch->next) {
4428  HSPContainer* h = ch->hsps;
4429  Int4 context = h->hsp->context;
4430  Uint1* query_seq =
4431  query->sequence + query_info->contexts[context].query_offset;
4432  Int4 query_len = query_info->contexts[context].query_length;
4433  Int4 num = 0;
4434 
4435 
4436  if (h->hsp->query.offset >= 12) {
4437 
4438  num = DoAnchoredScan(query_seq, query_len,
4439  h->hsp->query.offset - 1,
4440  context, subject,
4441  h->hsp->subject.offset - 1,
4442  h->hsp->subject.offset - 1 -
4443  hit_params->options->longest_intron,
4444  query_info, gap_align, score_params,
4445  hit_params, hsp_list);
4446  }
4447 
4448  while (h->next) {
4449  h = h->next;
4450  }
4451 
4452 
4453  if (query_len - h->hsp->query.end > 12) {
4454 
4455  num += DoAnchoredScan(query_seq, query_len, h->hsp->query.end,
4456  context, subject,
4457  h->hsp->subject.end,
4458  h->hsp->subject.end +
4459  hit_params->options->longest_intron,
4460  query_info, gap_align, score_params,
4461  hit_params, hsp_list);
4462  }
4463 
4464  if (num) {
4465  for (h = ch->hsps; h; h = h->next) {
4466  Blast_HSPListSaveHSP(hsp_list, h->hsp);
4467  h->hsp = NULL;
4468  }
4469  }
4470  }
4471 
4472  BlastHSPStreamWrite(hsp_stream, &hsp_list);
4473  HSPChainFree(chains);
4474  Blast_HSPListFree(hsp_list);
4475 
4476  return 0;
4477 }
4478 
4479 #define NUM_DIMERS (1 << 4)
4480 
4481 /* Compute enrtopy of 2-mers in a BLASTNA sequence */
4482 static Int4 s_FindDimerEntropy(Uint1* sequence, Int4 length)
4483 {
4484  Int4 counts[NUM_DIMERS];
4485  Int4 num = 0;
4486  Int4 i;
4487  double sum = 0.0;
4488 
4489  memset(counts, 0, NUM_DIMERS * sizeof(Int4));
4490  // count dimers
4491  for (i=0;i < length - 1;i++) {
4492  Uint1 base_1 = sequence[i];
4493  Uint1 base_2 = sequence[i + 1];
4494 
4495  if ((base_1 & 0xfc) == 0 && (base_2 & 0xfc) == 0) {
4496  Int4 dimer = (base_1 << 2) | base_2;
4497  counts[dimer]++;
4498  num++;
4499  }
4500  }
4501 
4502  // compute amount of information in the sequence
4503  for (i=0;i < NUM_DIMERS;i++) {
4504  if (counts[i]) {
4505  sum += (double)counts[i] * log((double)counts[i] / num);
4506  }
4507  }
4508 
4509  return -sum * (1.0 /(log(16.0))) + 0.5;
4510 }
4511 
4512 
4513 static Int2 s_MaskSequence(Int4 offset, Int4 length, BlastSeqLoc** seq_locs)
4514 {
4515  BlastSeqLoc* loc = calloc(1, sizeof(BlastSeqLoc));
4516  if (!loc) {
4517  return BLASTERR_MEMORY;
4518  }
4519  loc->ssr = calloc(1, sizeof(SSeqRange));
4520  if (!loc->ssr) {
4521  free(loc);
4522  return BLASTERR_MEMORY;
4523  }
4524  loc->ssr->left = offset;
4525  loc->ssr->right = offset + length - 1;
4526  *seq_locs = loc;
4527 
4528  return 0;
4529 }
4530 
4532  const SReadQualityOptions* options,
4533  BlastSeqLoc** seq_loc)
4534 {
4535  const double kMaxFractionOfAmbiguousBases = options->frac_ambig;
4536  Int4 i;
4537  Int4 num = 0;
4538  Int4 entropy;
4539  Int2 status = 0;
4540 
4541  for (i = 0;i < length;i++) {
4542  if (sequence[i] & 0xfc) {
4543  num++;
4544  }
4545  }
4546 
4547  if ((double)num / length > kMaxFractionOfAmbiguousBases) {
4548  status = s_MaskSequence(offset, length, seq_loc);
4549  return status;
4550  }
4551 
4552  entropy = s_FindDimerEntropy(sequence, length);
4553  if (entropy <= options->entropy) {
4554  status = s_MaskSequence(offset, length, seq_loc);
4555  }
4556 
4557  return status;
4558 }
4559 
4560 
4561 /* Get alignment cutoff score for a given query length. Note that the function
4562  assumes that score for match is 1 */
4563 Int4 GetCutoffScore(Int4 query_length)
4564 {
4565  if (query_length <= 20) {
4566  return query_length;
4567  }
4568  else if (query_length <= 34) {
4569  return 20;
4570  }
4571  else if (query_length < 200) {
4572  return (Int4)(0.6 * query_length);
4573  }
4574 
4575  return 120;
4576 }
T round(const T &v)
#define COMPRESSION_RATIO
Compression ratio of nucleotide bases (4 bases in 1 byte)
Definition: blast_def.h:83
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
Definition: blast_def.h:112
Private interface for blast_gapalign.c.
Structures and API used for saving BLAST hits.
Int2 Blast_HSPInit(Int4 query_start, Int4 query_end, Int4 subject_start, Int4 subject_end, Int4 query_gapped_start, Int4 subject_gapped_start, Int4 query_context, Int2 query_frame, Int2 subject_frame, Int4 score, GapEditScript **gap_edit, BlastHSP **ret_hsp)
Allocates BlastHSP and inits with information from input.
Definition: blast_hits.c:151
BlastHSPList * Blast_HSPListNew(Int4 hsp_max)
Creates HSP list structure with a default size HSP array.
Definition: blast_hits.c:1558
BlastHSPMappingInfo * BlastHSPMappingInfoNew(void)
Allocate memory for an HSP's additional data structure.
Definition: blast_hits.c:207
BlastHSP * Blast_HSPFree(BlastHSP *hsp)
Deallocate memory for an HSP structure.
Definition: blast_hits.c:130
Int2 Blast_HSPListSaveHSP(BlastHSPList *hsp_list, BlastHSP *hsp)
Saves HSP information into a BlastHSPList structure.
Definition: blast_hits.c:1754
BlastHSPList * Blast_HSPListFree(BlastHSPList *hsp_list)
Deallocate memory for an HSP list structure as well as all it's components.
Definition: blast_hits.c:1542
int BlastHSPStreamWrite(BlastHSPStream *hsp_stream, BlastHSPList **hsp_list)
Invokes the user-specified write function for this BlastHSPStream implementation.
#define BLASTERR_MEMORY
System error: out of memory condition.
Routines for creating nucleotide BLAST lookup tables.
Int4 BlastNaLookupTableNew(BLAST_SequenceBlk *query, BlastSeqLoc *locations, BlastNaLookupTable **lut, const LookupTableOptions *opt, const QuerySetUpOptions *query_options, Int4 lut_width)
Create a new nucleotide lookup table.
#define NA_HITS_PER_CELL
maximum number of hits in one lookup table cell
BlastNaLookupTable * BlastNaLookupTableDestruct(BlastNaLookupTable *lookup)
Free a nucleotide lookup table.
@ eMBLookupTable
megablast lookup table (includes both contiguous and discontiguous megablast)
Int4 BSearchContextInfo(Int4 n, const BlastQueryInfo *A)
Search BlastContextInfo structures for the specified offset.
ncbi::TMaskedQueryRegions mask
static void DLIST_NAME() append(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:78
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static int lookup(const char *name, const struct lookup_int *table)
Definition: attributes.c:50
static char tmp[3200]
Definition: utf8.c:42
int offset
Definition: replacements.h:160
void GapPrelimEditBlockAdd(GapPrelimEditBlock *edit_block, EGapAlignOpType op_type, Int4 num_ops)
Add a new operation to a preliminary edit block, possibly combining it with the last operation if the...
Definition: gapinfo.c:175
EGapAlignOpType
Operation types within the edit script.
Definition: gapinfo.h:44
@ eGapAlignIns
Insertion: a gap in subject.
Definition: gapinfo.h:51
@ eGapAlignSub
Substitution.
Definition: gapinfo.h:48
@ eGapAlignDel
Deletion: a gap in query.
Definition: gapinfo.h:45
GapEditScript * GapEditScriptNew(Int4 size)
Initialize the edit script structure.
Definition: gapinfo.c:56
GapEditScript * GapEditScriptDelete(GapEditScript *esp)
Free edit script structure.
Definition: gapinfo.c:75
static bool trace
#define NULL
Definition: ncbistd.hpp:225
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define MT_LOCK_Do(lk, how)
Call "lk->handler(lk->data, how)".
Definition: ncbi_core.h:270
@ eMT_Unlock
unlock critical section
Definition: ncbi_core.h:183
@ eMT_Lock
lock critical section
Definition: ncbi_core.h:181
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
Implementation of a number of BlastHSPWriters to save the best chain of RNA-Seq hits to a genome.
HSPChain * FindPartialyCoveredQueries(void *data, Int4 oid, Int4 word_size)
Find HSP chains that do not cover full extend of queries for a given subject.
<!DOCTYPE HTML >< html > n< header > n< title > PubSeq Gateway Help Page</title > n< style > n table
static int s_GetSeqPositions(JumperPrelimEditBlock *edit_script, Int4 edit_index, Int4 *query_pos, Int4 *subject_pos)
Definition: jumper.c:245
static JumperPrelimEditBlock * JumperPrelimEditBlockFree(JumperPrelimEditBlock *block)
Definition: jumper.c:125
Int4 BlastNaExtendJumper(BlastOffsetPair *offset_pairs, Int4 num_hits, const BlastInitialWordParameters *word_params, const BlastScoringParameters *score_params, const BlastHitSavingParameters *hit_params, LookupTableWrap *lookup_wrap, BLAST_SequenceBlk *query, BLAST_SequenceBlk *subject, BlastQueryInfo *query_info, BlastGapAlignStruct *gap_align, BlastHSPList *hsp_list, Uint4 s_range, SubjectIndex *s_index)
Extend a list of word hits.
Definition: jumper.c:3253
Int4 JumperExtendLeft(const Uint1 *query, const Uint1 *subject, Int4 query_offset, Int4 subject_offset, Int4 match_score, Int4 mismatch_score, Int4 gap_open_score, Int4 gap_extend_score, int max_mismatches, int window, int *query_ext_len, int *subject_ext_len, GapPrelimEditBlock *edit_script)
Definition: jumper.c:2354
static BlastHSP * s_CreateHSPForWordHit(Int4 q_offset, Int4 s_offset, Int4 length, Int4 context, const Uint1 *query, const BlastQueryInfo *query_info, const BLAST_SequenceBlk *subject, Int4 query_len)
Definition: jumper.c:3098
int JumperGappedAlignmentCompressedWithTraceback(const Uint1 *query, const Uint1 *subject, Int4 query_length, Int4 subject_length, Int4 query_start, Int4 subject_start, BlastGapAlignStruct *gap_align, const BlastScoringParameters *score_params, Int4 *num_identical, Int4 *right_ungapped_ext_len)
Jumper gapped alignment with traceback; 1 base per byte in query, 4 bases per byte in subject.
Definition: jumper.c:2512
JumperEditsBlock * JumperEditsBlockFree(JumperEditsBlock *block)
Definition: jumper.c:2704
SequenceOverhangs * SequenceOverhangsFree(SequenceOverhangs *overhangs)
Definition: jumper.c:2976
int JumperFindSpliceSignals(BlastHSP *hsp, Int4 query_len, const Uint1 *subject, Int4 subject_len)
Find splice signals at the edges of an HSP and save them in the HSP.
Definition: jumper.c:2947
#define UNPACK_BASE(seq, pos)
Definition: jumper.c:95
JumperGapAlign * JumperGapAlignNew(Int4 size)
Definition: jumper.c:199
static int s_ShiftGapsRight(JumperPrelimEditBlock *edit_script, const Uint1 *query, const Uint1 *subject, Int4 query_offset, Int4 subject_offset, Int4 query_length, Int4 subject_length, Int4 *score, Int4 err_score, Int4 *num_identical)
Definition: jumper.c:286
SubjectIndexIterator * SubjectIndexIteratorNew(SubjectIndex *s_index, Int4 word, Int4 from, Int4 to)
Create an iterator for locations of a given word.
Definition: jumper.c:3983
Int4 SubjectIndexIteratorPrev(SubjectIndexIterator *it)
Return the previous location of a word in an indexed sequence.
Definition: jumper.c:4094
JUMP jumper_default[]
Definition: jumper.c:41
Int4 JumperExtendRight(const Uint1 *query, const Uint1 *subject, int query_length, int subject_length, Int4 match_score, Int4 mismatch_score, Int4 gap_open_score, Int4 gap_extend_score, int max_mismatches, int window, int *query_ext_len, int *subject_ext_len, GapPrelimEditBlock *edit_script, Boolean left_extension)
Definition: jumper.c:1384
static Int2 s_MaskSequence(Int4 offset, Int4 length, BlastSeqLoc **seq_locs)
Definition: jumper.c:4513
#define JOP_TO_OP(op)
Definition: jumper.c:102
static int s_ShiftGaps(BlastGapAlignStruct *gap_align, const Uint1 *query, const Uint1 *subject, Int4 query_length, Int4 subject_length, Int4 err_score, Int4 *num_identical)
Definition: jumper.c:457
static Int4 s_SaveSubjectOverhangs(BlastHSP *hsp, Uint1 *subject, Int4 query_len)
Definition: jumper.c:2995
static void s_ResetJumperPrelimEditBlocks(JumperPrelimEditBlock *left, JumperPrelimEditBlock *right)
Definition: jumper.c:229
static Int4 s_ComputeExtensionScore(JumperPrelimEditBlock *edit_script, Int4 match_score, Int4 mismatch_score, Int4 gap_open_score, Int4 gap_extend_score)
Definition: jumper.c:693
JumperEditsBlock * JumperEditsBlockCombine(JumperEditsBlock **block_ptr, JumperEditsBlock **append_ptr)
Definition: jumper.c:2860
Int4 JumperPrelimEditBlockAdd(JumperPrelimEditBlock *block, JumperOpType op)
Definition: jumper.c:139
JumperGapAlign * JumperGapAlignFree(JumperGapAlign *jgap_align)
Definition: jumper.c:183
static Int4 s_FindDimerEntropy(Uint1 *sequence, Int4 length)
Definition: jumper.c:4482
GapEditScript * GapEditScriptCombine(GapEditScript **edit_script_ptr, GapEditScript **append_ptr)
Definition: jumper.c:2895
#define MAX_NUM_MATCHES
Definition: jumper.c:4137
Int4 JumperExtendLeftCompressedWithTraceback(const Uint1 *query, const Uint1 *subject, Int4 query_offset, Int4 subject_offset, Int4 match_score, Int4 mismatch_score, Int4 gap_open_score, Int4 gap_extend_score, int max_mismatches, int window, Uint4 *table, Int4 *query_ext_len, Int4 *subject_ext_len, JumperPrelimEditBlock *edit_script, Int4 *num_identical, JUMP *jumper)
Definition: jumper.c:1917
JumperEditsBlock * JumperEditsBlockNew(Int4 num)
Definition: jumper.c:2718
Boolean JumperGoodAlign(const BlastGapAlignStruct *gap_align, const BlastHitSavingParameters *hit_params, Int4 num_identical, BlastContextInfo *context_info)
Test whether an HSP should be saved.
Definition: jumper.c:2650
SubjectIndex * SubjectIndexNew(BLAST_SequenceBlk *subject, Int4 width, Int4 word_size)
Index a sequence, used for indexing compressed nucleotide subject sequence.
Definition: jumper.c:3874
JUMP jumper_ungapped[]
Definition: jumper.c:84
Int4 JumperExtendLeftCompressedWithTracebackOptimal(const Uint1 *query, const Uint1 *subject, Int4 query_offset, Int4 subject_offset, Int4 match_score, Int4 mismatch_score, Int4 gap_open_score, Int4 gap_extend_score, int max_mismatches, int window, Int4 x_drop, Uint4 *table, Int4 *query_ext_len, Int4 *subject_ext_len, JumperPrelimEditBlock *edit_script, Int4 *best_num_identical)
Definition: jumper.c:2110
Int2 DoAnchoredSearch(BLAST_SequenceBlk *query, BLAST_SequenceBlk *subject, Int4 word_size, BlastQueryInfo *query_info, BlastGapAlignStruct *gap_align, const BlastScoringParameters *score_params, const BlastHitSavingParameters *hit_params, BlastHSPStream *hsp_stream)
Do a search against a single subject with smaller word size and with no database word frequency filte...
Definition: jumper.c:4394
static JumperPrelimEditBlock * JumperPrelimEditBlockNew(Int4 size)
Definition: jumper.c:108
Int4 JumperExtendRightCompressedWithTraceback(const Uint1 *query, const Uint1 *subject, int query_length, int subject_length, Int4 match_score, Int4 mismatch_score, Int4 gap_open_score, Int4 gap_extend_score, int max_mismatches, int window, Uint4 *table, Int4 *query_ext_len, Int4 *subject_ext_len, JumperPrelimEditBlock *edit_script, Int4 *num_identical, Boolean left_extension, Int4 *ungapped_ext_len, JUMP *jumper)
Definition: jumper.c:915
SubjectIndexIterator * SubjectIndexIteratorFree(SubjectIndexIterator *it)
Free memory reserved for subject index word iterator.
Definition: jumper.c:3972
static Int4 DoAnchoredScan(Uint1 *query_seq, Int4 query_len, Int4 query_from, Int4 context, BLAST_SequenceBlk *subject, Int4 subject_from, Int4 subject_to, BlastQueryInfo *query_info, BlastGapAlignStruct *gap_align, const BlastScoringParameters *score_params, const BlastHitSavingParameters *hit_params, BlastHSPList *hsp_list)
Definition: jumper.c:4139
Int4 JumperExtendRightCompressed(const Uint1 *query, const Uint1 *subject, int query_length, int subject_length, Int4 match_score, Int4 mismatch_score, Int4 gap_open_score, Int4 gap_extend_score, int max_mismatches, int window, Uint4 *table, Int4 *query_ext_len, Int4 *subject_ext_len, Int4 *num_identical, Int4 *ungapped_ext_len)
Definition: jumper.c:734
Int4 JumperExtendRightCompressedWithTracebackOptimal(const Uint1 *query, const Uint1 *subject, int query_length, int subject_length, Int4 match_score, Int4 mismatch_score, Int4 gap_open_score, Int4 gap_extend_score, int max_mismatches, int window, Int4 x_drop, Uint4 *table, Int4 *query_ext_len, Int4 *subject_ext_len, JumperPrelimEditBlock *edit_script, Int4 *best_num_identical, Boolean left_extension, Int4 *ungapped_ext_len)
Definition: jumper.c:1124
SubjectIndex * SubjectIndexFree(SubjectIndex *sindex)
Free subject index structure.
Definition: jumper.c:3822
#define JOP_TO_NUM(op)
Definition: jumper.c:105
static int s_CompareOffsetPairsByDiagQuery(const void *a, const void *b)
Definition: jumper.c:3064
JumperEditsBlock * JumperFindEdits(const Uint1 *query, const Uint1 *subject, BlastGapAlignStruct *gap_align)
Definition: jumper.c:2755
Int2 FilterQueriesForMapping(Uint1 *sequence, Int4 length, Int4 offset, const SReadQualityOptions *options, BlastSeqLoc **seq_loc)
Definition: jumper.c:4531
Int4 JumperExtendLeftCompressed(const Uint1 *query, const Uint1 *subject, Int4 query_offset, Int4 subject_offset, Int4 match_score, Int4 mismatch_score, Int4 gap_open_score, Int4 gap_extend_score, int max_mismatches, int window, Uint4 *table, Int4 *query_ext_len, Int4 *subject_ext_len, Int4 *num_identical)
Definition: jumper.c:1749
#define JUMPER_EDIT_BLOCK_ADD(block, op)
Definition: jumper.c:98
JumperEditsBlock * JumperEditsBlockDup(const JumperEditsBlock *block)
Definition: jumper.c:2737
GapEditScript * JumperPrelimEditBlockToGapEditScript(JumperPrelimEditBlock *rev_prelim_block, JumperPrelimEditBlock *fwd_prelim_block)
Convert Jumper's preliminary edit script to GapEditScript.
Definition: jumper.c:610
static BlastHSP * s_CreateHSP(Uint1 *query_seq, Int4 query_len, Int4 context, BlastQueryInfo *query_info, BlastGapAlignStruct *gap_align, BLAST_SequenceBlk *subject, const BlastScoringParameters *score_params, const BlastHitSavingParameters *hit_params)
Definition: jumper.c:3188
Int4 SubjectIndexIteratorNext(SubjectIndexIterator *it)
Return the next location of a word in an indexed sequence.
Definition: jumper.c:4045
static void s_CreateTable(Uint4 *table)
Definition: jumper.c:164
JUMP jumper_edge[]
Definition: jumper.c:75
static void s_TrimExtension(JumperPrelimEditBlock *jops, int margin, const Uint1 **cp, Int4 *cq, Int4 *num_identical, Boolean is_right_ext)
Definition: jumper.c:519
Int4 GetCutoffScore(Int4 query_length)
Get alignment cutoff score for a given query length.
Definition: jumper.c:4563
static void s_SubjectIndexNewCleanup(BLAST_SequenceBlk *sequence, BlastSeqLoc *seqloc, LookupTableOptions *opt, QuerySetUpOptions *query_opt, SubjectIndex *sindex)
Definition: jumper.c:3843
#define NUM_DIMERS
Definition: jumper.c:4479
Int4 JumperExtendRightWithTraceback(const Uint1 *query, const Uint1 *subject, int query_length, int subject_length, Int4 match_score, Int4 mismatch_score, Int4 gap_open_score, Int4 gap_extend_score, int max_mismatches, int window, Int4 *query_ext_len, Int4 *subject_ext_len, JumperPrelimEditBlock *edit_script, Int4 *num_identical, Boolean left_extension, Int4 *ungapped_ext_len, JUMP *jumper)
Right extension with traceback.
Definition: jumper.c:1552
#define JUMPER_INSERTION
Definition: jumper.h:55
#define MAPPER_EXON
Definition: jumper.h:234
#define JUMPER_MISMATCH
Definition: jumper.h:54
#define JUMPER_DELETION
Definition: jumper.h:56
Int2 JumperOpType
Jumper edit script operation.
Definition: jumper.h:59
#define SUBJECT_INDEX_WORD_LENGTH
Definition: jumper.h:148
int i
yy_size_t n
int len
Definition: fix_pub.hpp:45
const struct ncbi::grid::netcache::search::fields::SIZE size
unsigned int a
Definition: ncbi_localip.c:102
#define MIN(a, b)
returns smaller of a and b.
Definition: ncbi_std.h:112
Uint1 Boolean
bool replacment for C
Definition: ncbi_std.h:94
#define TRUE
bool replacment for C indicating true.
Definition: ncbi_std.h:97
#define FALSE
bool replacment for C indicating false.
Definition: ncbi_std.h:101
#define ASSERT
macro for assert.
Definition: ncbi_std.h:107
#define MAX(a, b)
returns larger of a and b.
Definition: ncbi_std.h:117
HSPChain * HSPChainFree(HSPChain *chain_list)
Deallocate a chain or list of chains.
Definition: spliced_hits.c:109
Structure to hold a sequence.
Definition: blast_def.h:242
Uint1 * sequence
Sequence used for search (could be translation).
Definition: blast_def.h:243
The context related information.
Int4 query_length
Length of this query, strand or frame.
Int4 query_offset
Offset of this query, strand or frame in the concatenated super-query.
Int1 frame
Frame number (-1, -2, -3, 0, 1, 2, or 3)
Structure supporting the gapped alignment.
Int4 gap_x_dropoff
X-dropoff parameter to use.
Int4 query_stop
query end offseet of current alignment
Int4 subject_start
subject start offset current alignment
Int4 query_start
query start offset of current alignment
Int4 subject_stop
subject end offset of current alignment
Int4 max_mismatches
Max number of mismatches for jumper.
Int4 mismatch_window
Window sie for mismatches for jumper.
JumperGapAlign * jumper
data for jumper alignment
Int4 score
Return value: alignment score.
GapEditScript * edit_script
The traceback (gap) information.
The structure to hold all HSPs for a given sequence after the gapped alignment.
Definition: blast_hits.h:153
Int4 oid
The ordinal id of the subject sequence this HSP list is for.
Definition: blast_hits.h:154
Uint1 left_edge
Two subject bases before the alignment in the four least significant bits and flags in most significa...
Definition: blast_hits.h:116
JumperEditsBlock * edits
Information about mismatches and gaps, used for mapping short reads.
Definition: blast_hits.h:114
SequenceOverhangs * subject_overhangs
Unaligned subject subsequence.
Definition: blast_hits.h:122
Default implementation of BlastHSPStream.
BlastHSPWriter * writer
writer to be applied when writing
MT_LOCK x_lock
Mutex for writing and reading results.
void * data
data structure
Structure holding all information about an HSP.
Definition: blast_hits.h:126
double evalue
This HSP's e-value.
Definition: blast_hits.h:130
Int4 num_ident
Number of identical base pairs in this HSP.
Definition: blast_hits.h:128
BlastSeg query
Query sequence info.
Definition: blast_hits.h:131
Int4 context
Context number of query.
Definition: blast_hits.h:133
BlastSeg subject
Subject sequence info.
Definition: blast_hits.h:132
Int4 score
This HSP's raw score.
Definition: blast_hits.h:127
BlastHSPMappingInfo * map_info
Definition: blast_hits.h:146
Int4 longest_intron
The longest distance between HSPs allowed for combining via sum statistics with uneven gaps.
Int4 cutoff_score
The (raw) score cut-off threshold.
Int4 max_edit_distance
Maximum number of mismatches and gaps.
Int4 cutoff_score_fun[2]
Coefficients x100 for the raw score cut-off threshold as a function of query length: x[0] + x[1] * qu...
double percent_identity
The percent identity cut-off threshold.
Parameter block that contains a pointer to BlastHitSavingOptions and the values derived from it.
BlastHitSavingOptions * options
The original (unparsed) options.
Parameter block that contains a pointer to BlastInitialWordOptions and the values derived from it.
The lookup table structure used for Mega BLAST.
Int4 lut_word_length
number of letters in a lookup table word
Int4 word_length
number of exact letter matches that will trigger an ungapped extension
The basic lookup table structure for blastn searches.
Int4 lut_word_length
Length in bases of a word indexed by the lookup table.
Int4 word_length
Length in bases of the full word match required to trigger extension.
The query related information.
BlastContextInfo * contexts
Information per context.
int num_queries
Number of query sequences.
Scoring parameters block Contains scoring-related information that is actually used for the blast sea...
Int4 gap_extend
Penalty for each gap residue (scaled version)
Int2 penalty
Penalty for a mismatch.
Int4 gap_open
Extra penalty for starting a gap (scaled version)
Int2 reward
Reward for a match.
Int4 end
End of hsp.
Definition: blast_hits.h:99
Int4 offset
Start of hsp.
Definition: blast_hits.h:98
Used to hold a set of positions, mostly used for filtering.
Definition: blast_def.h:204
SSeqRange * ssr
location data on the sequence.
Definition: blast_def.h:206
struct BlastSeqLoc * next
next in linked list
Definition: blast_def.h:205
Edit script: linked list of correspondencies between two sequences.
Definition: gapinfo.h:57
Int4 * num
Array of number of operations.
Definition: gapinfo.h:59
Int4 size
Size of above arrays.
Definition: gapinfo.h:60
EGapAlignOpType * op_type
Array of type of operation.
Definition: gapinfo.h:58
Preliminary version of GapEditBlock, used directly by the low- level dynamic programming routines.
Definition: gapinfo.h:73
A chain of HSPs: spliced alignment.
Definition: spliced_hits.h:60
HSPContainer * hsps
A list of HSPs that belong to this chain.
Definition: spliced_hits.h:64
struct HSPChain * next
Pointer to the next chain in a list.
Definition: spliced_hits.h:73
struct HSPContainer * next
Definition: spliced_hits.h:45
BlastHSP * hsp
Definition: spliced_hits.h:44
Definition: jumper.h:48
int dcq
Definition: jumper.h:48
int ok
Definition: jumper.h:48
int dcp
Definition: jumper.h:48
int lng
Definition: jumper.h:48
Single alignment error.
Definition: jumper.h:87
Alignment edit script for gapped alignment.
Definition: jumper.h:96
JumperEdit * edits
Definition: jumper.h:97
Int4 num_edits
Definition: jumper.h:98
Gapped alignment data needed for jumper.
Definition: jumper.h:72
JumperPrelimEditBlock * left_prelim_block
Definition: jumper.h:73
Uint4 * table
Table used for matching 4 bases in compressed subject to 4 bases in uncompressed query.
Definition: jumper.h:75
JumperPrelimEditBlock * right_prelim_block
Definition: jumper.h:74
Internal alignment edit script.
Definition: jumper.h:63
JumperOpType * edit_ops
Definition: jumper.h:64
Options needed to construct a lookup table Also needed: query sequence and query length.
Int4 word_size
Determines the size of the lookup table.
Wrapper structure for different types of BLAST lookup tables.
Definition: lookup_wrap.h:50
void * lut
Pointer to the actual lookup table structure.
Definition: lookup_wrap.h:52
ELookupTableType lut_type
What kind of a lookup table it is?
Definition: lookup_wrap.h:51
Options required for setting up the query sequence.
Filtering options for mapping next-generation sequences.
double frac_ambig
Fraction of ambiguous bases.
A structure containing two integers, used e.g.
Definition: blast_def.h:155
Int4 left
left endpoint of range (zero based)
Definition: blast_def.h:156
Int4 right
right endpoint of range (zero based)
Definition: blast_def.h:157
Structure to save short unaligned subsequences outside an HSP.
Definition: jumper.h:268
Uint1 * left
Left subsequence.
Definition: jumper.h:271
Uint1 * right
Rught subsequence.
Definition: jumper.h:272
Int4 right_len
Length of the right subsequence.
Definition: jumper.h:270
Int4 left_len
Length of the left subsequence.
Definition: jumper.h:269
Iterator over word locations in subject index.
Definition: jumper.h:179
SubjectIndex * subject_index
Definition: jumper.h:180
Index for a chunk of a subject sequence.
Definition: jumper.h:152
Int4 num_lookups
Number of lookup tables used.
Definition: jumper.h:158
Int4 width
Number of bases covered by each lookup table.
Definition: jumper.h:156
BlastNaLookupTable ** lookups
Array of lookup tables.
Definition: jumper.h:154
static string subject
static string query
This symbol enables the verbose option in makeblastdb and other BLAST+ search command line applicatio...
Definition: blast_def.h:141
Uint4 q_off
Query offset.
Definition: blast_def.h:143
Uint4 s_off
Subject offset.
Definition: blast_def.h:144
struct BlastOffsetPair::@6 qs_offsets
Query/subject offset pair.
static CS_CONTEXT * context
Definition: will_convert.c:21
void free(voidpf ptr)
voidp malloc(uInt size)
voidp calloc(uInt items, uInt size)
Modified on Fri May 24 14:51:17 2024 by modify_doxy.py rev. 669887