NCBI C++ ToolKit
blast_engine.c
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blast_engine.c 100829 2023-09-15 20:29:23Z camacho $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  */
27 
28 /** @file blast_engine.c
29  * Function calls to actually perform a BLAST search (high level).
30  * The hierarchy of function calls, starting from
31  * the top level in the BLAST core, is described below.
32  * <pre>
33  * Preliminary stage of the BLAST search:
34  *
35  * Blast_RunPreliminarySearch
36  * BLAST_GapAlignSetUp
37  * BLAST_PreliminarySearchEngine
38  * if (RPS BLAST) {
39  * s_RPSPreliminarySearchEngine
40  * s_BlastSearchEngineCore
41  * } else {
42  * for (all sequences in the database)
43  * s_BlastSearchEngineCore
44  * }
45  *
46  * Full BLAST search, including preliminary search and traceback:
47  *
48  * Blast_RunFullSearch
49  * BLAST_GapAlignSetUp
50  * BLAST_PreliminarySearchEngine
51  * BLAST_ComputeTraceback
52  *
53  * </pre>
54  */
55 
75 #include "blast_gapalign_priv.h"
76 #include "jumper.h"
77 
78 /** Converts nucleotide coordinates to protein */
79 #define CONV_NUCL2PROT_COORDINATES(length) (length) / CODON_LENGTH
80 
84 
85 /** Structure to be passed to s_BlastSearchEngineCore, containing pointers
86  to various preallocated structures and arrays. */
87 typedef struct BlastCoreAuxStruct {
88 
89  Blast_ExtendWord* ewp; /**< Structure for keeping track of diagonal
90  information for initial word matches */
91  BlastWordFinderType WordFinder; /**< Word finder function pointer */
92  BlastGetGappedScoreType GetGappedScore; /**< Gapped extension function
93  pointer */
94  JumperGappedType JumperGapped; /**< Word finder and gapped extension
95  for mapping short reads */
96 
97  BlastInitHitList* init_hitlist; /**< Placeholder for HSPs after
98  ungapped extension */
99  BlastOffsetPair* offset_pairs; /**< Array of offset pairs for initial seeds. */
101 
102  Uint1* translation_buffer; /**< Placeholder for translated subject
103  sequences */
104  Uint1* translation_table; /**< Translation table for forward strand */
105  Uint1* translation_table_rc; /**< Translation table for reverse
106  strand */
108 
109 /** Deallocates all memory in BlastCoreAuxStruct */
110 static BlastCoreAuxStruct*
112 {
113  BlastExtendWordFree(aux_struct->ewp);
114  BLAST_InitHitListFree(aux_struct->init_hitlist);
115  sfree(aux_struct->offset_pairs);
116  MapperWordHitsFree(aux_struct->mapper_wordhits);
117 
118  sfree(aux_struct);
119  return NULL;
120 }
121 
122 /** Structure used for subject sequence split */
123 typedef struct SubjectSplitStruct {
124 
125  Uint1* sequence; /**< backup of original sequence */
126  SSeqRange full_range; /**< full sequence range */
127 
128  SSeqRange* seq_ranges; /**< backup of original sequence range */
129  Int4 num_seq_ranges; /**< backup of original number of items in seq_ranges */
130  Int4 allocated; /**< number of seq_range allocated for subject */
131 
132  SSeqRange* hard_ranges; /**< sequence ranges for hard masking */
133  Int4 num_hard_ranges; /**< number of hard masking ranges */
134  Int4 hm_index; /**< the current hard masking range index*/
135 
136  SSeqRange* soft_ranges; /**< sequence ranges for soft masking */
137  Int4 num_soft_ranges; /**< number of soft masking ranges */
138  Int4 sm_index; /**< the current soft masking range index*/
139 
140  Int4 offset; /**< the offset of current chunk */
141  Int4 next; /**< the offset of next chunk */
142 
144 
146  SubjectSplitStruct* backup)
147 {
148  if (backup->sequence) return;
149 
150  backup->sequence = subject->sequence;
151  backup->full_range.left = 0;
152  backup->full_range.right = subject->length;
153 
154  backup->seq_ranges = subject->seq_ranges;
155  backup->num_seq_ranges = subject->num_seq_ranges;
156  backup->allocated = 0;
157 
158  backup->hard_ranges = &(backup->full_range);
159  backup->num_hard_ranges = 1;
160  backup->hm_index = 0;
161 
162  backup->soft_ranges = &(backup->full_range);
163  backup->num_soft_ranges = 1;
164  backup->sm_index = 0;
165 
166  if (subject->mask_type == eSoftSubjMasking) {
167  ASSERT (backup->seq_ranges);
168  ASSERT (backup->num_seq_ranges >= 1);
169  backup->soft_ranges = backup->seq_ranges;
170  backup->num_soft_ranges = backup->num_seq_ranges;
171  } else if (subject->mask_type == eHardSubjMasking) {
172  ASSERT (backup->seq_ranges);
173  ASSERT (backup->num_seq_ranges >= 1);
174  backup->hard_ranges = backup->seq_ranges;
175  backup->num_hard_ranges = backup->num_seq_ranges;
176  }
177 
178  backup->offset = backup->hard_ranges[0].left;
179  backup->next = backup->offset;
180  subject->chunk = -1;
181 
182 }
183 
185  SubjectSplitStruct* backup,
186  Int4 num_seq_ranges)
187 {
188  ASSERT(num_seq_ranges >= 1);
189  subject->num_seq_ranges = num_seq_ranges;
190  if (backup->allocated >= num_seq_ranges) return;
191  if (backup->allocated) {
192  sfree(subject->seq_ranges);
193  }
194 
195  backup->allocated = num_seq_ranges;
196  subject->seq_ranges = (SSeqRange *) calloc(backup->allocated,
197  sizeof(SSeqRange));
198 }
199 
201  SubjectSplitStruct* backup)
202 {
203  if (! backup->sequence) return;
204 
205  subject->sequence = backup->sequence;
206  subject->length = backup->full_range.right;
207 
208  if (backup->allocated) {
209  sfree(subject->seq_ranges);
210  }
211  subject->seq_ranges = backup->seq_ranges;
212  subject->num_seq_ranges = backup->num_seq_ranges;
213 
214  backup->sequence = NULL;
215 }
216 
217 const Int2 SUBJECT_SPLIT_DONE = 0; /**< return value indicating hitting the end */
218 const Int2 SUBJECT_SPLIT_OK = 1; /**< return value indicating OK */
219 const Int2 SUBJECT_SPLIT_NO_RANGE = 2; /**< return value indicating all masked out */
220 
222  SubjectSplitStruct *backup,
223  Boolean is_nucleotide,
224  int chunk_overlap)
225 {
226  int start, len, i, residual;
227  int dbseq_chunk_overlap;
228 
229  if (chunk_overlap > 0) {
230  dbseq_chunk_overlap = chunk_overlap;
231  }
232  else {
233  dbseq_chunk_overlap = DBSEQ_CHUNK_OVERLAP;
234  }
235 
236  ASSERT(subject);
237  ASSERT(backup);
238 
239  if (backup->next >= backup->full_range.right) return SUBJECT_SPLIT_DONE;
240 
241  residual = is_nucleotide ? backup->next % COMPRESSION_RATIO : 0;
242  backup->offset = backup->next - residual;
243  subject->sequence = backup->sequence + ((is_nucleotide) ?
244  backup->offset /COMPRESSION_RATIO : backup->offset);
245 
246  if (backup->offset + MAX_DBSEQ_LEN <
247  backup->hard_ranges[backup->hm_index].right) {
248 
249  subject->length = MAX_DBSEQ_LEN;
250  backup->next = backup->offset + MAX_DBSEQ_LEN - dbseq_chunk_overlap;
251 
252  } else {
253 
254  subject->length = backup->hard_ranges[backup->hm_index].right
255  - backup->offset;
256  backup->hm_index++;
257  backup->next = (backup->hm_index < backup->num_hard_ranges) ?
258  backup->hard_ranges[backup->hm_index].left :
259  backup->full_range.right;
260  }
261 
262  (subject->chunk)++;
263 
264  /* if no chunking is performed */
265  if (backup->offset == 0 && residual == 0 && backup->next == backup->full_range.right) {
266  subject->seq_ranges = backup->soft_ranges;
267  subject->num_seq_ranges = backup->num_soft_ranges;
268  return SUBJECT_SPLIT_OK;
269  }
270 
271  /* if soft masking is off */
272  if (subject->mask_type != eSoftSubjMasking) {
273  s_AllocateSeqRange(subject, backup, 1);
274  subject->seq_ranges[0].left = residual;
275  subject->seq_ranges[0].right = subject->length;
276  return SUBJECT_SPLIT_OK;
277  }
278 
279  /* soft masking is on, sequence is chunked, must re-allocate and adjust */
280  ASSERT (residual == 0);
281  start = backup->offset;
282  len = start + subject->length;
283  i = backup->sm_index;
284 
285  while (backup->soft_ranges[i].right < start) ++i;
286  start = i;
287 
288  while (i < backup->num_soft_ranges
289  && backup->soft_ranges[i].left < len) ++i;
290  len = i - start;
291  backup->sm_index = i - 1;
292 
293  ASSERT(len >= 0);
294  ASSERT(backup->sm_index >= 0);
295 
296  if (len == 0) return SUBJECT_SPLIT_NO_RANGE;
297 
298  s_AllocateSeqRange(subject, backup, len);
299 
300  for (i=0; i<len; i++) {
301  subject->seq_ranges[i].left = backup->soft_ranges[i+start].left - backup->offset;
302  subject->seq_ranges[i].right = backup->soft_ranges[i+start].right - backup->offset;
303  }
304 
305  if (subject->seq_ranges[0].left < 0)
306  subject->seq_ranges[0].left = 0;
307  if (subject->seq_ranges[len-1].right > subject->length)
308  subject->seq_ranges[len-1].right = subject->length;
309 
310  return SUBJECT_SPLIT_OK;
311 }
312 
313 /** Adjust HSP coordinates for out-of-frame gapped extension.
314  * @param program One of blastx or tblastn [in]
315  * @param init_hitlist List of hits after ungapped extension [in]
316  * @param query_info Query information containing context offsets;
317  * needed for blastx only [in]
318  * @param subject_frame Frame of the subject sequence; tblastn only [in]
319  * @param subject_length Length of the original nucleotide subject sequence;
320  * tblastn only [in]
321  * @param offset Shift in the subject sequence protein coordinates [in]
322  */
323 static void
325  BlastInitHitList* init_hitlist,
326  const BlastQueryInfo* query_info,
327  Int2 subject_frame, Int4 subject_length,
328  Int4 offset)
329 {
330  BlastInitHSP * init_hsp = 0;
331  Int4 index = 0;
332 
333  for(index = 0; index < init_hitlist->total; ++index) {
334  BlastContextInfo * contexts = query_info->contexts;
335  init_hsp = &init_hitlist->init_hsp_array[index];
336 
337  if (program == eBlastTypeBlastx) {
338  Int4 context_idx = 0; /* Index of this HSP's context */
339  Int4 frame_idx = 0; /* Index of this frame within set of
340  frames with same query and sign */
341  Int4 init_frame_idx = 0; /* First frame of this query */
342  Int4 frame_pos = 0; /* Start of this frame in DNA */
343 
344  /* Find context containing this HSP */
345  context_idx =
347  query_info);
348 
349  frame_idx = context_idx % CODON_LENGTH;
350  init_frame_idx = context_idx - frame_idx;
351 
352  frame_pos = contexts[init_frame_idx].query_offset + frame_idx;
353 
354  init_hsp->offsets.qs_offsets.q_off =
355  (init_hsp->offsets.qs_offsets.q_off -
356  contexts[context_idx].query_offset) * CODON_LENGTH + frame_pos;
357 
358  init_hsp->ungapped_data->q_start =
359  (init_hsp->ungapped_data->q_start -
360  contexts[context_idx].query_offset) * CODON_LENGTH + frame_pos;
361  } else {
362  init_hsp->offsets.qs_offsets.s_off += offset;
363  init_hsp->ungapped_data->s_start += offset;
364  if (subject_frame > 0) {
365  init_hsp->offsets.qs_offsets.s_off =
366  (init_hsp->offsets.qs_offsets.s_off * CODON_LENGTH) +
367  subject_frame - 1;
368  init_hsp->ungapped_data->s_start =
369  (init_hsp->ungapped_data->s_start * CODON_LENGTH) +
370  subject_frame - 1;
371  } else {
372  init_hsp->offsets.qs_offsets.s_off =
373  (init_hsp->offsets.qs_offsets.s_off * CODON_LENGTH) +
374  subject_length - subject_frame;
375  init_hsp->ungapped_data->s_start =
376  (init_hsp->ungapped_data->s_start * CODON_LENGTH) +
377  subject_length - subject_frame;
378  }
379  }
380  }
381  Blast_InitHitListSortByScore(init_hitlist);
382 }
383 
384 /** Set up context offsets for the auxiliary BlastQueryInfo structure that is
385  * created for the concatenated database in RPS BLAST search. Calls the public
386  * function OffsetArrayToContextOffsets with a blastp program, because subjects
387  * are protein sequences. This guarantees that all frames are set to 0.
388  * @param info The preallocated structure [in] [out]
389  * @param new_offsets The array context offsets to fill [in]
390  */
391 static void
393  Int4 * new_offsets)
394 {
395  const EBlastProgramType kProgram = eBlastTypeBlastp;
396  OffsetArrayToContextOffsets(info, new_offsets, kProgram);
397 }
398 
399 /** Searches only one context of a database sequence, but does all chunks if it is split.
400  * @param program_number BLAST program type [in]
401  * @param query Query sequence structure [in]
402  * @param query_info Query information [in]
403  * @param subject Subject sequence structure [in]
404  * @param orig_length original length of query before translation [in]
405  * @param lookup Lookup table [in]
406  * @param gap_align Structure for gapped alignment information [in]
407  * @param score_params Scoring parameters [in]
408  * @param word_params Initial word finding and ungapped extension
409  * parameters [in]
410  * @param ext_params Gapped extension parameters [in]
411  * @param hit_params Hit saving parameters [in]
412  * @param diagnostics Hit counts and other diagnostics [in] [out]
413  * @param aux_struct Structure containing different auxiliary data and memory
414  * for the preliminary stage of the BLAST search [in]
415  * @param hsp_list_out_ptr List of HSPs found for a given subject sequence [out]
416  * @param interrupt_search function callback to allow interruption of BLAST
417  * search [in, optional]
418  * @param progress_info contains information about the progress of the current
419  * BLAST search [in|out]
420  */
421 
422 static Int2
424  BLAST_SequenceBlk* query, BlastQueryInfo* query_info,
426  BlastGapAlignStruct* gap_align,
427  const BlastScoringParameters* score_params,
428  const BlastInitialWordParameters* word_params,
429  const BlastExtensionParameters* ext_params,
430  const BlastHitSavingParameters* hit_params,
431  BlastDiagnostics* diagnostics,
432  BlastCoreAuxStruct* aux_struct,
433  BlastHSPList** hsp_list_out_ptr,
434  TInterruptFnPtr interrupt_search,
435  SBlastProgress* progress_info)
436 {
437  Int2 status = 0; /* return value */
438  BlastHSPList* combined_hsp_list = NULL;
439  BlastHSPList* hsp_list = NULL;
440  BlastInitHitList* init_hitlist = aux_struct->init_hitlist;
441  BlastScoringOptions* score_options = score_params->options;
442  BlastUngappedStats* ungapped_stats = NULL;
443  BlastGappedStats* gapped_stats = NULL;
444  Int4 **matrix = (gap_align->positionBased) ?
445  gap_align->sbp->psi_matrix->pssm->data :
446  gap_align->sbp->matrix->data;
447  const Boolean kTranslatedSubject =
448  (Blast_SubjectIsTranslated(program_number) || program_number == eBlastTypeRpsTblastn);
449  const Boolean kNucleotide = Blast_ProgramIsNucleotide(program_number);
450  const int kHspNumMax = BlastHspNumMax(score_options->gapped_calculation, hit_params->options);
451  const int kScanSubjectOffsetArraySize = GetOffsetArraySize(lookup);
452  Int4 dbseq_chunk_overlap;
453  Int4 overlap;
454 
455  SubjectSplitStruct backup;
456  backup.sequence = NULL;
457 
458  /* increase overlap on db chunks for mapping to 1.5 times the longest
459  query */
460 
461  if (Blast_ProgramIsMapping(program_number) &&
462  (Int4)query_info->max_length < 110) {
463 
464  dbseq_chunk_overlap = (Int4)query_info->max_length
465  + (query_info->max_length / 2);
466  }
467  else {
468  dbseq_chunk_overlap = DBSEQ_CHUNK_OVERLAP;
469  }
470 
471  if (diagnostics) {
472  ungapped_stats = diagnostics->ungapped_stat;
473  gapped_stats = diagnostics->gapped_stat;
474  }
475 
476  s_BackupSubject(subject, &backup);
477 
478  while (TRUE) {
479  status = s_GetNextSubjectChunk(subject, &backup, kNucleotide,
480  dbseq_chunk_overlap);
481 
482  if (status == SUBJECT_SPLIT_DONE) break;
483  if (status == SUBJECT_SPLIT_NO_RANGE) continue;
484  ASSERT(status == SUBJECT_SPLIT_OK);
485  ASSERT(subject->num_seq_ranges >= 1);
486  ASSERT(subject->seq_ranges);
487 
488  /* Delete if not done in last loop iteration to prevent memory leak. */
489  hsp_list = Blast_HSPListFree(hsp_list);
490 
491  BlastInitHitListReset(init_hitlist);
492 
493  if (aux_struct->WordFinder) {
494  aux_struct->WordFinder(subject, query, query_info, lookup, matrix,
495  word_params, aux_struct->ewp,
496  aux_struct->offset_pairs,
497  kScanSubjectOffsetArraySize,
498  init_hitlist, ungapped_stats);
499 
500  if (init_hitlist->total == 0) continue;
501  }
502 
503  if (score_options->gapped_calculation) {
504  Int4 prot_length = 0;
505  if (score_options->is_ooframe) {
506  /* Convert query offsets in all HSPs into the mixed-frame
507  coordinates */
508  s_TranslateHSPsToDNAPCoord(program_number, init_hitlist,
509  query_info, subject->frame, orig_length, backup.offset);
510  if (kTranslatedSubject) {
511  prot_length = subject->length;
512  subject->length = orig_length;
513  }
514  }
515  /** NB: If queries are concatenated, HSP offsets must be adjusted
516  * inside the following function call, so coordinates are
517  * relative to the individual contexts (i.e. queries, strands or
518  * frames). Contexts should also be filled in HSPs when they
519  * are saved.
520  */
521  /* fence_hit is null, since this is only for prelim stage. */
522  if (aux_struct->GetGappedScore) {
523  status = aux_struct->GetGappedScore(program_number, query,
524  query_info,
525  subject, gap_align, score_params, ext_params, hit_params,
526  init_hitlist, &hsp_list, gapped_stats, NULL);
527  }
528  else if (aux_struct->JumperGapped) {
529  status = aux_struct->JumperGapped(subject, query, query_info,
530  lookup, word_params,
531  score_params, hit_params,
532  aux_struct->offset_pairs,
533  aux_struct->mapper_wordhits,
534  kScanSubjectOffsetArraySize,
535  gap_align, init_hitlist,
536  &hsp_list, ungapped_stats,
537  gapped_stats);
538  }
539  if (status) break;
540 
541  /* No need to do this for short reads */
542  if (aux_struct->GetGappedScore) {
543 
544  /* Removes redundant HSPs. */
545  Blast_HSPListPurgeHSPsWithCommonEndpoints(program_number, hsp_list, TRUE);
546 
547  /* For nucleotide search, if match score is = 2, the odd scores
548  are rounded down to the nearest even number. */
549 #if 0
550  Blast_HSPListAdjustOddBlastnScores(hsp_list, score_options->gapped_calculation, gap_align->sbp);
551 #endif
552 
553  }
554 
555  Blast_HSPListSortByScore(hsp_list);
556 
557 
558  if (score_options->is_ooframe && kTranslatedSubject)
559  subject->length = prot_length;
560  } else {
561  BLAST_GetUngappedHSPList(init_hitlist, query_info, subject,
562  hit_params->options, &hsp_list);
563  }
564 
565  if (hsp_list->hspcnt == 0) continue;
566 
567  /* The subject ordinal id is not yet filled in this HSP list */
568  hsp_list->oid = subject->oid;
569 
570  /* check for interrupt */
571  if (interrupt_search && (*interrupt_search)(progress_info) == TRUE) {
572  combined_hsp_list = Blast_HSPListFree(combined_hsp_list);
573  BlastInitHitListReset(init_hitlist);
574  status = BLASTERR_INTERRUPTED;
575  break;
576  }
577 
578  Blast_HSPListAdjustOffsets(hsp_list, backup.offset);
579  overlap = (backup.offset == backup.hard_ranges[backup.hm_index].left) ?
580  0 : dbseq_chunk_overlap;
581  status = Blast_HSPListsMerge(&hsp_list, &combined_hsp_list,
582  kHspNumMax, &(backup.offset), INT4_MIN,
583  overlap, score_options->gapped_calculation,
584  Blast_ProgramIsMapping(program_number));
585 
586  if((hit_params->options->hsp_filt_opt != NULL) &&
587  (hit_params->options->hsp_filt_opt->subject_besthit_opts != NULL)) {
588  Blast_HSPListSubjectBestHit(program_number,
590  query_info, combined_hsp_list);
591  }
592 
593  } /* End loop on chunks of subject sequence */
594 
595  s_RestoreSubject(subject, &backup);
596 
597  hsp_list = Blast_HSPListFree(hsp_list); /* In case this was not freed in above loop. */
598 
599  *hsp_list_out_ptr = combined_hsp_list;
600 
601  return status;
602 }
603 
604 /** Clean up function for s_BlastSearchEngineCore
605  * @param program_number BLAST program type [in]
606  * @param query_info Query information structure local to
607  * s_BlastSearchEngineCore, which may or may not be deallocated [in]
608  * @param query_info_in Query information [in]
609  * @param translation_buffer buffer containing translated sequence data [in]
610  * @param frame_offsets_a FIXME
611  */
612 static void
614  BlastQueryInfo* query_info,
615  const BlastQueryInfo* query_info_in,
616  Uint1* translation_buffer,
617  Uint4* frame_offsets_a)
618 {
619  /* Free the local query info structure when needed (in RPS BLAST). */
620  if (query_info != query_info_in)
621  BlastQueryInfoFree(query_info);
622 
623  /* Free translation buffer and frame offsets, except for RPS tblastn,
624  * where they are taken from different structures, and hence shouldn't
625  * be freed here.
626  */
627  if (program_number != eBlastTypeRpsTblastn) {
628  if (translation_buffer) {
629  sfree(translation_buffer);
630  }
631  }
632 
633  if (frame_offsets_a) {
634  sfree(frame_offsets_a);
635  }
636 }
637 
638 /** Discard the HSPs above the prelim e-value threshold from the HSP list
639  * @param hsp_list List of HSPs for one subject sequence [in] [out]
640  * @param hit_params Parameters block containing the prelim e-value [in]
641 */
642 static Int2
644 {
645  BlastHSP* hsp;
646  BlastHSP** hsp_array;
647  Int4 hsp_cnt = 0;
648  Int4 index;
649  double cutoff;
650 
651  if (hsp_list == NULL)
652  return 0;
653 
654  cutoff = hit_params->prelim_evalue;
655 
656  hsp_array = hsp_list->hsp_array;
657  for (index = 0; index < hsp_list->hspcnt; index++) {
658  hsp = hsp_array[index];
659 
660  ASSERT(hsp != NULL);
661 
662  if (hsp->evalue > cutoff) {
663  hsp_array[index] = Blast_HSPFree(hsp_array[index]);
664  } else {
665  if (index > hsp_cnt)
666  hsp_array[hsp_cnt] = hsp_array[index];
667  hsp_cnt++;
668  }
669  }
670 
671  hsp_list->hspcnt = hsp_cnt;
672 
673  return 0;
674 }
675 
676 /** The core of the BLAST search: comparison between the (concatenated)
677  * query against one subject sequence. Translation of the subject sequence
678  * into 6 frames is done inside, if necessary. If subject sequence is
679  * too long, it can be split into several chunks.
680  * @param program_number BLAST program type [in]
681  * @param query Query sequence structure [in]
682  * @param query_info_in Query information [in]
683  * @param subject Subject sequence structure [in]
684  * @param lookup Lookup table [in]
685  * @param gap_align Structure for gapped alignment information [in]
686  * @param score_params Scoring parameters [in]
687  * @param word_params Initial word finding and ungapped extension
688  * parameters [in]
689  * @param ext_params Gapped extension parameters [in]
690  * @param hit_params Hit saving parameters [in]
691  * @param db_options Database options [in]
692  * @param diagnostics Hit counts and other diagnostics [in] [out]
693  * @param aux_struct Structure containing different auxiliary data and memory
694  * for the preliminary stage of the BLAST search [in]
695  * @param hsp_list_out_ptr List of HSPs found for a given subject sequence [in]
696  * @param interrupt_search function callback to allow interruption of BLAST
697  * search [in, optional]
698  * @param progress_info contains information about the progress of the current
699  * BLAST search [in|out]
700  */
701 static Int2
704  BlastQueryInfo* query_info_in,
707  BlastGapAlignStruct* gap_align,
708  const BlastScoringParameters* score_params,
709  const BlastInitialWordParameters* word_params,
710  const BlastExtensionParameters* ext_params,
711  const BlastHitSavingParameters* hit_params,
712  const BlastDatabaseOptions* db_options,
713  BlastDiagnostics* diagnostics,
714  BlastCoreAuxStruct* aux_struct,
715  BlastHSPList** hsp_list_out_ptr,
716  TInterruptFnPtr interrupt_search,
717  SBlastProgress* progress_info)
718 {
719  BlastHSPList* hsp_list_out=NULL;
720  Uint1* translation_buffer = NULL;
721  Uint4* frame_offsets = NULL;
722  Uint4* frame_offsets_a = NULL; /* Will be freed if non-null */
723  BlastHitSavingOptions* hit_options = hit_params->options;
724  BlastScoringOptions* score_options = score_params->options;
725  Int2 status = 0;
726  Uint4 context, first_context, last_context;
727  BlastQueryInfo* query_info = query_info_in;
728  Int4 orig_length = subject->length;
729  Int4 stat_length = subject->length;
730  // To support rmblastn -RMH-
731  BlastScoreBlk* sbp = gap_align->sbp;
732 
733  const Boolean kTranslatedSubject =
734  (Blast_SubjectIsTranslated(program_number) || program_number == eBlastTypeRpsTblastn);
735  const Boolean kNucleotide = Blast_ProgramIsNucleotide(program_number);
736  const int kHspNumMax = BlastHspNumMax(score_options->gapped_calculation, hit_options);
737  Boolean isRPS = FALSE;
738  SubjectSplitStruct backup = {0,};
739 
740  if (Blast_ProgramIsRpsBlast(program_number))
741  isRPS = TRUE;
742 
743  backup.sequence = NULL;
744 
745  *hsp_list_out_ptr = NULL;
746 
747  if (kTranslatedSubject) {
748 
749  s_BackupSubject(subject, &backup);
750  if (subject->mask_type != eNoSubjMasking) {
751  s_AllocateSeqRange(subject, &backup, backup.num_seq_ranges);
752  } else {
753  subject->num_seq_ranges = 0;
754  subject->seq_ranges = NULL;
755  }
756 
757  first_context = 0;
758  last_context = 5;
759  if (score_options->is_ooframe) {
761  backup.full_range.right,
762  subject->gen_code_string, &translation_buffer,
763  &frame_offsets, &subject->oof_sequence);
764  subject->oof_sequence_allocated = TRUE;
765  frame_offsets_a = frame_offsets;
766  } else if (program_number == eBlastTypeRpsTblastn ) {
767  /* For RPS tblastn, subject is actually query, which has already
768  been translated during the setup stage. */
769  translation_buffer = backup.sequence - 1;
770  frame_offsets_a = frame_offsets = (Uint4*)ContextOffsetsToOffsetArray(query_info_in);
771  } else {
773  backup.full_range.right,
774  subject->gen_code_string, &translation_buffer,
775  &frame_offsets, NULL);
776  frame_offsets_a = frame_offsets;
777  /* The following limits the search to plus or minus strand if desired. */
778  if (subject->subject_strand == 1) {
779  first_context = 0;
780  last_context = 2;
781  } else if (subject->subject_strand == 2) {
782  first_context = 3;
783  last_context = 5;
784  }
785  }
786  } else if (kNucleotide) {
787  first_context = 1;
788  last_context = 1;
789  } else {
790  first_context = 0;
791  last_context = 0;
792  }
793 
794 
795  /* Substitute query info by concatenated database info for RPS BLAST search */
796  if (isRPS) {
799  /* Since this will really be "subject info", not "query info",
800  pass program argument such that all frames will be set to 0. */
802  }
803 
804  /* Loop over frames of the subject sequence */
805  for (context=first_context; context<=last_context; context++) {
806  BlastHSPList* hsp_list_for_chunks = NULL;
807 
808  if (kTranslatedSubject) {
809  Uint4 i;
811  subject->sequence = translation_buffer + frame_offsets[context] + 1;
812  subject->length = frame_offsets[context+1] - frame_offsets[context] - 1;
813  if (subject->length > 0) stat_length = subject->length;
814 
815  /* perform per-context mask translation */
816  if (context == 0) { /* first positive context */
817  for (i = 0; i < subject->num_seq_ranges; i++) {
818  subject->seq_ranges[i].left =
820  subject->seq_ranges[i].right =
822  }
823  } else if (context == 3) { /* first negative context */
824  for (i = 0; i < subject->num_seq_ranges; i++) {
825  subject->seq_ranges[subject->num_seq_ranges-i-1].left =
827  subject->seq_ranges[subject->num_seq_ranges-i-1].right =
829  }
830  }
831  } else {
832  subject->frame = context;
833  }
834 
835  status = s_BlastSearchEngineOneContext(program_number, query, query_info,
836  subject, orig_length, lookup,
837  gap_align, score_params,
838  word_params, ext_params,
839  hit_params, diagnostics,
840  aux_struct, &hsp_list_for_chunks,
841  interrupt_search, progress_info);
842  if (status != 0) break;
843 
844  if (Blast_HSPListAppend(&hsp_list_for_chunks, &hsp_list_out, kHspNumMax)) {
845  status = 1;
846  break;
847  }
848 
849  /* if searching was interrupted, delete accumulated results
850  but continue execution so temporary structures get freed */
851  if (interrupt_search && (*interrupt_search)(progress_info) == TRUE) {
852  status = BLASTERR_INTERRUPTED;
853  break;
854  }
855  } /* End loop on frames */
856 
857  /* restore mask ranges */
858  if (kTranslatedSubject) {
859  s_RestoreSubject(subject, &backup);
860  }
861 
862  if (status) {
863  hsp_list_out = Blast_HSPListFree(hsp_list_out);
864  s_BlastSearchEngineCoreCleanUp(program_number, query_info,
865  query_info_in, translation_buffer,
866  frame_offsets_a);
867  return status;
868  }
869 
870  if (hit_params->link_hsp_params) {
871  status = BLAST_LinkHsps(program_number, hsp_list_out, query_info,
872  subject->length, gap_align->sbp, hit_params->link_hsp_params,
873  score_options->gapped_calculation);
874  } else if (!Blast_ProgramIsPhiBlast(program_number)
875  && !(isRPS && !sbp->gbp)
876  /* do not calculate E-values for mapping */
877  && program_number != eBlastTypeMapping ) {
878  /* Calculate e-values for all HSPs. Skip this step
879  for PHI or RPS with old FSC, since calculating the E values
880  requires precomputation that has not been done yet */
881  double scale_factor = 1.0;
882  if (isRPS) {
883  scale_factor = score_params->scale_factor;
884  }
885  Blast_HSPListGetEvalues(program_number, query_info,
886  stat_length, hsp_list_out,
887  score_options->gapped_calculation,
888  isRPS, gap_align->sbp, 0, scale_factor);
889  }
890 
891  /* Use score threshold rather than evalue if
892  * matrix_only_scoring is used. -RMH-
893  */
894  if ( sbp->matrix_only_scoring )
895  {
896  status = Blast_HSPListReapByRawScore(hsp_list_out, hit_options);
897  }else {
898  /* Discard HSPs that don't pass the e-value test. */
899  status = s_Blast_HSPListReapByPrelimEvalue(hsp_list_out, hit_params);
900  }
901 
902  /* If there are no HSPs left, destroy the HSP list too. */
903  if (hsp_list_out && hsp_list_out->hspcnt == 0)
904  *hsp_list_out_ptr = hsp_list_out = Blast_HSPListFree(hsp_list_out);
905 
906  if (diagnostics && diagnostics->gapped_stat && hsp_list_out && hsp_list_out->hspcnt > 0) {
907  BlastGappedStats* gapped_stats = diagnostics->gapped_stat;
908  ++gapped_stats->num_seqs_passed;
909  gapped_stats->good_extensions += hsp_list_out->hspcnt;
910  }
911 
912  s_BlastSearchEngineCoreCleanUp(program_number, query_info, query_info_in,
913  translation_buffer, frame_offsets_a);
914 
915  *hsp_list_out_ptr = hsp_list_out;
916 
917  return status;
918 }
919 
920 /** Fills the output information about the cutoffs uses in a BLAST search.
921  * @param return_cutoffs Structure for saving cutoffs information [in] [out]
922  * @param score_params Scoring parameters, containing the scaling factor [in]
923  * @param word_params Initial word parameters [in]
924  * @param ext_params Gapped extension parameters [in]
925  * @param hit_params Hit saving parameters [in]
926  */
927 static Int2
929  const BlastScoringParameters* score_params,
930  const BlastInitialWordParameters* word_params,
931  const BlastExtensionParameters* ext_params,
932  const BlastHitSavingParameters* hit_params)
933 {
934  /* since the cutoff score here will be used for display
935  purposes, strip out any internal scaling of the scores
936 
937  If this was a multi-query search, use the least stringent
938  cutoff and most generous dropoff value among all the
939  possible sequences */
940 
941  Int4 scale_factor = (Int4)score_params->scale_factor;
942 
943  if (!return_cutoffs)
944  return -1;
945 
946  return_cutoffs->x_drop_ungapped = word_params->x_dropoff_max / scale_factor;
947  return_cutoffs->x_drop_gap = ext_params->gap_x_dropoff / scale_factor;
948  return_cutoffs->x_drop_gap_final = ext_params->gap_x_dropoff_final /
949  scale_factor;
950  return_cutoffs->ungapped_cutoff = word_params->cutoff_score_min /
951  scale_factor;
952  return_cutoffs->cutoff_score = hit_params->cutoff_score_min / scale_factor;
953 
954  return 0;
955 }
956 
957 /** Setup of the auxiliary BLAST structures;
958  * also calculates internally used parameters from options.
959  * @param seq_src Sequence source information, with callbacks to get
960  * sequences, their lengths, etc. [in]
961  * @param lookup_wrap Lookup table, already constructed. [in]
962  * @param word_params Parameters for initial word finding and ungapped
963  * extension. [in]
964  * @param ext_options options for gapped extension. [in]
965  * @param hit_options options for saving hits. [in]
966  * @param query The query sequence block [in]
967  * @param aux_struct_ptr Placeholder joining various auxiliary memory
968  * structures [out]
969  */
970 static Int2
972  LookupTableWrap* lookup_wrap,
973  const BlastInitialWordParameters* word_params,
974  const BlastExtensionOptions* ext_options,
975  const BlastHitSavingOptions* hit_options,
977  const BlastQueryInfo* query_info, BlastCoreAuxStruct** aux_struct_ptr)
978 {
979  Int2 status = 0;
980  BlastCoreAuxStruct* aux_struct;
981  Boolean blastp = (lookup_wrap->lut_type == eAaLookupTable ||
982  lookup_wrap->lut_type == eCompressedAaLookupTable);
983  Boolean rpsblast = (lookup_wrap->lut_type == eRPSLookupTable);
984  // Boolean indexed_mb_lookup = (lookup_wrap->lut_type == eIndexedMBLookupTable);
985  Boolean indexed_mb_lookup = (lookup_wrap->read_indexed_db != 0);
986  Boolean phi_lookup = (lookup_wrap->lut_type == ePhiLookupTable ||
987  lookup_wrap->lut_type == ePhiNaLookupTable);
988  Boolean smith_waterman =
989  (ext_options->ePrelimGapExt == eSmithWatermanScoreOnly);
990 
991  Boolean jumper = (ext_options->ePrelimGapExt == eJumperWithTraceback);
992  Int4 offset_array_size = GetOffsetArraySize(lookup_wrap);
993 
994  if(phi_lookup) {
995  offset_array_size = PHI_MAX_HIT;
996  }
997  ASSERT(seq_src);
998 
999  *aux_struct_ptr = aux_struct = (BlastCoreAuxStruct*)
1000  calloc(1, sizeof(BlastCoreAuxStruct));
1001 
1002  if ((status = BlastExtendWordNew(query->length, word_params,
1003  &aux_struct->ewp)) != 0)
1004  return status;
1005 
1006  aux_struct->JumperGapped = NULL;
1007  aux_struct->mapper_wordhits = NULL;
1008 
1009  if (smith_waterman) {
1010  aux_struct->WordFinder = NULL;
1011  /*
1012  } else if (indexed_mb_lookup) {
1013  aux_struct->WordFinder = MB_IndexedWordFinder;
1014  */
1015  } else if (phi_lookup) {
1016  aux_struct->WordFinder = PHIBlastWordFinder;
1017  } else if (blastp) {
1018  BlastChooseProteinScanSubject(lookup_wrap);
1019  aux_struct->WordFinder = BlastAaWordFinder;
1020  } else if (rpsblast) {
1021  aux_struct->WordFinder = BlastRPSWordFinder;
1022  } else {
1023  if( lookup_wrap->lut_type != eIndexedMBLookupTable ) {
1024  BlastChooseNucleotideScanSubject(lookup_wrap);
1025  BlastChooseNaExtend(lookup_wrap);
1026  }
1027 
1028  if( indexed_mb_lookup ) {
1029  aux_struct->WordFinder = MB_IndexedWordFinder;
1030  }
1031  else {
1032  aux_struct->WordFinder = BlastNaWordFinder;
1033  }
1034 
1035  if (jumper) {
1036  aux_struct->WordFinder = NULL;
1037  }
1038  }
1039 
1040  aux_struct->offset_pairs =
1041  (BlastOffsetPair*) malloc(offset_array_size * sizeof(BlastOffsetPair));
1042 
1043  aux_struct->init_hitlist = BLAST_InitHitListNew();
1044  /* Pick which gapped alignment algorithm to use. */
1045  if (phi_lookup)
1046  aux_struct->GetGappedScore = PHIGetGappedScore;
1047  else if (smith_waterman)
1049  else if (jumper) {
1050  aux_struct->GetGappedScore = NULL;
1051 
1052  /* Create several lists of word hits for mapper with large number of
1053  queries */
1054  if (query_info->num_queries > 1000) {
1055  aux_struct->mapper_wordhits = MapperWordHitsNew(query, query_info);
1056  }
1057 
1058  if (indexed_mb_lookup) {
1060  }
1061  else {
1062  aux_struct->JumperGapped = JumperNaWordFinder;
1063  }
1064  }
1065  else
1066  aux_struct->GetGappedScore = BLAST_GetGappedScore;
1067 
1068  return status;
1069 }
1070 
1071 /** Performs the preliminary stage of an RPS BLAST search, after all set up has
1072  * already been done.
1073  * @param program_number Type of BLAST program [in]
1074  * @param query The query sequence [in]
1075  * @param query_info Additional query information [in]
1076  * @param seq_src Structure containing BLAST database [in]
1077  * @param score_params Hit scoring parameters [in]
1078  * @param lookup_wrap The lookup table, constructed earlier [in]
1079  * @param aux_struct Wrapper for auxiliary structures used in preliminary
1080  * search [in]
1081  * @param word_params Parameters for processing initial word hits [in]
1082  * @param ext_params Parameters for the gapped extension [in]
1083  * @param gap_align Structure containing scoring block and memory allocated
1084  * for gapped alignment. [in]
1085  * @param hit_params Parameters for saving the HSPs [in]
1086  * @param hsp_stream Placeholder for saving HSP lists [in]
1087  * @param diagnostics Return statistics containing numbers of hits on
1088  * different stages of the search. Statistics saved only
1089  * for the allocated parts of the structure. [in] [out]
1090  * @param interrupt_search function callback to allow interruption of BLAST
1091  * search [in, optional]
1092  * @param progress_info contains information about the progress of the current
1093  * BLAST search [in|out]
1094  */
1095 static Int2
1097  BLAST_SequenceBlk* query, BlastQueryInfo* query_info,
1098  const BlastSeqSrc* seq_src,
1099  const BlastScoringParameters* score_params,
1100  LookupTableWrap* lookup_wrap, BlastCoreAuxStruct* aux_struct,
1101  const BlastInitialWordParameters* word_params,
1102  const BlastExtensionParameters* ext_params,
1103  BlastGapAlignStruct* gap_align,
1104  const BlastHitSavingParameters* hit_params,
1105  BlastHSPStream* hsp_stream, BlastDiagnostics* diagnostics,
1106  TInterruptFnPtr interrupt_search, SBlastProgress* progress_info)
1107 {
1108  BlastHSPList* hsp_list = NULL;
1109  Int2 status = 0;
1110  Int8 dbsize;
1111  Int4 num_db_seqs;
1113  BLAST_SequenceBlk concat_db;
1114  BlastQueryInfo* one_query_info = NULL;
1115  BLAST_SequenceBlk* one_query = NULL;
1116  Int4 index;
1117 
1118  if ( !Blast_ProgramIsRpsBlast(program_number))
1119  return -1;
1120 
1121  /* modify scoring and gap alignment structures for
1122  use with RPS blast. */
1123 
1124  gap_align->positionBased = TRUE;
1125  RPSPsiMatrixAttach(gap_align->sbp, lookup->rps_pssm,
1126  lookup->alphabet_size);
1127 
1128  /* determine the total number of residues in the db.
1129  This figure must also include one trailing NULL for
1130  each DB sequence */
1131 
1132  num_db_seqs = BlastSeqSrcGetNumSeqs(seq_src);
1133  dbsize = BlastSeqSrcGetTotLen(seq_src) + num_db_seqs;
1134  if (dbsize > INT4_MAX)
1135  return -3;
1136 
1137  /* Concatenate all of the DB sequences together, and pretend
1138  this is a large multiplexed sequence. Note that because the
1139  scoring is position-specific, the actual sequence data is
1140  not needed */
1141 
1142  memset(&concat_db, 0, sizeof(concat_db)); /* fill in SequenceBlk */
1143  concat_db.length = (Int4) dbsize;
1144 
1145  /* Change the table of diagonals that will be used for the
1146  search; we need a diag table that can fit the entire
1147  concatenated DB */
1148  BlastExtendWordFree(aux_struct->ewp);
1149  BlastExtendWordNew(concat_db.length, word_params, &aux_struct->ewp);
1150 
1151  /* Run the search; the input query is what gets scanned
1152  and the concatenated DB is the sequence associated with
1153  the score matrix. This essentially means that 'query'
1154  and 'subject' have opposite conventions for the search.
1155 
1156  Note that while scores can be calculated for any alignment
1157  found, we have not set up any Karlin parameters or effective
1158  search space sizes for the concatenated DB. This means that
1159  E-values cannot be calculated after hits are found. */
1160 
1161  for (index = 0; index < query_info->num_queries; ++index) {
1162  /* Separate one query from the set: create an auxiliary query_info
1163  structure which refers to this single query. */
1164  if (Blast_GetOneQueryStructs(&one_query_info, &one_query,
1165  query_info, query, index) != 0)
1166  return -1;
1167 
1168  /* It is OK to pass NULL for the BlastDatabaseOptions argument, because it
1169  will not be checked for RPS BLAST program types. */
1170  status = (Int4)
1171  s_BlastSearchEngineCore(program_number, &concat_db, one_query_info,
1172  one_query, lookup_wrap, gap_align, score_params,
1173  word_params, ext_params, hit_params, NULL,
1174  diagnostics, aux_struct, &hsp_list, interrupt_search,
1175  progress_info);
1176 
1177  if (interrupt_search && (*interrupt_search)(progress_info) == TRUE) {
1178  hsp_list = Blast_HSPListFree(hsp_list);
1179  status = BLASTERR_INTERRUPTED;
1180  break;
1181  }
1182 
1183  /* Save the resulting list of HSPs. 'query' and 'subject' are
1184  still reversed */
1185  if (hsp_list && hsp_list->hspcnt > 0) {
1186  hsp_list->query_index = index;
1187  /* Save the HSP list */
1188  BlastHSPStreamWrite(hsp_stream, &hsp_list);
1189  }
1190  }
1191 
1192  BlastQueryInfoFree(one_query_info);
1193  BlastSequenceBlkFree(one_query);
1194 
1195  /* Restore original settings in the gapped alignment structure. */
1196  RPSPsiMatrixDetach(gap_align->sbp);
1197  gap_align->positionBased = FALSE;
1198 
1199  /* Fill the cutoff values in the diagnostics structure */
1200  if (diagnostics && diagnostics->cutoffs) {
1201  s_FillReturnCutoffsInfo(diagnostics->cutoffs, score_params, word_params,
1202  ext_params, hit_params);
1203  }
1204 
1205  return status;
1206 }
1207 
1208 static void
1210 {
1211  int i = 0;
1212  BlastHSP ** hsp_array = hsp_list->hsp_array;
1213  for(i=0; i < hsp_list->hspcnt; i++) {
1214  BlastHSP * hsp = hsp_array[i];
1215  Boolean fix_co = FALSE;
1216 
1217  if(hsp->subject.frame > 0) {
1218  switch (offset) {
1219  case 1:
1220  if(hsp->subject.frame == 1) {
1221  hsp->subject.frame = 3;
1222  fix_co = TRUE;
1223  }
1224  else if(hsp->subject.frame == 2) {
1225  hsp->subject.frame = 1;
1226  }
1227  else if (hsp->subject.frame == 3) {
1228  hsp->subject.frame = 2;
1229  }
1230  break;
1231  case 2:
1232  if(hsp->subject.frame == 1) {
1233  hsp->subject.frame = 2;
1234  fix_co = TRUE;
1235  }
1236  else if(hsp->subject.frame == 2) {
1237  hsp->subject.frame = 3;
1238  fix_co = TRUE;
1239  }
1240  else if (hsp->subject.frame == 3) {
1241  hsp->subject.frame = 1;
1242  }
1243  break;
1244  case 3:
1245  fix_co = TRUE;
1246  break;
1247  default:
1248  break;
1249  }
1250 
1251  if(fix_co) {
1252  if(hsp->subject.offset == 0)
1253  hsp->query.offset +=1;
1254 
1255  if(hsp->subject.offset > 0)
1256  hsp->subject.offset -=1;
1257 
1258  hsp->subject.end -=1;
1259  hsp->subject.gapped_start -=1;
1260  }
1261  }
1262  else {
1263  Int4 max_end = length - offset;
1264  Int4 subj_end = (hsp->subject.end + 1) *CODON_LENGTH -hsp->subject.frame -2;
1265  if(subj_end > max_end)
1266  {
1267  hsp->subject.end -=1;
1268  hsp->query.end -=1;
1269  }
1270  }
1271  }
1272 }
1273 
1274 static void
1276 {
1277  int i = 0;
1278  BlastHSP ** hsp_array = hsp_list->hsp_array;
1279 
1280  for(i=0; i < hsp_list->hspcnt; i++)
1281  {
1282  BlastHSP * hsp = hsp_array[i];
1283 
1284  if(hsp->subject.offset <= offset)
1285  {
1286  unsigned int q_offset = offset - hsp->subject.offset;
1287  hsp->subject.offset = 0;
1288  hsp->query.offset += q_offset;
1289 
1290  if(hsp->subject.gapped_start <= offset)
1291  {
1292  hsp->subject.gapped_start = 0;
1293  }
1294  else
1295  {
1296  hsp->subject.gapped_start -= offset;
1297  }
1298 
1299  if(hsp->query.gapped_start < hsp->query.offset)
1300  {
1301  hsp->query.gapped_start += q_offset;
1302  }
1303  }
1304  else
1305  {
1306  hsp->subject.offset -= offset;
1307  hsp->subject.gapped_start -= offset;
1308  }
1309 
1310  hsp->subject.end -= offset;
1311 
1312  ASSERT(hsp->subject.offset < hsp->subject.end);
1313  ASSERT(hsp->query.offset < hsp->query.end);
1314  }
1315 }
1316 
1317 
1319 {
1320  Int4 word_length = 1;
1321  if (lookup_wrap->lut_type == eAaLookupTable) {
1322  BlastAaLookupTable* lut;
1323  lut = (BlastAaLookupTable*)(lookup_wrap->lut);
1324  word_length = lut->word_length;
1325  } else if (lookup_wrap->lut_type == eCompressedAaLookupTable) {
1327  lut = (BlastCompressedAaLookupTable*)(lookup_wrap->lut);
1328  word_length = lut->word_length;
1329  }
1330  return word_length * 3 + 2;
1331 }
1332 
1333 
1334 Int4
1336  BLAST_SequenceBlk* query, BlastQueryInfo* query_info,
1337  const BlastSeqSrc* seq_src, BlastGapAlignStruct* gap_align,
1338  BlastScoringParameters* score_params,
1339  LookupTableWrap* lookup_wrap,
1340  const BlastInitialWordOptions* word_options,
1341  BlastExtensionParameters* ext_params,
1342  BlastHitSavingParameters* hit_params,
1343  BlastEffectiveLengthsParameters* eff_len_params,
1344  const PSIBlastOptions* psi_options,
1345  const BlastDatabaseOptions* db_options,
1346  BlastHSPStream* hsp_stream, BlastDiagnostics* diagnostics,
1347  TInterruptFnPtr interrupt_search, SBlastProgress* progress_info)
1348 {
1349  BlastCoreAuxStruct* aux_struct = NULL;
1350  BlastHSPList* hsp_list = NULL;
1351  BlastSeqSrcGetSeqArg seq_arg;
1352  Int2 status = 0;
1353  Int8 db_length = 0;
1354  Int4 min_subj_seq_length = 1;
1355  const BlastScoringOptions* score_options = score_params->options;
1356  const BlastHitSavingOptions* hit_options = hit_params->options;
1357  const BlastExtensionOptions* ext_options = ext_params->options;
1358  BlastInitialWordParameters* word_params = NULL;
1359  Boolean gapped_calculation = score_options->gapped_calculation;
1360  BlastScoreBlk* sbp = gap_align->sbp;
1361  BlastSeqSrcIterator* itr;
1362  const Boolean kNucleotide = Blast_ProgramIsNucleotide(program_number);
1363 
1364  T_MB_IdbCheckOid check_index_oid =
1365  (T_MB_IdbCheckOid)lookup_wrap->check_index_oid;
1366  Int4 last_vol_idx = LAST_VOL_IDX_INIT;
1367 
1368  if (Blast_SubjectIsTranslated(program_number)) {
1369  min_subj_seq_length = s_GetMinimumSubjSeqLen(lookup_wrap);
1370  }
1371 
1372  BlastInitialWordParametersNew(program_number, word_options,
1373  hit_params, lookup_wrap, sbp, query_info,
1374  BlastSeqSrcGetAvgSeqLen(seq_src), &word_params);
1375 
1376  if ((status =
1377  s_BlastSetUpAuxStructures(seq_src, lookup_wrap, word_params,
1378  ext_options, hit_options, query, query_info, &aux_struct)) != 0)
1379  return status;
1380 
1381  /* remember the current search state */
1382  if (progress_info)
1383  progress_info->stage = ePrelimSearch;
1384 
1385  /* For RPS BLAST, there is no loop over subject sequences, so the preliminary
1386  search engine is done in a separate function. */
1387  if (Blast_ProgramIsRpsBlast(program_number)) {
1388  status =
1389  s_RPSPreliminarySearchEngine(program_number, query, query_info,
1390  seq_src, score_params, lookup_wrap, aux_struct, word_params,
1391  ext_params, gap_align, hit_params, hsp_stream, diagnostics,
1392  interrupt_search, progress_info);
1393  word_params = BlastInitialWordParametersFree(word_params);
1394  s_BlastCoreAuxStructFree(aux_struct);
1395  return status;
1396  }
1397 
1398  /* Update the parameters for linking HSPs, if necessary. */
1399  BlastLinkHSPParametersUpdate(word_params, hit_params, gapped_calculation);
1400 
1401  memset((void*) &seq_arg, 0, sizeof(seq_arg));
1402 
1403  /* Encoding is set so there are no sentinel bytes, and protein/nucleotide
1404  sequences are retieved in ncbistdaa/ncbi2na encodings respectively. */
1405  seq_arg.encoding = eBlastEncodingProtein;
1406 
1407  db_length = BlastSeqSrcGetTotLen(seq_src);
1408 
1409  itr = BlastSeqSrcIteratorNewEx(MAX(BlastSeqSrcGetNumSeqs(seq_src)/100,1));
1410 
1411  /* iterate over all subject sequences */
1412  while ( (seq_arg.oid = BlastSeqSrcIteratorNext(seq_src, itr))
1413  != BLAST_SEQSRC_EOF) {
1414  Int4 stat_length;
1415  if (seq_arg.oid == BLAST_SEQSRC_ERROR) {
1416  status = BLASTERR_SEQSRC;
1417  break;
1418  }
1419 
1420  if (check_index_oid != 0 &&
1421  check_index_oid(seq_arg.oid, &last_vol_idx) == eNoResults) {
1422  continue;
1423  }
1424 
1425  if (BlastSeqSrcGetSequence(seq_src, &seq_arg) < 0) {
1426  continue;
1427  }
1428 
1429  if (seq_arg.seq->length < min_subj_seq_length) {
1430  BlastSeqSrcReleaseSequence(seq_src, &seq_arg);
1431  continue;
1432  }
1433 
1434  if (db_length == 0) {
1435  /* This is not a database search, hence need to recalculate and save
1436  the effective search spaces and length adjustments for all
1437  queries based on the length of the current single subject
1438  sequence. */
1439  if ((status = BLAST_OneSubjectUpdateParameters(program_number,
1440  seq_arg.seq->length, score_options, query_info,
1441  sbp, hit_params, word_params,
1442  eff_len_params)) != 0)
1443  return status;
1444  }
1445 
1446  stat_length = seq_arg.seq->length;
1447 
1448  /* Calculate cutoff scores for linking HSPs. Do this only for
1449  ungapped protein searches and ungapped translated
1450  searches. */
1451  if (hit_params->link_hsp_params && !kNucleotide &&
1452  !gapped_calculation) {
1453  CalculateLinkHSPCutoffs(program_number, query_info, sbp,
1454  hit_params->link_hsp_params, word_params, db_length,
1455  seq_arg.seq->length);
1456  }
1457 
1458  if (Blast_SubjectIsTranslated(program_number)) {
1459  /* If the subject is translated and the BlastSeqSrc implementation
1460  * doesn't provide a genetic code string, use the default genetic
1461  * code for all subjects (as in the C toolkit) */
1462  if (seq_arg.seq->gen_code_string == NULL) {
1463  seq_arg.seq->gen_code_string =
1464  GenCodeSingletonFind(db_options->genetic_code);
1465  }
1466  ASSERT(seq_arg.seq->gen_code_string);
1467  stat_length /= CODON_LENGTH;
1468  }
1469  status =
1470  s_BlastSearchEngineCore(program_number, query, query_info,
1471  seq_arg.seq, lookup_wrap, gap_align,
1472  score_params, word_params, ext_params,
1473  hit_params, db_options, diagnostics,
1474  aux_struct, &hsp_list,
1475  interrupt_search, progress_info);
1476  if (status) {
1477  break;
1478  }
1479 
1480  if (hsp_list && hsp_list->hspcnt > 0) {
1481  int query_index=0; /* Used to loop over queries below. */
1482  if (!gapped_calculation) {
1483  if(seq_arg.seq->bases_offset > 0)
1484  {
1485  if (Blast_SubjectIsTranslated(program_number))
1486  s_AdjustSubjectForTranslatedSraSearch(hsp_list, seq_arg.seq->bases_offset, seq_arg.seq->length);
1487  else
1488  s_AdjustSubjectForSraSearch(hsp_list, seq_arg.seq->bases_offset);
1489  }
1490  /* The following must be performed for any ungapped
1491  search with a nucleotide database. */
1492  status =
1494  program_number, hsp_list, query,
1495  seq_arg.seq, word_params, hit_params,
1496  query_info, sbp, score_params, seq_src,
1497  seq_arg.seq->gen_code_string);
1498  if (status) {
1499  /* Tell the indexing library that this thread is done with
1500  preliminary search.
1501  */
1502  if( check_index_oid != 0 ) {
1504  lookup_wrap->end_search_indication))( last_vol_idx );
1505  }
1506 
1507  BlastSeqSrcReleaseSequence(seq_src, &seq_arg);
1508  return status;
1509  }
1510  /* Relink HSPs if sum statistics is used, because scores might
1511  * have changed after reevaluation with ambiguities, and there
1512  * will be no traceback stage where relinking is done normally.
1513  * If sum statistics are not used, just recalculate e-values.
1514  */
1515  if (hit_params->link_hsp_params) {
1516  status =
1517  BLAST_LinkHsps(program_number, hsp_list, query_info,
1518  seq_arg.seq->length, sbp,
1519  hit_params->link_hsp_params,
1520  gapped_calculation);
1521  } else {
1522  Blast_HSPListGetEvalues(program_number, query_info,
1523  stat_length, hsp_list,
1524  gapped_calculation, FALSE,
1525  sbp, 0, 1.0);
1526  }
1527  /* Use score threshold rather than evalue if
1528  * matrix_only_scoring is used. -RMH-
1529  */
1530  if ( sbp->matrix_only_scoring )
1531  {
1532  status = Blast_HSPListReapByRawScore(hsp_list,
1533  hit_params->options);
1534  }else {
1535  status = s_Blast_HSPListReapByPrelimEvalue(hsp_list, hit_params);
1536  }
1537 
1538  Blast_HSPListReapByQueryCoverage(hsp_list,hit_params->options, query_info, program_number);
1539  /* Calculate and fill the bit scores, since there will be no
1540  traceback stage where this can be done. */
1541  Blast_HSPListGetBitScores(hsp_list, gapped_calculation, sbp);
1542  }
1543 
1544  // This should only happen for sra searches
1545  if((seq_arg.seq->bases_offset > 0) && (gapped_calculation))
1546  {
1547  if (Blast_SubjectIsTranslated(program_number))
1548  s_AdjustSubjectForTranslatedSraSearch(hsp_list, seq_arg.seq->bases_offset, seq_arg.seq->length);
1549  else
1550  s_AdjustSubjectForSraSearch(hsp_list, seq_arg.seq->bases_offset);
1551  }
1552 
1553  /* Save the results. */
1554  status = BlastHSPStreamWrite(hsp_stream, &hsp_list);
1555  if (status != 0)
1556  break;
1557 
1558  /* Do anchored search for mapping */
1559  if (Blast_ProgramIsMapping(program_number) && getenv("MAPPER_ANCHOR")) {
1560  Int4 word_size = 12;
1561 
1562  DoAnchoredSearch(query, seq_arg.seq, word_size,
1563  query_info, gap_align, score_params,
1564  hit_params, hsp_stream);
1565  }
1566 
1567  if (hit_params->low_score)
1568  {
1569  for (query_index=0; query_index<hsp_stream->results->num_queries; query_index++)
1570  if (hsp_stream->results->hitlist_array[query_index] && hsp_stream->results->hitlist_array[query_index]->heapified)
1571  hit_params->low_score[query_index] =
1572  MAX(hit_params->low_score[query_index],
1573  hit_params->options->low_score_perc*(hsp_stream->results->hitlist_array[query_index]->low_score));
1574  }
1575  }
1576 
1577  BlastSeqSrcReleaseSequence(seq_src, &seq_arg);
1578 
1579  /* check for interrupt */
1580  if (interrupt_search && (*interrupt_search)(progress_info) == TRUE) {
1581  status = BLASTERR_INTERRUPTED;
1582  break;
1583  }
1584  }
1585 
1586  /* Tell the indexing library that this thread is done with
1587  preliminary search.
1588  */
1589  if( check_index_oid != 0 ) {
1591  lookup_wrap->end_search_indication))( last_vol_idx );
1592  }
1593 
1594  hsp_list = Blast_HSPListFree(hsp_list); /* in case we were interrupted */
1595  BlastSequenceBlkFree(seq_arg.seq);
1596  itr = BlastSeqSrcIteratorFree(itr);
1597 
1598  /* Fill the cutoff values in the diagnostics structure */
1599  if (diagnostics && diagnostics->cutoffs) {
1600  s_FillReturnCutoffsInfo(diagnostics->cutoffs, score_params, word_params,
1601  ext_params, hit_params);
1602  }
1603 
1604  word_params = BlastInitialWordParametersFree(word_params);
1605  s_BlastCoreAuxStructFree(aux_struct);
1606  return status;
1607 }
1608 
1609 Int2
1612  BlastQueryInfo* query_info,
1613  const BlastSeqSrc* seq_src,
1614  const BlastScoringOptions* score_options,
1615  BlastScoreBlk* sbp,
1616  LookupTableWrap* lookup_wrap,
1617  const BlastInitialWordOptions* word_options,
1618  const BlastExtensionOptions* ext_options,
1619  const BlastHitSavingOptions* hit_options,
1620  const BlastEffectiveLengthsOptions* eff_len_options,
1621  const PSIBlastOptions* psi_options,
1622  const BlastDatabaseOptions* db_options,
1623  BlastHSPStream* hsp_stream,
1624  BlastDiagnostics* diagnostics)
1625 {
1627  query, query_info, seq_src, score_options, sbp, lookup_wrap,
1628  word_options, ext_options, hit_options, eff_len_options,
1629  psi_options, db_options, hsp_stream, diagnostics, NULL, NULL);
1630 }
1631 
1632 Int2
1635  BlastQueryInfo* query_info,
1636  const BlastSeqSrc* seq_src,
1637  const BlastScoringOptions* score_options,
1638  BlastScoreBlk* sbp,
1639  LookupTableWrap* lookup_wrap,
1640  const BlastInitialWordOptions* word_options,
1641  const BlastExtensionOptions* ext_options,
1642  const BlastHitSavingOptions* hit_options,
1643  const BlastEffectiveLengthsOptions* eff_len_options,
1644  const PSIBlastOptions* psi_options,
1645  const BlastDatabaseOptions* db_options,
1646  BlastHSPStream* hsp_stream,
1647  BlastDiagnostics* diagnostics,
1648  TInterruptFnPtr interrupt_search, SBlastProgress* progress_info)
1649 {
1650  Int2 status = 0;
1651  BlastScoringParameters* score_params = NULL;/**< Scoring parameters */
1652  BlastExtensionParameters* ext_params = NULL;/**< Gapped extension
1653  parameters */
1654  BlastHitSavingParameters* hit_params = NULL;/**< Hit saving parameters */
1655  BlastEffectiveLengthsParameters* eff_len_params = NULL; /**< Parameters
1656  for effective lengths calculations */
1657  BlastGapAlignStruct* gap_align = NULL; /**< Gapped alignment structure */
1658 
1659  /* Use a local diagnostics structure, because the one passed in an input
1660  argument can be shared between multiple threads, so we don't want to pass
1661  it to the engine and have a lot of mutex contention. */
1662  BlastDiagnostics* local_diagnostics = Blast_DiagnosticsInit();
1663 
1664  if ((status =
1665  BLAST_GapAlignSetUp(program, seq_src, score_options,
1666  eff_len_options, ext_options, hit_options,
1667  query_info, sbp, &score_params, &ext_params,
1668  &hit_params, &eff_len_params, &gap_align)) != 0) {
1669  /* Blast_DiagnosticsUpdate(diagnostics, local_diagnostics); */
1670  Blast_DiagnosticsFree(local_diagnostics);
1671  return status;
1672  }
1673 
1674  if ((status=
1675  BLAST_PreliminarySearchEngine(program, query, query_info,
1676  seq_src, gap_align, score_params,
1677  lookup_wrap, word_options,
1678  ext_params, hit_params, eff_len_params,
1679  psi_options, db_options, hsp_stream,
1680  local_diagnostics, interrupt_search,
1681  progress_info)) != 0) {
1682  gap_align = BLAST_GapAlignStructFree(gap_align);
1683  score_params = BlastScoringParametersFree(score_params);
1684  hit_params = BlastHitSavingParametersFree(hit_params);
1685  ext_params = BlastExtensionParametersFree(ext_params);
1686  eff_len_params = BlastEffectiveLengthsParametersFree(eff_len_params);
1687  /* Blast_DiagnosticsUpdate(diagnostics, local_diagnostics); */
1688  Blast_DiagnosticsFree(local_diagnostics);
1689  return status;
1690  }
1691 
1692  /* Do not destruct score block here */
1693  gap_align->sbp = NULL;
1694  gap_align = BLAST_GapAlignStructFree(gap_align);
1695 
1696  score_params = BlastScoringParametersFree(score_params);
1697  hit_params = BlastHitSavingParametersFree(hit_params);
1698  ext_params = BlastExtensionParametersFree(ext_params);
1699  eff_len_params = BlastEffectiveLengthsParametersFree(eff_len_params);
1700 
1701  /* Now update the input diagonistics structure. */
1702  Blast_DiagnosticsUpdate(diagnostics, local_diagnostics);
1703  Blast_DiagnosticsFree(local_diagnostics);
1704 
1705  return status;
1706 }
1707 
1708 /** Function to deallocate data structures allocated in Blast_RunFullSearch */
1709 static void
1711  BlastScoringParameters* score_params,
1712  BlastExtensionParameters* ext_params,
1713  BlastHitSavingParameters* hit_params,
1714  BlastEffectiveLengthsParameters* eff_len_params)
1715 {
1716  /* Do not destruct score block here */
1717  gap_align->sbp = NULL;
1718  BLAST_GapAlignStructFree(gap_align);
1719 
1720  BlastScoringParametersFree(score_params);
1721  BlastHitSavingParametersFree(hit_params);
1722  BlastExtensionParametersFree(ext_params);
1723  BlastEffectiveLengthsParametersFree(eff_len_params);
1724 }
1725 
1726 Int4
1728  BLAST_SequenceBlk* query, BlastQueryInfo* query_info,
1729  const BlastSeqSrc* seq_src, BlastScoreBlk* sbp,
1730  const BlastScoringOptions* score_options,
1731  LookupTableWrap* lookup_wrap,
1732  const BlastInitialWordOptions* word_options,
1733  const BlastExtensionOptions* ext_options,
1734  const BlastHitSavingOptions* hit_options,
1735  const BlastEffectiveLengthsOptions* eff_len_options,
1736  const PSIBlastOptions* psi_options,
1737  const BlastDatabaseOptions* db_options,
1738  BlastHSPStream* hsp_stream, const BlastRPSInfo* rps_info,
1739  BlastDiagnostics* diagnostics, BlastHSPResults** results,
1740  TInterruptFnPtr interrupt_search,
1741  SBlastProgress* progress_info)
1742 {
1743  Int4 status = 0;
1744  BlastScoringParameters* score_params = NULL;
1745  BlastExtensionParameters* ext_params = NULL;
1746  BlastHitSavingParameters* hit_params = NULL;
1747  BlastEffectiveLengthsParameters* eff_len_params = NULL;
1748  BlastGapAlignStruct* gap_align = NULL;
1749  SPHIPatternSearchBlk* pattern_blk = NULL;
1750 
1751  if ((status =
1752  BLAST_GapAlignSetUp(program_number, seq_src, score_options,
1753  eff_len_options, ext_options, hit_options, query_info, sbp,
1754  &score_params, &ext_params, &hit_params, &eff_len_params,
1755  &gap_align)) != 0) {
1756  s_BlastRunFullSearchCleanUp(gap_align, score_params, ext_params,
1757  hit_params, eff_len_params);
1758  return status;
1759  }
1760 
1761  if ((status=
1762  BLAST_PreliminarySearchEngine(program_number, query, query_info,
1763  seq_src, gap_align, score_params, lookup_wrap, word_options,
1764  ext_params, hit_params, eff_len_params, psi_options,
1765  db_options, hsp_stream, diagnostics, interrupt_search,
1766  progress_info)) != 0) {
1767  s_BlastRunFullSearchCleanUp(gap_align, score_params, ext_params,
1768  hit_params, eff_len_params);
1769  return status;
1770  }
1771 
1772  /* Prohibit any subsequent writing to the HSP stream. */
1773  BlastHSPStreamClose(hsp_stream);
1774 
1775  if (Blast_ProgramIsPhiBlast(program_number)) {
1776  pattern_blk = ((SPHIPatternSearchBlk*) lookup_wrap->lut);
1777  pattern_blk->num_patterns_db =
1778  (Int4)diagnostics->ungapped_stat->lookup_hits;
1779  }
1780 
1781  if ((status =
1782  BLAST_ComputeTraceback(program_number, hsp_stream, query, query_info,
1783  seq_src, gap_align, score_params, ext_params,
1784  hit_params, eff_len_params, db_options,
1785  psi_options, rps_info, pattern_blk, results,
1786  interrupt_search, progress_info))
1787  != 0) {
1788  s_BlastRunFullSearchCleanUp(gap_align, score_params, ext_params,
1789  hit_params, eff_len_params);
1790  return status;
1791  }
1792 
1793  s_BlastRunFullSearchCleanUp(gap_align, score_params, ext_params, hit_params,
1794  eff_len_params);
1795  return status;
1796 }
Protein ungapped extension code.
Int2 BlastAaWordFinder(BLAST_SequenceBlk *subject, BLAST_SequenceBlk *query, BlastQueryInfo *query_info, LookupTableWrap *lookup, Int4 **matrix, const BlastInitialWordParameters *word_params, Blast_ExtendWord *ewp, BlastOffsetPair *offset_pairs, Int4 offset_array_size, BlastInitHitList *init_hitlist, BlastUngappedStats *ungapped_stats)
Scan a subject sequence for word hits (blastp)
Definition: aa_ungapped.c:200
Int2 BlastRPSWordFinder(BLAST_SequenceBlk *subject, BLAST_SequenceBlk *query, BlastQueryInfo *query_info, LookupTableWrap *lookup, Int4 **matrix, const BlastInitialWordParameters *word_params, Blast_ExtendWord *ewp, BlastOffsetPair *offset_pairs, Int4 offset_array_size, BlastInitHitList *init_hitlist, BlastUngappedStats *ungapped_stats)
Scan a subject sequence for word hits (rpsblast and rpstblastn)
Definition: aa_ungapped.c:238
Routines for creating protein BLAST lookup tables.
Routines for creating protein BLAST lookup tables.
void BlastChooseProteinScanSubject(LookupTableWrap *lookup_wrap)
Choose the most appropriate function to scan through protein subject sequences.
Definition: blast_aascan.c:553
@ eHardSubjMasking
Definition: blast_def.h:238
@ eNoSubjMasking
Definition: blast_def.h:236
@ eSoftSubjMasking
Definition: blast_def.h:237
#define COMPRESSION_RATIO
Compression ratio of nucleotide bases (4 bases in 1 byte)
Definition: blast_def.h:83
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
Definition: blast_def.h:112
Boolean(* TInterruptFnPtr)(SBlastProgress *progress_info)
Prototype for function pointer to determine whether the BLAST search should proceed or be interrupted...
Definition: blast_def.h:354
#define CODON_LENGTH
Codons are always of length 3.
Definition: blast_def.h:63
@ ePrelimSearch
Preliminary stage.
Definition: blast_def.h:328
BlastDiagnostics * Blast_DiagnosticsInit(void)
Initialize the BlastDiagnostics structure and all its substructures.
BlastDiagnostics * Blast_DiagnosticsFree(BlastDiagnostics *diagnostics)
Free the BlastDiagnostics structure and all substructures.
void Blast_DiagnosticsUpdate(BlastDiagnostics *diag_global, BlastDiagnostics *diag_local)
In a multi-threaded run, update global diagnostics data with the data coming from one of the prelimin...
static Int2 s_FillReturnCutoffsInfo(BlastRawCutoffs *return_cutoffs, const BlastScoringParameters *score_params, const BlastInitialWordParameters *word_params, const BlastExtensionParameters *ext_params, const BlastHitSavingParameters *hit_params)
Fills the output information about the cutoffs uses in a BLAST search.
Definition: blast_engine.c:928
Int4 Blast_RunFullSearch(EBlastProgramType program_number, BLAST_SequenceBlk *query, BlastQueryInfo *query_info, const BlastSeqSrc *seq_src, BlastScoreBlk *sbp, const BlastScoringOptions *score_options, LookupTableWrap *lookup_wrap, const BlastInitialWordOptions *word_options, const BlastExtensionOptions *ext_options, const BlastHitSavingOptions *hit_options, const BlastEffectiveLengthsOptions *eff_len_options, const PSIBlastOptions *psi_options, const BlastDatabaseOptions *db_options, BlastHSPStream *hsp_stream, const BlastRPSInfo *rps_info, BlastDiagnostics *diagnostics, BlastHSPResults **results, TInterruptFnPtr interrupt_search, SBlastProgress *progress_info)
The high level function performing the BLAST search against a BLAST database after all the setup has ...
struct BlastCoreAuxStruct BlastCoreAuxStruct
Structure to be passed to s_BlastSearchEngineCore, containing pointers to various preallocated struct...
static BlastCoreAuxStruct * s_BlastCoreAuxStructFree(BlastCoreAuxStruct *aux_struct)
Deallocates all memory in BlastCoreAuxStruct.
Definition: blast_engine.c:111
static Int2 s_RPSPreliminarySearchEngine(EBlastProgramType program_number, BLAST_SequenceBlk *query, BlastQueryInfo *query_info, const BlastSeqSrc *seq_src, const BlastScoringParameters *score_params, LookupTableWrap *lookup_wrap, BlastCoreAuxStruct *aux_struct, const BlastInitialWordParameters *word_params, const BlastExtensionParameters *ext_params, BlastGapAlignStruct *gap_align, const BlastHitSavingParameters *hit_params, BlastHSPStream *hsp_stream, BlastDiagnostics *diagnostics, TInterruptFnPtr interrupt_search, SBlastProgress *progress_info)
Performs the preliminary stage of an RPS BLAST search, after all set up has already been done.
const Int2 SUBJECT_SPLIT_OK
return value indicating OK
Definition: blast_engine.c:218
Int2 Blast_RunPreliminarySearch(EBlastProgramType program, BLAST_SequenceBlk *query, BlastQueryInfo *query_info, const BlastSeqSrc *seq_src, const BlastScoringOptions *score_options, BlastScoreBlk *sbp, LookupTableWrap *lookup_wrap, const BlastInitialWordOptions *word_options, const BlastExtensionOptions *ext_options, const BlastHitSavingOptions *hit_options, const BlastEffectiveLengthsOptions *eff_len_options, const PSIBlastOptions *psi_options, const BlastDatabaseOptions *db_options, BlastHSPStream *hsp_stream, BlastDiagnostics *diagnostics)
The high level function performing the BLAST search against a BLAST database after all the setup has ...
static void s_AdjustSubjectForSraSearch(BlastHSPList *hsp_list, Uint1 offset)
const Int2 SUBJECT_SPLIT_DONE
return value indicating hitting the end
Definition: blast_engine.c:217
static void s_AdjustSubjectForTranslatedSraSearch(BlastHSPList *hsp_list, Uint1 offset, Int4 length)
static Int2 s_BlastSearchEngineOneContext(EBlastProgramType program_number, BLAST_SequenceBlk *query, BlastQueryInfo *query_info, BLAST_SequenceBlk *subject, Int4 orig_length, LookupTableWrap *lookup, BlastGapAlignStruct *gap_align, const BlastScoringParameters *score_params, const BlastInitialWordParameters *word_params, const BlastExtensionParameters *ext_params, const BlastHitSavingParameters *hit_params, BlastDiagnostics *diagnostics, BlastCoreAuxStruct *aux_struct, BlastHSPList **hsp_list_out_ptr, TInterruptFnPtr interrupt_search, SBlastProgress *progress_info)
Searches only one context of a database sequence, but does all chunks if it is split.
Definition: blast_engine.c:423
const int kBlastPatchVersion
Patch version.
Definition: blast_engine.c:83
#define CONV_NUCL2PROT_COORDINATES(length)
Converts nucleotide coordinates to protein.
Definition: blast_engine.c:79
static void s_AllocateSeqRange(BLAST_SequenceBlk *subject, SubjectSplitStruct *backup, Int4 num_seq_ranges)
Definition: blast_engine.c:184
static void s_BlastRunFullSearchCleanUp(BlastGapAlignStruct *gap_align, BlastScoringParameters *score_params, BlastExtensionParameters *ext_params, BlastHitSavingParameters *hit_params, BlastEffectiveLengthsParameters *eff_len_params)
Function to deallocate data structures allocated in Blast_RunFullSearch.
static Int2 s_BlastSetUpAuxStructures(const BlastSeqSrc *seq_src, LookupTableWrap *lookup_wrap, const BlastInitialWordParameters *word_params, const BlastExtensionOptions *ext_options, const BlastHitSavingOptions *hit_options, BLAST_SequenceBlk *query, const BlastQueryInfo *query_info, BlastCoreAuxStruct **aux_struct_ptr)
Setup of the auxiliary BLAST structures; also calculates internally used parameters from options.
Definition: blast_engine.c:971
static Int2 s_GetNextSubjectChunk(BLAST_SequenceBlk *subject, SubjectSplitStruct *backup, Boolean is_nucleotide, int chunk_overlap)
Definition: blast_engine.c:221
static Int4 s_GetMinimumSubjSeqLen(LookupTableWrap *lookup_wrap)
static Int2 s_Blast_HSPListReapByPrelimEvalue(BlastHSPList *hsp_list, const BlastHitSavingParameters *hit_params)
Discard the HSPs above the prelim e-value threshold from the HSP list.
Definition: blast_engine.c:643
Int4 BLAST_PreliminarySearchEngine(EBlastProgramType program_number, BLAST_SequenceBlk *query, BlastQueryInfo *query_info, const BlastSeqSrc *seq_src, BlastGapAlignStruct *gap_align, BlastScoringParameters *score_params, LookupTableWrap *lookup_wrap, const BlastInitialWordOptions *word_options, BlastExtensionParameters *ext_params, BlastHitSavingParameters *hit_params, BlastEffectiveLengthsParameters *eff_len_params, const PSIBlastOptions *psi_options, const BlastDatabaseOptions *db_options, BlastHSPStream *hsp_stream, BlastDiagnostics *diagnostics, TInterruptFnPtr interrupt_search, SBlastProgress *progress_info)
Perform the preliminary stage of the BLAST search.
static void s_RPSOffsetArrayToContextOffsets(BlastQueryInfo *info, Int4 *new_offsets)
Set up context offsets for the auxiliary BlastQueryInfo structure that is created for the concatenate...
Definition: blast_engine.c:392
static void s_BlastSearchEngineCoreCleanUp(EBlastProgramType program_number, BlastQueryInfo *query_info, const BlastQueryInfo *query_info_in, Uint1 *translation_buffer, Uint4 *frame_offsets_a)
Clean up function for s_BlastSearchEngineCore.
Definition: blast_engine.c:613
static Int2 s_BlastSearchEngineCore(EBlastProgramType program_number, BLAST_SequenceBlk *query, BlastQueryInfo *query_info_in, BLAST_SequenceBlk *subject, LookupTableWrap *lookup, BlastGapAlignStruct *gap_align, const BlastScoringParameters *score_params, const BlastInitialWordParameters *word_params, const BlastExtensionParameters *ext_params, const BlastHitSavingParameters *hit_params, const BlastDatabaseOptions *db_options, BlastDiagnostics *diagnostics, BlastCoreAuxStruct *aux_struct, BlastHSPList **hsp_list_out_ptr, TInterruptFnPtr interrupt_search, SBlastProgress *progress_info)
The core of the BLAST search: comparison between the (concatenated) query against one subject sequenc...
Definition: blast_engine.c:702
static void s_TranslateHSPsToDNAPCoord(EBlastProgramType program, BlastInitHitList *init_hitlist, const BlastQueryInfo *query_info, Int2 subject_frame, Int4 subject_length, Int4 offset)
Adjust HSP coordinates for out-of-frame gapped extension.
Definition: blast_engine.c:324
const int kBlastMinorVersion
Minor version.
Definition: blast_engine.c:82
static void s_RestoreSubject(BLAST_SequenceBlk *subject, SubjectSplitStruct *backup)
Definition: blast_engine.c:200
static void s_BackupSubject(BLAST_SequenceBlk *subject, SubjectSplitStruct *backup)
Definition: blast_engine.c:145
struct SubjectSplitStruct SubjectSplitStruct
Structure used for subject sequence split.
const int kBlastMajorVersion
Major version.
Definition: blast_engine.c:81
const Int2 SUBJECT_SPLIT_NO_RANGE
return value indicating all masked out
Definition: blast_engine.c:219
Int2 Blast_RunPreliminarySearchWithInterrupt(EBlastProgramType program, BLAST_SequenceBlk *query, BlastQueryInfo *query_info, const BlastSeqSrc *seq_src, const BlastScoringOptions *score_options, BlastScoreBlk *sbp, LookupTableWrap *lookup_wrap, const BlastInitialWordOptions *word_options, const BlastExtensionOptions *ext_options, const BlastHitSavingOptions *hit_options, const BlastEffectiveLengthsOptions *eff_len_options, const PSIBlastOptions *psi_options, const BlastDatabaseOptions *db_options, BlastHSPStream *hsp_stream, BlastDiagnostics *diagnostics, TInterruptFnPtr interrupt_search, SBlastProgress *progress_info)
Same as above, with support for user interrupt function.
Function calls to actually perform a BLAST search (high level).
Int2(* BlastWordFinderType)(BLAST_SequenceBlk *, BLAST_SequenceBlk *, BlastQueryInfo *, LookupTableWrap *, Int4 **, const BlastInitialWordParameters *, Blast_ExtendWord *, BlastOffsetPair *, Int4, BlastInitHitList *, BlastUngappedStats *)
Word finder function pointer type.
Definition: blast_engine.h:226
Int2(* BlastGetGappedScoreType)(EBlastProgramType, BLAST_SequenceBlk *, BlastQueryInfo *, BLAST_SequenceBlk *, BlastGapAlignStruct *, const BlastScoringParameters *, const BlastExtensionParameters *, const BlastHitSavingParameters *, BlastInitHitList *, BlastHSPList **, BlastGappedStats *, Boolean *fence_hit)
Gapped extension function pointer type.
Definition: blast_engine.h:211
Int2(* JumperGappedType)(BLAST_SequenceBlk *, BLAST_SequenceBlk *, BlastQueryInfo *, LookupTableWrap *, const BlastInitialWordParameters *, const BlastScoringParameters *, const BlastHitSavingParameters *, BlastOffsetPair *offset_pairs, MapperWordHits *mapper_wordhits, Int4, BlastGapAlignStruct *, BlastInitHitList *, BlastHSPList **, BlastUngappedStats *, BlastGappedStats *)
Short read mapper function pointer type.
Definition: blast_engine.h:240
#define NCBI_XBLAST_EXPORT
NULL operations for other cases.
Definition: blast_export.h:65
void BlastInitHitListReset(BlastInitHitList *init_hitlist)
Free the ungapped data substructures and reset initial HSP count to 0.
Definition: blast_extend.c:229
BlastInitHitList * BLAST_InitHitListNew(void)
Allocate memory for the BlastInitHitList structure.
Definition: blast_extend.c:216
void Blast_InitHitListSortByScore(BlastInitHitList *init_hitlist)
Sort array of initial HSPs by score.
Definition: blast_extend.c:306
Blast_ExtendWord * BlastExtendWordFree(Blast_ExtendWord *ewp)
Deallocate memory for the word extension structure.
Definition: blast_extend.c:203
BlastInitHitList * BLAST_InitHitListFree(BlastInitHitList *init_hitlist)
Free memory for the BlastInitList structure.
Definition: blast_extend.c:261
Int2 BlastExtendWordNew(Uint4 query_length, const BlastInitialWordParameters *word_params, Blast_ExtendWord **ewp_ptr)
Initializes the word extension structure.
Definition: blast_extend.c:110
Structures and functions prototypes used for BLAST gapped extension.
#define MAX_DBSEQ_LEN
Split subject sequences if longer than this.
Int2 BLAST_GetUngappedHSPList(BlastInitHitList *init_hitlist, BlastQueryInfo *query_info, BLAST_SequenceBlk *subject, const BlastHitSavingOptions *hit_options, BlastHSPList **hsp_list_ptr)
Convert initial HSP list to an HSP list: to be used in ungapped search.
Int2 BLAST_GetGappedScore(EBlastProgramType program_number, BLAST_SequenceBlk *query, BlastQueryInfo *query_info, BLAST_SequenceBlk *subject, BlastGapAlignStruct *gap_align, const BlastScoringParameters *score_params, const BlastExtensionParameters *ext_params, const BlastHitSavingParameters *hit_params, BlastInitHitList *init_hitlist, BlastHSPList **hsp_list_ptr, BlastGappedStats *gapped_stats, Boolean *fence_hit)
Performs gapped extension for all non-Mega BLAST programs, given that ungapped extension has been don...
BlastGapAlignStruct * BLAST_GapAlignStructFree(BlastGapAlignStruct *gap_align)
Deallocates memory in the BlastGapAlignStruct structure.
Private interface for blast_gapalign.c.
void RPSPsiMatrixAttach(BlastScoreBlk *sbp, Int4 **rps_pssm, Int4 alphabet_size)
Modify a BlastScoreBlk structure so that it can be used in RPS-BLAST.
void RPSPsiMatrixDetach(BlastScoreBlk *sbp)
Remove the artificially built SPsiBlastScoreMatrix structure allocated by RPSPsiMatrixAttach.
Int2 Blast_HSPListReapByRawScore(BlastHSPList *hsp_list, const BlastHitSavingOptions *hit_options)
Discard the HSPs above the raw threshold from the HSP list.
Definition: blast_hits.c:2076
Int2 Blast_HSPListAppend(BlastHSPList **old_hsp_list_ptr, BlastHSPList **combined_hsp_list_ptr, Int4 hsp_num_max)
Append one HSP list to the other.
Definition: blast_hits.c:2807
Int4 BlastHspNumMax(Boolean gapped_calculation, const BlastHitSavingOptions *options)
Calculated the number of HSPs that should be saved.
Definition: blast_hits.c:213
Int4 Blast_HSPListSubjectBestHit(EBlastProgramType program, const BlastHSPSubjectBestHitOptions *subject_besthit_opts, const BlastQueryInfo *query_info, BlastHSPList *hsp_list)
Definition: blast_hits.c:2536
Int4 Blast_HSPListPurgeHSPsWithCommonEndpoints(EBlastProgramType program, BlastHSPList *hsp_list, Boolean purge)
Check for an overlap of two different alignments and remove redundant HSPs.
Definition: blast_hits.c:2455
#define DBSEQ_CHUNK_OVERLAP
By how much should the chunks of a subject sequence overlap if it is too long and has to be split.
Definition: blast_hits.h:192
Int2 Blast_HSPListsMerge(BlastHSPList **hsp_list, BlastHSPList **combined_hsp_list_ptr, Int4 hsp_num_max, Int4 *split_points, Int4 contexts_per_query, Int4 chunk_overlap_size, Boolean allow_gap, Boolean short_reads)
Merge an HSP list from a chunk of the subject sequence into a previously computed HSP list.
Definition: blast_hits.c:2855
Int2 Blast_HSPListGetEvalues(EBlastProgramType program_number, const BlastQueryInfo *query_info, Int4 subject_length, BlastHSPList *hsp_list, Boolean gapped_calculation, Boolean RPS_prelim, const BlastScoreBlk *sbp, double gap_decay_rate, double scaling_factor)
Calculate the expected values for all HSPs in a hit list, without using the sum statistics.
Definition: blast_hits.c:1811
Int2 Blast_HSPListReevaluateUngapped(EBlastProgramType program, BlastHSPList *hsp_list, BLAST_SequenceBlk *query_blk, BLAST_SequenceBlk *subject_blk, const BlastInitialWordParameters *word_params, const BlastHitSavingParameters *hit_params, const BlastQueryInfo *query_info, BlastScoreBlk *sbp, const BlastScoringParameters *score_params, const BlastSeqSrc *seq_src, const Uint1 *gen_code_string)
Reevaluate all ungapped HSPs in an HSP list.
Definition: blast_hits.c:2607
void Blast_HSPListAdjustOddBlastnScores(BlastHSPList *hsp_list, Boolean gapped_calculation, const BlastScoreBlk *sbp)
For nucleotide BLAST, if the match reward score is equal to 2, random alignments are dominated by run...
Definition: blast_hits.c:3051
Int2 Blast_HSPListReapByQueryCoverage(BlastHSPList *hsp_list, const BlastHitSavingOptions *hit_options, const BlastQueryInfo *query_info, EBlastProgramType program_number)
Discard the HSPs below the min query coverage pct from the HSP list.
Definition: blast_hits.c:2010
BlastHSP * Blast_HSPFree(BlastHSP *hsp)
Deallocate memory for an HSP structure.
Definition: blast_hits.c:130
BlastHSPList * Blast_HSPListFree(BlastHSPList *hsp_list)
Deallocate memory for an HSP list structure as well as all it's components.
Definition: blast_hits.c:1542
void Blast_HSPListSortByScore(BlastHSPList *hsp_list)
Sort the HSPs in an HSP list by score.
Definition: blast_hits.c:1374
Int2 Blast_HSPListGetBitScores(BlastHSPList *hsp_list, Boolean gapped_calculation, const BlastScoreBlk *sbp)
Calculate bit scores from raw scores in an HSP list.
Definition: blast_hits.c:1907
void Blast_HSPListAdjustOffsets(BlastHSPList *hsp_list, Int4 offset)
Adjust subject offsets in an HSP list if only part of the subject sequence was searched.
Definition: blast_hits.c:3035
void BlastHSPStreamClose(BlastHSPStream *hsp_stream)
Closes the BlastHSPStream structure for writing.
int BlastHSPStreamWrite(BlastHSPStream *hsp_stream, BlastHSPList **hsp_list)
Invokes the user-specified write function for this BlastHSPStream implementation.
#define BLASTERR_INTERRUPTED
BLAST search was interrupted via a user-provided callback.
#define BLASTERR_SEQSRC
Blast seqsrc returns BLAST_SEQSRC_ERROR.
Routines for creating nucleotide BLAST lookup tables.
Routines for scanning nucleotide BLAST lookup tables.
void BlastChooseNucleotideScanSubject(LookupTableWrap *lookup_wrap)
Choose the most appropriate function to scan through nucleotide subject sequences.
@ eJumperWithTraceback
Jumper extension (mapping)
@ eSmithWatermanScoreOnly
Score-only smith-waterman.
@ eIndexedMBLookupTable
use database index as a lookup structure
@ ePhiNaLookupTable
nucleotide lookup table for phi-blast
@ eAaLookupTable
standard protein (blastp) lookup table
@ eCompressedAaLookupTable
compressed alphabet (blastp) lookup table
@ ePhiLookupTable
protein lookup table specialized for phi-blast
@ eRPSLookupTable
RPS lookup table (rpsblast and rpstblastn)
Structure and function definitions for BLAST parameter structures, which are internal to the CORE of ...
BlastHitSavingParameters * BlastHitSavingParametersFree(BlastHitSavingParameters *parameters)
Deallocate memory for BlastHitSavingOptions*.
BlastEffectiveLengthsParameters * BlastEffectiveLengthsParametersFree(BlastEffectiveLengthsParameters *parameters)
Deallocate memory for BlastEffectiveLengthsParameters*.
BlastInitialWordParameters * BlastInitialWordParametersFree(BlastInitialWordParameters *parameters)
Deallocate memory for BlastInitialWordParameters.
BlastExtensionParameters * BlastExtensionParametersFree(BlastExtensionParameters *parameters)
Deallocate memory for BlastExtensionParameters.
Int2 BlastInitialWordParametersNew(EBlastProgramType program_number, const BlastInitialWordOptions *word_options, const BlastHitSavingParameters *hit_params, const LookupTableWrap *lookup_wrap, const BlastScoreBlk *sbp, BlastQueryInfo *query_info, Uint4 subject_length, BlastInitialWordParameters **parameters)
Allocate memory for BlastInitialWordParameters and set x_dropoff.
Int2 BlastLinkHSPParametersUpdate(const BlastInitialWordParameters *word_params, const BlastHitSavingParameters *hit_params, Boolean gapped_calculation)
Update BlastLinkHSPParameters, using calculated values of other parameters.
void CalculateLinkHSPCutoffs(EBlastProgramType program, BlastQueryInfo *query_info, const BlastScoreBlk *sbp, BlastLinkHSPParameters *link_hsp_params, const BlastInitialWordParameters *word_params, Int8 db_length, Int4 subject_length)
Calculates cutoff scores and returns them.
BlastScoringParameters * BlastScoringParametersFree(BlastScoringParameters *parameters)
Deallocate memory for BlastScoringParameters.
Boolean Blast_ProgramIsMapping(EBlastProgramType p)
Definition: blast_program.c:76
Boolean Blast_ProgramIsPhiBlast(EBlastProgramType p)
Returns true if program is PHI-BLAST (i.e.
Definition: blast_program.c:70
Boolean Blast_ProgramIsNucleotide(EBlastProgramType p)
Definition: blast_program.c:82
Boolean Blast_ProgramIsRpsBlast(EBlastProgramType p)
Returns true if program is RPS-BLAST (i.e.
Definition: blast_program.c:73
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm.
Definition: blast_program.h:72
@ eBlastTypeBlastx
Definition: blast_program.h:75
@ eBlastTypeRpsTblastn
Definition: blast_program.h:85
@ eBlastTypeMapping
Definition: blast_program.h:88
@ eBlastTypeRpsBlast
Definition: blast_program.h:84
@ eBlastTypeBlastp
Definition: blast_program.h:73
Boolean Blast_SubjectIsTranslated(EBlastProgramType p)
Returns true if the subject is translated.
Definition: blast_program.c:63
BlastQueryInfo * BlastQueryInfoFree(BlastQueryInfo *query_info)
Deallocate memory for query information structure.
Int2 Blast_GetOneQueryStructs(BlastQueryInfo **one_query_info_ptr, BLAST_SequenceBlk **one_query_ptr, const BlastQueryInfo *query_info, BLAST_SequenceBlk *query, Int4 query_index)
Create auxiliary query structures with all data corresponding to a single query sequence within a con...
Int4 BSearchContextInfo(Int4 n, const BlastQueryInfo *A)
Search BlastContextInfo structures for the specified offset.
Int4 * ContextOffsetsToOffsetArray(const BlastQueryInfo *info)
Copy the context query offsets to an allocated array of Int4.
void OffsetArrayToContextOffsets(BlastQueryInfo *info, Int4 *new_offsets, EBlastProgramType prog)
Copy the context query offsets from an array of Int4, allocating the context array if needed.
BlastQueryInfo * BlastQueryInfoNew(EBlastProgramType program, int num_queries)
Allocate memory for query information structure.
#define BLAST_SEQSRC_ERROR
Error while retrieving sequence.
Definition: blast_seqsrc.h:291
Int4 BlastSeqSrcIteratorNext(const BlastSeqSrc *seq_src, BlastSeqSrcIterator *itr)
Increments the BlastSeqSrcIterator.
Definition: blast_seqsrc.c:425
BlastSeqSrcIterator * BlastSeqSrcIteratorFree(BlastSeqSrcIterator *itr)
Frees the BlastSeqSrcIterator structure.
Definition: blast_seqsrc.c:412
BlastSeqSrcIterator * BlastSeqSrcIteratorNewEx(unsigned int chunk_sz)
Allocate and initialize an iterator over a BlastSeqSrc.
Definition: blast_seqsrc.c:387
void BlastSeqSrcReleaseSequence(const BlastSeqSrc *seq_src, BlastSeqSrcGetSeqArg *getseq_arg)
Deallocate individual sequence.
Definition: blast_seqsrc.c:289
Int4 BlastSeqSrcGetAvgSeqLen(const BlastSeqSrc *seq_src)
Get the average length of all sequences in the sequence source.
Definition: blast_seqsrc.c:211
Int4 BlastSeqSrcGetNumSeqs(const BlastSeqSrc *seq_src)
Get the number of sequences contained in the sequence source.
Definition: blast_seqsrc.c:177
Int8 BlastSeqSrcGetTotLen(const BlastSeqSrc *seq_src)
Get the total length of all sequences in the sequence source.
Definition: blast_seqsrc.c:219
Int2 BlastSeqSrcGetSequence(const BlastSeqSrc *seq_src, BlastSeqSrcGetSeqArg *getseq_arg)
Retrieve an individual sequence.
Definition: blast_seqsrc.c:271
#define BLAST_SEQSRC_EOF
No more sequences available.
Definition: blast_seqsrc.h:292
Utilities initialize/setup BLAST.
Int2 BLAST_OneSubjectUpdateParameters(EBlastProgramType program_number, Uint4 subject_length, const BlastScoringOptions *scoring_options, BlastQueryInfo *query_info, const BlastScoreBlk *sbp, BlastHitSavingParameters *hit_params, BlastInitialWordParameters *word_params, BlastEffectiveLengthsParameters *eff_len_params)
Recalculates the parameters that depend on an individual sequence, if this is not a database search.
Definition: blast_setup.c:1001
Int2 BLAST_GapAlignSetUp(EBlastProgramType program_number, const BlastSeqSrc *seq_src, const BlastScoringOptions *scoring_options, const BlastEffectiveLengthsOptions *eff_len_options, const BlastExtensionOptions *ext_options, const BlastHitSavingOptions *hit_options, BlastQueryInfo *query_info, BlastScoreBlk *sbp, BlastScoringParameters **score_params, BlastExtensionParameters **ext_params, BlastHitSavingParameters **hit_params, BlastEffectiveLengthsParameters **eff_len_params, BlastGapAlignStruct **gap_align)
Set up the auxiliary structures for gapped alignment / traceback only.
Definition: blast_setup.c:888
Smith-Waterman alignment for use within the infrastructure of BLAST.
Functions to do gapped alignment with traceback.
Int2 BLAST_ComputeTraceback(EBlastProgramType program_number, BlastHSPStream *hsp_stream, BLAST_SequenceBlk *query, BlastQueryInfo *query_info, const BlastSeqSrc *seq_src, BlastGapAlignStruct *gap_align, BlastScoringParameters *score_params, const BlastExtensionParameters *ext_params, BlastHitSavingParameters *hit_params, BlastEffectiveLengthsParameters *eff_len_params, const BlastDatabaseOptions *db_options, const PSIBlastOptions *psi_options, const BlastRPSInfo *rps_info, SPHIPatternSearchBlk *pattern_blk, BlastHSPResults **results, TInterruptFnPtr interrupt_search, SBlastProgress *progress_info)
Given the preliminary alignment results from a database search, redo the gapped alignment with traceb...
Various auxiliary BLAST utility functions.
BLAST_SequenceBlk * BlastSequenceBlkFree(BLAST_SequenceBlk *seq_blk)
Deallocate memory for a sequence block.
Definition: blast_util.c:245
Int1 BLAST_ContextToFrame(EBlastProgramType prog_number, Uint4 context_number)
This function translates the context number of a context into the frame of the sequence.
Definition: blast_util.c:839
Int2 BLAST_GetAllTranslations(const Uint1 *nucl_seq, EBlastEncoding encoding, Int4 nucl_length, const Uint1 *genetic_code, Uint1 **translation_buffer_ptr, Uint4 **frame_offsets_ptr, Uint1 **mixed_seq_ptr)
Translate nucleotide into 6 frames.
Definition: blast_util.c:1045
static int lookup(const char *name, const struct lookup_int *table)
Definition: attributes.c:50
int offset
Definition: replacements.h:160
Defines the interface to interact with the genetic code singleton object.
Uint1 * GenCodeSingletonFind(Uint4 gen_code_id)
Returns the genetic code string for the requested genetic code id.
Int2 BLAST_SmithWatermanGetGappedScore(EBlastProgramType program_number, BLAST_SequenceBlk *query, BlastQueryInfo *query_info, BLAST_SequenceBlk *subject, BlastGapAlignStruct *gap_align, const BlastScoringParameters *score_params, const BlastExtensionParameters *ext_params, const BlastHitSavingParameters *hit_params, BlastInitHitList *init_hitlist, BlastHSPList **hsp_list_ptr, BlastGappedStats *gapped_stats, Boolean *fence_hit)
Performs score-only Smith-Waterman gapped alignment of the subject sequence with all contexts in the ...
Definition: blast_sw.c:629
@ eBlastEncodingProtein
NCBIstdaa.
@ eBlastEncodingNcbi2na
NCBI2na.
#define NULL
Definition: ncbistd.hpp:225
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
Int2 DoAnchoredSearch(BLAST_SequenceBlk *query, BLAST_SequenceBlk *subject, Int4 word_size, BlastQueryInfo *query_info, BlastGapAlignStruct *gap_align, const BlastScoringParameters *score_params, const BlastHitSavingParameters *hit_params, BlastHSPStream *hsp_stream)
Do a search against a single subject with smaller word size and with no database word frequency filte...
Definition: jumper.c:4394
int i
if(yy_accept[yy_current_state])
int len
Int4 GetOffsetArraySize(LookupTableWrap *lookup)
Determine the size of the offsets arrays to be filled by the ScanSubject function.
Definition: lookup_wrap.c:255
Declarations for functions that extract hits from indexed blast databases (specialized for megablast)
#define LAST_VOL_IDX_INIT
int(* T_MB_IdbCheckOid)(Int4 oid, Int4 *last_vol_id)
Function pointer type to check index seeds availability for oid.
void(* T_MB_IdxEndSearchIndication)(Int4 last_vol_id)
Function pointer type to indicate end of preliminary search by the thread to the indexing library.
@ eNoResults
Int2 MB_IndexedWordFinder(BLAST_SequenceBlk *subject, BLAST_SequenceBlk *query, BlastQueryInfo *query_info, LookupTableWrap *lookup_wrap, Int4 **matrix, const BlastInitialWordParameters *word_params, Blast_ExtendWord *ewp, BlastOffsetPair *offset_pairs, Int4 max_hits, BlastInitHitList *init_hitlist, BlastUngappedStats *ungapped_stats)
Finds all runs of a specified number of exact matches between two nucleotide sequences.
Definition: na_ungapped.c:1697
static MDB_envinfo info
Definition: mdb_load.c:37
Nucleotide ungapped extension code.
void BlastChooseNaExtend(LookupTableWrap *lookup_wrap)
Choose the best routine to use for creating ungapped alignments.
Definition: na_ungapped.c:1791
Int2 BlastNaWordFinder(BLAST_SequenceBlk *subject, BLAST_SequenceBlk *query, BlastQueryInfo *query_info, LookupTableWrap *lookup_wrap, Int4 **matrix, const BlastInitialWordParameters *word_params, Blast_ExtendWord *ewp, BlastOffsetPair *offset_pairs, Int4 max_hits, BlastInitHitList *init_hitlist, BlastUngappedStats *ungapped_stats)
Find all words for a given subject sequence and perform ungapped extensions, assuming ordinary blastn...
Definition: na_ungapped.c:1597
Int2 JumperNaWordFinder(BLAST_SequenceBlk *subject, BLAST_SequenceBlk *query, BlastQueryInfo *query_info, LookupTableWrap *lookup_wrap, const BlastInitialWordParameters *word_params, const BlastScoringParameters *score_params, const BlastHitSavingParameters *hit_params, BlastOffsetPair *offset_pairs, MapperWordHits *word_hits, Int4 max_hits, BlastGapAlignStruct *gap_align, BlastInitHitList *init_hitlist, BlastHSPList **hsp_list_ptr, BlastUngappedStats *ungapped_stats, BlastGappedStats *gapped_stats)
Definition: na_ungapped.c:1930
MapperWordHits * MapperWordHitsFree(MapperWordHits *wh)
Definition: na_ungapped.c:1839
MapperWordHits * MapperWordHitsNew(const BLAST_SequenceBlk *query, const BlastQueryInfo *query_info)
Definition: na_ungapped.c:1867
Int2 ShortRead_IndexedWordFinder(BLAST_SequenceBlk *subject, BLAST_SequenceBlk *query, BlastQueryInfo *query_info, LookupTableWrap *lookup_wrap, const BlastInitialWordParameters *word_params, const BlastScoringParameters *score_params, const BlastHitSavingParameters *hit_params, BlastOffsetPair *offset_pairs, MapperWordHits *word_hits, Int4 max_hits, BlastGapAlignStruct *gap_align, BlastInitHitList *init_hitlist, BlastHSPList **hsp_list, BlastUngappedStats *ungapped_stats, BlastGappedStats *gapped_stats)
Definition: na_ungapped.c:2162
#define INT4_MAX
largest nubmer represented by signed int
Definition: ncbi_std.h:141
Uint1 Boolean
bool replacment for C
Definition: ncbi_std.h:94
#define TRUE
bool replacment for C indicating true.
Definition: ncbi_std.h:97
#define FALSE
bool replacment for C indicating false.
Definition: ncbi_std.h:101
#define ASSERT
macro for assert.
Definition: ncbi_std.h:107
#define INT4_MIN
Smallest (most negative) number represented by signed int.
Definition: ncbi_std.h:146
#define MAX(a, b)
returns larger of a and b.
Definition: ncbi_std.h:117
#define PHI_MAX_HIT
Maximal size of an array of pattern hits.
Definition: pattern.h:57
Word finder for PHI-BLAST.
Int2 PHIBlastWordFinder(BLAST_SequenceBlk *subject, BLAST_SequenceBlk *query, BlastQueryInfo *query_info, LookupTableWrap *lookup_wrap, Int4 **matrix, const BlastInitialWordParameters *word_params, Blast_ExtendWord *ewp, BlastOffsetPair *offset_pairs, Int4 max_hits, BlastInitHitList *init_hitlist, BlastUngappedStats *ungapped_stats)
WordFinder type function for PHI BLAST.
Definition: phi_extend.c:53
Function prototypes used for PHI BLAST gapped extension and gapped extension with traceback.
Int2 PHIGetGappedScore(EBlastProgramType program_number, BLAST_SequenceBlk *query, BlastQueryInfo *query_info, BLAST_SequenceBlk *subject, BlastGapAlignStruct *gap_align, const BlastScoringParameters *score_params, const BlastExtensionParameters *ext_params, const BlastHitSavingParameters *hit_params, BlastInitHitList *init_hitlist, BlastHSPList **hsp_list_ptr, BlastGappedStats *gapped_stats, Boolean *fence_hit)
Preliminary gapped alignment for PHI BLAST.
Definition: phi_gapalign.c:739
Pseudo lookup table structure and database scanning functions used in PHI-BLAST.
Structure to hold a sequence.
Definition: blast_def.h:242
Int4 length
Length of sequence.
Definition: blast_def.h:246
Uint1 * gen_code_string
for nucleotide subject sequences (tblast[nx]), the genetic code used to create a translated protein s...
Definition: blast_def.h:272
The basic lookup table structure for blastp searches.
Int4 word_length
Length in letters of the full word match required to trigger extension.
The lookup table structure for protein searches using a compressed alphabet.
Int4 word_length
Length in letters of the full word match required to trigger extension.
The context related information.
Int4 query_offset
Offset of this query, strand or frame in the concatenated super-query.
Structure to be passed to s_BlastSearchEngineCore, containing pointers to various preallocated struct...
Definition: blast_engine.c:87
BlastWordFinderType WordFinder
Word finder function pointer.
Definition: blast_engine.c:91
Uint1 * translation_table_rc
Translation table for reverse strand.
Definition: blast_engine.c:105
MapperWordHits * mapper_wordhits
Definition: blast_engine.c:100
Uint1 * translation_table
Translation table for forward strand.
Definition: blast_engine.c:104
BlastGetGappedScoreType GetGappedScore
Gapped extension function pointer.
Definition: blast_engine.c:92
BlastOffsetPair * offset_pairs
Array of offset pairs for initial seeds.
Definition: blast_engine.c:99
JumperGappedType JumperGapped
Word finder and gapped extension for mapping short reads.
Definition: blast_engine.c:94
Uint1 * translation_buffer
Placeholder for translated subject sequences.
Definition: blast_engine.c:102
BlastInitHitList * init_hitlist
Placeholder for HSPs after ungapped extension.
Definition: blast_engine.c:97
Blast_ExtendWord * ewp
Structure for keeping track of diagonal information for initial word matches.
Definition: blast_engine.c:89
Options used to create the ReadDBFILE structure Include database name and various information for res...
Int4 genetic_code
Genetic code to use for translation, tblast[nx] only.
Return statistics from the BLAST search.
BlastUngappedStats * ungapped_stat
Ungapped extension counts.
BlastRawCutoffs * cutoffs
Various raw values for the cutoffs.
BlastGappedStats * gapped_stat
Gapped extension counts.
Options for setting up effective lengths and search spaces.
Parameters for setting up effective lengths and search spaces.
Options used for gapped extension These include: a.
EBlastPrelimGapExt ePrelimGapExt
type of preliminary gapped extension (normally) for calculating score.
Computed values used as parameters for gapped alignments.
BlastExtensionOptions * options
The original (unparsed) options.
Int4 gap_x_dropoff_final
X-dropoff value for the final gapped extension (raw)
Int4 gap_x_dropoff
X-dropoff value for gapped extension (raw)
Structure supporting the gapped alignment.
Boolean positionBased
Is this PSI-BLAST?
BlastScoreBlk * sbp
Pointer to the scoring information block.
Structure containing hit counts from the gapped stage of a BLAST search.
Int4 good_extensions
Number of HSPs below the e-value threshold after gapped extension.
Int4 num_seqs_passed
Number of sequences with top HSP passing the e-value threshold.
BlastHSPSubjectBestHitOptions * subject_besthit_opts
Subject Culling.
The structure to hold all HSPs for a given sequence after the gapped alignment.
Definition: blast_hits.h:153
Int4 oid
The ordinal id of the subject sequence this HSP list is for.
Definition: blast_hits.h:154
Int4 hspcnt
Number of HSPs saved.
Definition: blast_hits.h:158
BlastHSP ** hsp_array
Array of pointers to individual HSPs.
Definition: blast_hits.h:157
Int4 query_index
Index of the query which this HSPList corresponds to.
Definition: blast_hits.h:155
The structure to contain all BLAST results, for multiple queries.
Definition: blast_hits.h:183
BlastHitList ** hitlist_array
Array of results for individual query sequences.
Definition: blast_hits.h:185
Int4 num_queries
Number of query sequences.
Definition: blast_hits.h:184
Default implementation of BlastHSPStream.
BlastHSPResults * results
Structure for saving HSP lists.
Structure holding all information about an HSP.
Definition: blast_hits.h:126
double evalue
This HSP's e-value.
Definition: blast_hits.h:130
BlastSeg query
Query sequence info.
Definition: blast_hits.h:131
BlastSeg subject
Subject sequence info.
Definition: blast_hits.h:132
Int4 low_score
The lowest of the best scores among the HSP lists.
Definition: blast_hits.h:174
Boolean heapified
Is this hit list already heapified?
Definition: blast_hits.h:175
Options used when evaluating and saving hits These include: a.
double low_score_perc
Low-score option.
BlastHSPFilteringOptions * hsp_filt_opt
Contains options to configure the HSP filtering/writering structures If not set, the default HSP filt...
Parameter block that contains a pointer to BlastHitSavingOptions and the values derived from it.
Int4 cutoff_score_min
smallest cutoff score across all contexts
Int4 * low_score
lowest ungapped score that can trigger a gapped alignment if the histlist is already full.
double prelim_evalue
evalue for preliminary search (may be higher for CBS).
BlastLinkHSPParameters * link_hsp_params
Parameters for linking HSPs with sum statistics; linking is not done if NULL.
BlastHitSavingOptions * options
The original (unparsed) options.
Structure to hold the initial HSP information.
Definition: blast_extend.h:150
BlastUngappedData * ungapped_data
Pointer to a structure holding ungapped alignment information.
Definition: blast_extend.h:153
BlastOffsetPair offsets
Offsets in query and subject, or, in PHI BLAST, start and end of pattern in subject.
Definition: blast_extend.h:151
Structure to hold all initial HSPs for a given subject sequence.
Definition: blast_extend.h:158
Int4 total
Total number of hits currently saved.
Definition: blast_extend.h:159
BlastInitHSP * init_hsp_array
Array of offset pairs, possibly with scores.
Definition: blast_extend.h:161
Options needed for initial word finding and processing.
Parameter block that contains a pointer to BlastInitialWordOptions and the values derived from it.
Int4 cutoff_score_min
smallest cutoff score across all contexts
Int4 x_dropoff_max
largest X-drop cutoff across all contexts
The query related information.
BlastContextInfo * contexts
Information per context.
int num_queries
Number of query sequences.
Uint4 max_length
Length of the longest among the concatenated queries.
The RPS engine uses this structure to access all of the RPS blast related data (assumed to be collect...
Definition: blast_rps.h:120
The basic lookup table structure for RPS blast searches.
Int4 num_profiles
Number of profiles in RPS database.
Int4 * rps_seq_offsets
array of start offsets for each RPS DB seq.
Structure holding raw cutoff and gap-x-drop values.
Int4 ungapped_cutoff
Minimal raw score for starting gapped extension.
Int4 x_drop_gap_final
Raw value of the x-dropoff for gapped extensions with traceback.
Int4 x_drop_gap
Raw value of the x-dropoff for preliminary gapped extensions.
Int4 x_drop_ungapped
Raw value of the x-dropoff for ungapped extensions.
Int4 cutoff_score
Cutoff score corresponding to given evalue.
Structure used for scoring calculations.
Definition: blast_stat.h:177
Boolean matrix_only_scoring
Score ungapped/gapped alignment only using the matrix parameters and with raw scores.
Definition: blast_stat.h:189
SPsiBlastScoreMatrix * psi_matrix
PSSM and associated data.
Definition: blast_stat.h:186
SBlastScoreMatrix * matrix
scoring matrix data
Definition: blast_stat.h:185
Blast_GumbelBlk * gbp
Gumbel parameters for FSC.
Definition: blast_stat.h:209
Scoring options block Used to produce the BlastScoreBlk structure This structure may be needed for lo...
Boolean gapped_calculation
gap-free search if FALSE
Boolean is_ooframe
Should out-of-frame gapping be used in a translated search?
Scoring parameters block Contains scoring-related information that is actually used for the blast sea...
double scale_factor
multiplier for all cutoff scores
BlastScoringOptions * options
User-provided values for these params.
Int4 end
End of hsp.
Definition: blast_hits.h:99
Int4 gapped_start
Where the gapped extension started.
Definition: blast_hits.h:100
Int2 frame
Translation frame.
Definition: blast_hits.h:97
Int4 offset
Start of hsp.
Definition: blast_hits.h:98
Structure used as the second argument to functions satisfying the GetSeqBlkFnPtr signature,...
Definition: blast_seqsrc.h:257
Int4 oid
Oid in BLAST database, index in an array of sequences, etc [in].
Definition: blast_seqsrc.h:259
EBlastEncoding encoding
Encoding of sequence, i.e.
Definition: blast_seqsrc.h:263
BLAST_SequenceBlk * seq
Sequence to return, if NULL, it should allocated by GetSeqBlkFnPtr (using BlastSeqBlkNew or BlastSetU...
Definition: blast_seqsrc.h:284
Complete type definition of Blast Sequence Source Iterator.
Complete type definition of Blast Sequence Source ADT.
Definition: blast_seqsrc.c:43
Int4 q_start
Start of the ungapped alignment in query.
Definition: blast_extend.h:143
Int4 s_start
Start of the ungapped alignment in subject.
Definition: blast_extend.h:144
Structure containing hit counts from the ungapped stage of a BLAST search.
Int8 lookup_hits
Number of successful lookup table hits.
Structure for keeping initial word extension information.
Definition: blast_extend.h:109
Wrapper structure for different types of BLAST lookup tables.
Definition: lookup_wrap.h:50
void * lut
Pointer to the actual lookup table structure.
Definition: lookup_wrap.h:52
void * end_search_indication
function used to report that a thread is done iterating over the database in preliminary search
Definition: lookup_wrap.h:57
void * check_index_oid
function used to check if seeds for a given oid are present
Definition: lookup_wrap.h:55
ELookupTableType lut_type
What kind of a lookup table it is?
Definition: lookup_wrap.h:51
void * read_indexed_db
function used to retrieve hits from an indexed database
Definition: lookup_wrap.h:53
Options used in protein BLAST only (PSI, PHI, RPS and translated BLAST) Some of these possibly should...
Progress monitoring structure.
Definition: blast_def.h:341
EBlastStage stage
Stage of the BLAST search currently in progress.
Definition: blast_def.h:342
int ** data
actual scoring matrix data, stored in row-major form
Definition: blast_stat.h:140
Structure containing all auxiliary information needed in a pattern search.
Definition: pattern.h:155
Int4 num_patterns_db
Number of patterns actually found during the database search.
Definition: pattern.h:168
SBlastScoreMatrix * pssm
position-specific score matrix
Definition: blast_stat.h:150
A structure containing two integers, used e.g.
Definition: blast_def.h:155
Int4 left
left endpoint of range (zero based)
Definition: blast_def.h:156
Int4 right
right endpoint of range (zero based)
Definition: blast_def.h:157
Structure used for subject sequence split.
Definition: blast_engine.c:123
Int4 num_soft_ranges
number of soft masking ranges
Definition: blast_engine.c:137
SSeqRange full_range
full sequence range
Definition: blast_engine.c:126
SSeqRange * hard_ranges
sequence ranges for hard masking
Definition: blast_engine.c:132
Uint1 * sequence
backup of original sequence
Definition: blast_engine.c:125
Int4 offset
the offset of current chunk
Definition: blast_engine.c:140
Int4 num_seq_ranges
backup of original number of items in seq_ranges
Definition: blast_engine.c:129
Int4 hm_index
the current hard masking range index
Definition: blast_engine.c:134
Int4 sm_index
the current soft masking range index
Definition: blast_engine.c:138
Int4 allocated
number of seq_range allocated for subject
Definition: blast_engine.c:130
Int4 num_hard_ranges
number of hard masking ranges
Definition: blast_engine.c:133
SSeqRange * soft_ranges
sequence ranges for soft masking
Definition: blast_engine.c:136
SSeqRange * seq_ranges
backup of original sequence range
Definition: blast_engine.c:128
Int4 next
the offset of next chunk
Definition: blast_engine.c:141
static string subject
static string query
This symbol enables the verbose option in makeblastdb and other BLAST+ search command line applicatio...
Definition: blast_def.h:141
Uint4 q_off
Query offset.
Definition: blast_def.h:143
Uint4 s_off
Subject offset.
Definition: blast_def.h:144
struct BlastOffsetPair::@6 qs_offsets
Query/subject offset pair.
static CS_CONTEXT * context
Definition: will_convert.c:21
voidp malloc(uInt size)
voidp calloc(uInt items, uInt size)
Modified on Wed May 29 18:40:52 2024 by modify_doxy.py rev. 669887