NCBI C++ ToolKit
blast_setup.c
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blast_setup.c 100585 2023-08-14 14:05:41Z fongah2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Tom Madden
27  *
28  */
29 
30 /** @file blast_setup.c
31  * Utilities initialize/setup BLAST.
32  */
33 
34 
38 
39 /* See description in blast_setup.h */
40 Int2
42  const BlastScoringOptions * scoring_options,
43  EBlastProgramType program,
44  const BlastQueryInfo * query_info,
45  Blast_Message** error_return)
46 {
47  Int4 index = 0;
48  Int2 retval = 0;
49 
50  if (sbp == NULL || scoring_options == NULL) {
52  return 1;
53  }
54 
55  /* Fill values for gumbel parameters*/
56  if (program == eBlastTypeBlastn) {
57  /* TODO gumbel parameters are not supported for nucleotide search yet
58  retval =
59  Blast_KarlinBlkNuclGappedCalc(sbp->kbp_gap_std[index],
60  scoring_options->gap_open, scoring_options->gap_extend,
61  scoring_options->reward, scoring_options->penalty,
62  sbp->kbp_std[index], &(sbp->round_down), error_return); */
63  } else if (sbp->gbp) {
64  retval = Blast_GumbelBlkCalc(sbp->gbp,
65  scoring_options->gap_open, scoring_options->gap_extend,
66  sbp->name, error_return);
67  }
68  if (retval) return retval;
69 
70  /* Allocate and fill values for a gapped Karlin block, given the scoring
71  options, then copy it for all the query contexts, as long as they're
72  contexts that will be searched (i.e.: valid) */
73  for (index = query_info->first_context;
74  index <= query_info->last_context; index++) {
75 
76  if ( !query_info->contexts[index].is_valid ) {
77  continue;
78  }
79 
80  sbp->kbp_gap_std[index] = Blast_KarlinBlkNew();
81 
82  /* At this stage query sequences are nucleotide only for blastn */
83  if (program == eBlastTypeBlastn) {
84  /* If reward/penalty are both zero the calling program is
85  * indicating that a matrix must be used to score both the
86  * ungapped and gapped alignments. If this is the case
87  * set reward/penalty to allowed values so that extraneous
88  * KA stats can be performed without error. -RMH-
89  */
90  if ( scoring_options->reward == 0 && scoring_options->penalty == 0 )
91  {
92  retval =
94  scoring_options->gap_open, scoring_options->gap_extend,
96  sbp->kbp_std[index], &(sbp->round_down), error_return);
97  }else {
98  retval =
100  scoring_options->gap_open, scoring_options->gap_extend,
101  scoring_options->reward, scoring_options->penalty,
102  sbp->kbp_std[index], &(sbp->round_down), error_return);
103  }
104  } else {
105  retval =
107  scoring_options->gap_open, scoring_options->gap_extend,
108  sbp->name, error_return);
109  }
110  if (retval) {
111  return retval;
112  }
113 
114  /* For right now, copy the contents from kbp_gap_std to
115  * kbp_gap_psi (as in old code - BLASTSetUpSearchInternalByLoc) */
116  if (program != eBlastTypeBlastn && program != eBlastTypeMapping) {
117  sbp->kbp_gap_psi[index] = Blast_KarlinBlkNew();
118  Blast_KarlinBlkCopy(sbp->kbp_gap_psi[index],
119  sbp->kbp_gap_std[index]);
120  }
121  }
122 
123  /* Set gapped Blast_KarlinBlk* alias */
124  sbp->kbp_gap = Blast_QueryIsPssm(program) ?
125  sbp->kbp_gap_psi : sbp->kbp_gap_std;
126 
127  return 0;
128 }
129 
130 /** Fills a scoring block structure for a PHI BLAST search.
131  * @param sbp Scoring block structure [in] [out]
132  * @param options Scoring options structure [in]
133  * @param blast_message Structure for reporting errors [out]
134  * @param get_path callback function for matrix path [in]
135  */
136 static Int2
138  Blast_Message** blast_message, GET_MATRIX_PATH get_path)
139 {
140  Blast_KarlinBlk* kbp;
141  char buffer[1024];
142  Int2 status = 0;
143  int index;
144 
145  sbp->read_in_matrix = TRUE;
146  if ((status = Blast_ScoreBlkMatrixFill(sbp, get_path)) != 0)
147  return status;
148  kbp = sbp->kbp_gap_std[0] = Blast_KarlinBlkNew();
149  /* Point both non-allocated Karlin block arrays to kbp_gap_std. */
150  sbp->kbp_gap = sbp->kbp_gap_std;
151 
152  /* For PHI BLAST, the H value is not used, but it is not allowed to be 0,
153  so set it to 1. */
154  kbp->H = 1.0;
155 
156  /* This is populated so that the checks for valid contexts don't fail,
157  * note that this field is not used at all during a PHI-BLAST search */
158  sbp->sfp[0] = Blast_ScoreFreqNew(sbp->loscore, sbp->hiscore);
159 
160  /* Ideal Karlin block is filled unconditionally. */
161  status = Blast_ScoreBlkKbpIdealCalc(sbp);
162  if (status)
163  return status;
164 
165  if (0 == strcmp("BLOSUM62", options->matrix)) {
166  kbp->paramC = 0.50;
167  if ((11 == options->gap_open) && (1 == options->gap_extend)) {
168  kbp->Lambda = 0.270;
169  kbp->K = 0.047;
170  } else if ((9 == options->gap_open) && (2 == options->gap_extend)) {
171  kbp->Lambda = 0.285;
172  kbp->K = 0.075;
173  } else if ((8 == options->gap_open) && (2 == options->gap_extend)) {
174  kbp->Lambda = 0.265;
175  kbp->K = 0.046;
176  } else if ((7 == options->gap_open) && (2 == options->gap_extend)) {
177  kbp->Lambda = 0.243;
178  kbp->K = 0.032;
179  } else if ((12 == options->gap_open) && (1 == options->gap_extend)) {
180  kbp->Lambda = 0.281;
181  kbp->K = 0.057;
182  } else if ((10 == options->gap_open) && (1 == options->gap_extend)) {
183  kbp->Lambda = 0.250;
184  kbp->K = 0.033;
185  } else {
186  status = -1;
187  }
188  } else if (0 == strcmp("PAM30", options->matrix)) {
189  kbp->paramC = 0.30;
190  if ((9 == options->gap_open) && (1 == options->gap_extend)) {
191  kbp->Lambda = 0.295;
192  kbp->K = 0.13;
193  } else if ((7 == options->gap_open) && (2 == options->gap_extend)) {
194  kbp->Lambda = 0.306;
195  kbp->K = 0.15;
196  } else if ((6 == options->gap_open) && (2 == options->gap_extend)) {
197  kbp->Lambda = 0.292;
198  kbp->K = 0.13;
199  } else if ((5 == options->gap_open) && (2 == options->gap_extend)) {
200  kbp->Lambda = 0.263;
201  kbp->K = 0.077;
202  } else if ((10 == options->gap_open) && (1 == options->gap_extend)) {
203  kbp->Lambda = 0.309;
204  kbp->K = 0.15;
205  } else if ((8 == options->gap_open) && (1 == options->gap_extend)) {
206  kbp->Lambda = 0.270;
207  kbp->K = 0.070;
208  } else {
209  status = -1;
210  }
211  } else if (0 == strcmp("PAM70", options->matrix)) {
212  kbp->paramC = 0.35;
213  if ((10 == options->gap_open) && (1 == options->gap_extend)) {
214  kbp->Lambda = 0.291;
215  kbp->K = 0.089;
216  } else if ((8 == options->gap_open) && (2 == options->gap_extend)) {
217  kbp->Lambda = 0.303;
218  kbp->K = 0.13;
219  } else if ((7 == options->gap_open) && (2 == options->gap_extend)) {
220  kbp->Lambda = 0.287;
221  kbp->K = 0.095;
222  } else if ((6 == options->gap_open) && (2 == options->gap_extend)) {
223  kbp->Lambda = 0.269;
224  kbp->K = 0.079;
225  } else if ((11 == options->gap_open) && (1 == options->gap_extend)) {
226  kbp->Lambda = 0.307;
227  kbp->K = 0.13;
228  } else if ((9 == options->gap_open) && (1 == options->gap_extend)) {
229  kbp->Lambda = 0.269;
230  kbp->K = 0.058;
231  } else {
232  status = -1;
233  }
234  } else if (0 == strcmp("BLOSUM80", options->matrix)) {
235  kbp->paramC = 0.40;
236  if ((10 == options->gap_open) && (1 == options->gap_extend)) {
237  kbp->Lambda = 0.300;
238  kbp->K = 0.072;
239  } else if ((8 == options->gap_open) && (2 == options->gap_extend)) {
240  kbp->Lambda = 0.308;
241  kbp->K = 0.089;
242  } else if ((7 == options->gap_open) && (2 == options->gap_extend)) {
243  kbp->Lambda = 0.295;
244  kbp->K = 0.077;
245  } else if ((6 == options->gap_open) && (2 == options->gap_extend)) {
246  kbp->Lambda = 0.271;
247  kbp->K = 0.051;
248  } else if ((11 == options->gap_open) && (1 == options->gap_extend)) {
249  kbp->Lambda = 0.314;
250  kbp->K = 0.096;
251  } else if ((9 == options->gap_open) && (1 == options->gap_extend)) {
252  kbp->Lambda = 0.277;
253  kbp->K = 0.046;
254  } else {
255  status = -1;
256  }
257  } else if (0 == strcmp("BLOSUM45", options->matrix)) {
258  kbp->paramC = 0.60;
259  if ((14 == options->gap_open) && (2 == options->gap_extend)) {
260  kbp->Lambda = 0.199;
261  kbp->K = 0.040;
262  } else if ((13 == options->gap_open) && (3 == options->gap_extend)) {
263  kbp->Lambda = 0.209;
264  kbp->K = 0.057;
265  } else if ((12 == options->gap_open) && (3 == options->gap_extend)) {
266  kbp->Lambda = 0.203;
267  kbp->K = 0.049;
268  } else if ((11 == options->gap_open) && (3 == options->gap_extend)) {
269  kbp->Lambda = 0.193;
270  kbp->K = 0.037;
271  } else if ((10 == options->gap_open) && (3 == options->gap_extend)) {
272  kbp->Lambda = 0.182;
273  kbp->K = 0.029;
274  } else if ((15 == options->gap_open) && (2 == options->gap_extend)) {
275  kbp->Lambda = 0.206;
276  kbp->K = 0.049;
277  } else if ((13 == options->gap_open) && (2 == options->gap_extend)) {
278  kbp->Lambda = 0.190;
279  kbp->K = 0.032;
280  } else if ((12 == options->gap_open) && (2 == options->gap_extend)) {
281  kbp->Lambda = 0.177;
282  kbp->K = 0.023;
283  } else if ((19 == options->gap_open) && (1 == options->gap_extend)) {
284  kbp->Lambda = 0.209;
285  kbp->K = 0.049;
286  } else if ((18 == options->gap_open) && (1 == options->gap_extend)) {
287  kbp->Lambda = 0.202;
288  kbp->K = 0.041;
289  } else if ((17 == options->gap_open) && (1 == options->gap_extend)) {
290  kbp->Lambda = 0.195;
291  kbp->K = 0.034;
292  } else if ((16 == options->gap_open) && (1 == options->gap_extend)) {
293  kbp->Lambda = 0.183;
294  kbp->K = 0.024;
295  } else {
296  status = -1;
297  }
298  } else {
299  status = -2;
300  }
301 
302  if (status == -1) {
303  sprintf(buffer, "The combination %d for gap opening cost and %d for "
304  "gap extension is not supported in PHI-BLAST with matrix %s\n",
305  options->gap_open, options->gap_extend, options->matrix);
306  } else if (status == -2) {
307  sprintf(buffer, "Matrix %s not allowed in PHI-BLAST\n", options->matrix);
308  }
309  if (status)
311  else {
312 
313  /* fill in the rest of kbp_gap_std */
314  for(index=1;index<sbp->number_of_contexts;index++)
315  sbp->kbp_gap_std[index] = (Blast_KarlinBlk*)
316  BlastMemDup(sbp->kbp_gap_std[0], sizeof(Blast_KarlinBlk));
317 
318  /* copy kbp_gap_std to kbp_std */
319  for(index=0;index<sbp->number_of_contexts;index++)
320  sbp->kbp_std[index] = (Blast_KarlinBlk*)
321  BlastMemDup(sbp->kbp_gap_std[0], sizeof(Blast_KarlinBlk));
322 
323  sbp->kbp = sbp->kbp_std;
324  }
325 
326  return status;
327 }
328 
329 Int2
331  const BlastScoringOptions* scoring_options,
332  BlastScoreBlk* sbp,
333  GET_MATRIX_PATH get_path)
334 {
335  Int2 status = 0;
336 
337  if ( !sbp || !scoring_options ) {
338  return 1;
339  }
340 
341  /* Matrix only scoring is used to disable the greedy extension
342  optimisations which avoid use of a full-matrix. This is
343  currently only turned on in RMBlastN -RMH- */
344  sbp->matrix_only_scoring = FALSE;
345 
346  if (program_number == eBlastTypeBlastn ||
347  program_number == eBlastTypeMapping) {
348 
349  BLAST_ScoreSetAmbigRes(sbp, 'N');
350  BLAST_ScoreSetAmbigRes(sbp, '-');
351 
352  /* If reward/penalty are both zero the calling program is
353  * indicating that a matrix must be used to score both the
354  * ungapped and gapped alignments. Set the new
355  * matrix_only_scoring. For now reset reward/penalty to
356  * allowed blastn values so that extraneous KA stats can be
357  * performed without error. -RMH-
358  */
359  if ( scoring_options->penalty == 0 && scoring_options->reward == 0 )
360  {
361  sbp->matrix_only_scoring = TRUE;
362  sbp->penalty = BLAST_PENALTY;
363  sbp->reward = BLAST_REWARD;
364  }else {
365  sbp->penalty = scoring_options->penalty;
366  sbp->reward = scoring_options->reward;
367  }
368 
369  if (scoring_options->matrix && *scoring_options->matrix != NULLB) {
370 
371  sbp->read_in_matrix = TRUE;
372  sbp->name = strdup(scoring_options->matrix);
373 
374  } else {
375  char buffer[50];
376  sbp->read_in_matrix = FALSE;
377  sprintf(buffer, "blastn matrix:%ld %ld",
378  (long) sbp->reward, (long) sbp->penalty);
379  sbp->name = strdup(buffer);
380  }
381 
382  } else {
383  sbp->read_in_matrix = TRUE;
384  BLAST_ScoreSetAmbigRes(sbp, 'X');
385  sbp->name = BLAST_StrToUpper(scoring_options->matrix);
386  }
387  status = Blast_ScoreBlkMatrixFill(sbp, get_path);
388  if (status) {
389  return status;
390  }
391 
392  return status;
393 }
394 
395 static Int2
397  Blast_Message** error_return)
398 {
399  Int4 context;
400  Blast_KarlinBlk* kbp;
401  Int2 status;
402 
403  /* Create ungapped block */
404  status = Blast_ScoreBlkKbpIdealCalc(sbp);
405  if (status) {
406  return status;
407  }
408 
409  for (context = query_info->first_context;
410  context <= query_info->last_context; ++context) {
411 
412  if (!query_info->contexts[context].is_valid) {
413  continue;
414  }
415 
416  sbp->sfp[context] = NULL;
419  }
420  sbp->kbp = sbp->kbp_std;
421 
422  /* Create gapped block */
423  context = query_info->first_context;
424  while (!query_info->contexts[context].is_valid) {
425  context++;
426  }
427 
428  sbp->kbp_gap_std[context] = kbp = Blast_KarlinBlkNew();
431  BLAST_REWARD,
433  sbp->kbp_std[context],
434  &(sbp->round_down),
435  error_return);
436  if (status){
437  return status;
438  }
439 
440  for (++context;context <= query_info->last_context; ++context) {
441 
442  if (!query_info->contexts[context].is_valid) {
443  continue;
444  }
445 
448  }
449  sbp->kbp_gap = sbp->kbp_gap_std;
450 
451  return status;
452 }
453 
454 
455 Int2
457  const BlastQueryInfo* query_info,
458  const BlastScoringOptions* scoring_options,
459  EBlastProgramType program_number,
460  BlastScoreBlk* *sbpp,
461  double scale_factor,
462  Blast_Message* *blast_message,
463  GET_MATRIX_PATH get_path)
464 {
465  BlastScoreBlk* sbp;
466  Int2 status=0; /* return value. */
467  ASSERT(blast_message);
468 
469  if (sbpp == NULL)
470  return 1;
471 
472  if (program_number == eBlastTypeBlastn ||
473  program_number == eBlastTypeMapping) {
474 
475  sbp = BlastScoreBlkNew(BLASTNA_SEQ_CODE, query_info->last_context + 1);
476  /* disable new FSC rules for nucleotide case for now */
477  if (sbp && sbp->gbp) {
478  sfree(sbp->gbp);
479  sbp->gbp = NULL;
480  }
481  } else {
482  sbp = BlastScoreBlkNew(BLASTAA_SEQ_CODE, query_info->last_context + 1);
483  }
484 
485  if (!sbp) {
486  Blast_PerrorWithLocation(blast_message, BLASTERR_MEMORY, -1);
487  return 1;
488  }
489 
490  *sbpp = sbp;
491  sbp->scale_factor = scale_factor;
492 
493  /* Flag to indicate if we are using cross_match-like complexity
494  adjustments on the raw scores. RMBlastN is the currently
495  the only program using this flag. -RMH- */
497 
498  status = Blast_ScoreBlkMatrixInit(program_number, scoring_options, sbp, get_path);
499  if (status) {
500  Blast_PerrorWithLocation(blast_message, status, kBlastMessageNoContext);
501  return status;
502  }
503 
504  /* Fills in block for gapped blast. */
505  if (Blast_ProgramIsPhiBlast(program_number)) {
506  status = s_PHIScoreBlkFill(sbp, scoring_options, blast_message, get_path);
507  } else if (Blast_ProgramIsMapping(program_number)) {
508  /* Create fake score blocks for each query without computing base
509  frequencies. We do not compute e-values for mapping, so the
510  KA statistics are not needed, but the data structures are checked
511  and used in BLAST engine. */
512  status = s_JumperScoreBlkFill(sbp, query_info, blast_message);
513  } else {
514  status = Blast_ScoreBlkKbpUngappedCalc(program_number, sbp, query_blk->sequence,
515  query_info, blast_message);
516 
517  if (scoring_options->gapped_calculation) {
518  status =
519  Blast_ScoreBlkKbpGappedCalc(sbp, scoring_options, program_number,
520  query_info, blast_message);
521  } else {
522  ASSERT(sbp->kbp_gap == NULL);
523  /* for ungapped cases we do not have gbp filled */
524  if (sbp->gbp) {
525  sfree(sbp->gbp);
526  sbp->gbp=NULL;
527  }
528  }
529  }
530 
531  return status;
532 }
533 
534 Int2
536  const BlastScoreBlk* score_blk)
537 {
538  int index;
539  Boolean valid_context_found = FALSE;
540  ASSERT(query_info);
541 
542  for (index = query_info->first_context;
543  index <= query_info->last_context;
544  index++) {
545  if (query_info->contexts[index].is_valid) {
546  valid_context_found = TRUE;
547  } else if (score_blk) {
548  ASSERT(score_blk->kbp[index] == NULL);
549  ASSERT(score_blk->sfp[index] == NULL);
550  if (score_blk->kbp_gap) {
551  ASSERT(score_blk->kbp_gap[index] == NULL);
552  }
553  }
554  }
555 
556  if (valid_context_found) {
557  return 0;
558  } else {
559  return 1;
560  }
561 }
562 
564  const QuerySetUpOptions *qsup_options,
565  const BlastScoringOptions *scoring_options,
566  BLAST_SequenceBlk *query_blk,
567  const BlastQueryInfo *query_info,
568  double scale_factor,
569  BlastSeqLoc **lookup_segments,
570  BlastMaskLoc **mask,
571  BlastScoreBlk **sbpp,
572  Blast_Message **blast_message,
573  GET_MATRIX_PATH get_path)
574 {
575  Boolean mask_at_hash = FALSE; /* mask only for making lookup table? */
576  Int2 status = 0; /* return value */
577  BlastMaskLoc *filter_maskloc = NULL; /* Local variable for mask locs. */
578 
579  SBlastFilterOptions* filter_options = qsup_options->filtering_options;
580  Boolean filter_options_allocated = FALSE;
581 
582  ASSERT(blast_message);
583 
584  if (mask)
585  *mask = NULL;
586 
587  if (filter_options == NULL && qsup_options->filter_string)
588  {
589  status = BlastFilteringOptionsFromString(program_number,
590  qsup_options->filter_string,
591  &filter_options,
592  blast_message);
593  if (status) {
594  filter_options = SBlastFilterOptionsFree(filter_options);
595  return status;
596  }
597  filter_options_allocated = TRUE;
598  }
599  ASSERT(filter_options);
600 
601  status = BlastSetUp_GetFilteringLocations(query_blk,
602  query_info,
603  program_number,
604  filter_options,
605  & filter_maskloc,
606  blast_message);
607 
608  if (status) {
609  if (filter_options_allocated)
610  filter_options = SBlastFilterOptionsFree(filter_options);
611  return status;
612  }
613 
614  mask_at_hash = SBlastFilterOptionsMaskAtHash(filter_options);
615 
616  if (filter_options_allocated) {
617  filter_options = SBlastFilterOptionsFree(filter_options);
618  }
619 
620 
621  if (!mask_at_hash) {
622  BlastSetUp_MaskQuery(query_blk, query_info, filter_maskloc,
623  program_number);
624  }
625 
626  if (program_number == eBlastTypeBlastx && scoring_options->is_ooframe) {
627  BLAST_CreateMixedFrameDNATranslation(query_blk, query_info);
628  }
629 
630  /* Find complement of the mask locations, for which lookup table will be
631  * created. This should only be done if we do want to create a lookup table,
632  * i.e. if it is a full search, not a traceback-only search.
633  */
634  if (lookup_segments) {
635  BLAST_ComplementMaskLocations(program_number, query_info,
636  filter_maskloc, lookup_segments);
637  }
638 
639  if (mask)
640  {
641  if (Blast_QueryIsTranslated(program_number)) {
642  /* Filter locations so far are in protein coordinates;
643  convert them back to nucleotide here. */
644  BlastMaskLocProteinToDNA(filter_maskloc, query_info);
645  }
646  *mask = filter_maskloc;
647  filter_maskloc = NULL;
648  }
649  else
650  filter_maskloc = BlastMaskLocFree(filter_maskloc);
651 
652  status = BlastSetup_ScoreBlkInit(query_blk, query_info, scoring_options,
653  program_number, sbpp, scale_factor,
654  blast_message, get_path);
655  if (status) {
656  return status;
657  }
658 
659  if ( (status = BlastSetup_Validate(query_info, *sbpp) != 0)) {
660  if (*blast_message == NULL) {
661  Blast_PerrorWithLocation(blast_message, status, kBlastMessageNoContext);
662  }
663  return 1;
664  }
665 
666  return status;
667 }
668 
669 /** Return the search space appropriate for a given context from
670  * a list of tabulated search spaces
671  * @param eff_len_options Container for search spaces [in]
672  * @param context_index Identifier for the search space to return [in]
673  * @param blast_message List of messages, to receive possible warnings [in][out]
674  * @return The selected search space
675  */
677  const BlastEffectiveLengthsOptions* eff_len_options,
678  int context_index, Blast_Message **blast_message)
679 {
680  Int8 retval = 0;
681 
682  if (eff_len_options->num_searchspaces == 0) {
683  retval = 0;
684  } else if (eff_len_options->num_searchspaces == 1) {
685  if (context_index != 0) {
686  Blast_MessageWrite(blast_message, eBlastSevWarning, context_index,
687  "One search space is being used for multiple sequences");
688  }
689  retval = eff_len_options->searchsp_eff[0];
690  } else if (eff_len_options->num_searchspaces > 1) {
691  ASSERT(context_index < eff_len_options->num_searchspaces);
692  retval = eff_len_options->searchsp_eff[context_index];
693  } else {
694  abort(); /* should never happen */
695  }
696  return retval;
697 }
698 
700  const BlastScoringOptions* scoring_options,
701  const BlastEffectiveLengthsParameters* eff_len_params,
702  const BlastScoreBlk* sbp, BlastQueryInfo* query_info,
703  Blast_Message * *blast_message)
704 {
705  double alpha=0, beta=0; /*alpha and beta for new scoring system */
706  Int4 index; /* loop index. */
707  Int4 db_num_seqs; /* number of sequences in database. */
708  Int8 db_length; /* total length of database. */
709  Blast_KarlinBlk* *kbp_ptr; /* Array of Karlin block pointers */
710  const BlastEffectiveLengthsOptions* eff_len_options = eff_len_params->options;
711 
712  if (!query_info || !sbp)
713  return -1;
714 
715 
716  /* use overriding value from effective lengths options or the real value
717  from effective lengths parameters. */
718  if (eff_len_options->db_length > 0)
719  db_length = eff_len_options->db_length;
720  else
721  db_length = eff_len_params->real_db_length;
722 
723  /* If database (subject) length is not available at this stage, and
724  * overriding value of effective search space is not provided by user,
725  * do nothing.
726  * This situation can occur in the initial set up for a non-database search,
727  * where each subject is treated as an individual database.
728  */
729  if (db_length == 0 &&
731  return 0;
732  }
733 
734  if (Blast_SubjectIsTranslated(program_number))
735  db_length = db_length/3;
736 
737  if (eff_len_options->dbseq_num > 0)
738  db_num_seqs = eff_len_options->dbseq_num;
739  else
740  db_num_seqs = eff_len_params->real_num_seqs;
741 
742  /* Mapping does not need length correction */
743  if (Blast_ProgramIsMapping(program_number)) {
744  for (index = query_info->first_context;
745  index <= query_info->last_context;
746  index++) {
747  query_info->contexts[index].eff_searchsp = db_length;
748  }
749 
750  return 0;
751  }
752 
753  /* PHI BLAST search space calculation is different. */
754  if (Blast_ProgramIsPhiBlast(program_number))
755  {
756  for (index = query_info->first_context;
757  index <= query_info->last_context;
758  index++) {
759  Int8 effective_search_space = db_length - (db_num_seqs*(query_info->contexts[index].length_adjustment));
760  query_info->contexts[index].eff_searchsp = effective_search_space;
761  }
762 
763  return 0;
764  }
765 
766  /* N.B.: the old code used kbp_gap_std instead of the kbp_gap alias (which
767  * could be kbp_gap_psi), hence we duplicate that behavior here */
768  kbp_ptr = (scoring_options->gapped_calculation ? sbp->kbp_gap_std : sbp->kbp);
769 
770  for (index = query_info->first_context;
771  index <= query_info->last_context;
772  index++) {
773  Blast_KarlinBlk *kbp; /* statistical parameters for the current context */
774  Int4 length_adjustment = 0; /* length adjustment for current iteration. */
775  Int4 query_length; /* length of an individual query sequence */
776 
777  /* Effective search space for a given sequence/strand/frame */
778  Int8 effective_search_space =
779  s_GetEffectiveSearchSpaceForContext(eff_len_options, index,
780  blast_message);
781 
782  kbp = kbp_ptr[index];
783 
784  if (query_info->contexts[index].is_valid &&
785  ((query_length = query_info->contexts[index].query_length) > 0) ) {
786 
787  /* Use the correct Karlin block. For blastn, two identical Karlin
788  * blocks are allocated for each sequence (one per strand), but we
789  * only need one of them.
790  */
791  if (program_number == eBlastTypeBlastn) {
792  /* Setting reward and penalty to zero is being used to indicate
793  * that matrix scoring should be used for ungapped and gapped
794  * alignment. For now reward/penalty are being reset to the
795  * default blastn values to not disturb the KA calcs -RMH- */
796  if ( scoring_options->reward == 0 && scoring_options->penalty == 0 )
797  {
800  scoring_options->gap_open,
801  scoring_options->gap_extend,
802  sbp->kbp_std[index],
803  scoring_options->gapped_calculation,
804  &alpha, &beta);
805  }else {
806  Blast_GetNuclAlphaBeta(scoring_options->reward,
807  scoring_options->penalty,
808  scoring_options->gap_open,
809  scoring_options->gap_extend,
810  sbp->kbp_std[index],
811  scoring_options->gapped_calculation,
812  &alpha, &beta);
813  }
814  } else {
815  BLAST_GetAlphaBeta(sbp->name, &alpha, &beta,
816  scoring_options->gapped_calculation,
817  scoring_options->gap_open,
818  scoring_options->gap_extend,
819  sbp->kbp_std[index]);
820  }
822  alpha/kbp->Lambda, beta,
823  query_length, db_length,
824  db_num_seqs, &length_adjustment);
825 
826  if (effective_search_space == 0) {
827 
828  /* if the database length was specified, do not
829  adjust it when calculating the search space;
830  it's counter-intuitive to specify a value and
831  not have that value be used */
832  /* Changing this rule for now sicne cutoff score depends
833  * on the effective seach space length. SB-902
834  */
835 
836  Int8 effective_db_length = db_length - ((Int8)db_num_seqs * length_adjustment);
837 
838  // Just in case effective_db_length < 0
839  if (effective_db_length <= 0)
840  effective_db_length = 1;
841 
842  effective_search_space = effective_db_length *
843  (query_length - length_adjustment);
844  }
845  }
846  query_info->contexts[index].eff_searchsp = effective_search_space;
847  query_info->contexts[index].length_adjustment = length_adjustment;
848  }
849  return 0;
850 }
851 
852 void
854  Int8* total_length,
855  Int4* num_seqs)
856 {
857  ASSERT(total_length && num_seqs);
858 
859  *total_length = -1;
860  *num_seqs = -1;
861 
862  if ( !seqsrc ) {
863  return;
864  }
865 
866  *total_length = BlastSeqSrcGetTotLenStats(seqsrc);
867  if (*total_length <= 0)
868  *total_length = BlastSeqSrcGetTotLen(seqsrc);
869 
870  if (*total_length > 0) {
871  *num_seqs = BlastSeqSrcGetNumSeqsStats(seqsrc);
872  if (*num_seqs <= 0)
873  *num_seqs = BlastSeqSrcGetNumSeqs(seqsrc);
874  } else {
875  /* Not a database search; each subject sequence is considered
876  individually */
877  Int4 oid = 0; /* Get length of first sequence. */
878  if ( (*total_length = BlastSeqSrcGetSeqLen(seqsrc, (void*) &oid)) < 0) {
879  *total_length = -1;
880  *num_seqs = -1;
881  return;
882  }
883  *num_seqs = 1;
884  }
885 }
886 
887 Int2
889  const BlastSeqSrc* seq_src,
890  const BlastScoringOptions* scoring_options,
891  const BlastEffectiveLengthsOptions* eff_len_options,
892  const BlastExtensionOptions* ext_options,
893  const BlastHitSavingOptions* hit_options,
894  BlastQueryInfo* query_info,
895  BlastScoreBlk* sbp,
896  BlastScoringParameters** score_params,
897  BlastExtensionParameters** ext_params,
898  BlastHitSavingParameters** hit_params,
899  BlastEffectiveLengthsParameters** eff_len_params,
900  BlastGapAlignStruct** gap_align)
901 {
902  Int2 status = 0;
903  Uint4 max_subject_length;
904  Uint4 min_subject_length;
905  Int8 total_length = -1;
906  Int4 num_seqs = -1;
907 
908  if (seq_src) {
909  total_length = BlastSeqSrcGetTotLenStats(seq_src);
910  if (total_length <= 0)
911  total_length = BlastSeqSrcGetTotLen(seq_src);
912 
913  /* Set the database length for new FSC */
914  if (sbp->gbp) {
915  Int8 dbl = total_length;
916  /* Override the database length or set one (e.g., blast2seq). */
917  if (eff_len_options->db_length) {
918  dbl = eff_len_options->db_length;
919  }
920  sbp->gbp->db_length =
921  (Blast_SubjectIsTranslated(program_number))?
922  dbl/3 : dbl;
923  }
924 
925  if (total_length > 0) {
926  num_seqs = BlastSeqSrcGetNumSeqsStats(seq_src);
927  if (num_seqs <= 0)
928  num_seqs = BlastSeqSrcGetNumSeqs(seq_src);
929  } else {
930  /* Not a database search; each subject sequence is considered
931  individually */
932  Int4 oid = 0; /* Get length of first sequence. */
933  if ( (total_length = BlastSeqSrcGetSeqLen(seq_src, (void*) &oid)) < 0) {
934  total_length = -1;
935  num_seqs = -1;
936  }
937  num_seqs = 1;
938  }
939  }
940 
941  /* Initialize the effective length parameters with real values of
942  database length and number of sequences */
943  BlastEffectiveLengthsParametersNew(eff_len_options, total_length, num_seqs,
944  eff_len_params);
945  /* Effective lengths are calculated for all programs except PHI BLAST. */
946  if ((status = BLAST_CalcEffLengths(program_number, scoring_options,
947  *eff_len_params, sbp, query_info, NULL)) != 0)
948  {
949  *eff_len_params = BlastEffectiveLengthsParametersFree(*eff_len_params);
950  return status;
951  }
952 
953  if((status=BlastScoringParametersNew(scoring_options, sbp, score_params)) != 0)
954  {
955  *eff_len_params = BlastEffectiveLengthsParametersFree(*eff_len_params);
956  *score_params = BlastScoringParametersFree(*score_params);
957  return status;
958  }
959 
960  if((status=BlastExtensionParametersNew(program_number, ext_options, sbp,
961  query_info, ext_params)) != 0)
962  {
963  *eff_len_params = BlastEffectiveLengthsParametersFree(*eff_len_params);
964  *score_params = BlastScoringParametersFree(*score_params);
965  *ext_params = BlastExtensionParametersFree(*ext_params);
966  return status;
967  }
968 
969  if (sbp->gbp) {
970  min_subject_length = BlastSeqSrcGetMinSeqLen(seq_src);
971  if (Blast_SubjectIsTranslated(program_number)) {
972  min_subject_length/=3;
973  }
974  } else {
975  min_subject_length = (Int4) (total_length/num_seqs);
976  }
977 
978  if(min_subject_length <=0) {
980  }
981 
982  if ((status = BlastHitSavingParametersNew(program_number, hit_options, sbp,
983  query_info, min_subject_length,
984  (*ext_params)->options->compositionBasedStats,
985  hit_params)) != 0){
986  return status;
987  }
988 
989  /* To initialize the gapped alignment structure, we need to know the
990  maximal subject sequence length */
991  max_subject_length = BlastSeqSrcGetMaxSeqLen(seq_src);
992 
993  if ((status = BLAST_GapAlignStructNew(*score_params, *ext_params,
994  max_subject_length, sbp, gap_align)) != 0) {
995  return status;
996  }
997 
998  return status;
999 }
1000 
1002  Uint4 subject_length,
1003  const BlastScoringOptions* scoring_options,
1004  BlastQueryInfo* query_info,
1005  const BlastScoreBlk* sbp,
1006  BlastHitSavingParameters* hit_params,
1007  BlastInitialWordParameters* word_params,
1008  BlastEffectiveLengthsParameters* eff_len_params)
1009 {
1010  Int2 status = 0;
1011  eff_len_params->real_db_length = subject_length;
1012  if ((status = BLAST_CalcEffLengths(program_number, scoring_options,
1013  eff_len_params, sbp, query_info, NULL)) != 0)
1014  return status;
1015  /* Update cutoff scores in hit saving parameters */
1016  BlastHitSavingParametersUpdate(program_number, sbp, query_info, subject_length, 0, /* FIXME */
1017  hit_params);
1018 
1019  if (word_params) {
1020  /* Update cutoff scores in initial word parameters */
1021  BlastInitialWordParametersUpdate(program_number, hit_params, sbp, query_info,
1022  subject_length, word_params);
1023  /* Update the parameters for linking HSPs, if necessary. */
1024  BlastLinkHSPParametersUpdate(word_params, hit_params, scoring_options->gapped_calculation);
1025  }
1026  return status;
1027 }
1028 
1029 void
1031 {
1032  BlastSeqLoc* head_loc = NULL, *last_loc = NULL, *next_loc, *seqloc;
1033 
1034  to = MAX(to, 0);
1035 
1036  /* If there is no mask, or if both coordinates passed are 0, which indicates
1037  the full sequence, just return - there is nothing to be done. */
1038  if (mask == NULL || *mask == NULL || (from == 0 && to == 0))
1039  return;
1040 
1041  for (seqloc = *mask; seqloc; seqloc = next_loc) {
1042  next_loc = seqloc->next;
1043  seqloc->ssr->left = MAX(0, seqloc->ssr->left - from);
1044  seqloc->ssr->right = MIN(seqloc->ssr->right, to) - from;
1045  /* If this mask location does not intersect the [from,to] interval,
1046  do not add it to the newly constructed list and free its contents. */
1047  if (seqloc->ssr->left > seqloc->ssr->right) {
1048  /* Shift the pointer to the next link in chain and free this link. */
1049  if (last_loc)
1050  last_loc->next = seqloc->next;
1051  seqloc = BlastSeqLocNodeFree(seqloc);
1052  } else if (!head_loc) {
1053  /* First time a mask was found within the range. */
1054  head_loc = last_loc = seqloc;
1055  } else {
1056  /* Append to the previous masks. */
1057  last_loc->next = seqloc;
1058  last_loc = last_loc->next;
1059  }
1060  }
1061  *mask = head_loc;
1062 }
1063 
1064 Int2
1066  const SPHIPatternSearchBlk * pattern_blk,
1067  const BLAST_SequenceBlk * query,
1068  const BlastSeqLoc * lookup_segments,
1069  BlastQueryInfo * query_info,
1070  Blast_Message** blast_message)
1071 {
1072  const Boolean kIsNa = (program == eBlastTypePhiBlastn);
1073  Int4 num_patterns = 0;
1074 
1075  ASSERT(Blast_ProgramIsPhiBlast(program));
1076  ASSERT(query_info && pattern_blk);
1077 
1078  query_info->pattern_info = SPHIQueryInfoNew();
1079 
1080  /* If pattern is not found in query, return failure status. */
1081  num_patterns = PHIGetPatternOccurrences(pattern_blk, query, lookup_segments, kIsNa,
1082  query_info);
1083  if (num_patterns == 0)
1084  {
1085  char buffer[512];
1086  sprintf(buffer, "The pattern %s was not found in the query.", pattern_blk->pattern);
1087  if (blast_message)
1089  return -1;
1090  }
1091  else if (num_patterns == INT4_MAX)
1092  {
1093  char buffer[512];
1094  sprintf(buffer, "The pattern (%s) may not cover the entire query.", pattern_blk->pattern);
1095  if (blast_message)
1097  return -1;
1098  }
1099  else if (num_patterns < 0)
1100  {
1101  return -1;
1102  }
1103 
1104  /* Save pattern probability, because it needs to be passed back to
1105  formatting stage, where lookup table will not be available. */
1106  query_info->pattern_info->probability = pattern_blk->patternProbability;
1107 
1108  /* Also needed for formatting. */
1109  query_info->pattern_info->pattern =
1110  (char*) (char *) BlastMemDup(pattern_blk->pattern, 1+strlen(pattern_blk->pattern));
1111 
1112  /* Save minimal pattern length in the length adjustment field, because
1113  that is essentially its meaning. */
1114  query_info->contexts[0].length_adjustment =
1115  pattern_blk->minPatternMatchLength;
1116 
1117  return 0;
1118 }
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
Definition: blast_def.h:112
BLAST filtering functions.
void BlastSetUp_MaskQuery(BLAST_SequenceBlk *query_blk, const BlastQueryInfo *query_info, const BlastMaskLoc *filter_maskloc, EBlastProgramType program_number)
Masks the sequence given a BlastMaskLoc.
BlastMaskLoc * BlastMaskLocFree(BlastMaskLoc *mask_loc)
Deallocate memory for a BlastMaskLoc structure as well as the BlastSeqLoc's pointed to.
Definition: blast_filter.c:789
Int2 BLAST_ComplementMaskLocations(EBlastProgramType program_number, const BlastQueryInfo *query_info, const BlastMaskLoc *mask_loc, BlastSeqLoc **complement_mask)
This function takes the list of mask locations (i.e., regions that should not be searched or not adde...
BlastSeqLoc * BlastSeqLocNodeFree(BlastSeqLoc *node)
Deallocate a single BlastSeqLoc structure and its contents, without following its next pointer.
Definition: blast_filter.c:727
Int2 BlastSetUp_GetFilteringLocations(BLAST_SequenceBlk *query_blk, const BlastQueryInfo *query_info, EBlastProgramType program_number, const SBlastFilterOptions *filter_options, BlastMaskLoc **filter_out, Blast_Message **blast_message)
Does preparation for filtering and then calls BlastSetUp_Filter.
Int2 BlastFilteringOptionsFromString(EBlastProgramType program_number, const char *instructions, SBlastFilterOptions **filtering_options, Blast_Message **blast_message)
Produces SBlastFilterOptions from a string that has been traditionally supported in blast.
Definition: blast_filter.c:436
Int2 BlastMaskLocProteinToDNA(BlastMaskLoc *mask_loc, const BlastQueryInfo *query_info)
Given a BlastMaskLoc with an array of lists of mask locations per protein frame, recalculates all mas...
Definition: blast_filter.c:892
Int2 BLAST_GapAlignStructNew(const BlastScoringParameters *score_params, const BlastExtensionParameters *ext_params, Uint4 max_subject_length, BlastScoreBlk *sbp, BlastGapAlignStruct **gap_align_ptr)
Initializes the BlastGapAlignStruct structure.
@ eBlastSevWarning
Definition: blast_message.h:57
#define Blast_PerrorWithLocation(msg, error_code, context)
Convenient define to call the function Blast_PerrorEx.
#define BLASTERR_SUBJECT_LENGTH_INVALID
Subject seqs min avg length error.
Int2 Blast_MessageWrite(Blast_Message **blast_msg, EBlastSeverity severity, int context, const char *message)
Writes a message to a structure.
const int kBlastMessageNoContext
Declared in blast_message.h as extern const.
Definition: blast_message.c:36
#define BLASTERR_MEMORY
System error: out of memory condition.
#define BLASTERR_INVALIDPARAM
Invalid parameter: possible programmer error or pre-condition not met.
#define BLAST_GAP_OPEN_MEGABLAST
default gap open penalty (megablast with greedy gapped alignment)
Definition: blast_options.h:87
#define BLAST_GAP_EXTN_MEGABLAST
default gap open penalty (megablast) with greedy gapped alignment)
Definition: blast_options.h:95
Boolean SBlastFilterOptionsMaskAtHash(const SBlastFilterOptions *filter_options)
Queries whether masking should be done only for the lookup table or for the entire search.
SBlastFilterOptions * SBlastFilterOptionsFree(SBlastFilterOptions *filter_options)
Frees SBlastFilterOptions and all subservient structures.
#define BLAST_REWARD
default nucleotide match score
#define BLAST_PENALTY
default reward and penalty (only applies to blastn/megablast)
Boolean BlastEffectiveLengthsOptions_IsSearchSpaceSet(const BlastEffectiveLengthsOptions *options)
Return true if the search spaces is set for any of the queries in the search.
BlastEffectiveLengthsParameters * BlastEffectiveLengthsParametersFree(BlastEffectiveLengthsParameters *parameters)
Deallocate memory for BlastEffectiveLengthsParameters*.
Int2 BlastExtensionParametersNew(EBlastProgramType blast_program, const BlastExtensionOptions *options, BlastScoreBlk *sbp, BlastQueryInfo *query_info, BlastExtensionParameters **parameters)
Calculate the raw values for the X-dropoff parameters.
BlastExtensionParameters * BlastExtensionParametersFree(BlastExtensionParameters *parameters)
Deallocate memory for BlastExtensionParameters.
Int2 BlastScoringParametersNew(const BlastScoringOptions *options, BlastScoreBlk *sbp, BlastScoringParameters **parameters)
Calculate scaled cutoff scores and gap penalties.
Int2 BlastHitSavingParametersNew(EBlastProgramType program_number, const BlastHitSavingOptions *options, const BlastScoreBlk *sbp, const BlastQueryInfo *query_info, Int4 avg_subject_length, Int4 compositionBasedStats, BlastHitSavingParameters **parameters)
Allocate memory and initialize the BlastHitSavingParameters structure.
Int2 BlastHitSavingParametersUpdate(EBlastProgramType program_number, const BlastScoreBlk *sbp, const BlastQueryInfo *query_info, Int4 avg_subject_length, Int4 compositionBasedStats, BlastHitSavingParameters *parameters)
Updates cutoff scores in hit saving parameters.
Int2 BlastLinkHSPParametersUpdate(const BlastInitialWordParameters *word_params, const BlastHitSavingParameters *hit_params, Boolean gapped_calculation)
Update BlastLinkHSPParameters, using calculated values of other parameters.
Int2 BlastInitialWordParametersUpdate(EBlastProgramType program_number, const BlastHitSavingParameters *hit_params, const BlastScoreBlk *sbp, BlastQueryInfo *query_info, Uint4 subject_length, BlastInitialWordParameters *parameters)
Update cutoff scores in BlastInitialWordParameters structure.
BlastScoringParameters * BlastScoringParametersFree(BlastScoringParameters *parameters)
Deallocate memory for BlastScoringParameters.
Int2 BlastEffectiveLengthsParametersNew(const BlastEffectiveLengthsOptions *options, Int8 db_length, Int4 num_seqs, BlastEffectiveLengthsParameters **parameters)
Allocate memory for BlastEffectiveLengthsParameters.
Boolean Blast_ProgramIsMapping(EBlastProgramType p)
Definition: blast_program.c:76
Boolean Blast_QueryIsPssm(EBlastProgramType p)
Returns true if the query is PSSM.
Definition: blast_program.c:46
Boolean Blast_ProgramIsPhiBlast(EBlastProgramType p)
Returns true if program is PHI-BLAST (i.e.
Definition: blast_program.c:70
Boolean Blast_QueryIsTranslated(EBlastProgramType p)
Returns true if the query is translated.
Definition: blast_program.c:60
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm.
Definition: blast_program.h:72
@ eBlastTypeBlastn
Definition: blast_program.h:74
@ eBlastTypeBlastx
Definition: blast_program.h:75
@ eBlastTypePhiBlastn
Definition: blast_program.h:87
@ eBlastTypeMapping
Definition: blast_program.h:88
Boolean Blast_SubjectIsTranslated(EBlastProgramType p)
Returns true if the subject is translated.
Definition: blast_program.c:63
Int8 BlastSeqSrcGetTotLenStats(const BlastSeqSrc *seq_src)
Get the total length of all sequences for calculation of expect value etc.
Definition: blast_seqsrc.c:227
Int4 BlastSeqSrcGetSeqLen(const BlastSeqSrc *seq_src, void *oid)
Retrieve sequence length (number of residues/bases)
Definition: blast_seqsrc.c:281
Int4 BlastSeqSrcGetNumSeqs(const BlastSeqSrc *seq_src)
Get the number of sequences contained in the sequence source.
Definition: blast_seqsrc.c:177
Int8 BlastSeqSrcGetTotLen(const BlastSeqSrc *seq_src)
Get the total length of all sequences in the sequence source.
Definition: blast_seqsrc.c:219
Int4 BlastSeqSrcGetNumSeqsStats(const BlastSeqSrc *seq_src)
Get the number of sequences used for calculation of expect values etc.
Definition: blast_seqsrc.c:185
Int4 BlastSeqSrcGetMaxSeqLen(const BlastSeqSrc *seq_src)
Get the length of the longest sequence in the sequence source.
Definition: blast_seqsrc.c:193
Int4 BlastSeqSrcGetMinSeqLen(const BlastSeqSrc *seq_src)
Get the length of the longest sequence in the sequence source.
Definition: blast_seqsrc.c:201
static Int2 s_PHIScoreBlkFill(BlastScoreBlk *sbp, const BlastScoringOptions *options, Blast_Message **blast_message, GET_MATRIX_PATH get_path)
Fills a scoring block structure for a PHI BLAST search.
Definition: blast_setup.c:137
Int2 BlastSetup_Validate(const BlastQueryInfo *query_info, const BlastScoreBlk *score_blk)
Validation function for the setup of queries for the BLAST search.
Definition: blast_setup.c:535
void BLAST_GetSubjectTotals(const BlastSeqSrc *seqsrc, Int8 *total_length, Int4 *num_seqs)
Auxiliary function to retrieve the subject's number of sequences and total length.
Definition: blast_setup.c:853
static Int2 s_JumperScoreBlkFill(BlastScoreBlk *sbp, const BlastQueryInfo *query_info, Blast_Message **error_return)
Definition: blast_setup.c:396
Int2 BlastSetup_ScoreBlkInit(BLAST_SequenceBlk *query_blk, const BlastQueryInfo *query_info, const BlastScoringOptions *scoring_options, EBlastProgramType program_number, BlastScoreBlk **sbpp, double scale_factor, Blast_Message **blast_message, GET_MATRIX_PATH get_path)
Initializes the score block structure.
Definition: blast_setup.c:456
Int2 Blast_SetPHIPatternInfo(EBlastProgramType program, const SPHIPatternSearchBlk *pattern_blk, const BLAST_SequenceBlk *query, const BlastSeqLoc *lookup_segments, BlastQueryInfo *query_info, Blast_Message **blast_message)
In a PHI BLAST search, adds pattern information to the BlastQueryInfo structure.
Definition: blast_setup.c:1065
static Int8 s_GetEffectiveSearchSpaceForContext(const BlastEffectiveLengthsOptions *eff_len_options, int context_index, Blast_Message **blast_message)
Return the search space appropriate for a given context from a list of tabulated search spaces.
Definition: blast_setup.c:676
Int2 BLAST_MainSetUp(EBlastProgramType program_number, const QuerySetUpOptions *qsup_options, const BlastScoringOptions *scoring_options, BLAST_SequenceBlk *query_blk, const BlastQueryInfo *query_info, double scale_factor, BlastSeqLoc **lookup_segments, BlastMaskLoc **mask, BlastScoreBlk **sbpp, Blast_Message **blast_message, GET_MATRIX_PATH get_path)
"Main" setup routine for BLAST.
Definition: blast_setup.c:563
Int2 Blast_ScoreBlkKbpGappedCalc(BlastScoreBlk *sbp, const BlastScoringOptions *scoring_options, EBlastProgramType program, const BlastQueryInfo *query_info, Blast_Message **error_return)
Blast_ScoreBlkKbpGappedCalc, fills the ScoreBlkPtr for a gapped search.
Definition: blast_setup.c:41
Int2 Blast_ScoreBlkMatrixInit(EBlastProgramType program_number, const BlastScoringOptions *scoring_options, BlastScoreBlk *sbp, GET_MATRIX_PATH get_path)
Initializes the substitution matrix in the BlastScoreBlk according to the scoring options specified.
Definition: blast_setup.c:330
Int2 BLAST_OneSubjectUpdateParameters(EBlastProgramType program_number, Uint4 subject_length, const BlastScoringOptions *scoring_options, BlastQueryInfo *query_info, const BlastScoreBlk *sbp, BlastHitSavingParameters *hit_params, BlastInitialWordParameters *word_params, BlastEffectiveLengthsParameters *eff_len_params)
Recalculates the parameters that depend on an individual sequence, if this is not a database search.
Definition: blast_setup.c:1001
void BlastSeqLoc_RestrictToInterval(BlastSeqLoc **mask, Int4 from, Int4 to)
Adjusts the mask locations coordinates to a sequence interval.
Definition: blast_setup.c:1030
Int2 BLAST_GapAlignSetUp(EBlastProgramType program_number, const BlastSeqSrc *seq_src, const BlastScoringOptions *scoring_options, const BlastEffectiveLengthsOptions *eff_len_options, const BlastExtensionOptions *ext_options, const BlastHitSavingOptions *hit_options, BlastQueryInfo *query_info, BlastScoreBlk *sbp, BlastScoringParameters **score_params, BlastExtensionParameters **ext_params, BlastHitSavingParameters **hit_params, BlastEffectiveLengthsParameters **eff_len_params, BlastGapAlignStruct **gap_align)
Set up the auxiliary structures for gapped alignment / traceback only.
Definition: blast_setup.c:888
Int2 BLAST_CalcEffLengths(EBlastProgramType program_number, const BlastScoringOptions *scoring_options, const BlastEffectiveLengthsParameters *eff_len_params, const BlastScoreBlk *sbp, BlastQueryInfo *query_info, Blast_Message **blast_message)
Function to calculate effective query length and db length as well as effective search space.
Definition: blast_setup.c:699
Utilities initialize/setup BLAST.
char *(* GET_MATRIX_PATH)(const char *, Boolean)
callback to resolve the path to blast score matrices
Definition: blast_stat.h:61
Int2 Blast_GetNuclAlphaBeta(Int4 reward, Int4 penalty, Int4 gap_open, Int4 gap_extend, Blast_KarlinBlk *kbp, Boolean gapped_calculation, double *alpha, double *beta)
Extract the alpha and beta settings for these substitution and gap scores.
Definition: blast_stat.c:3949
Int2 Blast_KarlinBlkGappedCalc(Blast_KarlinBlk *kbp, Int4 gap_open, Int4 gap_extend, const char *matrix_name, Blast_Message **error_return)
Fills in lambda, H, and K values, as calculated by Stephen Altschul in Methods in Enzy.
Definition: blast_stat.c:3527
Blast_KarlinBlk * Blast_KarlinBlkNew(void)
Callocs a Blast_KarlinBlk.
Definition: blast_stat.c:2861
Int2 BLAST_ScoreSetAmbigRes(BlastScoreBlk *sbp, char ambiguous_res)
Set the ambiguous residue (e.g, 'N', 'X') in the BlastScoreBlk*.
Definition: blast_stat.c:1012
Int2 Blast_ScoreBlkKbpUngappedCalc(EBlastProgramType program, BlastScoreBlk *sbp, Uint1 *query, const BlastQueryInfo *query_info, Blast_Message **blast_message)
Calculate and fill the ungapped Karlin-Altschul parameters in the BlastScoreBlk structure (fields kbp...
Definition: blast_stat.c:2737
Int2 Blast_GumbelBlkCalc(Blast_GumbelBlk *gbp, Int4 gap_open, Int4 gap_extend, const char *matrix_name, Blast_Message **error_return)
Fills in gumbel parameters to estimate p-value using FSC.
Definition: blast_stat.c:3652
Int2 Blast_KarlinBlkNuclGappedCalc(Blast_KarlinBlk *kbp, Int4 gap_open, Int4 gap_extend, Int4 reward, Int4 penalty, Blast_KarlinBlk *kbp_ungap, Boolean *round_down, Blast_Message **error_return)
Retrieves Karlin-Altschul parameters from precomputed tables, given the substitution and gap scores.
Definition: blast_stat.c:3836
Int2 Blast_ScoreBlkKbpIdealCalc(BlastScoreBlk *sbp)
Calculates the Karlin-Altschul parameters assuming standard residue compositions for the query and su...
Definition: blast_stat.c:2832
void BLAST_GetAlphaBeta(const char *matrixName, double *alpha, double *beta, Boolean gapped, Int4 gap_open, Int4 gap_extend, const Blast_KarlinBlk *kbp_ungapped)
Extract the alpha and beta settings for this matrixName, and these gap open and gap extension costs.
Definition: blast_stat.c:3094
Blast_ScoreFreq * Blast_ScoreFreqNew(Int4 score_min, Int4 score_max)
Creates a new structure to keep track of score frequencies for a scoring system.
Definition: blast_stat.c:2113
Int2 Blast_KarlinBlkCopy(Blast_KarlinBlk *kbp_to, Blast_KarlinBlk *kbp_from)
Copies contents of one Karlin block to another.
Definition: blast_stat.c:2871
Int4 BLAST_ComputeLengthAdjustment(double K, double logK, double alpha_d_lambda, double beta, Int4 query_length, Int8 db_length, Int4 db_num_seqs, Int4 *length_adjustment)
Computes the adjustment to the lengths of the query and database sequences that is used to compensate...
Definition: blast_stat.c:5025
BlastScoreBlk * BlastScoreBlkNew(Uint1 alphabet, Int4 number_of_contexts)
Allocates and initializes BlastScoreBlk.
Definition: blast_stat.c:884
Int2 Blast_ScoreBlkMatrixFill(BlastScoreBlk *sbp, GET_MATRIX_PATH get_path)
This function fills in the BlastScoreBlk structure.
Definition: blast_stat.c:1599
Various auxiliary BLAST utility functions.
Int2 BLAST_CreateMixedFrameDNATranslation(BLAST_SequenceBlk *query_blk, const BlastQueryInfo *query_info)
Initialize the mixed-frame sequence for out-of-frame gapped extension.
Definition: blast_util.c:931
char * BLAST_StrToUpper(const char *string)
Returns a copy of the input string with all its characters turned to uppercase.
Definition: blast_util.c:1352
ncbi::TMaskedQueryRegions mask
#define BLASTNA_SEQ_CODE
Identifies the blastna alphabet, for use in blast only.
#define BLASTAA_SEQ_CODE
== Seq_code_ncbistdaa
#define NULL
Definition: ncbistd.hpp:225
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
int strcmp(const char *str1, const char *str2)
Definition: odbc_utils.hpp:160
#define strdup
Definition: ncbi_ansi_ext.h:70
#define MIN(a, b)
returns smaller of a and b.
Definition: ncbi_std.h:112
#define INT4_MAX
largest nubmer represented by signed int
Definition: ncbi_std.h:141
void * BlastMemDup(const void *orig, size_t size)
Copies memory using memcpy and malloc.
Definition: ncbi_std.c:35
Uint1 Boolean
bool replacment for C
Definition: ncbi_std.h:94
#define TRUE
bool replacment for C indicating true.
Definition: ncbi_std.h:97
#define FALSE
bool replacment for C indicating false.
Definition: ncbi_std.h:101
#define NULLB
terminating byte of a char* string.
Definition: ncbi_std.h:181
#define ASSERT
macro for assert.
Definition: ncbi_std.h:107
#define MAX(a, b)
returns larger of a and b.
Definition: ncbi_std.h:117
void abort()
Int4 PHIGetPatternOccurrences(const SPHIPatternSearchBlk *pattern_blk, const BLAST_SequenceBlk *query, const BlastSeqLoc *location, Boolean is_dna, BlastQueryInfo *query_info)
Finds all pattern hits in a given query and saves them in the previously allocated SPHIQueryInfo stru...
Definition: pattern.c:553
SPHIQueryInfo * SPHIQueryInfoNew(void)
Allocates the pattern occurrences structure.
Definition: pattern.c:478
static pcre_uint8 * buffer
Definition: pcretest.c:1051
Structure to hold a sequence.
Definition: blast_def.h:242
Uint1 * sequence
Sequence used for search (could be translation).
Definition: blast_def.h:243
Int4 query_length
Length of this query, strand or frame.
Boolean is_valid
Determine if this context is valid or not.
Int4 length_adjustment
Length adjustment for boundary conditions.
Int8 eff_searchsp
Effective search space for this context.
Options for setting up effective lengths and search spaces.
Int8 * searchsp_eff
Search space to be used for statistical calculations (one such per query context)
Int8 db_length
Database length to be used for statistical calculations.
Int4 dbseq_num
Number of database sequences to be used for statistical calculations.
Int4 num_searchspaces
Number of elements in searchsp_eff, this must be equal to the number of contexts in the search.
Parameters for setting up effective lengths and search spaces.
Int8 real_db_length
Total database length to use in search space calculations.
Int4 real_num_seqs
Number of subject sequences to use for search space calculations.
BlastEffectiveLengthsOptions * options
User provided values for these parameters.
Options used for gapped extension These include: a.
Computed values used as parameters for gapped alignments.
Structure supporting the gapped alignment.
Options used when evaluating and saving hits These include: a.
Parameter block that contains a pointer to BlastHitSavingOptions and the values derived from it.
Parameter block that contains a pointer to BlastInitialWordOptions and the values derived from it.
Structure for keeping the query masking information.
Definition: blast_def.h:210
The query related information.
Int4 first_context
Index of the first element of the context array.
BlastContextInfo * contexts
Information per context.
struct SPHIQueryInfo * pattern_info
Counts of PHI BLAST pattern occurrences, used in PHI BLAST only.
Int4 last_context
Index of the last element of the context array.
Structure used for scoring calculations.
Definition: blast_stat.h:177
Blast_KarlinBlk ** kbp
Karlin-Altschul parameters.
Definition: blast_stat.h:207
Int4 loscore
Min.
Definition: blast_stat.h:197
Blast_ScoreFreq ** sfp
score frequencies for scoring matrix.
Definition: blast_stat.h:205
Boolean matrix_only_scoring
Score ungapped/gapped alignment only using the matrix parameters and with raw scores.
Definition: blast_stat.h:189
double scale_factor
multiplier for all cutoff and dropoff scores
Definition: blast_stat.h:201
Blast_KarlinBlk ** kbp_gap
K-A parameters for gapped alignments.
Definition: blast_stat.h:208
Boolean round_down
Score must be rounded down to nearest even score if odd.
Definition: blast_stat.h:221
char * name
name of scoring matrix.
Definition: blast_stat.h:183
Int4 penalty
penalty for mismatch in blastn.
Definition: blast_stat.h:199
Int4 number_of_contexts
Used by sfp and kbp, how large are these.
Definition: blast_stat.h:217
Boolean read_in_matrix
If TRUE, matrix is read in, otherwise produce one from penalty and reward above.
Definition: blast_stat.h:202
Int4 hiscore
Max.
Definition: blast_stat.h:198
Boolean complexity_adjusted_scoring
Use cross_match-like complexity adjustment on raw scores.
Definition: blast_stat.h:195
Blast_KarlinBlk * kbp_ideal
Ideal values (for query with average database composition).
Definition: blast_stat.h:216
Blast_KarlinBlk ** kbp_gap_std
K-A parameters for std (not position-based) alignments.
Definition: blast_stat.h:214
Blast_KarlinBlk ** kbp_std
K-A parameters for ungapped alignments.
Definition: blast_stat.h:212
Int4 reward
reward for match in blastn.
Definition: blast_stat.h:200
Blast_KarlinBlk ** kbp_gap_psi
K-A parameters for psi alignments.
Definition: blast_stat.h:215
Blast_GumbelBlk * gbp
Gumbel parameters for FSC.
Definition: blast_stat.h:209
Scoring options block Used to produce the BlastScoreBlk structure This structure may be needed for lo...
Int2 penalty
Penalty for a mismatch.
Int4 gap_open
Extra penalty for starting a gap.
Int4 gap_extend
Penalty for each gap residue.
Int2 reward
Reward for a match.
Boolean gapped_calculation
gap-free search if FALSE
char * matrix
Name of the matrix containing all scores: needed for finding neighboring words.
Boolean is_ooframe
Should out-of-frame gapping be used in a translated search?
Boolean complexity_adjusted_scoring
Use cross_match-like complexity adjustment on raw scores.
Scoring parameters block Contains scoring-related information that is actually used for the blast sea...
Used to hold a set of positions, mostly used for filtering.
Definition: blast_def.h:204
struct BlastSeqLoc * next
next in linked list
Definition: blast_def.h:205
Complete type definition of Blast Sequence Source ADT.
Definition: blast_seqsrc.c:43
Int8 db_length
total length of database
Definition: blast_stat.h:108
Structure to hold the Karlin-Altschul parameters.
Definition: blast_stat.h:66
double paramC
for use in seed.
Definition: blast_stat.h:71
double K
K value used in statistics.
Definition: blast_stat.h:68
double Lambda
Lambda value used in statistics.
Definition: blast_stat.h:67
double H
H value used in statistics.
Definition: blast_stat.h:70
double logK
natural log of K value used in statistics
Definition: blast_stat.h:69
Structure to hold the a message from the core of the BLAST engine.
Definition: blast_message.h:70
Options required for setting up the query sequence.
char * filter_string
DEPRECATED, filtering options above.
SBlastFilterOptions * filtering_options
structured options for all filtering offered from algo/blast/core for BLAST.
All filtering options.
Structure containing all auxiliary information needed in a pattern search.
Definition: pattern.h:155
double patternProbability
Probability of this letter combination.
Definition: pattern.h:160
Int4 minPatternMatchLength
Minimum length of string to match this pattern.
Definition: pattern.h:161
char * pattern
Pattern used, saved here for error reporting.
Definition: pattern.h:170
char * pattern
Pattern used, saved here for formatting purposes.
Definition: blast_def.h:306
double probability
Estimated probability of the pattern.
Definition: blast_def.h:305
static string query
static CS_CONTEXT * context
Definition: will_convert.c:21
Modified on Tue May 14 16:21:20 2024 by modify_doxy.py rev. 669887