NCBI C++ ToolKit
redoalignment_unit_test.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: redoalignment_unit_test.cpp 92013 2020-12-17 15:27:34Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Tom Madden
27 *
28 * File Description:
29 * Unit test module for traceback calculation
30 *
31 * ===========================================================================
32 */
33 #include <ncbi_pch.hpp>
34 #include <corelib/test_boost.hpp>
35 
36 #include <corelib/ncbitime.hpp>
38 #include <objmgr/scope.hpp>
40 #include <objmgr/util/sequence.hpp>
41 
46 #include <serial/serial.hpp>
47 #include <serial/iterator.hpp>
48 #include <serial/objostr.hpp>
49 
52 #include <blast_objmgr_priv.hpp>
53 #include <blast_psi_priv.h>
54 
60 
63 
66 
69 
70 #include "test_objmgr.hpp"
71 #include "blast_test_util.hpp"
72 
73 using namespace std;
74 using namespace ncbi;
75 using namespace ncbi::objects;
76 using namespace ncbi::blast;
77 
79 {
82 
83  static BlastHSPList* setUpHSPList(int num_hsps,
84  const int query_offset[],
85  const int query_end[],
86  const int subject_offset[],
87  const int subject_end[],
88  const int query_gapped_start[],
89  const int subject_gapped_start[],
90  const int score[],
91  const double evalue[] = NULL,
92  const int num_ident[] = NULL)
93  {
94  const int kQueryContext = 0;
95  const int kSubjectFrame = 0;
96  const int kQueryFrame = 0;
97 
98  BlastHSPList* retval = Blast_HSPListNew(0);
99  if ( !retval ) {
100  return NULL;
101  }
102 
103  for (int i = 0; i < num_hsps; i++) {
104  BlastHSP* hsp = NULL;
105  Blast_HSPInit(query_offset[i], query_end[i],
106  subject_offset[i], subject_end[i],
107  query_gapped_start[i],
108  subject_gapped_start[i],
109  kQueryContext, kQueryFrame, kSubjectFrame, score[i],
110  NULL, &hsp);
111  if (num_ident) {
112  hsp->num_ident = num_ident[i];
113  }
114  if (evalue) {
115  hsp->evalue = evalue[i];
116  }
117  Blast_HSPListSaveHSP(retval, hsp);
118  }
119 
120  return retval;
121  }
122 
123  static int** loadPssmFromFile(const string& filename,
124  unsigned int query_length)
125  {
126  ifstream in(filename.c_str());
127  if ( !in ) {
128  throw runtime_error(filename + " could not be found");
129  }
130 
131  int** retval = (int**) _PSIAllocateMatrix(query_length,
132  BLASTAA_SIZE,
133  sizeof(int));
134  try {
135  for (unsigned int i = 0; i < query_length; i++) {
136  for (unsigned int j = 0; j < BLASTAA_SIZE; j++) {
137  in >> retval[i][j];
138  }
139  }
140  } catch (...) {
141  retval = (int**) _PSIDeallocateMatrix((void**)retval, query_length);
142  throw runtime_error("Error reading from " + filename);
143  }
144 
145  return retval;
146  }
147 
149  unsigned int qlen)
150  {
151  if ( !sbp ) {
152  throw runtime_error("NULL BlastScoreBlk*!");
153  }
154 
155  int** pssm =
156  loadPssmFromFile("data/aa.129295.pssm.txt",
157  qlen);
158  sbp->psi_matrix = SPsiBlastScoreMatrixNew(qlen);
159  _PSICopyMatrix_int(sbp->psi_matrix->pssm->data, pssm,
160  qlen, BLASTAA_SIZE);
161  pssm = (int**)_PSIDeallocateMatrix((void**) pssm, qlen);
162 
163  /* FIXME: Should offer a function that allows the setting of all
164  * PSI-BLAST settings in the BlastScoreBlk */
165  sbp->kbp = sbp->kbp_psi;
166  sbp->kbp_gap = sbp->kbp_gap_psi;
167  }
168 
169  // Core function which executes the unit tests
170  static void runRedoAlignmentCoreUnitTest(EBlastProgramType program,
171  CSeq_id& qid,
172  CSeq_id& sid,
173  BlastHSPList* init_hsp_list,
174  const BlastHSPList* ending_hsp_list,
175  Int8 effective_searchsp,
176  ECompoAdjustModes compositonBasedStatsMode,
177  bool doSmithWaterman,
178  double evalue_threshold =
180  int hit_list_size = BLAST_HITLIST_SIZE);
181 
182  // Core function which executes the unit tests
183  // FIXME: refactor so that blastOptions are passed in!
184  static void runRedoAlignmentCoreUnitTest(EBlastProgramType program,
185  SSeqLoc& qsl,
186  SSeqLoc& ssl,
187  BlastHSPList* init_hsp_list,
188  const BlastHSPList* ending_hsp_list,
189  Int8 effective_searchsp,
190  ECompoAdjustModes compositonBasedStatsMode,
191  bool doSmithWaterman,
192  double evalue_threshold =
194  int hit_list_size = BLAST_HITLIST_SIZE);
195 };
196 
199  CSeq_id& qid,
200  CSeq_id& sid,
201  BlastHSPList* init_hsp_list,
202  const BlastHSPList* ending_hsp_list,
203  Int8 effective_searchsp,
204  ECompoAdjustModes compositonBasedStatsMode,
205  bool doSmithWaterman,
206  double evalue_threshold,
207  int hit_list_size)
208 {
209 
210  unique_ptr<SSeqLoc> qsl(CTestObjMgr::Instance().CreateSSeqLoc(qid));
211  unique_ptr<SSeqLoc> ssl(CTestObjMgr::Instance().CreateSSeqLoc(sid));
212 
213  runRedoAlignmentCoreUnitTest(program, *qsl, *ssl, init_hsp_list,
214  ending_hsp_list, effective_searchsp, compositonBasedStatsMode,
215  doSmithWaterman, evalue_threshold, hit_list_size);
216 
217 }
218 
219 
222  SSeqLoc& qsl,
223  SSeqLoc& ssl,
224  BlastHSPList* init_hsp_list,
225  const BlastHSPList* ending_hsp_list,
226  Int8 effective_searchsp,
227  ECompoAdjustModes compositonBasedStatsMode,
228  bool doSmithWaterman,
229  double evalue_threshold,
230  int hit_list_size)
231 {
232 
233  char* program_buffer = NULL;
234  Int2 rv = BlastNumber2Program(program, &program_buffer);
235  BOOST_REQUIRE_MESSAGE(rv == (Int2)0, "BlastNumber2Program failed");
236  blast::EProgram prog = blast::ProgramNameToEnum(string(program_buffer));
237  sfree(program_buffer);
238  CBl2Seq blaster(qsl, ssl, prog);
239 
240  CBlastQueryInfo query_info;
241  CBLAST_SequenceBlk query_blk;
242  TSearchMessages blast_msg;
243 
244  const CBlastOptions& kOpts = blaster.GetOptionsHandle().GetOptions();
245  EBlastProgramType core_prog = kOpts.GetProgramType();
246  ENa_strand strand_opt = kOpts.GetStrandOption();
247 
248  SetupQueryInfo(const_cast<TSeqLocVector&>(blaster.GetQueries()),
249  core_prog, strand_opt,&query_info);
250  SetupQueries(const_cast<TSeqLocVector&>(blaster.GetQueries()),
251  query_info, &query_blk, core_prog, strand_opt, blast_msg);
252  ITERATE(TSearchMessages, m, blast_msg) {
253  BOOST_REQUIRE(m->empty());
254  }
255 
256  CBlastScoringOptions scoring_opts;
257  rv = BlastScoringOptionsNew(program, &scoring_opts);
258  BOOST_REQUIRE(rv == 0);
259 
260  CBlastExtensionOptions ext_opts;
261  BlastExtensionOptionsNew(program, &ext_opts,
262  scoring_opts->gapped_calculation);
263 
264  CBlastHitSavingOptions hitsaving_opts;
265  rv = BlastHitSavingOptionsNew(program, &hitsaving_opts,
266  scoring_opts->gapped_calculation);
267  hitsaving_opts->expect_value = evalue_threshold;
268  hitsaving_opts->hitlist_size = hit_list_size;
269  BOOST_REQUIRE(rv == 0);
270 
271  // FIXME: how to deal with this in case of PSI-BLAST?
272  // FIXME: GetQueryEncoding/GetSubjectEncoding for PSI-BLAST
273  // FIXME: Figure out what is needed to set up a PSI-BLAST search. Is
274  // lookup table different than eAaLookupTable? See
275  // PsiBlastOptionsHandle class also
276  CBlastSeqSrc seq_src
278  const_cast<TSeqLocVector&>(blaster.GetSubjects()),
281 
282  BlastExtensionOptions* ext_options=NULL;
283  BlastExtensionOptionsNew(program, &ext_options, true);
284  ext_options->compositionBasedStats = compositonBasedStatsMode;
285  if (doSmithWaterman)
286  ext_options->eTbackExt = eSmithWatermanTbck;
287 
290  hitsaving_opts, ext_options->compositionBasedStats,
291  scoring_opts->gapped_calculation));
292 
293  BlastHSPWriter* writer = BlastHSPWriterNew(&writer_info, NULL, NULL);
294  BOOST_REQUIRE(writer_info == NULL);
295 
296  BlastHSPStream* hsp_stream = BlastHSPStreamNew(
297  program, ext_options, FALSE, query_info->num_queries, writer);
298 
299  Blast_HSPListSortByScore(init_hsp_list);
300  BlastHSPStreamWrite(hsp_stream, &init_hsp_list);
301 
302  Blast_Message* blast_message=NULL;
303  BlastScoreBlk* sbp;
304  Int2 status;
305  /* FIXME: modularize this code */
306  const double k_rps_scale_factor = 1.0;
307  status =
308  BlastSetup_ScoreBlkInit(query_blk, query_info, scoring_opts,
309  program, &sbp, k_rps_scale_factor,
310  &blast_message, &BlastFindMatrixPath);
311  if (status > 0) {
312  return;
313  }
314 
315  if (program == eBlastTypePsiBlast) {
316  setupPositionBasedBlastScoreBlk(sbp,
317  sequence::GetLength(*(qsl.seqloc->GetId()), qsl.scope));
318  }
319 
320 
321  BlastEffectiveLengthsOptions* eff_len_opts = NULL;
322  BlastEffectiveLengthsOptionsNew(&eff_len_opts);
323  BLAST_FillEffectiveLengthsOptions(eff_len_opts, 0, 0,
324  &effective_searchsp, 1);
325  BlastEffectiveLengthsParameters* eff_len_params = NULL;
326  BlastEffectiveLengthsParametersNew(eff_len_opts, 0, 0, &eff_len_params);
327 
328  status = BLAST_CalcEffLengths(program, scoring_opts,
329  eff_len_params, sbp, query_info, NULL);
330  if (status > 0) {
331  return;
332  }
333 
334  BOOST_REQUIRE(query_info->contexts[0].eff_searchsp == effective_searchsp);
335 
336  eff_len_opts = BlastEffectiveLengthsOptionsFree(eff_len_opts);
337  BOOST_REQUIRE(eff_len_opts == NULL);
338  eff_len_params = BlastEffectiveLengthsParametersFree(eff_len_params);
339  BOOST_REQUIRE(eff_len_params == NULL);
340 
341  BlastExtensionParameters* ext_params=NULL;
342  BlastExtensionParametersNew(program, ext_options, sbp, query_info,
343  &ext_params);
344 
345  const int kAvgSubjLen = 0;
346  BlastHitSavingParameters* hit_params=NULL;
347  BlastHitSavingParametersNew(program, hitsaving_opts,
348  sbp, query_info, kAvgSubjLen,
349  2, &hit_params);
350 
351  BlastScoringParameters* scoring_params=NULL;
352  BlastScoringParametersNew(scoring_opts, sbp, &scoring_params);
353 
354  PSIBlastOptions* psi_options=NULL;
355  PSIBlastOptionsNew(&psi_options);
356 
357  BlastHSPResults* results =
358  Blast_HSPResultsNew(query_info->num_queries);
359  BOOST_REQUIRE(results);
360 
361  rv = Blast_RedoAlignmentCore(program, query_blk,
362  query_info, sbp, NULL, seq_src,
363  BLAST_GENETIC_CODE, NULL, hsp_stream,
364  scoring_params,
365  ext_params, hit_params, psi_options,
366  results);
367  BOOST_REQUIRE_MESSAGE(rv == (Int2)0, "Blast_RedoAlignmentCore failed!");
368 
369  hsp_stream = BlastHSPStreamFree(hsp_stream);
370  BOOST_REQUIRE(hsp_stream == NULL);
371  ext_params = BlastExtensionParametersFree(ext_params);
372  BOOST_REQUIRE(ext_params == NULL);
373  ext_options = BlastExtensionOptionsFree(ext_options);
374  BOOST_REQUIRE(ext_options == NULL);
375  hit_params = BlastHitSavingParametersFree(hit_params);
376  BOOST_REQUIRE(hit_params == NULL);
377  scoring_params = BlastScoringParametersFree(scoring_params);
378  BOOST_REQUIRE(scoring_params == NULL);
379  psi_options = PSIBlastOptionsFree(psi_options);
380  BOOST_REQUIRE(psi_options == NULL);
381  sbp = BlastScoreBlkFree(sbp);
382  BOOST_REQUIRE(sbp == NULL);
383 
384  BOOST_REQUIRE(results && results->num_queries == 1);
385  BOOST_REQUIRE(*results->hitlist_array != NULL);
386  BOOST_REQUIRE(results->hitlist_array[0]->hsplist_count > 0);
387  BlastHSPList* hsp_list = results->hitlist_array[0]->hsplist_array[0];
388 
389  BOOST_REQUIRE_EQUAL(ending_hsp_list->hspcnt, hsp_list->hspcnt);
390 #if 0
391 
392  if ( ending_hsp_list->hspcnt != hsp_list->hspcnt) {
393  cout << "Expected num hsps=" << ending_hsp_list->hspcnt;
394  cout << " Actual num hsps=" << hsp_list->hspcnt << endl;
395  }
396 #endif
397  for (int index=0; index<hsp_list->hspcnt; index++)
398  {
399  BlastHSP* expected_hsp = ending_hsp_list->hsp_array[index];
400  BlastHSP* actual_hsp = hsp_list->hsp_array[index];
401 
402 #if 0
403  cout << index << ": query_offset="
404  << actual_hsp->query.offset << endl;
405  cout << index << ": query_end="
406  << actual_hsp->query.end << endl;
407  cout << index << ": subject_offset="
408  << actual_hsp->subject.offset << endl;
409  cout << index << ": subject_end="
410  << actual_hsp->subject.end << endl;
411  cout << index << ": score="
412  << actual_hsp->score << endl;
413  cout << index << ": bit_score="
414  << actual_hsp->bit_score << endl;
415  cout << index << ": evalue="
416  << actual_hsp->evalue << endl;
417  cout << index << ": num_ident="
418  << actual_hsp->num_ident << endl;
419 #endif
420  BOOST_REQUIRE_EQUAL(expected_hsp->query.offset,
421  actual_hsp->query.offset);
422  BOOST_REQUIRE_EQUAL(expected_hsp->query.end,
423  actual_hsp->query.end);
424  BOOST_REQUIRE_EQUAL(expected_hsp->subject.offset,
425  actual_hsp->subject.offset);
426  BOOST_REQUIRE_EQUAL(expected_hsp->subject.end,
427  actual_hsp->subject.end);
428  BOOST_REQUIRE_EQUAL(expected_hsp->score,
429  actual_hsp->score);
430  BOOST_REQUIRE_EQUAL(expected_hsp->num_ident,
431  actual_hsp->num_ident);
432 #if 0
433  double diff = fabs((expected_hsp->evalue-actual_hsp->evalue));
434  cerr << "Diff in evalues for " << index << "=" << diff << endl;
435 #endif
436  BOOST_REQUIRE_CLOSE(expected_hsp->evalue, actual_hsp->evalue, 10.0);
437  // cout << "HSP " << index << " OK" << endl;
438  }
439 
440  results = Blast_HSPResultsFree(results);
441  BOOST_REQUIRE(results == NULL);
442 }
443 
444 BOOST_FIXTURE_TEST_SUITE(RedoAlignment, CRedoAlignmentTestFixture)
445 
446 // Start modularized code
447 BOOST_AUTO_TEST_CASE(testRedoAlignmentWithCompBasedStats) {
448  const EBlastProgramType kProgram = eBlastTypeBlastp;
449  const int k_num_hsps_start = 3;
450  const int k_num_hsps_end = 2;
451  CSeq_id query_id("gi|3091");
452  CSeq_id subj_id("gi|402871");
453 
454  // It seems that the last hsp in this set was manually constructed and
455  // expected to be dropped (please note that the first 2 hsps we
456  // constructed using blastall (thus filtering on), but the sequence
457  // passed to RedoAlignmentCore was unfiltered (default for blastpgp)
458  const int query_offset[k_num_hsps_start] = { 28, 46, 463};
459  const int query_end[k_num_hsps_start] = { 485, 331, 488};
460  const int subject_offset[k_num_hsps_start] = { 36, 327, 320};
461  const int subject_end[k_num_hsps_start] = { 512, 604, 345};
462  const int score[k_num_hsps_start] = { 554, 280, 28};
463  const int query_gapped_start[k_num_hsps_start] = { 431, 186, 480};
464  const int subject_gapped_start[k_num_hsps_start] = { 458, 458, 337};
465 
466  // This is freed by the HSPStream interface
467  BlastHSPList* init_hsp_list =
468  setUpHSPList(k_num_hsps_start,
469  query_offset, query_end,
470  subject_offset, subject_end,
471  query_gapped_start,
472  subject_gapped_start,
473  score);
474 
475  const int query_offset_final[k_num_hsps_end] = { 2, 46};
476  const int query_end_final[k_num_hsps_end] = { 485, 331};
477  const int subject_offset_final[k_num_hsps_end] = { 9, 327};
478  const int subject_end_final[k_num_hsps_end] = { 512, 604};
479  const int score_final[k_num_hsps_end] = { 510, 282};
480  const double evalue_final[k_num_hsps_end] = {7.0065e-61, 1.6958e-30};
481  const int num_idents_final[k_num_hsps_end] = { 171, 94 };
482 
483  BlastHSPList* ending_hsp_list =
484  setUpHSPList(k_num_hsps_end,
485  query_offset_final,
486  query_end_final,
487  subject_offset_final,
488  subject_end_final,
489  query_offset_final,
490  subject_offset_final,
491  score_final,
492  evalue_final,
493  num_idents_final);
494 
495  const Int8 kEffSearchSp = 500000;
496  const bool kSmithWaterman = false;
497 
498  runRedoAlignmentCoreUnitTest(kProgram, query_id, subj_id,
499  init_hsp_list, ending_hsp_list,
500  kEffSearchSp, eCompositionBasedStats,
501  kSmithWaterman);
502 
503  ending_hsp_list = Blast_HSPListFree(ending_hsp_list);
504  BOOST_REQUIRE(ending_hsp_list == NULL);
505 }
506 
507 BOOST_AUTO_TEST_CASE(testRedoAlignmentWithConditionalAdjust) {
508  const EBlastProgramType kProgram = eBlastTypeBlastp;
509  const int k_num_hsps_start = 3;
510  const int k_num_hsps_end = 2;
511  CSeq_id query_id("gi|3091");
512  CSeq_id subj_id("gi|402871");
513 
514  // It seems that the last hsp in this set was manually constructed and
515  // expected to be dropped (please note that the first 2 hsps we
516  // constructed using blastall (thus filtering on), but the sequence
517  // passed to RedoAlignmentCore was unfiltered (default for blastpgp)
518  const int query_offset[k_num_hsps_start] = { 28, 46, 463};
519  const int query_end[k_num_hsps_start] = { 485, 331, 488};
520  const int subject_offset[k_num_hsps_start] = { 36, 327, 320};
521  const int subject_end[k_num_hsps_start] = { 512, 604, 345};
522  const int score[k_num_hsps_start] = { 554, 280, 28};
523  const int query_gapped_start[k_num_hsps_start] = { 431, 186, 480};
524  const int subject_gapped_start[k_num_hsps_start] = { 458, 458, 337};
525 
526  // This is freed by the HSPStream interface
527  BlastHSPList* init_hsp_list =
528  setUpHSPList(k_num_hsps_start,
529  query_offset, query_end,
530  subject_offset, subject_end,
531  query_gapped_start,
532  subject_gapped_start,
533  score);
534 
535  const int query_offset_final[k_num_hsps_end] = { 2, 46};
536  const int query_end_final[k_num_hsps_end] = { 517, 331};
537  const int subject_offset_final[k_num_hsps_end] = { 9, 327};
538  const int subject_end_final[k_num_hsps_end] = { 546, 604};
539  const int score_final[k_num_hsps_end] = { 537, 298};
540  const double evalue_final[k_num_hsps_end] = {1.1954e-64, 1.5494e-32};
541  const int num_idents_final[k_num_hsps_end] = { 177, 95 };
542 
543  BlastHSPList* ending_hsp_list =
544  setUpHSPList(k_num_hsps_end,
545  query_offset_final,
546  query_end_final,
547  subject_offset_final,
548  subject_end_final,
549  query_offset_final,
550  subject_offset_final,
551  score_final,
552  evalue_final,
553  num_idents_final);
554 
555  const Int8 kEffSearchSp = 500000;
556  const bool kSmithWaterman = false;
557 
558  runRedoAlignmentCoreUnitTest(kProgram, query_id, subj_id,
559  init_hsp_list, ending_hsp_list,
560  kEffSearchSp, eCompositionMatrixAdjust,
561  kSmithWaterman);
562 
563  ending_hsp_list = Blast_HSPListFree(ending_hsp_list);
564  BOOST_REQUIRE(ending_hsp_list == NULL);
565 }
566 
567 BOOST_AUTO_TEST_CASE(testPSIRedoAlignmentWithCompBasedStats) {
568  const EBlastProgramType kProgram = eBlastTypePsiBlast;
569  const int k_num_hsps_start = 6;
570  const int k_num_hsps_end = 2;
571  CSeq_id query_id("gi|129295");
572  CSeq_id subj_id("gi|7450545");
573 
574  const int query_offset[k_num_hsps_start] = { 24, 99, 16, 84, 6, 223 };
575  const int query_end[k_num_hsps_start] = { 62, 128, 24, 114, 25, 231 };
576  const int subject_offset[k_num_hsps_start] =
577  { 245, 0, 198, 86, 334, 151 };
578  const int subject_end[k_num_hsps_start] =
579  { 287, 29, 206, 119, 353, 159 };
580  const int score[k_num_hsps_start] = { 37, 26, 25, 25, 24, 24 };
581  const int query_gapped_start[k_num_hsps_start] =
582  { 29, 104, 20, 91, 19, 227 };
583  const int subject_gapped_start[k_num_hsps_start] =
584  { 250, 5, 202, 93, 347, 155 };
585 
586  // No gaps were found in these alignments. This is freed by the
587  // HSPStream interface
588  BlastHSPList* init_hsp_list =
589  setUpHSPList(k_num_hsps_start,
590  query_offset, query_end,
591  subject_offset, subject_end,
592  query_gapped_start,
593  subject_gapped_start,
594  score);
595 
596  const int query_offset_final[k_num_hsps_end] = { 24, 18 };
597  const int query_end_final[k_num_hsps_end] = { 30, 31 };
598  const int subject_offset_final[k_num_hsps_end] = { 245, 200 };
599  const int subject_end_final[k_num_hsps_end] = { 251, 210 };
600  const int score_final[k_num_hsps_end] = { 29, 24 };
601  const double evalue_final[k_num_hsps_end] =
602  { 1.361074 , 6.425098 };
603  const int ident_final[k_num_hsps_end] = { 3, 6};
604 
605 
606  BlastHSPList* ending_hsp_list =
607  setUpHSPList(k_num_hsps_end,
608  query_offset_final,
609  query_end_final,
610  subject_offset_final,
611  subject_end_final,
612  query_offset_final,
613  subject_offset_final,
614  score_final,
615  evalue_final,
616  ident_final);
617 
618 
619  const Int8 kEffSearchSp = 84660;
620  const bool kSmithWaterman = false;
621 
622  runRedoAlignmentCoreUnitTest(kProgram, query_id, subj_id,
623  init_hsp_list, ending_hsp_list,
624  kEffSearchSp, eCompositionBasedStats,
625  kSmithWaterman);
626  ending_hsp_list = Blast_HSPListFree(ending_hsp_list);
627  BOOST_REQUIRE(ending_hsp_list == NULL);
628 }
629 
630 BOOST_AUTO_TEST_CASE(testRedoAlignmentWithCompBasedStatsBadlyBiasedSequence) {
631  const EBlastProgramType kProgram = eBlastTypeBlastp;
632  const int k_num_hsps_start = 6;
633  const int k_num_hsps_end = 5;
634  // CSeq_id query_id("gi|129295");
635  CSeq_id subj_id("gb|AAA22059|");
636  unique_ptr<SSeqLoc> ssl(CTestObjMgr::Instance().CreateSSeqLoc(subj_id));
637 
638  CNcbiIfstream infile("data/biased.fsa");
639  const bool is_protein(true);
640  CBlastInputSourceConfig iconfig(is_protein);
642  (new CBlastFastaInputSource(infile, iconfig));
643  CRef<CBlastInput> input(new CBlastInput(&*fasta_src));
644  CRef<CScope> scope = CBlastScopeSource(is_protein).NewScope();
645 
646  blast::TSeqLocVector query_seqs = input->GetAllSeqLocs(*scope);
647 
648  const int query_offset[k_num_hsps_start] = { 3, 1, 4, 3, 0, 1 };
649  const int query_end[k_num_hsps_start] = { 236, 232, 236, 235, 226, 233 };
650  const int subject_offset[k_num_hsps_start] =
651  { 1, 1, 6, 6, 12, 22 };
652  const int subject_end[k_num_hsps_start] =
653  { 238, 238, 238, 238, 238, 254 };
654  const int score[k_num_hsps_start] = { 345, 344, 343, 339, 332, 320 };
655  const int query_gapped_start[k_num_hsps_start] =
656  { 32, 194, 9, 8, 104, 9 };
657  const int subject_gapped_start[k_num_hsps_start] =
658  { 30, 200, 11, 11, 116, 30 };
659 
660  // No gaps were found in these alignments. This is freed by the
661  // HSPStream interface
662  BlastHSPList* init_hsp_list =
663  setUpHSPList(k_num_hsps_start,
664  query_offset, query_end,
665  subject_offset, subject_end,
666  query_gapped_start,
667  subject_gapped_start,
668  score);
669 
670  const int query_offset_final[k_num_hsps_end] = { 4, 3, 3, 0, 0};
671  const int query_end_final[k_num_hsps_end] = { 236, 235, 220, 226, 232};
672  const int subject_offset_final[k_num_hsps_end] = { 6, 6, 1, 12, 6};
673  const int subject_end_final[k_num_hsps_end] = { 238, 238, 218, 238, 238};
674  const int score_final[k_num_hsps_end] = { 73, 72, 69, 68, 66};
675  const double evalue_final[k_num_hsps_end] =
676  { 1.26e-05 , 1.7e-5 , 4.0e-5, 5.1e-5, 0.0000775};
677  const int num_idents_final[k_num_hsps_end] = { 87, 85, 81, 84, 85 };
678 
679 
680  BlastHSPList* ending_hsp_list =
681  setUpHSPList(k_num_hsps_end,
682  query_offset_final,
683  query_end_final,
684  subject_offset_final,
685  subject_end_final,
686  query_offset_final,
687  subject_offset_final,
688  score_final,
689  evalue_final,
690  num_idents_final);
691 
692 
693  const Int8 kEffSearchSp = 84660;
694  const bool kSmithWaterman = false;
695 
696  runRedoAlignmentCoreUnitTest(kProgram, query_seqs[0], *ssl,
697  init_hsp_list, ending_hsp_list,
698  kEffSearchSp, eCompositionMatrixAdjust,
699  kSmithWaterman);
700  ending_hsp_list = Blast_HSPListFree(ending_hsp_list);
701  BOOST_REQUIRE(ending_hsp_list == NULL);
702 }
703 
704 BOOST_AUTO_TEST_CASE(testRedoAlignmentWithSW) {
705  const EBlastProgramType kProgram = eBlastTypeBlastp;
706  const int k_num_hsps_start = 3;
707  const int k_num_hsps_end = 5;
708  CSeq_id query_id("gi|3091");
709  CSeq_id subj_id("gi|402871");
710 
711  // It seems that the last hsp in this set was manually constructed and
712  // expected to be dropped (please note that the first 2 hsps we
713  // constructed using blastall (thus filtering on), but the sequence
714  // passed to RedoAlignmentCore was unfiltered (default for blastpgp)
715  const int query_offset[k_num_hsps_start] = { 28, 46, 463 };
716  const int query_end[k_num_hsps_start] = { 485, 331, 488 };
717  const int subject_offset[k_num_hsps_start] = { 36, 327, 320 };
718  const int subject_end[k_num_hsps_start] = { 512, 604, 345 };
719  const int score[k_num_hsps_start] = { 554, 280, 28 };
720  const int query_gapped_start[k_num_hsps_start] = { 431, 186, 480 };
721  const int subject_gapped_start[k_num_hsps_start] = { 458, 458, 337 };
722 
723  // This is freed by the HSPStream interface
724  BlastHSPList* init_hsp_list =
725  setUpHSPList(k_num_hsps_start,
726  query_offset, query_end,
727  subject_offset, subject_end,
728  query_gapped_start,
729  subject_gapped_start,
730  score);
731 
732  const int query_offset_final[k_num_hsps_end] = { 2, 250, 494, 67, 2 };
733  const int query_end_final[k_num_hsps_end] = { 485, 331, 530, 86, 24 };
734  const int subject_offset_final[k_num_hsps_end] = { 9, 523, 261, 585, 570 };
735  const int subject_end_final[k_num_hsps_end] = { 512, 604, 297, 604, 592 };
736  const int score_final[k_num_hsps_end] = { 591, 39, 37, 33, 32 };
737  const double evalue_final[k_num_hsps_end] = { 2.3451e-72, 0.387,
738  0.6692, 1.9988, 2.6256 };
739  const int num_idents_final[k_num_hsps_end] = { 172, 22, 9, 8, 7 };
740 
741  BlastHSPList* ending_hsp_list =
742  setUpHSPList(k_num_hsps_end,
743  query_offset_final,
744  query_end_final,
745  subject_offset_final,
746  subject_end_final,
747  query_offset_final,
748  subject_offset_final,
749  score_final,
750  evalue_final,
751  num_idents_final);
752 
753  const Int8 kEffSearchSp = 500000;
754  const bool kSmithWaterman = true;
755 
756  runRedoAlignmentCoreUnitTest(kProgram, query_id, subj_id,
757  init_hsp_list, ending_hsp_list,
758  kEffSearchSp, eNoCompositionBasedStats,
759  kSmithWaterman);
760  ending_hsp_list = Blast_HSPListFree(ending_hsp_list);
761  BOOST_REQUIRE(ending_hsp_list == NULL);
762 }
763 
764 BOOST_AUTO_TEST_CASE(testRedoAlignmentWithCompBasedStatsAndSW) {
765  const EBlastProgramType kProgram = eBlastTypeBlastp;
766  const int k_num_hsps_start = 3;
767  const int k_num_hsps_end = 3;
768  CSeq_id query_id("gi|3091");
769  CSeq_id subj_id("gi|402871");
770 
771  const int query_offset[k_num_hsps_start] = { 28, 46, 463};
772  const int query_end[k_num_hsps_start] = { 485, 331, 488};
773  const int subject_offset[k_num_hsps_start] = { 36, 327, 320};
774  const int subject_end[k_num_hsps_start] = { 512, 604, 345};
775  const int score[k_num_hsps_start] = { 554, 280, 28};
776  const int query_gapped_start[k_num_hsps_start] = { 431, 186, 480};
777  const int subject_gapped_start[k_num_hsps_start] = { 458, 458, 337};
778 
779  // This is freed by the HSPStream interface
780  BlastHSPList* init_hsp_list =
781  setUpHSPList(k_num_hsps_start,
782  query_offset, query_end,
783  subject_offset, subject_end,
784  query_gapped_start,
785  subject_gapped_start,
786  score);
787 
788  const int query_offset_final[k_num_hsps_end] = { 2, 250, 67 };
789  const int query_end_final[k_num_hsps_end] = { 485, 331, 86};
790  const int subject_offset_final[k_num_hsps_end] = { 9, 523, 585};
791  const int subject_end_final[k_num_hsps_end] = { 512, 604, 604};
792  const int score_final[k_num_hsps_end] = { 510, 34, 31};
793  const double evalue_final[k_num_hsps_end] = {7.0065e-61, 1.349, 3.7944};
794  const int num_idents_final[k_num_hsps_end] = { 171, 22, 8 };
795 
796  BlastHSPList* ending_hsp_list =
797  setUpHSPList(k_num_hsps_end,
798  query_offset_final,
799  query_end_final,
800  subject_offset_final,
801  subject_end_final,
802  query_offset_final,
803  subject_offset_final,
804  score_final,
805  evalue_final,
806  num_idents_final);
807 
808  const Int8 kEffSearchSp = 500000;
809  const bool kSmithWaterman = true;
810 
811  runRedoAlignmentCoreUnitTest(kProgram, query_id, subj_id,
812  init_hsp_list, ending_hsp_list,
813  kEffSearchSp, eCompositionBasedStats,
814  kSmithWaterman);
815 
816  ending_hsp_list = Blast_HSPListFree(ending_hsp_list);
817  BOOST_REQUIRE(ending_hsp_list == NULL);
818 }
819 
820 BOOST_AUTO_TEST_CASE(testPSIRedoAlignmentWithCompBasedStatsAndSW) {
821  const EBlastProgramType kProgram = eBlastTypePsiBlast;
822  const int k_num_hsps_start = 6;
823  const int k_num_hsps_end = 8;
824  CSeq_id query_id("gi|129295");
825  CSeq_id subj_id("gi|7450545");
826 
827  const int query_offset[k_num_hsps_start] =
828  { 24, 99, 16, 84, 6, 223 };
829  const int query_end[k_num_hsps_start] =
830  { 62, 128, 24, 114, 25, 231 };
831  const int subject_offset[k_num_hsps_start] =
832  { 245, 0, 198, 86, 334, 151 };
833  const int subject_end[k_num_hsps_start] =
834  { 287, 29, 206, 119, 353, 159 };
835  const int score[k_num_hsps_start] =
836  { 37, 26, 25, 25, 24, 24 };
837  const int query_gapped_start[k_num_hsps_start] =
838  { 29, 104, 20, 91, 19, 227 };
839  const int subject_gapped_start[k_num_hsps_start] =
840  { 250, 5, 202, 93, 347, 155 };
841 
842 
843  // No gaps were found in these alignments. This is freed by the
844  // HSPStream interface
845  BlastHSPList* init_hsp_list =
846  setUpHSPList(k_num_hsps_start,
847  query_offset, query_end,
848  subject_offset, subject_end,
849  query_gapped_start,
850  subject_gapped_start,
851  score);
852 
853  const int query_offset_final[k_num_hsps_end] =
854  { 24, 140, 126, 10, 137, 198, 18, 137 };
855  const int query_end_final[k_num_hsps_end] =
856  { 30, 171, 205, 35, 157, 208, 31, 152 };
857  const int subject_offset_final[k_num_hsps_end] =
858  { 245, 408, 212, 130, 339, 388, 200, 186 };
859  const int subject_end_final[k_num_hsps_end] =
860  { 251, 439, 287, 155, 359, 398, 210, 201 };
861  const int score_final[k_num_hsps_end] =
862  { 29, 28, 28, 28, 25, 24, 24, 22 };
863  const double evalue_final[k_num_hsps_end] =
864  { 1.361074, 1.837947, 2.118044, 2.153685, 4.198304, 5.529096,
865  6.425098, 8.532644 };
866  const int ident_final[k_num_hsps_end] =
867  { 3, 8, 23, 10, 6, 5, 6, 5};
868 
869  BlastHSPList* ending_hsp_list =
870  setUpHSPList(k_num_hsps_end,
871  query_offset_final,
872  query_end_final,
873  subject_offset_final,
874  subject_end_final,
875  query_offset_final,
876  subject_offset_final,
877  score_final,
878  evalue_final,
879  ident_final);
880 
881 
882  const Int8 kEffSearchSp = 84660;
883  const bool kSmithWaterman = true;
884 
885  runRedoAlignmentCoreUnitTest(kProgram, query_id, subj_id,
886  init_hsp_list, ending_hsp_list,
887  kEffSearchSp, eCompositionBasedStats,
888  kSmithWaterman);
889 
890  ending_hsp_list = Blast_HSPListFree(ending_hsp_list);
891  BOOST_REQUIRE(ending_hsp_list == NULL);
892 }
893 
894 // Tests following changes from Mike Gertz and Alejandro Schaffer (rev 1.14
895 // of blast_kappa.c):
896 //
897 // For secondary alignments with Smith-Waterman on, use the E-value from
898 // the X-drop alignment computed by ALIGN that has more flexibility in
899 // which positions are allowed in the dynamic program.
900 //
901 // Add a sort of alignments for the same query-subject pair because the
902 // use of X-drop alignments occasionally reorders such alignments.
903 BOOST_AUTO_TEST_CASE(testRedoAlignmentUseXdropEvalue) {
904  const EBlastProgramType kProgram = eBlastTypeBlastp;
905  const int k_num_hsps_start = 4;
906  const int k_num_hsps_end = 4;
907  CSeq_id query_id("gi|48100936");
908  CSeq_id subj_id("gi|7301132");
909 
910  const int query_offset[k_num_hsps_start] = { 995, 1004, 995, 973};
911  const int query_end[k_num_hsps_start] = { 1314, 1314, 1403, 1316};
912  const int subject_offset[k_num_hsps_start] = { 61, 36, 61, 106};
913  const int subject_end[k_num_hsps_start] = { 384, 384, 455, 420};
914  const int score[k_num_hsps_start] = { 341, 327, 314, 301};
915  const int query_gapped_start[k_num_hsps_start] = { 1233, 1017, 1310,
916  1228};
917  const int subject_gapped_start[k_num_hsps_start] = { 303, 49, 347, 331};
918 
919  // This is freed by the HSPStream interface
920  BlastHSPList* init_hsp_list =
921  setUpHSPList(k_num_hsps_start,
922  query_offset, query_end,
923  subject_offset, subject_end,
924  query_gapped_start,
925  subject_gapped_start,
926  score);
927  const int query_offset_final[k_num_hsps_end] =
928  { 995, 1261, 1025, 1210};
929  const int query_end_final[k_num_hsps_end] =
930  { 1314, 1341, 1125, 1243};
931  const int subject_offset_final[k_num_hsps_end] =
932  { 61, 1, 387, 17};
933  const int subject_end_final[k_num_hsps_end] =
934  { 384, 115, 482, 50};
935  const int score_final[k_num_hsps_end] =
936  { 323, 78, 69, 60};
937  const double evalue_final[k_num_hsps_end] =
938  { 2.712e-34, 3.6003e-05, 0.00048334, 0.00441};
939  const int num_idents_final[k_num_hsps_end] = { 108, 31, 30, 12 };
940 
941  BlastHSPList* ending_hsp_list =
942  setUpHSPList(k_num_hsps_end,
943  query_offset_final,
944  query_end_final,
945  subject_offset_final,
946  subject_end_final,
947  query_offset_final,
948  subject_offset_final,
949  score_final,
950  evalue_final,
951  num_idents_final);
952 
953  const Int8 kEffSearchSp = 1000*1000;
954  const bool kSmithWaterman = true;
955 
956  const int kHitListSize = 1;
957  const double kEvalueThreshold = 0.005;
958 
959  runRedoAlignmentCoreUnitTest(kProgram, query_id, subj_id,
960  init_hsp_list, ending_hsp_list,
961  kEffSearchSp, eCompositionBasedStats,
962  kSmithWaterman, kEvalueThreshold,
963  kHitListSize);
964 
965  ending_hsp_list = Blast_HSPListFree(ending_hsp_list);
966  BOOST_REQUIRE(ending_hsp_list == NULL);
967 }
968 
969 // N.B.: the absence of a testPSIRedoAlignmentWithSW is due to the fact
970 // that blastpgp reads frequency rations from a checkpoint file and scales
971 // the PSSM and K parameter, which we cannot do unless we support reading
972 // checkpoint files. QA should be added later
973 
974 
975 BOOST_AUTO_TEST_CASE(CheckLowerCaseMatrix)
976 {
977  BOOST_REQUIRE(Blast_FrequencyDataIsAvailable("blosum62") == 1);
978 }
979 
981 
982 /*
983 * ===========================================================================
984 *
985 * $Log: redoalignment-cppunit.cpp,v $
986 * Revision 1.59 2009/04/27 15:15:41 camacho
987 * Bring up to date with latest filtering strategy changes JIRA SB-170
988 *
989 * Revision 1.55 2009/03/13 18:15:23 maning
990 * Use the writer/pipe framework to construct hsp_stream. Removed queue test.
991 *
992 * Revision 1.58 2009/03/13 19:25:56 maning
993 * Previous commit messed up. Roll back again.
994 *
995 * Revision 1.56 2009/03/13 18:45:15 maning
996 * Roll back to previous version.
997 *
998 * Revision 1.54 2008/11/21 21:13:27 madden
999 * tests for num_ident fix
1000 *
1001 * Revision 1.53 2008/10/27 17:00:12 camacho
1002 * Fix include paths to deprecated headers
1003 *
1004 * Revision 1.52 2008/02/13 21:39:12 camacho
1005 * Re-enable choice to sort by score to meet pre-condition of composition-based
1006 * statistics code.
1007 *
1008 * Revision 1.51 2007/11/19 18:48:34 camacho
1009 * Bring up to date with changes in SVN revision 114090
1010 *
1011 * Revision 1.50 2007/11/08 17:09:08 camacho
1012 * Bring up to date with changes in SVN revision 113729
1013 *
1014 * Revision 1.49 2007/10/22 19:16:10 madden
1015 * BlastExtensionOptionsNew has Boolean gapped arg
1016 *
1017 * Revision 1.48 2007/07/27 18:04:34 papadopo
1018 * change signature of HSPListCollector_Init
1019 *
1020 * Revision 1.47 2007/05/08 22:41:31 papadopo
1021 * change signature of RedoAlignmentCore
1022 *
1023 * Revision 1.46 2007/03/22 14:34:44 camacho
1024 * + support for auto-detection of genetic codes
1025 *
1026 * Revision 1.45 2007/03/20 14:54:02 camacho
1027 * changes related to addition of multiple genetic code specification
1028 *
1029 * Revision 1.44 2006/11/21 17:47:50 papadopo
1030 * use enum for lookup table type
1031 *
1032 * Revision 1.43 2006/09/25 19:34:31 madden
1033 * Changed values to reflect the results of computations done with
1034 * more precise frequencies and frequency ratios.
1035 * [from Mike Gertz]
1036 *
1037 * Revision 1.42 2006/06/29 16:25:24 camacho
1038 * Changed BlastHitSavingOptionsNew signature
1039 *
1040 * Revision 1.41 2006/06/05 13:34:05 madden
1041 * Changes to remove [GS]etMatrixPath and use callback instead
1042 *
1043 * Revision 1.40 2006/05/18 16:32:03 papadopo
1044 * change signature of BLAST_CalcEffLengths
1045 *
1046 * Revision 1.39 2006/01/23 19:57:52 camacho
1047 * Allow new varieties of composition based statistics
1048 *
1049 * Revision 1.38 2005/12/16 20:51:50 camacho
1050 * Diffuse the use of CSearchMessage, TQueryMessages, and TSearchMessages
1051 *
1052 * Revision 1.37 2005/12/01 15:13:52 madden
1053 * replaced Kappa_RedoAlignmentCore with Blast_RedoAlignmentCore
1054 *
1055 * Revision 1.36 2005/10/14 13:47:32 camacho
1056 * Fixes to pacify icc compiler
1057 *
1058 * Revision 1.35 2005/06/09 20:37:06 camacho
1059 * Use new private header blast_objmgr_priv.hpp
1060 *
1061 * Revision 1.34 2005/05/24 20:05:17 camacho
1062 * Changed signature of SetupQueries and SetupQueryInfo
1063 *
1064 * Revision 1.33 2005/05/23 15:53:19 dondosha
1065 * Special case for preliminary hitlist size in RPS BLAST - hence no need for 2 extra parameters in SBlastHitsParametersNew
1066 *
1067 * Revision 1.32 2005/05/19 17:10:04 madden
1068 * Adjust an expect value, use BOOST_REQUIRE_CLOSE when appropriate
1069 *
1070 * Revision 1.31 2005/05/16 12:29:15 madden
1071 * use SBlastHitsParameters in Blast_HSPListCollectorInit and Blast_HSPListCollectorInit[MT]
1072 *
1073 * Revision 1.30 2005/04/27 20:08:40 dondosha
1074 * PHI-blast boolean argument has been removed from BlastSetup_ScoreBlkInit
1075 *
1076 * Revision 1.29 2005/04/06 21:27:18 dondosha
1077 * Use EBlastProgramType instead of EProgram in internal functions
1078 *
1079 * Revision 1.28 2005/04/06 16:27:46 camacho
1080 * Fix to previous commit
1081 *
1082 * Revision 1.27 2005/03/31 20:46:44 camacho
1083 * Changes to avoid making CBlastRedoAlignmentTest a friend of CBlastOptions
1084 *
1085 * Revision 1.26 2005/03/31 13:45:58 camacho
1086 * BLAST options API clean-up
1087 *
1088 * Revision 1.25 2005/03/04 17:20:45 bealer
1089 * - Command line option support.
1090 *
1091 * Revision 1.24 2005/02/16 14:52:41 camacho
1092 * Fix memory leak
1093 *
1094 * Revision 1.23 2005/02/14 14:17:17 camacho
1095 * Changes to use SBlastScoreMatrix
1096 *
1097 * Revision 1.22 2005/01/06 15:43:25 camacho
1098 * Make use of modified signature to blast::SetupQueries
1099 *
1100 * Revision 1.21 2004/12/14 21:16:31 dondosha
1101 * Query frame argument added to Blast_HSPInit; renamed all constants according to toolkit convention
1102 *
1103 * Revision 1.20 2004/12/09 15:24:10 dondosha
1104 * BlastSetup_GetScoreBlock renamed to BlastSetup_ScoreBlkInit
1105 *
1106 * Revision 1.19 2004/12/02 16:12:49 bealer
1107 * - Change multiple-arrays to array-of-struct in BlastQueryInfo
1108 *
1109 * Revision 1.18 2004/11/23 21:51:34 camacho
1110 * Update call to Kappa_RedoAlignmentCore as its signature has changed.
1111 * num_ident field of HSP structure is no longer populated in
1112 * Kappa_RedoAlignmentCore.
1113 *
1114 * Revision 1.17 2004/11/17 21:02:01 camacho
1115 * Add error checking to BlastSeqSrc initialization
1116 *
1117 * Revision 1.16 2004/11/09 15:16:26 camacho
1118 * Minor change in auxiliary function
1119 *
1120 * Revision 1.15 2004/11/02 22:08:30 camacho
1121 * Refactored main body of unit test to be reused by multiple tests
1122 *
1123 * Revision 1.14 2004/11/02 18:28:52 madden
1124 * BlastHitSavingParametersNew no longer requires BlastExtensionParameters
1125 *
1126 * Revision 1.13 2004/11/01 18:38:30 madden
1127 * Change call to BLAST_FillHitSavingOptions
1128 *
1129 * Revision 1.12 2004/10/29 14:19:20 camacho
1130 * Make BlastHSPResults allocation function follow naming conventions
1131 *
1132 * Revision 1.11 2004/10/14 17:14:28 madden
1133 * New parameter in BlastHitSavingParametersNew
1134 *
1135 * Revision 1.10 2004/07/19 15:04:22 dondosha
1136 * Renamed multiseq_src to seqsrc_multiseq, seqdb_src to seqsrc_seqdb
1137 *
1138 * Revision 1.9 2004/07/06 15:58:45 dondosha
1139 * Use EBlastProgramType enumeration type for program when calling C functions
1140 *
1141 * Revision 1.8 2004/06/22 16:46:19 camacho
1142 * Changed the blast_type_* definitions for the EBlastProgramType enumeration.
1143 *
1144 * Revision 1.7 2004/06/21 20:29:07 camacho
1145 * Fix memory leaks, others remain
1146 *
1147 * Revision 1.6 2004/06/21 14:23:32 madden
1148 * Add test testRedoAlignmentUseXdropEvalue that tests recent changes by Mike Gertz to kappa.c ported to blast_kappa.c
1149 *
1150 * Revision 1.5 2004/06/21 13:06:51 madden
1151 * Add check for expect value
1152 *
1153 * Revision 1.4 2004/06/21 12:54:49 camacho
1154 * CVS Id tag fix
1155 *
1156 * Revision 1.3 2004/06/08 15:24:34 dondosha
1157 * Use BlastHSPStream interface
1158 *
1159 * Revision 1.2 2004/05/19 17:10:35 madden
1160 * Fix memory leaks
1161 *
1162 * Revision 1.1 2004/05/18 17:17:45 madden
1163 * Tests for composition-based stats and smith-waterman
1164 *
1165 *
1166 *
1167 * ===========================================================================
1168 */
Declares the CBl2Seq (BLAST 2 Sequences) class.
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
Definition: blast_def.h:112
Interface for reading SRA sequences into blast input.
Structures and API used for saving BLAST hits.
BlastHSPResults * Blast_HSPResultsFree(BlastHSPResults *results)
Deallocate memory for BLAST results.
Definition: blast_hits.c:3364
Int2 Blast_HSPInit(Int4 query_start, Int4 query_end, Int4 subject_start, Int4 subject_end, Int4 query_gapped_start, Int4 subject_gapped_start, Int4 query_context, Int2 query_frame, Int2 subject_frame, Int4 score, GapEditScript **gap_edit, BlastHSP **ret_hsp)
Allocates BlastHSP and inits with information from input.
Definition: blast_hits.c:151
BlastHSPList * Blast_HSPListNew(Int4 hsp_max)
Creates HSP list structure with a default size HSP array.
Definition: blast_hits.c:1558
BlastHSPResults * Blast_HSPResultsNew(Int4 num_queries)
Initialize the results structure.
Definition: blast_hits.c:3344
Int2 Blast_HSPListSaveHSP(BlastHSPList *hsp_list, BlastHSP *hsp)
Saves HSP information into a BlastHSPList structure.
Definition: blast_hits.c:1754
BlastHSPList * Blast_HSPListFree(BlastHSPList *hsp_list)
Deallocate memory for an HSP list structure as well as all it's components.
Definition: blast_hits.c:1542
void Blast_HSPListSortByScore(BlastHSPList *hsp_list)
Sort the HSPs in an HSP list by score.
Definition: blast_hits.c:1374
BlastHSPWriter * BlastHSPWriterNew(BlastHSPWriterInfo **writer_info, BlastQueryInfo *query_info, BLAST_SequenceBlk *query)
A generic function to create writer.
Declaration of ADT to save and retrieve lists of HSPs in the BLAST engine.
int BlastHSPStreamWrite(BlastHSPStream *hsp_stream, BlastHSPList **hsp_list)
Invokes the user-specified write function for this BlastHSPStream implementation.
BlastHSPStream * BlastHSPStreamFree(BlastHSPStream *hsp_stream)
Frees the BlastHSPStream structure by invoking the destructor function set by the user-defined constr...
BlastHSPStream * BlastHSPStreamNew(EBlastProgramType program, const BlastExtensionOptions *extn_opts, Boolean sort_on_read, Int4 num_queries, BlastHSPWriter *writer)
Initialize the HSP stream.
Interface for converting sources of sequence data into blast sequence input.
Header file for composition-based statistics.
Int2 Blast_RedoAlignmentCore(EBlastProgramType program_number, BLAST_SequenceBlk *queryBlk, const BlastQueryInfo *query_info, BlastScoreBlk *sbp, BLAST_SequenceBlk *subjectBlk, const BlastSeqSrc *seqSrc, Int4 db_genetic_code, BlastHSPList *thisMatch, BlastHSPStream *hsp_stream, BlastScoringParameters *scoringParams, const BlastExtensionParameters *extendParams, const BlastHitSavingParameters *hitParams, const PSIBlastOptions *psiOptions, BlastHSPResults *results)
Top level routine to recompute alignments for each match found by the gapped BLAST algorithm (single-...
Definition: blast_kappa.c:2942
Definitions which are dependant on the NCBI C++ Object Manager.
PSIBlastOptions * PSIBlastOptionsFree(PSIBlastOptions *psi_options)
Deallocate PSI BLAST options.
#define BLAST_HITLIST_SIZE
Number of database sequences to save hits for.
Int2 BlastEffectiveLengthsOptionsNew(BlastEffectiveLengthsOptions **options)
Allocate memory for BlastEffectiveLengthsOptions* and fill with default values.
Int2 BLAST_FillEffectiveLengthsOptions(BlastEffectiveLengthsOptions *options, Int4 dbseq_num, Int8 db_length, Int8 *searchsp_eff, Int4 num_searchsp)
Fill the non-default values in the BlastEffectiveLengthsOptions structure.
Int2 BlastScoringOptionsNew(EBlastProgramType program, BlastScoringOptions **options)
Allocate memory for BlastScoringOptions and fill with default values.
BlastEffectiveLengthsOptions * BlastEffectiveLengthsOptionsFree(BlastEffectiveLengthsOptions *options)
Deallocate memory for BlastEffectiveLengthsOptions*.
BlastExtensionOptions * BlastExtensionOptionsFree(BlastExtensionOptions *options)
Deallocate memory for BlastExtensionOptions.
#define BLAST_EXPECT_VALUE
Default parameters for saving hits.
Int2 BlastHitSavingOptionsNew(EBlastProgramType program, BlastHitSavingOptions **options, Boolean gapped_calculation)
Allocate memory for BlastHitSavingOptions.
@ eSmithWatermanTbck
Smith-waterman finds optimal scores, then ALIGN_EX to find alignment.
Int2 PSIBlastOptionsNew(PSIBlastOptions **psi_options)
Initialize default options for PSI BLAST.
#define BLAST_GENETIC_CODE
Default genetic code for query and/or database.
Int2 BlastExtensionOptionsNew(EBlastProgramType program, BlastExtensionOptions **options, Boolean gapped)
Allocate memory for BlastExtensionOptions and fill with default values.
Declares the CBlastOptionsHandle and CBlastOptionsFactory classes.
BlastHitSavingParameters * BlastHitSavingParametersFree(BlastHitSavingParameters *parameters)
Deallocate memory for BlastHitSavingOptions*.
BlastEffectiveLengthsParameters * BlastEffectiveLengthsParametersFree(BlastEffectiveLengthsParameters *parameters)
Deallocate memory for BlastEffectiveLengthsParameters*.
Int2 BlastExtensionParametersNew(EBlastProgramType blast_program, const BlastExtensionOptions *options, BlastScoreBlk *sbp, BlastQueryInfo *query_info, BlastExtensionParameters **parameters)
Calculate the raw values for the X-dropoff parameters.
BlastExtensionParameters * BlastExtensionParametersFree(BlastExtensionParameters *parameters)
Deallocate memory for BlastExtensionParameters.
Int2 BlastScoringParametersNew(const BlastScoringOptions *options, BlastScoreBlk *sbp, BlastScoringParameters **parameters)
Calculate scaled cutoff scores and gap penalties.
Int2 BlastHitSavingParametersNew(EBlastProgramType program_number, const BlastHitSavingOptions *options, const BlastScoreBlk *sbp, const BlastQueryInfo *query_info, Int4 avg_subject_length, Int4 compositionBasedStats, BlastHitSavingParameters **parameters)
Allocate memory and initialize the BlastHitSavingParameters structure.
BlastScoringParameters * BlastScoringParametersFree(BlastScoringParameters *parameters)
Deallocate memory for BlastScoringParameters.
Int2 BlastEffectiveLengthsParametersNew(const BlastEffectiveLengthsOptions *options, Int8 db_length, Int4 num_seqs, BlastEffectiveLengthsParameters **parameters)
Allocate memory for BlastEffectiveLengthsParameters.
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm.
Definition: blast_program.h:72
@ eBlastTypePsiBlast
Definition: blast_program.h:82
@ eBlastTypeBlastp
Definition: blast_program.h:73
Declares the CBlastProteinOptionsHandle class.
void ** _PSIAllocateMatrix(unsigned int ncols, unsigned int nrows, unsigned int data_type_sz)
Generic 2 dimensional matrix allocator.
void ** _PSIDeallocateMatrix(void **matrix, unsigned int ncols)
Generic 2 dimensional matrix deallocator.
void _PSICopyMatrix_int(int **dest, int **src, unsigned int ncols, unsigned int nrows)
Copies src matrix into dest matrix, both of which must be int matrices with dimensions ncols by nrows...
Private interface for Position Iterated BLAST API, contains the PSSM generation engine.
Utilities initialize/setup BLAST.
Int2 BlastSetup_ScoreBlkInit(BLAST_SequenceBlk *query_blk, const BlastQueryInfo *query_info, const BlastScoringOptions *scoring_options, EBlastProgramType program_number, BlastScoreBlk **sbpp, double scale_factor, Blast_Message **blast_message, GET_MATRIX_PATH get_path)
Initializes the score block structure.
Definition: blast_setup.c:456
Int2 BLAST_CalcEffLengths(EBlastProgramType program_number, const BlastScoringOptions *scoring_options, const BlastEffectiveLengthsParameters *eff_len_params, const BlastScoreBlk *sbp, BlastQueryInfo *query_info, Blast_Message **blast_message)
Function to calculate effective query length and db length as well as effective search space.
Definition: blast_setup.c:699
BlastScoreBlk * BlastScoreBlkFree(BlastScoreBlk *sbp)
Deallocates BlastScoreBlk as well as all associated structures.
Definition: blast_stat.c:965
SPsiBlastScoreMatrix * SPsiBlastScoreMatrixNew(size_t ncols)
Allocates a new SPsiBlastScoreMatrix structure of dimensions ncols by BLASTAA_SIZE.
Definition: blast_stat.c:805
EProgram
This enumeration is to evolve into a task/program specific list that specifies sets of default parame...
Definition: blast_types.hpp:56
Int2 BlastNumber2Program(EBlastProgramType number, char **program)
Return string name for program given a number.
Definition: blast_util.c:312
BOOST_AUTO_TEST_SUITE_END() static int s_GetSegmentFlags(const CBioseq &bioseq)
Wrapper class for BLAST_SequenceBlk .
Definition: blast_aux.hpp:309
Runs the BLAST algorithm between 2 sequences.
Definition: bl2seq.hpp:58
Wrapper class for BlastExtensionOptions .
Definition: blast_aux.hpp:323
Class representing a text file containing sequences in fasta format.
Wrapper class for BlastHitSavingOptions .
Definition: blast_aux.hpp:326
Class that centralizes the configuration data for sequences to be converted.
Definition: blast_input.hpp:48
Generalized converter from an abstract source of biological sequence data to collections of blast inp...
Encapsulates ALL the BLAST algorithm's options.
Wrapper class for BlastQueryInfo .
Definition: blast_aux.hpp:311
Class whose purpose is to create CScope objects which have data loaders added with different prioriti...
CRef< objects::CScope > NewScope()
Create a new, properly configured CScope.
Wrapper class for BlastScoringOptions .
Definition: blast_aux.hpp:334
Wrapper class for BlastSeqSrc .
Definition: blast_aux.hpp:350
static CTestObjMgr & Instance()
Definition: test_objmgr.cpp:69
typedef for the messages for an entire BLAST search, which could be comprised of multiple query seque...
Constants used in compositional score matrix adjustment.
ECompoAdjustModes
An collection of constants that specify all permissible modes of composition adjustment.
@ eCompositionBasedStats
Composition-based statistics as in NAR 29:2994-3005, 2001.
@ eNoCompositionBasedStats
Don't use composition based statistics.
@ eCompositionMatrixAdjust
Composition-based score adjustment as in Bioinformatics 21:902-911, 2005, conditioned on sequence pro...
const CBlastOptionsHandle & GetOptionsHandle() const
Retrieve the options handle.
Definition: bl2seq.hpp:285
#define BLASTAA_SIZE
Size of aminoacid alphabet.
void SetupQueries(TSeqLocVector &queries, BlastQueryInfo *qinfo, BLAST_SequenceBlk **seqblk, EBlastProgramType prog, objects::ENa_strand strand_opt, TSearchMessages &messages)
Populates BLAST_SequenceBlk with sequence data for use in CORE BLAST.
objects::ENa_strand GetStrandOption() const
BlastSeqSrc * MultiSeqBlastSeqSrcInit(TSeqLocVector &seq_vector, EBlastProgramType program, bool dbscan_mode=false)
Initialize the sequence source structure.
EBlastProgramType GetProgramType() const
Returns the CORE BLAST notion of program type.
const CBlastOptions & GetOptions() const
Return the object which this object is a handle for.
char * BlastFindMatrixPath(const char *matrix_name, Boolean is_prot)
Returns the path to a specified matrix.
const TSeqLocVector & GetQueries() const
Retrieve a vector of query sequences.
Definition: bl2seq.hpp:244
void SetupQueryInfo(TSeqLocVector &queries, EBlastProgramType prog, objects::ENa_strand strand_opt, BlastQueryInfo **qinfo)
Allocates the query information structure and fills the context offsets, in case of multiple queries,...
EProgram ProgramNameToEnum(const std::string &program_name)
Map a string into an element of the ncbi::blast::EProgram enumeration (except eBlastProgramMax).
Definition: blast_aux.cpp:757
const TSeqLocVector & GetSubjects() const
Retrieve a vector of subject sequences.
Definition: bl2seq.hpp:272
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NULL
Definition: ncbistd.hpp:225
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
Implementation of a number of BlastHSPWriters to save hits from a BLAST search, and subsequently retu...
BlastHSPCollectorParams * BlastHSPCollectorParamsNew(const BlastHitSavingOptions *hit_options, Int4 compositionBasedStats, Boolean gapped_calculation)
Sets up parameter set for use by collector.
BlastHSPWriterInfo * BlastHSPCollectorInfoNew(BlastHSPCollectorParams *params)
WriterInfo to create a default writer: the collecter.
static int input()
int i
Definitions used to get joint probabilities for a scoring matrix.
int Blast_FrequencyDataIsAvailable(const char *matrix_name)
Retrieve the background letter probabilities implicitly used in constructing the score matrix matrix_...
static char * prog
Definition: mdb_load.c:33
void CheckForBlastSeqSrcErrors(const BlastSeqSrc *seqsrc)
Magic spell ;-) needed for some weird compilers... very empiric.
#define fabs(v)
Definition: ncbi_dispd.c:46
#define FALSE
bool replacment for C indicating false.
Definition: ncbi_std.h:101
Defines: CTimeFormat - storage class for time format.
std::istream & in(std::istream &in_, double &x_)
The Object manager core.
Utilities to develop and debug unit tests for BLAST.
BOOST_AUTO_TEST_CASE(testRedoAlignmentWithCompBasedStats)
Implementation of the BlastSeqSrc interface for a vector of sequence locations.
vector< SSeqLoc > TSeqLocVector
Vector of sequence locations.
Definition: sseqloc.hpp:129
Int8 eff_searchsp
Effective search space for this context.
Options for setting up effective lengths and search spaces.
Parameters for setting up effective lengths and search spaces.
Options used for gapped extension These include: a.
EBlastTbackExt eTbackExt
type of traceback extension.
Int4 compositionBasedStats
mode of compositional adjustment to use; if zero then compositional adjustment is not used
Computed values used as parameters for gapped alignments.
The structure to hold all HSPs for a given sequence after the gapped alignment.
Definition: blast_hits.h:153
Int4 hspcnt
Number of HSPs saved.
Definition: blast_hits.h:158
BlastHSP ** hsp_array
Array of pointers to individual HSPs.
Definition: blast_hits.h:157
The structure to contain all BLAST results, for multiple queries.
Definition: blast_hits.h:183
BlastHitList ** hitlist_array
Array of results for individual query sequences.
Definition: blast_hits.h:185
Int4 num_queries
Number of query sequences.
Definition: blast_hits.h:184
Default implementation of BlastHSPStream.
A wrap of data structure used to create a writer.
ADT definition of BlastHSPWriter.
Structure holding all information about an HSP.
Definition: blast_hits.h:126
double evalue
This HSP's e-value.
Definition: blast_hits.h:130
Int4 num_ident
Number of identical base pairs in this HSP.
Definition: blast_hits.h:128
BlastSeg query
Query sequence info.
Definition: blast_hits.h:131
double bit_score
Bit score, calculated from score.
Definition: blast_hits.h:129
BlastSeg subject
Subject sequence info.
Definition: blast_hits.h:132
Int4 score
This HSP's raw score.
Definition: blast_hits.h:127
BlastHSPList ** hsplist_array
Array of HSP lists for individual database hits.
Definition: blast_hits.h:176
Int4 hsplist_count
Filled size of the HSP lists array.
Definition: blast_hits.h:170
double expect_value
The expect value cut-off threshold for an HSP, or a combined hit if sum statistics is used.
Int4 hitlist_size
Maximal number of database sequences to return results for.
Parameter block that contains a pointer to BlastHitSavingOptions and the values derived from it.
BlastContextInfo * contexts
Information per context.
int num_queries
Number of query sequences.
Structure used for scoring calculations.
Definition: blast_stat.h:177
Blast_KarlinBlk ** kbp
Karlin-Altschul parameters.
Definition: blast_stat.h:207
Blast_KarlinBlk ** kbp_psi
K-A parameters for position-based alignments.
Definition: blast_stat.h:213
Blast_KarlinBlk ** kbp_gap
K-A parameters for gapped alignments.
Definition: blast_stat.h:208
SPsiBlastScoreMatrix * psi_matrix
PSSM and associated data.
Definition: blast_stat.h:186
Blast_KarlinBlk ** kbp_gap_psi
K-A parameters for psi alignments.
Definition: blast_stat.h:215
Boolean gapped_calculation
gap-free search if FALSE
Scoring parameters block Contains scoring-related information that is actually used for the blast sea...
Int4 end
End of hsp.
Definition: blast_hits.h:99
Int4 offset
Start of hsp.
Definition: blast_hits.h:98
Structure to hold the a message from the core of the BLAST engine.
Definition: blast_message.h:70
static int ** loadPssmFromFile(const string &filename, unsigned int query_length)
static BlastHSPList * setUpHSPList(int num_hsps, const int query_offset[], const int query_end[], const int subject_offset[], const int subject_end[], const int query_gapped_start[], const int subject_gapped_start[], const int score[], const double evalue[]=NULL, const int num_ident[]=NULL)
static void runRedoAlignmentCoreUnitTest(EBlastProgramType program, CSeq_id &qid, CSeq_id &sid, BlastHSPList *init_hsp_list, const BlastHSPList *ending_hsp_list, Int8 effective_searchsp, ECompoAdjustModes compositonBasedStatsMode, bool doSmithWaterman, double evalue_threshold=BLAST_EXPECT_VALUE, int hit_list_size=BLAST_HITLIST_SIZE)
static void setupPositionBasedBlastScoreBlk(BlastScoreBlk *sbp, unsigned int qlen)
Options used in protein BLAST only (PSI, PHI, RPS and translated BLAST) Some of these possibly should...
int ** data
actual scoring matrix data, stored in row-major form
Definition: blast_stat.h:140
SBlastScoreMatrix * pssm
position-specific score matrix
Definition: blast_stat.h:150
Structure to represent a single sequence to be fed to BLAST.
Definition: sseqloc.hpp:47
CConstRef< objects::CSeq_loc > seqloc
Seq-loc describing the sequence to use as query/subject to BLAST The types of Seq-loc currently suppo...
Definition: sseqloc.hpp:50
CRef< objects::CScope > scope
Scope where the sequence referenced can be found by the toolkit's object manager.
Definition: sseqloc.hpp:54
Utility stuff for more convenient using of Boost.Test library.
Modified on Wed Jun 12 11:16:48 2024 by modify_doxy.py rev. 669887