NCBI C++ ToolKit
blastengine_unit_test.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blastengine_unit_test.cpp 95563 2021-11-26 14:51:36Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Tom Madden
27 *
28 * File Description:
29 * Unit test module to test the code in blast_engine.cpp
30 *
31 * ===========================================================================
32 */
33 #include <ncbi_pch.hpp>
34 #include <corelib/test_boost.hpp>
35 
36 #include <corelib/ncbitime.hpp>
38 #include <objmgr/scope.hpp>
40 #include <objmgr/util/sequence.hpp>
41 
49 #include <serial/serial.hpp>
50 #include <serial/iterator.hpp>
51 #include <serial/objostr.hpp>
52 
60 #include <blast_objmgr_priv.hpp>
61 
69 
71 
73 
74 #include "test_objmgr.hpp"
75 #include "blast_test_util.hpp"
76 
77 using namespace std;
78 using namespace ncbi;
79 using namespace ncbi::objects;
80 using namespace ncbi::blast;
81 
83 
86 
88  m_vQuery.clear();
89  m_vSubject.clear();
90  }
91 
92  void setupQueryAndSubject(TGi query_gi, TGi subject_gi)
93  {
94  CRef<CSeq_loc> query_loc(new CSeq_loc());
95  query_loc->SetWhole().SetGi(query_gi);
96  CScope* query_scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
97  query_scope->AddDefaults();
98  m_vQuery.push_back(SSeqLoc(query_loc, query_scope));
99 
100  CRef<CSeq_loc> subject_loc(new CSeq_loc());
101  subject_loc->SetWhole().SetGi(subject_gi);
102  CScope* subject_scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
103  subject_scope->AddDefaults();
104  m_vSubject.push_back(SSeqLoc(subject_loc, subject_scope));
105  }
106 };
107 
108 BOOST_FIXTURE_TEST_SUITE(blastengine, BlastEngineTestFixture)
109 
111 {
112  BlastUngappedStats* ungapped_stats =
113  diagnostics->ungapped_stat;
114  BlastGappedStats* gapped_stats =
115  diagnostics->gapped_stat;
116 
117  BOOST_REQUIRE_EQUAL(22670293, (int)ungapped_stats->lookup_hits);
118  BOOST_REQUIRE_EQUAL(296326, ungapped_stats->init_extends);
119  BOOST_REQUIRE_EQUAL(1258, ungapped_stats->good_init_extends);
120  BOOST_REQUIRE_EQUAL(1254, gapped_stats->extensions);
121  BOOST_REQUIRE_EQUAL(23, gapped_stats->good_extensions);
122 }
123 
125 {
126  BlastUngappedStats* ungapped_stats =
127  diagnostics->ungapped_stat;
128  BlastGappedStats* gapped_stats =
129  diagnostics->gapped_stat;
130 
131  BOOST_REQUIRE_EQUAL(1152, (int)ungapped_stats->lookup_hits);
132  BOOST_REQUIRE_EQUAL(24, ungapped_stats->init_extends);
133  BOOST_REQUIRE_EQUAL(9, ungapped_stats->good_init_extends);
134  BOOST_REQUIRE_EQUAL(8, gapped_stats->extensions);
135  BOOST_REQUIRE_EQUAL(8, gapped_stats->good_extensions);
136 }
137 
138 
139 
140 // This test has a very long (26 mb) subject sequence that is broken into
141 // chunks in BLAST_SearchEngineCore that are no longer than 5 meg. This
142 // test was written to verify a fix for long sequences. Only preliminary
143 // stage of the BLAST search is performed. Also tests that all diagnostic
144 // hit counts are set correctly.
145 // CBlastPrelimSearch class is used with a BlastSeqSrc constructed from a
146 // single sequence. This can be done, because CBlastPrelimSearch requires
147 // a database-based BlastSeqSrc only for the Seq-align creation.
148 BOOST_AUTO_TEST_CASE(testTBLASTNLongMatchBlastEngine) {
149  const Int4 kNumHspsEnd=23;
150  const Int8 kEffectiveSearchSpace = 1050668186940LL;
151  const TGi kQueryGi = GI_CONST(9790067);
152  const TGi kSubjectGi = GI_CONST(30698605);
153  const EBlastProgramType kProgramType = eBlastTypeTblastn;
154  const EProgram kProgram = eTblastn;
155 
156  CRef<CBlastOptionsHandle> opts_handle(
157  CBlastOptionsFactory::Create(kProgram));
158 
159  setupQueryAndSubject(kQueryGi, kSubjectGi);
160 
162  opts_handle->SetFilterString("F");
163  CRef<CBlastOptions> options(&opts_handle->SetOptions());
164 
165  BlastSeqSrc* seq_src = MultiSeqBlastSeqSrcInit(m_vSubject, kProgramType);
166  CRef<IQueryFactory> query_factory(new CObjMgr_QueryFactory(m_vQuery));
167  CBlastPrelimSearch prelim_search(query_factory, options, seq_src);
168  CRef<SInternalData> id(prelim_search.Run());
169 
170  testLongMatchDiagnostics(id->m_Diagnostics->GetPointer());
171 
172  const int kQueryOffsetFinal[kNumHspsEnd] =
173  { 407, 486, 421, 569, 265, 320, 266, 321, 727, 659,
174  92, 1, 1, 727, 422, 216, 167, 825, 167, 831, 216, 369, 49 };
175  const int kQueryLengthFinal[kNumHspsEnd] =
176  { 164, 85, 62, 67, 58, 74, 56, 69, 56, 66, 147, 69,
177  73, 61, 40, 26, 35, 54, 35, 48, 21, 69, 22 };
178  const int kScoreFinal[kNumHspsEnd] =
179  { 368, 199, 160, 104, 99, 95, 94, 92, 94, 89, 108,
180  101, 97, 95, 89, 86, 84, 84, 83, 79, 75, 74, 74};
181  const double kEvalueFinal[kNumHspsEnd] =
182  {1.84467e-35, 4.47098e-34, 4.47098e-34, 4.47098e-34,
183  4.23245e-08, 4.23245e-08, 3.29958e-07, 3.29958e-07,
184  7.11395e-07, 7.11395e-07, 8.64076e-05, 0.000570668,
185  0.001678, 0.00287725, 0.0145032, 0.0325588,
186  0.0558201, 0.0558201, 0.0730883, 0.214807, 0.631249,
187  0.826482, 0.826482 };
188 
189  CBlastHSPResults results
190  (prelim_search.ComputeBlastHSPResults
191  (id->m_HspStream->GetPointer()));
192 
193  BOOST_REQUIRE_EQUAL(1, results->num_queries);
194  BOOST_REQUIRE_EQUAL(1, results->hitlist_array[0]->hsplist_count);
195  BlastHSPList* hsplist = results->hitlist_array[0]->hsplist_array[0];
196 
197  BOOST_REQUIRE_EQUAL(kNumHspsEnd, hsplist->hspcnt);
198 
199  // Check that HSPs are sorted by score; then sort HSPs by e-value,
200  // to assure the proper order for the following assertions
201  BOOST_REQUIRE(Blast_HSPListIsSortedByScore(hsplist));
202  Blast_HSPListSortByEvalue(hsplist);
203 
204  for (int index=0; index<kNumHspsEnd; index++) {
205  BlastHSP* tmp_hsp = hsplist->hsp_array[index];
206  BOOST_REQUIRE_EQUAL(kQueryOffsetFinal[index], tmp_hsp->query.offset);
207  BOOST_REQUIRE_EQUAL(kQueryLengthFinal[index],
208  tmp_hsp->query.end - tmp_hsp->query.offset);
209  BOOST_REQUIRE_EQUAL(kScoreFinal[index], tmp_hsp->score);
210  BOOST_REQUIRE(fabs((kEvalueFinal[index]-tmp_hsp->evalue) /
211  kEvalueFinal[index]) < 0.001);
212  }
213 
214  BlastSeqSrcFree(seq_src);
215 }
216 
217 // Tests a tblastn search with a short subject sequence. Only preliminary
218 // stage of the BLAST search is performed. Checks obtained HSPs and the
219 // diagnostic hit counts.
220 BOOST_AUTO_TEST_CASE(testTBLASTNShortMatchBlastEngine) {
221  const Int4 kNumHspsEnd=8;
222  const Int8 kEffectiveSearchSpace = 1050668186940LL;
223  const TGi kQueryGi = GI_CONST(9790067);
224  const TGi kSubjectGi = GI_CONST(38547463);
225  const EBlastProgramType kProgramType = eBlastTypeTblastn;
226  const EProgram kProgram = eTblastn;
227 
228  setupQueryAndSubject(kQueryGi, kSubjectGi);
229 
230  CRef<CBlastOptionsHandle> opts_handle(
231  CBlastOptionsFactory::Create(kProgram));
232 
234  opts_handle->SetFilterString("F");
235  CRef<CBlastOptions> options(&opts_handle->SetOptions());
236 
237  BlastSeqSrc* seq_src = MultiSeqBlastSeqSrcInit(m_vSubject, kProgramType);
238  CRef<IQueryFactory> query_factory(new CObjMgr_QueryFactory(m_vQuery));
239  CBlastPrelimSearch prelim_search(query_factory, options, seq_src);
240  CRef<SInternalData> id(prelim_search.Run());
241 
242 
243  testShortMatchDiagnostics(id->m_Diagnostics->GetPointer());
244 
245  const int kQueryOffsetFinal[kNumHspsEnd] = { 98, 425, 320, 340, 823, 675, 247, 103};
246  const int kQueryLengthFinal[kNumHspsEnd] = { 223,211, 35, 13, 46, 19, 25, 24};
247  const int kScoreFinal[kNumHspsEnd] = {1138, 173, 72, 46, 40, 36, 32, 30};
248  const double kEvalueFinal[kNumHspsEnd] =
249  {2.52769e-153, 4.98722e-18, 2.4525e-05, 0.0352845, 0.187202, 0.568557, 1.72476, 3.0028};
250 
251  CBlastHSPResults results
252  (prelim_search.ComputeBlastHSPResults
253  (id->m_HspStream->GetPointer()));
254 
255  BOOST_REQUIRE_EQUAL(1, results->num_queries);
256  BOOST_REQUIRE_EQUAL(1, results->hitlist_array[0]->hsplist_count);
257  BlastHSPList* hsplist = results->hitlist_array[0]->hsplist_array[0];
258 
259  BOOST_REQUIRE_EQUAL(kNumHspsEnd, hsplist->hspcnt);
260 
261  // Check that HSPs are sorted by score; then sort HSPs by e-value,
262  // to assure the proper order for the following assertions
263  BOOST_REQUIRE(Blast_HSPListIsSortedByScore(hsplist));
264  Blast_HSPListSortByEvalue(hsplist);
265 
266  for (int index=0; index<kNumHspsEnd; index++)
267  {
268  BlastHSP* tmp_hsp = hsplist->hsp_array[index];
269  BOOST_REQUIRE_EQUAL(kQueryOffsetFinal[index],
270  tmp_hsp->query.offset);
271  BOOST_REQUIRE_EQUAL(kQueryLengthFinal[index],
272  tmp_hsp->query.end - tmp_hsp->query.offset);
273  BOOST_REQUIRE_EQUAL(kScoreFinal[index], tmp_hsp->score);
274  BOOST_REQUIRE(fabs(kEvalueFinal[index]-tmp_hsp->evalue) < 1.0e-10 ||
275  fabs((kEvalueFinal[index]-tmp_hsp->evalue) /
276  kEvalueFinal[index]) < 0.01);
277  }
278  BlastSeqSrcFree(seq_src);
279 }
280 
281 // Human against human sequence comparison which includes some repeats
282 // and hence has a large number of hits, even with an increased word size.
283 BOOST_AUTO_TEST_CASE(testBlastnWithLargeWordSize)
284 {
285  const TGi kQueryGi = GI_CONST(186279); // Short human sequence with repeats
286  const TGi kSubjectGi = GI_CONST(29791382); // Contig from human chromosome 1
287  const int kNumHsps = 330;
288  const EBlastProgramType kProgramType = eBlastTypeBlastn;
289 
290  setupQueryAndSubject(kQueryGi, kSubjectGi);
291 
292  BlastSeqSrc* seq_src = MultiSeqBlastSeqSrcInit(m_vSubject, kProgramType);
293 
294  CBlastNucleotideOptionsHandle opts_handle;
295  opts_handle.SetWordSize(35);
296  CRef<CBlastOptions> options(&opts_handle.SetOptions());
297 
298  CRef<IQueryFactory> query_factory(new CObjMgr_QueryFactory(m_vQuery));
299  CBlastPrelimSearch prelim_search(query_factory, options, seq_src);
300  CRef<SInternalData> id(prelim_search.Run());
301 
302  CBlastHSPResults results
303  (prelim_search.ComputeBlastHSPResults
304  (id->m_HspStream->GetPointer()));
305 
306  // Check results
307  BOOST_REQUIRE_EQUAL(1, results->num_queries);
308  BOOST_REQUIRE_EQUAL(1, results->hitlist_array[0]->hsplist_count);
309  BlastHSPList* hsplist = results->hitlist_array[0]->hsplist_array[0];
310 
311  BOOST_REQUIRE_EQUAL(kNumHsps, hsplist->hspcnt);
312  BlastSeqSrcFree(seq_src);
313 }
314 
315 // Same comparison as in previous test, but with repeats filtering;
316 // only 1 HSP is left.
317 BOOST_AUTO_TEST_CASE(testBlastnWithRepeatsFiltering)
318 {
319  const TGi kQueryGi = GI_CONST(186279); // Short human sequence with repeats
320  const TGi kSubjectGi = GI_CONST(29791382); // Contig from human chromosome 1
321  const int kNumHsps = 3;
322  const int kMaskedLength = 389;
323  const EBlastProgramType kProgramType = eBlastTypeBlastn;
324 
325  setupQueryAndSubject(kQueryGi, kSubjectGi);
326 
327  CBlastNucleotideOptionsHandle nucl_handle;
328  nucl_handle.SetTraditionalBlastnDefaults();
329  nucl_handle.SetEvalueThreshold(1);
330  nucl_handle.SetRepeatFiltering(true);
331 
332  Blast_FindRepeatFilterLoc(m_vQuery, &nucl_handle);
333  CRef<CBlastOptions> options(&nucl_handle.SetOptions());
334 
335  Uint4 masked_length = m_vQuery[0].mask->GetPacked_int().GetLength();
336  BOOST_REQUIRE_EQUAL(kMaskedLength, (int) masked_length);
337 
338  BlastSeqSrc* seq_src = MultiSeqBlastSeqSrcInit(m_vSubject, kProgramType);
339  CRef<IQueryFactory> query_factory(new CObjMgr_QueryFactory(m_vQuery));
340  CBlastPrelimSearch prelim_search(query_factory, options, seq_src);
341  CRef<SInternalData> id(prelim_search.Run());
342 
343  CBlastHSPResults results
344  (prelim_search.ComputeBlastHSPResults
345  (id->m_HspStream->GetPointer()));
346 
347  // Check results
348  BOOST_REQUIRE_EQUAL(1, results->num_queries);
349  BOOST_REQUIRE_EQUAL(1, results->hitlist_array[0]->hsplist_count);
350  BlastHSPList* hsplist = results->hitlist_array[0]->hsplist_array[0];
351 
352  BOOST_REQUIRE_EQUAL(kNumHsps, hsplist->hspcnt);
353  BlastSeqSrcFree(seq_src);
354 }
355 
356 BOOST_AUTO_TEST_CASE(testDiscMegaBlastPartialRun)
357 {
358  const TGi kQueryGi = GI_CONST(14702146);
359  const string kDbName("data/seqn");
360  const size_t kNumHits = 2;
361  const TGi kGis[kNumHits] = { GI_CONST(46071158), GI_CONST(46072400) };
362  const int kScores[kNumHits] = { 1024, 944 };
363  const int kNumIdent[kNumHits] = { 458, 423 };
364 
365  CRef<CSeq_loc> query_loc(new CSeq_loc());
366  query_loc->SetWhole().SetGi(kQueryGi);
367  CScope* query_scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
368  query_scope->AddDefaults();
369  m_vQuery.push_back(SSeqLoc(query_loc, query_scope));
371 
372  CDiscNucleotideOptionsHandle opts_handle;
373 
374  opts_handle.SetWindowSize(40);
375  opts_handle.SetMatchReward(4);
376  opts_handle.SetMismatchPenalty(-5);
377  opts_handle.SetGapOpeningCost(5);
378  opts_handle.SetGapExtensionCost(5);
379  opts_handle.SetPercentIdentity(70);
380  opts_handle.SetMaskAtHash(false);
381 
382  CRef<CBlastOptionsHandle> options(&opts_handle);
383  CRef<IQueryFactory> query_factory(new CObjMgr_QueryFactory(m_vQuery));
384  CLocalBlast blaster(query_factory, options, dbinfo);
385  CSearchResultSet results = *blaster.Run();
386 
387  // Check results
388  BOOST_REQUIRE_EQUAL((int)1, (int)results.GetNumResults());
389  CConstRef<CSeq_align_set> alignment = results[0].GetSeqAlign();
390  BOOST_REQUIRE_EQUAL(kNumHits, alignment->Get().size());
391 
392  CRef<CSeq_align> first_hit = alignment->Get().front();
393  CRef<CSeq_align> second_hit = alignment->Get().back();
394 
395  int score;
396  BOOST_REQUIRE(first_hit->GetNamedScore("score", score));
397  BOOST_REQUIRE_EQUAL(kScores[0], score);
398  BOOST_REQUIRE(second_hit->GetNamedScore("score", score));
399  BOOST_REQUIRE_EQUAL(kScores[1], score);
400 
401  int num_ident;
402  BOOST_REQUIRE(first_hit->GetNamedScore("num_ident", num_ident));
403  BOOST_REQUIRE_EQUAL(kNumIdent[0], num_ident);
404  BOOST_REQUIRE(second_hit->GetNamedScore("num_ident", num_ident));
405  BOOST_REQUIRE_EQUAL(kNumIdent[1], num_ident);
406 
408  BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, first_hit->GetSeq_id(1).Which());
409  BOOST_REQUIRE_EQUAL(kGis[0], first_hit->GetSeq_id(1).GetGi());
410  BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, second_hit->GetSeq_id(1).Which());
411  BOOST_REQUIRE_EQUAL(kGis[1], second_hit->GetSeq_id(1).GetGi());
412  }
413  else {
414  BOOST_REQUIRE_EQUAL(CSeq_id::e_Ddbj, first_hit->GetSeq_id(1).Which());
415  BOOST_REQUIRE_EQUAL("BP722565", first_hit->GetSeq_id(1).GetDdbj().GetAccession());
416  BOOST_REQUIRE_EQUAL(1, first_hit->GetSeq_id(1).GetDdbj().GetVersion());
417  BOOST_REQUIRE_EQUAL(CSeq_id::e_Ddbj, second_hit->GetSeq_id(1).Which());
418  BOOST_REQUIRE_EQUAL("BP723807", second_hit->GetSeq_id(1).GetDdbj().GetAccession());
419  BOOST_REQUIRE_EQUAL(1, second_hit->GetSeq_id(1).GetDdbj().GetVersion());
420  }
421 }
422 
423 BOOST_AUTO_TEST_CASE(testBlastpPrelimSearch)
424 {
425  const string kDbName("data/seqp");
426  const TGi kQueryGi1 = GI_CONST(21282798);
427  const TGi kQueryGi2 = GI_CONST(129295);
428  const int kNumHits = 31;
429  const int kNumHitsToCheck = 3;
430  const int kIndices[kNumHitsToCheck] = { 1, 4, 8 };
431  const int kScores[kNumHitsToCheck] = { 519, 56, 54 };
432  const int kOids[kNumHitsToCheck] = { 74, 971, 45 };
433  const int kQueryLengths[kNumHitsToCheck] = { 297, 46, 63 };
434  const int kSubjectLengths[kNumHitsToCheck] = { 298, 48, 55 };
435  const EProgram kProgram = eBlastp;
436 
437  CRef<CSeq_loc> query_loc1(new CSeq_loc());
438  query_loc1->SetWhole().SetGi(kQueryGi1);
439  CScope* query_scope1 = new CScope(CTestObjMgr::Instance().GetObjMgr());
440  query_scope1->AddDefaults();
441  m_vQuery.push_back(SSeqLoc(query_loc1, query_scope1));
442  CRef<CSeq_loc> query_loc2(new CSeq_loc());
443  query_loc2->SetWhole().SetGi(kQueryGi2);
444  CScope* query_scope2 = new CScope(CTestObjMgr::Instance().GetObjMgr());
445  query_scope2->AddDefaults();
446  m_vQuery.push_back(SSeqLoc(query_loc2, query_scope2));
447  BlastSeqSrc* seq_src = SeqDbBlastSeqSrcInit(kDbName, true, 0, 0);
448 
449  CRef<CBlastOptionsHandle> opts_handle(
450  CBlastOptionsFactory::Create(kProgram));
451  CRef<CBlastOptions> options(&opts_handle->SetOptions());
452  options->SetFilterString("L"); /* NCBI_FAKE_WARNING */
453 
454  CRef<IQueryFactory> query_factory(new CObjMgr_QueryFactory(m_vQuery));
455  CBlastPrelimSearch prelim_search(query_factory, options, seq_src);
456  CRef<SInternalData> id(prelim_search.Run());
457 
458  BlastSeqSrcFree(seq_src);
459 
460  CBlastHSPResults results
461  (prelim_search.ComputeBlastHSPResults
462  (id->m_HspStream->GetPointer()));
463  BOOST_REQUIRE_EQUAL(2, results->num_queries);
464  BOOST_REQUIRE_EQUAL(kNumHits, results->hitlist_array[0]->hsplist_count);
465 
466  BOOST_REQUIRE_CLOSE(results->hitlist_array[0]->hsplist_array[0]->best_evalue,
467  results->hitlist_array[0]->hsplist_array[0]->hsp_array[0]->evalue,
468  results->hitlist_array[0]->hsplist_array[0]->hsp_array[0]->evalue/2);
469 
470  for (int index = 0; index < kNumHitsToCheck;
471  ++index) {
472  const int kHitIndex = kIndices[index];
473  BlastHSPList* hsp_list =
474  results->hitlist_array[0]->hsplist_array[kHitIndex];
475  BOOST_REQUIRE_EQUAL(kOids[index], hsp_list->oid);
476  BlastHSP* hsp = hsp_list->hsp_array[0];
477 // cerr << hsp_list->oid << " " << hsp->score << " " << hsp->num_ident << " " << hsp->query.offset << " " << hsp->query.end << " " << hsp->subject.offset << " " << hsp->subject.end << "\n";
478  BOOST_REQUIRE_EQUAL(kScores[index], hsp->score);
479  BOOST_REQUIRE_EQUAL(0, hsp->num_ident);
480  BOOST_REQUIRE_EQUAL(kQueryLengths[index],
481  hsp->query.end - hsp->query.offset);
482  BOOST_REQUIRE_EQUAL(kSubjectLengths[index],
483  hsp->subject.end - hsp->subject.offset);
484  }
485 }
486 
487 BOOST_AUTO_TEST_CASE(testGappedOffsets)
488 {
489  const unsigned char query[] = {'\016', '\007', '\014', '\024', '\004', '\015', '\011',
490  '\022', '\012', '\016', '\001', '\010', '\007', '\005', '\014', '\023',
491  '\021', '\003', '\016', '\005', '\013', '\006', '\020', '\011', '\006',
492  '\015', '\016', '\004', '\017'};
493 
494  const unsigned char subject[] = {'\000', '\000', '\000',
495  '\004', '\015', '\011', '\022', '\012', '\016', '\001', '\010', '\007',
496  '\005', '\014', '\023', '\021', '\003', '\016', '\005', '\013', '\006',
497  '\020', '\011', '\006', '\015', '\016', '\004', '\017'};
498 
499  Boolean retval = FALSE;
501  // generate score options
502  BlastScoringOptions *score_options;
503  BlastScoringOptionsNew(eBlastTypeBlastp, &score_options);
504  BLAST_FillScoringOptions(score_options,
506  FALSE,
507  0,
508  0,
509  NULL,
512  Blast_ScoreBlkMatrixInit(eBlastTypeBlastp, score_options, sbp, NULL);
513  BlastScoringOptionsFree(score_options);
514 
515  Int4 q_offset, s_offset;
516 
517  // This one has a mismatch at the beginning of the HSPs.
518  BlastHSP* hsp = Blast_HSPNew();
519  hsp->query.offset = 0;
520  hsp->query.end = 27;
521  hsp->subject.offset = 0;
522  hsp->subject.end = 26;
523  hsp->score = 45;
524 
525  retval = BlastGetOffsetsForGappedAlignment(query, subject, sbp, hsp, &q_offset, &s_offset);
526 
527  BOOST_REQUIRE_EQUAL(q_offset, 22);
528  BOOST_REQUIRE_EQUAL(s_offset, 21);
529  BOOST_REQUIRE_EQUAL(retval, true);
530 
531  // This one should be OK
532  hsp->query.offset = 4;
533  hsp->query.end = 27;
534  hsp->subject.offset = 3;
535  hsp->subject.end = 26;
536  hsp->score = 45;
537 
538  retval = BlastGetOffsetsForGappedAlignment(query, subject, sbp, hsp, &q_offset, &s_offset);
539 
540  BOOST_REQUIRE_EQUAL(q_offset, 18);
541  BOOST_REQUIRE_EQUAL(s_offset, 17);
542  BOOST_REQUIRE_EQUAL(retval, true);
543 
544  // This should have no solution.
545  hsp->query.offset = 5;
546  hsp->query.end = 20;
547  hsp->subject.offset = 11;
548  hsp->subject.end = 26;
549  hsp->score = 45;
550 
551  retval = BlastGetOffsetsForGappedAlignment(query, subject, sbp, hsp, &q_offset, &s_offset);
552  BOOST_REQUIRE_EQUAL(retval, false);
553 }
554 
556 
User-defined methods of the data storage class.
static const Int8 kEffectiveSearchSpace
Declarations of static arrays used to define some NCBI encodings to be used in a toolkit independent ...
Function calls to actually perform a BLAST search (high level).
Structures and functions prototypes used for BLAST gapped extension.
Boolean BlastGetOffsetsForGappedAlignment(const Uint1 *query, const Uint1 *subject, const BlastScoreBlk *sbp, BlastHSP *hsp, Int4 *q_retval, Int4 *s_retval)
Function to look for the highest scoring window (of size HSP_MAX_WINDOW) in an HSP and return the mid...
void Blast_HSPListSortByEvalue(BlastHSPList *hsp_list)
Sort the HSPs in an HSP list by e-value, with scores and other criteria used to resolve ties.
Definition: blast_hits.c:1437
BlastHSP * Blast_HSPNew(void)
Allocate and zeros out memory for an HSP structure.
Definition: blast_hits.c:141
Boolean Blast_HSPListIsSortedByScore(const BlastHSPList *hsp_list)
Check if HSP list is sorted by score.
Definition: blast_hits.c:1358
Declaration of ADT to save and retrieve lists of HSPs in the BLAST engine.
Declares the CBlastNucleotideOptionsHandle class.
Definitions which are dependant on the NCBI C++ Object Manager.
The structures and functions in blast_options.
#define BLAST_GAP_OPEN_PROT
Protein gap costs are the defaults for the BLOSUM62 scoring matrix.
Definition: blast_options.h:84
Int2 BLAST_FillScoringOptions(BlastScoringOptions *options, EBlastProgramType program, Boolean greedy_extension, Int4 penalty, Int4 reward, const char *matrix, Int4 gap_open, Int4 gap_extend)
Fill non-default values in the BlastScoringOptions structure.
#define BLAST_GAP_EXTN_PROT
cost to extend a gap.
Definition: blast_options.h:92
Int2 BlastScoringOptionsNew(EBlastProgramType program, BlastScoringOptions **options)
Allocate memory for BlastScoringOptions and fill with default values.
BlastScoringOptions * BlastScoringOptionsFree(BlastScoringOptions *options)
Deallocate memory for BlastScoringOptions.
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm.
Definition: blast_program.h:72
@ eBlastTypeBlastn
Definition: blast_program.h:74
@ eBlastTypeTblastn
Definition: blast_program.h:77
@ eBlastTypeBlastp
Definition: blast_program.h:73
BlastSeqSrc * BlastSeqSrcFree(BlastSeqSrc *seq_src)
Frees the BlastSeqSrc structure by invoking the destructor function set by the user-defined construct...
Definition: blast_seqsrc.c:112
Utilities initialize/setup BLAST.
Int2 Blast_ScoreBlkMatrixInit(EBlastProgramType program_number, const BlastScoringOptions *scoring_options, BlastScoreBlk *sbp, GET_MATRIX_PATH get_path)
Initializes the substitution matrix in the BlastScoreBlk according to the scoring options specified.
Definition: blast_setup.c:330
BlastScoreBlk * BlastScoreBlkNew(Uint1 alphabet, Int4 number_of_contexts)
Allocates and initializes BlastScoreBlk.
Definition: blast_stat.c:884
Functions to do gapped alignment with traceback.
EProgram
This enumeration is to evolve into a task/program specific list that specifies sets of default parame...
Definition: blast_types.hpp:56
@ eBlastp
Protein-Protein.
Definition: blast_types.hpp:59
@ eTblastn
Protein-Translated nucl.
Definition: blast_types.hpp:61
void testLongMatchDiagnostics(BlastDiagnostics *diagnostics)
void testShortMatchDiagnostics(BlastDiagnostics *diagnostics)
BOOST_AUTO_TEST_CASE(testTBLASTNLongMatchBlastEngine)
BOOST_AUTO_TEST_SUITE_END() static int s_GetSegmentFlags(const CBioseq &bioseq)
Wrapper class for BlastHSPResults .
Definition: blast_aux.hpp:343
Handle to the nucleotide-nucleotide options to the BLAST algorithm.
Search class to perform the preliminary stage of the BLAST search.
Handle to the nucleotide-nucleotide options to the discontiguous BLAST algorithm.
Class to perform a BLAST search on local BLAST databases Note that PHI-BLAST can be run using this cl...
Definition: local_blast.hpp:62
NCBI C++ Object Manager dependant implementation of IQueryFactory.
CScope –.
Definition: scope.hpp:92
Blast Search Subject.
Search Results for All Queries.
const CSeq_id & GetSeq_id(TDim row) const
Get seq-id (the first one if segments have different ids).
Definition: Seq_align.cpp:317
bool GetNamedScore(const string &id, int &score) const
Get score.
Definition: Seq_align.cpp:563
static CTestObjMgr & Instance()
Definition: test_objmgr.cpp:69
Declares the CDiscNucleotideOptionsHandle class.
void SetEvalueThreshold(double eval)
Sets EvalueThreshold.
void SetMatchReward(int r)
Sets MatchReward.
CRef< SInternalData > Run()
Borrow the internal data and results results.
BlastSeqSrc * SeqDbBlastSeqSrcInit(const string &dbname, bool is_prot, Uint4 first_seq=0, Uint4 last_seq=0, Int4 mask_algo_id=-1, ESubjectMaskingType mask_type=eNoSubjMasking)
Initialize the sequence source structure.
void SetEffectiveSearchSpace(Int8 eff)
Sets EffectiveSearchSpace.
CRef< CSearchResultSet > Run()
Executes the search.
static CBlastOptionsHandle * Create(EProgram program, EAPILocality locality=CBlastOptions::eLocal)
Creates an options handle object configured with default options for the requested program,...
CBlastOptions & SetOptions()
Returns a reference to the internal options class which this object is a handle for.
void SetTraditionalBlastnDefaults()
Sets TraditionalBlastnDefaults.
void SetMismatchPenalty(int p)
Sets MismatchPenalty.
BlastSeqSrc * MultiSeqBlastSeqSrcInit(TSeqLocVector &seq_vector, EBlastProgramType program, bool dbscan_mode=false)
Initialize the sequence source structure.
#define BLASTAA_SEQ_CODE
== Seq_code_ncbistdaa
void SetWindowSize(int ws)
Sets WindowSize.
void SetRepeatFiltering(bool val)
Enable repeat filtering.
void SetFilterString(const char *f, bool clear=true)
void Blast_FindRepeatFilterLoc(TSeqLocVector &query_loc, const CBlastOptionsHandle *opts_handle)
Finds repeats locations for a given set of sequences.
void SetGapExtensionCost(int e)
Sets GapExtensionCost.
void SetMaskAtHash(bool m=true)
Sets MaskAtHash.
void SetFilterString(const char *f, bool clear=true)
Sets FilterString.
void SetWordSize(int ws)
Sets WordSize.
CRef< TBlastDiagnostics > m_Diagnostics
Diagnostic output from preliminary and traceback stages.
void SetPercentIdentity(double p)
Sets PercentIdentity.
void SetGapOpeningCost(int g)
Sets GapOpeningCost.
CRef< TBlastHSPStream > m_HspStream
HSP output of the preliminary stage goes here.
BlastHSPResults * ComputeBlastHSPResults(BlastHSPStream *stream, Uint4 max_num_hsps=0, bool *rm_hsps=NULL, vector< bool > *rm_hsps_info=NULL) const
Return HSPs in a structure other than the HSPStream? Provide conversion? How to combine this with CBl...
size_type GetNumResults() const
Return the number of results contained by this object.
@ eBlastDbIsNucleotide
nucleotide
#define GI_CONST(gi)
Definition: ncbimisc.hpp:1087
#define NULL
Definition: ncbistd.hpp:225
static bool PreferAccessionOverGi(void)
Check if the option to prefer accession.version over GI is enabled (SeqId/PreferAccessionOverGi or SE...
Definition: Seq_id.cpp:3404
void SetWhole(TWhole &v)
Definition: Seq_loc.hpp:982
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
Definition: scope.cpp:504
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
const Tdata & Get(void) const
Get the member data.
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_id_.hpp:746
TGi GetGi(void) const
Get the variant data.
Definition: Seq_id_.hpp:889
TVersion GetVersion(void) const
Get the Version member data.
const TDdbj & GetDdbj(void) const
Get the variant data.
Definition: Seq_id_.cpp:391
const TAccession & GetAccession(void) const
Get the Accession member data.
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
@ e_Gi
GenInfo Integrated Database.
Definition: Seq_id_.hpp:106
Main class to perform a BLAST search on the local machine.
Magic spell ;-) needed for some weird compilers... very empiric.
#define fabs(v)
Definition: ncbi_dispd.c:46
Uint1 Boolean
bool replacment for C
Definition: ncbi_std.h:94
#define FALSE
bool replacment for C indicating false.
Definition: ncbi_std.h:101
Defines: CTimeFormat - storage class for time format.
The Object manager core.
NOTE: This file contains work in progress and the APIs are likely to change, please do not rely on th...
Utilities to develop and debug unit tests for BLAST.
NOTE: This file contains work in progress and the APIs are likely to change, please do not rely on th...
C++ implementation of repeats filtering for C++ BLAST.
Implementation of the BlastSeqSrc interface for a vector of sequence locations.
Implementation of the BlastSeqSrc interface using the C++ BLAST databases API.
vector< SSeqLoc > TSeqLocVector
Vector of sequence locations.
Definition: sseqloc.hpp:129
Return statistics from the BLAST search.
BlastUngappedStats * ungapped_stat
Ungapped extension counts.
BlastGappedStats * gapped_stat
Gapped extension counts.
void setupQueryAndSubject(TGi query_gi, TGi subject_gi)
Structure containing hit counts from the gapped stage of a BLAST search.
Int4 good_extensions
Number of HSPs below the e-value threshold after gapped extension.
Int4 extensions
Total number of gapped extensions performed.
The structure to hold all HSPs for a given sequence after the gapped alignment.
Definition: blast_hits.h:153
Int4 oid
The ordinal id of the subject sequence this HSP list is for.
Definition: blast_hits.h:154
Int4 hspcnt
Number of HSPs saved.
Definition: blast_hits.h:158
BlastHSP ** hsp_array
Array of pointers to individual HSPs.
Definition: blast_hits.h:157
double best_evalue
Smallest e-value for HSPs in this list.
Definition: blast_hits.h:162
BlastHitList ** hitlist_array
Array of results for individual query sequences.
Definition: blast_hits.h:185
Int4 num_queries
Number of query sequences.
Definition: blast_hits.h:184
Structure holding all information about an HSP.
Definition: blast_hits.h:126
double evalue
This HSP's e-value.
Definition: blast_hits.h:130
Int4 num_ident
Number of identical base pairs in this HSP.
Definition: blast_hits.h:128
BlastSeg query
Query sequence info.
Definition: blast_hits.h:131
BlastSeg subject
Subject sequence info.
Definition: blast_hits.h:132
Int4 score
This HSP's raw score.
Definition: blast_hits.h:127
BlastHSPList ** hsplist_array
Array of HSP lists for individual database hits.
Definition: blast_hits.h:176
Int4 hsplist_count
Filled size of the HSP lists array.
Definition: blast_hits.h:170
Structure used for scoring calculations.
Definition: blast_stat.h:177
Scoring options block Used to produce the BlastScoreBlk structure This structure may be needed for lo...
Int4 end
End of hsp.
Definition: blast_hits.h:99
Int4 offset
Start of hsp.
Definition: blast_hits.h:98
Complete type definition of Blast Sequence Source ADT.
Definition: blast_seqsrc.c:43
Structure containing hit counts from the ungapped stage of a BLAST search.
Int8 lookup_hits
Number of successful lookup table hits.
Int4 init_extends
Number of initial words found and extended.
Int4 good_init_extends
Number of successful initial extensions, i.e.
Structure to represent a single sequence to be fed to BLAST.
Definition: sseqloc.hpp:47
static string subject
static string query
Utility stuff for more convenient using of Boost.Test library.
Modified on Sun Apr 21 03:43:28 2024 by modify_doxy.py rev. 669887