NCBI C++ ToolKit
psiblast_unit_test.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: psiblast_unit_test.cpp 83266 2018-08-07 18:29:39Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Christiam Camacho
27  *
28  */
29 
30 /** @file psiblast-cppunit.cpp
31  * Unit test module for the PSI-BLAST class
32  */
33 #define NCBI_TEST_APPLICATION
34 #include <ncbi_pch.hpp>
35 #include <corelib/test_boost.hpp>
36 #include <serial/iterator.hpp>
37 
38 // BLAST API includes
43 
45 
47 
48 // Pssm engine includes
51 
52 // Object includes
58 #include "psiblast_aux_priv.hpp" // for PsiBlastComputePssmScores
59 
60 // Auxiliary test includes
61 #include "blast_test_util.hpp"
62 //#include "psiblast_test_util.hpp"
63 
64 // SeqAlign comparison includes
65 #include "seqalign_cmp.hpp"
66 #include "seqalign_set_convert.hpp"
67 
70 #include "blast_objmgr_priv.hpp"
71 
72 /// Calculate the size of a static array
73 #define STATIC_ARRAY_SIZE(array) (sizeof(array)/sizeof(*array))
74 
75 using namespace ncbi;
76 using namespace ncbi::objects;
77 using namespace ncbi::blast;
78 
80 
84 
85  /// Contains a single Bioseq
87 
88  /// Contains a Bioseq-set with two Bioseqs, gi 7450545 and gi 129295
90 
92  const string kSeqEntryFile("data/7450545.seqentry.asn");
93  m_SeqEntry = TestUtil::ReadObject<CSeq_entry>(kSeqEntryFile);
94 
95  m_SeqSet.Reset(new CSeq_entry);
96  m_SeqSet->SetSet().SetSeq_set().push_back(m_SeqEntry);
97  BOOST_REQUIRE(m_Pssm &&
98  m_Pssm->CanGetPssm() &&
99  m_Pssm->GetPssm().CanGetQuery());
100  CRef<CSeq_entry> second_bioseq(&m_Pssm->SetPssm().SetQuery());
101  m_SeqSet->SetSet().SetSeq_set().push_back(second_bioseq);
102  }
103 
105  const string kPssmFile("data/pssm_freq_ratios.asn");
106  m_Pssm = TestUtil::ReadObject<CPssmWithParameters>(kPssmFile);
107  BOOST_REQUIRE(m_Pssm->GetPssm().CanGetQuery());
108  BOOST_REQUIRE(m_Pssm->GetPssm().CanGetIntermediateData());
109  BOOST_REQUIRE(!m_Pssm->GetPssm().CanGetFinalData());
110  }
111 
113  m_OptHandle.Reset(new CPSIBlastOptionsHandle);
114  BOOST_REQUIRE_EQUAL(eCompositionMatrixAdjust,
115  m_OptHandle->GetCompositionBasedStats());
116  m_SearchDb = new CSearchDatabase("data/seqp",
118 
119  x_ReadPssmFromFile();
120  PsiBlastComputePssmScores(m_Pssm, m_OptHandle->GetOptions());
121  BOOST_REQUIRE(m_Pssm->GetPssm().GetFinalData().CanGetScores());
122 
123  x_ReadSeqEntriesFromFile();
124  }
125 
127  m_OptHandle.Reset();
128  m_Pssm.Reset();
129  delete m_SearchDb;
130  m_SeqEntry.Reset();
131  m_SeqSet.Reset();
132  }
133 
135  {
136  int num_gis = 0;
137  TGi last_gi = INVALID_GI;
138  CSeq_id last_id;
139  ITERATE(CSeq_align_set::Tdata, itr, sas->Get()){
140  const CSeq_id& seqid = (*itr)->GetSeq_id(1);
142  TGi new_gi = seqid.GetGi();
143  if (new_gi != last_gi)
144  {
145  num_gis++;
146  last_gi = new_gi;
147  }
148  }
149  else if ( !seqid.Equals(last_id) ) {
150  num_gis++;
151  last_id.Assign(seqid);
152  }
153  }
154  return num_gis;
155  }
156 
158  TSeqLocVector subjects;
160  CConstRef<CSeq_id> sid = (scope->AddBioseq(*bioseq)).GetSeqId();
161  CRef<CSeq_loc> sl(new CSeq_loc());
162  sl->SetWhole();
163  sl->SetId(*sid);
164  SSeqLoc ssl(*sl, *scope);
165  subjects.push_back(ssl);
166  return (IQueryFactory *)(new CObjMgr_QueryFactory(subjects));
167  }
168 
170  TSeqLocVector subjects;
173  for (; itr; ++itr) {
174  CConstRef<CSeq_id> sid = (scope->AddBioseq(*itr)).GetSeqId();
175  CRef<CSeq_loc> sl(new CSeq_loc());
176  sl->SetWhole();
177  sl->SetId(*sid);
178  SSeqLoc ssl(*sl, *scope);
179  subjects.push_back(ssl);
180  }
181  return (IQueryFactory *)(new CObjMgr_QueryFactory(subjects));
182  }
183 
188  CConstRef<CBlastAncillaryData> ancillary_data)
189  {
190  CRef<CSeqDB> db(new CSeqDB("data/seqp", CSeqDB::eProtein));
191  CBlastScopeSource blast_om(db);
192  CRef<CScope> scope(blast_om.NewScope());
194  diags->frequency_ratios = true;
196  *opts_handle,
197  ancillary_data,
198  diags);
199  }
200 
201 };
202 
203 BOOST_FIXTURE_TEST_SUITE(psiblast, CPsiBlastTestFixture)
204 
205 BOOST_AUTO_TEST_CASE(TestSingleIteration_ProteinAsQuery_NoCBS) {
206  m_OptHandle->SetCompositionBasedStats(eNoCompositionBasedStats);
207  m_OptHandle->SetEvalueThreshold(1.5);
208  CConstRef<CBioseq> bioseq(&m_SeqEntry->GetSeq());
209  CRef<IQueryFactory> query_factory(s_SetupSubject(bioseq));
210  CRef<CLocalDbAdapter> dbadapter(new CLocalDbAdapter(*m_SearchDb));
211  CPsiBlast psiblast(query_factory, dbadapter, m_OptHandle);
212 
213  CSearchResultSet results(*psiblast.Run());
214  BOOST_REQUIRE(results[0].GetErrors().empty());
215 
216  const int kNumExpectedMatchingSeqs = 3;
217  CConstRef<CSeq_align_set> sas = results[0].GetSeqAlign();
218 
219  BOOST_REQUIRE_EQUAL(kNumExpectedMatchingSeqs, s_CountNumberUniqueGIs(sas));
220 
221  const size_t kNumExpectedHSPs = 3;
222  qa::TSeqAlignSet expected_results(kNumExpectedHSPs);
223 
224  // HSP # 1
225  expected_results[0].score = 64;
226  expected_results[0].evalue = 2.46806e-1;
227  expected_results[0].bit_score = 292610051e-7;
228  expected_results[0].num_ident = 18;
229  expected_results[0].starts.push_back(34);
230  expected_results[0].starts.push_back(95);
231  expected_results[0].lengths.push_back(53);
232 
233  // HSP # 2
234  {
235  int starts[] = { 203, 280, 220, -1, 226, 297, 258, -1, 265, 329 };
236  int lengths[] = { 17, 6, 32, 7, 39 };
237  expected_results[1].score = 63;
238  expected_results[1].evalue = 3.48342e-1;
239  expected_results[1].bit_score = 288758055e-7;
240  expected_results[1].num_ident = 24;
241  copy(&starts[0], &starts[STATIC_ARRAY_SIZE(starts)],
242  back_inserter(expected_results[1].starts));
243  copy(&lengths[0], &lengths[STATIC_ARRAY_SIZE(lengths)],
244  back_inserter(expected_results[1].lengths));
245  }
246 
247  // HSP # 3
248  {
249  int starts[] = { 180, 97, 204, -1, 205, 121, -1, 127, 211, 128,
250  241, -1, 242, 158, -1, 197, 281, 201, 306, -1, 318, 226, 323,
251  -1, 327, 231 };
252  int lengths[] = { 24, 1, 6, 1, 30, 1, 39, 4, 25, 12, 5, 4, 35 };
253  expected_results[2].score = 60;
254  expected_results[2].evalue = 7.36231e-1;
255  expected_results[2].bit_score = 277202068e-7;
256  expected_results[2].num_ident = 42;
257  copy(&starts[0], &starts[STATIC_ARRAY_SIZE(starts)],
258  back_inserter(expected_results[2].starts));
259  copy(&lengths[0], &lengths[STATIC_ARRAY_SIZE(lengths)],
260  back_inserter(expected_results[2].lengths));
261  }
262 
263  /* HSP # 4
264  {
265  int starts[] = { 43, 13, -1, 84, 114, 91, 135, -1, 136, 112,
266  149, -1, 151, 125, -1, 146, 172, 147, 177, -1, 179, 152,
267  195, -1, 200, 168, 226, -1, 228, 194 };
268  int lengths[] = { 71, 7, 21, 1, 13, 2, 21, 1, 5, 2, 16, 5, 26, 2,
269  11 };
270  expected_results[3].score = 57;
271  expected_results[3].evalue = 147345337e-8;
272  expected_results[3].bit_score = 265646081e-7;
273  expected_results[3].num_ident = 48;
274  copy(&starts[0], &starts[STATIC_ARRAY_SIZE(starts)],
275  back_inserter(expected_results[3].starts));
276  copy(&lengths[0], &lengths[STATIC_ARRAY_SIZE(lengths)],
277  back_inserter(expected_results[3].lengths));
278  }
279  */
280 
281  qa::TSeqAlignSet actual_results;
282  qa::SeqAlignSetConvert(*sas, actual_results);
283 
284  qa::CSeqAlignCmpOpts opts;
285  qa::CSeqAlignCmp cmp(expected_results, actual_results, opts);
286  string errors;
287  bool identical_results = cmp.Run(&errors);
288 
289  BOOST_REQUIRE_MESSAGE(identical_results, errors);
290 }
291 
292 BOOST_AUTO_TEST_CASE(TestSingleIteration_ProteinAsQuery_CBS) {
293  m_OptHandle->SetCompositionBasedStats(eCompositionBasedStats);
294  CConstRef<CBioseq> bioseq(&m_SeqEntry->GetSeq());
295  CRef<IQueryFactory> query_factory(s_SetupSubject(bioseq));
296  CRef<CLocalDbAdapter> dbadapter(new CLocalDbAdapter(*m_SearchDb));
297  CPsiBlast psiblast(query_factory, dbadapter, m_OptHandle);
298 
299  CSearchResultSet results(*psiblast.Run());
300  BOOST_REQUIRE(results[0].GetErrors().empty());
301 
302  const int kNumExpectedMatchingSeqs = 4;
303  CConstRef<CSeq_align_set> sas = results[0].GetSeqAlign();
304 
305  BOOST_REQUIRE_EQUAL(kNumExpectedMatchingSeqs, s_CountNumberUniqueGIs(sas));
306 
307  const size_t kNumExpectedHSPs = 4;
308  qa::TSeqAlignSet expected_results(kNumExpectedHSPs);
309 
310  // HSP # 1
311  expected_results[0].score = 63;
312  expected_results[0].evalue = 2.96035e-1;
313  expected_results[0].bit_score = 288758056e-7;
314  expected_results[0].starts.push_back(34);
315  expected_results[0].starts.push_back(95);
316  expected_results[0].lengths.push_back(53);
317  expected_results[0].num_ident = 18;
318 
319  // HSP # 2
320  expected_results[1].score = 59;
321  expected_results[1].evalue = 9.29330e-1;
322  expected_results[1].bit_score = 273350073e-7;
323  expected_results[1].starts.push_back(203);
324  expected_results[1].starts.push_back(280);
325  expected_results[1].starts.push_back(220);
326  expected_results[1].starts.push_back(-1);
327  expected_results[1].starts.push_back(226);
328  expected_results[1].starts.push_back(297);
329  expected_results[1].starts.push_back(258);
330  expected_results[1].starts.push_back(-1);
331  expected_results[1].starts.push_back(265);
332  expected_results[1].starts.push_back(329);
333  expected_results[1].lengths.push_back(17);
334  expected_results[1].lengths.push_back(6);
335  expected_results[1].lengths.push_back(32);
336  expected_results[1].lengths.push_back(7);
337  expected_results[1].lengths.push_back(39);
338  expected_results[1].num_ident = 24;
339 
340  // HSP # 3
341  expected_results[2].score = 52;
342  expected_results[2].evalue = 6.67208;
343  expected_results[2].bit_score = 246386102e-7;
344  expected_results[2].starts.push_back(322);
345  expected_results[2].starts.push_back(46);
346  expected_results[2].lengths.push_back(28);
347  expected_results[2].num_ident = 10;
348 
349 
350  // HSP # 4
351  expected_results[3].score = 50;
352  expected_results[3].evalue = 6.81719;
353  expected_results[3].bit_score = 23.8682;
354  expected_results[3].starts.push_back(295);
355  expected_results[3].starts.push_back(23);
356  expected_results[3].starts.push_back(301);
357  expected_results[3].starts.push_back(-1);
358  expected_results[3].starts.push_back(304);
359  expected_results[3].starts.push_back(29);
360  expected_results[3].lengths.push_back(6);
361  expected_results[3].lengths.push_back(3);
362  expected_results[3].lengths.push_back(23);
363  expected_results[3].num_ident = 16;
364 
365  qa::TSeqAlignSet actual_results;
366  qa::SeqAlignSetConvert(*sas, actual_results);
367 
368  qa::CSeqAlignCmpOpts opts;
369  qa::CSeqAlignCmp cmp(expected_results, actual_results, opts);
370  string errors;
371  bool identical_results = cmp.Run(&errors);
372 
373  BOOST_REQUIRE_MESSAGE(identical_results, errors);
374 }
375 
376 BOOST_AUTO_TEST_CASE(TestSingleIteration_ProteinAsQuery_CBSConditional) {
377  m_OptHandle->SetCompositionBasedStats(eCompositionMatrixAdjust);
378  m_OptHandle->SetEvalueThreshold(5.0);
379  CConstRef<CBioseq> bioseq(&m_SeqEntry->GetSeq());
380  CRef<IQueryFactory> query_factory(s_SetupSubject(bioseq));
381  CRef<CLocalDbAdapter> dbadapter(new CLocalDbAdapter(*m_SearchDb));
382  CPsiBlast psiblast(query_factory, dbadapter, m_OptHandle);
383 
384  CSearchResultSet results(*psiblast.Run());
385  BOOST_REQUIRE(results[0].GetErrors().empty());
386 
387  const int kNumExpectedMatchingSeqs = 3;
388  CConstRef<CSeq_align_set> sas = results[0].GetSeqAlign();
389 
390  BOOST_REQUIRE_EQUAL(kNumExpectedMatchingSeqs, s_CountNumberUniqueGIs(sas));
391 
392  const size_t kNumExpectedHSPs = 3;
393  qa::TSeqAlignSet expected_results(kNumExpectedHSPs);
394 
395  // HSP # 1
396  expected_results[0].score = 59;
397  expected_results[0].evalue = 8.66100e-1;
398  expected_results[0].bit_score = 273350073e-7;
399  expected_results[0].starts.push_back(34);
400  expected_results[0].starts.push_back(95);
401  expected_results[0].lengths.push_back(53);
402  expected_results[0].sequence_gis.SetQuery(7450545);
403  expected_results[0].sequence_gis.SetSubject(22982149);
404  expected_results[0].num_ident = 18;
405 
406  // HSP # 3
407  {
408  int starts[] = { 322 , 46 , -1 , 75 , 351 , 81 , -1 , 94 , 364 ,
409  97 , -1 , 106 , 373 , 109 };
410  int lengths[] = { 29 , 6 , 13 , 3 , 9 , 3 , 17 };
411  expected_results[1].score = 53;
412  expected_results[1].evalue = 4.15768;
413  expected_results[1].bit_score = 250238098e-7;
414  expected_results[1].sequence_gis.SetQuery(7450545);
415  expected_results[1].sequence_gis.SetSubject(43121985);
416  expected_results[1].num_ident = 19;
417  copy(&starts[0], &starts[STATIC_ARRAY_SIZE(starts)],
418  back_inserter(expected_results[1].starts));
419  copy(&lengths[0], &lengths[STATIC_ARRAY_SIZE(lengths)],
420  back_inserter(expected_results[1].lengths));
421  }
422 
423  // HSP # 2
424  {
425  int starts[] = { 125 , 199 , 146 , -1 , 148 , 220 , -1 , 228 ,
426  156 , 233 , -1 , 250 , 173 , 252 , 179 , -1 , 181 , 258 ,
427  220 , -1 , 226 , 297 , 258 , -1 , 265 , 329 };
428  int lengths[] = { 21 , 2 , 8 , 5 , 17 , 2 , 6 , 2 , 39 , 6 , 32 ,
429  7 , 39 };
430  expected_results[2].score = 54;
431  expected_results[2].evalue = 4.40967;
432  expected_results[2].bit_score = 254090094e-7;
433  expected_results[2].sequence_gis.SetQuery(7450545);
434  expected_results[2].sequence_gis.SetSubject(13242404);
435  expected_results[2].num_ident = 39;
436  copy(&starts[0], &starts[STATIC_ARRAY_SIZE(starts)],
437  back_inserter(expected_results[2].starts));
438  copy(&lengths[0], &lengths[STATIC_ARRAY_SIZE(lengths)],
439  back_inserter(expected_results[2].lengths));
440  }
441 
442 
443  // HSP # 4
444 /*
445  expected_results[3].score = 53;
446  expected_results[3].evalue = 5.65033;
447  expected_results[3].bit_score = 25.0238;
448  expected_results[3].sequence_gis.SetQuery(7450545);
449  expected_results[3].sequence_gis.SetSubject(15836829);
450  expected_results[3].starts.push_back(39);
451  expected_results[3].starts.push_back(304);
452  expected_results[3].lengths.push_back(41);
453  expected_results[3].lengths.push_back(41);
454  expected_results[3].num_ident = 15;
455 */
456 
457 
458  qa::TSeqAlignSet actual_results;
459  qa::SeqAlignSetConvert(*sas, actual_results);
460 
461  qa::CSeqAlignCmpOpts opts;
462  qa::CSeqAlignCmp cmp(expected_results, actual_results, opts);
463  string errors;
464  bool identical_results = cmp.Run(&errors);
465 
466  BOOST_REQUIRE_MESSAGE(identical_results, errors);
467 }
468 
469 BOOST_AUTO_TEST_CASE(TestSingleIteration_ProteinAsQuery_CBSUniversal) {
470  m_OptHandle->SetCompositionBasedStats(eCompoForceFullMatrixAdjust);
471  CConstRef<CBioseq> bioseq(&m_SeqEntry->GetSeq());
472  CRef<IQueryFactory> query_factory(s_SetupSubject(bioseq));
473  CRef<CLocalDbAdapter> dbadapter(new CLocalDbAdapter(*m_SearchDb));
474  CPsiBlast psiblast(query_factory, dbadapter, m_OptHandle);
475 
476  CSearchResultSet results(*psiblast.Run());
477  BOOST_REQUIRE(results[0].GetErrors().empty());
478 
479  const int kNumExpectedMatchingSeqs = 6;
480  CConstRef<CSeq_align_set> sas = results[0].GetSeqAlign();
481 
482  BOOST_REQUIRE_EQUAL(kNumExpectedMatchingSeqs, s_CountNumberUniqueGIs(sas));
483 
484  const size_t kNumExpectedHSPs = 6;
485  qa::TSeqAlignSet expected_results(kNumExpectedHSPs);
486 
487  // HSP # 1
488  expected_results[0].score = 59;
489  expected_results[0].evalue = 8.66100e-1;
490  expected_results[0].bit_score = 273350073e-7;
491  expected_results[0].starts.push_back(34);
492  expected_results[0].starts.push_back(95);
493  expected_results[0].lengths.push_back(53);
494  expected_results[0].sequence_gis.SetQuery(7450545);
495  expected_results[0].sequence_gis.SetSubject(22982149);
496  expected_results[0].num_ident = 18;
497 
498  // HSP # 2
499  {
500  int starts[] = { 322 , 46 , -1 , 75 , 351 , 81 , -1 , 94 , 364 ,
501  97 , -1 , 106 , 373 , 109 };
502  int lengths[] = { 29 , 6 , 13 , 3 , 9 , 3 , 17 };
503  expected_results[1].score = 53;
504  expected_results[1].evalue = 4.15768;
505  expected_results[1].bit_score = 250238098e-7;
506  expected_results[1].sequence_gis.SetQuery(7450545);
507  expected_results[1].sequence_gis.SetSubject(43121985);
508  expected_results[1].num_ident = 19;
509  copy(&starts[0], &starts[STATIC_ARRAY_SIZE(starts)],
510  back_inserter(expected_results[1].starts));
511  copy(&lengths[0], &lengths[STATIC_ARRAY_SIZE(lengths)],
512  back_inserter(expected_results[1].lengths));
513  }
514 
515 
516  // HSP # 3
517  {
518  int starts[] = { 125 , 199 , 146 , -1 , 148 , 220 , -1 , 228 ,
519  156 , 233 , -1 , 250 , 173 , 252 , 179 , -1 , 181 , 258 ,
520  220 , -1 , 226 , 297 , 258 , -1 , 265 , 329 };
521  int lengths[] = { 21 , 2 , 8 , 5 , 17 , 2 , 6 , 2 , 39 , 6 , 32 ,
522  7 , 39 };
523  expected_results[2].score = 54;
524  expected_results[2].evalue = 4.40967;
525  expected_results[2].bit_score = 254090094e-7;
526  expected_results[2].sequence_gis.SetQuery(7450545);
527  expected_results[2].sequence_gis.SetSubject(13242404);
528  expected_results[2].num_ident = 39;
529  copy(&starts[0], &starts[STATIC_ARRAY_SIZE(starts)],
530  back_inserter(expected_results[2].starts));
531  copy(&lengths[0], &lengths[STATIC_ARRAY_SIZE(lengths)],
532  back_inserter(expected_results[2].lengths));
533  }
534 
535  /* HSP # 4
536  expected_results[3].score = 53;
537  expected_results[3].evalue = 466014414e-8;
538  expected_results[3].bit_score = 250238098e-7;
539  expected_results[3].sequence_gis.SetQuery(7450545);
540  expected_results[3].sequence_gis.SetSubject(45683609);
541  expected_results[3].starts.push_back(39);
542  expected_results[3].starts.push_back(304);
543  expected_results[3].lengths.push_back(41);
544  */
545 
546  qa::TSeqAlignSet actual_results;
547  qa::SeqAlignSetConvert(*sas, actual_results);
548 
549  qa::CSeqAlignCmpOpts opts;
550  qa::CSeqAlignCmp cmp(expected_results, actual_results, opts);
551  string errors;
552  bool identical_results = cmp.Run(&errors);
553 
554  BOOST_REQUIRE_MESSAGE(identical_results, errors);
555 }
556 
557 BOOST_AUTO_TEST_CASE(TestMultipleIterationsAndConvergence_ProteinAsQuery_NoCBS) {
558  const int kNumIterations = 4;
559  const int kNumExpectedIterations = 2;
560  CPsiBlastIterationState itr(kNumIterations);
561  m_OptHandle->SetCompositionBasedStats(eNoCompositionBasedStats);
562 
563  CConstRef<CBioseq> bioseq(&m_SeqEntry->GetSeq());
564  CRef<IQueryFactory> query_factory(s_SetupSubject(bioseq));
565  CRef<CLocalDbAdapter> dbadapter(new CLocalDbAdapter(*m_SearchDb));
566  CPsiBlast psiblast(query_factory, dbadapter, m_OptHandle);
567 
568  int hits_below_threshold[kNumIterations] = { 0, 0, 0, 0 };
569  size_t number_hits[kNumIterations] = { 11, 14, 0, 0 };
570 
571  int iteration_counter = 0;
572  while (itr) {
573  CSearchResultSet results = *psiblast.Run();
574  BOOST_REQUIRE(results[0].GetErrors().empty());
575  CConstRef<CSeq_align_set> alignment = results[0].GetSeqAlign();
576  BOOST_REQUIRE_EQUAL(number_hits[iteration_counter],
577  alignment->Get().size());
578 
580  CPsiBlastIterationState::GetSeqIds(alignment, m_OptHandle, ids);
581 
582  string m("On round ");
583  m += NStr::IntToString(itr.GetIterationNumber()) + " found ";
584  m += NStr::SizetToString(ids.size()) + " qualifying ids";
585  BOOST_REQUIRE_MESSAGE(
586  hits_below_threshold[iteration_counter]==(int)ids.size(), m);
587  itr.Advance(ids);
588 
589  if (itr) {
591  x_ComputePssmForNextIteration(*bioseq, alignment,
592  m_OptHandle, results[0].GetAncillaryData());
593  psiblast.SetPssm(pssm);
594  }
595  iteration_counter++;
596  }
597 
598  BOOST_REQUIRE_EQUAL(kNumExpectedIterations, iteration_counter);
599 }
600 
601 BOOST_AUTO_TEST_CASE(TestSingleIteration_PssmAsQuery_NoCBS) {
602 
603  m_OptHandle->SetCompositionBasedStats(eNoCompositionBasedStats);
604  CRef<CLocalDbAdapter> dbadapter(new CLocalDbAdapter(*m_SearchDb));
605  CPsiBlast psiblast(m_Pssm, dbadapter, m_OptHandle);
606 
607  CSearchResultSet results(*psiblast.Run());
608  BOOST_REQUIRE(results[0].GetErrors().empty());
609 
610  const int kNumExpectedMatchingSeqs = 6;
611  CConstRef<CSeq_align_set> sas = results[0].GetSeqAlign();
612  BOOST_REQUIRE_EQUAL(kNumExpectedMatchingSeqs, s_CountNumberUniqueGIs(sas));
613 
614  const size_t kNumExpectedHSPs = 7;
615  qa::TSeqAlignSet expected_results(kNumExpectedHSPs);
616 
617  // HSP # 1
618  {
619  int starts[] = { 0, 941, -1, 1093, 152, 1094 };
620  int lengths[] = { 152, 1, 80 };
621  expected_results[0].score = 595;
622  expected_results[0].evalue = 2.70189-71;
623  expected_results[0].bit_score = 233623298e-6;
624  expected_results[0].num_ident = 101;
625  copy(&starts[0], &starts[STATIC_ARRAY_SIZE(starts)],
626  back_inserter(expected_results[0].starts));
627  copy(&lengths[0], &lengths[STATIC_ARRAY_SIZE(lengths)],
628  back_inserter(expected_results[0].lengths));
629  }
630 
631  // HSP # 2
632  {
633  int starts[] = { 0, 154, -1, 308, 154, 309 };
634  int lengths[] = { 154, 1, 24 };
635  expected_results[1].score = 424;
636  expected_results[1].evalue = 6.54598e-49;
637  expected_results[1].bit_score = 167754171e-6;
638  expected_results[1].num_ident = 73;
639  copy(&starts[0], &starts[STATIC_ARRAY_SIZE(starts)],
640  back_inserter(expected_results[1].starts));
641  copy(&lengths[0], &lengths[STATIC_ARRAY_SIZE(lengths)],
642  back_inserter(expected_results[1].lengths));
643  }
644 
645  // HSP # 3
646  {
647  int starts[] =
648  { 0, 190, 65, -1, 67, 255, 91, -1, 92, 279, 111, -1, 113, 298,
649  -1, 304, 119, 305, 151, -1, 152, 337, 163, -1, 164, 348,
650  -1, 374, 190, 380, 200, -1, 202, 390 };
651  int lengths[] =
652  { 65, 2, 24, 1, 19, 2, 6, 1, 32, 1, 11, 1, 26, 6, 10, 2, 30 };
653  expected_results[2].score = 372;
654  expected_results[2].evalue = 1.67386e-43;
655  expected_results[2].bit_score = 147723793e-6;
656  expected_results[2].num_ident = 87;
657  copy(&starts[0], &starts[STATIC_ARRAY_SIZE(starts)],
658  back_inserter(expected_results[2].starts));
659  copy(&lengths[0], &lengths[STATIC_ARRAY_SIZE(lengths)],
660  back_inserter(expected_results[2].lengths));
661  }
662 
663  // HSP # 4
664  expected_results[3].score = 53;
665  expected_results[3].evalue = 2.43336;
666  expected_results[3].bit_score = 248451288e-7;
667  expected_results[3].num_ident = 8;
668  expected_results[3].starts.push_back(206);
669  expected_results[3].starts.push_back(46);
670  expected_results[3].lengths.push_back(19);
671 
672  // HSP # 5
673  {
674  int starts[] = { 177, 100, -1, 106, 183, 107, 205, -1, 215, 129 };
675  int lengths[] = { 6, 1, 22, 10, 14 };
676  expected_results[4].score = 52;
677  expected_results[4].evalue = 3.12771;
678  expected_results[4].bit_score = 244599292e-7;
679  expected_results[4].num_ident = 11;
680  copy(&starts[0], &starts[STATIC_ARRAY_SIZE(starts)],
681  back_inserter(expected_results[4].starts));
682  copy(&lengths[0], &lengths[STATIC_ARRAY_SIZE(lengths)],
683  back_inserter(expected_results[4].lengths));
684  }
685 
686  // HSP # 6
687  {
688  int starts[] = { 74, 181, 108, -1, 109, 215 };
689  int lengths[] = { 34, 1, 23 };
690  expected_results[5].score = 49;
691  expected_results[5].evalue = 8.37737;
692  expected_results[5].bit_score = 233043305e-7;
693  expected_results[5].num_ident = 14;
694  copy(&starts[0], &starts[STATIC_ARRAY_SIZE(starts)],
695  back_inserter(expected_results[5].starts));
696  copy(&lengths[0], &lengths[STATIC_ARRAY_SIZE(lengths)],
697  back_inserter(expected_results[5].lengths));
698  }
699 
700  // HSP # 7
701  expected_results[6].score = 49;
702  expected_results[6].evalue = 8.62465;
703  expected_results[6].bit_score = 233043305e-7;
704  expected_results[6].num_ident = 6;
705  expected_results[6].starts.push_back(188);
706  expected_results[6].starts.push_back(709);
707  expected_results[6].lengths.push_back(30);
708 
709  qa::TSeqAlignSet actual_results;
710  qa::SeqAlignSetConvert(*sas, actual_results);
711 
712  qa::CSeqAlignCmpOpts opts;
713  qa::CSeqAlignCmp cmp(expected_results, actual_results, opts);
714  string errors;
715  bool identical_results = cmp.Run(&errors);
716 
717  BOOST_REQUIRE_MESSAGE(identical_results, errors);
718 }
719 
720 BOOST_AUTO_TEST_CASE(TestSingleIteration_PssmAsQuery_CBS) {
721 
722  m_OptHandle->SetCompositionBasedStats(eCompositionBasedStats);
723  CRef<CLocalDbAdapter> dbadapter(new CLocalDbAdapter(*m_SearchDb));
724  CPsiBlast psiblast(m_Pssm, dbadapter, m_OptHandle);
725 
726  CSearchResultSet results(*psiblast.Run());
727  BOOST_REQUIRE(results[0].GetErrors().empty());
728 
729  const int kNumExpectedMatchingSeqs = 4;
730  CConstRef<CSeq_align_set> sas = results[0].GetSeqAlign();
731  BOOST_REQUIRE_EQUAL(kNumExpectedMatchingSeqs, s_CountNumberUniqueGIs(sas));
732 
733  const size_t kNumExpectedHSPs = 5;
734  qa::TSeqAlignSet expected_results(kNumExpectedHSPs);
735 
736  // HSP # 1
737  {
738  int starts[] = { 0, 941, -1, 1093, 152, 1094 };
739  int lengths[] = { 152, 1, 80 };
740  expected_results[0].score = 593;
741  expected_results[0].evalue = 1.15934e-71;
742  expected_results[0].bit_score = 232843196e-6;
743  expected_results[0].sequence_gis.SetQuery(129295);
744  expected_results[0].sequence_gis.SetSubject(34878800);
745  expected_results[0].num_ident = 101;
746  copy(&starts[0], &starts[STATIC_ARRAY_SIZE(starts)],
747  back_inserter(expected_results[0].starts));
748  copy(&lengths[0], &lengths[STATIC_ARRAY_SIZE(lengths)],
749  back_inserter(expected_results[0].lengths));
750  }
751 
752  // HSP # 2
753  {
754  int starts[] = { 0, 154, -1, 308, 154, 309 };
755  int lengths[] = { 154, 1, 24 };
756  expected_results[1].score = 417;
757  expected_results[1].evalue = 5.13954e-48;
758  expected_results[1].bit_score = 165048071e-6;
759  expected_results[1].sequence_gis.SetQuery(129295);
760  expected_results[1].sequence_gis.SetSubject(34878800);
761  expected_results[1].num_ident = 73;
762  copy(&starts[0], &starts[STATIC_ARRAY_SIZE(starts)],
763  back_inserter(expected_results[1].starts));
764  copy(&lengths[0], &lengths[STATIC_ARRAY_SIZE(lengths)],
765  back_inserter(expected_results[1].lengths));
766  }
767 
768  // HSP # 3
769  {
770  int starts[] =
771  { 0, 190, 65, -1, 67, 255, 93, -1, 94, 281, 111, -1, 113, 298,
772  -1, 304, 119, 305, 153, -1, 154, 339, 164, -1, 165, 349,
773  -1, 378, 194, 382 };
774  int lengths[] =
775  { 65, 2, 26, 1, 17, 2, 6, 1, 34, 1, 10, 1, 29, 4, 38 };
776  expected_results[2].score = 359;
777  expected_results[2].evalue = 1.18595e-41;
778  expected_results[2].bit_score = 142706496e-6;
779  expected_results[2].sequence_gis.SetQuery(129295);
780  expected_results[2].sequence_gis.SetSubject(20092202);
781  expected_results[2].num_ident = 85;
782  copy(&starts[0], &starts[STATIC_ARRAY_SIZE(starts)],
783  back_inserter(expected_results[2].starts));
784  copy(&lengths[0], &lengths[STATIC_ARRAY_SIZE(lengths)],
785  back_inserter(expected_results[2].lengths));
786  }
787 
788  // HSP # 4
789  expected_results[3].score = 53;
790  expected_results[3].evalue = 2.14427;
791  expected_results[3].bit_score = 248354256e-7;
792  expected_results[3].sequence_gis.SetQuery(129295);
793  expected_results[3].sequence_gis.SetSubject(44343511);
794  expected_results[3].starts.push_back(206);
795  expected_results[3].starts.push_back(46);
796  expected_results[3].lengths.push_back(19);
797  expected_results[3].num_ident = 8;
798 
799  // HSP # 5
800  expected_results[4].score = 51;
801  expected_results[4].evalue = 5.09267;
802  expected_results[4].bit_score = 240650265e-7;
803  expected_results[4].sequence_gis.SetQuery(129295);
804  expected_results[4].sequence_gis.SetSubject(23481125);
805  expected_results[4].starts.push_back(188);
806  expected_results[4].starts.push_back(709);
807  expected_results[4].lengths.push_back(30);
808  expected_results[4].num_ident = 6;
809 
810  /* HSP # 6
811  {
812  int starts[] = { 74, 181, 108, -1, 109, 215 };
813  int lengths[] = { 34, 1, 23 };
814  expected_results[5].score = 48;
815  expected_results[5].evalue = 878932376e-8;
816  expected_results[5].bit_score = 229094278e-7;
817  expected_results[5].sequence_gis.SetQuery(129295);
818  expected_results[5].sequence_gis.SetSubject(38088088);
819  copy(&starts[0], &starts[STATIC_ARRAY_SIZE(starts)],
820  back_inserter(expected_results[5].starts));
821  copy(&lengths[0], &lengths[STATIC_ARRAY_SIZE(lengths)],
822  back_inserter(expected_results[5].lengths));
823  }
824  */
825 
826  qa::TSeqAlignSet actual_results;
827  qa::SeqAlignSetConvert(*sas, actual_results);
828 
829  qa::CSeqAlignCmpOpts opts;
830  qa::CSeqAlignCmp cmp(expected_results, actual_results, opts);
831  string errors;
832  bool identical_results = cmp.Run(&errors);
833 
834  BOOST_REQUIRE_MESSAGE(identical_results, errors);
835 }
836 
837 // N.B.: these can't be done from a PSSM
838 #if 0
839 BOOST_AUTO_TEST_CASE(TestSingleIteration_PssmAsQuery_CBSConditional) {
840 
841  m_OptHandle->SetCompositionBasedStats(eCompositionMatrixAdjust);
842  CRef<CLocalDbAdapter> dbadapter(new CLocalDbAdapter(*m_SearchDb));
843  CPsiBlast psiblast(m_Pssm, dbadapter, m_OptHandle);
844 
845  CSearchResultSet results(*psiblast.Run());
846  BOOST_REQUIRE(results[0].GetErrors().empty());
847 
848  const int kNumExpectedMatchingSeqs = 6;
849  CConstRef<CSeq_align_set> sas = results[0].GetSeqAlign();
850  BOOST_REQUIRE_EQUAL(kNumExpectedMatchingSeqs, s_CountNumberUniqueGIs(sas));
851 
852  const size_t kNumExpectedHSPs = 7;
853  qa::TSeqAlignSet expected_results(kNumExpectedHSPs);
854 
855  // HSP # 1
856  {
857  int starts[] = { 0, 941, -1, 1093, 152, 1094 };
858  int lengths[] = { 152, 1, 80 };
859  expected_results[0].score = 595;
860  expected_results[0].evalue = 307180919e-71;
861  expected_results[0].bit_score = 233623298e-6;
862  expected_results[0].num_ident = 101;
863  copy(&starts[0], &starts[STATIC_ARRAY_SIZE(starts)],
864  back_inserter(expected_results[0].starts));
865  copy(&lengths[0], &lengths[STATIC_ARRAY_SIZE(lengths)],
866  back_inserter(expected_results[0].lengths));
867  }
868 
869  // HSP # 2
870  {
871  int starts[] = { 0, 154, -1, 308, 154, 309 };
872  int lengths[] = { 154, 1, 24 };
873  expected_results[1].score = 424;
874  expected_results[1].evalue = 20700336e-50;
875  expected_results[1].bit_score = 167754171e-6;
876  expected_results[1].num_ident = 73;
877  copy(&starts[0], &starts[STATIC_ARRAY_SIZE(starts)],
878  back_inserter(expected_results[1].starts));
879  copy(&lengths[0], &lengths[STATIC_ARRAY_SIZE(lengths)],
880  back_inserter(expected_results[1].lengths));
881  }
882 
883  // HSP # 3
884  {
885  int starts[] =
886  { 0, 190, 65, -1, 67, 255, 91, -1, 92, 279, 111, -1, 113, 298,
887  -1, 304, 119, 305, 151, -1, 152, 337, 163, -1, 164, 348,
888  -1, 374, 190, 380, 200, -1, 202, 390 };
889  int lengths[] =
890  { 65, 2, 24, 1, 19, 2, 6, 1, 32, 1, 11, 1, 26, 6, 10, 2, 30 };
891  expected_results[2].score = 372;
892  expected_results[2].evalue = 221677687e-45;
893  expected_results[2].bit_score = 147723793e-6;
894  expected_results[2].num_ident = 87;
895  copy(&starts[0], &starts[STATIC_ARRAY_SIZE(starts)],
896  back_inserter(expected_results[2].starts));
897  copy(&lengths[0], &lengths[STATIC_ARRAY_SIZE(lengths)],
898  back_inserter(expected_results[2].lengths));
899  }
900 
901  // HSP # 4
902  expected_results[3].score = 53;
903  expected_results[3].evalue = 216713461e-8;
904  expected_results[3].bit_score = 248451288e-7;
905  expected_results[3].num_ident = 8;
906  expected_results[3].starts.push_back(206);
907  expected_results[3].starts.push_back(46);
908  expected_results[3].lengths.push_back(19);
909 
910  // HSP # 5
911  {
912  int starts[] = { 177, 100, -1, 106, 183, 107, 205, -1, 215, 129 };
913  int lengths[] = { 6, 1, 22, 10, 14 };
914  expected_results[4].score = 52;
915  expected_results[4].evalue = 283036546e-8;
916  expected_results[4].bit_score = 244599292e-7;
917  expected_results[4].num_ident = 11;
918  copy(&starts[0], &starts[STATIC_ARRAY_SIZE(starts)],
919  back_inserter(expected_results[4].starts));
920  copy(&lengths[0], &lengths[STATIC_ARRAY_SIZE(lengths)],
921  back_inserter(expected_results[4].lengths));
922  }
923 
924  // HSP # 6
925  {
926  int starts[] = { 74, 181, 108, -1, 109, 215 };
927  int lengths[] = { 34, 1, 23 };
928  expected_results[5].score = 49;
929  expected_results[5].evalue = 630539642e-8;
930  expected_results[5].bit_score = 233043305e-7;
931  expected_results[5].num_ident = 14;
932  copy(&starts[0], &starts[STATIC_ARRAY_SIZE(starts)],
933  back_inserter(expected_results[5].starts));
934  copy(&lengths[0], &lengths[STATIC_ARRAY_SIZE(lengths)],
935  back_inserter(expected_results[5].lengths));
936  }
937 
938  // HSP # 7
939  expected_results[6].score = 49;
940  expected_results[6].evalue = 630539642e-8;
941  expected_results[6].bit_score = 233043305e-7;
942  expected_results[6].num_ident = 6;
943  expected_results[6].starts.push_back(188);
944  expected_results[6].starts.push_back(709);
945  expected_results[6].lengths.push_back(30);
946 
947  qa::TSeqAlignSet actual_results;
948  qa::SeqAlignSetConvert(*sas, actual_results);
949 
950  qa::CSeqAlignCmpOpts opts;
951  qa::CSeqAlignCmp cmp(expected_results, actual_results, opts);
952  string errors;
953  bool identical_results = cmp.Run(&errors);
954 
955  BOOST_REQUIRE_MESSAGE(identical_results, errors);
956 }
957 
958 BOOST_AUTO_TEST_CASE(TestSingleIteration_PssmAsQuery_CBSUniversal) {
959 
960  m_OptHandle->SetCompositionBasedStats(eCompoForceFullMatrixAdjust);
961  CRef<CLocalDbAdapter> dbadapter(new CLocalDbAdapter(*m_SearchDb));
962  CPsiBlast psiblast(m_Pssm, dbadapter, m_OptHandle);
963 
964  CSearchResultSet results(*psiblast.Run());
965  BOOST_REQUIRE(results[0].GetErrors().empty());
966 
967  const int kNumExpectedMatchingSeqs = 6;
968  CConstRef<CSeq_align_set> sas = results[0].GetSeqAlign();
969  BOOST_REQUIRE_EQUAL(kNumExpectedMatchingSeqs, s_CountNumberUniqueGIs(sas));
970 
971  const size_t kNumExpectedHSPs = 7;
972  qa::TSeqAlignSet expected_results(kNumExpectedHSPs);
973 
974  // HSP # 1
975  {
976  int starts[] = { 0, 941, -1, 1093, 152, 1094 };
977  int lengths[] = { 152, 1, 80 };
978  expected_results[0].score = 595;
979  expected_results[0].evalue = 307180919e-71;
980  expected_results[0].bit_score = 233623298e-6;
981  expected_results[0].num_ident = 101;
982  copy(&starts[0], &starts[STATIC_ARRAY_SIZE(starts)],
983  back_inserter(expected_results[0].starts));
984  copy(&lengths[0], &lengths[STATIC_ARRAY_SIZE(lengths)],
985  back_inserter(expected_results[0].lengths));
986  }
987 
988  // HSP # 2
989  {
990  int starts[] = { 0, 154, -1, 308, 154, 309 };
991  int lengths[] = { 154, 1, 24 };
992  expected_results[1].score = 424;
993  expected_results[1].evalue = 20700336e-50;
994  expected_results[1].bit_score = 167754171e-6;
995  expected_results[1].num_ident = 73;
996  copy(&starts[0], &starts[STATIC_ARRAY_SIZE(starts)],
997  back_inserter(expected_results[1].starts));
998  copy(&lengths[0], &lengths[STATIC_ARRAY_SIZE(lengths)],
999  back_inserter(expected_results[1].lengths));
1000  }
1001 
1002  // HSP # 3
1003  {
1004  int starts[] =
1005  { 0, 190, 65, -1, 67, 255, 91, -1, 92, 279, 111, -1, 113, 298,
1006  -1, 304, 119, 305, 151, -1, 152, 337, 163, -1, 164, 348,
1007  -1, 374, 190, 380, 200, -1, 202, 390 };
1008  int lengths[] =
1009  { 65, 2, 24, 1, 19, 2, 6, 1, 32, 1, 11, 1, 26, 6, 10, 2, 30 };
1010  expected_results[2].score = 372;
1011  expected_results[2].evalue = 221677687e-45;
1012  expected_results[2].bit_score = 147723793e-6;
1013  expected_results[2].num_ident = 87;
1014  copy(&starts[0], &starts[STATIC_ARRAY_SIZE(starts)],
1015  back_inserter(expected_results[2].starts));
1016  copy(&lengths[0], &lengths[STATIC_ARRAY_SIZE(lengths)],
1017  back_inserter(expected_results[2].lengths));
1018  }
1019 
1020  // HSP # 4
1021  expected_results[3].score = 53;
1022  expected_results[3].evalue = 216713461e-8;
1023  expected_results[3].bit_score = 248451288e-7;
1024  expected_results[3].num_ident = 8;
1025  expected_results[3].starts.push_back(206);
1026  expected_results[3].starts.push_back(46);
1027  expected_results[3].lengths.push_back(19);
1028 
1029  // HSP # 5
1030  {
1031  int starts[] = { 177, 100, -1, 106, 183, 107, 205, -1, 215, 129 };
1032  int lengths[] = { 6, 1, 22, 10, 14 };
1033  expected_results[4].score = 52;
1034  expected_results[4].evalue = 283036546e-8;
1035  expected_results[4].bit_score = 244599292e-7;
1036  expected_results[4].num_ident = 11;
1037  copy(&starts[0], &starts[STATIC_ARRAY_SIZE(starts)],
1038  back_inserter(expected_results[4].starts));
1039  copy(&lengths[0], &lengths[STATIC_ARRAY_SIZE(lengths)],
1040  back_inserter(expected_results[4].lengths));
1041  }
1042 
1043  // HSP # 6
1044  {
1045  int starts[] = { 74, 181, 108, -1, 109, 215 };
1046  int lengths[] = { 34, 1, 23 };
1047  expected_results[5].score = 49;
1048  expected_results[5].evalue = 630539642e-8;
1049  expected_results[5].bit_score = 233043305e-7;
1050  expected_results[5].num_ident = 14;
1051  copy(&starts[0], &starts[STATIC_ARRAY_SIZE(starts)],
1052  back_inserter(expected_results[5].starts));
1053  copy(&lengths[0], &lengths[STATIC_ARRAY_SIZE(lengths)],
1054  back_inserter(expected_results[5].lengths));
1055  }
1056 
1057  // HSP # 7
1058  expected_results[6].score = 49;
1059  expected_results[6].evalue = 630539642e-8;
1060  expected_results[6].bit_score = 233043305e-7;
1061  expected_results[6].num_ident = 6;
1062  expected_results[6].starts.push_back(188);
1063  expected_results[6].starts.push_back(709);
1064  expected_results[6].lengths.push_back(30);
1065 
1066  qa::TSeqAlignSet actual_results;
1067  qa::SeqAlignSetConvert(*sas, actual_results);
1068 
1069  qa::CSeqAlignCmpOpts opts;
1070  qa::CSeqAlignCmp cmp(expected_results, actual_results, opts);
1071  string errors;
1072  bool identical_results = cmp.Run(&errors);
1073 
1074  BOOST_REQUIRE_MESSAGE(identical_results, errors);
1075 }
1076 #endif
1077 
1078 // This search will converge after 2 iterations
1079 BOOST_AUTO_TEST_CASE(TestMultipleIterationsAndConvergence_PssmAsQuery_NoCBS) {
1080 
1081  const int kNumIterations = 4;
1082  const int kNumExpectedIterations = 2;
1083  CPsiBlastIterationState itr(kNumIterations);
1084  m_OptHandle->SetCompositionBasedStats(eNoCompositionBasedStats);
1085  CRef<CLocalDbAdapter> dbadapter(new CLocalDbAdapter(*m_SearchDb));
1086  CPsiBlast psiblast(m_Pssm, dbadapter, m_OptHandle);
1087 
1088  int hits_below_threshold[kNumIterations] = { 2, 2, 0, 0 };
1089  int number_hits[kNumIterations] = { 6, 5, 0, 0 };
1090 
1091  int iteration_counter = 0;
1092  while (itr) {
1093  CSearchResultSet results = *psiblast.Run();
1094  BOOST_REQUIRE(results[0].GetErrors().empty());
1095  CConstRef<CSeq_align_set> alignment = results[0].GetSeqAlign();
1096 
1097  BOOST_REQUIRE_EQUAL(number_hits[iteration_counter],
1098  s_CountNumberUniqueGIs(alignment));
1099 
1101  CPsiBlastIterationState::GetSeqIds(alignment, m_OptHandle, ids);
1102 
1103  string m("On round ");
1104  m += NStr::IntToString(itr.GetIterationNumber()) + " found ";
1105  m += NStr::SizetToString(ids.size()) + " qualifying ids";
1106  BOOST_REQUIRE_EQUAL(hits_below_threshold[iteration_counter],
1107  (int)ids.size());
1108  itr.Advance(ids);
1109 
1110  if (itr) {
1111  const CBioseq& query = m_Pssm->GetPssm().GetQuery().GetSeq();
1113  x_ComputePssmForNextIteration(query, alignment,
1114  m_OptHandle, results[0].GetAncillaryData());
1115  psiblast.SetPssm(pssm);
1116  }
1117  iteration_counter++;
1118  }
1119 
1120  BOOST_REQUIRE_EQUAL(kNumExpectedIterations, iteration_counter);
1121 }
1122 
1123 // This search will converge after 2 iterations
1124 BOOST_AUTO_TEST_CASE(TestMultipleIterationsAndConvergence_PssmAsQuery_CBS) {
1125 
1126  const int kNumIterations = 4;
1127  const int kNumExpectedIterations = 2;
1128  CPsiBlastIterationState itr(kNumIterations);
1129  m_OptHandle->SetCompositionBasedStats(eCompositionBasedStats);
1130  CRef<CLocalDbAdapter> dbadapter(new CLocalDbAdapter(*m_SearchDb));
1131  CPsiBlast psiblast(m_Pssm, dbadapter, m_OptHandle);
1132 
1133  int hits_below_threshold[kNumIterations] = { 2, 2, 0, 0 };
1134  int number_hits[kNumIterations] = { 4, 3, 0, 0 };
1135 
1136  int iteration_counter = 0;
1137  while (itr) {
1138  CSearchResultSet results = *psiblast.Run();
1139  BOOST_REQUIRE(results[0].GetErrors().empty());
1140  CConstRef<CSeq_align_set> alignment = results[0].GetSeqAlign();
1141  BOOST_REQUIRE(alignment.NotEmpty());
1142  BOOST_REQUIRE_EQUAL(number_hits[iteration_counter],
1143  s_CountNumberUniqueGIs(alignment));
1144 
1146  CPsiBlastIterationState::GetSeqIds(alignment, m_OptHandle, ids);
1147 
1148  string m("On round ");
1149  m += NStr::IntToString(itr.GetIterationNumber()) + " found ";
1150  m += NStr::SizetToString(ids.size()) + " qualifying ids";
1151  BOOST_REQUIRE_EQUAL(hits_below_threshold[iteration_counter],
1152  (int)ids.size());
1153  itr.Advance(ids);
1154 
1155  if (itr) {
1156  const CBioseq& query = m_Pssm->GetPssm().GetQuery().GetSeq();
1158  x_ComputePssmForNextIteration(query, alignment,
1159  m_OptHandle, results[0].GetAncillaryData());
1160  psiblast.SetPssm(pssm);
1161  }
1162  iteration_counter++;
1163  }
1164 
1165  BOOST_REQUIRE_EQUAL(kNumExpectedIterations, iteration_counter);
1166 }
1167 
1168 // Should throw exception as only one query sequence/pssm is allowed
1169 BOOST_AUTO_TEST_CASE(TestMultipleQueries) {
1170  CConstRef<CBioseq_set> bioseq_set(&m_SeqSet->GetSet());
1171  CRef<IQueryFactory> query_factory(s_SetupSubject(bioseq_set));
1172  CRef<CLocalDbAdapter> dbadapter(new CLocalDbAdapter(*m_SearchDb));
1173  BOOST_REQUIRE_THROW(CPsiBlast psiblast(query_factory, dbadapter, m_OptHandle), CBlastException);
1174 }
1175 
1176 BOOST_AUTO_TEST_CASE(TestNullQuery) {
1177  CRef<IQueryFactory> query_factory;
1178  CRef<CLocalDbAdapter> dbadapter(new CLocalDbAdapter(*m_SearchDb));
1179  BOOST_REQUIRE_THROW(CPsiBlast psiblast(query_factory, dbadapter, m_OptHandle),
1180  CBlastException);
1181 }
1182 
1183 BOOST_AUTO_TEST_CASE(TestFrequencyRatiosWithAllZerosInPssm) {
1185  m_Pssm->SetPssm().SetIntermediateData().SetFreqRatios()) {
1186  *fr = 0.0;
1187  }
1188 
1189  CRef<CLocalDbAdapter> dbadapter(new CLocalDbAdapter(*m_SearchDb));
1190  CPsiBlast psiblast(m_Pssm, dbadapter, m_OptHandle);
1191  CSearchResultSet r = *psiblast.Run();
1192  TQueryMessages messages = r[0].GetErrors(eBlastSevWarning);
1193  BOOST_REQUIRE( !messages.empty() );
1194 
1195  string expected_warning("Frequency ratios for PSSM are all zeros");
1196  string warning;
1197  ITERATE(TQueryMessages, m, messages) {
1198  if (((*m)->GetSeverity() == eBlastSevWarning) &&
1199  ((*m)->GetMessage().find(expected_warning) != string::npos)) {
1200  warning = (*m)->GetMessage();
1201  break;
1202  }
1203  }
1204  BOOST_REQUIRE_MESSAGE(!warning.empty(), "Did not find expected warning");
1205 }
1206 
1207 BOOST_AUTO_TEST_CASE(TestNullPssm) {
1208  m_Pssm.Reset();
1209  CRef<CLocalDbAdapter> dbadapter(new CLocalDbAdapter(*m_SearchDb));
1210  BOOST_REQUIRE_THROW(CPsiBlast psiblast(m_Pssm, dbadapter, m_OptHandle),
1211  CBlastException);
1212 }
1213 
1214 BOOST_AUTO_TEST_CASE(TestSetNullPssm) {
1215  CRef<CLocalDbAdapter> dbadapter(new CLocalDbAdapter(*m_SearchDb));
1216  CPsiBlast psiblast(m_Pssm, dbadapter, m_OptHandle);
1218  BOOST_REQUIRE_THROW(psiblast.SetPssm(pssm), CBlastException);
1219 }
1220 
1221 BOOST_AUTO_TEST_CASE(TestNonExistantDb) {
1222  m_SearchDb->SetDatabaseName("dummy");
1223  CRef<CLocalDbAdapter> dbadapter(new CLocalDbAdapter(*m_SearchDb));
1224  CPsiBlast psiblast(m_Pssm, dbadapter, m_OptHandle);
1225  BOOST_REQUIRE_THROW(psiblast.Run(),CSeqDBException);
1226 }
1227 
1228 BOOST_AUTO_TEST_CASE(TestNullOptions) {
1229  m_OptHandle.Reset();
1230  CConstRef<CBioseq> bioseq(&m_SeqEntry->GetSeq());
1231  CRef<IQueryFactory> query_factory(s_SetupSubject(bioseq));
1232  CRef<CLocalDbAdapter> dbadapter(new CLocalDbAdapter(*m_SearchDb));
1233  BOOST_REQUIRE_THROW(CPsiBlast psiblast(query_factory, dbadapter, m_OptHandle),
1234  CBlastException);
1235 }
1236 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Internal auxiliary setup classes/functions for extracting sequence data from Bioseqs.
@ eBlastSevWarning
Definition: blast_message.h:57
Definitions which are dependant on the NCBI C++ Object Manager.
PSIDiagnosticsRequest * PSIDiagnosticsRequestNew(void)
Allocates a PSIDiagnosticsRequest structure, setting all fields to false.
Definition: blast_psi.c:585
Declares CBlastScopeSource class to create properly configured CScope objects to invoke the BLAST dat...
BOOST_AUTO_TEST_SUITE_END() static int s_GetSegmentFlags(const CBioseq &bioseq)
Defines BLAST error codes (user errors included)
Class whose purpose is to create CScope objects which have data loaders added with different prioriti...
CRef< objects::CScope > NewScope()
Create a new, properly configured CScope.
Interface to create a BlastSeqSrc suitable for use in CORE BLAST from a a variety of BLAST database/s...
NCBI C++ Object Manager dependant implementation of IQueryFactory.
Handle to the protein-protein options to the BLAST algorithm.
Wrapper class for PSIDiagnosticsRequest .
Definition: blast_aux.hpp:347
Represents the iteration state in PSI-BLAST.
Runs a single iteration of the PSI-BLAST algorithm on a BLAST database.
Definition: psiblast.hpp:79
CScope –.
Definition: scope.hpp:92
Blast Search Subject.
Search Results for All Queries.
CSeqDBException.
Definition: seqdbcommon.hpp:73
CSeqDB.
Definition: seqdb.hpp:161
@ eProtein
Definition: seqdb.hpp:174
Definition: Seq_entry.hpp:56
Template class for iteration on objects of class C (non-medifiable version)
Definition: iterator.hpp:767
Source of query sequence data for BLAST Provides an interface for search classes to retrieve sequence...
Definition: query_data.hpp:147
Class for the messages for an individual query sequence.
size_type size() const
Definition: set.hpp:132
@ eCompositionBasedStats
Composition-based statistics as in NAR 29:2994-3005, 2001.
@ eCompoForceFullMatrixAdjust
Composition-based score adjustment as in Bioinformatics 21:902-911, 2005, unconditionally.
@ eNoCompositionBasedStats
Don't use composition based statistics.
@ eCompositionMatrixAdjust
Composition-based score adjustment as in Bioinformatics 21:902-911, 2005, conditioned on sequence pro...
CRef< CSearchResultSet > Run()
Run the PSI-BLAST engine for one iteration.
Definition: psiblast.cpp:95
unsigned int GetIterationNumber() const
Return the number of the current iteration.
void SetPssm(CConstRef< objects::CPssmWithParameters > pssm)
This method allows the same object to be reused when performing multiple iterations.
Definition: psiblast.cpp:83
ECompoAdjustModes GetCompositionBasedStats() const
Returns this mode, which mostly specifies whether composition of db sequence is taken into account wh...
const CBlastOptions & GetOptions() const
Return the object which this object is a handle for.
CRef< objects::CPssmWithParameters > PsiBlastComputePssmFromAlignment(const objects::CBioseq &query, CConstRef< objects::CSeq_align_set > alignment, CRef< objects::CScope > database_scope, const CPSIBlastOptionsHandle &opts_handle, CConstRef< CBlastAncillaryData > ancillary_data, PSIDiagnosticsRequest *diagnostics_req=0)
Computes a PSSM from the result of a PSI-BLAST iteration.
Definition: psiblast.cpp:102
void PsiBlastComputePssmScores(CRef< objects::CPssmWithParameters > pssm, const CBlastOptions &opts)
Given a PSSM with frequency ratios and options, invoke the PSSM engine to compute the scores.
void Advance(const TSeqIds &list)
Advance the iterator by passing it the list of Seq-ids which passed the inclusion criteria for the cu...
static void GetSeqIds(CConstRef< objects::CSeq_align_set > seqalign, CConstRef< CPSIBlastOptionsHandle > opts, TSeqIds &retval)
Extract the sequence ids from the sequence alignment which identify those sequences that will be used...
@ eBlastDbIsProtein
protein
#define INVALID_GI
Definition: ncbimisc.hpp:1089
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
virtual bool Equals(const CSerialObject &object, ESerialRecursionMode how=eRecursive) const
Check if both objects contain the same values.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
static bool PreferAccessionOverGi(void)
Check if the option to prefer accession.version over GI is enabled (SeqId/PreferAccessionOverGi or SE...
Definition: Seq_id.cpp:3404
void SetWhole(TWhole &v)
Definition: Seq_loc.hpp:982
void SetId(CSeq_id &id)
set the 'id' field in all parts of this location
Definition: Seq_loc.cpp:3474
CConstBeginInfo ConstBegin(const C &obj)
Get starting point of non-modifiable object hierarchy.
Definition: iterator.hpp:1012
@ eDetectLoops
Definition: iterator.hpp:998
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
Definition: scope.cpp:530
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CConstRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:1392
static string SizetToString(size_t value, TNumToStringFlags flags=0, int base=10)
Convert size_t to string.
Definition: ncbistr.cpp:2751
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
bool CanGetIntermediateData(void) const
Check if it is safe to call GetIntermediateData method.
Definition: Pssm_.hpp:787
const TFinalData & GetFinalData(void) const
Get the FinalData member data.
Definition: Pssm_.hpp:814
bool CanGetFinalData(void) const
Check if it is safe to call GetFinalData method.
Definition: Pssm_.hpp:808
void SetPssm(TPssm &value)
Assign a value to Pssm data member.
bool CanGetQuery(void) const
Check if it is safe to call GetQuery method.
Definition: Pssm_.hpp:766
bool CanGetScores(void) const
Check if it is safe to call GetScores method.
bool CanGetPssm(void) const
Check if it is safe to call GetPssm method.
const TPssm & GetPssm(void) const
Get the Pssm member data.
list< CRef< CSeq_align > > Tdata
const Tdata & Get(void) const
Get the member data.
TGi GetGi(void) const
Get the variant data.
Definition: Seq_id_.hpp:889
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
Declares class which provides internal BLAST database representations to the internal BLAST APIs.
constexpr bool empty(list< Ts... >) noexcept
Magic spell ;-) needed for some weird compilers... very empiric.
std::vector< SeqAlign > TSeqAlignSet
Vector of neutral sequence alignments.
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
NOTE: This file contains work in progress and the APIs are likely to change, please do not rely on th...
Utilities to develop and debug unit tests for BLAST.
Defines a concrete strategy to obtain PSSM input data for PSI-BLAST.
Declares CPsiBlast, the C++ API for the PSI-BLAST engine.
Declarations of auxiliary functions/classes for PSI-BLAST.
Defines class which represents the iteration state in PSI-BLAST.
#define STATIC_ARRAY_SIZE(array)
Calculate the size of a static array.
BOOST_AUTO_TEST_CASE(TestSingleIteration_ProteinAsQuery_NoCBS)
C++ API for the PSI-BLAST PSSM engine.
static bool GetSeqId(const T &d, set< string > &labels, const string name="", bool detect=false, bool found=false)
API to compare CSeq-aligns produced by BLAST.
void SeqAlignSetConvert(const objects::CSeq_align_set &ss, std::vector< SeqAlign > &retval)
Converts a Seq-align-set into a neutral seqalign for use with the CSeqAlignCmp class.
Defines BLAST database access classes.
vector< SSeqLoc > TSeqLocVector
Vector of sequence locations.
Definition: sseqloc.hpp:129
CRef< CPssmWithParameters > x_ComputePssmForNextIteration(const CBioseq &query, CConstRef< CSeq_align_set > sset, CConstRef< CPSIBlastOptionsHandle > opts_handle, CConstRef< CBlastAncillaryData > ancillary_data)
IQueryFactory * s_SetupSubject(CConstRef< CBioseq_set > bioseq_set)
IQueryFactory * s_SetupSubject(CConstRef< CBioseq > bioseq)
CRef< CPSIBlastOptionsHandle > m_OptHandle
CSearchDatabase * m_SearchDb
CRef< CSeq_entry > m_SeqEntry
Contains a single Bioseq.
int s_CountNumberUniqueGIs(CConstRef< CSeq_align_set > sas)
CRef< CSeq_entry > m_SeqSet
Contains a Bioseq-set with two Bioseqs, gi 7450545 and gi 129295.
CRef< CPssmWithParameters > m_Pssm
Boolean frequency_ratios
request frequency ratios
Definition: blast_psi.h:187
Structure to represent a single sequence to be fed to BLAST.
Definition: sseqloc.hpp:47
static string query
Utility stuff for more convenient using of Boost.Test library.
Uniform BLAST Search Interface.
Modified on Mon Apr 22 03:59:47 2024 by modify_doxy.py rev. 669887