1 /* $Id: psibl2seq_unit_test.cpp 91996 2020-12-17 15:27:17Z grichenk $
2  * ===========================================================================
3  *
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Christiam Camacho
27  *
28  */
30 /** @file psibl2seq_unit_test.cpp
31  * Unit test module for the PSI-BLAST 2 Sequences class
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/test_boost.hpp>
35 #include <serial/iterator.hpp>
37 #include "psiblast_aux_priv.hpp" // for PsiBlastComputePssmScores
41 // Object includes
52 #include "test_objmgr.hpp"
53 #include "blast_test_util.hpp" // needed to read datatool generated objects
54 //#include "psiblast_test_util.hpp" // needed for construction of PSSM
56 // SeqAlign comparison includes
57 #include "seqalign_cmp.hpp"
58 #include "seqalign_set_convert.hpp"
60 using namespace ncbi;
61 using namespace ncbi::objects;
62 using namespace ncbi::blast;
65 public:
70  // Data members which store the subject(s)
72  /// must be initialized with one of the two data members below
75  /// Contains a single Bioseq
78  /// Contains a Bioseq-set with two Bioseqs, gi 7450545 and gi 129295
84  m_OptHandle.Reset(new CPSIBlastOptionsHandle);
86  x_ReadPssmFromFile();
87  PsiBlastComputePssmScores(m_Pssm, m_OptHandle->GetOptions());
88  BOOST_REQUIRE(m_Pssm->GetPssm().GetFinalData().CanGetScores());
90  x_ReadSeqEntriesFromFile();
92  CConstRef<CBioseq> bioseq(&m_SeqEntry->GetSeq());
93  x_SetupSubject(bioseq);
94  }
97  m_Scope.Reset();
98  m_Pssm.Reset();
99  m_OptHandle.Reset();
100  m_SeqEntry.Reset();
101  m_SeqSet.Reset();
102  m_Subject.Reset();
103  }
105  // Auxiliary private functions go below...
108  TSeqLocVector subjects;
110  CConstRef<CSeq_id> sid = (m_Scope->AddBioseq(*bioseq)).GetSeqId();
111  CRef<CSeq_loc> sl(new CSeq_loc());
112  sl->SetWhole();
113  sl->SetId(*sid);
114  SSeqLoc ssl(*sl, *m_Scope);
115  subjects.push_back(ssl);
116  m_Subject.Reset(new CObjMgr_QueryFactory(subjects));
117  }
120  TSeqLocVector subjects;
123  for (; itr; ++itr) {
124  CConstRef<CSeq_id> sid = (m_Scope->AddBioseq(*itr)).GetSeqId();
125  CRef<CSeq_loc> sl(new CSeq_loc());
126  sl->SetWhole();
127  sl->SetId(*sid);
128  SSeqLoc ssl(*sl, *m_Scope);
129  subjects.push_back(ssl);
130  }
131  m_Subject.Reset(new CObjMgr_QueryFactory(subjects));
132  }
134  // Note that the scoremat stored in the file does not have scores
136  const string kPssmFile("data/pssm_freq_ratios.asn");
137  m_Pssm = TestUtil::ReadObject<CPssmWithParameters>(kPssmFile);
138  BOOST_REQUIRE(m_Pssm->GetPssm().CanGetQuery());
139  BOOST_REQUIRE(m_Pssm->GetPssm().CanGetIntermediateData());
140  BOOST_REQUIRE(!m_Pssm->GetPssm().CanGetFinalData());
141  }
144  const string kSeqEntryFile("data/7450545.seqentry.asn");
145  m_SeqEntry = TestUtil::ReadObject<CSeq_entry>(kSeqEntryFile);
148  m_SeqSet.Reset(new CSeq_entry);
149  m_SeqSet->SetSet().SetSeq_set().push_back(m_SeqEntry);
150  BOOST_REQUIRE(m_Pssm &&
151  m_Pssm->CanGetPssm() &&
152  m_Pssm->GetPssm().CanGetQuery());
153  CRef<CSeq_entry> second_bioseq(&m_Pssm->SetPssm().SetQuery());
154  m_SeqSet->SetSet().SetSeq_set().push_back(second_bioseq);
156  }
159  BOOST_REQUIRE_EQUAL(false, sa->IsSetSegs());
160  }
163  BOOST_REQUIRE(sa->GetSegs().IsDenseg());
165  const CDense_seg & denseg = sa->GetSegs().GetDenseg();
167  if (hsp_num == 1)
168  {
169  // Validate the first HSP
170  pair<TSeqRange, TSeqRange> first_hsp =
171  make_pair(TSeqRange(24, 29), TSeqRange(245, 250));
172  TSeqRange hsp1_query = denseg.GetSeqRange(0);
173  TSeqRange hsp1_subj = denseg.GetSeqRange(1);
174  BOOST_REQUIRE_EQUAL(first_hsp.first.GetFrom(), hsp1_query.GetFrom());
175  BOOST_REQUIRE_EQUAL(first_hsp.first.GetTo(), hsp1_query.GetTo());
176  BOOST_REQUIRE_EQUAL(first_hsp.second.GetFrom(), hsp1_subj.GetFrom());
177  BOOST_REQUIRE_EQUAL(first_hsp.second.GetTo(), hsp1_subj.GetTo());
178  }
179  else if (hsp_num == 2)
180  {
181  // Validate the second HSP
182  const pair<TSeqRange, TSeqRange> second_hsp =
183  make_pair(TSeqRange(74, 86), TSeqRange(108, 120));
184  TSeqRange hsp2_query = denseg.GetSeqRange(0);
185  TSeqRange hsp2_subj = denseg.GetSeqRange(1);
186  BOOST_REQUIRE_EQUAL(second_hsp.first.GetFrom(), hsp2_query.GetFrom());
187  BOOST_REQUIRE_EQUAL(second_hsp.first.GetTo(), hsp2_query.GetTo());
188  BOOST_REQUIRE_EQUAL(second_hsp.second.GetFrom(), hsp2_subj.GetFrom());
189  BOOST_REQUIRE_EQUAL(second_hsp.second.GetTo(), hsp2_subj.GetTo());
190  }
192  }
195  BOOST_REQUIRE(sa->GetSegs().IsDenseg());
197  const CDense_seg & denseg = sa->GetSegs().GetDenseg();
199  // Validate the first (and only) HSP, which is a self hit
200  const TSeqRange hsp(0, 231);
201  TSeqRange hsp1_query = denseg.GetSeqRange(0);
202  TSeqRange hsp1_subj = denseg.GetSeqRange(1);
203  BOOST_REQUIRE_EQUAL(hsp.GetFrom(), hsp1_query.GetFrom());
204  BOOST_REQUIRE_EQUAL(hsp.GetTo(), hsp1_query.GetTo());
205  BOOST_REQUIRE_EQUAL(hsp.GetFrom(), hsp1_subj.GetFrom());
206  BOOST_REQUIRE_EQUAL(hsp.GetTo(), hsp1_subj.GetTo());
208  }
210 };
212 BOOST_FIXTURE_TEST_SUITE(psibl2seq, CPsiBl2SeqTestFixture)
214 #if 0
216  m_Pssm->SetPssm().SetFinalData().SetScalingFactor(2);
217  BOOST_REQUIRE_THROW(CPsiBl2Seq blaster(m_Pssm, m_Subject, m_OptHandle),
219 }
221 BOOST_AUTO_TEST_CASE(TestInvalidPSSM_MissingScoresAndFreqRatios) {
222  m_Pssm->SetPssm().SetFinalData().ResetScores();
223  m_Pssm->SetPssm().SetIntermediateData().ResetFreqRatios();
224  BOOST_REQUIRE_THROW(CPsiBl2Seq blaster(m_Pssm, m_Subject, m_OptHandle),
226 }
228 BOOST_AUTO_TEST_CASE(TestInvalidPSSM_MissingQuery) {
229  m_Pssm->SetPssm().ResetQuery();
230  BOOST_REQUIRE_THROW(CPsiBl2Seq blaster(m_Pssm, m_Subject, m_OptHandle),
232 }
234 BOOST_AUTO_TEST_CASE(TestInvalidPSSM_Bioseq_setAsQuery) {
235  m_Pssm->SetPssm().SetQuery(*m_SeqSet);
236  BOOST_REQUIRE_THROW(CPsiBl2Seq blaster(m_Pssm, m_Subject, m_OptHandle),
238 }
240 BOOST_AUTO_TEST_CASE(TestInvalidPSSM_NuclScoringMatrix) {
241  m_Pssm->SetPssm().SetIsProtein(false);
242  BOOST_REQUIRE_THROW(CPsiBl2Seq blaster(m_Pssm, m_Subject, m_OptHandle),
244 }
246 BOOST_AUTO_TEST_CASE(TestMissingQuery) {
248  opts(dynamic_cast<CBlastProteinOptionsHandle*>(&*m_OptHandle));
249  CRef<IQueryFactory> empty_query;
250  BOOST_REQUIRE_THROW(CPsiBl2Seq blaster(empty_query, m_Subject, opts),
252 }
253 BOOST_AUTO_TEST_CASE(TestMultipleQueries) {
254  TSeqLocVector queries;
255  int gis[] = {
256  129295, // this gi is protein
257  555 }; // this gi is nucleotide
258  for (size_t i = 0; i < sizeof(gis)/sizeof(*gis); i++) {
259  CRef<CSeq_id> seqid(new CSeq_id(CSeq_id::e_Gi, gis[i]));
260  TSeqRange range(0U, 50U);
261  unique_ptr<SSeqLoc> sl(CTestObjMgr::Instance().
262  CreateSSeqLoc(*seqid, range));
263  queries.push_back(*sl);
264  }
265  CRef<IQueryFactory> qf(new CObjMgr_QueryFactory(queries));
267  opts(dynamic_cast<CBlastProteinOptionsHandle*>(&*m_OptHandle));
268  BOOST_REQUIRE_THROW(CPsiBl2Seq blaster(qf, m_Subject, opts),
270 }
272 BOOST_AUTO_TEST_CASE(TestQueryIsNucleotide) {
273  TSeqLocVector queries;
274  CRef<CSeq_id> seqid(new CSeq_id(CSeq_id::e_Gi, 555));
275  TSeqRange range(0U, 500U);
276  unique_ptr<SSeqLoc> sl(CTestObjMgr::Instance().
277  CreateSSeqLoc(*seqid, range));
278  queries.push_back(*sl);
282  opts(dynamic_cast<CBlastProteinOptionsHandle*>(&*m_OptHandle));
283  BOOST_REQUIRE_THROW(CPsiBl2Seq blaster(query, m_Subject, opts),
285 }
287 BOOST_AUTO_TEST_CASE(TestSubjectIsNucleotide) {
288  TSeqLocVector sequences;
289  CRef<CSeq_id> seqid(new CSeq_id(CSeq_id::e_Gi, 555));
290  TSeqRange range(0U, 500U);
291  unique_ptr<SSeqLoc> sl(CTestObjMgr::Instance().
292  CreateSSeqLoc(*seqid, range));
293  sequences.push_back(*sl);
294  m_Subject.Reset(new CObjMgr_QueryFactory(sequences));
295  BOOST_REQUIRE_THROW(CPsiBl2Seq blaster(m_Pssm, m_Subject, m_OptHandle),
297 }
299 BOOST_AUTO_TEST_CASE(TestMissingSubjects) {
300  m_Subject.Reset();
301  BOOST_REQUIRE_THROW(CPsiBl2Seq blaster(m_Pssm, m_Subject, m_OptHandle),
303 }
306  m_Pssm.Reset();
307  BOOST_REQUIRE_THROW(CPsiBl2Seq blaster(m_Pssm, m_Subject, m_OptHandle),
309 }
311 BOOST_AUTO_TEST_CASE(TestMissingOptions) {
312  m_OptHandle.Reset();
313  BOOST_REQUIRE_THROW(CPsiBl2Seq blaster(m_Pssm, m_Subject, m_OptHandle),
315 }
317 BOOST_AUTO_TEST_CASE(TestComparePssmWithSingleSequence) {
318  CConstRef<CBioseq> bioseq(&m_SeqEntry->GetSeq());
319  x_SetupSubject(bioseq);
321  CPsiBl2Seq blaster(m_Pssm, m_Subject, m_OptHandle);
322  CSearchResultSet results(*blaster.Run());
323  BOOST_REQUIRE(results[0].GetErrors().empty());
325  const size_t kNumExpectedAlignments = 2;
326  CConstRef<CSeq_align_set> sas = results[0].GetSeqAlign();
328  BOOST_REQUIRE_EQUAL(kNumExpectedAlignments, sas->Size());
331  CSeq_align_set::Tdata::const_iterator alignment_itr
332  = sas->Get().begin();
333  x_ValidatePssmVsGi7450545(*alignment_itr, 1);
334  ++alignment_itr;
335  x_ValidatePssmVsGi7450545(*alignment_itr, 2);
336 }
337 #endif
339 BOOST_AUTO_TEST_CASE(TestComparePssmWithMultipleSequences) {
340  const size_t kNumSubjects = 2;
341  CConstRef<CBioseq_set> bioseq_set(&m_SeqSet->GetSet());
342  x_SetupSubject(bioseq_set);
344  CPsiBl2Seq blaster(m_Pssm, m_Subject, m_OptHandle);
345  CSearchResultSet results(*blaster.Run());
346  BOOST_REQUIRE(results[0].GetErrors().empty());
348  const CBlastOptions& opts = m_OptHandle->GetOptions();
349  BOOST_REQUIRE_EQUAL(kNumSubjects,
350  (size_t)m_Subject->MakeLocalQueryData(&opts)->GetNumQueries());
351  BOOST_REQUIRE_EQUAL(kNumSubjects,
352  results[0].GetSeqAlign()->Get().size());
354  const size_t kNumExpectedAlignments = kNumSubjects;
355  CConstRef<CSeq_align_set> sas = results[0].GetSeqAlign();
356  BOOST_REQUIRE_EQUAL(kNumExpectedAlignments, sas->Get().size());
358  CSeq_align_set::Tdata::const_iterator alignment_itr
359  = sas->Get().begin();
360  x_ValidatePssmVsGi7450545(*alignment_itr, 1);
361  ++alignment_itr;
362  x_ValidatePssmVsGi7450545(*alignment_itr, 2);
365  BOOST_REQUIRE(results[1].GetErrors().empty());
366  CConstRef<CSeq_align_set> sas2 = results[1].GetSeqAlign();
368  x_ValidatePssmVsGi129295(*(sas2->Get().begin()));
370 }
372 #if 0
373 BOOST_AUTO_TEST_CASE(TestComparePssmWithMultipleSequences_OneWithNoResults) {
376  // Prepare the subjects
377  TSeqLocVector subjects;
378  {
379  int subj_gis[] = { 7450545, 40456275, 129295 };
380  for (size_t i = 0; i < sizeof(subj_gis)/sizeof(*subj_gis); i++) {
381  CRef<CSeq_loc> subj_loc(new CSeq_loc);
382  subj_loc->SetWhole().SetGi(subj_gis[i]);
383  subjects.push_back(SSeqLoc(subj_loc, scope));
384  }
385  }
387  // set up the query factories for the subjects
388  CRef<IQueryFactory> subj_factory(new CObjMgr_QueryFactory(subjects));
390  CPsiBl2Seq blaster(m_Pssm, subj_factory, m_OptHandle);
391  CSearchResultSet results(*blaster.Run());
392  BOOST_REQUIRE(results[0].GetErrors().empty());
394  BOOST_REQUIRE_EQUAL(subjects.size(), results.GetNumResults());
396  CConstRef<CSeq_align_set> sas = results[0].GetSeqAlign();
397  CSeq_align_set::Tdata::const_iterator alignment_itr
398  = sas->Get().begin();
399  x_ValidatePssmVsGi7450545(*alignment_itr, 1);
400  ++alignment_itr;
401  x_ValidatePssmVsGi7450545(*alignment_itr, 2);
403  CConstRef<CSeq_align_set> sas2 = results[1].GetSeqAlign();
405  // REMOVE??? x_ValidatePssmVsGi40456275(*(sas2->Get().begin()));
406  BOOST_REQUIRE_EQUAL(0, (int) sas2->Size());
408  x_ValidatePssmVsGi129295(*(results[2].GetSeqAlign()->Get().begin()));
410 }
412 BOOST_AUTO_TEST_CASE(TestComparePsiBl2SeqWithBl2Seq) {
415  // Prepare the query
417  {
418  CRef<CSeq_loc> query_loc(new CSeq_loc);
419  query_loc->SetWhole().SetGi(7662354);
420  query.push_back(SSeqLoc(query_loc, scope));
421  }
423  // Prepare the subjects
424  TSeqLocVector subjects;
425  {
426  // These gis have hits against the query sequence above
427  int subj_gis[] = { 34535770, 46125411 };
428  for (size_t i = 0; i < sizeof(subj_gis)/sizeof(*subj_gis); i++) {
429  CRef<CSeq_loc> subj_loc(new CSeq_loc);
430  subj_loc->SetWhole().SetGi(subj_gis[i]);
431  subjects.push_back(SSeqLoc(subj_loc, scope));
432  }
433  }
435  // set up the query factories for CPsiBl2Seq
436  CRef<IQueryFactory> query_factory(new CObjMgr_QueryFactory(query));
437  CRef<IQueryFactory> subj_factory(new CObjMgr_QueryFactory(subjects));
439  // Reset composition based statistics for now
440  m_OptHandle->SetCompositionBasedStats(eNoCompositionBasedStats);
442  // Run BLAST 2 Sequences (objmgr dependent version)
443  CBl2Seq bl2seq(query, subjects, *m_OptHandle);
444  TSeqAlignVector bl2seq_results = bl2seq.Run();
446  // Run BLAST 2 Sequences (objmgr independent version)
447  // Configure the options the same way
449  psi_opts(dynamic_cast<CBlastProteinOptionsHandle*>(&*m_OptHandle));
450  CPsiBl2Seq psibl2seq(query_factory, subj_factory, psi_opts);
451  CSearchResultSet psibl2seq_results = *psibl2seq.Run();
453  qa::TSeqAlignSet results_ref;
454  qa::TSeqAlignSet results_test;
456  qa::SeqAlignSetConvert(*bl2seq_results[0], results_ref);
457  qa::SeqAlignSetConvert(*psibl2seq_results[0].GetSeqAlign(), results_test);
459  qa::CSeqAlignCmpOpts opts;
460  qa::CSeqAlignCmp cmp(results_ref, results_test, opts);
461  string errors;
462  bool identical_results = cmp.Run(&errors);
464  BOOST_REQUIRE_MESSAGE(identical_results, errors);
465 }
466 #endif
