NCBI C++ ToolKit
magicblast_unit_test.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: magicblast_unit_test.cpp 100942 2023-10-03 17:36:50Z ucko $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Greg Boratyn
27  *
28  * Test module for Magic-BLAST API
29  */
30 
31 #include <ncbi_pch.hpp>
32 #include <corelib/test_boost.hpp>
33 
34 // Serial library includes
35 #include <serial/serial.hpp>
36 #include <serial/objistr.hpp>
37 
38 // Object includes
45 
48 #include <objects/seq/Bioseq.hpp>
50 
52 
53 // BLAST includes
58 
60 
61 using namespace std;
62 using namespace ncbi;
63 using namespace ncbi::objects;
64 using namespace ncbi::blast;
65 
67 
70 
71  /// Contains a single Bioseq
73 
74 
76  m_OptHandle.Reset(new CMagicBlastOptionsHandle);
77  m_Db.Reset(new CSearchDatabase("data/pombe",
79 
80  m_Queries.Reset(new CBioseq_set);
81  }
82 
83 
85  m_OptHandle.Reset();
86  m_Db.Reset();
87  m_Queries.Reset();
88  }
89 };
90 
91 
92 BOOST_FIXTURE_TEST_SUITE(magicblast, CMagicBlastTestFixture)
93 
94 
95 struct SExon
96 {
97  unsigned int prod_start;
98  unsigned int prod_end;
99  unsigned int gen_start;
100  unsigned int gen_end;
101 
104  string acceptor;
105  string donor;
106 };
107 
108 struct SMatch
109 {
110  int score;
111  unsigned int prod_length;
112  vector<SExon> exons;
113 };
114 
115 
116 BOOST_AUTO_TEST_CASE(MappingNonMatch)
117 {
118  ifstream istr("data/magicblast_nonmatch.asn");
119  BOOST_REQUIRE(istr);
120  istr >> MSerial_AsnText >> *m_Queries;
121 
122  CRef<IQueryFactory> query_factory(new CObjMgrFree_QueryFactory(m_Queries));
123  CRef<CLocalDbAdapter> db_adapter(new CLocalDbAdapter(*m_Db));
124  m_OptHandle->SetMismatchPenalty(-8);
125  m_OptHandle->SetGapExtensionCost(8);
126  m_OptHandle->SetPaired(true);
127  CMagicBlast magicblast(query_factory, db_adapter, m_OptHandle);
128  CRef<CMagicBlastResultSet> results = magicblast.RunEx();
129 
130  const size_t kExpectedNumResults = 1;
131  BOOST_REQUIRE_EQUAL(results->size(), kExpectedNumResults);
132 }
133 
134 
135 BOOST_AUTO_TEST_CASE(MappingAllConcordant)
136 {
137  ifstream istr("data/magicblast_concordant.asn");
138  BOOST_REQUIRE(istr);
139  istr >> MSerial_AsnText >> *m_Queries;
140 
141  CRef<IQueryFactory> query_factory(new CObjMgrFree_QueryFactory(m_Queries));
142  CRef<CLocalDbAdapter> db_adapter(new CLocalDbAdapter(*m_Db));
143  m_OptHandle->SetPaired(true);
144  CMagicBlast magicblast(query_factory, db_adapter, m_OptHandle);
145  CRef<CMagicBlastResultSet> results = magicblast.RunEx();
146 
147  const size_t kExpectedNumResults = 3;
148  const size_t kExpectedConcordant = 3;
149  BOOST_REQUIRE_EQUAL(results->size(), kExpectedNumResults);
150  size_t count = 0;
151  for (auto r = results->begin(); r != results->end(); ++r) {
153  if (re->IsConcordant()) ++count;
154  }
155  BOOST_REQUIRE_EQUAL(count, kExpectedConcordant);
156 }
157 
158 
159 BOOST_AUTO_TEST_CASE(MappingAllDiscordant)
160 {
161  ifstream istr("data/magicblast_discordant.asn");
162  BOOST_REQUIRE(istr);
163  istr >> MSerial_AsnText >> *m_Queries;
164 
165  CRef<IQueryFactory> query_factory(new CObjMgrFree_QueryFactory(m_Queries));
166  CRef<CLocalDbAdapter> db_adapter(new CLocalDbAdapter(*m_Db));
167  m_OptHandle->SetPaired(true);
168  CMagicBlast magicblast(query_factory, db_adapter, m_OptHandle);
169  CRef<CMagicBlastResultSet> results = magicblast.RunEx();
170 
171  const size_t kExpectedNumResults = 4;
172  const size_t kExpectedDiscordant = 4;
173  BOOST_REQUIRE_EQUAL(results->size(), kExpectedNumResults);
174  size_t count = 0;
175  for (auto r = results->begin(); r != results->end(); ++r) {
177  if (!re->IsConcordant()) ++count;
178  }
179  BOOST_REQUIRE_EQUAL(count, kExpectedDiscordant);
180 }
181 
182 
183 BOOST_AUTO_TEST_CASE(MappingNoPairs)
184 {
185  ifstream istr("data/magicblast_queries.asn");
186  BOOST_REQUIRE(istr);
187  istr >> MSerial_AsnText >> *m_Queries;
188 
189  CRef<IQueryFactory> query_factory(new CObjMgrFree_QueryFactory(m_Queries));
190  CRef<CLocalDbAdapter> db_adapter(new CLocalDbAdapter(*m_Db));
191  m_OptHandle->SetMismatchPenalty(-8);
192  m_OptHandle->SetGapExtensionCost(8);
193  m_OptHandle->SetCutoffScore(49);
194  CMagicBlast magicblast(query_factory, db_adapter, m_OptHandle);
195  CRef<CSeq_align_set> results = magicblast.Run();
196 
197  const size_t kExpectedNumResults = 4;
198  BOOST_REQUIRE_EQUAL(results->Get().size(), kExpectedNumResults);
199 
200  SExon exon;
201  vector<SMatch> expected_hits(kExpectedNumResults);
202 
203  // expected HSPs
204 
205  int results_idx = 0;
206  // HSP #1
207  expected_hits[results_idx].score = 49;
208  expected_hits[results_idx].prod_length = 49;
209 
210  exon.prod_start = 0;
211  exon.prod_end = 21;
212  exon.gen_start = 1827220;
213  exon.gen_end = 1827241;
216  exon.acceptor = "";
217  exon.donor = "CT";
218  expected_hits[results_idx].exons.push_back(exon);
219 
220  exon.prod_start = 22;
221  exon.prod_end = 48;
222  exon.gen_start = 1827292;
223  exon.gen_end = 1827318;
226  exon.acceptor = "AC";
227  exon.donor = "";
228  expected_hits[results_idx].exons.push_back(exon);
229 
230  // HSP #2
231  results_idx++;
232  expected_hits[results_idx].score = 49;
233  expected_hits[results_idx].prod_length = 49;
234 
235  exon.prod_start = 0;
236  exon.prod_end = 28;
237  exon.gen_start = 181290;
238  exon.gen_end = 181318;
241  exon.acceptor = "";
242  exon.donor = "CT";
243  expected_hits[results_idx].exons.push_back(exon);
244 
245  exon.prod_start = 29;
246  exon.prod_end = 48;
247  exon.gen_start = 181367;
248  exon.gen_end = 181386;
251  exon.acceptor = "AC";
252  exon.donor = "";
253  expected_hits[results_idx].exons.push_back(exon);
254 
255  // HSP #3
256  results_idx++;
257  expected_hits[results_idx].score = 49;
258  expected_hits[results_idx].prod_length = 49;
259 
260  exon.prod_start = 0;
261  exon.prod_end = 20;
262  exon.gen_start = 1033352;
263  exon.gen_end = 1033372;
266  exon.acceptor = "";
267  exon.donor = "CT";
268  expected_hits[results_idx].exons.push_back(exon);
269 
270  exon.prod_start = 21;
271  exon.prod_end = 48;
272  exon.gen_start = 1033432;
273  exon.gen_end = 1033459;
276  exon.acceptor = "AC";
277  exon.donor = "";
278  expected_hits[results_idx].exons.push_back(exon);
279 
280  // HSP #4
281  results_idx++;
282  expected_hits[results_idx].score = 49;
283  expected_hits[results_idx].prod_length = 49;
284 
285  exon.prod_start = 0;
286  exon.prod_end = 23;
287  exon.gen_start = 89112;
288  exon.gen_end = 89135;
291  exon.acceptor = "";
292  exon.donor = "CT";
293  expected_hits[results_idx].exons.push_back(exon);
294 
295  exon.prod_start = 24;
296  exon.prod_end = 48;
297  exon.gen_start = 89420;
298  exon.gen_end = 89444;
301  exon.acceptor = "AC";
302  exon.donor = "";
303  expected_hits[results_idx].exons.push_back(exon);
304 
305  // compare computed HSPs with the expected ones
306  results_idx = 0;
307  for (auto it: results->Get()) {
308 
309  // we do not expect paired results
310  BOOST_REQUIRE(it->GetSegs().IsSpliced());
311 
312  int score;
313  it->GetNamedScore("score", score);
314  BOOST_REQUIRE_EQUAL(score, expected_hits[results_idx].score);
315 
316  const CSpliced_seg& seg = it->GetSegs().GetSpliced();
317  BOOST_REQUIRE_EQUAL(seg.GetProduct_length(),
318  expected_hits[results_idx].prod_length);
319 
320  BOOST_REQUIRE_EQUAL(seg.GetExons().size(),
321  expected_hits[results_idx].exons.size());
322 
323  // compare exon data
324  auto expected_exon = expected_hits[results_idx].exons.begin();
325  for (auto exon: seg.GetExons()) {
326 
327  // exon starts and stops
328  BOOST_REQUIRE_EQUAL(exon->GetProduct_start().GetNucpos(),
329  expected_exon->prod_start);
330 
331  BOOST_REQUIRE_EQUAL(exon->GetProduct_end().GetNucpos(),
332  expected_exon->prod_end);
333 
334  BOOST_REQUIRE_EQUAL(exon->GetGenomic_start(),
335  expected_exon->gen_start);
336 
337  BOOST_REQUIRE_EQUAL(exon->GetGenomic_end(),
338  expected_exon->gen_end);
339 
340  // strands
341  BOOST_REQUIRE_EQUAL(exon->GetProduct_strand(),
342  expected_exon->prod_strand);
343 
344  BOOST_REQUIRE_EQUAL(exon->GetGenomic_strand(),
345  expected_exon->gen_strand);
346 
347  // splice signals
348  if (!expected_exon->acceptor.empty()) {
349  BOOST_REQUIRE(exon->CanGetAcceptor_before_exon());
350  BOOST_REQUIRE_EQUAL(exon->GetAcceptor_before_exon().GetBases(),
351  expected_exon->acceptor);
352  }
353 
354  if (!expected_exon->donor.empty()) {
355  BOOST_REQUIRE(exon->CanGetDonor_after_exon());
356  BOOST_REQUIRE_EQUAL(exon->GetDonor_after_exon().GetBases(),
357  expected_exon->donor);
358  }
359 
360  ++expected_exon;
361  }
362  results_idx++;
363  }
364 }
365 
366 
367 BOOST_AUTO_TEST_CASE(MappingPaired)
368 {
369  ifstream istr("data/magicblast_paired.asn");
370  BOOST_REQUIRE(istr);
371  istr >> MSerial_AsnText >> *m_Queries;
372 
373  bool queries_paired = false;
374  auto q = m_Queries->GetSeq_set().begin();
375  BOOST_REQUIRE(q != m_Queries->GetSeq_set().end());
376  const CBioseq& bioseq = (*q)->GetSeq();
377  BOOST_REQUIRE(bioseq.CanGetDescr());
378  for (auto it: bioseq.GetDescr().Get()) {
379  if (it->IsUser()) {
380  const CUser_object& obj = it->GetUser();
381  if (obj.GetType().IsStr() && obj.GetType().GetStr() == "Mapping") {
382  queries_paired = obj.HasField("has_pair");
383  }
384  }
385  }
386  BOOST_REQUIRE(queries_paired);
387 
388  CRef<IQueryFactory> query_factory(new CObjMgrFree_QueryFactory(m_Queries));
389  CRef<CLocalDbAdapter> db_adapter(new CLocalDbAdapter(*m_Db));
390 
391  m_OptHandle->SetPaired(true);
392  CMagicBlast magicblast(query_factory, db_adapter, m_OptHandle);
393  CRef<CSeq_align_set> results = magicblast.Run();
394 
395  const size_t kExpectedNumResults = 3;
396  BOOST_REQUIRE_EQUAL(results->Get().size(), kExpectedNumResults);
397 
398  SExon exon;
399  vector<SMatch> expected_hits(2 * kExpectedNumResults);
400 
401  // expected HSPs
402 
403  int results_idx = 0;
404 
405  // HSP #1
406  expected_hits[results_idx].score = 68;
407  expected_hits[results_idx].prod_length = 75;
408 
409  exon.prod_start = 0;
410  exon.prod_end = 67;
411  exon.gen_start = 9925;
412  exon.gen_end = 9992;
415  exon.acceptor = "";
416  exon.donor = "";
417  expected_hits[results_idx].exons.push_back(exon);
418 
419  // HSP #2
420  results_idx++;
421  expected_hits[results_idx].score = 74;
422  expected_hits[results_idx].prod_length = 75;
423 
424  exon.prod_start = 1;
425  exon.prod_end = 74;
426  exon.gen_start = 9842;
427  exon.gen_end = 9915;
430  exon.acceptor = "";
431  exon.donor = "";
432  expected_hits[results_idx].exons.push_back(exon);
433 
434  // HSP #3
435  results_idx++;
436  expected_hits[results_idx].score = 68;
437  expected_hits[results_idx].prod_length = 75;
438 
439  exon.prod_start = 0;
440  exon.prod_end = 67;
441  exon.gen_start = 20795;
442  exon.gen_end = 20862;
445  exon.acceptor = "";
446  exon.donor = "";
447  expected_hits[results_idx].exons.push_back(exon);
448 
449  // HSP #4
450  results_idx++;
451  expected_hits[results_idx].score = 74;
452  expected_hits[results_idx].prod_length = 75;
453 
454  exon.prod_start = 1;
455  exon.prod_end = 74;
456  exon.gen_start = 20712;
457  exon.gen_end = 20785;
460  exon.acceptor = "";
461  exon.donor = "";
462  expected_hits[results_idx].exons.push_back(exon);
463 
464  // HSP #5
465  results_idx++;
466  expected_hits[results_idx].score = 68;
467  expected_hits[results_idx].prod_length = 75;
468 
469  exon.prod_start = 7;
470  exon.prod_end = 74;
471  exon.gen_start = 2443260;
472  exon.gen_end = 2443327;
475  exon.acceptor = "";
476  exon.donor = "";
477  expected_hits[results_idx].exons.push_back(exon);
478 
479  // HSP #6
480  results_idx++;
481  expected_hits[results_idx].score = 74;
482  expected_hits[results_idx].prod_length = 75;
483 
484  exon.prod_start = 0;
485  exon.prod_end = 73;
486  exon.gen_start = 2443337;
487  exon.gen_end = 2443410;
490  exon.acceptor = "";
491  exon.donor = "";
492  expected_hits[results_idx].exons.push_back(exon);
493 
494  // compare computed HSPs with the expected ones
495  results_idx = 0;
496  for (auto seg: results->Get()) {
497 
498  // we do not expect paired results
499  BOOST_REQUIRE(seg->GetSegs().IsDisc());
500  BOOST_REQUIRE_EQUAL(seg->GetSegs().GetDisc().Get().size(), 2u);
501 
502  for (auto it: seg->GetSegs().GetDisc().Get()) {
503 
504  BOOST_REQUIRE(it->GetSegs().IsSpliced());
505 
506  int score;
507  it->GetNamedScore("score", score);
508  BOOST_REQUIRE_EQUAL(score, expected_hits[results_idx].score);
509 
510 
511  const CSpliced_seg& seg = it->GetSegs().GetSpliced();
512  BOOST_REQUIRE_EQUAL(seg.GetProduct_length(),
513  expected_hits[results_idx].prod_length);
514 
515  BOOST_REQUIRE_EQUAL(seg.GetExons().size(),
516  expected_hits[results_idx].exons.size());
517 
518  // compare exon data
519  auto expected_exon = expected_hits[results_idx].exons.begin();
520  for (auto exon: seg.GetExons()) {
521 
522  // exon starts and stops
523  BOOST_REQUIRE_EQUAL(exon->GetProduct_start().GetNucpos(),
524  expected_exon->prod_start);
525 
526  BOOST_REQUIRE_EQUAL(exon->GetProduct_end().GetNucpos(),
527  expected_exon->prod_end);
528 
529  BOOST_REQUIRE_EQUAL(exon->GetGenomic_start(),
530  expected_exon->gen_start);
531 
532  BOOST_REQUIRE_EQUAL(exon->GetGenomic_end(),
533  expected_exon->gen_end);
534 
535  // strands
536  BOOST_REQUIRE_EQUAL(exon->GetProduct_strand(),
537  expected_exon->prod_strand);
538 
539  BOOST_REQUIRE_EQUAL(exon->GetGenomic_strand(),
540  expected_exon->gen_strand);
541 
542  // splice signals
543  if (!expected_exon->acceptor.empty()) {
544  BOOST_REQUIRE(exon->CanGetAcceptor_before_exon());
545  BOOST_REQUIRE_EQUAL(
546  exon->GetAcceptor_before_exon().GetBases(),
547  expected_exon->acceptor);
548  }
549 
550  if (!expected_exon->donor.empty()) {
551  BOOST_REQUIRE(exon->CanGetDonor_after_exon());
552  BOOST_REQUIRE_EQUAL(exon->GetDonor_after_exon().GetBases(),
553  expected_exon->donor);
554  }
555 
556  ++expected_exon;
557  }
558  results_idx++;
559  }
560  }
561 }
562 
563 
565 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
BOOST_AUTO_TEST_SUITE_END() static int s_GetSegmentFlags(const CBioseq &bioseq)
Interface to create a BlastSeqSrc suitable for use in CORE BLAST from a a variety of BLAST database/s...
Handle to the nucleotide mapping options to the BLAST algorithm.
BLAST RNA-Seq mapper.
Definition: magicblast.hpp:58
NCBI C++ Object Manager free implementation of IQueryFactory.
Blast Search Subject.
bool HasField(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Verify that a named field exists.
Ensure direct dependencies on enough of the core xncbi library to satisfy shared libraries that depen...
CRef< CMagicBlastResultSet > RunEx(void)
Definition: magicblast.cpp:87
CRef< CSeq_align_set > Run(void)
Run the RNA-Seq mapping.
Definition: magicblast.cpp:70
@ eBlastDbIsNucleotide
nucleotide
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
const TType & GetType(void) const
Get the Type member data.
TProduct_length GetProduct_length(void) const
Get the Product_length member data.
const TExons & GetExons(void) const
Get the Exons member data.
const Tdata & Get(void) const
Get the member data.
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
bool CanGetDescr(void) const
Check if it is safe to call GetDescr method.
Definition: Bioseq_.hpp:309
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
Declares class which provides internal BLAST database representations to the internal BLAST APIs.
Declares CMagicBlast, the C++ API for the BLAST RNA-Seq mapping engine.
BOOST_AUTO_TEST_CASE(MappingNonMatch)
Magic spell ;-) needed for some weird compilers... very empiric.
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
NOTE: This file contains work in progress and the APIs are likely to change, please do not rely on th...
CRef< CSearchDatabase > m_Db
CRef< CMagicBlastOptionsHandle > m_OptHandle
CRef< CBioseq_set > m_Queries
Contains a single Bioseq.
unsigned int prod_end
ENa_strand gen_strand
unsigned int gen_end
unsigned int prod_start
ENa_strand prod_strand
unsigned int gen_start
vector< SExon > exons
unsigned int prod_length
Utility stuff for more convenient using of Boost.Test library.
Modified on Wed Apr 24 14:11:26 2024 by modify_doxy.py rev. 669887