NCBI C++ ToolKit
rps_unit_test.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: rps_unit_test.cpp 92009 2020-12-17 15:27:32Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Jason Papadopoulos
27 *
28 * File Description:
29 * Unit test for RPS blast
30 *
31 * ===========================================================================
32 */
33 #include <ncbi_pch.hpp>
34 #include <corelib/test_boost.hpp>
35 
39 
45 #include <blast_seqalign.hpp>
46 
49 
50 #include "test_objmgr.hpp"
51 #include "blast_test_util.hpp"
52 
53 using namespace std;
54 using namespace ncbi;
55 using namespace ncbi::objects;
56 using namespace ncbi::blast;
57 
// Checks the alignments in `results` against hard-coded expected values:
// number of HSPs, score, e-value, and query/subject start and end offsets.
//
// `strand` selects which part of the expected table applies:
//   eNa_strand_plus  -> exactly num_hsps_plus alignments are required,
//   eNa_strand_minus -> exactly num_hsps_total - num_hsps_plus,
//   anything else    -> all num_hsps_total.
// All checks use BOOST_REQUIRE_*, so the first mismatch aborts the test case.
// NOTE(review): the initializer of `strands` (listing lines 66-77) is not
// visible in this listing.
58 void testNuclHitList(const CSeq_align_set& results, ENa_strand strand)
59 {
60  const size_t num_hsps_total = 12;
61  const size_t num_hsps_plus = 4;
// Expected values, one entry per HSP; all tables are indexed by align_num.
62  const int scores[num_hsps_total] =
63  {62, 51, 49, 44, 44, 44, 43, 43, 43, 48, 45, 46};
64  const ENa_strand strands[num_hsps_total] =
65  {
78  };
79  const int q_offsets[num_hsps_total] =
80  {3244, 1045, 3133, 9204, 3163, 8179, 1090, 1328, 2300, 832, 6776, 3633};
81  const int s_offsets[num_hsps_total] =
82  {700, 662, 812, 1385, 146, 538, 930, 1340, 1373, 1917, 1467, 966};
83  const int q_ends[num_hsps_total] =
84  {3430, 1159, 3283, 9300, 3328, 8287, 1174, 1388, 2408, 937, 6968, 3759};
85  const int s_ends[num_hsps_total] =
86  {761, 699, 861, 1418, 205, 564, 958, 1360, 1403, 1952, 1531, 1007};
87  const double evalues[num_hsps_total] =
88  {0.0546847, 1.0657, 2.00126, 7.04787, 7.83533, 8.12416, 9.05612, 9.84108, 9.90728, 2.53047, 5.47761, 3.70474};
89 
90  // compute the total number of alignments
91  size_t num_hsps = results.Size();
92 /*
93  ITERATE(CSeq_align_set::Tdata, itr, results.Get()) {
94  num_hsps += (*itr).Size();
95  }
96 */
97  switch (strand) {
98  case eNa_strand_plus:
99  BOOST_REQUIRE_EQUAL(num_hsps_plus, num_hsps);
100  break;
101  case eNa_strand_minus:
102  BOOST_REQUIRE_EQUAL(num_hsps_total - num_hsps_plus, num_hsps);
103  break;
104  default:
105  BOOST_REQUIRE_EQUAL(num_hsps_total, num_hsps);
106  break;
107  }
108 
109  // for each subject sequence
// align_num is the index into the expected-value tables; it only moves forward.
110  size_t align_num = 0;
111  ITERATE(CSeq_align_set::Tdata, list1, results.Get()) {
112  const CSeq_align& hitlist = **list1;
113 
114  // for each hit
115 
116  BOOST_REQUIRE(hitlist.GetSegs().IsStd());
117  const list<CRef<CStd_seg> >& stdseg = hitlist.GetSegs().GetStd();
// The strand of this alignment is read from the first location of its
// first segment.
118  ENa_strand curr_strand =
119  stdseg.front()->GetLoc().front()->GetInt().GetStrand();
120 
121  // skip ahead to the known answer for this hit
// For a single-strand search only a subset of the table is expected, so
// advance to the next expected entry whose strand matches this alignment;
// running off the end of the table is a test failure.
122 
123  if (strand != eNa_strand_both) {
124  while (align_num < num_hsps_total &&
125  strands[align_num] != curr_strand)
126  align_num++;
127  BOOST_REQUIRE(align_num < num_hsps_total);
128  }
129 
130  // test scores and e-values
// Only the "e_value" and "score" entries are checked; other score entries
// are ignored. The 0.1 passed to BOOST_REQUIRE_CLOSE is Boost.Test's
// percent tolerance.
131 
132  ITERATE(CSeq_align::TScore, sitr, hitlist.GetScore()) {
133  const CScore& curr_score = **sitr;
134  if (curr_score.GetId().GetStr() == "e_value" ) {
135  BOOST_REQUIRE_CLOSE(evalues[align_num],
136  curr_score.GetValue().GetReal(), 0.1);
137  }
138  else if (curr_score.GetId().GetStr() == "score") {
139  BOOST_REQUIRE_EQUAL(scores[align_num],
140  curr_score.GetValue().GetInt());
141  }
142  }
143 
144  // test sequence offsets; the end offsets can only
145  // be computed by iterating through traceback
// Location index 0 is compared with the query tables (q_*) and index 1
// with the subject tables (s_*).
146 
147  const CStd_seg::TLoc& locs = stdseg.front()->GetLoc();
148  int off1 = locs[0]->GetInt().GetFrom();
149  int off2 = locs[1]->GetInt().GetFrom();
150  BOOST_REQUIRE_EQUAL(q_offsets[align_num], off1);
151  BOOST_REQUIRE_EQUAL(s_offsets[align_num], off2);
152 
// Accumulate each segment's length (To - From + 1) onto the start offsets.
// A segment whose location on one side is empty (presumably a gap) advances
// only the other side's offset.
153  ITERATE(list<CRef<CStd_seg> >, seg_itr, stdseg) {
154  const CStd_seg::TLoc& seqloc = (*seg_itr)->GetLoc();
155  if (seqloc[0]->IsEmpty()) {
156  off2 += seqloc[1]->GetInt().GetTo() -
157  seqloc[1]->GetInt().GetFrom() + 1;
158  }
159  else if (seqloc[1]->IsEmpty()) {
160  off1 += seqloc[0]->GetInt().GetTo() -
161  seqloc[0]->GetInt().GetFrom() + 1;
162  }
163  else {
164  off1 += seqloc[0]->GetInt().GetTo() -
165  seqloc[0]->GetInt().GetFrom() + 1;
166  off2 += seqloc[1]->GetInt().GetTo() -
167  seqloc[1]->GetInt().GetFrom() + 1;
168  }
169  }
170  BOOST_REQUIRE_EQUAL(q_ends[align_num], off1);
171  BOOST_REQUIRE_EQUAL(s_ends[align_num], off2);
// Move on to the next expected entry for the next alignment.
172  align_num++;
173  }
174 }
175 
// Body of the test fixture used by the "rps" suite.
// NOTE(review): the fixture's opening declaration (listing line 176) and the
// constructor header (listing line 179) are not visible in this listing.
//
// Path of the RPS test database; set below according to byte order.
177  string m_DbName;
178 
// Big-endian builds use data/rpstest_be, all others data/rpstest_le.
180 #if defined(WORDS_BIGENDIAN) || defined(IS_BIG_ENDIAN)
181  m_DbName = "data/rpstest_be";
182 #else
183  m_DbName = "data/rpstest_le";
184 #endif
185  }
186 
// Runs a local search for query gi|19572546 restricted to `strand`, with
// filter string "F" and composition-based statistics disabled, then checks
// the first query's alignments with testNuclHitList().
// NOTE(review): the declarations of `opts` (listing line 188), the
// `rpstblastn_opts` variable type (line 190) and `dbinfo` (line 201) are
// not visible in this listing.
187  void NuclSearch(ENa_strand strand) {
189  opts->SetFilterString("F");
191  rpstblastn_opts(dynamic_cast<CRPSTBlastnOptionsHandle*> (opts.GetPointer()));
192  rpstblastn_opts->SetCompositionBasedStats(false);
// NOTE(review): SetFilterString("F") was already called above with the same
// argument; this second call looks redundant - confirm before removing.
193  opts->SetFilterString("F");
194 
195  CSeq_id id("gi|19572546");
196  unique_ptr<SSeqLoc> query(
197  CTestObjMgr::Instance().CreateSSeqLoc(id, strand));
198  TSeqLocVector query_v;
199  query_v.push_back(*query);
200 
202  CRef<IQueryFactory> query_factory(new CObjMgr_QueryFactory(query_v));
203  CLocalBlast blaster(query_factory, opts, dbinfo);
204  CSearchResultSet results = *blaster.Run();
205 
// Only the results for the single query (index 0) are validated.
206  testNuclHitList(*results[0].GetSeqAlign(), strand);
207  }
208 };
209 
210 BOOST_FIXTURE_TEST_SUITE(rps, RpsTestFixture)
211 
212 /* prompted by a bug that caused alignments reaching the end
213  of a DB sequence to not include the last letter */
// Searches with query gi|38092615 and requires that the first alignment
// starts at offset 0 on both rows and that its first segment is 1016 long.
// NOTE(review): the declarations of `opts` and `dbinfo` (listing lines 222
// and 224) are not visible in this listing.
214 BOOST_AUTO_TEST_CASE(WholeSequenceMatch) {
215 
216  CSeq_id id("gi|38092615"); /* query = first DB sequence */
217  unique_ptr<SSeqLoc> query(
218  CTestObjMgr::Instance().CreateSSeqLoc(id, eNa_strand_unknown));
219  TSeqLocVector query_v;
220  query_v.push_back(*query);
221 
223 
225  CRef<IQueryFactory> query_factory(new CObjMgr_QueryFactory(query_v));
226  CLocalBlast blaster(query_factory, opts, dbinfo);
227  CSearchResultSet results = *blaster.Run();
228 
// Take the first alignment of the first query; it is read as a dense-seg.
229  const CSeq_align& hitlist_sa = *results[0].GetSeqAlign()->Get().front();
230  const CDense_seg& denseg = hitlist_sa.GetSegs().GetDenseg();
231 
// Presumably starts[0] / starts[1] are the query / subject start of the
// first segment - confirm against the dense-seg layout.
232  int off1 = denseg.GetStarts()[0];
233  int off2 = denseg.GetStarts()[1];
234  BOOST_REQUIRE_EQUAL(0, off1);
235  BOOST_REQUIRE_EQUAL(0, off2);
// NOTE(review): off1/off2 are advanced by the segment length here but are
// never checked afterwards; only the length itself is asserted below.
236  off1 += denseg.GetLens()[0];
237  off2 += denseg.GetLens()[0];
238  BOOST_REQUIRE_EQUAL(1016, (int)denseg.GetLens()[0]);
239 }
// The three cases below run the fixture's NuclSearch() helper once per strand
// setting: plus only, minus only, and both strands.
240 BOOST_AUTO_TEST_CASE(NuclSearchPlusStrand) {
241  NuclSearch(eNa_strand_plus);
242 }
243 BOOST_AUTO_TEST_CASE(NuclSearchMinusStrand) {
244  NuclSearch(eNa_strand_minus);
245 }
246 BOOST_AUTO_TEST_CASE(NuclSearchBothStrands) {
247  NuclSearch(eNa_strand_both);
248 }
249 
// Runs only the preliminary search stage (CBlastPrelimSearch) for query
// GI 129295 against the fixture database and checks the raw HSP results:
// one query, kNumHits subject hits with the expected OIDs and HSP counts,
// and for every HSP its score and query extent (end - offset).
// NOTE(review): the declaration of `opts` (listing lines 267/269) is not
// visible in this listing.
250 BOOST_AUTO_TEST_CASE(testPreliminarySearch)
251 {
252  const TGi kQueryGi = GI_CONST(129295);
253  const int kNumHits = 2;
// kOids / kNumHsps are indexed per hit; kScores / kLengths are flat tables
// over all HSPs, in hit order.
254  const int kOids[kNumHits] = { 3, 1 };
255  const int kNumHsps[kNumHits] = { 2, 3 };
256  const int kTotalHsps = 5;
257  const int kScores[kTotalHsps] = { 7458, 6898, 7055, 6997, 6542};
258  const int kLengths[kTotalHsps] = { 36, 21, 15, 15, 10};
259 
260  CRef<CSeq_loc> query_loc(new CSeq_loc());
261  query_loc->SetWhole().SetGi(kQueryGi);
// NOTE(review): raw `new`; presumably ownership is taken over when the
// pointer is passed to SSeqLoc below - confirm.
262  CScope* query_scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
263  query_scope->AddDefaults();
264  TSeqLocVector query_v;
265  query_v.push_back(SSeqLoc(query_loc, query_scope));
// Second argument is SeqDbBlastSeqSrcInit's is_prot flag.
266  CBlastSeqSrc seq_src(SeqDbBlastSeqSrcInit(m_DbName, TRUE));
268 
270  CRef<CBlastOptions> options(&opts->SetOptions());
271  CRef<IQueryFactory> query_factory(new CObjMgr_QueryFactory(query_v));
272  CBlastPrelimSearch prelim_search(query_factory, options, seq_src);
273  CRef<SInternalData> id(prelim_search.Run());
274 
// Convert the HSP stream produced by the preliminary stage into a
// BlastHSPResults structure that can be inspected directly.
275  CBlastHSPResults results
276  (prelim_search.ComputeBlastHSPResults
277  (id->m_HspStream->GetPointer()));
278 
279  BOOST_REQUIRE_EQUAL(1, results->num_queries);
280  BOOST_REQUIRE(results->hitlist_array[0]);
281  BOOST_REQUIRE_EQUAL(kNumHits,
282  results->hitlist_array[0]->hsplist_count);
283 
// hsp_index runs across all hits so it can index the flat kScores/kLengths
// tables; `index` is the hit, `index1` the HSP within that hit.
284  int hsp_index = 0;
285  for (int index = 0; index < kNumHits; ++index) {
286  BlastHSPList* hsp_list =
287  results->hitlist_array[0]->hsplist_array[index];
288  BOOST_REQUIRE_EQUAL(kOids[index], hsp_list->oid);
289  BOOST_REQUIRE_EQUAL(kNumHsps[index], hsp_list->hspcnt);
290  for (int index1 = 0; index1 < kNumHsps[index];
291  ++index1, ++hsp_index) {
292  BlastHSP* hsp = hsp_list->hsp_array[index1];
293  BOOST_REQUIRE_EQUAL(kScores[hsp_index], hsp->score);
294  //BOOST_REQUIRE(hsp->evalue == 0.0);
295  BOOST_REQUIRE_EQUAL(kLengths[hsp_index],
296  hsp->query.end - hsp->query.offset);
297  }
298  }
// Every expected HSP must have been visited exactly once.
299  BOOST_REQUIRE_EQUAL(kTotalHsps, hsp_index);
300 }
301 
302 
303 // test handling of the case when CBS 1 is requested, but .freq file is missing
// CBS = composition based statistics (see the comments in the body).
// NOTE(review): the declarations of `opts` (listing line 312) and `dbinfo`
// (line 321) and the expected exception argument of BOOST_REQUIRE_THROW
// (line 326) are not visible in this listing.
304 BOOST_AUTO_TEST_CASE(TestCBSFreqsNotFound)
305 {
306  // make sure that the '.freq' file does not exist for the test database
307  string dbname = "data/deltatest_nocbs";
308  CFile freq_file(dbname + ".freq");
309  BOOST_REQUIRE(!freq_file.Exists());
310 
311  // set composition based statistics to 1 (requires .freq file)
313  (dynamic_cast<CBlastRPSOptionsHandle*>(
314  opts.GetNonNullPointer()))->SetCompositionBasedStats(true);
315 
316  CSeq_id id("gi|129295");
317  unique_ptr<SSeqLoc> query(CTestObjMgr::Instance().CreateSSeqLoc(id));
318  TSeqLocVector query_v;
319  query_v.push_back(*query);
320 
322  CRef<IQueryFactory> query_factory(new CObjMgr_QueryFactory(query_v));
323 
324  // an exception must be thrown when the file is not found
325  BOOST_REQUIRE_THROW(CLocalBlast(query_factory, opts, dbinfo),
327 }
328 
329 
User-defined methods of the data storage class.
Common definitions for protein and nucleotide lookup tables.
Declares the CBlastRPSOptionsHandle class.
Utility function to convert internal BLAST result structures into objects::CSeq_align_set objects.
@ eRPSBlast
protein-pssm (reverse-position-specific BLAST)
Definition: blast_types.hpp:63
@ eRPSTblastn
nucleotide-pssm (RPS blast with translated query)
Definition: blast_types.hpp:64
BOOST_AUTO_TEST_SUITE_END() static int s_GetSegmentFlags(const CBioseq &bioseq)
Defines BLAST error codes (user errors included)
Wrapper class for BlastHSPResults .
Definition: blast_aux.hpp:343
Search class to perform the preliminary stage of the BLAST search.
Handle to the rpsblast options to the BLAST algorithm.
Wrapper class for BlastSeqSrc .
Definition: blast_aux.hpp:350
CFile –.
Definition: ncbifile.hpp:1604
Class to perform a BLAST search on local BLAST databases Note that PHI-BLAST can be run using this cl...
Definition: local_blast.hpp:62
NCBI C++ Object Manager dependent implementation of IQueryFactory.
Handle to the options for translated nucleotide-RPS blast.
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
Definition: Score.hpp:57
Blast Search Subject.
Search Results for All Queries.
Tdata::size_type Size() const
static CTestObjMgr & Instance()
Definition: test_objmgr.cpp:71
void SetCompositionBasedStats(bool mode)
Set composition based statistics mode.
CRef< SInternalData > Run()
Borrow the internal data and results.
BlastSeqSrc * SeqDbBlastSeqSrcInit(const string &dbname, bool is_prot, Uint4 first_seq=0, Uint4 last_seq=0, Int4 mask_algo_id=-1, ESubjectMaskingType mask_type=eNoSubjMasking)
Initialize the sequence source structure.
CRef< CSearchResultSet > Run()
Executes the search.
static CBlastOptionsHandle * Create(EProgram program, EAPILocality locality=CBlastOptions::eLocal)
Creates an options handle object configured with default options for the requested program,...
CBlastOptions & SetOptions()
Returns a reference to the internal options class which this object is a handle for.
void SetFilterString(const char *f, bool clear=true)
Sets FilterString.
CRef< TBlastHSPStream > m_HspStream
HSP output of the preliminary stage goes here.
BlastHSPResults * ComputeBlastHSPResults(BlastHSPStream *stream, Uint4 max_num_hsps=0, bool *rm_hsps=NULL, vector< bool > *rm_hsps_info=NULL) const
Return HSPs in a structure other than the HSPStream? Provide conversion? How to combine this with CBl...
@ eBlastDbIsProtein
protein
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define GI_CONST(gi)
Definition: ncbimisc.hpp:1087
virtual bool Exists(void) const
Check existence of file.
Definition: ncbifile.hpp:4038
void SetWhole(TWhole &v)
Definition: Seq_loc.hpp:982
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
Definition: scope.cpp:504
TObjectType * GetNonNullPointer(void)
Get pointer value and throw a null pointer exception if pointer is null.
Definition: ncbiobj.hpp:968
TObjectType * GetPointer(void) THROWS_NONE
Get pointer,.
Definition: ncbiobj.hpp:998
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
const TDenseg & GetDenseg(void) const
Get the variant data.
Definition: Seq_align_.cpp:153
vector< CRef< CSeq_loc > > TLoc
Definition: Std_seg_.hpp:93
vector< CRef< CScore > > TScore
Definition: Seq_align_.hpp:398
const TStarts & GetStarts(void) const
Get the Starts member data.
Definition: Dense_seg_.hpp:530
const TLens & GetLens(void) const
Get the Lens member data.
Definition: Dense_seg_.hpp:555
const TStd & GetStd(void) const
Get the variant data.
Definition: Seq_align_.hpp:752
TInt GetInt(void) const
Get the variant data.
Definition: Score_.hpp:411
const TValue & GetValue(void) const
Get the Value member data.
Definition: Score_.hpp:465
bool IsStd(void) const
Check if variant Std is selected.
Definition: Seq_align_.hpp:746
list< CRef< CSeq_align > > Tdata
const TScore & GetScore(void) const
Get the Score member data.
Definition: Seq_align_.hpp:896
TReal GetReal(void) const
Get the variant data.
Definition: Score_.hpp:384
const TId & GetId(void) const
Get the Id member data.
Definition: Score_.hpp:444
const Tdata & Get(void) const
Get the member data.
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
@ eNa_strand_both
in forward orientation
Definition: Na_strand_.hpp:68
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:6929
Main class to perform a BLAST search on the local machine.
Wrapper for all lookup tables used in BLAST.
void CheckForBlastSeqSrcErrors(const BlastSeqSrc *seqsrc)
Magic spell ;-) needed for some weird compilers... very empiric.
NOTE: This file contains work in progress and the APIs are likely to change, please do not rely on th...
Utilities to develop and debug unit tests for BLAST.
void testNuclHitList(const CSeq_align_set &results, ENa_strand strand)
BOOST_AUTO_TEST_CASE(WholeSequenceMatch)
Declares the CRPSTBlastnOptionsHandle class.
Implementation of the BlastSeqSrc interface using the C++ BLAST databases API.
vector< SSeqLoc > TSeqLocVector
Vector of sequence locations.
Definition: sseqloc.hpp:129
The structure to hold all HSPs for a given sequence after the gapped alignment.
Definition: blast_hits.h:153
Int4 oid
The ordinal id of the subject sequence this HSP list is for.
Definition: blast_hits.h:154
Int4 hspcnt
Number of HSPs saved.
Definition: blast_hits.h:158
BlastHSP ** hsp_array
Array of pointers to individual HSPs.
Definition: blast_hits.h:157
BlastHitList ** hitlist_array
Array of results for individual query sequences.
Definition: blast_hits.h:185
Int4 num_queries
Number of query sequences.
Definition: blast_hits.h:184
Structure holding all information about an HSP.
Definition: blast_hits.h:126
BlastSeg query
Query sequence info.
Definition: blast_hits.h:131
Int4 score
This HSP's raw score.
Definition: blast_hits.h:127
BlastHSPList ** hsplist_array
Array of HSP lists for individual database hits.
Definition: blast_hits.h:176
Int4 hsplist_count
Filled size of the HSP lists array.
Definition: blast_hits.h:170
Int4 end
End of hsp.
Definition: blast_hits.h:99
Int4 offset
Start of hsp.
Definition: blast_hits.h:98
void NuclSearch(ENa_strand strand)
Structure to represent a single sequence to be fed to BLAST.
Definition: sseqloc.hpp:47
static string query
Utility stuff for more convenient using of Boost.Test library.
@ TRUE
Definition: testodbc.c:27
Modified on Thu Nov 30 04:56:34 2023 by modify_doxy.py rev. 669887