NCBI C++ ToolKit
local_dataloader_test.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: local_dataloader_test.cpp 96810 2022-05-12 15:55:28Z fongah2 $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Tom Madden, NCBI
27 *
28 * File Description:
29 * Unit tests for the local BLAST database data loader
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 #include <serial/serial.hpp>
36 #include <serial/objostr.hpp>
37 #include <serial/exception.hpp>
38 #include <util/range.hpp>
40 #include <objects/seq/Seq_inst.hpp>
41 #include <objects/seq/Bioseq.hpp>
44 #include <objects/seq/Seq_ext.hpp>
45 #include <objects/seq/Seqdesc.hpp>
48 #include <objmgr/scope.hpp>
49 #include <objmgr/bioseq_handle.hpp>
50 
52 #include "../local_blastdb_adapter.hpp"
53 #include <util/random_gen.hpp>
55 
56 
57 // This macro should be defined before including test_boost.hpp in every
58 // "*.cpp" file of the executable except one. Just as main() is defined in
59 // only one *.cpp file of a non-Boost.Test executable, the other files must
60 // not define it again. If NCBI_BOOST_NO_AUTO_TEST_MAIN is not defined,
61 // test_boost.hpp will define such a "main()" function for the tests.
62 //
63 // Usually if your unit tests contain only one *.cpp file you should not
64 // care about this macro at all.
65 //
66 #define NCBI_BOOST_NO_AUTO_TEST_MAIN
67 
68 
69 // This header must be included before all Boost.Test headers if there are any
70 #include <corelib/test_boost.hpp>
71 
73 using namespace ncbi::objects;
74 BEGIN_SCOPE(blast)
75 
// Fetches nucleotide GI 555 through the data loader registered under the name
// "BLASTDB_ntNucleotide" and verifies its length, tax-id and molecule type,
// both via the bioseq handle and via the complete bioseq. A protein GI looked
// up in the same scope must come back with State_NoData().
76 BOOST_AUTO_TEST_CASE(LocalFetchNucleotideBioseq)
77 {
// NOTE(review): listing line 78 is absent from this extract; presumably it
// declares objmgr, which is used below -- confirm against the repository.
79  string dbname("nt");
80  string loader_name =
// NOTE(review): listing lines 81-82 (the initializer of loader_name) are
// absent from this extract.
83  BOOST_REQUIRE_EQUAL("BLASTDB_ntNucleotide", loader_name);
84  CScope scope(*objmgr);
85 
86  scope.AddDataLoader(loader_name);
87 
88  CSeq_id seqid1(CSeq_id::e_Gi, 555); // nucleotide
89 
90  CBioseq_Handle handle1 = scope.GetBioseqHandle(seqid1);
91  BOOST_REQUIRE_EQUAL(624U, handle1.GetInst().GetLength());
92  TTaxId taxid = scope.GetTaxId(seqid1);
93  BOOST_REQUIRE_EQUAL(TAX_ID_CONST(9913), taxid);
94  BOOST_REQUIRE_EQUAL(CSeq_inst::eMol_na, scope.GetSequenceType(seqid1));
95 
96  CConstRef<CBioseq> bioseq1 = handle1.GetCompleteBioseq();
// Unsigned literal, matching the handle-based check above, so the comparison
// does not mix signed and unsigned operands.
97  BOOST_REQUIRE_EQUAL(624U, bioseq1->GetInst().GetLength());
98 
99  CSeq_id seqid2(CSeq_id::e_Gi, 129295); // protein
100 
101  CBioseq_Handle handle2 = scope.GetBioseqHandle(seqid2);
102 
// The protein GI is not expected to resolve in this scope.
103  BOOST_REQUIRE(handle2.State_NoData());
104 }
105 
// Exercises the bulk retrieval calls of CScope (GetSequenceLengths,
// GetSequenceTypes, GetTaxIds) for ten GIs and compares the results with
// hard-coded reference lengths and tax-ids; every entry is expected to be
// reported as CSeq_inst::eMol_na.
106 BOOST_AUTO_TEST_CASE(LocalFetchBatchData)
107 {
// NOTE(review): listing line 108 is absent from this extract; presumably it
// declares objmgr, which is used below -- confirm against the repository.
109  string dbname("nt");
110  string loader_name =
// NOTE(review): listing lines 111-112 (the initializer of loader_name) are
// absent from this extract.
113  CScope scope(*objmgr);
114 
115  scope.AddDataLoader(loader_name);
116 
// reference_* hold the expected values, test_* receive what the scope returns.
117  CScope::TSequenceLengths reference_L, test_L;
118  CScope::TSequenceTypes reference_T, test_T;
119  CScope::TTaxIds reference_TI, test_TI;
120 
// NOTE(review): listing line 121 is absent from this extract; presumably it
// declares idhs, the container of requested id handles -- confirm.
// Each group below adds one GI together with its expected length and tax-id.
122  idhs.push_back(CSeq_id_Handle::GetHandle(GI_CONST(4)));
123  reference_L.push_back(556);
124  reference_TI.push_back(TAX_ID_CONST(9646));
125 
126  idhs.push_back(CSeq_id_Handle::GetHandle(GI_CONST(7)));
127  reference_L.push_back(437);
128  reference_TI.push_back(TAX_ID_CONST(9913));
129 
130  idhs.push_back(CSeq_id_Handle::GetHandle(GI_CONST(9)));
131  reference_L.push_back(1512);
132  reference_TI.push_back(TAX_ID_CONST(9913));
133 
134  idhs.push_back(CSeq_id_Handle::GetHandle(GI_CONST(11)));
135  reference_L.push_back(2367);
136  reference_TI.push_back(TAX_ID_CONST(9913));
137 
138  idhs.push_back(CSeq_id_Handle::GetHandle(GI_CONST(15)));
139  reference_L.push_back(540);
140  reference_TI.push_back(TAX_ID_CONST(9915));
141 
142  idhs.push_back(CSeq_id_Handle::GetHandle(GI_CONST(16)));
143  reference_L.push_back(1759);
144  reference_TI.push_back(TAX_ID_CONST(9771));
145 
146  idhs.push_back(CSeq_id_Handle::GetHandle(GI_CONST(17)));
147  reference_L.push_back(1758);
148  reference_TI.push_back(TAX_ID_CONST(9771));
149 
150  idhs.push_back(CSeq_id_Handle::GetHandle(GI_CONST(18)));
151  reference_L.push_back(1758);
152  reference_TI.push_back(TAX_ID_CONST(9771));
153 
154  idhs.push_back(CSeq_id_Handle::GetHandle(GI_CONST(19)));
155  reference_L.push_back(422);
156  reference_TI.push_back(TAX_ID_CONST(9771));
157 
158  idhs.push_back(CSeq_id_Handle::GetHandle(GI_CONST(20)));
159  reference_L.push_back(410);
160  reference_TI.push_back(TAX_ID_CONST(9771));
161 
// All requested ids are expected to be nucleic acid; then make sure the
// reference data is complete before querying the scope.
162  reference_T.assign(idhs.size(), CSeq_inst::eMol_na);
163  BOOST_REQUIRE_EQUAL(idhs.size(), reference_L.size());
164  BOOST_REQUIRE_EQUAL(idhs.size(), reference_T.size());
165  BOOST_REQUIRE_EQUAL(idhs.size(), reference_TI.size());
166 
// Bulk queries: results are written into the vectors passed by pointer.
167  scope.GetSequenceLengths(&test_L, idhs);
168  scope.GetSequenceTypes(&test_T, idhs);
169  scope.GetTaxIds(&test_TI, idhs);
170 
// Result sizes are hard requirements; the element-wise comparisons below use
// CHECK, so all three collections are compared even if one of them differs.
171  BOOST_REQUIRE_EQUAL(idhs.size(), test_L.size());
172  BOOST_REQUIRE_EQUAL(idhs.size(), test_T.size());
173  BOOST_REQUIRE_EQUAL(idhs.size(), test_TI.size());
174 
175  BOOST_CHECK_EQUAL_COLLECTIONS(test_L.begin(), test_L.end(),
176  reference_L.begin(), reference_L.end());
177  BOOST_CHECK_EQUAL_COLLECTIONS(test_TI.begin(), test_TI.end(),
178  reference_TI.begin(), reference_TI.end());
179  BOOST_CHECK_EQUAL_COLLECTIONS(test_T.begin(), test_T.end(),
180  reference_T.begin(), reference_T.end());
181 }
182 
// Fetches RefSeq accession NC_000022 through the data loader registered under
// the name "BLASTDB_refseq_genomicNucleotide" and verifies its length, tax-id
// and molecule type via both the handle and the complete bioseq.
183 BOOST_AUTO_TEST_CASE(LocalFetchNucleotideBioseqNotFixedSize)
184 {
// NOTE(review): listing line 185 is absent from this extract; presumably it
// declares objmgr, which is used below -- confirm against the repository.
186  string dbname("refseq_genomic");
187  string loader_name =
// NOTE(review): listing lines 188-189 (the initializer of loader_name) are
// absent from this extract; judging by the test name it presumably disables
// fixed-size slices -- confirm.
190  BOOST_REQUIRE_EQUAL("BLASTDB_refseq_genomicNucleotide", loader_name);
191  CScope scope(*objmgr);
192 
193  scope.AddDataLoader(loader_name);
194 
195  CSeq_id seqid1(CSeq_id::e_Other, "NC_000022"); // nucleotide
196 
197  CBioseq_Handle handle1 = scope.GetBioseqHandle(seqid1);
198  BOOST_REQUIRE(handle1);
// Unsigned literals keep the length comparisons free of signed/unsigned
// mixing, consistent with the 624U check in LocalFetchNucleotideBioseq.
199  BOOST_REQUIRE_EQUAL(50818468U, handle1.GetInst().GetLength());
200  BOOST_REQUIRE_EQUAL(TAX_ID_CONST(9606), scope.GetTaxId(seqid1));
201  BOOST_REQUIRE_EQUAL(CSeq_inst::eMol_na, scope.GetSequenceType(seqid1));
202 
203  CConstRef<CBioseq> bioseq1 = handle1.GetCompleteBioseq();
204  BOOST_REQUIRE_EQUAL(50818468U, bioseq1->GetInst().GetLength());
205 
206 }
207 
// Fetches proteins through the data loader registered under the name
// "BLASTDB_nrProtein": checks length, tax-id and molecule type of P01013.1,
// checks that a nucleotide GI does not resolve, and checks that a lookup by
// GenBank accession returns the expected id and title.
208 BOOST_AUTO_TEST_CASE(LocalFetchProteinBioseq)
209 {
// NOTE(review): listing line 210 is absent from this extract; presumably it
// declares objmgr, which is used below -- confirm against the repository.
211  string dbname("nr");
212  string loader_name =
// NOTE(review): listing lines 213-214 (the initializer of loader_name) are
// absent from this extract.
215 
216  BOOST_REQUIRE_EQUAL("BLASTDB_nrProtein", loader_name);
217 
218  CScope scope(*objmgr);
219 
220  scope.AddDataLoader(loader_name);
221 
222  CSeq_id seqid1("P01013.1"); // protein
223  CBioseq_Handle handle1 = scope.GetBioseqHandle(seqid1);
224  BOOST_REQUIRE(handle1);
// Unsigned literals keep the length comparisons free of signed/unsigned
// mixing, consistent with the 624U check in LocalFetchNucleotideBioseq.
225  BOOST_REQUIRE_EQUAL(232U, handle1.GetInst().GetLength());
226  BOOST_REQUIRE_EQUAL(TAX_ID_CONST(9031), scope.GetTaxId(seqid1));
227  BOOST_REQUIRE_EQUAL(CSeq_inst::eMol_aa, scope.GetSequenceType(seqid1));
228 
229  CConstRef<CBioseq> bioseq1 = handle1.GetCompleteBioseq();
230  BOOST_REQUIRE_EQUAL(232U, bioseq1->GetInst().GetLength());
231 
// A nucleotide GI must not resolve in this scope: the handle is empty,
// reports no data, and the tax-id lookup yields INVALID_TAX_ID.
232  CSeq_id seqid2(CSeq_id::e_Gi, 555); // nucleotide
233  CBioseq_Handle handle2 = scope.GetBioseqHandle(seqid2);
234  BOOST_REQUIRE(!handle2);
235  BOOST_REQUIRE(handle2.State_NoData());
236  BOOST_REQUIRE_EQUAL(INVALID_TAX_ID, scope.GetTaxId(seqid2));
237 
238  CSeq_id seqid3(CSeq_id::e_Genbank, "AZT36267"); // by accession
239  CBioseq_Handle handle3 = scope.GetBioseqHandle(seqid3);
240  BOOST_REQUIRE(handle3);
241  BOOST_REQUIRE_EQUAL("AZT36267", handle3.GetSeqId()->GetSeqIdString());
242  CConstRef<CBioseq> bioseq3 = handle3.GetCompleteBioseq();
// Concatenate every title descriptor of the bioseq into defline.
243  string defline = "";
244  if (bioseq3->IsSetDescr()) {
245  const CBioseq::TDescr::Tdata& data = bioseq3->GetDescr().Get();
// NOTE(review): listing line 246 is absent from this extract; presumably it
// is the loop header over data that introduces iter -- confirm.
247  if((*iter)->IsTitle()) {
248  defline += (*iter)->GetTitle();
249  }
250  }
251  }
// NOTE(review): EGA25625 is not referenced anywhere else in this test;
// presumably it is the entry whose title comes first for AZT36267 -- confirm.
252  // Finds beginning of EGA25625 title.
253  BOOST_REQUIRE(defline.find("metal ABC transporter") == 0);
254 }
255 
256 // Motivated by WB-1712
// Looks up two protein accessions through the data loader registered under
// the name "BLASTDB_nrProtein". Both are expected to have the same length
// (kExpectedLength) but different tax-ids (9606 and 9598).
257 BOOST_AUTO_TEST_CASE(FetchNonRedundantEntry)
258 {
// NOTE(review): listing line 259 is absent from this extract; presumably it
// declares objmgr, which is used below -- confirm against the repository.
260  string dbname("nr");
261  string loader_name =
// NOTE(review): listing lines 262-263 (the initializer of loader_name) are
// absent from this extract.
264 
265  BOOST_REQUIRE_EQUAL("BLASTDB_nrProtein", loader_name);
266 
267  CScope scope(*objmgr);
268 
269  scope.AddDataLoader(loader_name);
270 
271  const size_t kExpectedLength(536);
272  const TTaxId kExpectedTaxid = TAX_ID_CONST(9606);
273 
274  CSeq_id seqid1("NP_001308920"); // human protein
275  CBioseq_Handle handle1 = scope.GetBioseqHandle(seqid1);
276  BOOST_REQUIRE(handle1);
277  BOOST_REQUIRE_EQUAL(kExpectedLength, handle1.GetInst().GetLength());
// Tax-id mismatches are reported with CHECK, so the test keeps running and
// the remaining assertions are still evaluated.
278  BOOST_CHECK_EQUAL(kExpectedTaxid, scope.GetTaxId(seqid1));
279  BOOST_REQUIRE_EQUAL(CSeq_inst::eMol_aa, scope.GetSequenceType(seqid1));
280 
// The same values must also be available from the complete bioseq object.
281  CConstRef<CBioseq> bioseq1 = handle1.GetCompleteBioseq();
282  BOOST_REQUIRE(bioseq1.NotNull());
283  BOOST_REQUIRE_EQUAL(kExpectedLength, bioseq1->GetInst().GetLength());
284  BOOST_CHECK_EQUAL(kExpectedTaxid, bioseq1->GetTaxId());
285 
// NOTE(review): tax-id 9598 is presumably chimpanzee in NCBI Taxonomy, so
// "monkey" is used loosely here -- confirm.
286  CSeq_id monkey_id("XP_001165763"); // monkey sequence
287  CBioseq_Handle monkey_handle = scope.GetBioseqHandle(monkey_id);
288  BOOST_REQUIRE(monkey_handle);
289  BOOST_REQUIRE_EQUAL(kExpectedLength, monkey_handle.GetInst().GetLength());
290  BOOST_CHECK_EQUAL(TAX_ID_CONST(9598), scope.GetTaxId(monkey_id));
291 }
292 
293 #ifdef NCBI_THREADS
294 
// Worker used by the multi-threaded adapter test: each instance wraps the
// CSeqDB object it was given in its own CLocalBlastDbAdapter and repeatedly
// fetches sequence data from it.
// NOTE(review): listing line 295 (the class header; presumably the class
// derives from the toolkit's thread class, since it overrides Main()) is
// absent from this extract -- confirm against the repository.
296 {
297 public:
// NOTE(review): listing line 298 (the constructor signature taking seq_db) is
// absent from this extract.
299  : m_SeqDB(seq_db) {
300  }
301 
// Thread body: 100 rounds of fetching the whole sequence at OID 0 plus one
// random sub-range each of OIDs 0 and 1. The fetched data is discarded;
// nothing is asserted here. Always returns a null pointer.
302  virtual void* Main() {
303  CLocalBlastDbAdapter ldb(m_SeqDB);
304 
305  TSeqPos length_0 = ldb.GetSeqLength(0);
306  TSeqPos length_1 = ldb.GetSeqLength(1);
307  CRandom r;
308  for(int i=0; i < 100; i++) {
309  CRef<CSeq_data> s1 = ldb.GetSequence(0);
310  {
// NOTE(review): TSeqPos is unsigned, so "length - 100" would wrap around if a
// sequence were shorter than 100; assumes both test sequences are longer.
311  TSeqPos from = r.GetRand(0, length_0 -100);
312  TSeqPos to = r.GetRand(from, length_0);
313  CRef<CSeq_data> s2 = ldb.GetSequence(0, from, to);
314  }
315  {
316  TSeqPos from = r.GetRand(0, length_1 -100);
317  TSeqPos to = r.GetRand(from, length_1);
318  CRef<CSeq_data> s3 = ldb.GetSequence(1, from, to);
319  }
320  }
321  return (void*)0;
322  }
323 
324 private:
// NOTE(review): listing line 325 (presumably the declaration of m_SeqDB) is
// absent from this extract.
326 };
327 
// Runs kNumThreads CLocalAdapterThread workers concurrently, all constructed
// from the same seqdb object, and requires every worker to finish with a null
// exit value.
328 BOOST_AUTO_TEST_CASE(LocalBlastDbAdapterMT)
329 {
330  string dbname("data/testdb");
331  typedef vector< CRef<CLocalAdapterThread> > TTesterThreads;
332  const TSeqPos kNumThreads = 48;
333  TTesterThreads the_threads(kNumThreads);
334 
// NOTE(review): listing line 335 is absent from this extract; presumably it
// declares seqdb, which is used below -- confirm against the repository.
336  for (TSeqPos i = 0; i < kNumThreads; i++) {
337  the_threads[i].Reset(new CLocalAdapterThread(seqdb));
338  BOOST_REQUIRE(the_threads[i].NotEmpty());
339  }
340 
// Start all workers first so that they actually run concurrently.
341  NON_CONST_ITERATE(TTesterThreads, thread, the_threads) {
342  (*thread)->Run();
343  }
344 
345  NON_CONST_ITERATE(TTesterThreads, thread, the_threads) {
// Join() reports the exit value through a void**, so receive it in a real
// void*. Casting the address of a long to void** would let Join() write a
// pointer-sized value into a long, which overruns the variable on platforms
// where long is narrower than a pointer.
346  void* result = NULL;
347  (*thread)->Join(&result);
348  BOOST_REQUIRE(result == NULL);
349  }
350 
// Drop the references to the finished thread objects.
351  for (TSeqPos i = 0; i < kNumThreads; i++) {
352  the_threads[i].Reset();
353  }
354 
355 }
356 
357 #endif
358 
// Fetches four sub-ranges from OIDs 0 and 1 through CLocalBlastDbAdapter,
// converts the returned Ncbi4na data to IUPAC nucleotide text and compares it
// with the expected strings s1..s4. Each requested range (end - begin) spans
// exactly as many positions as the corresponding expected string has
// characters.
359 BOOST_AUTO_TEST_CASE(BlastDbAdapterGetSequenceWithRange)
360 {
361  string dbname("data/testdb");
362  const string s1("GTTTTCAATAAT");
363  const string s2("ACCGTTTCACAAGTAGGGCGTAGCGCATTTGCAG");
364  const string s3("AATTGGCTGTTTTTGAACTACTGTA");
365  const string s4("AGATTAATTATCATTTGCAG");
366 
// NOTE(review): listing line 367 is absent from this extract; presumably it
// declares seqdb, which is used below -- confirm against the repository.
368  CLocalBlastDbAdapter ldb(seqdb);
369  {
// OID 0, positions 1233..1245 -> s1
370  CRef<CSeq_data> d1 = ldb.GetSequence(0, 1233, 1245);
371  string t1;
372  vector<char> ncbi4na = d1->GetNcbi4na().Get();
373  CSeqConvert::Convert(ncbi4na, CSeqUtil::e_Ncbi4na, 0, s1.size(), t1, CSeqUtil::e_Iupacna);
374  BOOST_REQUIRE_EQUAL(t1, s1);
375  }
376 
377  {
// OID 1, positions 98764..98798 -> s2
378  CRef<CSeq_data> d2 = ldb.GetSequence(1, 98764, 98798);
379  string t2;
380  vector<char> ncbi4na = d2->GetNcbi4na().Get();
381  CSeqConvert::Convert(ncbi4na, CSeqUtil::e_Ncbi4na, 0, s2.size(), t2, CSeqUtil::e_Iupacna);
382  BOOST_REQUIRE_EQUAL(t2, s2);
383  }
384  {
// OID 1, positions 100245..100270 -> s3
385  CRef<CSeq_data> d3 = ldb.GetSequence(1, 100245, 100270);
386  string t3;
387  vector<char> ncbi4na = d3->GetNcbi4na().Get();
388  CSeqConvert::Convert(ncbi4na, CSeqUtil::e_Ncbi4na, 0, s3.size(), t3, CSeqUtil::e_Iupacna);
389  BOOST_REQUIRE_EQUAL(t3, s3);
390  }
391  {
// OID 0, positions 12439..12459 -> s4
392  CRef<CSeq_data> d4 = ldb.GetSequence(0, 12439, 12459);
393  string t4;
394  vector<char> ncbi4na = d4->GetNcbi4na().Get();
395  CSeqConvert::Convert(ncbi4na, CSeqUtil::e_Ncbi4na, 0, s4.size(), t4, CSeqUtil::e_Iupacna);
396  BOOST_REQUIRE_EQUAL(t4, s4);
397  }
398 
399 
400 }
401 
402 END_SCOPE(blast)
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Data loader implementation that uses the blast databases.
CBioseq_Handle –.
TTaxId GetTaxId() const
Determine the tax-id for this bioseq.
Definition: Bioseq.cpp:177
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, const string &dbname="nr", const EDbType dbtype=eUnknown, bool use_fixed_size_slices=true, CObjectManager::EIsDefault is_default=CObjectManager::eNonDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
Definition: bdbloader.cpp:52
@ eNucleotide
nucleotide database
Definition: bdbloader.hpp:58
@ eProtein
protein database
Definition: bdbloader.hpp:59
virtual void * Main()
Derived (user-created) class must provide a real thread function.
CLocalAdapterThread(CRef< CSeqDB > seq_db)
This class allows retrieval of sequence data from locally installed BLAST databases via CSeqDB.
virtual int GetSeqLength(int oid)
@inheritDoc
virtual CRef< CSeq_data > GetSequence(int oid, int begin=0, int end=0)
@inheritDoc
CRandom::
Definition: random_gen.hpp:66
CScope –.
Definition: scope.hpp:92
static SIZE_TYPE Convert(const CTempString &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst, TCoding dst_coding)
CSeqDB.
Definition: seqdb.hpp:161
@ eNucleotide
Definition: seqdb.hpp:175
@ e_Iupacna
Definition: sequtil.hpp:47
@ e_Ncbi4na
Definition: sequtil.hpp:50
char data[12]
Definition: iconv.c:80
#define TAX_ID_CONST(id)
Definition: ncbimisc.hpp:1112
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define INVALID_TAX_ID
Definition: ncbimisc.hpp:1116
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
#define GI_CONST(gi)
Definition: ncbimisc.hpp:1087
const TPrim & Get(void) const
Definition: serialbase.hpp:347
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
Definition: Seq_id.cpp:2145
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
TLoader * GetLoader(void) const
Get pointer to the loader.
void AddDataLoader(const string &loader_name, TPriority pri=kPriority_Default)
Add data loader by name.
Definition: scope.cpp:510
vector< TTaxId > TTaxIds
Get taxonomy ids of sequences Return -1 for sequences that aren't found Return 0 for sequences that d...
Definition: scope.hpp:567
vector< TSeqPos > TSequenceLengths
Get lengths of sequences Return kInvalidSeqPos for sequences that aren't found.
Definition: scope.hpp:577
TTaxId GetTaxId(const CSeq_id &id, TGetFlags flags=0)
Get taxonomy id of bioseq Return -1 if sequence is not found Return 0 if sequence doesn't have taxono...
Definition: scope.cpp:474
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
TSequenceTypes GetSequenceTypes(const TSeq_id_Handles &idhs, TGetFlags flags=0)
CSeq_inst::TMol GetSequenceType(const CSeq_id &id, TGetFlags flags=0)
Get molecular type of sequence (protein/dna/rna) Return CSeq_inst::eMol_not_set if sequence is not fo...
Definition: scope.cpp:804
vector< CSeq_inst::TMol > TSequenceTypes
Get molecular types of sequences (protein/dna/rna) Return CSeq_inst::eMol_not_set for sequences that ...
Definition: scope.hpp:588
TSequenceLengths GetSequenceLengths(const TSeq_id_Handles &idhs, TGetFlags flags=0)
TTaxIds GetTaxIds(const TSeq_id_Handles &idhs, TGetFlags flags=0)
vector< CSeq_id_Handle > TSeq_id_Handles
Bulk retrieval methods Common argument typedef - vector of requested ids.
Definition: scope.hpp:518
@ kPriority_NotSet
Deprecated: use kPriority_Default instead.
CConstRef< CBioseq > GetCompleteBioseq(void) const
Get the complete bioseq.
bool State_NoData(void) const
CConstRef< CSeq_id > GetSeqId(void) const
Get id which can be used to access this bioseq handle Throws an exception if none is available.
const TInst & GetInst(void) const
bool NotNull(void) const THROWS_NONE
Check if pointer is not null – same effect as NotEmpty().
Definition: ncbiobj.hpp:1410
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
@ e_Other
for historical reasons, 'other' = 'refseq'
Definition: Seq_id_.hpp:104
@ e_Gi
GenInfo Integrated Database.
Definition: Seq_id_.hpp:106
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
TLength GetLength(void) const
Get the Length member data.
Definition: Seq_inst_.hpp:659
bool IsSetDescr(void) const
descriptors Check if a value has been assigned to Descr data member.
Definition: Bioseq_.hpp:303
const TNcbi4na & GetNcbi4na(void) const
Get the variant data.
Definition: Seq_data_.hpp:570
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:6929
int i
BOOST_AUTO_TEST_CASE(LocalFetchNucleotideBioseq)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
Utility stuff for more convenient using of Boost.Test library.
else result
Definition: token2.c:20
Modified on Fri Sep 20 14:58:28 2024 by modify_doxy.py rev. 669887