NCBI C++ ToolKit
seqdb_unit_test.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seqdb_unit_test.cpp 100132 2023-06-22 17:54:42Z fongah2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Kevin Bealer
27  *
28  * File Description:
29  * Unit test.
30  *
31  */
32 #define NCBI_TEST_APPLICATION
33 #include <ncbi_pch.hpp>
38 #include <serial/serialbase.hpp>
39 #include <objects/seq/seq__.hpp>
40 #include <corelib/ncbifile.hpp>
42 #include <objmgr/util/sequence.hpp>
44 #include <cmath>
45 #include <algorithm>
46 #include <random>
47 #include <chrono>
48 
50 
51 #include <corelib/test_boost.hpp>
52 #ifndef SKIP_DOXYGEN_PROCESSING
53 
54 #ifdef NCBI_OS_MSWIN
55 # define DEV_NULL "nul:"
56 #else
57 # define DEV_NULL "/dev/null"
58 #endif
59 
62 
63 // Helper functions
64 
/// Concatenate five streamable values into a single string.
///
/// Each argument is written to a string stream with operator<<, in
/// argument order and with nothing inserted between them.
template<class A, class B, class C, class D, class E>
string s_ToString(const A & a, const B & b, const C & c, const D & d, const E & e)
{
    ostringstream buffer;

    buffer << a;
    buffer << b;
    buffer << c;
    buffer << d;
    buffer << e;

    return buffer.str();
}
72 
75 };
76 
77 static void s_TestPartialAmbigRange(CSeqDB & db, int oid, int begin, int end)
78 {
79  const char * slice (0);
80  int sliceL (0);
81  const char * whole (0);
82  int wholeL(0);
83 
84  sliceL = db.GetAmbigSeq(oid, & slice, kSeqDBNuclNcbiNA8, begin, end);
85  wholeL = db.GetAmbigSeq(oid, & whole, kSeqDBNuclNcbiNA8);
86  (void) wholeL; // not actually used, mute warning
87 
88  string op =
89  s_ToString("Checking NcbiNA8 subsequence range [", begin, ",", end, "].");
90 
91  // NOTE: Ignore compiler warnings about 'wholeL' being set but never
92  // used; its existence is necessary for the test below to succeed.
93 
94  BOOST_REQUIRE_MESSAGE(0 == memcmp(slice, whole + begin, sliceL), op);
95 
96  db.RetAmbigSeq(& whole);
97  db.RetAmbigSeq(& slice);
98 }
99 
100 static void s_TestPartialAmbig(CSeqDB & db, TGi nt_gi)
101 {
102  int oid(-1);
103  bool success = db.GiToOid(nt_gi, oid);
104 
105  CNcbiOstrstream oss;
106  oss << "GI " << nt_gi << " was not found in nt";
107  string msg = CNcbiOstrstreamToString(oss);
108  BOOST_REQUIRE_MESSAGE(success, msg);
109 
110  int length = db.GetSeqLength(oid);
111 
112  s_TestPartialAmbigRange(db, oid, 0, length);
113  s_TestPartialAmbigRange(db, oid, 0, length/2);
114  s_TestPartialAmbigRange(db, oid, length/2, length);
115 
116  for(int i = 1; i<length; i *= 2) {
117  for(int j = 0; j<length; j += i) {
118  int endpt = j + i;
119  if (endpt > length)
120  endpt = length;
121 
122  s_TestPartialAmbigRange(db, oid, j, endpt);
123  }
124  }
125 }
126 
127 static bool s_MaskingTest(EMaskingType mask, unsigned oid)
128 {
129  switch(mask) {
130  case eAll:
131  return true;
132 
133  case eOdd:
134  return (oid & 1) != 0;
135 
136  case eEven:
137  return (oid & 1) == 0;
138 
139  case ePrime:
140 
141  switch(oid) {
142  case 0:
143  case 1:
144  return false;
145 
146  case 2:
147  return true;
148 
149  default:
150  for(unsigned d = 2; d < oid; d++) {
151  if ((oid % d) == 0)
152  return false;
153  }
154  break;
155  }
156  return true;
157 
158  default:
159  break;
160  }
161 
162  BOOST_REQUIRE(0);
163 
164  return false;
165 }
166 
168  unsigned first,
169  unsigned last,
170  unsigned lowest,
171  unsigned highest,
172  unsigned count)
173 {
174 
175  if (first > last) {
176  return;
177  }
178 
179  while((first <= last) && (! s_MaskingTest(mask, first))) {
180  first++;
181 
182  if (first > last) {
183  return;
184  }
185  }
186 
187  while((first <= last) && (! s_MaskingTest(mask, last))) {
188  if (last > first) {
189  last--;
190  } else {
191  return;
192  }
193  }
194 
195  BOOST_REQUIRE(first <= last);
196 
197  unsigned exp_count(0);
198 
199  for(unsigned i=first; i<=last; i++) {
200  if (s_MaskingTest(mask, i)) {
201  exp_count++;
202  }
203  }
204 
205  BOOST_REQUIRE_EQUAL(first, lowest);
206  BOOST_REQUIRE_EQUAL(last, highest);
207  BOOST_REQUIRE_EQUAL(count, exp_count);
208 }
209 
210 template<class NUM, class DIF>
211 void
213 {
214 
215  if (! ((a <= (b + epsilon)) &&
216  (b <= (a + epsilon))) ) {
217 
218  cout << "\nMismatch: line " << lineno
219  << " a " << a
220  << " b " << b
221  << " eps " << epsilon << endl;
222 
223  BOOST_REQUIRE( a <= (b + epsilon) );
224  BOOST_REQUIRE( b <= (a + epsilon) );
225  }
226 }
227 
/// Compute a simple 32-bit checksum over a byte buffer.
///
/// Bytes are read as signed chars.  Bytes at even offsets are XOR-ed into
/// the running value and bytes at odd offsets are added to it; after each
/// byte the value is rotated left by 13 bits and @a length is added.
///
/// @param buf_in  Buffer to hash (not dereferenced when @a length is 0).
/// @param length  Number of bytes to process.
/// @param start   Initial hash value, which allows chaining several buffers.
/// @return The resulting hash; equals @a start when @a length is 0.
static Uint4 s_BufHash(const char * buf_in, Uint4 length, Uint4 start = 1)
{
    const signed char * bytes = (const signed char *) buf_in;
    Uint4 result = start;

    for (Uint4 pos = 0; pos < length; ++pos) {
        const bool odd_offset = (pos & 1) != 0;

        if (odd_offset) {
            result += bytes[pos];
        } else {
            result ^= bytes[pos];
        }

        // 32-bit rotate left by 13, then mix in the buffer length.
        result = ((result << 13) | (result >> 19)) + length;
    }

    return result;
}
245 
246 template<class ASNOBJ>
248 {
249  CNcbiOstrstream oss;
250  oss << MSerial_AsnText << *a;
251  return CNcbiOstrstreamToString(oss);
252 }
253 
254 // Test Cases
256 
257 BOOST_AUTO_TEST_CASE(ConstructLocal)
258 {
259  // Test both constructors; make sure sizes are equal and non-zero.
260 
261  CSeqDB local1("data/seqp", CSeqDB::eProtein);
262  CSeqDB local2("data/seqp", CSeqDB::eProtein, 0, 0, false);
263 
264  Int4 num1(0), num2(0);
265  local1.GetTotals(CSeqDB::eFilteredAll, & num1, 0);
266  local2.GetTotals(CSeqDB::eFilteredAll, & num2, 0);
267 
268  BOOST_REQUIRE(num1 >= 1);
269  BOOST_REQUIRE_EQUAL(num1, num2);
270 }
271 
272 BOOST_AUTO_TEST_CASE(PathDelimiters)
273 {
274  // Test both constructors; make sure sizes are equal and non-zero.
275 
276  CSeqDB local1("data\\seqp", CSeqDB::eProtein);
277  CSeqDB local2("data/seqp", CSeqDB::eProtein);
278 
279  Int4 num1(0), num2(0);
280  local1.GetTotals(CSeqDB::eFilteredAll, & num1, 0);
281  local2.GetTotals(CSeqDB::eFilteredAll, & num2, 0);
282 
283  BOOST_REQUIRE(num1 >= 1);
284  BOOST_REQUIRE_EQUAL(num1, num2);
285 }
286 
287 BOOST_AUTO_TEST_CASE(ConstructMissing)
288 {
289  bool caught_exception = false;
290 
291  try {
292  CSeqDB local1("data/spork", CSeqDB::eProtein);
293  CSeqDB local2("data/spork", CSeqDB::eProtein, 0, 0, false);
294 
295  Int4 num1(0), num2(0);
296  local1.GetTotals(CSeqDB::eFilteredAll, & num1, 0);
297  local2.GetTotals(CSeqDB::eFilteredAll, & num2, 0);
298 
299  BOOST_REQUIRE(num1 >= 1);
300  BOOST_REQUIRE_EQUAL(num1, num2);
301  } catch(CSeqDBException &) {
302  caught_exception = true;
303  }
304 
305  if (! caught_exception) {
306  BOOST_ERROR("ConstructMissing() did not throw an exception of type CSeqDBException.");
307  }
308 }
309 
310 BOOST_AUTO_TEST_CASE(InvalidSeqType)
311 {
312  bool caught_exception = false;
313 
314  try {
315  CSeqDB local1("data/seqp", CSeqDB::ESeqType(99));
316  } catch(CSeqDBException &) {
317  caught_exception = true;
318  }
319 
320  if (! caught_exception) {
321  BOOST_ERROR("InvalidSeqType() did not throw an exception of type CSeqDBException.");
322  }
323 }
324 
326 {
327  CSeqDB local1("data/seqp", CSeqDB::eProtein);
328 
329  Int4 num1(0);
330  local1.GetTotals(CSeqDB::eFilteredAll, & num1, 0);
331 
332  BOOST_REQUIRE(num1 >= 1);
333 }
334 
336 {
337  bool caught_exception = false;
338 
339  try {
340  CSeqDB local1("seqp", CSeqDB::eProtein);
341 
342  Int4 num1(0);
343  local1.GetTotals(CSeqDB::eFilteredAll, & num1, 0);
344 
345  BOOST_REQUIRE(num1 >= 1);
346  } catch(CSeqDBException &) {
347  caught_exception = true;
348  }
349 
350  if (! caught_exception) {
351  BOOST_ERROR("InvalidPath() did not throw an exception of type CSeqDBException.");
352  }
353 }
354 
355 BOOST_AUTO_TEST_CASE(SummaryDataN)
356 {
357  string dbname;
358  CSeqDB localN(dbname = "data/seqn", CSeqDB::eNucleotide);
359 
360  Int4 nseqs(0);
361  Uint8 vlength(0);
362  localN.GetTotals(CSeqDB::eUnfilteredAll, & nseqs, & vlength);
363 
364  BOOST_REQUIRE_EQUAL(CSeqDB::eNucleotide, localN.GetSequenceType());
365  BOOST_REQUIRE_EQUAL(int(100), nseqs);
366  BOOST_REQUIRE_EQUAL(Uint8(51718), vlength);
367  BOOST_REQUIRE_EQUAL(Uint4(875), Uint4(localN.GetMaxLength()));
368  BOOST_REQUIRE_EQUAL((string)dbname, (string)localN.GetDBNameList());
369 
370  BOOST_REQUIRE_EQUAL(string("Another test DB for CPPUNIT, SeqDB."),
371  localN.GetTitle());
372 
373  Uint8 vol1(0);
374  int seq1(0);
375  localN.GetTotals(CSeqDB::eFilteredRange, & seq1, & vol1);
376 
377  int oid_values[] = { 0, 100000 };
378  for (auto end_oid : oid_values) {
379  localN.SetIterationRange(1, end_oid);
380 
381  Uint8 vol2(0);
382  int seq2(0);
383 
384  localN.GetTotals(CSeqDB::eFilteredRange, & seq2, & vol2);
385 
386  BOOST_REQUIRE(vol2 < vol1);
387  BOOST_REQUIRE_EQUAL(seq2, seq1 - 1);
388 
389  localN.SetIterationRange(2, end_oid);
390 
391  Uint8 vol3(0);
392  int seq3(0);
393  localN.GetTotals(CSeqDB::eFilteredRange, & seq3, & vol3);
394 
395  BOOST_REQUIRE(vol3 < vol2);
396  BOOST_REQUIRE_EQUAL(seq3, seq2 - 1);
397  }
398 
399  // Try negative values
400  {
401  Uint8 vol4(0);
402  int seq4(0);
403  localN.SetIterationRange(0, -1);
404  localN.GetTotals(CSeqDB::eFilteredRange, & seq4, & vol4);
405  BOOST_CHECK_EQUAL(0U, vol4);
406  BOOST_CHECK_EQUAL(0, seq4);
407 
408  localN.SetIterationRange(-2, 10);
409  localN.GetTotals(CSeqDB::eFilteredRange, & seq4, & vol4);
410  BOOST_CHECK_EQUAL(10, seq4);
411  BOOST_CHECK(vol4 > 0);
412  }
413 }
414 
415 BOOST_AUTO_TEST_CASE(SummaryDataP)
416 {
417  string dbname;
418  CSeqDB localP(dbname = "data/seqp", CSeqDB::eProtein);
419 
420  Int4 nseqs(0), noids(0);
421  Uint8 vlength(0), tlength(0);
422 
423  localP.GetTotals(CSeqDB::eUnfilteredAll, & nseqs, & vlength);
424  localP.GetTotals(CSeqDB::eFilteredAll, & noids, & tlength);
425 
426  BOOST_REQUIRE_EQUAL(CSeqDB::eProtein, localP.GetSequenceType());
427  BOOST_REQUIRE_EQUAL(int(100), nseqs);
428  BOOST_REQUIRE_EQUAL(int(100), noids);
429  BOOST_REQUIRE_EQUAL(Uint8(26945), tlength);
430  BOOST_REQUIRE_EQUAL(Uint8(26945), vlength);
431  BOOST_REQUIRE_EQUAL(Uint4(1224), Uint4(localP.GetMaxLength()));
432  BOOST_REQUIRE_EQUAL((string)dbname, (string)localP.GetDBNameList());
433 
434  BOOST_REQUIRE_EQUAL(string("Test database for BLAST unit tests"),
435  localP.GetTitle());
436 }
437 
438 BOOST_AUTO_TEST_CASE(GetAmbigSeqAllocN)
439 {
440  CSeqDB seqp("data/seqn", CSeqDB::eNucleotide);
441 
442  char * bufp_blst = 0;
443  char * bufp_ncbi = 0;
444 
445  Uint4 length_blst =
446  seqp.GetAmbigSeqAlloc(0,
447  & bufp_blst,
449  eNew);
450 
451  Uint4 length_ncbi =
452  seqp.GetAmbigSeqAlloc(0,
453  & bufp_ncbi,
455  eMalloc);
456 
457  Uint4 hashval_blst = s_BufHash(bufp_blst, length_blst);
458  Uint4 hashval_ncbi = s_BufHash(bufp_ncbi, length_ncbi);
459 
460  delete[] bufp_blst;
461  free(bufp_ncbi);
462 
463  BOOST_REQUIRE_EQUAL(Uint4(30118382ul), hashval_blst);
464  BOOST_REQUIRE_EQUAL(Uint4(3084382219ul), hashval_ncbi);
465 }
466 
467 BOOST_AUTO_TEST_CASE(GetAmbigSeqAllocP)
468 {
469  CSeqDB seqp("data/seqp", CSeqDB::eProtein);
470 
471  char * bufp_blst = 0;
472  char * bufp_ncbi = 0;
473 
474  Uint4 length_blst =
475  seqp.GetAmbigSeqAlloc(0,
476  & bufp_blst,
478  eNew);
479 
480  Uint4 length_ncbi =
481  seqp.GetAmbigSeqAlloc(0,
482  & bufp_ncbi,
484  eMalloc);
485 
486  Uint4 hashval_blst = s_BufHash(bufp_blst, length_blst);
487  Uint4 hashval_ncbi = s_BufHash(bufp_ncbi, length_ncbi);
488 
489  delete [] bufp_blst;
490  free( bufp_ncbi );
491 
492  BOOST_REQUIRE_EQUAL(Uint4(3219499033ul), hashval_blst);
493  BOOST_REQUIRE_EQUAL(Uint4(3219499033ul), hashval_ncbi);
494 }
495 
496 BOOST_AUTO_TEST_CASE(GetAmbigSeqN)
497 {
498  CSeqDB seqp("data/seqn", CSeqDB::eNucleotide);
499 
500  const char * bufp1 = 0;
501  const char * bufp2 = 0;
502  Uint4 length1 = seqp.GetAmbigSeq(0, & bufp1, kSeqDBNuclBlastNA8);
503  Uint4 length2 = seqp.GetAmbigSeq(0, & bufp2, kSeqDBNuclNcbiNA8);
504 
505  Uint4 hashval1 = s_BufHash(bufp1, length1);
506  Uint4 hashval2 = s_BufHash(bufp2, length2);
507 
508  seqp.RetAmbigSeq(& bufp1);
509  seqp.RetAmbigSeq(& bufp2);
510 
511  BOOST_REQUIRE_EQUAL(Uint4(30118382ul), hashval1);
512  BOOST_REQUIRE_EQUAL(Uint4(3084382219ul), hashval2);
513 }
514 
515 BOOST_AUTO_TEST_CASE(GetAmbigSeqP)
516 {
517  CSeqDB seqp("data/seqp", CSeqDB::eProtein);
518 
519  const char * bufp1 = 0;
520  const char * bufp2 = 0;
521  Uint4 length1 = seqp.GetAmbigSeq(0, & bufp1, kSeqDBNuclBlastNA8);
522  Uint4 length2 = seqp.GetAmbigSeq(0, & bufp2, kSeqDBNuclNcbiNA8);
523 
524  Uint4 hashval1 = s_BufHash(bufp1, length1);
525  Uint4 hashval2 = s_BufHash(bufp2, length2);
526 
527  seqp.RetAmbigSeq(& bufp1);
528  seqp.RetAmbigSeq(& bufp2);
529 
530  BOOST_REQUIRE_EQUAL(Uint4(3219499033ul), hashval1);
531  BOOST_REQUIRE_EQUAL(Uint4(3219499033ul), hashval2);
532 }
533 
535 {
536  string got( s_Stringify(CSeqDB("data/seqn", CSeqDB::eNucleotide).GetBioseq(2)) );
537 
538  string expected("Bioseq ::= {\n"
539  " id {\n"
540  " gi 46071107,\n"
541  " ddbj {\n"
542  " accession \"BP722514\",\n"
543  " version 1\n"
544  " }\n"
545  " },\n"
546  " descr {\n"
547  " title \"Xenopus laevis NBRP cDNA clone:XL452f07ex, 3' end\",\n"
548  " user {\n"
549  " type str \"ASN1_BlastDefLine\",\n"
550  " data {\n"
551  " {\n"
552  " label str \"ASN1_BlastDefLine\",\n"
553  " num 1,\n"
554  " data oss {\n"
555  " '30803080A0801A3158656E6F707573206C6165766973204E4252502063444E4120\n"
556  "636C6F6E653A584C34353266303765782C20332720656E640000A1803080AB80020402BEFD4300\n"
557  "00AC803080A1801A0842503732323531340000A38002010100000000000000000000A280020100\n"
558  "000000000000'H\n"
559  " }\n"
560  " }\n"
561  " }\n"
562  " }\n"
563  " },\n"
564  " inst {\n"
565  " repr raw,\n"
566  " mol na,\n"
567  " length 165,\n"
568  " seq-data ncbi4na '11428288218841844814141422818811214421121482118428221114\n"
569  "82211121141881228484211141128842148481121112222F882124422141148188842112118488\n"
570  "41114822882844214144144148281181'H\n"
571  " }\n"
572  "}\n");
573 
574  BOOST_REQUIRE_EQUAL(expected, got);
575 }
576 
578 {
579 
580  string got( s_Stringify(CSeqDB("data/seqp", CSeqDB::eProtein).GetBioseq(2)) );
581 
582  string expected("Bioseq ::= {\n"
583  " id {\n"
584  " gi 44268131,\n"
585  " genbank {\n"
586  " accession \"EAI08555\",\n"
587  " version 1\n"
588  " }\n"
589  " },\n"
590  " descr {\n"
591  " title \"unknown [environmental sequence]\",\n"
592  " user {\n"
593  " type str \"ASN1_BlastDefLine\",\n"
594  " data {\n"
595  " {\n"
596  " label str \"ASN1_BlastDefLine\",\n"
597  " num 1,\n"
598  " data oss {\n"
599  " '30803080A0801A20756E6B6E6F776E205B656E7669726F6E6D656E74616C207365\n"
600  "7175656E63655D0000A1803080AB80020402A37A630000A4803080A1801A084541493038353535\n"
601  "0000A38002010100000000000000000000A280020100000000000000'H\n"
602  " }\n"
603  " }\n"
604  " }\n"
605  " }\n"
606  " },\n"
607  " inst {\n"
608  " repr raw,\n"
609  " mol aa,\n"
610  " length 64,\n"
611  " seq-data ncbistdaa '0C0A0A0606090B0909060909060B09060909131004160F090A0A0A\n"
612  "0A0B0A0B0D0D010B0D110D0606090F12090D0A0B0904050D0A160D0B0B05051009100B1005'H\n"
613  " }\n"
614  "}\n");
615 
616  BOOST_REQUIRE_EQUAL(expected, got);
617 }
618 
620 {
621 
622  string got( s_Stringify(CSeqDB("data/seqn", CSeqDB::eNucleotide).GetHdr(0)) );
623 
624  string expected = ("Blast-def-line-set ::= {\n"
625  " {\n"
626  " title \"Xenopus laevis NBRP cDNA clone:XL452f05ex, 3' end\",\n"
627  " seqid {\n"
628  " gi 46071105,\n"
629  " ddbj {\n"
630  " accession \"BP722512\",\n"
631  " version 1\n"
632  " }\n"
633  " },\n"
634  " taxid 0\n"
635  " }\n"
636  "}\n");
637 
638  BOOST_REQUIRE_EQUAL(expected, got);
639 }
640 
642 {
643 
644  string got( s_Stringify(CSeqDB("data/seqp", CSeqDB::eProtein).GetHdr(0)) );
645 
646  string expected = ("Blast-def-line-set ::= {\n"
647  " {\n"
648  " title \"similar to KIAA0960 protein [Mus musculus]\",\n"
649  " seqid {\n"
650  " gi 38083732,\n"
651  " other {\n"
652  " accession \"XP_357594\",\n"
653  " version 1\n"
654  " }\n"
655  " },\n"
656  " taxid 0\n"
657  " }\n"
658  "}\n");
659 
660  BOOST_REQUIRE_EQUAL(expected, got);
661 }
662 
664 {
665 
666  CSeqDB seqp("data/seqn", CSeqDB::eNucleotide);
667 
668  list< CRef< CSeq_id > > seqids =
669  seqp.GetSeqIDs(0);
670 
671  Uint4 h = 0;
672 
673  int cnt =0;
674  for(list< CRef< CSeq_id > >::iterator i = seqids.begin();
675  i != seqids.end();
676  i++) {
677 
678  string s( s_Stringify(*i) );
679 
680  cnt++;
681 
682  h = s_BufHash(s.data(), s.length(), h);
683  }
684 
685  BOOST_REQUIRE_EQUAL(Uint4(136774894ul), h);
686 }
687 
689 {
690 
691  CSeqDB seqp("data/seqp", CSeqDB::eProtein);
692 
693  list< CRef< CSeq_id > > seqids =
694  seqp.GetSeqIDs(0);
695 
696  Uint4 h = 0;
697 
698  int cnt =0;
699  for(list< CRef< CSeq_id > >::iterator i = seqids.begin();
700  i != seqids.end();
701  i++) {
702 
703  string s( s_Stringify(*i) );
704 
705  cnt++;
706 
707  h = s_BufHash(s.data(), s.length(), h);
708  }
709 
710  BOOST_REQUIRE_EQUAL(Uint4(2942938647ul), h);
711 }
712 
714 {
715 
716  CSeqDB dbp("data/seqp", CSeqDB::eProtein);
717  CSeqDB dbn("data/seqn", CSeqDB::eNucleotide);
718 
719  BOOST_REQUIRE_EQUAL( (int) 330, dbp.GetSeqLength(13) );
720  BOOST_REQUIRE_EQUAL( (int) 422, dbp.GetSeqLength(19) );
721  BOOST_REQUIRE_EQUAL( (int) 67, dbp.GetSeqLength(26) );
722  BOOST_REQUIRE_EQUAL( (int) 104, dbp.GetSeqLength(27) );
723  BOOST_REQUIRE_EQUAL( (int) 282, dbp.GetSeqLength(38) );
724  BOOST_REQUIRE_EQUAL( (int) 158, dbp.GetSeqLength(43) );
725  BOOST_REQUIRE_EQUAL( (int) 472, dbp.GetSeqLength(54) );
726  BOOST_REQUIRE_EQUAL( (int) 207, dbp.GetSeqLength(93) );
727 
728  BOOST_REQUIRE_EQUAL( (int) 833, dbn.GetSeqLength(9) );
729  BOOST_REQUIRE_EQUAL( (int) 250, dbn.GetSeqLength(26) );
730  BOOST_REQUIRE_EQUAL( (int) 708, dbn.GetSeqLength(39) );
731  BOOST_REQUIRE_EQUAL( (int) 472, dbn.GetSeqLength(43) );
732  BOOST_REQUIRE_EQUAL( (int) 708, dbn.GetSeqLength(39) );
733  BOOST_REQUIRE_EQUAL( (int) 448, dbn.GetSeqLength(47) );
734  BOOST_REQUIRE_EQUAL( (int) 825, dbn.GetSeqLength(61) );
735  BOOST_REQUIRE_EQUAL( (int) 371, dbn.GetSeqLength(70) );
736 }
737 
738 BOOST_AUTO_TEST_CASE(GetSeqLengthApprox)
739 {
740 
741  CSeqDB dbp("data/seqp", CSeqDB::eProtein);
742  CSeqDB dbn("data/seqn", CSeqDB::eNucleotide);
743 
744  int plen(0), nlen(0);
745 
746  dbp.GetTotals(CSeqDB::eFilteredAll, & plen, 0);
747  dbn.GetTotals(CSeqDB::eFilteredAll, & nlen, 0);
748 
749  int i = 0;
750 
751  Uint8 ptot(0);
752 
753  // For protein, approximate should be the same as exact.
754  for(i = 0; i < plen; i++) {
755  Uint4 len = dbp.GetSeqLength(i);
756  ptot += len;
757  BOOST_REQUIRE_EQUAL( len, (Uint4)dbp.GetSeqLengthApprox(i) );
758  }
759 
760  // For nucleotide, approx should be within 3 of exact.
761  Uint8 ex_tot = 0;
762  Uint8 ap_tot = 0;
763 
764  for(i = 0; i < nlen; i++) {
765  Uint4 ex = dbn.GetSeqLength(i);
766  Uint4 ap = dbn.GetSeqLengthApprox(i);
767 
768  s_ApproxEqual(ex, ap, 3, __LINE__);
769  ex_tot += ex;
770  ap_tot += ap;
771  }
772 
773  // Test case: guarantee the approximation over 2004 sequences
774  // to be between .999 and 1.001 of correct value.
775 
776  s_ApproxEqual(ex_tot, ap_tot, ex_tot / 1000, __LINE__);
777 
778  BOOST_REQUIRE_EQUAL(int(100), nlen);
779  BOOST_REQUIRE_EQUAL(int(100), plen);
780  BOOST_REQUIRE_EQUAL(Uint8(26945), ptot);
781  BOOST_REQUIRE_EQUAL(Uint8(51718), ex_tot);
782  BOOST_REQUIRE_EQUAL(Uint8(51726), ap_tot);
783 }
784 
785 BOOST_AUTO_TEST_CASE(GetSequenceN)
786 {
787 
788  CSeqDB seqp("data/seqn", CSeqDB::eNucleotide);
789 
790  const char * bufp = 0;
791 
792  Uint4 length = seqp.GetSequence(0, & bufp);
793  Uint4 hashval = s_BufHash(bufp, length);
794  seqp.RetSequence(& bufp);
795 
796  BOOST_REQUIRE_EQUAL(Uint4(1128126064ul), hashval);
797 }
798 
799 BOOST_AUTO_TEST_CASE(GetSequenceP)
800 {
801 
802  CSeqDB seqp("data/seqp", CSeqDB::eProtein);
803 
804  const char * bufp = 0;
805  Uint4 length = seqp.GetSequence(0, & bufp);
806  Uint4 hashval = s_BufHash(bufp, length);
807  seqp.RetSequence(& bufp);
808 
809  BOOST_REQUIRE_EQUAL(Uint4(3219499033ul), hashval);
810 }
811 
812 BOOST_AUTO_TEST_CASE(NrAndSwissProt)
813 {
814 
815  CSeqDB nr("nr", CSeqDB::eProtein);
816  CSeqDB sp("swissprot", CSeqDB::eProtein);
817 
818  int nr_seqs(0), nr_oids(0), sp_seqs(0), sp_oids(0);
819  Uint8 nr_tlen(0), nr_vlen(0), sp_tlen(0), sp_vlen(0);
820 
821  nr.GetTotals(CSeqDB::eFilteredAll, & nr_seqs, & nr_tlen);
822  sp.GetTotals(CSeqDB::eFilteredAll, & sp_seqs, & sp_tlen);
823 
824  nr.GetTotals(CSeqDB::eUnfilteredAll, & nr_oids, & nr_vlen);
825  sp.GetTotals(CSeqDB::eUnfilteredAll, & sp_oids, & sp_vlen);
826 
827  BOOST_REQUIRE_EQUAL(nr_seqs, nr_oids);
828  BOOST_REQUIRE_EQUAL(nr_tlen, nr_vlen);
829 
830  BOOST_REQUIRE_GT(nr_seqs, sp_seqs);
831  BOOST_REQUIRE_NE(nr_oids, sp_oids);
832  BOOST_REQUIRE_GT(nr_tlen, sp_tlen);
833  BOOST_REQUIRE_NE(nr_vlen, sp_vlen);
834 
835  BOOST_REQUIRE_GE(nr.GetMaxLength(), sp.GetMaxLength());
836 }
837 
838 BOOST_AUTO_TEST_CASE(TranslateIdents)
839 {
840 
841  CSeqDB nr("nr", CSeqDB::eProtein);
842 
843  const char * seqid_list[] = {
844  "AAA03612.1", "prf||1922246A", "P51728.1", "AAB84238.1", "BAA25256.1", "AAC15878.1",
845  "1A8U_A", "AAC82254.1", "AAD31141.1", "1R24_A", "AAF63214.1", "AAF95963.1",
846  "WP_003095644.1", "AAC59341.1"
847  };
848 
849  Uint4 pig_list[] = {
850  1153908, 507276, 851580, 200775, 1028308, 939134,
851  199107, 511756, 27645, 429124, 575812, 648744,
852  421191, 1128836
853  };
854 
855  Uint4 len_list[] = {
856  199, 233, 186, 441, 96, 206,
857  277, 205, 110, 206, 510, 293,
858  394, 174
859  };
860 
861  size_t L_seqid = ArraySize(seqid_list);
862  size_t L_pig = ArraySize(pig_list);
863  size_t L_len = ArraySize(len_list);
864 
865  // In case of hand-editing
866  BOOST_REQUIRE((L_seqid == L_len) && (L_len == L_pig));
867 
868  for(size_t i = 0; i<L_seqid; i++) {
869  string arr_seqid(seqid_list[i]);
870  int arr_pig(pig_list[i]), arr_len(len_list[i]);
871  vector<int> seqid2oid;
872  int pig2oid = 0, oid2pig=0, oid2len =0;
873  nr.AccessionToOids(arr_seqid, seqid2oid);
874 
875  BOOST_REQUIRE(nr.PigToOid(arr_pig, pig2oid));
876 
877  BOOST_CHECK_EQUAL(pig2oid, seqid2oid[0]);
878  BOOST_REQUIRE(pig2oid != int(-1));
879 
880  oid2len = nr.GetSeqLength(pig2oid);
881  BOOST_REQUIRE(nr.OidToPig (pig2oid, oid2pig));
882 
883  BOOST_REQUIRE_EQUAL(arr_len, oid2len);
884  BOOST_REQUIRE_EQUAL(arr_pig, oid2pig);
885  }
886 }
887 
888 BOOST_AUTO_TEST_CASE(StringIdentSearch)
889 {
890  const string kDb("nr");
892 
893  // Sets of equivalent strings
894 
895  const Uint4 NUM_ITEMS = 6;
896 
897  const char ** str_list[NUM_ITEMS];
898 
899  const char * s0[] =
900  { "AAP90615.1", "AAP90628.1", "AAP90641.1", "AAP90654.1", "AAP90667.1", 0 };
901  const char * s1[] =
902  { "P01013", 0 };
903  const char * s2[] =
904  { "1NPQ", "1NPQ_A", "1NPQ_B", 0 };
905  const char * s3[] =
906  { "1NPQ", 0 };
907  const char * s4[] =
908  { "1LCT_A", "1LCT", 0 };
909  const char * s5[] =
910  { "1GWB_A", "1GWB_B", "1GWB", 0 };
911 
912  str_list[0] = s0;
913  str_list[1] = s1;
914  str_list[2] = s2;
915  str_list[3] = s3;
916  str_list[4] = s4;
917  str_list[5] = s5;
918 
919  Uint4 * len_list[NUM_ITEMS];
920 
921  Uint4 l0[] = { 261, 0 };
922  Uint4 l1[] = { 232, 0 };
923  Uint4 l2[] = { 17, 90, 0 };
924  Uint4 l3[] = { 17, 90, 0 };
925  Uint4 l4[] = { 333, 0 };
926  Uint4 l5[] = { 281, 0 };
927 
928  len_list[0] = l0;
929  len_list[1] = l1;
930  len_list[2] = l2;
931  len_list[3] = l3;
932  len_list[4] = l4;
933  len_list[5] = l5;
934 
935  size_t L_str = ArraySize(str_list);
936  size_t L_len = ArraySize(len_list);
937 
938  // Verify lengths in case of typo.
939 
940  BOOST_REQUIRE_EQUAL(NUM_ITEMS, L_str);
941  BOOST_REQUIRE_EQUAL(NUM_ITEMS, L_len);
942 
943  for(Uint4 i = 0; i< NUM_ITEMS; i++) {
944  set<int> str_oids;
945 
946  set<int> exp_len;
947  set<int> oid_len;
948 
949  for(const char ** strp = str_list[i]; *strp; strp++) {
950  vector<int> oids;
951  nr.AccessionToOids(*strp, oids);
952 
953  BOOST_REQUIRE_MESSAGE(! oids.empty(), "Failed to find accession "
954  << *strp << " in " << kDb);
955 
956  ITERATE(vector<int>, iter, oids) {
957  str_oids.insert(*iter);
958  }
959  }
960 
961  set<int>::iterator str_iter;
962 
963  // Phase 1: compare oids
964 
965  str_iter = str_oids.begin();
966 
967  // Phase 2: compare lengths
968 
969  Uint4 * llp = len_list[i];
970 
971  while(*llp) {
972  exp_len.insert(*llp++);
973  }
974 
975  ITERATE(set<int>, iter, str_oids) {
976  oid_len.insert(nr.GetSeqLength(*iter));
977  }
978 
979  set<int>::iterator oid_iter, exp_iter;
980 
981  oid_iter = oid_len.begin();
982  exp_iter = exp_len.begin();
983 
984  while(oid_iter != oid_len.end()) {
985  BOOST_REQUIRE(exp_iter != exp_len.end());
986  BOOST_REQUIRE_EQUAL(*oid_iter, *exp_iter);
987 
988  oid_iter++;
989  exp_iter++;
990  }
991  }
992 }
993 
995 {
996 
997  // Originally, this code compared SeqDB's output to readb's; this
998  // is no longer the case because it would create a dependency on
999  // the C toolkit.
1000 
1001  const char * dbname = "nt";
1002 
1003  bool is_prot = false;
1004  TGi gi = 12831944;
1005 
1006  CNcbiOstrstream oss_fn;
1007  oss_fn << "." << dbname << "." << gi;
1008 
1009  vector<char> seqdb_data;
1010  vector<char> expected_data;
1011 
1012  string seqdb_bs;
1013 
1014  {
1016 
1017  int oid(0);
1018 
1019  bool gi_trans = db.GiToOid(gi, oid);
1020 
1021  BOOST_REQUIRE(gi_trans);
1022 
1023  CRef<CBioseq> bs = db.GetBioseq(oid);
1024 
1025  // These have changed for SeqDB.
1026  bs->ResetDescr();
1027 
1028  seqdb_bs = s_Stringify(bs);
1029 
1030  BOOST_REQUIRE(! bs.Empty());
1031 
1032  seqdb_data = bs->GetInst().GetSeq_data().GetNcbi4na().Get();
1033 
1034  BOOST_REQUIRE_EQUAL(int(seqdb_data.size()), 872);
1035  }
1036 
1037  string expected_bs =
1038  "Bioseq ::= {\n"
1039  " id {\n"
1040  " gi 12831944,\n"
1041  " embl {\n"
1042  " accession \"AJ389663\",\n"
1043  " version 1\n"
1044  " }\n"
1045  " },\n"
1046  " inst {\n"
1047  " repr raw,\n"
1048  " mol na,\n"
1049  " length 1744,\n"
1050  " seq-data ncbi4na '42184822114812141288821418148411122424118442821881118214\n"
1051  "824144882288141824882211822512824418112848442118828141428118121842111211428224\n"
1052  "122228888112244444411141424288881881418211112211842444888848282442118222428211\n"
1053  "288884484128284418112888484284182421244222824142244241248182888211184828422281\n"
1054  "821128881482488124841818422811241448848812444811244441182144488241882244141444\n"
1055  "142184141112442812212182211441144214214424242111881222128222442124444144814841\n"
1056  "241111181124184244412828182414422224811824411841481212888111822888112414418211\n"
1057  "884414442114828448422142142242448118822142822118142481818811148848842148811111\n"
1058  "428248148844182824444411442814244864242248844424822812842824122841228122442244\n"
1059  "814888484222414484282884128414848282444841224424148881288841111118814148428211\n"
1060  "142144228848422422241181484484218441181184411414412282448828188884884488882441\n"
1061  "124841448118418811414441214124444421688248188424424281414484111882884412242242\n"
1062  "11412441281284241114218884221142184888821881FFF1141124111482141448824114124182\n"
1063  "141812248244814882841221811124FFF241284424182243241148812812818412824424442142\n"
1064  "228214441112211148288844488224444411481844884F11142841112881114411884124411444\n"
1065  "212212214414844142284244288118884128211212444111128212224422244121224841441884\n"
1066  "121418841414282888282418824484448448448421844224882881488448441424188848284488\n"
1067  "11882241811241124141282814228428111814822A224188242228182482442144412882881414\n"
1068  "441241484424818142212424141884142118112144828484184222881418488244442242124242\n"
1069  "428121284114411821421248284228222844222411144488444811222428411228228824842814\n"
1070  "441884444288481188488222218411241441188222148114242414821811428242488418812482\n"
1071  "228422288848121212242224824281281221188414244888128414441211441884422224124144\n"
1072  "24282244248282842448A88842241411284222211148421284'H\n"
1073  " }\n"
1074  "}\n";
1075 
1076  string data_str =
1077  "GCATGTCCAAGTACAGACTTTCAGATAGTGAAACCGCGAATGGCTCATTAAATCAGTCGA"
1078  "GGTTCCTTAGATCGTTCCAATCCRACTCGGATAACTGTGGCAATTCTAGAGCTAATACAT"
1079  "GCAAACAAGCTCCGACCCCTTTTAACCGGGGGGAAAGAGCGCTTTTATTAGATCAAAACC"
1080  "AATGCGGGTTTTGTCTCGGCAATCCCGCTCAACTTTTGGTGACTCTGGATAACTTTGTGC"
1081  "TGATCGCACGGCCCTCGAGCCGGCGACGTATCTTTCAAATGTCTGCCCTATCAACTTTAG"
1082  "TCGTTACGTGATATGCCTAACGAGGTTGTTACGGGTAACGGGGAATCAGGGTTCGATTCC"
1083  "GGAGAGGGAGCATGAGAAACGGCTACCACATCCAAGGAAGGCAGCAGGCGCGCAAATTAC"
1084  "CCACTCCCGGCACGGGGAGGTAGTGACGAAAAATAACGATGCGGGACTCTATCGAGGCCC"
1085  "CGTAATCGGAATGAGTACACTTTAAATCCTTTAACGAGGATCAATTGGAGGGCAAGTCTG"
1086  "GTGCCAGCAGCCGCGGTAATTCCAGCTCCAATAGCGTATATTAAAGTTGTTGCAGTTAAA"
1087  "AAGCTCGTAGTTGGATCTCGGGGGAAGGCTAGCGGTSGCGCCGTTGGGCGTCCTACTGCT"
1088  "CGACCTGACCTACCGGCCGGTAGTTTGTGCCCGAGGTGCTCTTGACTGAGTGTCTCGGGT"
1089  "GACCGGCGAGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCAGGCCTTGTGCCGCCCGAA"
1090  "TAGTGGTGCATGGAATAATGGAAGAGGACCTCGGTTCTATTTTGTTGGTTTTCGGAACGT"
1091  "GAGGTAATGATTAAGAGGGACAGACGGGGGCA";
1092 
1093  expected_data.assign(data_str.data(),
1094  data_str.data() + data_str.size());
1095 
1096  vector<char> seqdb_tmp;
1097 
1098  CSeqConvert::Convert(seqdb_data,
1100  0,
1101  seqdb_data.size(),
1102  seqdb_tmp,
1104 
1105  seqdb_tmp.swap(seqdb_data);
1106 
1107  BOOST_REQUIRE_EQUAL(expected_bs, seqdb_bs);
1108  BOOST_REQUIRE_EQUAL(expected_data.size(), seqdb_data.size());
1109 
1110  Uint4 num_diffs = 0;
1111 
1112  for(Uint4 i = 0; i < expected_data.size(); i++) {
1113  unsigned R = unsigned(expected_data[i]) & 0xFF;
1114  unsigned S = unsigned(seqdb_data[i]) & 0xFF;
1115 
1116  if (R != S) {
1117  if (! num_diffs)
1118  cout << "\n\n";
1119 
1120  cout << "At location " << dec << i << ", Readdb has: " << hex << int(R) << " whereas SeqDB has: " << hex << int(S);
1121 
1122  if (R > S) {
1123  cout << " (R += " << (R - S) << ")\n";
1124  } else {
1125  cout << " (S += " << (S - R) << ")\n";
1126  }
1127 
1128  num_diffs ++;
1129  }
1130  }
1131 
1132  if (num_diffs) {
1133  cout << "Num diffs: " << dec << num_diffs << endl;
1134  }
1135 
1136  BOOST_REQUIRE_EQUAL((int) 0, (int)num_diffs);
1137 }
1138 
1139 BOOST_AUTO_TEST_CASE(GetLenHighOID)
1140 {
1141 
1142  bool caught_exception = false;
1143 
1144  try {
1145  CSeqDB dbp("data/seqp", CSeqDB::eProtein);
1146  int num_seqs(0);
1147  dbp.GetTotals(CSeqDB::eFilteredAll, & num_seqs, 0);
1148 
1149  int len = dbp.GetSeqLength(num_seqs);
1150 
1151  BOOST_REQUIRE_EQUAL((int) 11112222, len);
1152  } catch(CSeqDBException &) {
1153  caught_exception = true;
1154  }
1155 
1156  if (! caught_exception) {
1157  BOOST_ERROR("GetLenHighOID() did not throw an exception of type CSeqDBException.");
1158  }
1159 }
1160 
1162 {
1163 
1164  bool caught_exception = false;
1165 
1166  try {
1167  CSeqDB dbp("data/seqp", CSeqDB::eProtein);
1168  Uint4 len = dbp.GetSeqLength(0-1);
1169 
1170  BOOST_REQUIRE_EQUAL((Uint4) 11112222, len);
1171  } catch(CSeqDBException &) {
1172  caught_exception = true;
1173  }
1174 
1175  if (! caught_exception) {
1176  BOOST_ERROR("GetLenNegOID() did not throw an exception of type CSeqDBException.");
1177  }
1178 }
1179 
1180 BOOST_AUTO_TEST_CASE(GetSeqHighOID)
1181 {
1182 
1183  bool caught_exception = false;
1184 
1185  try {
1186  CSeqDB dbp("data/seqp", CSeqDB::eProtein);
1187 
1188  int nseqs(0);
1189  dbp.GetTotals(CSeqDB::eFilteredAll, & nseqs, 0);
1190 
1191  const char * buffer = 0;
1192  Uint4 len = dbp.GetSequence(nseqs, & buffer);
1193 
1194  BOOST_REQUIRE_EQUAL((Uint4) 11112222, len);
1195  } catch(CSeqDBException &) {
1196  caught_exception = true;
1197  }
1198 
1199  if (! caught_exception) {
1200  BOOST_ERROR("GetSeqHighOID() did not throw an exception of type CSeqDBException.");
1201  }
1202 }
1203 
1205 {
1206 
1207  bool caught_exception = false;
1208 
1209  try {
1210  CSeqDB dbp("data/seqp", CSeqDB::eProtein);
1211 
1212  const char * buffer = 0;
1213  Uint4 len = dbp.GetSequence(0-1, & buffer);
1214 
1215  BOOST_REQUIRE_EQUAL((Uint4) 11112222, len);
1216  } catch(CSeqDBException &) {
1217  caught_exception = true;
1218  }
1219 
1220  if (! caught_exception) {
1221  BOOST_ERROR("GetSeqNegOID() did not throw an exception of type CSeqDBException.");
1222  }
1223 }
1224 
1225 BOOST_AUTO_TEST_CASE(Offset2OidBadOffset)
1226 {
1227 
1228  bool caught_exception = false;
1229 
1230  try {
1231  //, CSeqDBException);
1232  CSeqDB nr("nr", CSeqDB::eProtein);
1233 
1234  Uint8 vlength(0);
1235  nr.GetTotals(CSeqDB::eUnfilteredAll, 0, & vlength);
1236 
1237  nr.GetOidAtOffset(0, vlength + 1);
1238  } catch(CSeqDBException &) {
1239  caught_exception = true;
1240  }
1241 
1242  if (! caught_exception) {
1243  BOOST_ERROR("Offset2OidBadOffset() did not throw an exception of type CSeqDBException.");
1244  }
1245 }
1246 
1247 BOOST_AUTO_TEST_CASE(Offset2OidBadOid)
1248 {
1249 
1250  bool caught_exception = false;
1251 
1252  try {
1253  CSeqDB nr("nr", CSeqDB::eProtein);
1254 
1255  Int4 noids(0);
1256  nr.GetTotals(CSeqDB::eUnfilteredAll, & noids, 0);
1257 
1258  nr.GetOidAtOffset(noids + 1, 0);
1259  } catch(CSeqDBException &) {
1260  caught_exception = true;
1261  }
1262 
1263  if (! caught_exception) {
1264  BOOST_ERROR("Offset2OidBadOid() did not throw an exception of type CSeqDBException.");
1265  }
1266 }
1267 
1268 BOOST_AUTO_TEST_CASE(Offset2OidMonotony)
1269 {
1270 
1271  Uint4 segments = 1000;
1272 
1273  for(Uint4 i = 0; i<2; i++) {
1274  string dbname((i == 0) ? "nr" : "nt");
1275  CSeqDB::ESeqType sqtype((i == 0)
1278 
1279  CSeqDB db(dbname, sqtype);
1280 
1281  int prev_oid = 0;
1282  int num_oids = 0;
1283  Uint8 vol_length(0);
1284 
1285  db.GetTotals(CSeqDB::eUnfilteredAll, & num_oids, & vol_length);
1286 
1287  for(Uint4 j = 0; j < segments; j++) {
1288  Uint8 range_target = (vol_length * j) / segments;
1289 
1290  int oid_here = db.GetOidAtOffset(0, range_target);
1291 
1292  double range_ratio = double(range_target) / vol_length;
1293  double oid_ratio = double(oid_here) / num_oids;
1294  double percent_diff = 100.0 * fabs(oid_ratio - range_ratio);
1295 
1296  // Up to 30 % slack will be permitted. Normally this runs
1297  // to a maximum under 5%. This can break when the db is
1298  // reorganized (as they are every day) if reorganization
1299  // moves a lot of heavy sequences to one end.
1300  //
1301  // This was 15% - changed on Jan 2, 2008.
1302 
1303  BOOST_REQUIRE(prev_oid <= oid_here);
1304  BOOST_REQUIRE(percent_diff <= 30.0);
1305 
1306  prev_oid = oid_here;
1307  }
1308  }
1309 }
1310 
1312 public:
1313  CTmpEnvironmentSetter(const char* name, const char* value = NULL) {
1314  m_Name.assign(name);
1316  m_Env.Set(m_Name, (value == NULL ? kEmptyStr : string(value)));
1317  }
1319  if ( !m_PrevValue.empty() ) {
1321  }
1322  }
1323 private:
1325  string m_Name;
1326  string m_PrevValue;
1327 };
1328 
1329 BOOST_AUTO_TEST_CASE(OpenWithBLASTDBEnv)
1330 {
1331  CTmpEnvironmentSetter tmpenv("BLASTDB", "/blast/db/blast");
1332  CSeqDB db1("nr", CSeqDB::eProtein);
1333  CSeqDB db2("pdb", CSeqDB::eProtein);
1334  CSeqDB db3("pdbnt", CSeqDB::eNucleotide);
1335 }
1336 
1337 BOOST_AUTO_TEST_CASE(OpenWithoutBLASTDBEnv)
1338 {
1339  CTmpEnvironmentSetter tmpenv("BLASTDB");
1340  CSeqDB db1("nr", CSeqDB::eProtein);
1341  CSeqDB db2("pdbnt", CSeqDB::eNucleotide);
1342  // When the line below is removed, things work (06/02/08 2:53PM EST) ?
1343  CSeqDB db3("pdb", CSeqDB::eProtein);
1344 }
1345 
1347 {
1348 
1349  vector<string> names;
1350 
1351  names.push_back("p,nr");
1352  names.push_back("n,nt");
1353  names.push_back("n,pdbnt");
1354  names.push_back("p,pdb");
1355  names.push_back("p,CDSEARCH/oasis_pfam");
1356 
1357  ITERATE(vector<string>, s, names) {
1358  BOOST_REQUIRE(s->length() > 2);
1359 
1360  char prot_nucl = (*s)[0];
1361  string dbname(*s, 2, s->length()-2);
1362 
1363  CSeqDB db(dbname, prot_nucl == 'p' ? CSeqDB::eProtein : CSeqDB::eNucleotide);
1364  }
1365 }
1366 
1368 {
1369 
1370  // Alias file name prefixes
1371 
1372  const char * mask_name[] = {
1373  "range", "odd", "even", "prime", "ERROR"
1374  };
1375 
1376  // For each of the masking types, an alias file exists that
1377  // covers each of the OID intervals shown below. There are
1378  // several cases in the OID mask / OID range combination code;
1379  // these intervals (except the 0,0) are intended to measure
1380  // the ability of that code to deal with edge conditions
1381  // (edges of bytes and edges of OID ranges for example).
1382 
1383  int ranges[] = {
1384  1, 2,
1385  1, 5,
1386  1, 7,
1387  1, 8,
1388  1, 9,
1389  3, 7,
1390  3, 13,
1391  1, 1,
1392  10, 100,
1393  8, 128,
1394  10, 130,
1395  9, 130,
1396  9, 129,
1397  8, 130,
1398  8, 129,
1399  9, 128,
1400  1, 10,
1401  0, 0
1402  };
1403 
1404  vector<int> oids;
1405 
1407  for(int i = 0; ranges[i]; i += 2) {
1408  unsigned first = ranges[i];
1409  unsigned second = ranges[i+1];
1410 
1411  if ((((mask == eOdd) && (first != 3)) || (second != 7)) || (mask == eAll)) {
1412  continue;
1413  }
1414 
1415  string dbname = s_ToString("data/ranges/", mask_name[mask], first, "_", second);
1417 
1418  int obegin(0), oend(0);
1419 
1420  int count(0);
1421  int lowest(INT_MAX);
1422  int highest(0);
1423 
1424  // This could be done more cleanly with CheckOrFindOID
1425  // as in the next example, but this serves as an
1426  // additional wrinkle (ie for testing purposes). Or
1427  // else I forgot that CheckOrFindOID was available.
1428 
1429  // This tests that all returned OIDs should in fact be
1430  // included. It currently does not make any attempt
1431  // to verify that no OIDs are missed (but it could).
1432 
1433  while(1) {
1434  CSeqDB::EOidListType range_type =
1435  db.GetNextOIDChunk(obegin, oend, 10, oids);
1436 
1437  unsigned num_found(0);
1438 
1439  if (range_type == CSeqDB::eOidList) {
1440  num_found = (int) oids.size();
1441 
1442  ITERATE(vector<int>, iter, oids) {
1443  if ((*iter) > highest) {
1444  highest = (*iter);
1445  }
1446 
1447  if ((*iter) < lowest) {
1448  lowest = (*iter);
1449  }
1450 
1451  BOOST_REQUIRE(s_MaskingTest(mask, *iter));
1452  }
1453  } else {
1454  num_found = oend-obegin;
1455 
1456  if (num_found) {
1457  if (oend > highest) {
1458  highest = oend;
1459  }
1460 
1461  if (obegin < lowest) {
1462  lowest = obegin;
1463  }
1464  }
1465 
1466  for(int v = obegin; v < oend; v++) {
1467  BOOST_REQUIRE(s_MaskingTest(mask, v));
1468  }
1469  }
1470 
1471  if (obegin == oend) {
1472  break;
1473  }
1474 
1475  count += num_found;
1476  }
1477 
1478  s_TestMaskingLimits(mask, first-1, second-1, lowest, highest, count);
1479  }
1480  }
1481 }
1482 
1483 BOOST_AUTO_TEST_CASE(GiListOidRange)
1484 {
1485 
1486  TGi low_gi = 20*1000*1000;
1487  TGi high_gi = 30*1000*1000;
1488 
1489  int low_oid = 50;
1490  int high_oid = 150;
1491 
1492  vector<string> dbs;
1493  dbs.push_back("data/seqp");
1494  dbs.push_back("data/ranges/seqp15");
1495  dbs.push_back("data/ranges/twenty");
1496  dbs.push_back("data/ranges/twenty15");
1497 
1498  for(Uint4 dbnum = 0; dbnum < dbs.size(); dbnum++) {
1499  CSeqDB db(dbs[dbnum], CSeqDB::eProtein);
1500 
1501  bool all_gis_in_range = true;
1502  bool all_oids_in_range = true;
1503 
1504  // Get all included OIDs and GIs for this database
1505  for(int oid = 0; db.CheckOrFindOID(oid); oid++) {
1506  if (! (all_oids_in_range || all_gis_in_range)) {
1507  break;
1508  }
1509 
1510  if (all_oids_in_range) {
1511  if ((oid < (low_oid-1)) || ((high_oid-1)) < oid) {
1512  all_oids_in_range = false;
1513  }
1514  }
1515 
1516  if (all_gis_in_range) {
1517  list< CRef<CSeq_id> > ids = db.GetSeqIDs(oid);
1518 
1519  bool gi_in_range = false;
1520 
1521  ITERATE(list< CRef<CSeq_id> >, seqid, ids) {
1522  if ((**seqid).IsGi()) {
1523  TGi gi = (**seqid).GetGi();
1524 
1525  if ((gi > low_gi) && (gi < high_gi)) {
1526  gi_in_range = true;
1527  break;
1528  }
1529  }
1530  }
1531 
1532  if (! gi_in_range) {
1533  all_gis_in_range = false;
1534  }
1535  }
1536  }
1537 
1538  bool gis_confined (false);
1539  bool oids_confined(false);
1540 
1541  switch(dbnum) {
1542  case 0:
1543  gis_confined = false;
1544  oids_confined = false;
1545  break;
1546 
1547  case 1:
1548  gis_confined = false;
1549  oids_confined = true;
1550  break;
1551 
1552  case 2:
1553  gis_confined = true;
1554  oids_confined = false;
1555  break;
1556 
1557  case 3:
1558  gis_confined = true;
1559  oids_confined = true;
1560  break;
1561  }
1562 
1563  BOOST_REQUIRE_EQUAL(oids_confined, all_oids_in_range);
1564  BOOST_REQUIRE_EQUAL(gis_confined, all_gis_in_range);
1565  }
1566 }
1567 
1569 {
1570 
1571  bool caught_exception = false;
1572 
1573  try {
1574  CSeqDB db("", CSeqDB::eProtein);
1575  } catch(CSeqDBException &) {
1576  caught_exception = true;
1577  }
1578 
1579  if (! caught_exception) {
1580  BOOST_ERROR("EmptyDBList() did not throw an exception of type CSeqDBException.");
1581  }
1582 }
1583 
1584 BOOST_AUTO_TEST_CASE(IsBinaryGiList_True)
1585 {
1586  BOOST_REQUIRE_EQUAL(true, SeqDB_IsBinaryGiList("data/prot345b.gil"));
1587 }
1588 
1589 BOOST_AUTO_TEST_CASE(IsBinaryGiList_False)
1590 {
1591  BOOST_REQUIRE_EQUAL(false, SeqDB_IsBinaryGiList("data/prot345t.gil"));
1592  BOOST_REQUIRE(SeqDB_IsBinaryGiList("data/totals.nal") == false);
1593 }
1594 
1595 BOOST_AUTO_TEST_CASE(IsBinaryGiList_EmptyFile)
1596 {
1597  BOOST_REQUIRE_THROW(SeqDB_IsBinaryGiList("data/broken-mask-data-db.paa"),
1598  CSeqDBException);
1599 }
1600 
1601 BOOST_AUTO_TEST_CASE(IsBinaryGiList_InvalidFile)
1602 {
1603  if (CFile(DEV_NULL).Exists()) {
1604  BOOST_REQUIRE_THROW(SeqDB_IsBinaryGiList(DEV_NULL),
1605  CSeqDBException);
1606  }
1607 }
1608 
1609 BOOST_AUTO_TEST_CASE(BinaryUserGiList)
1610 {
1611 
1612  CRef<CSeqDBGiList> gi_list(new CSeqDBFileGiList("data/prot345b.gil"));
1613  CSeqDB db("data/seqp", CSeqDB::eProtein, 0, 0, true, gi_list);
1614 
1615  int found(0);
1616 
1617  for(int oid = 0; db.CheckOrFindOID(oid); oid++) {
1618  found ++;
1619  }
1620 
1621  // The GI list has 471 elements, only 58 of those are in the DB.
1622  BOOST_REQUIRE_EQUAL(29, found);
1623 }
1624 
1625 
1626 BOOST_AUTO_TEST_CASE(TextUserGiList)
1627 {
1628 
1629  CRef<CSeqDBGiList> gi_list(new CSeqDBFileGiList("data/prot345t.gil"));
1630  CSeqDB db("data/seqp", CSeqDB::eProtein, 0, 0, true, gi_list);
1631 
1632  int found(0);
1633 
1634  for(int oid = 0; db.CheckOrFindOID(oid); oid++) {
1635  found ++;
1636  }
1637 
1638  // The GI list has 471 elements, only 58 of those are in the DB.
1639  BOOST_REQUIRE_EQUAL(29, found);
1640 }
1641 
1642 BOOST_AUTO_TEST_CASE(UserSeqIdList)
1643 {
1644 
1645  CRef<CSeqDBGiList> gi_list(new CSeqDBFileGiList("data/prot345.sil", CSeqDBFileGiList::eSiList));
1646  CSeqDB db("data/seqp", CSeqDB::eProtein, 0, 0, true, gi_list);
1647 
1648  int found(0);
1649 
1650  for(int oid = 0; db.CheckOrFindOID(oid); oid++) {
1651  found ++;
1652  }
1653 
1654  BOOST_REQUIRE_EQUAL(58, found);
1655 }
1656 
1657 BOOST_AUTO_TEST_CASE(CSeqDBFileGiList_GetGis)
1658 {
1659 
1660  const string kFileName("data/prot345t.gil");
1661 
1662  // Read using CSeqDBFileGiList
1663  CSeqDBFileGiList seqdbgifile(kFileName);
1664  vector<TGi> gis;
1665  seqdbgifile.GetGiList(gis);
1666  BOOST_REQUIRE_EQUAL((size_t) seqdbgifile.GetNumGis(), gis.size());
1667  sort(gis.begin(), gis.end());
1668 
1669  // Read text gi list manually
1671  ifstream gifile(fn.c_str());
1672  BOOST_REQUIRE(gifile);
1673 
1674  vector<TGi> reference;
1675  reference.reserve(gis.size());
1676  while ( !gifile.eof() ) {
1677  Int8 tgi = -1; // gis can't be negative
1678  gifile >> tgi;
1679  if (tgi == -1) break;
1680  reference.push_back(GI_FROM(Int8, tgi));
1681  }
1682  sort(reference.begin(), reference.end());
1683  BOOST_REQUIRE_EQUAL(reference.size(), gis.size());
1684 
1685  // Compare the contents
1686  for (size_t i = 0; i < reference.size(); i++) {
1687  string msg = "Failed on element " + NStr::SizetToString(i);
1688  BOOST_REQUIRE_MESSAGE(reference[i] == gis[i], msg);
1689  }
1690 }
1691 
1692 BOOST_AUTO_TEST_CASE(TwoGiListsOneVolume)
1693 {
1694 
1695  vector<string> dbs;
1696  dbs.push_back("Test/Giardia.01");
1697  dbs.push_back("Test/baylor_wgs_contigs.01");
1698  dbs.push_back(dbs[0] + " " + dbs[1]);
1699 
1700  vector< vector<TGi> > gis(dbs.size());
1701  vector< vector<string> > volumes(dbs.size());
1702 
1703  for(int i = 0; i < (int)dbs.size(); i++) {
1704  CRef<CSeqDB> db;
1705  BOOST_REQUIRE_NO_THROW(db.Reset(new CSeqDB(dbs[i],
1707 
1708  db->FindVolumePaths(volumes[i]);
1709 
1710  // Collect all the included gis.
1711 
1712  for(int oid = 0; db->CheckOrFindOID(oid); oid++) {
1713  db->GetGis(oid, gis[i], true);
1714  }
1715  }
1716 
1717  // Check that the same volume underlies both database aliases.
1718 
1719  BOOST_REQUIRE(volumes[0] == volumes[1]);
1720  BOOST_REQUIRE(volumes[0] == volumes[2]);
1721  BOOST_REQUIRE_EQUAL(gis[0].size() + gis[1].size(), gis[2].size());
1722 
1723  vector<TGi> zero_one(gis[0]);
1724  zero_one.insert(zero_one.end(), gis[1].begin(), gis[1].end());
1725 
1726  sort(zero_one.begin(), zero_one.end());
1727  sort(gis[2].begin(), gis[2].end());
1728 
1729  BOOST_REQUIRE(zero_one == gis[2]);
1730 }
1731 
1732 BOOST_AUTO_TEST_CASE(GetTaxIDs_gi_to_taxid)
1733 {
1734 
1735  TGi gi1a = 446106212;
1736  int tax1 = 1386;
1737 
1738  TGi gi2a = 494110381;
1739  TGi gi2b = 30172867;
1740  int tax2a = 1678;
1741  int tax2b = 206672;
1742 
1743  int oid1 = -1;
1744  int oid2 = -1;
1745 
1746  CSeqDB db("data/wp_nr", CSeqDB::eProtein);
1747 
1748  bool success = db.GiToOid(gi1a, oid1);
1749  BOOST_REQUIRE(success);
1750 
1751  success = db.GiToOid(gi2a, oid2);
1752  BOOST_REQUIRE(success);
1753 
1754  BOOST_REQUIRE(oid1 != oid2);
1755 
1756  map<TGi, TTaxId> gi2taxid;
1757 
1758  db.GetTaxIDs(oid1, gi2taxid);
1759  BOOST_REQUIRE_EQUAL((int)gi2taxid.size(), 44);
1760  BOOST_REQUIRE_EQUAL(gi2taxid[gi1a], tax1);
1761 
1762  db.GetTaxIDs(oid2, gi2taxid, false);
1763  BOOST_REQUIRE_EQUAL((int)gi2taxid.size(), 23);
1764  BOOST_REQUIRE_EQUAL(gi2taxid[gi2a], tax2a);
1765  BOOST_REQUIRE_EQUAL(gi2taxid[gi2b], tax2b);
1766 
1767  db.GetTaxIDs(oid1, gi2taxid, true);
1768  BOOST_REQUIRE_EQUAL((int)gi2taxid.size(), 67);
1769  BOOST_REQUIRE_EQUAL(gi2taxid[gi1a], tax1);
1770  BOOST_REQUIRE_EQUAL(gi2taxid[gi2a], tax2a);
1771  BOOST_REQUIRE_EQUAL(gi2taxid[gi2b], tax2b);
1772 }
1773 
// Bounds helpers for built-in arrays (not pointers): BEGIN(X) is a pointer
// to the first element, END(X) a pointer one past the last element.
#define BEGIN(X) (X)
#define END(X) ((X) + (sizeof(X) / sizeof((X)[0])))
1776 
1777 BOOST_AUTO_TEST_CASE(GetLeafTaxIDs_gi_to_taxid_set)
1778 {
1779 
1780  TGi gi1a = 446106212;
1781  int tax1[] = {
1782  1386,
1783  1392,
1784  1396,
1785  1428,
1786  1234146
1787  };
1788 
1789  TGi gi2a = 494110381;
1790  int tax2a[] = {
1791  1678,
1792  216816,
1793  469594,
1794  1263059
1795  };
1796 
1797  int oid1 = -1;
1798  int oid2 = -1;
1799 
1800  CSeqDB db("data/wp_nr", CSeqDB::eProtein);
1801 
1802  bool success = db.GiToOid(gi1a, oid1);
1803  BOOST_REQUIRE(success);
1804 
1805  success = db.GiToOid(gi2a, oid2);
1806  BOOST_REQUIRE(success);
1807 
1808  BOOST_REQUIRE(oid1 != oid2);
1809 
1810  map<TGi, set<TTaxId> > gi2taxids;
1811 
1812  set<int> expected1;
1813  expected1.insert(BEGIN(tax1), END(tax1));
1814 
1815  set<int> expected2a;
1816  expected2a.insert(BEGIN(tax2a), END(tax2a));
1817 
1818  // At this point, gi2taxids is empty.
1819  BOOST_REQUIRE(gi2taxids.empty());
1820  db.GetLeafTaxIDs(oid1, gi2taxids);
1821  BOOST_REQUIRE_EQUAL((int) gi2taxids.size(), 44);
1822  BOOST_REQUIRE_EQUAL((int) gi2taxids[gi1a].size(), 5);
1823  BOOST_REQUIRE_EQUAL_COLLECTIONS(
1824  gi2taxids[gi1a].begin(), gi2taxids[gi1a].end(),
1825  expected1.begin(), expected1.end()
1826  );
1827 
1828  // At this point, gi2taxids is NOT empty, but 'persist' is false.
1829  BOOST_REQUIRE(!gi2taxids.empty());
1830  db.GetLeafTaxIDs(oid2, gi2taxids, false);
1831  BOOST_REQUIRE_EQUAL((int) gi2taxids.size(), 23);
1832  BOOST_REQUIRE_EQUAL((int) gi2taxids[gi2a].size(), 4);
1833  BOOST_REQUIRE_EQUAL_COLLECTIONS(
1834  gi2taxids[gi2a].begin(), gi2taxids[gi2a].end(),
1835  expected2a.begin(), expected2a.end()
1836  );
1837 
1838  // At this point, gi2taxids is NOT empty, and 'persist' is true.
1839  BOOST_REQUIRE(!gi2taxids.empty());
1840  db.GetLeafTaxIDs(oid1, gi2taxids, true);
1841  BOOST_REQUIRE_EQUAL((int) gi2taxids.size(), 67);
1842  BOOST_REQUIRE_EQUAL((int) gi2taxids[gi1a].size(), 5);
1843  BOOST_REQUIRE_EQUAL((int) gi2taxids[gi2a].size(), 4);
1844  BOOST_REQUIRE_EQUAL_COLLECTIONS(
1845  gi2taxids[gi1a].begin(), gi2taxids[gi1a].end(),
1846  expected1.begin(), expected1.end()
1847  );
1848  BOOST_REQUIRE_EQUAL_COLLECTIONS(
1849  gi2taxids[gi2a].begin(), gi2taxids[gi2a].end(),
1850  expected2a.begin(), expected2a.end()
1851  );
1852 }
1853 
1854 BOOST_AUTO_TEST_CASE(GetTaxIDs_vector_of_taxids)
1855 {
1856 
1857  TGi gi1a = 446106212;
1858  int tax1[] = {
1859  198094,
1860  261594,
1861  260799,
1862  281309,
1863  412694,
1864  405535,
1865  568206,
1866  592021,
1867  637380,
1868  347495,
1869  768494,
1870  1386,
1871  198094,
1872  261594,
1873  260799,
1874  281309,
1875  412694,
1876  486624,
1877  486619,
1878  486621,
1879  486623,
1880  486620,
1881  486622,
1882  405536,
1883  405917,
1884  451709,
1885  451707,
1886  405535,
1887  568206,
1888  592021,
1889  637380,
1890  347495,
1891  768494,
1892  1053216,
1893  1211117,
1894  1213182,
1895  673518,
1896  743835,
1897  1439874,
1898  1412843,
1899  1412842,
1900  1412844,
1901  1392837,
1902  1437442
1903  };
1904 
1905  TGi gi2a = 494110381;
1906  int tax2a[] = {
1907  206672,
1908  205913,
1909  565040,
1910  565042,
1911  1035817,
1912  1678,
1913  206672,
1914  205913,
1915  206672,
1916  205913,
1917  537937,
1918  469594,
1919  565042,
1920  565040,
1921  1035817,
1922  1161745,
1923  1161744,
1924  1161904,
1925  1161743,
1926  1298922,
1927  1263059,
1928  1205679,
1929  1322347
1930  };
1931 
1932  int oid1 = -1;
1933  int oid2 = -1;
1934 
1935  vector<int> expected1;
1936  expected1.assign(BEGIN(tax1), END(tax1));
1937  sort(expected1.begin(), expected1.end());
1938 
1939  vector<int> expected2a;
1940  expected2a.assign(BEGIN(tax2a), END(tax2a));
1941  sort(expected2a.begin(), expected2a.end());
1942 
1943  CSeqDB db("data/wp_nr", CSeqDB::eProtein);
1944 
1945  bool success = db.GiToOid(gi1a, oid1);
1946  BOOST_REQUIRE(success);
1947 
1948  success = db.GiToOid(gi2a, oid2);
1949  BOOST_REQUIRE(success);
1950 
1951  BOOST_REQUIRE(oid1 != oid2);
1952 
1953  vector<TTaxId> taxids;
1954 
1955  // At this point, taxids is empty.
1956  db.GetTaxIDs(oid1, taxids);
1957  sort(taxids.begin(), taxids.end());
1958  BOOST_REQUIRE_EQUAL((int) taxids.size(), (int) expected1.size());
1959  BOOST_REQUIRE_EQUAL_COLLECTIONS(
1960  taxids.begin(), taxids.end(),
1961  expected1.begin(), expected1.end()
1962  );
1963 
1964  // At this point, taxids is NOT empty, but 'persist' is false.
1965  db.GetTaxIDs(oid2, taxids, false);
1966  sort(taxids.begin(), taxids.end());
1967  BOOST_REQUIRE_EQUAL((int) taxids.size(), (int) expected2a.size());
1968  BOOST_REQUIRE_EQUAL_COLLECTIONS(
1969  taxids.begin(), taxids.end(),
1970  expected2a.begin(), expected2a.end()
1971  );
1972 
1973  expected2a.insert(
1974  expected2a.end(),
1975  expected1.begin(),
1976  expected1.end()
1977  );
1978  sort(expected2a.begin(), expected2a.end());
1979 
1980  // At this point, taxids is NOT empty, and 'persist' is true.
1981  db.GetTaxIDs(oid1, taxids, true);
1982  sort(taxids.begin(), taxids.end());
1983  BOOST_REQUIRE_EQUAL((int) taxids.size(), (int) expected2a.size());
1984  BOOST_REQUIRE_EQUAL_COLLECTIONS(
1985  taxids.begin(), taxids.end(),
1986  expected2a.begin(), expected2a.end()
1987  );
1988 }
1989 
1990 BOOST_AUTO_TEST_CASE(GetLeafTaxIDs_vector_of_taxids)
1991 {
1992 
1993  TGi gi1a = 446106212;
1994  int tax1[] = {
1995  1386, 1392, 1396, 1428, 1234146
1996  };
1997 
1998  TGi gi2a = 494110381;
1999  int tax2a[] = {
2000  1678, 216816, 469594, 1263059
2001  };
2002 
2003  int oid1 = -1;
2004  int oid2 = -1;
2005 
2006  vector<int> expected1;
2007  expected1.assign(BEGIN(tax1), END(tax1));
2008  sort(expected1.begin(), expected1.end());
2009 
2010  vector<int> expected2a;
2011  expected2a.assign(BEGIN(tax2a), END(tax2a));
2012  sort(expected2a.begin(), expected2a.end());
2013 
2014  CSeqDB db("data/wp_nr", CSeqDB::eProtein);
2015 
2016  bool success = db.GiToOid(gi1a, oid1);
2017  BOOST_REQUIRE(success);
2018 
2019  success = db.GiToOid(gi2a, oid2);
2020  BOOST_REQUIRE(success);
2021 
2022  BOOST_REQUIRE(oid1 != oid2);
2023 
2024  vector<TTaxId> taxids;
2025 
2026  // At this point, taxids is empty.
2027  db.GetLeafTaxIDs(oid1, taxids);
2028  sort(taxids.begin(), taxids.end());
2029  BOOST_REQUIRE_EQUAL((int) taxids.size(), (int) expected1.size());
2030  BOOST_REQUIRE_EQUAL_COLLECTIONS(
2031  taxids.begin(), taxids.end(),
2032  expected1.begin(), expected1.end()
2033  );
2034 
2035  // At this point, taxids is NOT empty, but 'persist' is false.
2036  db.GetLeafTaxIDs(oid2, taxids, false);
2037  sort(taxids.begin(), taxids.end());
2038  BOOST_REQUIRE_EQUAL((int) taxids.size(), (int) expected2a.size());
2039  BOOST_REQUIRE_EQUAL_COLLECTIONS(
2040  taxids.begin(), taxids.end(),
2041  expected2a.begin(), expected2a.end()
2042  );
2043 
2044  expected2a.insert(
2045  expected2a.end(),
2046  expected1.begin(),
2047  expected1.end()
2048  );
2049  sort(expected2a.begin(), expected2a.end());
2050 
2051  // At this point, taxids is NOT empty, and 'persist' is true.
2052  db.GetLeafTaxIDs(oid1, taxids, true);
2053  sort(taxids.begin(), taxids.end());
2054  BOOST_REQUIRE_EQUAL((int) taxids.size(), (int) expected2a.size());
2055  BOOST_REQUIRE_EQUAL_COLLECTIONS(
2056  taxids.begin(), taxids.end(),
2057  expected2a.begin(), expected2a.end()
2058  );
2059 }
2060 
2061 BOOST_AUTO_TEST_CASE(PartialSequences)
2062 {
2063 
2064  // 57340989 - is nicely marbled with ambiguities.
2065  // 24430781 - has several long ambiguous subsequences, one at the start.
2066  // 8885782 - has three ambiguities, one at the end.
2067 
2068  CSeqDB nt("nt", CSeqDB::eNucleotide);
2069 
2070  s_TestPartialAmbig(nt, 57340989);
2071  s_TestPartialAmbig(nt, 24430781);
2072  s_TestPartialAmbig(nt, 1059791394);
2073 }
2074 
2075 BOOST_AUTO_TEST_CASE(GiListInOidRangeIteration)
2076 {
2077 
2078  const int kNumTestGis = 3;
2079  const int kGiOids[kNumTestGis] = { 15, 51, 84 };
2080  CRef<CSeqDBGiList> gi_list(new CSeqDBFileGiList("data/seqn_3gis.gil"));
2081 
2082  CSeqDB db("data/seqn", CSeqDB::eNucleotide, gi_list);
2083 
2084  int start, end;
2085  vector<int> oid_list;
2086 
2087  db.SetIterationRange(0, kGiOids[0]+1);
2088 
2089  CSeqDB::EOidListType chunk_type =
2090  db.GetNextOIDChunk(start, end, kNumTestGis, oid_list);
2091  BOOST_REQUIRE(chunk_type == CSeqDB::eOidList);
2092 
2093  // One of the 3 gis falls within ordinal id range.
2094  BOOST_REQUIRE_EQUAL(1, (int)oid_list.size());
2095 
2096  db.SetIterationRange(kGiOids[0]+1, kGiOids[1]+1);
2097 
2098  chunk_type = db.GetNextOIDChunk(start, end, kNumTestGis, oid_list);
2099  BOOST_REQUIRE(chunk_type == CSeqDB::eOidList);
2100 
2101  // Two of the 3 gis falls within ordinal id range.
2102  BOOST_REQUIRE_EQUAL(1, (int)oid_list.size());
2103 
2104  db.SetIterationRange(kGiOids[1]+1, 0);
2105 
2106  chunk_type = db.GetNextOIDChunk(start, end, kNumTestGis, oid_list);
2107  BOOST_REQUIRE(chunk_type == CSeqDB::eOidList);
2108 
2109  // Two of the 3 gis falls within ordinal id range.
2110  BOOST_REQUIRE_EQUAL(1, (int)oid_list.size());
2111 }
2112 
2114 {
2115 
2116  CSeqDB db("nr", CSeqDB::eProtein);
2117 
2118  int oid = 0;
2119 
2120  vector<int> oids1;
2121  vector<int> oids2;
2122 
2123  CSeq_id seqid("P01013.1");
2124 
2125  BOOST_REQUIRE(db.SeqidToOid(seqid, oid));
2126  oids1.push_back(oid);
2127 
2128  db.SeqidToOids(seqid, oids2);
2129  BOOST_REQUIRE(! oids2.empty());
2130 
2131  ITERATE(vector<int>, iter, oids1) {
2132  BOOST_REQUIRE(*iter == oids2[0]);
2133  }
2134 }
2135 
2136 BOOST_AUTO_TEST_CASE(TestResetInternalChunkBookmark)
2137 {
2138 
2139  CSeqDB db("data/seqp", CSeqDB::eProtein);
2140 
2141  const int kFirstOid(0);
2142  const int kLastOid(100);
2143  db.SetIterationRange(kFirstOid, kLastOid);
2144 
2145  int start, end;
2146  vector<int> oid_list;
2147 
2148  CSeqDB::EOidListType chunk_type =
2149  db.GetNextOIDChunk(start, end, kLastOid, oid_list);
2150  BOOST_REQUIRE(chunk_type == CSeqDB::eOidRange);
2151  BOOST_REQUIRE_EQUAL(kFirstOid, start);
2152  BOOST_REQUIRE_EQUAL(kLastOid, end);
2153 
2154  chunk_type = db.GetNextOIDChunk(start, end, kLastOid, oid_list);
2155  BOOST_REQUIRE(chunk_type == CSeqDB::eOidRange);
2156  BOOST_REQUIRE_EQUAL(kFirstOid, start);
2157  BOOST_REQUIRE_EQUAL(kFirstOid, end);
2158 
2160  chunk_type = db.GetNextOIDChunk(start, end, kLastOid, oid_list);
2161  BOOST_REQUIRE(chunk_type == CSeqDB::eOidRange);
2162  BOOST_REQUIRE_EQUAL(kFirstOid, start);
2163  BOOST_REQUIRE_EQUAL(kLastOid, end);
2164 }
2165 
2166 BOOST_AUTO_TEST_CASE(ExpertNullConstructor)
2167 {
2168 
2169  CSeqDBExpert db;
2170 }
2171 
2172 BOOST_AUTO_TEST_CASE(ExpertTaxInfo)
2173 {
2174 
2175  CSeqDBExpert db;
2176 
2178  db.GetTaxInfo(57176, info);
2179 
2180  BOOST_REQUIRE_EQUAL(info.taxid, 57176);
2181  BOOST_REQUIRE_EQUAL((string)info.scientific_name, string("Aotus vociferans"));
2182  BOOST_REQUIRE_EQUAL((string)info.common_name, string("noisy night monkey"));
2183  BOOST_REQUIRE_EQUAL((string)info.blast_name, string("primates"));
2184  BOOST_REQUIRE_EQUAL((string)info.s_kingdom, string("Eukaryota"));
2185 
2186  db.GetTaxInfo(562, info);
2187  BOOST_REQUIRE_EQUAL(info.taxid, 562);
2188 
2189  BOOST_REQUIRE_THROW(db.GetTaxInfo(2147483647, info), CSeqDBException);
2190  BOOST_REQUIRE_THROW(db.GetTaxInfo(0, info), CSeqDBException);
2191  BOOST_REQUIRE_THROW(db.GetTaxInfo(-3, info), CSeqDBException);
2192 }
2193 
2194 BOOST_AUTO_TEST_CASE(ExpertRawData)
2195 {
2196 
2198 
2199  int oid(-1);
2200  db.GiToOid(1465582, oid);
2201 
2202  int slen(0),alen(0);
2203  const char * buffer(0);
2204 
2205  db.GetRawSeqAndAmbig(oid, & buffer, & slen, & alen);
2206 
2207  unsigned h = s_BufHash(buffer + slen, alen);
2208  unsigned exp_hash = 705445389u;
2209 
2210  BOOST_REQUIRE_EQUAL((290/4) + 1, slen);
2211  BOOST_REQUIRE_EQUAL(20, alen);
2212  BOOST_REQUIRE_EQUAL(exp_hash, h);
2213 }
2214 
2215 BOOST_AUTO_TEST_CASE(ExpertRawDataProteinNulls)
2216 {
2217 
2218  // Test the intersequence zero termination bytes.
2219 
2220  CSeqDBExpert db("nr", CSeqDB::eProtein);
2221 
2222  vector<int> oids;
2223  oids.push_back(0);
2224  oids.push_back(db.GetNumOIDs()-1);
2225 
2226  // This should not throw any exceptions (or core dump) if the
2227  // implementation of database reading and writing is correct.
2228 
2229  ITERATE(vector<int>, oid, oids) {
2230  int slen(0),alen(0);
2231  const char * buffer(0);
2232 
2233  db.GetRawSeqAndAmbig(*oid, & buffer, & slen, & alen);
2234 
2235  string S(buffer, slen);
2236  string A(buffer + slen, alen);
2237 
2238  int len = db.GetSeqLength(*oid);
2239 
2240  BOOST_REQUIRE_EQUAL((int) A.size(), 0);
2241  BOOST_REQUIRE_EQUAL((int) S.size(), len);
2242  BOOST_REQUIRE_EQUAL((int) *(buffer-1), 0);
2243  BOOST_REQUIRE_EQUAL((int) *(buffer+slen), 0);
2244  }
2245 }
2246 
2247 BOOST_AUTO_TEST_CASE(ExpertRawDataLength)
2248 {
2249 
2250  // Tests that it is possible to get the length without getting
2251  // the data, and that RetSequence need not be called in this
2252  // case.
2253 
2255 
2256  int oid(-1);
2257  db.GiToOid(1465582, oid);
2258 
2259  int slen(0),alen(0);
2260 
2261  db.GetRawSeqAndAmbig(oid, 0, & slen, & alen);
2262 
2263  BOOST_REQUIRE_EQUAL((290/4) + 1, slen);
2264  BOOST_REQUIRE_EQUAL(20, alen);
2265 }
2266 
2267 BOOST_AUTO_TEST_CASE(ExpertIdBounds)
2268 {
2269 
2271 
2272 
2273  {
2274  int low(0), high(0), count(0);
2275 
2276  nr.GetPigBounds(& low, & high, & count);
2277 
2278  BOOST_REQUIRE(low < high);
2279  BOOST_REQUIRE(count);
2280  }
2281 }
2282 
2283 BOOST_AUTO_TEST_CASE(ExpertIdBoundsNoPig)
2284 {
2285 
2286  bool caught_exception = false;
2287 
2289  // Tests ID bound functions.
2290  {
2291  TGi low(ZERO_GI);
2292  TGi high(ZERO_GI);
2293  int count(0);
2294 
2295  nt.GetGiBounds(& low, & high, & count);
2296 
2297  BOOST_REQUIRE(low < high);
2298  BOOST_REQUIRE(count);
2299  }
2300 
2301  try {
2302 
2303  int low(0), high(0), count(0);
2304 
2305  nt.GetPigBounds(& low, & high, & count);
2306 
2307  BOOST_REQUIRE(low < high);
2308  BOOST_REQUIRE(count);
2309  } catch(CSeqDBException &) {
2310  caught_exception = true;
2311  }
2312 
2313  if (! caught_exception) {
2314  BOOST_ERROR("ExpertIdBoundsNoPig() did not throw an exception of type CSeqDBException.");
2315  }
2316 }
2317 
2318 BOOST_AUTO_TEST_CASE(ResolveDbPath)
2319 {
2320 
2321  typedef pair<bool, string> TStringBool;
2322  typedef vector< TStringBool > TStringBoolVec;
2323 
2324  TStringBoolVec paths;
2325  paths.push_back(TStringBool(true, "nt.000.nin"));
2326  paths.push_back(TStringBool(true, "Test/ITS_RefSeq_Fungi.nal"));
2327  paths.push_back(TStringBool(true, "taxdb.bti"));
2328  paths.push_back(TStringBool(true, "data/seqp.pin"));
2329  paths.push_back(TStringBool(false, "nr.00")); // missing extension
2330 
2331  // Try to resolve each of the above paths.
2332 
2333  ITERATE(TStringBoolVec, iter, paths) {
2334  string filename = iter->second;
2335  string resolved = SeqDB_ResolveDbPath(filename);
2336  bool found = ! resolved.empty();
2337 
2338  if (iter->first) {
2339  int position = resolved.find(filename);
2340  // Should be found.
2341  BOOST_REQUIRE(found);
2342 
2343  // Resolved names are longer.
2344  BOOST_REQUIRE(resolved.size() > filename.size());
2345 
2346  // Filename must occur at end of resolved name.
2347  BOOST_REQUIRE_EQUAL(position + filename.size(), resolved.size());
2348  } else {
2349  BOOST_REQUIRE(! found);
2350  }
2351  }
2352 }
2353 
2354 
2355 class CSimpleGiList : public CSeqDBGiList {
2356 public:
2357  CSimpleGiList(const vector<TGi> & gis)
2358  {
2359  for(size_t i = 0; i < gis.size(); i++) {
2360  m_GisOids.push_back(gis[i]);
2361  }
2362  }
2363 };
2364 
2365 BOOST_AUTO_TEST_CASE(IntersectionGiList)
2366 {
2367 
2368  vector<TGi> a3; // multiples of 3 from 0..500
2369  vector<TGi> a5; // multiples of 5 from 0..500
2370 
2371  // The number 41 is added to the front of one set and the end of
2372  // the other to verify that the code computing the intersection
2373  // correctly sorts its inputs.
2374 
2375  TGi special = GI_CONST(41);
2376 
2377  // Add to start of a3
2378  a3.push_back(special);
2379 
2380  for(Uint4 i = 0; (i*3) < 500; i++) {
2381  a3.push_back(GI_FROM(Uint4, i*3));
2382 
2383  if (i*5 < 500) {
2384  a5.push_back(GI_FROM(Uint4, i*5));
2385  }
2386  }
2387 
2388  // Add to end of a5
2389  a5.push_back(special);
2390 
2391  CSimpleGiList gi3(a3);
2392 
2393  // Intersection == multiples of 15.
2394  CIntersectionGiList both(gi3, a5);
2395 
2396  for(Uint4 i = 0; i < 500; i++) {
2397  TGi gi = GI_FROM(Uint4, i);
2398  if (((i % 15) == 0) || (gi == special)) {
2399  BOOST_REQUIRE(true == both.FindGi(gi));
2400  } else {
2401  BOOST_REQUIRE(false == both.FindGi(gi));
2402  }
2403  }
2404 }
2405 
2406 BOOST_AUTO_TEST_CASE(IntersectionNegGiList)
2407 {
2408 
2409  vector<TGi> a3; // multiples of 3 from 0..500
2410  vector<TGi> a5; // multiples of 5 from 0..500
2411 
2412  // The number 41 is added to the front of one set and the end of
2413  // the other to verify that the code computing the intersection
2414  // correctly sorts its inputs.
2415 
2416  TGi special = GI_CONST(41);
2417 
2418  // Add to start of a3
2419  a3.push_back(special);
2420 
2421  for(Uint4 i = 0; (i*3) < 500; i++) {
2422  a3.push_back(GI_FROM(Uint4, i*3));
2423 
2424  if (i*5 < 500) {
2425  a5.push_back(GI_FROM(Uint4, i*5));
2426  }
2427  }
2428 
2429  // Add to end of a5
2430  a5.push_back(special);
2431  a5.push_back(GI_CONST(1000));
2432 
2433  CSeqDBNegativeList gi3;
2434  gi3.SetGiList(a3);
2435 
2436  // Intersection <> multiples of 15.
2437  CIntersectionGiList both(gi3, a5);
2438 
2439  // all elements of a5 have to be in the intersect list
2440  // unless they are also found in a3
2441  for(int i = 0; i < (int)a5.size(); i++) {
2442  if (gi3.FindGi(a5[i])) {
2443  BOOST_REQUIRE(false == both.FindGi(a5[i]));
2444  } else {
2445  BOOST_REQUIRE(true == both.FindGi(a5[i]));
2446  }
2447  }
2448 
2449  // all elements in the intersect list have to be found in a5
2450  for(int i = 0; i < both.GetNumGis(); i++) {
2451  const TGi gi = both.GetKey<TGi>(i);
2452  BOOST_REQUIRE(std::find(a5.begin(), a5.end(), gi) != a5.end());
2453  }
2454 }
2455 
2457 {
2458 
2459  vector<int> a3; // multiples of 3 from 0..500
2460  vector<int> a5; // multiples of 5 from 0..500
2461 
2462  // The number 41 is added to the front of one set and the end of
2463  // the other to verify that the code computing the intersection
2464  // correctly sorts its inputs.
2465 
2466  int special = 41;
2467 
2468  // Add to start of a3
2469  a3.push_back(special);
2470 
2471  for(int i = 0; (i*3) < 500; i++) {
2472  a3.push_back(i*3);
2473 
2474  if (i*5 < 500) {
2475  a5.push_back(i*5);
2476  }
2477  }
2478 
2479  // Add to end of a5
2480  a5.push_back(special);
2481 
2483 
2484  // X and not Y operation : multiples of 3 (or 41) that aren't
2485  // multiples of 5 (or 41).
2486 
2487  calc->Compute(CSeqDBIdSet::eAnd, a5, false);
2488 
2489  BOOST_REQUIRE(calc->IsPositive());
2490  CRef<CSeqDBGiList> and_not = calc->GetPositiveList();
2491 
2492  for(int i = 0; i < 500; i++) {
2493  bool is_3 = ((i % 3) == 0) || (i == special);
2494  bool is_5 = ((i % 5) == 0) || (i == special);
2495 
2496  if (is_3 && (! is_5)) {
2497  BOOST_REQUIRE(true == and_not->FindGi(i));
2498  } else {
2499  BOOST_REQUIRE(false == and_not->FindGi(i));
2500  }
2501  }
2502 }
2503 
2504 BOOST_AUTO_TEST_CASE(ComplexComputedList)
2505 {
2506 
2507  vector<int> m2; // multiples of 2
2508  vector<int> m3; // multiples of 3
2509  vector<int> m5; // multiples of 5
2510  vector<int> m7; // multiples of 7
2511 
2512  // The number 11 is added to the beginning and end of each list
2513  // to ensure that all lists are sorted and uniqued properly.
2514 
2515  int special = 11;
2516 
2517  m2.push_back(special);
2518  m3.push_back(special);
2519  m5.push_back(special);
2520  m7.push_back(special);
2521 
2522  for(int i = 0; i < 1000; i++) {
2523  if (! (i % 2)) {
2524  m2.push_back(i);
2525  }
2526  if (! (i % 3)) {
2527  m3.push_back(i);
2528  }
2529  if (! (i % 5)) {
2530  m5.push_back(i);
2531  }
2532  if (! (i % 7)) {
2533  m7.push_back(i);
2534  }
2535  }
2536 
2537  m2.push_back(special);
2538  m3.push_back(special);
2539  m5.push_back(special);
2540  m7.push_back(special);
2541 
2542  //----------------------------------------
2543  // c1: (m2 AND NOT m3) OR (m5 AND NOT m7)
2544 
2545  CSeqDBIdSet c1(m2, CSeqDBIdSet::eGi);
2546  c1.Compute(CSeqDBIdSet::eAnd, m3, false);
2547 
2548  CSeqDBIdSet m5_not_m7(m5, CSeqDBIdSet::eGi);
2549  m5_not_m7.Compute(CSeqDBIdSet::eAnd, m7, false);
2550 
2551  c1.Compute(CSeqDBIdSet::eOr, m5_not_m7);
2552 
2553  //----------------------------------------
2554  // c2: (NOT m2 OR m3) AND (m5 XOR m7)
2555 
2556  CSeqDBIdSet c2(m2, CSeqDBIdSet::eGi, false);
2557  c2.Compute(CSeqDBIdSet::eOr, m3);
2558 
2559  CSeqDBIdSet m5_xor_m7(m5, CSeqDBIdSet::eGi);
2560  m5_xor_m7.Compute(CSeqDBIdSet::eXor, m7);
2561 
2562  c2.Compute(CSeqDBIdSet::eAnd, m5_xor_m7);
2563 
2564  //----------------------------------------
2565  // c3: (m2 OR NOT m3) AND (NOT m5 OR NOT m7)
2566 
2567  CSeqDBIdSet c3(m2, CSeqDBIdSet::eGi);
2568  c3.Compute(CSeqDBIdSet::eOr, m3, false);
2569 
2570  BOOST_REQUIRE(! c3.IsPositive());
2571 
2572  CSeqDBIdSet not_m5_ornot_m7(m5, CSeqDBIdSet::eGi, false);
2573  not_m5_ornot_m7.Compute(CSeqDBIdSet::eOr, m7, false);
2574 
2575  BOOST_REQUIRE(! not_m5_ornot_m7.IsPositive());
2576  c3.Compute(CSeqDBIdSet::eAnd, not_m5_ornot_m7);
2577 
2578  BOOST_REQUIRE(! c3.IsPositive());
2579 
2580  // check lists.
2581 
2582  CRef<CSeqDBGiList> c1p, c2p;
2584 
2585  BOOST_REQUIRE(c1.IsPositive());
2586  BOOST_REQUIRE(c2.IsPositive());
2587  BOOST_REQUIRE(! c3.IsPositive());
2588 
2589  c1p = c1.GetPositiveList();
2590  c2p = c2.GetPositiveList();
2591  c3n = c3.GetNegativeList();
2592 
2593  for(int i = 0; i < 1000; i++) {
2594  bool d2(!(i%2)), d3(!(i%3)), d5(!(i%5)), d7(!(i%7));
2595 
2596  if (i == special) {
2597  d2 = d3 = d5 = d7 = true;
2598  }
2599 
2600  // c1: (m2 AND NOT m3) OR (m5 AND NOT m7)
2601  // c2: (NOT m2 OR m3) AND (m5 XOR m7)
2602  // c3: (m2 OR NOT m3) AND (NOT m5 OR NOT m7)
2603 
2604  bool in_c1 = ( d2 && !d3) || ( d5 && !d7);
2605  bool in_c2 = (!d2 || d3) && ( d5 != d7);
2606  bool in_c3 = ( d2 || !d3) && (!d5 || !d7);
2607 
2608  BOOST_REQUIRE_EQUAL(in_c1, c1p->FindGi(i));
2609  BOOST_REQUIRE_EQUAL(in_c2, c2p->FindGi(i));
2610  BOOST_REQUIRE_EQUAL(in_c3, ! c3n->FindGi(i));
2611  }
2612 }
2613 
2614 static bool s_DbHasOID(CSeqDB & db, int & count, int oid)
2615 {
2616  int oid2 = oid;
2617  bool have = db.CheckOrFindOID(oid) && (oid == oid2);
2618 
2619  if (have) {
2620  count++;
2621  }
2622 
2623  return have;
2624 }
2625 
2626 BOOST_AUTO_TEST_CASE(ComputedListFilter)
2627 {
2628 
2629  int v1[] = {
2630  46071115, 46071116, 46071117, 46071118, 46071119,
2631  46071120, 46071121, 46071122, 46071123, 46071124,
2632  46071125, 46071126, 46071127, 46071128, 46071129,
2633  46071130, 46071131, 46071132, 46071133, 46071134 };
2634 
2635  BOOST_REQUIRE((sizeof(v1)/sizeof(int)) == 20);
2636 
2637  vector<int> all(v1, v1 + 20);
2638  vector<int> mid(v1 + 5, v1 + 15);
2639 
2641  CSeqDBIdSet Mid(mid, CSeqDBIdSet::eGi);
2642  CSeqDBIdSet Neg(all, CSeqDBIdSet::eGi, false);
2643 
2644  CSeqDBIdSet TopBot(all, CSeqDBIdSet::eGi);
2645  TopBot.Compute(CSeqDBIdSet::eAnd, mid, false);
2646 
2647  // Compute inverse of TopBot, using a different sequence.
2648 
2649  CSeqDBIdSet NotTopBot(all, CSeqDBIdSet::eGi, false);
2650 
2651  NotTopBot.Compute(CSeqDBIdSet::eOr, mid);
2652 
2653  string nm = "data/seqn";
2655 
2656  CSeqDB seqn(nm, ty);
2657 
2658  CSeqDB db_A(nm, ty, All);
2659  CSeqDB db_M(nm, ty, Mid);
2660  CSeqDB db_N(nm, ty, Neg);
2661  CSeqDB db_TB(nm, ty, TopBot);
2662  CSeqDB db_NTB(nm, ty, NotTopBot);
2663 
2664  int A_count = 0;
2665  int M_count = 0;
2666  int N_count = 0;
2667  int TB_count = 0;
2668  int NTB_count = 0;
2669 
2670  for(int oid = 0; seqn.CheckOrFindOID(oid); oid++) {
2671  bool A_have = s_DbHasOID(db_A, A_count, oid);
2672  bool M_have = s_DbHasOID(db_M, M_count, oid);
2673  bool N_have = s_DbHasOID(db_N, N_count, oid);
2674  bool TB_have = s_DbHasOID(db_TB, TB_count, oid);
2675  bool NTB_have = s_DbHasOID(db_NTB, NTB_count, oid);
2676 
2677  BOOST_REQUIRE((! M_have) || A_have); // M -> A (implies)
2678  BOOST_REQUIRE(A_have != N_have); // A = ! N
2679  BOOST_REQUIRE((! TB_have) || A_have); // TB -> A
2680 
2681  BOOST_REQUIRE((!M_have) || (!N_have)); // M -> !N
2682  BOOST_REQUIRE((!M_have) || (!TB_have)); // M -> !TB
2683  BOOST_REQUIRE((!M_have) || NTB_have); // M -> NTB
2684 
2685  BOOST_REQUIRE((!N_have) || (!TB_have)); // N -> !TB
2686  BOOST_REQUIRE((!N_have) || NTB_have); // N -> NTB
2687 
2688  BOOST_REQUIRE(TB_have != NTB_have); // TB != NTB
2689  }
2690 
2691  int NSEQ = seqn.GetNumOIDs();
2692 
2693  BOOST_REQUIRE_EQUAL(NSEQ, 100);
2694 
2695  BOOST_REQUIRE_EQUAL(A_count, 20);
2696  BOOST_REQUIRE_EQUAL(M_count, 10);
2697  BOOST_REQUIRE_EQUAL(N_count, NSEQ-A_count);
2698  BOOST_REQUIRE_EQUAL(TB_count, A_count - M_count);
2699  BOOST_REQUIRE_EQUAL(NTB_count + TB_count, 100);
2700 
2701  CSeqDBIdSet idset_TB = db_TB.GetIdSet();
2702 
2703  BOOST_REQUIRE(! idset_TB.Blank());
2704 }
2705 
2706 BOOST_AUTO_TEST_CASE(SharedMemoryMaps)
2707 {
2708 
2709  CSeqDB seqdb1("nt", CSeqDB::eNucleotide);
2710  CSeqDB seqdb2("nt", CSeqDB::eNucleotide);
2711 
2712  const char *s1 = 0, *s2 = 0;
2713 
2714  seqdb1.GetSequence(0, & s1);
2715  seqdb2.GetSequence(0, & s2);
2716 
2717  try {
2718  BOOST_REQUIRE(string(s1) == string(s2));
2719  }
2720  catch(...) {
2721  if (s1)
2722  seqdb1.RetSequence(& s1);
2723  if (s2)
2724  seqdb2.RetSequence(& s2);
2725  throw;
2726  }
2727 
2728  if (s1)
2729  seqdb1.RetSequence(& s1);
2730  if (s2)
2731  seqdb2.RetSequence(& s2);
2732 }
2733 
2734 class CSeqIdList : public CSeqDBGiList {
2735 public:
2736  // Takes a NULL-terminated list of null-terminated strings. If these
2737  // start with '#' they are treated as GIs for the GI list; otherwise they
2738  // go in the Seq-id list.
2739  CSeqIdList(const char ** str)
2740  {
2741  for(const char ** p = str; *p; p++) {
2742  if ((*p)[0] == '#') {
2743  Uint4 g = (Uint4) atoi((*p) + 1);
2744  m_GisOids.push_back(GI_FROM(Uint4, g));
2745  } else {
2746  string acc(*p);
2747  m_SisOids.push_back(acc);
2748  }
2749  }
2750  }
2751 
2753  {
2754  }
2755 
2756  void Append(const char * p)
2757  {
2758  m_SisOids.push_back(string(p));
2759  }
2760 };
2761 
2763 {
2764 
2765  const char * str[] =
2766  { "BAR77217.1",
2767  "2J28_I",
2768  "P66272",
2769  "WP_003405746.1",
2770  "NP_200967.1",
2771  "MFY79158.1",
2772  "WP_002211004.1",
2773  "XP_645408.1",
2774  "RMS51295.1",
2775  NULL };
2776 
2777  CRef<CSeqIdList> ids(new CSeqIdList(str));
2778 
2779  BOOST_REQUIRE_EQUAL((int)ids->GetNumSis(), 9);
2780 
2781  // Check that all IDs are initially unresolved:
2782 
2783  for(int i = 0; i < ids->GetNumSis(); i++) {
2784  BOOST_REQUIRE(ids->GetSiOid(i).oid == -1);
2785  }
2786 
2787  // Check that SeqDB construction has resolved all IDs:
2788 
2789  CSeqDB db("nr", CSeqDB::eProtein, &*ids);
2790 
2791  for(int i = 0; i < ids->GetNumSis(); i++) {
2792  BOOST_CHECK_MESSAGE(ids->GetSiOid(i).oid != -1,
2793  "Seqid " << ids->GetSiOid(i).si << " is unresolved");
2794  }
2795 
2796  // Check that the set of returned ids is constrained to the same
2797  // size as the SeqIdList set.
2798 
2799  int k = 0;
2800 
2801  for(int i = 0; db.CheckOrFindOID(i); i++) {
2802  k += db.GetHdr(i)->Get().size();
2803  }
2804 
2805  BOOST_REQUIRE_EQUAL(k, ids->GetNumSis());
2806 }
2807 
2808 BOOST_AUTO_TEST_CASE(OidToGiLookup)
2809 {
2810  CSeqDB dbp("data/ranges/twenty", CSeqDB::eProtein);
2811  for(int oid = 0; dbp.CheckOrFindOID(oid); oid++) {
2812  TGi gi = dbp.GetSeqGI(oid);
2813  int the_oid;
2814  BOOST_REQUIRE( dbp.GiToOid(gi, the_oid));
2815  BOOST_REQUIRE_EQUAL(oid, the_oid);
2816  }
2817 
2818  CSeqDB dbn("data/seqn", CSeqDB::eNucleotide);
2819  for(int oid = 0; dbp.CheckOrFindOID(oid); oid++) {
2820  TGi gi = dbp.GetSeqGI(oid);
2821  int the_oid;
2822  BOOST_REQUIRE( dbp.GiToOid(gi, the_oid));
2823  BOOST_REQUIRE_EQUAL(oid, the_oid);
2824  }
2825 }
2826 
2827 
2828 BOOST_AUTO_TEST_CASE(SeqIdListAndGiList)
2829 {
2830 
2831  const char * str[] = {
2832  // Non-existant (fake):
2833  "ref|XP_12345.1|", // s0-2
2834  "#11223344",
2835  "gb|EAH98765.9|",
2836  "#123456", // g0,1
2837  "#3142007",
2838 
2839  // GIs found in volume but not volume list:
2840  "#38083732", // s3-5
2841  "#671595",
2842  "#43544756",
2843  "#45917153", // gi2,3
2844  "#15705575",
2845 
2846  // Non-GIs found in volume but not volume list:
2847  "ref|NP_912855.1|", // s6-10
2848  "gb|EAF49211.1|",
2849  "sp|Q63931|CCKR_CAVPO",
2850  "emb|CAE61105.1|",
2851  "gb|AAL05711.1|", // Note: same as "#15705575"
2852 
2853  // GIs Found in volume and volume list:
2854  "#28378617", // s11-13
2855  "#23474175",
2856  "#27364740",
2857  "#23113886", // gi4,5
2858  "#28563952",
2859 
2860  // Non-GIs Found in volume and volume list:
2861  "gb|AAP03339.1|", // s14-18
2862  "ref|NP_760268.1|",
2863  "ref|NP_817911.1|",
2864  "emb|CAD70761.1|",
2865  "gb|AAM45611.1|",
2866  NULL
2867  };
2868 
2869  CRef<CSeqIdList> ids(new CSeqIdList(str));
2870 
2871  // (Need to +1 for the terminating NULL.)
2872  BOOST_REQUIRE_EQUAL((int)ids->GetNumSis(), 12);
2873  BOOST_REQUIRE_EQUAL((int)ids->GetNumGis(), 13);
2874 
2875  // Check that all IDs are initially unresolved:
2876  int i;
2877 
2878  for(i = 0; i < ids->GetNumSis(); i++) {
2879  BOOST_REQUIRE(ids->GetSiOid(i).oid == -1);
2880  }
2881  for(i = 0; i < ids->GetNumGis(); i++) {
2882  BOOST_REQUIRE(ids->GetGiOid(i).oid == -1);
2883  }
2884 
2885  CSeqDB db("data/ranges/twenty", CSeqDB::eProtein, &*ids);
2886 
2887  // Check that SeqDB construction resolves needed GIs/Seq-ids, but does not
2888  // resolve fake ids; other ids can be resolved or not discretionally.
2889 
2890  for(i = 0; str[i]; i++) {
2891  bool found = false;
2892  int oid = -1;
2893 
2894  if (str[i][0] == '#') {
2895  int gi = atoi(str[i] + 1);
2896  found = ids->GiToOid(gi, oid);
2897  } else {
2898  string str_id = SeqDB_SimplifyAccession(str[i]);
2899  found = ids->SiToOid(str_id, oid);
2900  }
2901 
2902  BOOST_REQUIRE_EQUAL(found, true);
2903 
2904  if (i >= 0 && i < 4) {
2905  BOOST_REQUIRE_EQUAL(oid, -1);
2906  } else if (i >= 15 && i < 25) {
2907  if (oid == -1) {
2908  cout << "oid = -1, id=" << str[i] << endl;
2909  }
2910 
2911  BOOST_REQUIRE(oid != -1);
2912  }
2913  }
2914 
2915  // Set of Seq-ids that we want: the Seq-ids found in the deflines that are
2916  // the intersection of the deflines associated with the Seq-ids in each of
2917  // the user and volume GI lists.
2918 
2919  const char * inter[] = {
2920  // This is the set of all Seq-ids that should be found on iteration;
2921  // it includes all Seq-ids from the selected deflines. A defline is
2922  // selected if it has one or more GIs matching a database volume GI
2923  // list and one or more GIs or Seq-ids from the User GI List.
2924 
2925  "gi|28378617", "ref|NP_785509.1|",
2926  "gi|23474175", "ref|ZP_00129469.1|",
2927  "gi|27364740", "ref|NP_760268.1|",
2928  "gi|23113886", "ref|ZP_00099225.1|",
2929  "gi|28563952", "ref|NP_788261.1|",
2930  "gi|29788717", "gb|AAP03339.1|",
2931  "gi|29566344", "ref|NP_817911.1|",
2932  "gi|28950006", "emb|CAD70761.1|",
2933  "gi|21305377", "gb|AAM45611.1|",
2934  NULL
2935  };
2936 
2937  set<string> need;
2938 
2939  for(const char ** p = inter; *p; p++)
2940  need.insert(*p);
2941 
2942  // For each id found in iteration, verify that it is found in the "need"
2943  // list and then remove it from that list.
2944 
2945  for(int oid = 0; db.CheckOrFindOID(oid); oid++) {
2946  typedef list< CRef<CSeq_id> > TIds;
2947 
2948  TIds the_ids = db.GetSeqIDs(oid);
2949 
2950  ITERATE(TIds, iter, the_ids) {
2951  CRef<CSeq_id> seqid(*iter);
2952  string afs = seqid->AsFastaString();
2953  set<string>::iterator itr = need.find(afs);
2954  BOOST_REQUIRE(itr != need.end());
2955  need.erase(itr);
2956  }
2957  }
2958 
2959  // We should have emptied the 'need' set at this point.
2960 
2961  BOOST_REQUIRE(need.empty());
2962 }
2963 
2964 
2966 {
2967 
2968  CSeqDB db("data/empty", CSeqDB::eProtein);
2969 
2970  BOOST_REQUIRE_EQUAL(db.GetNumSeqs(), 0);
2971  BOOST_REQUIRE_EQUAL(db.GetNumOIDs(), 0);
2972  BOOST_REQUIRE_EQUAL((string)db.GetTitle(), string("empty test database"));
2973 
2974  BOOST_REQUIRE_THROW(db.GetSeqLength(0), CSeqDBException);
2975  BOOST_REQUIRE_THROW(db.GetSeqLengthApprox(0), CSeqDBException);
2976  BOOST_REQUIRE_THROW(db.GetHdr(0), CSeqDBException);
2977 
2978  map<TGi, TTaxId> gi_to_taxid;
2979  vector<TTaxId> taxids;
2980  vector<TGi> gis;
2981 
2982  BOOST_REQUIRE_THROW(db.GetTaxIDs(0, gi_to_taxid), CSeqDBException);
2983  BOOST_REQUIRE_THROW(db.GetTaxIDs(0, taxids), CSeqDBException);
2984  BOOST_REQUIRE_THROW(db.GetBioseq(0), CSeqDBException);
2985  BOOST_REQUIRE_THROW(db.GetBioseqNoData(0, 129295), CSeqDBException);
2986  BOOST_REQUIRE_THROW(db.GetBioseq(0, 129295), CSeqDBException);
2987 
2988  const char * buffer = 0;
2989  char * ncbuffer = 0;
2990 
2991  BOOST_REQUIRE_THROW(db.GetSequence(0, & buffer), CSeqDBException);
2992  BOOST_REQUIRE_THROW(db.GetAmbigSeq(0, & buffer, kSeqDBNuclBlastNA8),
2994  BOOST_REQUIRE_THROW(db.GetAmbigSeq(0, & buffer, kSeqDBNuclBlastNA8, 10, 20),
2996  BOOST_REQUIRE_THROW(db.GetAmbigSeqAlloc(0,
2997  & ncbuffer,
3000 
3001  // Don't check CSeqDB::RetSequence, because it uses an assert(),
3002  // which is more helpful from a debugging POV.
3003 
3004  BOOST_REQUIRE_THROW(db.GetSeqIDs(0), CSeqDBException);
3005  BOOST_REQUIRE_THROW(db.GetGis(0, gis), CSeqDBException);
3006  BOOST_REQUIRE_EQUAL(db.GetSequenceType(), CSeqDB::eProtein);
3007  BOOST_REQUIRE_EQUAL((string)db.GetTitle(), string("empty test database"));
3008  BOOST_REQUIRE_EQUAL((string)db.GetDate(), string("Mar 19, 2007 11:38 AM"));
3009  BOOST_REQUIRE_EQUAL(db.GetNumSeqs(), 0);
3010  BOOST_REQUIRE_EQUAL(db.GetNumOIDs(), 0);
3011  BOOST_REQUIRE_EQUAL(db.GetTotalLength(), Uint8(0));
3012  BOOST_REQUIRE_EQUAL(db.GetVolumeLength(), Uint8(0));
3013 
3014  int oid_count = 0;
3015  Uint8 seq_total = 0;
3016 
3017  BOOST_REQUIRE_NO_THROW(db.GetTotals(CSeqDB::eUnfilteredAll,
3018  & oid_count,
3019  & seq_total,
3020  false));
3021 
3022  BOOST_REQUIRE_EQUAL(oid_count, 0);
3023  BOOST_REQUIRE_EQUAL(seq_total, Uint8(0));
3024 
3025  BOOST_REQUIRE_EQUAL(db.GetMaxLength(), 0);
3026  BOOST_REQUIRE_NO_THROW(db.Begin());
3027 
3028  int oid = 0;
3029 
3030  BOOST_REQUIRE_EQUAL(false, db.CheckOrFindOID(oid));
3031 
3032  int begin(0), end(0);
3033  vector<int> oids;
3034 
3036  BOOST_REQUIRE_NO_THROW(ol_type = db.GetNextOIDChunk(begin, end, 100, oids, NULL));
3037 
3038  if (ol_type == CSeqDB::eOidList) {
3039  BOOST_REQUIRE_EQUAL(size_t(0), oids.size());
3040  } else {
3041  BOOST_REQUIRE_EQUAL(begin, end);
3042  }
3043 
3044  BOOST_REQUIRE_NO_THROW(db.ResetInternalChunkBookmark());
3045  BOOST_REQUIRE_EQUAL((string)db.GetDBNameList(), string("data/empty"));
3046  BOOST_REQUIRE_EQUAL(db.GetGiList(), (CSeqDBGiList*)NULL);
3047 
3048  int pig(123);
3049  TGi gi = 129295;
3050  string acc("P01013");
3051  CSeq_id seqid("sp|P01013|OVALX_CHICK");
3052 
3053  // This looks asymmetric, but it's logically consistent. Looking
3054  // up a non-existent GI, PIG, or Seq-id always returns a failure,
3055  // but never throws an exception. Since OIDs must be in range,
3056  // the OidToXyz functions will all throw exceptions (there are no
3057  // valid OIDs for an empty db).
3058 
3059  BOOST_REQUIRE_THROW(db.OidToPig(oid, pig), CSeqDBException);
3060  BOOST_REQUIRE_THROW(db.OidToGi(oid, gi), CSeqDBException);
3061 
3062  BOOST_REQUIRE_EQUAL(false, db.PigToOid(pig, oid));
3063  BOOST_REQUIRE_EQUAL(false, db.GiToOid(gi, oid));
3064  BOOST_REQUIRE_EQUAL(false, db.GiToPig(gi, pig));
3065  BOOST_REQUIRE_EQUAL(false, db.PigToGi(pig, gi));
3066  BOOST_REQUIRE_NO_THROW(db.AccessionToOids(acc, oids));
3067  BOOST_REQUIRE(oids.size() == 0);
3068  BOOST_REQUIRE_NO_THROW(db.SeqidToOids(seqid, oids));
3069  BOOST_REQUIRE(oids.size() == 0);
3070  BOOST_REQUIRE_EQUAL(false, db.SeqidToOid(seqid, oid));
3071 
3072  Uint8 residue(12345);
3073 
3074  // GetOidAtOffset() must throw. The specified starting OID must
3075  // be valid (and of course can't be, for an empty DB.)
3076 
3077  BOOST_REQUIRE_THROW(db.GetOidAtOffset(0, residue), CSeqDBException);
3078  BOOST_REQUIRE(db.GiToBioseq(gi).Empty());
3079  BOOST_REQUIRE(db.PigToBioseq(pig).Empty());
3080  BOOST_REQUIRE(db.SeqidToBioseq(seqid).Empty());
3081 
3082  vector<string> paths1;
3083  vector<string> paths2;
3084 
3085  BOOST_REQUIRE_NO_THROW(CSeqDB::FindVolumePaths("data/empty",
3087  paths1));
3088 
3089  BOOST_REQUIRE_NO_THROW(db.FindVolumePaths(paths2));
3090 
3091  BOOST_REQUIRE_EQUAL(paths1.size(), size_t(1));
3092  BOOST_REQUIRE_EQUAL(paths2.size(), size_t(1));
3093  BOOST_REQUIRE_EQUAL((string)paths1[0], (string)paths2[0]);
3094 
3095  // The end OID is higher than GetNumOIDs(), but as stated in the
3096  // documentation, this function silently adjusts the end value to
3097  // the number of OIDs if it is out of range.
3098 
3099  BOOST_REQUIRE_NO_THROW(db.SetIterationRange(0, 100));
3100 
3102  BOOST_REQUIRE_NO_THROW(db.GetAliasFileValues(afv));
3103 
3104  int taxid(57176);
3105 
3106  // An empty database should still be able to look up the
3107  // vociferans taxid.
3108 
3110  BOOST_REQUIRE_NO_THROW(db.GetTaxInfo(taxid, info));
3111 
3112  BOOST_REQUIRE_THROW(db.GetSeqData(0, 10, 20), CSeqDBException);
3113 }
3114 
3115 BOOST_AUTO_TEST_CASE(GetSeqData_Protein)
3116 {
3117  CSeqDB db("data/seqp", CSeqDB::eProtein);
3118  CRef<CSeq_data> sd = db.GetSeqData(0, 10, 20);
3119  BOOST_REQUIRE(!sd.Empty());
3120 }
3121 
3122 BOOST_AUTO_TEST_CASE(GetSeqData_Nucleotide)
3123 {
3124  CSeqDB db("data/seqn", CSeqDB::eNucleotide);
3125  CRef<CSeq_data> sd = db.GetSeqData(0, 10, 20);
3126  BOOST_REQUIRE(!sd.Empty());
3127 }
3128 
3130 {
3131 
3132  CSeqDB db56("data/f555 data/f556", CSeqDB::eNucleotide);
3133  CSeqDB db65("data/f556 data/f555", CSeqDB::eNucleotide);
3134 
3135  for(int di = 0; di < 2; di++) {
3136  CSeqDB & db = di ? db65 : db56;
3137 
3138  for(int oi = 0; oi < 2; oi++) {
3139  list< CRef<CSeq_id> > ids = db.GetSeqIDs(oi);
3140 
3141  int count = 0;
3142  int oid = -1;
3143 
3144  while(! ids.empty()) {
3145  const CSeq_id & id = *ids.front();
3146 
3147  if (id.Which() == CSeq_id::e_General &&
3148  id.GetGeneral().GetDb() == "BL_ORD_ID") {
3149 
3150  oid = id.GetGeneral().GetTag().GetId();
3151  count ++;
3152  }
3153 
3154  ids.pop_front();
3155  }
3156 
3157  BOOST_REQUIRE(count == 1);
3158  BOOST_REQUIRE(oid == oi);
3159  }
3160  }
3161 }
3162 
3163 BOOST_AUTO_TEST_CASE(GetSequenceAsString)
3164 {
3165 
3166  CSeqDB N("data/seqn", CSeqDB::eNucleotide);
3167  CSeqDB P("data/seqp", CSeqDB::eProtein);
3168 
3169  string nucl, prot;
3170 
3171  TGi nucl_gi = 46071107;
3172  string nucl_str = ("AAGCTCTTCATTGATGGTAGAGAGCCTATTAACAGGCAAC"
3173  "AGTCAATGCTCCAAAGTCCAAACAAGATTACCTGTGCAAA"
3174  "GAACTTGCAGTGTAACAAACCCCNTTCACGGCCAGAAGTA"
3175  "TTTGCAACAATGTTGAAAGTCCTTCTGGCAGAGGAGGAGT"
3176  "CTAAT");
3177 
3178  TGi prot_gi = 43914529;
3179  string prot_str = "MINKSGYEAKYKKSIKNNEEFWRKEGKRITWIKPYKKIKNVRYS";
3180 
3181  int nucl_oid(-1), prot_oid(-1);
3182 
3183  N.GiToOid(nucl_gi, nucl_oid);
3184  P.GiToOid(prot_gi, prot_oid);
3185 
3186  string nstr, pstr;
3187  N.GetSequenceAsString(nucl_oid, nstr);
3188  P.GetSequenceAsString(prot_oid, pstr);
3189 
3190  BOOST_REQUIRE_EQUAL((string)nstr, (string)nucl_str);
3191  BOOST_REQUIRE_EQUAL((string)pstr, (string)prot_str);
3192 }
3193 
3195 {
3196 
3197  // Test both constructors; make sure sizes are equal and non-zero.
3198 
3199  CSeqDB local("data/totals", CSeqDB::eNucleotide);
3200  CSeqDB seqn("data/seqn", CSeqDB::eNucleotide);
3201 
3202  BOOST_REQUIRE_EQUAL((int)local.GetTotalLength(), 12345);
3203  BOOST_REQUIRE_EQUAL((int)local.GetTotalLengthStats(), 23456);
3204  BOOST_REQUIRE_EQUAL((int)local.GetNumSeqs(), 123);
3205  BOOST_REQUIRE_EQUAL((int)local.GetNumSeqsStats(), 234);
3206  BOOST_REQUIRE_EQUAL((int)seqn.GetNumSeqsStats(), 0);
3207  BOOST_REQUIRE_EQUAL((int)seqn.GetTotalLengthStats(), 0);
3208 }
3209 
3211 public:
3212  CNegativeIdList(const int * ids, bool use_tis)
3213  {
3214  while(*ids) {
3215  if (use_tis) {
3216  m_Tis.push_back(*ids);
3217  } else {
3218  m_Gis.push_back(*ids);
3219  }
3220  ++ ids;
3221  }
3222  }
3223 
3225  {
3226  }
3227 };
3228 
/// Add @a c to the counter stored under @a key and to the running
/// total, dropping the map entry once its counter is back at zero.
///
/// @param m      Per-key counters; a missing key counts as zero.
/// @param key    Key whose counter is adjusted.
/// @param c      Signed amount to add.
/// @param total  Running sum of all amounts applied; also gets @a c.
static void s_ModifyMap(map<int,int> & m, int key, int c, int & total)
{
    total += c;

    // insert() leaves an existing entry alone and otherwise creates
    // the key with a zero counter; either way we get its position.
    map<int,int>::iterator slot =
        m.insert(map<int,int>::value_type(key, 0)).first;

    slot->second += c;

    if (slot->second == 0) {
        m.erase(slot);
    }
}
3239 
3240 static void s_MapAllGis(CSeqDB & db,
3241  map<int,int> & m,
3242  int change,
3243  int & total)
3244 {
3245  total = 0;
3246  vector<TGi> gis;
3247 
3248  for(int oid = 0; db.CheckOrFindOID(oid); oid++) {
3249  gis.clear();
3250 
3251  db.GetGis(oid, gis, false);
3252 
3253  ITERATE(vector<TGi>, iter, gis) {
3254  s_ModifyMap(m, GI_TO(int, *iter), change, total);
3255  }
3256  }
3257 }
3258 
3259 BOOST_AUTO_TEST_CASE(NegativeGiList)
3260 {
3261 
3262  // 15 ids from the middle of the seqp database.
3263 
3264  int gis[] = {
3265  23058829,
3266  9910844,
3267  23119763,
3268  7770223,
3269  15705575,
3270  9651810,
3271  27364740,
3272  23113886,
3273  21593385,
3274  15217498,
3275  39592435,
3276  22126577,
3277  44281419,
3278  14325807,
3279  15605992,
3280  0
3281  };
3282 
3283  int seqp_gis = 146;
3284  int nlist_gis = 15;
3285 
3286  CRef<CSeqDBNegativeList> neg(new CNegativeIdList(gis, false));
3287 
3288  CSeqDB have_got("data/seqp", CSeqDB::eProtein);
3289  CSeqDB have_not("data/seqp", CSeqDB::eProtein, &* neg);
3290 
3291  BOOST_REQUIRE_EQUAL((int)have_got.GetTotalLength(), 26945);
3292  BOOST_REQUIRE_EQUAL((int)have_got.GetNumSeqs(), 100);
3293 
3294  // From 100 original OIDs, 15 GIs were removed, but 4 of the OIDs
3295  // had multiple deflines, leaving a final count of 89 OIDs.
3296 
3297  BOOST_REQUIRE_EQUAL((int)have_not.GetTotalLength(), 23602);
3298  BOOST_REQUIRE_EQUAL((int)have_not.GetNumSeqs(), 89);
3299 
3300  map<int, int> id_pop;
3301 
3302  int total = 0;
3303 
3304  // Add all 'negated' IDs to the map; verify that the map size is
3305  // correct.
3306 
3307  for(int * idp = gis; *idp; ++idp) {
3308  s_ModifyMap(id_pop, *idp, 1, total);
3309  }
3310 
3311  BOOST_REQUIRE_EQUAL((int) id_pop.size(), nlist_gis);
3312  BOOST_REQUIRE_EQUAL(total, nlist_gis);
3313 
3314  // Add all filtered IDs to the map; verify that the map size is
3315  // correct and that the total change is seqp_gis-nlist_gis
3316 
3317  s_MapAllGis(have_not, id_pop, 1, total);
3318 
3319  BOOST_REQUIRE_EQUAL((int) id_pop.size(), seqp_gis);
3320  BOOST_REQUIRE_EQUAL(total, seqp_gis-nlist_gis);
3321 
3322  // Remove all unfiltered IDs from the map; the result should be a
3323  // negative change of (the number of gis in seqp) and cause the
3324  // map to be empty. This verifies that the negative GI list and
3325  // the set of GIs in the filtered DB are an exact partition of the
3326  // unfiltered database.
3327 
3328  s_MapAllGis(have_got, id_pop, -1, total);
3329 
3330  BOOST_REQUIRE_EQUAL((int) id_pop.size(), 0);
3331  BOOST_REQUIRE_EQUAL(total, -seqp_gis);
3332 
3333  // One last thing: since there is some non-redundancy in the seqp
3334  // database, I want to check that it affects the header data that
3335  // is reported from SeqDB::GetHdr().
3336 
3337  TGi gi1 = 27360885;
3338  int oid1 = -1;
3339 
3340  bool ok = have_got.GiToOid(gi1, oid1);
3341  BOOST_REQUIRE(ok);
3342 
3343  list< CRef<CSeq_id> > got_ids = have_got.GetSeqIDs(oid1);
3344  list< CRef<CSeq_id> > not_ids = have_not.GetSeqIDs(oid1);
3345 
3346  int diff = 0;
3347 
3348  ITERATE(list< CRef<CSeq_id> >, iter, got_ids) {
3349  diff ++;
3350  }
3351  ITERATE(list< CRef<CSeq_id> >, iter, not_ids) {
3352  diff --;
3353  }
3354  BOOST_REQUIRE_EQUAL(diff, 2);
3355 }
3356 
3357 BOOST_AUTO_TEST_CASE(NegativeListNt)
3358 {
3359 
3360  int gis[] = {
3361  555, 0
3362  };
3363 
3364  CRef<CSeqDBNegativeList> neg(new CNegativeIdList(gis, false));
3365 
3366  string db = "nt";
3367 
3368  CSeqDB have_got(db, CSeqDB::eNucleotide);
3369  CSeqDB have_not(db, CSeqDB::eNucleotide, &* neg);
3370 
3371  BOOST_REQUIRE_EQUAL(have_got.GetNumSeqs(), have_not.GetNumSeqs() + 1);
3372 
3373  int oid = -1;
3374  bool found = have_got.GiToOid(gis[0], oid);
3375  BOOST_REQUIRE(found);
3376 
3377  vector<TGi> gis_w, gis_wo;
3378  have_got.GetGis(oid, gis_w);
3379  have_not.GetGis(oid, gis_wo);
3380 
3381  // Check that exactly 1 GI was removed.
3382 
3383  int count_w = (int) gis_w.size();
3384  int count_wo = (int) gis_wo.size();
3385  BOOST_REQUIRE_EQUAL(count_w, (count_wo+1));
3386 }
3387 
3388 BOOST_AUTO_TEST_CASE(NegativeListSwissprot)
3389 {
3390 
3391  // 1 id from the swissprot database.
3392  vector<unsigned int> pigs;
3393  pigs.push_back(281224);
3394 
3396  neg->SetPigList(pigs);
3397 
3398  string db = "swissprot";
3399  const int len = 134;
3400 
3401  CSeqDB have_got(db, CSeqDB::eProtein);
3402  CSeqDB have_not(db, CSeqDB::eProtein, &* neg);
3403 
3404  BOOST_REQUIRE_EQUAL(have_got.GetTotalLength(), have_not.GetTotalLength() + len);
3405  BOOST_REQUIRE_EQUAL(have_got.GetNumSeqs(), have_not.GetNumSeqs() +1);
3406 
3407  int oid = -1;
3408  bool found = have_got.PigToOid(pigs[0], oid);
3409  BOOST_REQUIRE(found);
3410 
3411  int pig_w;
3412  have_got.OidToPig(oid, pig_w);
3413  BOOST_REQUIRE_EQUAL((unsigned int)pig_w, pigs[0]);
3414 }
3415 
3417 {
3418 
3419  CSeqDBExpert seqp("data/seqp", CSeqDB::eProtein);
3420  CSeqDBExpert seqn("data/seqn", CSeqDB::eNucleotide);
3421 
3422  int oid(0);
3423 
3424  for(oid = 0; oid < 10 && seqp.CheckOrFindOID(oid); oid++) {
3425  unsigned h = seqp.GetSequenceHash(oid);
3426 
3427  vector<int> oids;
3428  seqp.HashToOids(h, oids);
3429 
3430  bool found = false;
3431 
3432  ITERATE(vector<int>, iter, oids) {
3433  if (*iter == oid) {
3434  found = true;
3435  break;
3436  }
3437  }
3438 
3439  BOOST_REQUIRE(found);
3440  }
3441 
3442  for(oid = 0; oid < 10 && seqn.CheckOrFindOID(oid); oid++) {
3443  unsigned h = seqn.GetSequenceHash(oid);
3444 
3445  vector<int> oids;
3446  seqn.HashToOids(h, oids);
3447 
3448  bool found = false;
3449 
3450  ITERATE(vector<int>, iter, oids) {
3451  if (*iter == oid) {
3452  found = true;
3453  break;
3454  }
3455  }
3456 
3457  BOOST_REQUIRE(found);
3458  }
3459 }
3460 
3461 #if 0
3462 BOOST_AUTO_TEST_CASE(TraceIdLookup)
3463 {
3464 
3465  vector<string> ids;
3466  NStr::Tokenize("1234 2468 4936 9872 19744 1234000 "
3467  "1234000000 1234000000000 1234000000000000",
3468  " ", ids);
3469 
3470  string sides("B44448888");
3471 
3472  CSeqDB db4("data/short-tis", CSeqDB::eNucleotide);
3473  CSeqDB db8("data/long-tis", CSeqDB::eNucleotide);
3474 
3475  BOOST_REQUIRE_EQUAL(sides.size(), ids.size());
3476 
3477  for(size_t i = 0; i < ids.size(); i++) {
3478  bool is4(false), is8(false);
3479 
3480  switch(sides[i]) {
3481  case 'B':
3482  is4 = true;
3483  is8 = true;
3484  break;
3485 
3486  case '4':
3487  is4 = true;
3488  break;
3489 
3490  case '8':
3491  is8 = true;
3492  break;
3493  }
3494 
3495  string idstr = ids[i];
3496  Int8 idnum = NStr::StringToInt8(idstr);
3497 
3498  int oid = -2;
3499 
3500  bool have = db4.TiToOid(idnum, oid);
3501  BOOST_REQUIRE_EQUAL(is4, have);
3502  BOOST_REQUIRE_EQUAL(is4, (oid >= 0));
3503 
3504  have = db8.TiToOid(idnum, oid);
3505  BOOST_REQUIRE_EQUAL(is8, have);
3506  BOOST_REQUIRE_EQUAL(is8, (oid >= 0));
3507 
3508  CSeq_id seqid(string("gnl|ti|") + idstr);
3509  vector<int> oids;
3510 
3511  db4.SeqidToOids(seqid, oids);
3512  BOOST_REQUIRE_EQUAL(is4, (oids.size() == 1));
3513 
3514  db8.SeqidToOids(seqid, oids);
3515  BOOST_REQUIRE_EQUAL(is8, (oids.size() == 1));
3516  }
3517 }
3518 #endif
3519 
3520 BOOST_AUTO_TEST_CASE(FilteredHeaders)
3521 {
3522 
3523  CSeqDB p1("nr", CSeqDB::eProtein);
3524  CSeqDB p2("refseq_protein", CSeqDB::eProtein);
3525 
3526  // Use a pig in case of GI evaporation.
3527 
3528  int pig = 1401930;
3529 
3530  int oid1(-1), oid2(-1);
3531  bool okay1 = p1.PigToOid(pig, oid1);
3532  bool okay2 = p2.PigToOid(pig, oid2);
3533 
3534  BOOST_REQUIRE(okay1);
3535  BOOST_REQUIRE(okay2);
3536  BOOST_REQUIRE(oid1 > 0);
3537  BOOST_REQUIRE(oid2 > 0);
3538  BOOST_REQUIRE(oid1 == oid2); // same underlying volumes -> same OID
3539 
3540  int size1 = p1.GetHdr(oid1)->Get().size();
3541  int size2 = p2.GetHdr(oid2)->Get().size();
3542 
3543  // Currently there are 15 matching GIs in nr, and only one in
3544  // refseq_protein. This can drift over time (in either direction)
3545  // so the criteria here are less strict; I'm assuming we will gain
3546  // at least one redundant GI for each GI that evaporates. I'm
3547  // also assuming that at least 5 more redundant GIs will exist
3548  // than we have proteins in refseq for this PIG.
3549 
3550  BOOST_CHECK_NE(0, size1);
3551  BOOST_CHECK_NE(0, size2);
3552  BOOST_CHECK_GE(size1, 14);
3553  BOOST_CHECK_GT(size1, (size2 + 5));
3554 }
3555 
3556 static void s_CheckIdLookup(CSeqDB & db, const string & acc, size_t exp_oids, size_t exp_size)
3557 {
3558  list<string> ids;
3560 
3561  vector<int> oids;
3562 
3563  ostringstream fasta;
3564  CFastaOstream fos(fasta);
3565  fos.SetWidth(80);
3566 
3567  ITERATE(list<string>, iter, ids) {
3568  // For each ID, check that:
3569  // 1. SeqDB can find it.
3570 
3571  vector<int> tmp_oids;
3572  db.AccessionToOids(*iter, tmp_oids);
3573 
3574  BOOST_REQUIRE_MESSAGE(tmp_oids.size(),
3575  string("No OIDs found for ")+(*iter));
3576 
3577  oids.insert(oids.end(), tmp_oids.begin(), tmp_oids.end());
3578  }
3579 
3580  // sort/unique
3581 
3582  sort(oids.begin(), oids.end());
3583  oids.erase(unique(oids.begin(), oids.end()), oids.end());
3584 
3585  ITERATE(vector<int>, iter, oids) {
3586  fos.Write(*db.GetBioseq(*iter));
3587  }
3588 
3589  string all_fasta = fasta.str();
3590  string msg = string("Error for accession: ") + acc;
3591 
3592  BOOST_REQUIRE_MESSAGE(all_fasta.size() == exp_size, msg);
3593  BOOST_REQUIRE_MESSAGE(exp_oids == oids.size(), msg);
3594 }
3595 
3597 {
3598  CSeqDB db("data/nrshort.old", CSeqDB::eUnknown);
3599 
3600  s_CheckIdLookup(db, "gi|67472376", 1, 6590);
3601  s_CheckIdLookup(db, "sp|P0A7U1|RS18_SALTI", 1, 6590);
3602  s_CheckIdLookup(db, "sp||RS18_SALTI", 1, 6590);
3603  s_CheckIdLookup(db, "sp|P0A7U1|", 1, 6590);
3604  s_CheckIdLookup(db, "P0A7U1", 1, 6590);
3605  s_CheckIdLookup(db, "RS18_SALTI", 1, 6590);
3606  s_CheckIdLookup(db, "ref|NP_313205.1|", 1, 6590);
3607  s_CheckIdLookup(db, "NP_313205.1", 1, 6590);
3608  s_CheckIdLookup(db, "pir||AI1052", 1, 6590);
3609  s_CheckIdLookup(db, "AI1052", 1, 6590);
3610  s_CheckIdLookup(db, "NP_268346, XP_642837.1, 30262378, ABD21303.1", 4, 5411);
3611  s_CheckIdLookup(db, "pdb|1VS7|R", 1, 6590);
3612  s_CheckIdLookup(db, "1VS7", 1, 6590);
3613  s_CheckIdLookup(db, "prf||2202317B", 1, 628);
3614  s_CheckIdLookup(db, "2202317B", 1, 628);
3615  s_CheckIdLookup(db, "tr|Q4QBU6|Q4QBU6_LEIMA", 1, 609);
3616  s_CheckIdLookup(db, "Q4QBU6", 1, 609);
3617 }
3618 
3620 {
3621  CSeqDB db("data/nrshort", CSeqDB::eUnknown);
3622 
3623  s_CheckIdLookup(db, "gi|67472376", 1, 6590);
3624  s_CheckIdLookup(db, "sp|P0A7U1|RS18_SALTI", 1, 6590);
3625  s_CheckIdLookup(db, "sp||RS18_SALTI", 1, 6590);
3626  s_CheckIdLookup(db, "sp|P0A7U1|", 1, 6590);
3627  s_CheckIdLookup(db, "P0A7U1", 1, 6590);
3628  s_CheckIdLookup(db, "RS18_SALTI", 1, 6590);
3629  s_CheckIdLookup(db, "ref|NP_313205.1|", 1, 6590);
3630  s_CheckIdLookup(db, "ref|NP_313205|", 1, 6590);
3631  s_CheckIdLookup(db, "NP_313205.1", 1, 6590);
3632  s_CheckIdLookup(db, "pir||AI1052", 1, 6590);
3633  s_CheckIdLookup(db, "AI1052", 1, 6590);
3634  s_CheckIdLookup(db, "NP_268346, XP_642837.1, 30262378, ABD21303.1", 4, 5411);
3635  s_CheckIdLookup(db, "pdb|1VS7|R", 1, 6590);
3636  s_CheckIdLookup(db, "1VS7", 1, 6590);
3637  s_CheckIdLookup(db, "prf||2202317B", 1, 628);
3638  s_CheckIdLookup(db, "2202317B", 1, 628);
3639  s_CheckIdLookup(db, "tr|Q4QBU6|Q4QBU6_LEIMA", 1, 609);
3640  s_CheckIdLookup(db, "Q4QBU6", 1, 609);
3641  s_CheckIdLookup(db, "15127771", 1, 345);
3642  s_CheckIdLookup(db, "aaa15484", 1, 345);
3643 }
3644 
3646 {
3647  CSeqDB db("data/ntshort.old", CSeqDB::eUnknown);
3648 
3649  s_CheckIdLookup(db, "gi|2695850", 1, 683);
3650  s_CheckIdLookup(db, "2695850", 1, 683);
3651  s_CheckIdLookup(db, "emb|Y13260.1|ABY13260", 1, 683);
3652  s_CheckIdLookup(db, "emb||ABY13260", 1, 683);
3653  s_CheckIdLookup(db, "emb|Y13260.1|", 1, 683);
3654  s_CheckIdLookup(db, "gb|Y13260.1|ABY13260", 1, 683);
3655  s_CheckIdLookup(db, "Y13260.1", 1, 683);
3656  s_CheckIdLookup(db, "ABY13260", 1, 683);
3657  s_CheckIdLookup(db, "emb|Y13260|ABY13260", 1, 683);
3658  s_CheckIdLookup(db, "emb|Y13260|", 1, 683);
3659  s_CheckIdLookup(db, "gb|Y13260|ABY13260", 1, 683);
3660  s_CheckIdLookup(db, "Y13260", 1, 683);
3661  s_CheckIdLookup(db, "gnl|ti|43939557", 1, 972);
3662 }
3663 
3665 {
3666  CSeqDB db("data/ntshort", CSeqDB::eUnknown);
3667 
3668  s_CheckIdLookup(db, "gi|2695850", 1, 683);
3669  s_CheckIdLookup(db, "2695850", 1, 683);
3670  s_CheckIdLookup(db, "emb|Y13260.1|ABY13260", 1, 683);
3671  s_CheckIdLookup(db, "emb||ABY13260", 1, 683);
3672  s_CheckIdLookup(db, "emb|Y13260.1|", 1, 683);
3673  s_CheckIdLookup(db, "gb|Y13260.1|ABY13260", 1, 683);
3674  s_CheckIdLookup(db, "Y13260.1", 1, 683);
3675  s_CheckIdLookup(db, "ABY13260", 1, 683);
3676  s_CheckIdLookup(db, "emb|Y13260|ABY13260", 1, 683);
3677  s_CheckIdLookup(db, "emb|Y13260|", 1, 683);
3678  s_CheckIdLookup(db, "gb|Y13260|ABY13260", 1, 683);
3679  s_CheckIdLookup(db, "Y13260", 1, 683);
3680  s_CheckIdLookup(db, "gnl|ti|43939557", 1, 972);
3681 }
3682 
3683 BOOST_AUTO_TEST_CASE(PdbIdWithChain)
3684 {
3685 
3686  CSeqDB nr("nr", CSeqDB::eProtein);
3687 
3688  string acc("1QCF_A");
3689 
3690  vector<int> oids;
3691  nr.AccessionToOids(acc, oids);
3692 
3693  BOOST_REQUIRE(oids.size());
3694 }
3695 
3696 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
3697  (!defined(NCBI_COMPILER_MIPSPRO)) )
3698 BOOST_AUTO_TEST_CASE(UserDefinedColumns)
3699 {
3700 
3701  string fname("data/user-column");
3702  string vname("data/user-column-db");
3703  const string title("comedy");
3704 
3705  CSeqDBExpert db(vname, CSeqDB::eProtein);
3706  CSeqDB_ColumnReader CR(fname, 'a');
3707 
3708  BOOST_REQUIRE_EQUAL(CR.GetTitle(), title);
3709 
3710  // Meta Data
3711 
3712  vector<string> columns;
3713  db.ListColumns(columns);
3714 
3715  BOOST_REQUIRE_EQUAL((int)columns.size(), 1);
3716  BOOST_REQUIRE_EQUAL(title, columns[0]);
3717 
3718  int comedy_column = db.GetColumnId(title);
3719  BOOST_REQUIRE(comedy_column >= 0);
3720 
3721  const map<string, string> & metadata_db =
3722  db.GetColumnMetaData(comedy_column);
3723 
3724  const map<string, string> & metadata_user =
3725  CR.GetMetaData();
3726 
3727  BOOST_REQUIRE_EQUAL((int)metadata_db.size(), 3);
3728  BOOST_REQUIRE_EQUAL(metadata_db.find("created-by")->second, string("unit test"));
3729  BOOST_REQUIRE_EQUAL(metadata_db.find("purpose")->second, string("none"));
3730  BOOST_REQUIRE_EQUAL(metadata_db.find("format")->second, string("text"));
3731 
3732  // Meta data for both should be identical.
3733  BOOST_REQUIRE(metadata_db == metadata_user);
3734 
3735  // You can also just find the value for a given key. This method
3736  // does not determine if the requested key has different values in
3737  // different columns, nor does it distinguish between keys which
3738  // are not specified versus keys which exist but have an empty
3739  // string as their value.
3740 
3741  BOOST_REQUIRE(db.GetColumnValue(comedy_column, "format") == "text");
3742  BOOST_REQUIRE(db.GetColumnValue(comedy_column, "duck soup") == "");
3743  BOOST_REQUIRE(CR.GetValue("format") == "text");
3744  BOOST_REQUIRE(CR.GetValue("who's on first") == "");
3745 
3746  // This code gets a more list of data, namely a map from the
3747  // volume name to the set of properties found in each volume.
3748 
3749  vector<string> volumes;
3750  db.FindVolumePaths(volumes);
3751 
3752  const map<string, string> & meta_vol0 =
3753  db.GetColumnMetaData(comedy_column, volumes[0]);
3754 
3755  BOOST_REQUIRE(meta_vol0.find("format") != meta_vol0.end());
3756  BOOST_REQUIRE(meta_vol0.find("format")->second == "text");
3757 
3758  // Column data.
3759 
3760  vector<string> column_data;
3761  column_data.push_back("Groucho Marx");
3762  column_data.push_back("Charlie Chaplain");
3763  column_data.push_back("");
3764  column_data.push_back("Abbott and Costello");
3765  column_data.push_back("Jackie Gleason");
3766  column_data.push_back("Jerry Seinfeld");
3767  column_data.back()[5] = (char) 0;
3768 
3769  CBlastDbBlob db_blob, cr_blob;
3770 
3771  BOOST_REQUIRE_EQUAL((int) column_data.size(), db.GetNumOIDs());
3772  BOOST_REQUIRE_EQUAL((int) column_data.size(), CR.GetNumOIDs());
3773 
3774  int count = std::min((int)column_data.size(), db.GetNumOIDs());
3775 
3776  for(int oid = 0; oid < count; oid++) {
3777  db.GetColumnBlob(comedy_column, oid, db_blob);
3778  CR.GetBlob(oid, cr_blob);
3779 
3780  BOOST_REQUIRE(db_blob.Str() == column_data[oid]);
3781  BOOST_REQUIRE(cr_blob.Str() == column_data[oid]);
3782  }
3783 }
3784 #endif
3785 
3786 BOOST_AUTO_TEST_CASE(VersionedSparseId)
3787 {
3788 
3789  CSeqDB db("data/sparse_id", CSeqDB::eNucleotide);
3790 
3791  string good("Z12841.1");
3792  string bad ("Z12842.1");
3793  string both("Z12843.1");
3794 
3795  vector<int> o1, o2, o3;
3796  db.AccessionToOids(good, o1);
3797  db.AccessionToOids(bad, o2);
3798  db.AccessionToOids(both, o3);
3799 
3800  BOOST_REQUIRE(o1.size() == 1);
3801  BOOST_REQUIRE(o2.size() == 0);
3802  BOOST_REQUIRE(o3.size() == 1);
3803 }
3804 
3805 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
3806  (!defined(NCBI_COMPILER_MIPSPRO)) )
3807 BOOST_AUTO_TEST_CASE(MaskDataColumn)
3808 {
3809 
3810  CSeqDB db("data/mask-data-db", CSeqDB::eProtein);
3811 
3812  vector<int> algos;
3813  db.GetAvailableMaskAlgorithms(algos);
3814 
3815  // Check each algorithm definition.
3816 
3817  BOOST_REQUIRE_EQUAL((int)algos.size(), 2);
3818  BOOST_REQUIRE_EQUAL((int)(eBlast_filter_program_seg), algos[0]);
3819  BOOST_REQUIRE_EQUAL((int)(eBlast_filter_program_repeat), algos[1]);
3820 
3821  string algo_opts, algo_name;
3822  EBlast_filter_program filtering_algo;
3823 
3824  db.GetMaskAlgorithmDetails(algos.front(),
3825  filtering_algo, algo_name, algo_opts);
3826 
3827  BOOST_REQUIRE_EQUAL(filtering_algo, objects::eBlast_filter_program_seg);
3828  //BOOST_REQUIRE_EQUAL(algo_opts, string("-use-defaults"));
3829  BOOST_REQUIRE_EQUAL(algo_opts, kEmptyStr);
3830 
3831  db.GetMaskAlgorithmDetails(algos.back(),
3832  filtering_algo, algo_name, algo_opts);
3833  BOOST_REQUIRE_EQUAL(filtering_algo, objects::eBlast_filter_program_repeat);
3834  BOOST_REQUIRE_EQUAL(algo_opts, string("-species Desmodus_rotundus"));
3835 
3836  const int kCount = 10;
3837 
3838  BOOST_REQUIRE_EQUAL(db.GetNumOIDs(), kCount);
3839 }
3840 
3841 BOOST_AUTO_TEST_CASE(CheckColumnFailureCleanup)
3842 {
3843 
3844  CSeqDB db("data/broken-mask-data-db", CSeqDB::eProtein);
3845 
3846  vector<int> lst;
3847  BOOST_REQUIRE_THROW(db.GetAvailableMaskAlgorithms(lst), CSeqDBException);
3848 }
3849 
3850 BOOST_AUTO_TEST_CASE(EmptyMaskData)
3851 {
3852  CSeqDB db("data/empty-mask-data-db", CSeqDB::eNucleotide);
3853 
3854  vector<int> algos;
3855  db.GetAvailableMaskAlgorithms(algos);
3856 
3857  BOOST_REQUIRE_EQUAL(algos.size(), 1U);
3858  BOOST_REQUIRE_EQUAL(11, algos.front());
3859 
3860  CSeqDB::TSequenceRanges ranges;
3861  db.GetMaskData(0, algos.front(), ranges);
3862  BOOST_REQUIRE(ranges.empty());
3863 }
3864 #endif
3865 
3866 struct SDbSumInfo {
3867 public:
3869  {
3871  total_oids = db.GetNumOIDs();
3872 
3874  filtered_oids = db.GetNumSeqs();
3875 
3876  measured_length = 0;
3877  measured_oids = 0;
3878 
3879  for(int oid=0; db.CheckOrFindOID(oid); oid++) {
3880  measured_length += db.GetSeqLength(oid);
3881  measured_oids ++;
3882  }
3883  }
3884 
3885  void CompareField(Int8 X, Int8 Y, string & sum, char ch)
3886  {
3887  if (X > Y) {
3888  sum.append("+");
3889  } else if (X < Y) {
3890  sum.append("-");
3891  } else {
3892  sum.append("=");
3893  }
3894  sum.push_back(ch);
3895  }
3896 
3897  string Compare(SDbSumInfo & other)
3898  {
3899  // This compares the fields for two databases to each other;
3900  // it tells us which database has more sequences and bases
3901  // before and after filtering.
3902 
3903  string sum;
3904  sum.reserve(20);
3905 
3906  CompareField(total_length, other.total_length, sum, 'T');
3907  CompareField(filtered_length, other.filtered_length, sum, 'F');
3908  CompareField(measured_length, other.measured_length, sum, 'M');
3909 
3910  CompareField(total_oids, other.total_oids, sum, 't');
3911  CompareField(filtered_oids, other.filtered_oids, sum, 'f');
3912  CompareField(measured_oids, other.measured_oids, sum, 'm');
3913 
3914  return sum;
3915  }
3916 
3917  string CompareSelf()
3918  {
3919  // This compares the fields within a database to each other;
3920  // it tells us if filtering has any effect and whether the
3921  // measured values equal the expected totals.
3922 
3923  string sum;
3924  sum.reserve(20);
3925 
3929 
3933 
3934  return sum;
3935  }
3936 
3937  /// Total length, sum of all volume lengths.
3939 
3940  /// Filtered length, result of all filtering.
3942 
3943  /// Measured length should equal filtered if alias files are correct.
3945 
3946  /// Total oid count, sum of all volume oid counts.
3948 
3949  /// Filtered oid count, result of all filtering.
3951 
3952  /// Measured oid count should equal filtered if alias files are correct.
3954 };
3955 
3956 BOOST_AUTO_TEST_CASE(OidAndGiLists)
3957 {
3958 
3959  CSeqDB nr("nr", CSeqDB::eProtein);
3960  CSeqDB sp("swissprot", CSeqDB::eProtein);
3961  CSeqDB sc("data/swiss_cheese", CSeqDB::eProtein);
3962  CSeqDB ac("data/all_cheese", CSeqDB::eProtein);
3963 
3964  SDbSumInfo nr_sum(nr);
3965  SDbSumInfo sp_sum(sp);
3966  SDbSumInfo ac_sum(ac);
3967  SDbSumInfo sc_sum(sc);
3968 
3969  BOOST_CHECK_EQUAL((string) nr_sum.CompareSelf(), "=A=B=C=a=b=c");
3970  BOOST_CHECK_EQUAL((string) sp_sum.CompareSelf(), "=A=B=C=a=b=c");
3971  BOOST_CHECK_EQUAL((string) ac_sum.CompareSelf(), "+A+B=C+a+b=c");
3972  BOOST_CHECK_EQUAL((string) sc_sum.CompareSelf(), "+A+B=C+a+b=c");
3973 
3974  BOOST_CHECK_EQUAL((string) nr_sum.Compare(sp_sum), "+T+F+M+t+f+m");
3975  BOOST_CHECK_EQUAL((string) nr_sum.Compare(ac_sum), "=T+F+M=t+f+m");
3976  BOOST_CHECK_EQUAL((string) nr_sum.Compare(sc_sum), "+T+F+M+t+f+m");
3977 
3978  BOOST_CHECK_EQUAL((string) sp_sum.Compare(sc_sum), "=T+F+M=t+f+m");
3979  BOOST_CHECK_EQUAL((string) ac_sum.Compare(sc_sum), "+T+F+M+t+f+m");
3980 }
3981 
3982 BOOST_AUTO_TEST_CASE(DeltaSequenceHash)
3983 {
3984 
3985  // Get hash #1
3986 
3987  CSeqDBExpert nucl("nucl_dbs", CSeqDB::eNucleotide);
3988  int oid(-1);
3989  nucl.GiToOid(4512300, oid);
3990  unsigned h1 = nucl.GetSequenceHash(oid);
3991 
3992  // Get hash #2
3993 
3994  char ch = CFile::GetPathSeparator();
3995  string path = string("data") + ch + "deltaseq";
3996  ifstream f(path.c_str());
3997 
3998  CBioseq bs;
3999  f >> MSerial_AsnText >> bs;
4000 
4001  unsigned h2 = SeqDB_SequenceHash(bs);
4002 
4003  // Check that we don't have a real Seq-data.
4004 
4005  BOOST_REQUIRE(! bs.GetInst().CanGetSeq_data());
4006 
4007  // Check that the hash values match.
4008 
4009  BOOST_REQUIRE_EQUAL(h1, h2);
4010 }
4011 
4012 BOOST_AUTO_TEST_CASE(RestartWithVolumes)
4013 {
4014  CSeqDB db("data/restart", CSeqDB::eProtein);
4015 }
4016 
4017 BOOST_AUTO_TEST_CASE(ExtractBlastDefline)
4018 {
4019  CSeqDB db("nt", CSeqDB::eNucleotide);
4020  int oid;
4021  BOOST_REQUIRE(db.GiToOid(555, oid));
4022  CRef<CBioseq> bs = db.GetBioseq(oid);
4024  BOOST_REQUIRE(deflines.NotEmpty());
4025 
4026  // simulate this bioseq having come from the Genbank data loader
4027  bs->ResetDescr();
4028  deflines = CSeqDB::ExtractBlastDefline(*bs);
4029  BOOST_REQUIRE(deflines.Empty());
4030 }
4031 
4032 BOOST_AUTO_TEST_CASE(TestDiskUsage)
4033 {
4034  CSeqDB db("data/mini-gnomon", CSeqDB::eProtein);
4035  const Int8 kExpectedSize = 1420;
4036  BOOST_REQUIRE_EQUAL(kExpectedSize, db.GetDiskUsage());
4037 }
4038 
4039 BOOST_AUTO_TEST_CASE(FindGnomonIds)
4040 {
4041  vector<string> gnomon_ids;
4042  gnomon_ids.push_back("gnl|GNOMON|334.p");
4043  gnomon_ids.push_back("gnl|GNOMON|2334.p");
4044  gnomon_ids.push_back("gnl|GNOMON|4334.p");
4045  gnomon_ids.push_back("gnl|GNOMON|6334.p");
4046  gnomon_ids.push_back("gnl|GNOMON|8334.p");
4047 
4048  CSeqDB db("data/mini-gnomon", CSeqDB::eProtein);
4049  for (size_t i = 0; i < gnomon_ids.size(); i++) {
4050  {{
4051  vector<int> oids;
4052  db.AccessionToOids(gnomon_ids[i], oids);
4053  BOOST_REQUIRE( !oids.empty() );
4054  BOOST_REQUIRE_EQUAL(i, (size_t)oids.front());
4055  }}
4056  {{
4057  vector<int> oids;
4058  CSeq_id id(gnomon_ids[i]);
4059  db.SeqidToOids(id, oids);
4060  BOOST_REQUIRE( !oids.empty() );
4061  BOOST_REQUIRE_EQUAL(i, (size_t)oids.front());
4062  }}
4063  {{
4064  int oid = -1;
4065  CSeq_id id(gnomon_ids[i]);
4066  bool found = db.SeqidToOid(id, oid);
4067  BOOST_REQUIRE(found);
4068  BOOST_REQUIRE_EQUAL(i, (size_t)oid);
4069  }}
4070  }
4071 }
4072 
4073 BOOST_AUTO_TEST_CASE(TestOidNotFoundWithUserAliasFileAndGiList)
4074 {
4075  CTmpFile seqidlist_tmpfile;
4076  CTmpFile alias_file_tmpfile;
4077  string seqidlist_name = seqidlist_tmpfile.GetFileName();
4078  string blastdb_name = alias_file_tmpfile.GetFileName() + ".pal";
4079  CFileDeleteAtExit::Add(seqidlist_name);
4080  CFileDeleteAtExit::Add(blastdb_name);
4081  const string kSeqIdIncluded = "P01013.1";
4082 
4083  {{
4084  ofstream stream(seqidlist_name.c_str());
4085  stream << kSeqIdIncluded << endl;
4086  stream.close();
4087  }}
4088  {{
4089  ofstream stream(blastdb_name.c_str());
4090  stream << "TITLE test for 129295 JIRA SB-646" << endl;
4091  stream << "DBLIST nr" << endl;
4092  stream << "SEQIDLIST " << seqidlist_name << endl;
4093  stream.close();
4094  }}
4095 
4096  CRef<CSeqDB> db(new CSeqDB(alias_file_tmpfile.GetFileName(),
4097  CSeqDB::eProtein));
4098  vector<int> oids;
4099  db->AccessionToOids(kSeqIdIncluded, oids);
4100  BOOST_REQUIRE_EQUAL(1U, oids.size());
4101 
4102  const string seqid2search = "WP_138200753.1"; // shouldn't be found
4103  oids.clear();
4104  db->AccessionToOids(seqid2search, oids);
4105  BOOST_CHECK_EQUAL(0U, oids.size());
4106 }
4107 
4108 BOOST_AUTO_TEST_CASE(TestSpaceInDbName)
4109 {
4110  // SVN does not allow filename with space, so we need to make one up on the fly
4111  int rv = system("cp data/swiss_cheese.pal 'data/test space.pal'");
4112  BOOST_REQUIRE_EQUAL(0, rv);
4113  string db_name = "\"data/test space\"";
4114  CSeqDB dbs(db_name, CSeqDB::eProtein);
4115 
4116  // Reuse the swiss-cheese test as sanity check for 'test space' db
4117  SDbSumInfo dbs_sum(dbs);
4118  BOOST_REQUIRE_EQUAL((string) dbs_sum.CompareSelf(), "+A+B=C+a+b=c");
4119 }
4120 
4121 BOOST_AUTO_TEST_CASE(MultiTaxidBlastDefLine)
4122 {
4123  CBlast_def_line bdl;
4124  CBlast_def_line::TTaxIds taxids;
4125  taxids.insert(9606);
4126  taxids.insert(10090);
4127  BOOST_CHECK(bdl.IsSetTaxid() == false);
4128  BOOST_CHECK(bdl.IsSetLinks() == false);
4129 
4130  bdl.SetLeafTaxIds(taxids);
4131  // Next line changed from 'false' to 'true' 4/10/2014 by rackerst
4132  // to conform to current implementation of CBlast_def_line::SetTaxIds.
4133  // And then changed back to 'false' 5/7/2014 by rackerst
4134  // to conform to newer implementation of CBlast_def_line::SetLeafTaxIds.
4135  BOOST_REQUIRE(bdl.IsSetTaxid() == false);
4136  BOOST_CHECK(bdl.IsSetLinks() == true);
4137  CBlast_def_line::TTaxIds returned = bdl.GetLeafTaxIds();
4138  BOOST_REQUIRE_EQUAL_COLLECTIONS(taxids.begin(), taxids.end(),
4139  returned.begin(), returned.end());
4140 }
4141 
4142 BOOST_AUTO_TEST_CASE(SingleTaxidBlastDefLine)
4143 {
4144  CBlast_def_line bdl;
4145  BOOST_CHECK(bdl.IsSetTaxid() == false);
4146  BOOST_CHECK(bdl.IsSetLinks() == false);
4147 
4148  bdl.SetTaxid(ZERO_TAX_ID);
4149  BOOST_REQUIRE(bdl.IsSetTaxid() == true);
4150  BOOST_CHECK(bdl.IsSetLinks() == false);
4151  BOOST_REQUIRE_EQUAL(ZERO_TAX_ID, bdl.GetTaxid());
4152 
4153  const TTaxId kTaxid = TAX_ID_CONST(9606);
4154  bdl.SetTaxid(kTaxid);
4155  BOOST_REQUIRE(bdl.IsSetTaxid() == true);
4156  BOOST_CHECK(bdl.IsSetLinks() == false);
4157  BOOST_REQUIRE_EQUAL(kTaxid, bdl.GetTaxid());
4158 
4159  CBlast_def_line::TTaxIds returned = bdl.GetLeafTaxIds();
4161  expected.clear();
4162  BOOST_REQUIRE_EQUAL_COLLECTIONS(expected.begin(), expected.end(),
4163  returned.begin(), returned.end());
4164 
4165  expected.insert(kTaxid);
4166  expected.insert(kTaxid + 1);
4167  bdl.SetLeafTaxIds(expected);
4168  BOOST_REQUIRE(bdl.IsSetTaxid() == true);
4169  BOOST_CHECK(bdl.IsSetLinks() == true);
4170  BOOST_CHECK(bdl.GetLeafTaxIds().size() == 2);
4171 }
4172 
4173 BOOST_AUTO_TEST_CASE(CSeqDBIsam_32bit_GI)
4174 {
4175  // OIDs stored in database.
4176  const int oids[] = {
4177  0x7acee466, 0x4cbc1ab0,
4178  0x7d219922, 0x7e096431,
4179  0x276283ea, 0x13cee382,
4180  0x51f8b267, 0x37183674,
4181  0x03559cd6, 0x6bdcfbb7
4182  };
4183  const Uint4 nrecs = (Uint4) (sizeof oids / sizeof oids[0]);
4184 
4185  // Open database for reading.
4186  CSeqDBAtlas atlas(true);
4187  CRef<CSeqDBIsam> rdb(
4188  new CSeqDBIsam(
4189  atlas,
4190  "data/big_gi",
4191  'p',
4192  'n',
4193  eGiId
4194  )
4195  );
4196 
4197 #ifndef NCBI_INT8_GI
4198  {
4199  const Uint4 uint4_gi = 0xFFFFFFFF;
4200  TGi gi = GI_FROM(Uint4, uint4_gi);
4201  BOOST_REQUIRE_THROW( CSeq_id(CSeq_id::e_Gi, gi), CException );
4202  }
4203 
4204 #else
4205  const Int8 big_gi = 0xC0000000; // 3 "billion"
4206  for (Uint4 i = 0; i < nrecs; ++i) {
4207  TGi gi = GI_FROM(Int8, (big_gi + i));
4208 
4209  try {
4210  CRef<CSeq_id> seqid(
4211  new CSeq_id(CSeq_id::e_Gi, gi)
4212  );
4213  int oid;
4214  rdb->IdToOid(GI_TO(long, seqid->GetGi()), oid);
4215  BOOST_REQUIRE(oid == oids[i]);
4216  } catch (...) {
4217  BOOST_FAIL("CSeq_id constructor threw exception");
4218  return;
4219  }
4220  }
4221 #endif
4222 }
4223 
4224 BOOST_AUTO_TEST_CASE(Test_SeqIdList_AliasFile)
4225 {
4226  CSeqDB db("data/prot_alias", CSeqDB::eProtein, 0, 0, true);
4227 
4228  int found = 0;
4229  for(blastdb::TOid oid = 0; db.CheckOrFindOID(oid); oid++) {
4230  found++;
4231  }
4232  BOOST_REQUIRE_EQUAL(55, found);
4233 }
4234 
4235 BOOST_AUTO_TEST_CASE(Test_SeqIdList_FilteredID)
4236 {
4237  CSeqDB db("data/test_seqidlist_v4", CSeqDB::eProtein, 0, 0, true);
4238 
4239  /// Oid 1 has 11 ids but only one in seqid list should be in the id list
4240  list< CRef<CSeq_id> > ids = db.GetSeqIDs(1);
4241  string fasta_id = kEmptyStr;
4242  int num_acc =0;
4243  ITERATE(list< CRef<CSeq_id> >, itr, ids) {
4244  if((*itr)->IsGi()) {
4245  continue;
4246  }
4247  else {
4248  // special prf id
4249  fasta_id = (*itr)->AsFastaString();
4250  num_acc ++;
4251  }
4252  }
4253  BOOST_REQUIRE_EQUAL(1 , num_acc);
4254  BOOST_REQUIRE_EQUAL(fasta_id , "prf||2209341B");
4255 }
4256 
4257 BOOST_AUTO_TEST_CASE(Test_Multi_SeqIdList_AliasFile)
4258 {
4259  CSeqDB db("data/alias_2_v4", CSeqDB::eProtein, 0, 0, true);
4260 
4261  int found = 0;
4262  for(blastdb::TOid oid = 0; db.CheckOrFindOID(oid); oid++) {
4263  found++;
4264  }
4265  BOOST_REQUIRE_EQUAL(63, found);
4266 }
4267 
4268 BOOST_AUTO_TEST_CASE(Test_Mix_GI_SeqId_List_AliasFile)
4269 {
4270  CSeqDB db("data/multi_list_alias_v4", CSeqDB::eProtein, 0, 0, true);
4271 
4272  int found = 0;
4273  blastdb::TOid oid = 0;
4274  for(blastdb::TOid i=0; db.CheckOrFindOID(i); i++) {
4275  oid = i;
4276  found++;
4277  }
4278  BOOST_REQUIRE_EQUAL(1, found);
4279  BOOST_REQUIRE_EQUAL(3, oid);
4280 }
4281 
4282 BOOST_AUTO_TEST_CASE(Test_Mix_User_SeqIdList_AliasFile)
4283 {
4284  CRef<CSeqDBGiList> gi_list( new CSeqDBFileGiList( "data/test.seqidlist", CSeqDBFileGiList::eSiList));
4285  CSeqDB db("data/test_seqidlist_v4", CSeqDB::eProtein, 0, 0, true, gi_list);
4286 
4287  int found = 0;
4288  for(blastdb::TOid i=0; db.CheckOrFindOID(i); i++) {
4289  found++;
4290  }
4291  BOOST_REQUIRE_EQUAL(2, found);
4292 }
4293 
4294 BOOST_AUTO_TEST_CASE(PigListSwissprot)
4295 {
4296  // 2 is not founc in swissprot
4297  const unsigned int num_pigs = 5;
4298  const int pigs[num_pigs] = {4377482, 1287445, 2, 6066974, 5303747};
4299  const unsigned int num_valid_pig = 4;
4300 
4301  CRef<CSeqDBGiList> pig_list(new CSeqDBGiList());
4302  CRef<CSeqDBNegativeList> neg_pig_list(new CSeqDBNegativeList());
4303 
4304  for (unsigned int i =0; i < num_pigs; i++) {
4305  pig_list->AddPig(pigs[i]);
4306  }
4307 
4308  vector<TPig> p;
4309  pig_list->GetPigList(p);
4310  neg_pig_list->SetPigList(p);
4311 
4312  string db_name = "swissprot";
4313 
4314  CSeqDB db(db_name, CSeqDB::eProtein);
4315  CSeqDB pig_db(db_name, CSeqDB::eProtein, &* pig_list);
4316  CSeqDB negative_pig_db(db_name, CSeqDB::eProtein, &* neg_pig_list);
4317 
4318  int total_num_seqs = db.GetNumSeqs();
4319  BOOST_REQUIRE_EQUAL(pig_db.GetNumSeqs(), 4);
4320  BOOST_REQUIRE_EQUAL(negative_pig_db.GetNumSeqs(), (int) (total_num_seqs - num_valid_pig));
4321 
4322  vector<string> seq_ids;
4323  for(int oid=0; pig_db.CheckOrFindOID(oid); oid++) {
4324  int oid_found = -1;
4325  list< CRef<CSeq_id> > ids = pig_db.GetSeqIDs(oid);
4326  db.SeqidToOid(*(ids.front()), oid_found);
4327  seq_ids.push_back(ids.front()->GetSeqIdString());
4328  BOOST_REQUIRE_EQUAL(oid_found, oid);
4329  }
4330  BOOST_REQUIRE_EQUAL(seq_ids.size(), num_valid_pig);
4331 
4332  for(unsigned int i=0; i < seq_ids.size(); i ++){
4333  vector<int> not_found;
4334  negative_pig_db.AccessionToOids(seq_ids[i], not_found);
4335  BOOST_REQUIRE_EQUAL(not_found.size(), (unsigned int) 0);
4336  }
4337 
4338 }
4339 
4340 BOOST_AUTO_TEST_CASE(CombinedFilters)
4341 {
4342  // 2 is not found in swissprot
4343  const unsigned int num_pigs = 5;
4344  const int pigs[num_pigs] = {2, 355704, 863725, 1727116, 24036443};
4345  string db_name = "data/ipg_test";
4346 
4347  {
4348  CRef<CSeqDBGiList> pos_list(new CSeqDBGiList());
4350 
4351  for (unsigned int i =0; i < num_pigs; i++) {
4352  pos_list->AddPig(pigs[i]);
4353  }
4354 
4355  set<TTaxId> t;
4356  t.insert(TAX_ID_CONST(9606));
4357  t.insert(TAX_ID_CONST(83333));
4358  neg_list->AddTaxIds(t);
4359 
4360  CSeqDB db(db_name, CSeqDB::eProtein, &*pos_list, &* neg_list);
4361 
4362  int total_num_seqs = db.GetNumSeqs();
4363  BOOST_REQUIRE_EQUAL(total_num_seqs, 1);
4364 
4365  const int check_oids[1] = {12};
4366  for(int oid=0, c=0; db.CheckOrFindOID(oid); oid++, c++)
4367  BOOST_REQUIRE_EQUAL(oid, check_oids[c]);
4368  }
4369 
4370  {
4371  CRef<CSeqDBGiList> pos_list(new CSeqDBGiList());
4372  for (unsigned int i =0; i < num_pigs; i++) {
4373  pos_list->AddPig(pigs[i]);
4374  }
4375 
4376  set<TTaxId> t;
4377  t.insert(TAX_ID_CONST(9606));
4378  t.insert(TAX_ID_CONST(83333));
4379  pos_list->AddTaxIds(t);
4380 
4381  CSeqDB db(db_name, CSeqDB::eProtein, &*pos_list);
4382 
4383  int total_num_seqs = db.GetNumSeqs();
4384  BOOST_REQUIRE_EQUAL(total_num_seqs, 3);
4385 
4386  const int check_oids[3] = {2, 6, 8};
4387  for(int oid=0, c=0; db.CheckOrFindOID(oid); oid++, c++) {
4388  BOOST_REQUIRE_EQUAL(oid, check_oids[c]);
4389  }
4390  }
4391  {
4392  CRef<CSeqDBGiList> pos_list(new CSeqDBGiList());
4394 
4395  vector<TPig> p;
4396  for (unsigned int i =0; i < num_pigs; i++) {
4397  p.push_back(pigs[i]);
4398  }
4399  neg_list->SetPigList(p);
4400 
4401  set<TTaxId> t;
4402  t.insert(TAX_ID_CONST(9606));
4403  t.insert(TAX_ID_CONST(83333));
4404  pos_list->AddTaxIds(t);
4405 
4406  CSeqDB db(db_name, CSeqDB::eProtein, &*pos_list, &* neg_list);
4407 
4408  int total_num_seqs = db.GetNumSeqs();
4409  BOOST_REQUIRE_EQUAL(total_num_seqs, 5);
4410 
4411  const int check_oids[5] = {0, 1, 3, 5, 7};
4412  for(int oid=0, c=0; db.CheckOrFindOID(oid); oid++, c++) {
4413  BOOST_REQUIRE_EQUAL(oid, check_oids[c]);
4414  }
4415  }
4416 
4417  {
4418  CRef<CSeqDBGiList> pos_list(new CSeqDBGiList());
4420 
4421  vector<TPig> p;
4422  for (unsigned int i =0; i < num_pigs; i++) {
4423  p.push_back(pigs[i]);
4424  }
4425  neg_list->SetPigList(p);
4426 
4427  set<TTaxId> t;
4428  t.insert(TAX_ID_CONST(9606));
4429  t.insert(TAX_ID_CONST(83333));
4430  pos_list->AddTaxIds(t);
4431 
4432  CSeqDB db(db_name, CSeqDB::eProtein, 1, 4, &*pos_list, &* neg_list);
4433 
4434  const int check_oids[2] = {1, 3 };
4435  for(int oid=0, c=0; db.CheckOrFindOID(oid); oid++, c++) {
4436  BOOST_REQUIRE_EQUAL(oid, check_oids[c]);
4437  }
4438  }
4439 
4440 
4441 }
4442 
4443 BOOST_AUTO_TEST_CASE(TaxFilterWithGiListDB)
4444 {
4445  string db_name = "refseq_mrna";
4446  CRef<CSeqDBGiList> pos_list(new CSeqDBGiList());
4447  set<TTaxId> t;
4448  t.insert(TAX_ID_CONST(9606));
4449  pos_list->AddTaxIds(t);
4450 
4451  CSeqDB db(db_name, CSeqDB::eNucleotide, &*pos_list);
4452  int total_num_seqs = 0;
4453  Uint8 total_length = 0;
4454  db.GetTotals(CSeqDB::eFilteredAll, &total_num_seqs, &total_length, true);
4455  BOOST_REQUIRE(total_num_seqs > 0);
4456  BOOST_REQUIRE(total_length > 0);
4457 }
4458 
4459 BOOST_AUTO_TEST_CASE(TestMemoryMapFile)
4460 {
4461  const int MAX_FD_COUNT = CSeqDBAtlas::e_MaxFileDescritors;
4462  CSeqDBAtlas atlas(true);
4463  for(int i=0; i < MAX_FD_COUNT; i++) {
4465  }
4466  BOOST_REQUIRE_EQUAL(atlas.GetOpenedFilseCount(), MAX_FD_COUNT);
4467  CRef<CSeqDBIsam> isam( new CSeqDBIsam(atlas, "data/big_gi", 'p', 'n', eGiId));
4468  // 2 Files (index, data) are opened for each CSeqDBIsam
4469  BOOST_REQUIRE_EQUAL(atlas.GetOpenedFilseCount(), MAX_FD_COUNT+2);
4470  isam->UnLease();
4471  BOOST_REQUIRE_EQUAL(atlas.GetOpenedFilseCount(), MAX_FD_COUNT);
4472 }
4473 
4474 #ifdef NCBI_THREADS
4475 class CTestThread : public CThread
4476 {
4477 public:
4478  CTestThread(CSeqDBAtlas & atlas): m_atlas(atlas) { }
4479 
4480  virtual void* Main(void) {
4481  m_Isam.Reset(new CSeqDBIsam(m_atlas, "data/seqp_v5", 'p', 'n', eGiId));
4482  CSeqDB::TOID oid;
4483  for (Int8 i=0; i < 10000; i++) {
4484  m_Isam->IdToOid(i, oid);
4485  }
4486  return NULL;
4487  }
4489 private:
4492 };
4493 
4494 
4495 BOOST_AUTO_TEST_CASE(TestMemoryMapFile_MT)
4496 {
4497  CSeqDBAtlas atlas(true);
4498  const int kNumThreads=64;
4499  vector<CTestThread*> threads;
4500  BOOST_REQUIRE_EQUAL(atlas.GetOpenedFilseCount(), 0);
4501  for (int i=0; i < kNumThreads; i++) {
4502  threads.push_back(new CTestThread(atlas));
4503  }
4504  for (int i=0; i < kNumThreads; i++) {
4505  threads[i]->Run();
4506  }
4507  for (int i=0; i < kNumThreads; i++) {
4508  threads[i]->Join();
4509  }
4510  BOOST_REQUIRE_EQUAL(atlas.GetOpenedFilseCount(), 2);
4511 }
4512 #endif
4513 
4514 BOOST_AUTO_TEST_CASE(TestTaxIdsLookup)
4515 {
4516  string db_name = "data/wp_nr_v5";
4517  CSeqDB db(db_name, CSeqDB::eProtein);
4518  {
4519  string acc = "WP_007051162.1";
4520  vector<TTaxId> tax_ids;
4521  db.GetTaxIdsForAccession(acc, tax_ids);
4522  BOOST_REQUIRE_EQUAL(tax_ids.size(), 4);
4523  BOOST_REQUIRE_EQUAL(tax_ids[0], 1678);
4524  BOOST_REQUIRE_EQUAL(tax_ids[3], 1263059);
4525  }
4526  {
4527  CSeq_id seqid("CCK34598");
4528  vector<TTaxId> tax_ids;
4529  db.GetTaxIdsForSeqId(seqid, tax_ids);
4530  BOOST_REQUIRE_EQUAL(tax_ids.size(), 1);
4531  BOOST_REQUIRE_EQUAL(tax_ids[0], 1205679);
4532  }
4533  {
4534  string acc = "junk";
4535  vector<TTaxId> tax_ids;
4536  db.GetTaxIdsForAccession(acc, tax_ids);
4537  BOOST_REQUIRE_EQUAL(tax_ids.size(), 0);
4538  }
4539 }
4540 
4541 BOOST_AUTO_TEST_CASE(TestTaxIdsLookup_v4)
4542 {
4543  string db_name = "data/test_v4";
4544  CSeqDB db(db_name, CSeqDB::eProtein);
4545  {
4546  string acc = "pir||T49736";
4547  vector<TTaxId> tax_ids;
4548  db.GetTaxIdsForAccession(acc, tax_ids);
4549  BOOST_REQUIRE_EQUAL(tax_ids.size(), 1);
4550  BOOST_REQUIRE_EQUAL(tax_ids[0], 0);
4551  }
4552  {
4553  string acc = "junk";
4554  vector<TTaxId> tax_ids;
4555  db.GetTaxIdsForAccession(acc, tax_ids);
4556  BOOST_REQUIRE_EQUAL(tax_ids.size(), 0);
4557  }
4558 }
4559 
4560 
4561 
4563 #endif /* SKIP_DOXYGEN_PROCESSING */
4564 
vector< char > NSEQ
Definition: NSeq.hpp:49
static const char * kFileName
BOOST_AUTO_TEST_SUITE_END() static int s_GetSegmentFlags(const CBioseq &bioseq)
ncbi::TMaskedQueryRegions mask
vector< CRef< CSeq_id > > SeqIdList
`Blob' Class for SeqDB (and WriteDB).
Definition: seqdbblob.hpp:56
CTempString Str() const
Get blob contents as a CTempString.
Definition: seqdbblob.cpp:526
TTaxIds GetLeafTaxIds() const
void SetLeafTaxIds(const TTaxIds &t)
FASTA-format output; see also ReadFasta in <objtools/readers/fasta.hpp>
Definition: sequence.hpp:770
CFile –.
Definition: ncbifile.hpp:1605
GI list containing the intersection of two other lists of GIs.
CNcbiEnvironment –.
Definition: ncbienv.hpp:110
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
CNegativeIdList(const int *ids, bool use_tis)
CRef –.
Definition: ncbiobj.hpp:618
static SIZE_TYPE Convert(const CTempString &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst, TCoding dst_coding)
CSeqDBAtlas class.
Definition: seqdbatlas.hpp:297
int ChangeOpenedFilseCount(EFilesCount fc)
Definition: seqdbatlas.hpp:547
@ eFileCounterIncrement
Definition: seqdbatlas.hpp:541
int GetOpenedFilseCount(void)
Definition: seqdbatlas.hpp:565
CSeqDBException.
Definition: seqdbcommon.hpp:73
CSeqDBExpert.
Definition: seqdbexpert.hpp:55
void GetRawSeqAndAmbig(int oid, const char **buffer, int *seq_length, int *ambig_length) const
Raw Sequence and Ambiguity Data.
Definition: seqdbexpert.cpp:64
void GetPigBounds(int *low_id, int *high_id, int *count)
Get PIG Bounds.
Definition: seqdbexpert.cpp:83
unsigned GetSequenceHash(int oid)
Get the sequence hash for a given OID.
void HashToOids(unsigned hash, vector< int > &oids)
Get the OIDs for a given sequence hash.
void GetGiBounds(TGi *low_id, TGi *high_id, int *count)
Get GI Bounds.
Definition: seqdbexpert.cpp:74
CSeqDBFileGiList.
CSeqDBGiList.
vector< SGiOid > m_GisOids
Pairs of GIs and OIDs.
int GetNumGis() const
Get the number of GIs in the array.
void GetPigList(vector< TPig > &pigs) const
void GetGiList(vector< TGi > &gis) const
Get the gi list.
void AddTaxIds(const set< TTaxId > &tax_ids)
T GetKey(int index) const
void AddPig(TPig pig)
bool FindGi(TGi gi) const
Test for existence of a GI.
vector< SSiOid > m_SisOids
Pairs of Seq-ids and OIDs.
SeqDB ID list for performing boolean set operations.
bool Blank() const
Check if an ID list is blank.
void Compute(EOperation op, const vector< int > &ids, bool positive=true)
Perform a logical operation on a list.
bool IsPositive()
Checks whether a positive GI list was produced.
CRef< CSeqDBNegativeList > GetNegativeList()
Retrieve a negative GI list.
CRef< CSeqDBGiList > GetPositiveList()
Retrieve a positive GI list.
CSeqDBIsam.
Definition: seqdbisam.hpp:127
bool IdToOid(Int8 id, TOid &oid)
GI or TI translation.
Definition: seqdbisam.hpp:225
void UnLease()
Return any memory held by this object to the atlas.
Definition: seqdbisam.cpp:1215
CSeqDBNegativeList.
void AddTaxIds(const set< TTaxId > &tax_ids)
void SetGiList(const vector< TGi > &new_list)
Set ID set for this negative list.
void SetPigList(const vector< TPig > &new_list)
vector< TTi > m_Tis
TIs to exclude from the SeqDB instance.
bool FindGi(TGi gi)
Test for existence of a GI.
vector< TGi > m_Gis
GIs to exclude from the SeqDB instance.
Reader for BlastDb format column files.
CSeqDB.
Definition: seqdb.hpp:161
void GetColumnBlob(int col_id, int oid, CBlastDbBlob &blob)
Fetch the data blob for the given column and oid.
Definition: seqdb.cpp:1220
int TOID
Sequence type accepted and returned for OID indices.
Definition: seqdb.hpp:216
static void FindVolumePaths(const string &dbname, ESeqType seqtype, vector< string > &paths, vector< string > *alias_paths=NULL, bool recursive=true, bool expand_links=true)
Find volume paths.
Definition: seqdb.cpp:1040
bool OidToPig(int oid, int &pig) const
Translate an OID to a PIG.
Definition: seqdb.cpp:790
Uint8 GetTotalLength() const
Returns the sum of the lengths of all available sequences.
Definition: seqdb.cpp:685
void GetGis(int oid, vector< TGi > &gis, bool append=false) const
Gets a list of GIs for an OID.
Definition: seqdb.cpp:1070
bool PigToOid(int pig, int &oid) const
Translate a PIG to an OID.
Definition: seqdb.cpp:781
void SetIterationRange(int oid_begin, int oid_end)
Set Iteration Range.
Definition: seqdb.cpp:1093
int GetNumOIDs() const
Returns the size of the (possibly sparse) OID range.
Definition: seqdb.cpp:680
TGi GetSeqGI(int oid) const
Returns the first Gi (if any) of the sequence.
Definition: seqdb.cpp:776
Uint8 GetVolumeLength() const
Returns the sum of the lengths of all volumes.
Definition: seqdb.cpp:700
void GetAvailableMaskAlgorithms(vector< int > &algorithms)
Get a list of algorithm IDs for which mask data exists.
Definition: seqdb.cpp:1227
bool OidToGi(int oid, TGi &gi) const
Translate an OID to a GI.
Definition: seqdb.cpp:826
const string & GetDBNameList() const
Get list of database names.
Definition: seqdb.cpp:760
list< CRef< CSeq_id > > GetSeqIDs(int oid) const
Gets a list of sequence identifiers.
Definition: seqdb.cpp:765
Int8 GetDiskUsage() const
Retrieve the disk usage in bytes for this BLAST database.
Definition: seqdb.cpp:1464
void ResetInternalChunkBookmark()
Resets this object's internal chunk bookmark, which is used when the oid_state argument to GetNextOID...
Definition: seqdb.cpp:755
EOidListType
Indicates how block of OIDs was returned.
Definition: seqdb.hpp:167
@ eOidRange
Definition: seqdb.hpp:169
@ eOidList
Definition: seqdb.hpp:168
CRef< CSeq_data > GetSeqData(int oid, TSeqPos begin, TSeqPos end) const
Fetch data as a CSeq_data object.
Definition: seqdb.cpp:539
bool GiToPig(TGi gi, int &pig) const
Translate a GI to a PIG.
Definition: seqdb.cpp:854
void GetAliasFileValues(TAliasFileValues &afv)
Get Name/Value Data From Alias Files.
Definition: seqdb.cpp:1098
int GetMaxLength() const
Returns the length of the largest sequence in the database.
Definition: seqdb.cpp:705
int GetSeqLength(int oid) const
Returns the sequence length in base pairs or residues.
Definition: seqdb.cpp:400
bool PigToGi(int pig, TGi &gi) const
Translate a PIG to a GI.
Definition: seqdb.cpp:837
ESeqType GetSequenceType() const
Returns the type of database opened - protein or nucleotide.
Definition: seqdb.cpp:427
const CSeqDBGiList * GetGiList() const
Get GI list attached to this database.
Definition: seqdb.cpp:1120
ESeqType
Sequence types (eUnknown tries protein, then nucleotide).
Definition: seqdb.hpp:173
@ eNucleotide
Definition: seqdb.hpp:175
@ eUnknown
Definition: seqdb.hpp:176
@ eProtein
Definition: seqdb.hpp:174
bool SeqidToOid(const CSeq_id &seqid, int &oid) const
Translate a Seq-id to any matching OID.
Definition: seqdb.cpp:903
void RetAmbigSeq(const char **buffer) const
Returns any resources associated with the sequence.
Definition: seqdb.cpp:563
int GetOidAtOffset(int first_seq, Uint8 residue) const
Find the sequence closest to the given offset into the database.
Definition: seqdb.cpp:923
CRef< CBioseq > GetBioseq(int oid, TGi target_gi=ZERO_GI, const CSeq_id *target_seq_id=NULL) const
Get a CBioseq for a sequence.
Definition: seqdb.cpp:504
CRef< CBioseq > GetBioseqNoData(int oid, TGi target_gi=ZERO_GI, const CSeq_id *target_seq_id=NULL) const
Get a CBioseq for a sequence without sequence data.
Definition: seqdb.cpp:514
void GetTaxIDs(int oid, map< TGi, TTaxId > &gi_to_taxid, bool persist=false) const
Get taxid for an OID.
Definition: seqdb.cpp:441
void GetMaskAlgorithmDetails(int algorithm_id, objects::EBlast_filter_program &program, string &program_name, string &algo_opts)
Get information about one type of masking available here.
Definition: seqdb.cpp:1263
void GetTotals(ESummaryType sumtype, int *oid_count, Uint8 *total_length, bool use_approx=true) const
Returns the sum of the sequence lengths.
Definition: seqdb.cpp:1110
void RetSequence(const char **buffer) const
Returns any resources associated with the sequence.
Definition: seqdb.cpp:523
string GetTitle() const
Returns the database title.
Definition: seqdb.cpp:630
int GetNumSeqs() const
Returns the number of sequences available.
Definition: seqdb.cpp:670
void GetTaxIdsForSeqId(const CSeq_id &seq_id, vector< TTaxId > &taxids)
Get all tax ids for a seq id.
Definition: seqdb.cpp:1758
EOidListType GetNextOIDChunk(int &begin_chunk, int &end_chunk, int oid_size, vector< int > &oid_list, int *oid_state=NULL)
Return a chunk of OIDs, and update the OID bookmark.
Definition: seqdb.cpp:739
int GetSequence(int oid, const char **buffer) const
Get a pointer to raw sequence data.
Definition: seqdb.cpp:530
void AccessionToOids(const string &acc, vector< int > &oids) const
Translate an Accession to a list of OIDs.
Definition: seqdb.cpp:870
void ListColumns(vector< string > &titles)
List columns titles found in this database.
Definition: seqdb.cpp:1191
void GetTaxIdsForAccession(const string &accs, vector< TTaxId > &taxids)
Get all tax ids for an accession.
Definition: seqdb.cpp:1752
bool CheckOrFindOID(int &next_oid) const
Find an included OID, incrementing next_oid if necessary.
Definition: seqdb.cpp:728
string GetDate() const
Returns the construction date of the database.
Definition: seqdb.cpp:635
int GetNumSeqsStats() const
Returns the number of sequences available.
Definition: seqdb.cpp:675
@ eUnfilteredAll
Sum of all sequences, ignoring GI and OID lists and alias files.
Definition: seqdb.hpp:185
@ eFilteredRange
Sum of included sequences with OIDs within the iteration range.
Definition: seqdb.hpp:191
@ eFilteredAll
Values from alias files, or summation over all included sequences.
Definition: seqdb.hpp:188
int GetColumnId(const string &title)
Get an ID number for a given column title.
Definition: seqdb.cpp:1196
void SeqidToOids(const CSeq_id &seqid, vector< int > &oids) const
Translate a Seq-id to a list of OIDs.
Definition: seqdb.cpp:896
int GetAmbigSeqAlloc(int oid, char **buffer, int nucl_code, ESeqDBAllocType strategy, TSequenceRanges *masks=NULL) const
Get a pointer to sequence data with ambiguities.
Definition: seqdb.cpp:591
CSeqDBIter Begin() const
Returns a sequence iterator.
Definition: seqdb.cpp:723
const string & GetColumnValue(int column_id, const string &key)
Look up the value for a specific column metadata key.
Definition: seqdb.cpp:1207
CRef< CBioseq > GiToBioseq(TGi gi) const
Get a CBioseq for a given GI.
Definition: seqdb.cpp:987
static void GetTaxInfo(TTaxId taxid, SSeqDBTaxInfo &info)
Get taxonomy information.
Definition: seqdb.cpp:1105
CRef< CBlast_def_line_set > GetHdr(int oid) const
Get the ASN.1 header for the sequence.
Definition: seqdb.cpp:418
Uint8 GetTotalLengthStats() const
Returns the sum of the lengths of all available sequences.
Definition: seqdb.cpp:695
int GetSeqLengthApprox(int oid) const
Returns an unbiased, approximate sequence length.
Definition: seqdb.cpp:409
CRef< CBioseq > SeqidToBioseq(const CSeq_id &seqid) const
Get a CBioseq for a given Seq-id.
Definition: seqdb.cpp:1021
static CRef< CBlast_def_line_set > ExtractBlastDefline(const CBioseq &bioseq)
Extract a Blast-def-line-set object from a Bioseq retrieved by CSeqDB.
Definition: seqdbvol.cpp:1247
CRef< CBioseq > PigToBioseq(int pig) const
Get a CBioseq for a given PIG.
Definition: seqdb.cpp:1004
int GetAmbigSeq(int oid, const char **buffer, int nucl_code) const
Get a pointer to sequence data with ambiguities.
Definition: seqdb.cpp:550
void GetMaskData(int oid, const vector< int > &algo_ids, TSequenceRanges &ranges)
Get masked ranges of a sequence.
Definition: seqdb.hpp:1408
bool GiToOid(TGi gi, int &oid) const
Translate a GI to an OID.
Definition: seqdb.cpp:808
const map< string, string > & GetColumnMetaData(int column_id)
Get all metadata for the specified column.
Definition: seqdb.cpp:1202
CSeqDBIdSet GetIdSet() const
Get IdSet list attached to this database.
Definition: seqdb.cpp:1125
void GetLeafTaxIDs(int oid, map< TGi, set< TTaxId > > &gi_to_taxid_set, bool persist=false) const
Get taxid for an OID.
Definition: seqdb.cpp:473
void Append(const char *p)
CSeqIdList(const char **str)
@ e_Iupacna
Definition: sequtil.hpp:47
@ e_Ncbi4na
Definition: sequtil.hpp:50
CSimpleGiList(const vector< TGi > &gis)
CSeqDBAtlas & m_atlas
virtual void * Main(void)
Derived (user-created) class must provide a real thread function.
CRef< CSeqDBIsam > m_Isam
CTestThread(CSeqDBAtlas &atlas)
CTmpEnvironmentSetter(const char *name, const char *value=NULL)
CTmpFile –.
Definition: ncbifile.hpp:2353
void erase(iterator pos)
Definition: map.hpp:167
size_type size() const
Definition: map.hpp:148
const_iterator end() const
Definition: map.hpp:152
bool empty() const
Definition: map.hpp:149
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: map.hpp:338
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator begin() const
Definition: set.hpp:135
size_type size() const
Definition: set.hpp:132
bool empty() const
Definition: set.hpp:133
const_iterator find(const key_type &key) const
Definition: set.hpp:137
void erase(iterator pos)
Definition: set.hpp:151
const_iterator end() const
Definition: set.hpp:136
Defines column reader class for SeqDB.
int GetSeqLength(const CBioseq &bioseq)
Definition: cuSequence.cpp:216
#define C(s)
Definition: common.h:231
static const unsigned long CR
Blast defline related defines.
#define S(s)
static const struct name_t names[]
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
#define P(a, b)
Definition: sqlwparams.h:19
static const char * expected[]
Definition: bcp.c:42
static const char * str(char *buf, int n)
Definition: stats.c:84
static FILE * f
Definition: readconf.c:23
static const column_t columns[]
Definition: utf8_2.c:22
#define TAX_ID_CONST(id)
Definition: ncbimisc.hpp:1112
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
#define GI_FROM(T, value)
Definition: ncbimisc.hpp:1086
constexpr size_t ArraySize(const Element(&)[Size])
Definition: ncbimisc.hpp:1532
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
#define GI_CONST(gi)
Definition: ncbimisc.hpp:1087
#define ZERO_GI
Definition: ncbimisc.hpp:1088
#define GI_TO(T, gi)
Definition: ncbimisc.hpp:1085
string
Definition: cgiapp.hpp:690
#define NULL
Definition: ncbistd.hpp:225
void Set(const string &name, const string &value)
Set an environment variable by name.
Definition: ncbienv.cpp:147
const string & Get(const string &name, bool *found=NULL) const
Get environment value by name.
Definition: ncbienv.cpp:109
static void Add(const string &path)
Add the name of a dir entry; it will be deleted on (normal) exit.
Definition: ncbifile.cpp:5367
const string & GetFileName(void) const
Return used file name (generated or given in the constructor).
Definition: ncbifile.cpp:5429
static string ConvertToOSPath(const string &path)
Convert "path" on any OS to the current OS-dependent path.
Definition: ncbifile.cpp:745
static char GetPathSeparator(void)
Get path separator symbol specific for the current platform.
Definition: ncbifile.cpp:433
const float epsilon
Definition: math.hpp:61
const TPrim & Get(void) const
Definition: serialbase.hpp:347
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
const string AsFastaString(void) const
Definition: Seq_id.cpp:2266
virtual void Write(const CSeq_entry_Handle &handle, const CSeq_loc *location=0)
Unspecified locations designate complete sequences; non-empty custom titles override the usual title ...
Definition: sequence.cpp:2727
void SetWidth(TSeqPos width)
Definition: sequence.cpp:3456
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
static string SizetToString(size_t value, TNumToStringFlags flags=0, int base=10)
Convert size_t to string.
Definition: ncbistr.cpp:2742
#define kEmptyStr
Definition: ncbistr.hpp:123
static Int8 StringToInt8(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to Int8.
Definition: ncbistr.cpp:793
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3452
@ fSplit_Truncate
Definition: ncbistr.hpp:2503
@ fSplit_MergeDelimiters
Merge adjacent delimiters.
Definition: ncbistr.hpp:2500
bool IsSetLinks(void) const
Check if a value has been assigned to Links data member.
TTaxid GetTaxid(void) const
Get the Taxid member data.
bool IsSetTaxid(void) const
Check if a value has been assigned to Taxid data member.
void SetTaxid(TTaxid value)
Assign a value to Taxid data member.
EBlast_filter_program
This defines the possible sequence filtering algorithms to be used in a BLAST database.
const Tdata & Get(void) const
Get the member data.
@ eBlast_filter_program_repeat
@ eBlast_filter_program_seg
const TTag & GetTag(void) const
Get the Tag member data.
Definition: Dbtag_.hpp:267
TId GetId(void) const
Get the variant data.
Definition: Object_id_.hpp:270
TGi GetGi(void) const
Get the variant data.
Definition: Seq_id_.hpp:889
const TGeneral & GetGeneral(void) const
Get the variant data.
Definition: Seq_id_.cpp:369
@ e_General
for other databases
Definition: Seq_id_.hpp:105
@ e_Gi
GenInfo Integrated Database.
Definition: Seq_id_.hpp:106
void ResetDescr(void)
Reset Descr data member.
Definition: Bioseq_.cpp:60
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
const TNcbi4na & GetNcbi4na(void) const
Get the variant data.
Definition: Seq_data_.hpp:570
bool CanGetSeq_data(void) const
Check if it is safe to call GetSeq_data method.
Definition: Seq_inst_.hpp:811
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
Definition: Seq_inst_.hpp:817
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:6929
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
where boath are integers</td > n< td ></td > n</tr > n< tr > n< td > tse</td > n< td > optional</td > n< td > String</td > n< td class=\"description\"> TSE option controls what blob is whole
where boath are integers</td > n< td ></td > n</tr > n< tr > n< td > tse</td > n< td > optional</td > n< td > String</td > n< td class=\"description\"> TSE option controls what blob is smart and slim</td> n<td> orig</td> n</tr> n<tr> n<td> last_modified</td> n<td> optional</td> n<td> Integer</td> n<td class=\"description\"> The blob last modification If provided then the exact match will be requested with n the Cassandra storage corresponding field value</td> n<td> Positive integer Not provided means that the most recent match will be selected</td> n<td></td> n</tr> n<tr> n<td> use_cache</td> n<td> optional</td> n<td> String</td> n<td class=\"description\"> The option controls if the Cassandra LMDB cache and or database should be used It n affects the seq id resolution step and the blob properties lookup step The following n options are BIOSEQ_INFO and BLOB_PROP at all
char * buf
int i
int len
CBioseq_Info & GetBioseq(CTSE_Info &tse, const CBioObjectId &id)
unique_ptr< CLocalTaxon > tax1
static void hex(unsigned char c)
Definition: mdb_dump.c:56
static size_t lineno
Definition: mdb_load.c:28
static MDB_envinfo info
Definition: mdb_load.c:37
constexpr auto sort(_Init &&init)
const struct ncbi::grid::netcache::search::fields::SIZE size
const struct ncbi::grid::netcache::search::fields::KEY key
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
#define fabs(v)
Definition: ncbi_dispd.c:46
unsigned int a
Definition: ncbi_localip.c:102
EIPRangeType t
Definition: ncbi_localip.c:101
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
#define NUM
Definition: newick.tab.cpp:72
const double E
T min(T x_, T y_)
static unsigned cnt[256]
#define A
#define U
#define count
static uint8_t * buffer
Definition: pcre2test.c:1016
BOOST_AUTO_TEST_SUITE(psiblast_iteration)
static const string kDb("db")
#define BEGIN(X)
USING_SCOPE(objects)
string s_Stringify(CRef< ASNOBJ > a)
static void s_TestMaskingLimits(EMaskingType mask, unsigned first, unsigned last, unsigned lowest, unsigned highest, unsigned count)
static void s_MapAllGis(CSeqDB &db, map< int, int > &m, int change, int &total)
static void s_TestPartialAmbig(CSeqDB &db, TGi nt_gi)
void s_ApproxEqual(NUM a, NUM b, DIF epsilon, int lineno)
#define END(X)
EMaskingType
@ eAll
@ ePrime
@ eEven
@ eOdd
@ eEND
#define DEV_NULL
BOOST_AUTO_TEST_CASE(ConstructLocal)
static bool s_DbHasOID(CSeqDB &db, int &count, int oid)
static void s_ModifyMap(map< int, int > &m, int key, int c, int &total)
static void s_CheckIdLookup(CSeqDB &db, const string &acc, size_t exp_oids, size_t exp_size)
static void s_TestPartialAmbigRange(CSeqDB &db, int oid, int begin, int end)
static Uint4 s_BufHash(const char *buf_in, Uint4 length, Uint4 start=1)
USING_NCBI_SCOPE
string s_ToString(const A &a, const B &b, const C &c, const D &d, const E &e)
static bool s_MaskingTest(EMaskingType mask, unsigned oid)
Defines exception class and several constants for SeqDB.
@ eAtlas
@ eNew
@ eMalloc
Int4 TOid
Ordinal ID in BLAST databases.
Definition: seqdbcommon.hpp:58
const int kSeqDBNuclNcbiNA8
Used to request ambiguities in Ncbi/NA8 format.
ESeqDBIdType SeqDB_SimplifyAccession(const string &acc, Int8 &num_id, string &str_id, bool &simpler)
String id simplification.
bool SeqDB_IsBinaryGiList(const string &fname)
Read a text or binary SeqId list from a file.
unsigned SeqDB_SequenceHash(const char *sequence, int length)
Returns a path minus filename.
Definition: seqdbobj.cpp:146
const int kSeqDBNuclBlastNA8
Used to request ambiguities in BLAST/NA8 format.
@ eGiId
string SeqDB_ResolveDbPath(const string &filename)
Resolve a file path using SeqDB's path algorithms.
Defines `expert' version of CSeqDB interfaces.
ISAM index database access object.
static const char * kTaxid
#define B
#define D(d)
static SLJIT_INLINE sljit_ins nr(sljit_gpr dst, sljit_gpr src)
static SLJIT_INLINE sljit_ins msg(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
List of sequence offset ranges.
Definition: seqdb.hpp:236
bool empty() const
Definition: seqdb.hpp:272
int measured_oids
Measured oid count should equal filtered if alias files are correct.
int total_oids
Total oid count, sum of all volume oid counts.
Int8 filtered_length
Filtered length, result of all filtering.
void CompareField(Int8 X, Int8 Y, string &sum, char ch)
SDbSumInfo(CSeqDB &db)
Int8 total_length
Total length, sum of all volume lengths.
string Compare(SDbSumInfo &other)
int filtered_oids
Filtered oid count, result of all filtering.
string CompareSelf()
Int8 measured_length
Measured length should equal filtered if alias files are correct.
SSeqDBTaxInfo.
Definition: _hash_fun.h:40
Utility stuff for more convenient using of Boost.Test library.
int g(Seg_Gsm *spe, Seq_Mtf *psm, Thd_Gsm *tdg)
Definition: thrddgri.c:44
static string kCount("Count")
#define N
Definition: crc32.c:57
void free(voidpf ptr)
#define local
Definition: zutil.h:33
Modified on Fri Sep 20 14:58:15 2024 by modify_doxy.py rev. 669887