NCBI C++ ToolKit
writedb_unit_test.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: writedb_unit_test.cpp 101152 2023-11-07 15:39:13Z camacho $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Kevin Bealer
27  *
28  * File Description:
29  * CWriteDB unit test.
30  *
31  */
32 #define NCBI_TEST_APPLICATION
33 #include <ncbi_pch.hpp>
34 
38 #include <objmgr/seq_vector.hpp>
40 #include <serial/objistr.hpp>
41 #include <serial/serial.hpp>
42 #include <serial/iterator.hpp>
43 #include "../mask_info_registry.hpp"
44 #include <sstream>
45 
46 #include <corelib/test_boost.hpp>
47 #include <boost/current_function.hpp>
54 
55 #include <unordered_map>
56 
57 #ifndef SKIP_DOXYGEN_PROCESSING
58 
61 
62 // Fetch sequence and nucleotide data for the given oid as a pair of
63 // strings (in ncbi2na packed format), one for sequence data and one
64 // for ambiguities.
65 
66 void
68  int oid,
69  string & sequence,
70  string & ambig)
71 {
72  const char * buffer (0);
73  int slength(0);
74  int alength(0);
75 
76  seqdb.GetRawSeqAndAmbig(oid, & buffer, & slength, & alength);
77 
78  sequence.assign(buffer, slength);
79  ambig.assign(buffer + slength, alength);
80 }
81 
82 // Return a Seq-id built from the given int (gi).
83 
85 {
86  CRef<CSeq_id> seqid(new CSeq_id(CSeq_id::e_Gi, gi));
87 
88  return seqid;
89 }
90 
91 // Return a Seq-id built from the given string (accession or FASTA
92 // format Seq-id).
93 
94 CRef<CSeq_id> s_AccToSeqId(const char * acc)
95 {
96  CRef<CSeq_id> seqid(new CSeq_id(acc));
97 
98  return seqid;
99 }
100 
101 // HexDump utility functions
102 
103 string s_HexDumpText(const string & raw,
104  const vector<int> & layout,
105  int base)
106 {
107  BOOST_REQUIRE(layout.size());
108 
109  string visible;
110  string tmp;
111 
112  int layout_i = 0;
113  int width = 0;
114 
115  for(int i = 0; i < (int)raw.size(); i += width) {
116  width = layout[layout_i];
117  BOOST_REQUIRE(width);
118 
119  Uint8 mask = Uint8(Int8(-1));
120  mask >>= (64 - 8*width);
121 
122  int left = raw.size() - i;
123  int width1 = (left < width) ? left : width;
124 
125  string sub(raw, i, width1);
126 
127  // Read a standard order value into x.
128 
129  Uint8 x = 0;
130 
131  for(int by = 0; by < (int)sub.size(); by++) {
132  x = (x << 8) + (sub[by] & 0xFF);
133  }
134 
135  if (visible.size())
136  visible += " ";
137 
138  tmp.resize(0);
139  NStr::UInt8ToString(tmp, x & mask, 0, base);
140 
141  visible += tmp;
142  layout_i = (layout_i + 1) % layout.size();
143  }
144 
145  return visible;
146 }
147 
148 string s_HexDumpText(const string & raw, int per, int base)
149 {
150  vector<int> layout;
151  layout.push_back(per);
152 
153  return s_HexDumpText(raw, layout, base);
154 }
155 
156 // Overlay version
157 
158 string s_HexDumpFile(const string & fname,
159  const vector<int> & layout,
160  int base)
161 {
162  ifstream f(fname.c_str());
163 
164  string raw;
165 
166  while(f && ! f.eof()) {
167  char buf[1024];
168  f.read(buf, 1024);
169 
170  int amt = f.gcount();
171 
172  if (! amt)
173  break;
174 
175  raw.append(buf, amt);
176  }
177 
178  return s_HexDumpText(raw, layout, base);
179 }
180 
181 string s_HexDumpFile(const string & fname,
182  int per,
183  int base)
184 {
185  vector<int> layout;
186  layout.push_back(per);
187 
188  return s_HexDumpFile(fname, layout, base);
189 }
190 
191 // Copy the sequences listed in 'ids' (integers or FASTA Seq-ids) from
192 // the CSeqDB object to the CWriteDB object, using CBioseqs as the
193 // intermediate data.
194 
195 typedef vector< CRef<CSeq_id> > TIdList;
196 
/// Exception thrown by BOOST_REQUIRE_CUTPOINT to leave a test early.
///
/// Inherits publicly from exception: with the 'class' keyword the
/// default is private inheritance, which prevents the type from being
/// caught or reported as a standard exception.
class CNonException : public exception {
public:

};
201 
// Throw CNonException when the variable 'cutpoint' visible at the point
// of use equals X.  The do/while(0) wrapper makes the macro behave as
// one statement, so it cannot capture an 'else' that follows it; use it
// with a trailing semicolon.
#define BOOST_REQUIRE_CUTPOINT(X) \
    do { if (cutpoint == (X)) throw CNonException(); } while (0)
203 
205 
206 static void
208  CSeqDB & s,
209  const TIdList & ids,
210  int cutpoint)
211 {
212  int count1 = 0;
213 
214  ITERATE(TIdList, iter, ids) {
215  CRef<CSeq_id> seqid = *iter;
216 
217  BOOST_REQUIRE(seqid.NotEmpty());
218 
220 
221  int oid = -1;
222  bool found = s.SeqidToOid(*seqid, oid);
223 if(!found) {
224  cerr << seqid->GetSeqIdString() << endl;
225 }
226 
227  BOOST_REQUIRE(found);
228 
229  CRef<CBioseq> bs;
230 
232 
233  if (seqid->IsGi()) {
234  bs = s.GetBioseq(oid, seqid->GetGi());
235  } else {
236  bs = s.GetBioseq(oid);
237  }
238 
240 
241  CRef<CBlast_def_line_set> bdls = s.GetHdr(oid);
242 
243  BOOST_REQUIRE(bs.NotEmpty());
244  BOOST_REQUIRE(bdls.NotEmpty());
245 
247 
248  w.AddSequence(*bs);
249  w.SetDeflines(*bdls);
250 
251  count1++;
253 
254  if (count1 > 3) {
256  }
257 
258  if (count1 > g_NuclJ_OidCount) {
260  }
261  }
262 }
263 
264 // Copy the sequences listed in 'ids' (integers or FASTA Seq-ids) from
265 // the CSeqDB object to the CWriteDB object, using packed ncbi2na
266 // strings ('raw' data) as the intermediate data.
267 
268 static void
270  CSeqDBExpert & seqdb,
271  const TIdList & ids)
272 {
273  bool is_nucl = seqdb.GetSequenceType() == CSeqDB::eNucleotide;
274 
275  ITERATE(TIdList, iter, ids) {
276  CRef<CSeq_id> seqid = *iter;
277 
278  int oid = -1;
279  bool found = seqdb.SeqidToOid(*seqid, oid);
280 
281  BOOST_REQUIRE(found);
282 
283  string seq, ambig;
284 
285  s_FetchRawData(seqdb, oid, seq, ambig);
286  CRef<CBlast_def_line_set> bdls = seqdb.GetHdr(oid);
287 
288  BOOST_REQUIRE(! seq.empty());
289  BOOST_REQUIRE(ambig.empty() || is_nucl);
290  BOOST_REQUIRE(bdls.NotEmpty());
291 
292  w.AddSequence(seq, ambig);
293  w.SetDeflines(*bdls);
294  }
295 }
296 
297 // Serialize the provided ASN.1 object into a string.
298 
299 template<class ASNOBJ>
300 void s_Stringify(const ASNOBJ & a, string & s)
301 {
302  CNcbiOstrstream oss;
303  oss << MSerial_AsnText << a;
304  s = CNcbiOstrstreamToString(oss);
305 }
306 
307 // Deserialize the provided string into an ASN.1 object.
308 
309 template<class ASNOBJ>
310 void s_Unstringify(const string & s, ASNOBJ & a)
311 {
312  istringstream iss;
313  iss.str(s);
314  iss >> MSerial_AsnText >> a;
315 }
316 
317 // Duplicate the provided ASN.1 object (via {,de}serialization).
318 
319 template<class ASNOBJ>
320 CRef<ASNOBJ> s_Duplicate(const ASNOBJ & a)
321 {
322  CRef<ASNOBJ> newobj(new ASNOBJ);
323 
324  string s;
325  s_Stringify(a, s);
326  s_Unstringify(s, *newobj);
327 
328  return newobj;
329 }
330 
331 // Compare the two CBioseqs by comparing their serialized forms.
332 
333 void s_CompareBioseqs(CBioseq & src, CBioseq & dst)
334 {
335  string s1, s2;
336  s_Stringify(src, s1);
337  s_Stringify(dst, s2);
338 
339  BOOST_REQUIRE_EQUAL(s1, s2);
340 }
341 
342 // Test the database compared to a reference database, usually the
343 // database that provided the source data.
344 
345 void
347  const string & name,
348  const string & title)
349 {
350  CSeqDBExpert dst(name, src.GetSequenceType());
351 
352  for(int oid = 0; dst.CheckOrFindOID(oid); oid++) {
353  TGi gi = ZERO_GI;
354  int src_oid(0);
355 
356  bool rv1 = dst.OidToGi(oid, gi);
357  bool rv2 = src.GiToOid(gi, src_oid);
358 
359  BOOST_REQUIRE(rv1);
360  BOOST_REQUIRE(rv2);
361 
362  CRef<CBioseq> bss = src.GetBioseq(src_oid);
363  CRef<CBioseq> bsd = dst.GetBioseq(oid);
364 
365  s_CompareBioseqs(*bss, *bsd);
366  }
367 
368  BOOST_REQUIRE_EQUAL(dst.GetTitle(), title);
369 }
370 
371 // Remove the specified files.
372 
373 void s_RemoveFile(const string & f)
374 {
375  CDirEntry de(f);
377  /// @todo the test below fails, leaking resources
378  /// BOOST_REQUIRE(de.Exists() == false);
379 }
380 
381 void s_RemoveFiles(const vector<string> & files)
382 {
383  for(unsigned i = 0; i < files.size(); i++) {
384  s_RemoveFile(files[i]);
385  }
386 }
387 
388 // Check if the given file is already sorted.
389 
390 void s_CheckSorted(const string & fname)
391 {
392  CNcbiIfstream file(fname.c_str());
393 
394  string s, s2;
395 
396  while(NcbiGetlineEOL(file, s)) {
397  if (s.size() == 0) break;
398  BOOST_REQUIRE(s2 <= s);
399  s.swap(s2);
400  }
401 }
402 
// Return the part of 'data' that follows the last occurrence of
// 'delim', or an empty string if 'delim' does not occur in 'data'.
//
406 
string s_ExtractLast(const string & data, const string & delim)
{
    // Locate the final occurrence of the delimiter.
    const size_t at = data.rfind(delim);

    // No delimiter: nothing to extract.  Otherwise return everything
    // after it.
    return (at == string::npos)
        ? string()
        : data.substr(at + delim.size());
}
418 
419 // Check the files that make up a database volume.
420 //
421 // nsd/psd: Check that the file is in sorted order
422 
423 void s_CheckFiles(const vector<string> & files,
424  bool need_hash = false)
425 {
426  bool found_hash = false;
427 
428  for(unsigned i = 0; i < files.size(); i++) {
429  string ext = s_ExtractLast(files[i], ".");
430 
431  if (ext == "nsd" || ext == "psd") {
432  s_CheckSorted(files[i]);
433  }
434  if (ext == "nhd" || ext == "phd") {
435  s_CheckSorted(files[i]);
436  found_hash = true;
437  }
438  }
439 
440  if (need_hash) {
441  BOOST_REQUIRE(found_hash);
442  }
443 }
444 
445 // Do sanity checks appropriate for some files, then remove them.
446 
447 void s_WrapUpFiles(const vector<string> & files)
448 {
449  s_CheckFiles(files);
450  s_RemoveFiles(files);
451 }
452 
453 // Like s_WrapUpFiles but starting with the DB.
454 
456 {
457  vector<string> files;
458  db.ListFiles(files);
459  s_WrapUpFiles(files);
460 }
461 
463 public:
465  {
466  }
467 
469  {
470  if (m_Db.NotEmpty()) {
471  s_WrapUpDb(*m_Db);
472  }
473  }
474 
475  void SetDb(CWriteDB & db)
476  {
477  m_Db.Reset(& db);
478  }
479 
480 private:
482 };
483 
484 // Copy the specified ids (int -> GI, string -> FASTA Seq-id) from the
485 // source database (src_name) to a new CWriteDB object, then perform
486 // checks on the resulting database and remove it.
487 
488 static void
490  bool is_protein,
491  bool raw_data,
492  const string & src_name,
493  const string & dst_name,
494  const string & title,
495  int cutpoint = 99)
496 {
497  CWrapperUpper wrap;
498 
500 
    // Ensure no strange files are left after test execution
502  string basename = dst_name;
503  basename += (is_protein ? ".p" : ".n");
504  const char* ext[] = { "si", "sd", "og", "ni", "nd" };
505  for (size_t i = 0; i < (sizeof(ext)/sizeof(*ext)); i++) {
506  string fname(basename+string(ext[i]));
507  CFileDeleteAtExit::Add(fname);
508  }
509 
510  CSeqDBExpert src(src_name, (is_protein
513 
514  vector<string> files;
515 
516  CRef<CWriteDB> db;
517 
519 
520  db.Reset(new CWriteDB(dst_name,
521  (is_protein
524  title,
526 
527  wrap.SetDb(*db);
528 
530 
531  if (raw_data) {
532  s_DupIdsRaw(*db, src, ids);
533  } else {
534  s_DupIdsBioseq(*db, src, ids, cutpoint);
535  }
536 
538 
539  db->Close();
540  db->ListFiles(files);
541  db.Reset();
542 
544 
545  s_TestDatabase(src, dst_name, title);
546 
548 }
549 
550 // Get and return a CScope with local copies of test sequences loaded.
551 
553 {
555 
556  CRef<CScope> scope(new CScope(*obj_mgr));
557 
558  unique_ptr<CObjectIStream> ois
559  (CObjectIStream::Open(eSerial_AsnText, "data/gi129295.asn"));
560  CRef<CSeq_entry> entry(new CSeq_entry);
561 
562  *ois >> *entry;
563  scope->AddTopLevelSeqEntry(*entry);
564 
565  ois.reset(CObjectIStream::Open(eSerial_AsnText, "data/gi129296.asn"));
566  entry.Reset(new CSeq_entry);
567 
568  *ois >> *entry;
569  scope->AddTopLevelSeqEntry(*entry);
570 
571  return scope;
572 }
573 
574 static void s_BuildIds(TIdList & ids, TGi * gis)
575 {
576  for(TGi * ptr = gis; *ptr != ZERO_GI; ptr ++) {
577  ids.push_back(s_GiToSeqId(*ptr));
578  }
579 }
580 
581 static void s_BuildIds(TIdList & ids, const char ** gis)
582 {
583  for(const char ** ptr = gis; *ptr; ptr ++) {
584  ids.push_back(s_AccToSeqId(*ptr));
585  }
586 }
587 
588 CRef<CBioseq> s_FastaStringToBioseq(const string & str, bool protein)
589 {
590  CNcbiIstrstream istr(str);
591 
592  CRef<ILineReader> lr(new CStreamLineReader(istr));
593 
594  typedef CFastaReader::EFlags TFlags;
595 
596  TFlags flags = (TFlags) (protein ? CFastaReader::fAssumeProt : CFastaReader::fAssumeNuc);
597 
598  CFastaReader fr(*lr, flags);
599 
600  BOOST_REQUIRE(! lr->AtEOF());
601  CRef<CSeq_entry> entry = fr.ReadOneSeq();
602 
603  BOOST_REQUIRE(! entry.Empty());
604  BOOST_REQUIRE(entry->IsSeq());
605 
606  CRef<CBioseq> bs(& entry->SetSeq());
607 
608  return bs;
609 }
610 
611 
612 //
613 // Actual test cases.
614 //
615 
616 static void s_NuclBioseqDupSwitch(int cutpoint)
617 {
618 
619  TGi gis[] = {
620  78883515, 78883517, /*71143095,*/ 24431485, 19110479, 15054463,
621  15054465, 15054467, 15054469, 15054471, 19570808, 18916476,
622  1669608, 1669610, 1669612, 1669614, 1669616, 10944307,
623  10944309, 10944311, 19909844, 19909846, 19909860, 19911180,
624  19911220, 19911222, 19911224, 57472140, 20126670, 20387092,
625  57639630, 57639632, 7670507, 2394289, 21280378, 21327938,
626  6518520, 20086356, 20086357, 21392391, 20086359, 19110509,
627  21623739, 21623761, 38303844, 38197377, 56788779, 57032781,
628  57870443, 56789136, 0
629  };
630 
631  TIdList ids;
632  s_BuildIds(ids, gis);
633 
635 
636  const string srcname("data/writedb_nucl");
637  const string dstname("w-nucl-bs");
638  const string title("bioseq nucleotide dup");
639 
640  s_DupSequencesTest(ids,
641  false,
642  false,
643  srcname,
644  dstname,
645  title,
646  cutpoint);
647 
649 
650  const string dstname2("w-nucl-raw");
651  const string title2("raw nucleotide dup");
652  s_DupSequencesTest(ids,
653  false,
654  true,
655  srcname,
656  dstname2,
657  title2,
658  cutpoint);
659 
661 }
662 
663 
664 BOOST_AUTO_TEST_SUITE(writedb)
665 
666 #if 0
667 BOOST_AUTO_TEST_CASE(NuclBioseqDupZ)
668 {
669 
670  try {
672  }
673  catch(CNonException &) {
674  }
675 }
676 
677 BOOST_AUTO_TEST_CASE(NuclBioseqDupA)
678 {
679 
680  try {
682  }
683  catch(CNonException &) {
684  }
685 }
686 
687 BOOST_AUTO_TEST_CASE(NuclBioseqDupB)
688 {
689 
690  try {
692  }
693  catch(CNonException &) {
694  }
695 }
696 
697 BOOST_AUTO_TEST_CASE(NuclBioseqDupC)
698 {
699 
700  try {
702  }
703  catch(CNonException &) {
704  }
705 }
706 
707 BOOST_AUTO_TEST_CASE(NuclBioseqDupD)
708 {
709 
710  try {
712  }
713  catch(CNonException &) {
714  }
715 }
716 
717 BOOST_AUTO_TEST_CASE(NuclBioseqDupE)
718 {
719 
720  try {
722  }
723  catch(CNonException &) {
724  }
725 }
726 
727 BOOST_AUTO_TEST_CASE(NuclBioseqDupF)
728 {
729 
730  try {
732  }
733  catch(CNonException &) {
734  }
735 }
736 
737 BOOST_AUTO_TEST_CASE(NuclBioseqDupG)
738 {
739 
740  try {
742  }
743  catch(CNonException &) {
744  }
745 }
746 
747 BOOST_AUTO_TEST_CASE(NuclBioseqDupH)
748 {
749 
750  try {
752  }
753  catch(CNonException &) {
754  }
755 }
756 #endif
757 
758 BOOST_AUTO_TEST_CASE(NuclBioseqDupI)
759 {
760 
761  try {
763  }
764  catch(CNonException &) {
765  }
766 }
767 
768 BOOST_AUTO_TEST_CASE(NuclBioseqDupJ4)
769 {
770 
771  g_NuclJ_OidCount = 4;
772 
773  try {
775  }
776  catch(CNonException &) {
777  }
778 }
779 
780 BOOST_AUTO_TEST_CASE(NuclBioseqDupJ8)
781 {
782 
783  g_NuclJ_OidCount = 8;
784 
785  try {
787  }
788  catch(CNonException &) {
789  }
790 }
791 
792 BOOST_AUTO_TEST_CASE(NuclBioseqDupJ12)
793 {
794 
795  g_NuclJ_OidCount = 12;
796 
797  try {
799  }
800  catch(CNonException &) {
801  }
802 }
803 
804 BOOST_AUTO_TEST_CASE(NuclBioseqDupJ16)
805 {
806 
807  g_NuclJ_OidCount = 16;
808 
809  try {
811  }
812  catch(CNonException &) {
813  }
814 }
815 
816 BOOST_AUTO_TEST_CASE(NuclBioseqDupJ20)
817 {
818 
819  g_NuclJ_OidCount = 20;
820 
821  try {
823  }
824  catch(CNonException &) {
825  }
826 }
827 
828 BOOST_AUTO_TEST_CASE(NuclBioseqDupJ24)
829 {
830 
831  g_NuclJ_OidCount = 24;
832 
833  try {
835  }
836  catch(CNonException &) {
837  }
838 }
839 
840 BOOST_AUTO_TEST_CASE(NuclBioseqDupJ28)
841 {
842 
843  g_NuclJ_OidCount = 28;
844 
845  try {
847  }
848  catch(CNonException &) {
849  }
850 }
851 
852 BOOST_AUTO_TEST_CASE(NuclBioseqDupJ32)
853 {
854 
855  g_NuclJ_OidCount = 32;
856 
857  try {
859  }
860  catch(CNonException &) {
861  }
862 }
863 
864 BOOST_AUTO_TEST_CASE(NuclBioseqDupJ33)
865 {
866 
867  g_NuclJ_OidCount = 33;
868 
869  try {
871  }
872  catch(CNonException &) {
873  }
874 }
875 
876 BOOST_AUTO_TEST_CASE(NuclBioseqDupJ34)
877 {
878 
879  g_NuclJ_OidCount = 34;
880 
881  try {
883  }
884  catch(CNonException &) {
885  }
886 }
887 
888 BOOST_AUTO_TEST_CASE(NuclBioseqDupJ35)
889 {
890 
891  g_NuclJ_OidCount = 35;
892 
893  try {
895  }
896  catch(CNonException &) {
897  }
898 }
899 
900 BOOST_AUTO_TEST_CASE(NuclBioseqDupJ36)
901 {
902 
903  g_NuclJ_OidCount = 36;
904 
905  try {
907  }
908  catch(CNonException &) {
909  }
910 }
911 
912 BOOST_AUTO_TEST_CASE(NuclBioseqDupJ40)
913 {
914 
915  g_NuclJ_OidCount = 40;
916 
917  try {
919  }
920  catch(CNonException &) {
921  }
922 }
923 
924 BOOST_AUTO_TEST_CASE(NuclBioseqDupJ44)
925 {
926 
927  g_NuclJ_OidCount = 44;
928 
929  try {
931  }
932  catch(CNonException &) {
933  }
934 }
935 
936 BOOST_AUTO_TEST_CASE(NuclBioseqDupJ45)
937 {
938 
939  g_NuclJ_OidCount = 45;
940 
941  try {
943  }
944  catch(CNonException &) {
945  }
946 }
947 
948 BOOST_AUTO_TEST_CASE(NuclBioseqDupJ46)
949 {
950 
951  g_NuclJ_OidCount = 46;
952 
953  try {
955  }
956  catch(CNonException &) {
957  }
958 }
959 
960 BOOST_AUTO_TEST_CASE(NuclBioseqDupJ47)
961 {
962 
963  g_NuclJ_OidCount = 47;
964 
965  try {
967  }
968  catch(CNonException &) {
969  }
970 }
971 
972 BOOST_AUTO_TEST_CASE(NuclBioseqDupJ48)
973 {
974 
975  g_NuclJ_OidCount = 48;
976 
977  try {
979  }
980  catch(CNonException &) {
981  }
982 }
983 
984 BOOST_AUTO_TEST_CASE(NuclBioseqDupJ49)
985 {
986 
987  g_NuclJ_OidCount = 49;
988 
989  try {
991  }
992  catch(CNonException &) {
993  }
994 }
995 
996 BOOST_AUTO_TEST_CASE(NuclBioseqDupJ50)
997 {
998 
999  g_NuclJ_OidCount = 50;
1000 
1001  try {
1003  }
1004  catch(CNonException &) {
1005  }
1006 }
1007 
1008 BOOST_AUTO_TEST_CASE(NuclBioseqDupJ)
1009 {
1010 
1011  try {
1013  }
1014  catch(CNonException &) {
1015  }
1016 }
1017 
1018 BOOST_AUTO_TEST_CASE(NuclBioseqDupK)
1019 {
1020 
1021  try {
1023  }
1024  catch(CNonException &) {
1025  }
1026 }
1027 
1028 #if 0
1029 BOOST_AUTO_TEST_CASE(NuclBioseqDupL)
1030 {
1031 
1032  try {
1034  }
1035  catch(CNonException &) {
1036  }
1037 }
1038 
1039 BOOST_AUTO_TEST_CASE(NuclBioseqDupM)
1040 {
1041 
1042  try {
1044  }
1045  catch(CNonException &) {
1046  }
1047 }
1048 
1049 BOOST_AUTO_TEST_CASE(NuclBioseqDupN)
1050 {
1051 
1052  try {
1054  }
1055  catch(CNonException &) {
1056  }
1057 }
1058 #endif
1059 
1060 BOOST_AUTO_TEST_CASE(NuclBioseqDup)
1061 {
1062 
1064 }
1065 
1066 BOOST_AUTO_TEST_CASE(ProtBioseqDup)
1067 {
1068 
1069  TGi gis[] = {
1070  1477444, 1669609, 1669611, 1669615, 1669617, 7544146,
1071  22652804, /*1310870,*/ 3114354, 3891778, 3891779, 81294290,
1072  81294330, 49089974, 62798905, 3041810, 7684357, 7684359,
1073  7684361, 7684363, 7544148, 3452560, 3452564, 6681587,
1074  6681590, 6729087, 7259315, 2326257, 3786310, 3845607,
1075  13516469, 2575863, 4049591, 3192363, 1871126, 2723484,
1076  6723181, 11125717, 2815400, 1816433, 3668177, 6552408,
1077  13365559, 8096667, 3721768, 9857600, 2190043, 3219276,
1078  10799943, 10799945, 0
1079  };
1080 
1081  TIdList ids;
1082  s_BuildIds(ids, gis);
1083 
1084  s_DupSequencesTest(ids,
1085  true,
1086  false,
1087  "data/writedb_prot",
1088  "w-prot-bs",
1089  "bioseq protein dup");
1090 
1091  s_DupSequencesTest(ids,
1092  true,
1093  true,
1094  "data/writedb_prot",
1095  "w-prot-raw",
1096  "raw protein dup");
1097 }
1098 
1100 {
1101 
1102  CWriteDB fails("failing-db",
1104  "title",
1106 
1107  CRef<CBioseq> bs(new CBioseq);
1108  fails.AddSequence(*bs);
1109 
1110  BOOST_REQUIRE_THROW(fails.Close(), CWriteDBException);
1111 }
1112 
1114 {
1115 
1116  CWriteDB db("from-loader",
1118  "title",
1120 
1121  CRef<CScope> scope = s_GetScope();
1122 
1123  // Normal bioseq handle.
1124 
1125  CRef<CSeq_id> id1(new CSeq_id("gi|129295"));
1126  CBioseq_Handle bsh1 = scope->GetBioseqHandle(*id1);
1127  db.AddSequence(bsh1);
1128 
1129  // Clean up.
1130 
1131  db.Close();
1132  s_WrapUpDb(db);
1133 }
1134 
1135 BOOST_AUTO_TEST_CASE(BioseqHandleAndSeqVectorNonWriteDB)
1136 {
1137 
1138  // This is a modified version of the following test. The
1139  // assumption is that some errors occur due to environmental
1140  // factors. Hopefully this test will help to determine the
1141  // library in which these intermittent errors occur.
1142 
1143  CRef<CScope> scope = s_GetScope();
1144 
1145  CRef<CSeq_id> id2(new CSeq_id("gi|129296"));
1146  CBioseq_Handle bsh2 = scope->GetBioseqHandle(*id2);
1147  CConstRef<CBioseq> bs1c = bsh2.GetCompleteBioseq();
1148 
1149  CRef<CBioseq> bs1 = s_Duplicate(*bs1c);
1150  CSeqVector sv(bsh2);
1151 
1152  string bytes;
1153  sv.GetSeqData(0, sv.size(), bytes);
1154 
1155  BOOST_REQUIRE(bytes.size() == sv.size());
1156 }
1157 
1158 BOOST_AUTO_TEST_CASE(BioseqHandleAndSeqVector)
1159 {
1160 
1161  CRef<CScope> scope = s_GetScope();
1162 
1163  // Bioseq + CSeqVector.
1164 
1165  CRef<CSeq_id> id2(new CSeq_id("gi|129296"));
1166  CBioseq_Handle bsh2 = scope->GetBioseqHandle(*id2);
1167  CConstRef<CBioseq> bs1c = bsh2.GetCompleteBioseq();
1168 
1169  CRef<CBioseq> bs1 = s_Duplicate(*bs1c);
1170  CSeqVector sv(bsh2);
1171 
1172  string bytes;
1173  sv.GetSeqData(0, sv.size(), bytes);
1174 }
1175 
1176 BOOST_AUTO_TEST_CASE(BioseqHandleAndSeqVectorWriteDB)
1177 {
1178 
1179  CWriteDB db("from-loader",
1181  "title",
1183 
1184  CRef<CScope> scope = s_GetScope();
1185 
1186  // Bioseq + CSeqVector.
1187 
1188  CRef<CSeq_id> id2(new CSeq_id("gi|129296"));
1189  CBioseq_Handle bsh2 = scope->GetBioseqHandle(*id2);
1190  CConstRef<CBioseq> bs1c = bsh2.GetCompleteBioseq();
1191 
1192  CRef<CBioseq> bs1 = s_Duplicate(*bs1c);
1193  CSeqVector sv(bsh2);
1194 
1195  // Make sure CSeqVector is exercised by removing the Seq-data.
1196 
1197  bs1->SetInst().ResetSeq_data();
1198  db.AddSequence(*bs1, sv);
1199 
1200  // Clean up.
1201 
1202  db.Close();
1203  s_WrapUpDb(db);
1204 }
1205 
1207 {
1208 
1209  string nm = "pigs";
1210  vector<string> files;
1211 
1212  {
1213  CSeqDB wdb("data/writedb_prot", CSeqDB::eProtein);
1214 
1215  CWriteDB db(nm,
1217  "title",
1219 
1220  db.AddSequence(*wdb.GiToBioseq(129295));
1221  db.SetPig(101);
1222 
1223  db.AddSequence(*wdb.GiToBioseq(129296));
1224  db.SetPig(102);
1225 
1226  db.AddSequence(*wdb.GiToBioseq(129297));
1227  db.SetPig(103);
1228 
1229  db.Close();
1230  db.ListFiles(files);
1231  }
1232 
1233  CSeqDB db2(nm, CSeqDB::eProtein);
1234 
1235  int oid = 0;
1236 
1237  for(; db2.CheckOrFindOID(oid); oid++) {
1238  int pig(0);
1239  vector<TGi> gis;
1240 
1241  bool rv1 = db2.OidToPig(oid, pig);
1242  db2.GetGis(oid, gis, false);
1243 
1244  bool found_gi = false;
1245  for(unsigned i = 0; i < gis.size(); i++) {
1246  if (gis[i] == 129295 + oid) {
1247  found_gi = true;
1248  }
1249  }
1250 
1251  BOOST_REQUIRE(rv1);
1252  BOOST_REQUIRE(found_gi);
1253  BOOST_REQUIRE_EQUAL(pig-oid, 101);
1254  }
1255 
1256  BOOST_REQUIRE_EQUAL(oid, 3);
1257 
1258  s_WrapUpFiles(files);
1259 }
1260 
1261 // Test multiple volume construction and maximum letter limit.
1262 
1264 {
1265 
1266  CSeqDB wdb("data/writedb_prot", CSeqDB::eProtein);
1267 
1268  CWriteDB db("multivol",
1270  "title",
1272 
1273  db.SetMaxVolumeLetters(500);
1274 
1275  int gis[] = { 129295, 129296, 129297, 129299, 0 };
1276 
1277  Uint8 letter_count = 0;
1278 
1279  for(int i = 0; gis[i]; i++) {
1280  int oid(0);
1281  wdb.GiToOid(gis[i], oid);
1282 
1283  db.AddSequence(*wdb.GetBioseq(oid));
1284  letter_count += wdb.GetSeqLength(oid);
1285  }
1286 
1287  db.Close();
1288 
1289  vector<string> v;
1290  vector<string> f;
1291  db.ListVolumes(v);
1292  db.ListFiles(f);
1293 
1294  BOOST_REQUIRE_EQUAL(3, (int) v.size());
1295  BOOST_REQUIRE_EQUAL(v[0], string("multivol.00"));
1296  BOOST_REQUIRE_EQUAL(v[1], string("multivol.01"));
1297  BOOST_REQUIRE_EQUAL(v[2], string("multivol.02"));
1298 
1299  BOOST_REQUIRE_EQUAL(25, (int) f.size());
1300 
1301  // Check resulting db.
1302 
1303  CRef<CSeqDB> seqdb(new CSeqDB("multivol", CSeqDB::eProtein));
1304 
1305  int oids(0);
1306  Uint8 letters(0);
1307 
1308  seqdb->GetTotals(CSeqDB::eUnfilteredAll, & oids, & letters, false);
1309 
1310  BOOST_REQUIRE_EQUAL(oids, 4);
1311  BOOST_REQUIRE_EQUAL(letter_count, letters);
1312 
1313  seqdb.Reset();
1314 
1315  s_WrapUpFiles(f);
1316 }
1317 
1319 {
1320 
1321  CRef<CSeq_id> seqid(new CSeq_id("pat|us|123|456"));
1322  vector<string> files;
1323 
1324  {
1325  CRef<CWriteDB> writedb
1326  (new CWriteDB("uspatid",
1328  "patent id test",
1330 
1331  CSeqDB seqdb("data/writedb_prot", CSeqDB::eProtein);
1332 
1333  CRef<CBioseq> bs = seqdb.GiToBioseq(129297);
1334 
1337  bdls->Set().push_back(dl);
1338 
1339  dl->SetTitle("Some protein sequence");
1340  dl->SetSeqid().push_back(seqid);
1341  dl->SetTaxid(12345);
1342 
1343  writedb->AddSequence(*bs);
1344  writedb->SetDeflines(*bdls);
1345 
1346  writedb->Close();
1347  writedb->ListFiles(files);
1348  BOOST_REQUIRE(files.size() != 0);
1349  }
1350 
1351  CSeqDB seqdb("uspatid", CSeqDB::eProtein);
1352  int oid(-1);
1353  bool found = seqdb.SeqidToOid(*seqid, oid);
1354 
1355  BOOST_REQUIRE_EQUAL(found, true);
1356  BOOST_REQUIRE_EQUAL(oid, 0);
1357 
1358  s_WrapUpFiles(files);
1359 }
1360 
1362 {
1363 
1364  // This checks whether the following IDs are fetchable from the
1365  // given database. It will fail if either the production blast
1366  // databases (i.e. found at $BLASTDB) are corrupted or if the
1367  // newly produced database is corrupted. It will also fail if any
1368  // of the IDs are legitimately missing (removed by the curators),
1369  // in which case the given ID must be removed from the list.
1370 
1371  // However, the selection of these specific IDs is not arbitrary;
1372  // these are several sets of IDs which have a common 6 letter
1373  // prefix. The test will not work correctly if these IDs are
1374  // replaced with IDs that don't have this trait, if too many are
1375  // removed, or if the IDs are put in sorted order.
1376 
1377  // A null terminated array of NUL terminated strings.
1378 
1379  const char* accs[] = {
1380  /*"AAC76335.1",*/ "AAC77159.1", /*"AAA58145.1",*/ "AAC76880.1",
1381  "AAC76230.1", "AAC76373.1", "AAC77137.1", "AAC76637.2",
1382  "AAA58101.1", /*"AAC76329.1",*/ "AAC76702.1", "AAC77109.1",
1383  "AAC76757.1", "AAA58162.1", "AAC76604.1", "AAC76539.1",
1384  "AAA24224.1", /*"AAC76351.1",*/ "AAC76926.1", "AAC77047.1",
1385  /*"AAC76390.1", "AAC76195.1",*/ "AAA57930.1", "AAC76134.1",
1386  "AAC76586.2", "AAA58123.1", "AAC76430.1", "AAA58107.1",
1387  /*"AAC76765.1",*/ "AAA24272.1", "AAC76396.2", /*"AAA24183.1",*/
1388  "AAC76918.1", "AAC76727.1", /*"AAC76161.1",*/ "AAA57964.1",
1389  "AAA24251.1", 0
1390  };
1391 
1392  TIdList ids;
1393  s_BuildIds(ids, accs);
1394 
1395  s_DupSequencesTest(ids,
1396  true,
1397  false,
1398  "data/writedb_prot",
1399  "w-isam-sort-bs",
1400  "test of string ISAM sortedness");
1401 }
1402 
1404 {
1405 
1406  // This checks if duplicate IDs (AAC76373 and AAA58145) are found
1407 
1408  const char* accs[] = {
1409  "AAC76335.1", "AAC77159.1", "AAA58145.1", "AAC76880.1",
1410  "AAC76230.1", "AAC76373.1", "AAC77137.1", "AAC76637.2",
1411  "AAA58101.1", "AAC76329.1", "AAC76702.1", "AAC77109.1",
1412  "AAC76757.1", "AAA58162.1", "AAC76604.1", "AAC76539.1",
1413  "AAA24224.1", "AAC76351.1", "AAC76926.1", "AAC77047.1",
1414  "AAC76390.1", "AAC76195.1", "AAA57930.1", "AAC76134.1",
1415  "AAC76586.2", "AAA58123.1", "AAC76430.1", "AAA58107.1",
1416  "AAC76765.1", "AAA24272.1", "AAC76396.2", "AAA24183.1",
1417  "AAC76918.1", "AAC76727.1", "AAC76161.1", "AAA57964.1",
1418  "AAA24251.1", 0
1419  };
1420 
1421  TIdList ids;
1422  s_BuildIds(ids, accs);
1423 
1424  BOOST_REQUIRE_THROW(s_DupSequencesTest(ids,
1425  true,
1426  false,
1427  "data/writedb_prot",
1428  "w-isam-sort-bs",
1429  "test of string ISAM sortedness"),
1431 }
1432 
1434 {
1435 
1436  CSeqDBExpert wdb_p("data/writedb_prot", CSeqDB::eProtein);
1437  CSeqDBExpert wdb_n("data/writedb_nucl", CSeqDB::eNucleotide);
1438 
1439  TGi prot_gis[] = { 129295, 129296, 129297, 0 };
1440  TGi nucl_gis[] = { 555, 556, 405832, 0 };
1441 
1442  TIdList prot_ids, nucl_ids;
1443  s_BuildIds(prot_ids, prot_gis);
1444  s_BuildIds(nucl_ids, nucl_gis);
1445 
1446  typedef CWriteDB::EIndexType TType;
1447 
1448  TType itype = TType(CWriteDB::eFullWithTrace |
1450 
1451  CRef<CWriteDB> prot(new CWriteDB("w-prot-hash",
1453  "test of hash ISAMs (P)",
1454  itype));
1455 
1456  CRef<CWriteDB> nucl(new CWriteDB("w-nucl-hash",
1458  "test of hash ISAMs (N)",
1459  itype));
1460 
1461  s_DupIdsBioseq(*prot, wdb_p, prot_ids, 99);
1462  s_DupIdsBioseq(*nucl, wdb_n, nucl_ids, 99);
1463 
1464  prot->Close();
1465  nucl->Close();
1466 
1467  s_WrapUpDb(*prot);
1468  s_WrapUpDb(*nucl);
1469 }
1470 
1471 BOOST_AUTO_TEST_CASE(MismatchedDb_Bioseq) // per SB-1330
1472 {
1473  vector<string> files;
1474  string title = "pdb-id";
1475  string I1("pdb|3E3Q|BB"), T1("Lower case chain b");
1476 
1477  {
1478  CRef<CWriteDB> wr(new CWriteDB(title,
1480  "title",
1482 
1483  // Build a multi-defline bioseq and read it with CFastaReader.
1484 
1485  string str = ">" + I1 + " " + T1 + "\n" + "ELVISLIVES\n";
1486 
1488 
1489  BOOST_REQUIRE_THROW(wr->AddSequence(*bs), CWriteDBException);
1490  wr->Close();
1491 
1492  // Clean up.
1493 
1494  wr->ListFiles(files);
1495  }
1496 
1497  s_RemoveFiles(files);
1498 }
1499 
1500 BOOST_AUTO_TEST_CASE(PDBIdLowerCase)
1501 {
1502 
1503  vector<string> files;
1504 
1505  string title = "pdb-id";
1506 
1507  string
1508  I1("pdb|3E3Q|b"), T1("Lower case chain b");
1509 
1510  {
1511  CRef<CWriteDB> wr(new CWriteDB(title,
1513  "title",
1515 
1516  // Build a multi-defline bioseq and read it with CFastaReader.
1517 
1518  string str = ">" + I1 + " " + T1 + "\n" + "ELVISLIVES\n";
1519 
1521 
1522  wr->AddSequence(*bs);
1523  wr->Close();
1524 
1525  // Clean up.
1526 
1527  wr->ListFiles(files);
1528  }
1529 
1530  {
1531  CSeqDB rd("pdb-id", CSeqDB::eProtein);
1532  BOOST_REQUIRE(rd.GetNumOIDs() == 1);
1533 
1534  vector<int> oids;
1535  rd.AccessionToOids("3e3q_b", oids);
1536 
1537  BOOST_REQUIRE(oids.size() == 1);
1538  }
1539 
1540  s_WrapUpFiles(files);
1541 }
1542 
// Test case: writes one record whose defline text packs two ID/title pairs
// separated by '\001', then reopens "from-fasta-reader" as a protein
// database and requires that the stored header holds exactly one defline
// whose title is T1 and whose only Seq-id prints as I1.
// NOTE(review): listing lines 1556, 1558, 1567 and 1581 are absent from this
// text, so the remaining CWriteDB constructor arguments and the definitions
// of `bs` and `bdls` are not visible here.
1543 BOOST_AUTO_TEST_CASE(FastaReaderBioseq)
1544 {
1545 
1546  vector<string> files;
1547 
1548  string title = "from-fasta-reader";
1549 
1550  string
1551  I1("gi|123"), T1("One two three."),
1552  I2("gi|124"), T2("One two four.");
1553 
1554  {
1555  CRef<CWriteDB> wr(new CWriteDB(title,
1557  "title",
1559 
1560  // Build a multi-defline bioseq and read it with CFastaReader.
1561 
1562  string str =
1563  ">" + I1 + " " + T1 +
1564  "\001" + I2 + " " + T2 + "\n" +
1565  "ELVISLIVES\n";
1566 
1568 
1569  wr->AddSequence(*bs);
1570  wr->Close();
1571 
1572  // Collect the names of the files the writer reports; they are handed
1573  // to s_WrapUpFiles() at the end of the test.
1574  wr->ListFiles(files);
1575  }
1576 
1577  {
1578  CSeqDB rd("from-fasta-reader", CSeqDB::eProtein);
1579  BOOST_REQUIRE(rd.GetNumOIDs() == 1);
1580 
1582  rd.GetHdr(0);
1583 
1584  // Prefer BOOST_REQUIRE_EQUAL(a, b) over BOOST_REQUIRE(a == b).
1585  // The former will print the non-equal values for you, the latter
1586  // does not.
1587  BOOST_REQUIRE_EQUAL(bdls->Get().size(), 1);
1588  BOOST_REQUIRE_EQUAL(bdls->Get().front()->GetTitle(), T1);
1589  BOOST_REQUIRE_EQUAL(bdls->Get().front()->GetSeqid().size(), 1);
1590  BOOST_REQUIRE_EQUAL(bdls->Get().front()->GetSeqid().front()->AsFastaString(), I1);
1591  }
1592 
1593  s_WrapUpFiles(files);
1594 }
1595 
// Test case: appends ten IDs to each of two list builders (1 << 2i to blb4,
// 1 << 4i to blb8), writes them to "test4.til" and "test8.til", and compares
// a hex dump of each file against the expected text.  Both files are removed
// at the end.
// NOTE(review): listing lines 1602 and 1603 are absent from this text, so
// the declarations of `blb4` and `blb8` are not visible here.
1596 BOOST_AUTO_TEST_CASE(BinaryListBuilder)
1597 {
1598 
1599  string fn4("test4.til"), fn8("test8.til");
1600 
1601  {
1604 
1605  for(int i = 0; i<10; i++) {
  // Largest values: 1 << 18 for blb4, 1 << 36 for blb8 (needs > 32 bits).
1606  blb4.AppendId(Int8(1) << (i*2));
1607  blb8.AppendId(Int8(1) << (i*4));
1608  }
1609 
1610  blb4.Write(fn4);
1611  blb8.Write(fn8);
1612  }
1613 
  // Both files are dumped with the same (4, 16) arguments.
1614  string h4 = s_HexDumpFile(fn4, 4, 16);
1615  string h8 = s_HexDumpFile(fn8, 4, 16);
1616 
1617  // The FF...FD symbol indicates a 4 byte TI list; the FF..FC
1618  // symbol is the eight byte version.
1619 
1620  BOOST_REQUIRE(h4 ==
1621  "FFFFFFFD A "
1622  "1 4 10 40 100 "
1623  "400 1000 4000 10000 40000");
1624 
1625  BOOST_REQUIRE(h8 ==
1626  "FFFFFFFC A "
1627  "0 1 0 10 0 100 0 1000 0 10000 "
1628  "0 100000 0 1000000 0 10000000 1 0 10 0");
1629 
1630  CFile(fn4).Remove();
1631  CFile(fn8).Remove();
1632 }
1633 
// Test case: builds two databases from five "gnl|ti|<number>" sequences
// each -- one whose numbers fit in 32 bits and one whose numbers do not --
// and compares hex dumps of the resulting .nti and .ntd files against
// expected text.
// NOTE(review): listing lines 1675, 1677, 1680 and 1682 are absent from this
// text, so two arguments of each CWriteDB constructor call are not visible.
1634 BOOST_AUTO_TEST_CASE(FourAndEightByteTis)
1635 {
1636 
1637  typedef pair<string, string> TPair;
1638  vector< TPair > ids48;
1639 
1640  // Generate gnl|ti# IDs where # is 1234*2^N for db4, and
1641  // 1234*1000^N for db8.
1642 
1643  {
1644  Int8 a4(1234), b4(2), a8(1234), b8(1000);
1645 
1646  string prefix = "gnl|ti|";
1647 
1648  for(int i = 0; i < 5; i++) {
1649  TPair p;
1650  p.first = prefix + NStr::Int8ToString(a4);
1651  p.second = prefix + NStr::Int8ToString(a8);
1652 
1653  ids48.push_back(p);
  // Remember the values just used so growth can be checked below.
1654  Int8 p4(a4), p8(a8);
1655 
1656  a4 *= b4;
1657  a8 *= b8;
1658 
1659  // Check for overflow.
1660 
1661  BOOST_REQUIRE(a4 > p4);
1662  BOOST_REQUIRE(a8 > p8);
1663  }
1664 
1665  // Make sure we really do have 32 and 64 bit IDs.
1666 
1667  BOOST_REQUIRE((a4 >> 32) == 0);
1668  BOOST_REQUIRE((a8 >> 32) != 0);
1669  }
1670 
1671  string dbname4 = "test-db-short-tis";
1672  string dbname8 = "test-db-long-tis";
1673 
1674  CWriteDB db4(dbname4,
1676  dbname4 + " database.",
1678 
1679  CWriteDB db8(dbname8,
1681  dbname8 + " database.",
1683 
1684  string iupac = "GATTACA";
1685 
  // Every sequence has the same residues; only the ID differs.
1686  ITERATE(vector< TPair >, iter, ids48) {
1687  string f4 = string(">") + iter->first + " test\n" + iupac + "\n";
1688  string f8 = string(">") + iter->second + " test\n" + iupac + "\n";
1689 
1690  db4.AddSequence( *s_FastaStringToBioseq(f4, false) );
1691  db8.AddSequence( *s_FastaStringToBioseq(f8, false) );
1692  }
1693 
1694  db4.Close();
1695  db8.Close();
1696 
1697  // Use 4 byte dumps for the (mixed field width) index files.
1698 
1699  string index4 = s_HexDumpFile(dbname4 + ".nti", 4, 16);
1700  string index8 = s_HexDumpFile(dbname8 + ".nti", 4, 16);
1701 
  // Expected dumps: i4/i8 for the index files, d4/d8 for the data files.
1702  string
1703  i4("1 0 28 5 1 100 0 0 0 4D2 0 FFFFFFFF 0"),
1704  i8("1 5 3C 5 1 100 0 0 0 0 4D2 0 FFFFFFFF FFFFFFFF 0"),
1705  d4("1234 0 2468 1 4936 2 9872 3 19744 4"),
1706  d8("1234 0 1234000 1 1234000000 2 1234000000000 3 1234000000000000 4");
1707 
1708  BOOST_REQUIRE(index4 == i4);
1709  BOOST_REQUIRE(index8 == i8);
1710 
1711  vector<int> overlay;
1712  overlay.push_back(8);
1713  overlay.push_back(4);
1714 
1715  // The 32-bit TI data file is uniformly 4 bytes. The 8 byte file
1716  // alternates between 8 and 4 byte fields.
1717 
1718  string data4 = s_HexDumpFile(dbname4 + ".ntd", 4, 10);
1719  string data8 = s_HexDumpFile(dbname8 + ".ntd", overlay, 10);
1720 
  // The databases are wrapped up before the data-file dumps are checked.
1721  s_WrapUpDb(db4);
1722  s_WrapUpDb(db8);
1723 
1724  BOOST_REQUIRE(data4 == d4);
1725  BOOST_REQUIRE(data8 == d8);
1726 }
1727 
1728 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
1729  (!defined(NCBI_COMPILER_MIPSPRO)) )
// Body of a helper that asks `cb` for the list of files it produced and
// passes that list to s_WrapUpFiles().
// NOTE(review): the signature (listing line 1730) is absent from this text;
// presumably this is s_WrapUpColumn(), which UserDefinedColumns calls with a
// CWriteDB_ColumnBuilder -- confirm against the full file.
1731 {
1732  vector<string> files;
1733  cb.ListFiles(files);
1734  s_WrapUpFiles(files);
1735 }
1736 
// Test case: writes the same meta-data and the same six string values into
// (a) a user-defined column of a CWriteDB database and (b) a column built
// with CWriteDB_ColumnBuilder, copying one sequence from data/writedb_prot
// per value.  No assertions are made on the output; see the comment near
// the end of the test.
// NOTE(review): listing line 1763 is absent from this text, so one argument
// of the CWriteDB constructor call is not visible here.
1737 BOOST_AUTO_TEST_CASE(UserDefinedColumns)
1738 {
1739 
1740  // Create and open the DBs and columns.
1741 
1742  typedef map<string,string> TMeta;
1743  TMeta meta_data;
1744  meta_data["created-by"] = "unit test";
1745  meta_data["purpose"] = "none";
1746  meta_data["format"] = "text";
1747 
1748  vector<string> column_data;
1749  column_data.push_back("Groucho Marx");
1750  column_data.push_back("Charlie Chaplain");
  // An empty value is included on purpose.
1751  column_data.push_back("");
1752  column_data.push_back("Abbott and Costello");
1753  column_data.push_back("Jackie Gleason");
1754  column_data.push_back("Jerry Seinfeld");
  // Overwrite index 5 of the last value with a NUL byte, so one value
  // contains an embedded '\0'.
1755  column_data.back()[5] = (char) 0;
1756 
1757  string fname("user-column");
1758  string vname("user-column-db");
1759  string title("comedy");
1760 
1761  CSeqDB R("data/writedb_prot", CSeqDB::eProtein);
1762  CWriteDB W(vname,
1764  "User defined column");
1765 
1766  CWriteDB_ColumnBuilder CB(title, fname);
1767 
1768  int col_id = W.CreateUserColumn(title);
1769 
  // Mirror every meta-data pair into both the stand-alone column and the
  // database's user column.
1770  ITERATE(TMeta, iter, meta_data) {
1771  CB.AddMetaData(iter->first, iter->second);
1772  W.AddColumnMetaData(col_id, iter->first, iter->second);
1773  }
1774 
1775  // Build database and column.
1776 
1777  int i = 0;
1778 
1779  ITERATE(vector<string>, iter, column_data) {
  // One source sequence (OIDs 0, 1, 2, ...) per column value.
1780  W.AddSequence(*R.GetBioseq(i++));
1781 
1782  CBlastDbBlob & b1 = W.SetBlobData(col_id);
1783  b1.WriteString(*iter, CBlastDbBlob::eNone);
1784 
1785  CBlastDbBlob b2(*iter, false);
1786  CB.AddBlob(b2);
1787  }
1788 
1789  // Close the DB and the column.
1790 
1791  W.Close();
1792  CB.Close();
1793 
1794  // Test the resulting files.
1795 
1796  // (Currently, the files created here are not tested. Instead,
1797  // the SeqDB test uses copies of these files and tests the data
1798  // integrity via the SeqDB functionality.)
1799 
1800  // Clean up.
1801 
1802  s_WrapUpColumn(CB);
1803  s_WrapUpDb(W);
1804 }
1805 
1806 // Register standard masking algorithms with default/sensible options
// and require that every ID returned by Add() is reported as registered.
// NOTE(review): listing line 1809 is absent from this text, so the
// declaration of `registry` is not visible here.
1807 BOOST_AUTO_TEST_CASE(RegisterMaskingAlgorithms)
1808 {
1810 
1811  vector<int> algo_ids;
1812  algo_ids.push_back(registry.Add(eBlast_filter_program_seg));
1813  algo_ids.push_back(registry.Add(eBlast_filter_program_dust));
1814  algo_ids.push_back(registry.Add(eBlast_filter_program_windowmasker));
1815  algo_ids.push_back(registry.Add(eBlast_filter_program_repeat, "9606"));
1816  algo_ids.push_back(registry.Add(eBlast_filter_program_other, "dummy1"));
1817  algo_ids.push_back(registry.Add(eBlast_filter_program_other, "dummy2"));
  // Same program and options as the previous call, plus a third argument.
1818  algo_ids.push_back(registry.Add(eBlast_filter_program_other, "dummy2",
1819  "funnyname"));
1820 
1821  ITERATE(vector<int>, id, algo_ids) {
1822  BOOST_REQUIRE_EQUAL(true, registry.IsRegistered(*id));
1823  }
1824 }
1825 
// Test case: registering seg once without options and once with the option
// string "dummy" must yield consecutive IDs (second == first + 1).
// NOTE(review): listing line 1828 is absent from this text, so the
// declaration of `registry` is not visible here.
1826 BOOST_AUTO_TEST_CASE(RegisterVariantsOfSameMaskingAlgorithm)
1827 {
1829 
1830  int id1 = registry.Add(eBlast_filter_program_seg);
1831  int id2 = registry.Add(eBlast_filter_program_seg, "dummy");
1832  BOOST_REQUIRE_EQUAL(id1+1, id2);
1833 }
1834 
// Shared driver for the RegisterTooManyVariantsOf* tests.
//
// Tries to register 2 * kMaxNumSupportedAlgorithmVariants variants of
// `masking_algo`, each with a different option string.  The first
// kMaxNumSupportedAlgorithmVariants calls must succeed and return the IDs
// masking_algo, masking_algo + 1, ...; every later call must throw.
// Finally only the IDs of the successful calls may be reported as
// registered.
//
// @param masking_algo  the algorithm whose variants are registered
// @param kMaxNumSupportedAlgorithmVariants  number of registrations
//        expected to succeed
// NOTE(review): listing lines 1836 (the function name), 1840 (presumably the
// declaration of `registry`) and 1857 (the exception type expected by
// BOOST_REQUIRE_THROW) are absent from this text.
1835 void
1837  (EBlast_filter_program masking_algo,
1838  size_t kMaxNumSupportedAlgorithmVariants)
1839 {
1841 
1842  vector<int> algo_ids;
1843  for (size_t i = 0; i < kMaxNumSupportedAlgorithmVariants*2; i++) {
1844  string options;
1845  // for repeat and other masking algorithms, there must be options,
1846  // otherwise the actual masking algorithm value becomes the algorithm
1847  // id when no options are provided
1848  if (i == 0 && masking_algo < eBlast_filter_program_repeat) {
1849  options.assign("");
1850  } else {
1851  options.assign(NStr::SizetToString(i));
1852  }
1853 
  // -1 marks "no ID obtained"; it stays -1 when Add() throws.
1854  int algo_id = -1;
1855  if (i >= kMaxNumSupportedAlgorithmVariants) {
1856  BOOST_REQUIRE_THROW(algo_id = registry.Add(masking_algo, options),
1858  } else {
1859  algo_id = registry.Add(masking_algo, options);
1860  }
1861  if (algo_id != -1) {
1862  //cerr << "Inserted id " << algo_id << endl;
1863  algo_ids.push_back(algo_id);
1864  }
1865  }
1866 
1867  // Ensure that the IDs were assigned in increasing order
1868  BOOST_REQUIRE_EQUAL(kMaxNumSupportedAlgorithmVariants, algo_ids.size());
1869  for (size_t i = 0; i < algo_ids.size(); i++) {
1870  BOOST_REQUIRE_EQUAL((int)(masking_algo + i), algo_ids[i]);
1871  }
1872 
1873  // Ensure that only valid IDs were assigned
1874  for (size_t i = 0; i < kMaxNumSupportedAlgorithmVariants*2; i++) {
1875  int algo_id = masking_algo + i;
1876  if (i >= kMaxNumSupportedAlgorithmVariants) {
1877  BOOST_REQUIRE_EQUAL(false, registry.IsRegistered(algo_id));
1878  } else {
1879  BOOST_REQUIRE_EQUAL(true, registry.IsRegistered(algo_id));
1880  }
1881  }
1882 }
1883 
// Test case: runs the too-many-variants driver with a limit equal to the
// distance from `self` to eBlast_filter_program_seg.
// NOTE(review): listing line 1886 (the declaration of `self`) is absent from
// this text; presumably it is eBlast_filter_program_dust -- confirm.
1884 BOOST_AUTO_TEST_CASE(RegisterTooManyVariantsOfDust)
1885 {
1887  const size_t max_algo_variants = eBlast_filter_program_seg - self;
1888  RegisterTooManyVariantsOfSameMaskingAlgorithm(self, max_algo_variants);
1889 }
1890 
// Test case: runs the too-many-variants driver with a limit equal to the
// distance from `self` to eBlast_filter_program_windowmasker.
// NOTE(review): listing line 1893 (the declaration of `self`) is absent from
// this text; presumably it is eBlast_filter_program_seg -- confirm.
1891 BOOST_AUTO_TEST_CASE(RegisterTooManyVariantsOfSeg)
1892 {
1894  const size_t max_algo_variants = eBlast_filter_program_windowmasker - self;
1895  RegisterTooManyVariantsOfSameMaskingAlgorithm(self, max_algo_variants);
1896 }
1897 
// Test case: runs the too-many-variants driver with a limit equal to the
// distance from `self` to eBlast_filter_program_repeat.
// NOTE(review): listing line 1900 (the declaration of `self`) is absent from
// this text; presumably it is eBlast_filter_program_windowmasker -- confirm.
1898 BOOST_AUTO_TEST_CASE(RegisterTooManyVariantsOfWindowMasker)
1899 {
1901  const size_t max_algo_variants = eBlast_filter_program_repeat - self;
1902  RegisterTooManyVariantsOfSameMaskingAlgorithm(self, max_algo_variants);
1903 }
1904 
// Test case: runs the too-many-variants driver with a limit equal to the
// distance from `self` to eBlast_filter_program_other.
// NOTE(review): listing line 1907 (the declaration of `self`) is absent from
// this text; presumably it is eBlast_filter_program_repeat -- confirm.
1905 BOOST_AUTO_TEST_CASE(RegisterTooManyVariantsOfRepeats)
1906 {
1908  const size_t max_algo_variants = eBlast_filter_program_other - self;
1909  RegisterTooManyVariantsOfSameMaskingAlgorithm(self, max_algo_variants);
1910 }
1911 
// Test case: runs the too-many-variants driver with a limit equal to the
// distance from `self` to eBlast_filter_program_max.
// NOTE(review): listing line 1914 (the declaration of `self`) is absent from
// this text; presumably it is eBlast_filter_program_other -- confirm.
1912 BOOST_AUTO_TEST_CASE(RegisterTooManyVariantsOfOther)
1913 {
1915  const size_t max_algo_variants = eBlast_filter_program_max - self;
1916  RegisterTooManyVariantsOfSameMaskingAlgorithm(self, max_algo_variants);
1917 }
1918 
1919 BOOST_AUTO_TEST_CASE(MaskDataColumn)
1920 {
1921 
1922  CSeqDB R("data/writedb_prot", CSeqDB::eProtein);
1923  CWriteDB W("mask-data-db", CWriteDB::eProtein, "Mask data test");
1924  const int kNumSeqs = 3;
1925 
1926  vector<int> oids;
1927  int next_oid = 0;
1928 
1929  // Get kNumSeqs sequences with length less than 1024
1930  for(int i = 0; i < kNumSeqs; i++) {
1931  int L = R.GetSeqLength(next_oid);
1932 
1933  while(L < 1024) {
1934  ++next_oid;
1935  L = R.GetSeqLength(next_oid);
1936  }
1937 
1938  oids.push_back(next_oid++);
1939  }
1940 
1941  int seg_id = W.RegisterMaskAlgorithm(eBlast_filter_program_seg);
1942 
1943  int repeat_id = W.RegisterMaskAlgorithm(eBlast_filter_program_repeat,
1944  "-species Desmodus_rotundus");
1945 
1946  // Populate it.
1947 
1948  for(int i = 0; i < kNumSeqs; i++) {
1949  int oid = oids[i];
1950  W.AddSequence(*R.GetBioseq(oid));
1951 
1952  CMaskedRangesVector ranges;
1953 
1954  if (i & 1) {
1955  ranges.push_back(SBlastDbMaskData());
1956  ranges.back().algorithm_id = seg_id;
1957 
1958  for(int j = 0; j < (i+5); j++) {
1959  pair<TSeqPos, TSeqPos> rng;
1960  rng.first = i * 13 + j * 7 + 2;
1961  rng.second = rng.first + 3 + (i+j) % 11;
1962 
1963  ranges.back().offsets.push_back(rng);
1964  }
1965  }
1966 
1967  if (i & 2) {
1968  ranges.push_back(SBlastDbMaskData());
1969  ranges.back().algorithm_id = repeat_id;
1970 
1971  for(int j = 0; j < (i+5); j++) {
1972  pair<TSeqPos, TSeqPos> rng;
1973  rng.first = i * 10 + j * 5 + 2;
1974  rng.second = rng.first + 20;
1975 
1976  ranges.back().offsets.push_back(rng);
1977  }
1978  }
1979 
1980  // Set the mask data if either list above was used, or in some
1981  // cases when neither is. (Calling SetMaskData() with an
1982  // empty array should be the same as not calling it at all;
1983  // this code tests that equivalence.)
1984 
1985  vector <TGi> gis;
1986  if (i & 7) {
1987  W.SetMaskData(ranges, gis);
1988  }
1989  }
1990 
1991  // Close the DB.
1992 
1993  W.Close();
1994 
1995  // Test the resulting files.
1996 
1997  // (Currently, the files created here are not tested. Instead,
1998  // the SeqDB test uses copies of these files and tests the data
1999  // integrity via the SeqDB functionality.)
2000 
2001  // Clean up.
2002 
2003  s_WrapUpDb(W);
2004 }
2005 
// Test case: registering eBlast_filter_program_seg a second time with the
// same (default) arguments must throw.
// NOTE(review): listing line 2015 (the exception type expected by
// BOOST_REQUIRE_THROW) is absent from this text.
2006 BOOST_AUTO_TEST_CASE(DuplicateAlgoId)
2007 {
2008 
2009  CWriteDB W("mask-data-db", CWriteDB::eProtein, "Mask data test");
2010 
  // The first registration is expected to succeed; its ID is not needed.
2011  (void)W.RegisterMaskAlgorithm(eBlast_filter_program_seg);
2012  int seg_repeated_id;
2013  BOOST_REQUIRE_THROW( seg_repeated_id =
2014  W.RegisterMaskAlgorithm(eBlast_filter_program_seg),
2016  (void)seg_repeated_id; /* to pacify compiler warning */
2017 }
2018 
// Test case: through CWriteDB::RegisterMaskAlgorithm(), registers twice as
// many variants of `masking_algorithm` as fit between it and
// eBlast_filter_program_windowmasker.  The first half must succeed with
// consecutive IDs starting at masking_algorithm; the second half must throw.
// NOTE(review): listing lines 2024 (the declaration of `masking_algorithm`)
// and 2036 (the exception type expected by BOOST_REQUIRE_THROW) are absent
// from this text.
2019 BOOST_AUTO_TEST_CASE(TooManyAlgoId)
2020 {
2021 
2022  CWriteDB W("mask-data-db", CWriteDB::eProtein, "Mask data test");
2023 
2025  vector<int> algo_ids;
2026 
2027  // Ensure that the last one fails
2028  const size_t kMaxNumSupportedAlgorithmVariants =
2029  eBlast_filter_program_windowmasker - masking_algorithm;
2030  for (size_t i = 0; i < kMaxNumSupportedAlgorithmVariants*2; i++) {
  // Variant 0 has no options; variant i > 0 uses the text of i.
2031  string options( i == 0 ? "" : NStr::SizetToString(i));
  // -1 marks "no ID obtained"; it stays -1 when registration throws.
2032  int algo_id = -1;
2033  if (i >= kMaxNumSupportedAlgorithmVariants) {
2034  BOOST_REQUIRE_THROW(
2035  algo_id = W.RegisterMaskAlgorithm(masking_algorithm, options),
2037  } else {
2038  algo_id = W.RegisterMaskAlgorithm(masking_algorithm, options);
2039  }
2040  if (algo_id != -1) {
2041  algo_ids.push_back(algo_id);
2042  }
2043  }
2044 
2045  // Ensure that the IDs were assigned in increasing order
2046  BOOST_REQUIRE_EQUAL(kMaxNumSupportedAlgorithmVariants, algo_ids.size());
2047  for (size_t i = 0; i < algo_ids.size(); i++) {
2048  BOOST_REQUIRE_EQUAL((int)(masking_algorithm + i), (int)algo_ids[i]);
2049  }
2050 }
2051 
2052 BOOST_AUTO_TEST_CASE(UndefinedAlgoID)
2053 {
2054 
2055  CSeqDB R("data/writedb_prot", CSeqDB::eProtein);
2056  CWriteDB W("mask-data-db", CWriteDB::eProtein, "Mask data test");
2057 
2058  W.RegisterMaskAlgorithm(eBlast_filter_program_seg);
2059 
2060  W.RegisterMaskAlgorithm(eBlast_filter_program_seg,
2061  "-species Aotus_vociferans");
2062 
2063  W.RegisterMaskAlgorithm(eBlast_filter_program_repeat,
2064  "-species Desmodus_rotundus");
2065 
2066  // Populate it.
2067 
2068  int oid = 0;
2069 
2070  int L = R.GetSeqLength(oid);
2071  W.AddSequence(*R.GetBioseq(oid));
2072 
2073  CMaskedRangesVector ranges;
2074 
2075  ranges.push_back(SBlastDbMaskData());
2076  ranges.back().algorithm_id = (int)eBlast_filter_program_dust;
2077 
2078  pair<TSeqPos, TSeqPos> rng;
2079  rng.first = L/3;
2080  rng.second = L;
2081 
2082  ranges.back().offsets.push_back(rng);
2083 
2084  vector <TGi> gis;
2085  BOOST_REQUIRE_THROW(W.SetMaskData(ranges, gis), CWriteDBException);
2086 
2087  W.Close();
2088  s_WrapUpDb(W);
2089 }
2090 
2091 BOOST_AUTO_TEST_CASE(MaskDataBoundsError)
2092 {
2093  CSeqDB R("data/writedb_prot", CSeqDB::eProtein);
2094  CWriteDB W("mask-data-db", CWriteDB::eProtein, "Mask data test");
2095 
2096  W.RegisterMaskAlgorithm(eBlast_filter_program_seg);
2097 
2098  W.RegisterMaskAlgorithm(eBlast_filter_program_seg,
2099  "-species Aotus_vociferans");
2100 
2101  W.RegisterMaskAlgorithm(eBlast_filter_program_repeat,
2102  "-species Desmodus_rotundus");
2103 
2104  // Populate it.
2105 
2106  int oid = 0;
2107 
2108  int L = R.GetSeqLength(oid);
2109  W.AddSequence(*R.GetBioseq(oid));
2110 
2111  CMaskedRangesVector ranges;
2112 
2113  ranges.push_back(SBlastDbMaskData());
2114  ranges.back().algorithm_id = (int)eBlast_filter_program_dust;
2115 
2116  pair<TSeqPos, TSeqPos> rng;
2117  rng.first = L/3;
2118  rng.second = L+1;
2119 
2120  ranges.back().offsets.push_back(rng);
2121  vector <TGi> gis;
2122  BOOST_REQUIRE_THROW(W.SetMaskData(ranges,gis), CWriteDBException);
2123 
2124  W.Close();
2125  s_WrapUpDb(W);
2126 }
2127 #endif
2128 
2129 /// Auxiliary class to parse the contents of an alias file
/// and expose, per keyword, whether a non-empty value was found and what
/// it was.
// NOTE(review): listing lines 2130 (the class/struct header), 2150-2158
// (presumably the Value members initialised by the constructor below) and
// 2205 (the initialiser of kCurrentYear) are absent from this text.
2131 
2132  /// Encapsulates the alias' file key-value pair
2133  struct Value {
  /// @param name  keyword this entry is looked up by (e.g. "TITLE")
2134  Value(const string& name) : m_Found(false), m_Name(name) {}
  /// True once Set() has stored a non-empty value.
2135  bool Found() const { return m_Found;}
2136  string Get() const { return m_Value;}
  /// Stores `v` and marks the entry as found; an empty `v` is ignored,
  /// so a keyword with an empty value still reports Found() == false.
2137  void Set(const string& v) {
2138  if ( !v.empty() ) {
2139  m_Value = v;
2140  m_Found = true;
2141  }
2142  }
2143  string GetKey() const { return m_Name; }
2144  private:
2145  bool m_Found;
2146  string m_Name;
2147  string m_Value;
2148  };
2149 
2159 
  /// Binds each member to its alias-file keyword and parses `fname`.
2160  SAliasFileData(const string& fname) :
2161  m_Title("TITLE"), m_DbList("DBLIST"), m_NSeqs("NSEQ"),
2162  m_Length("LENGTH"), m_FirstOid("FIRST_OID"), m_LastOid("LAST_OID"),
2163  m_GiList("GILIST"), m_TiList("TILIST"), m_SeqidList("SEQIDLIST")
2164  { x_Parse(fname); }
2165 
2166 private:
  /// If `line` contains the keyword of `data`, drops the first
  /// key-length + 1 characters of the line, stores the remainder through
  /// Value::Set(), and returns true; otherwise returns false.
  // NOTE(review): the match is a substring search anywhere in the line,
  // while the erase assumes the keyword starts the line -- confirm that
  // alias-file lines always begin with their keyword.
2167  bool x_HasKeyword(string line, Value& data) {
2168  bool retval = false;
2169  if (NStr::Find(line, data.GetKey()) != NPOS) {
2170  line = line.erase(0, data.GetKey().size()+1);
2171  data.Set(line);
2172  retval = true;
2173  }
2174  return retval;
2175  }
2176 
2177  /// Parse the alias file's contents
  /// line by line; the first keyword that matches a line consumes it.
  /// Returns without reporting anything if the file cannot be opened.
2178  void x_Parse(const string& fname) {
2179  string line;
2180  ifstream alias_file(fname.c_str());
2181  if ( ! alias_file ) { return; }
2182  while (getline(alias_file, line)) {
2183  if (x_HasKeyword(line, m_Title)) {
2184  continue;
2185  } else if (x_HasKeyword(line, m_DbList)) {
2186  continue;
2187  } else if (x_HasKeyword(line, m_NSeqs)) {
2188  continue;
2189  } else if (x_HasKeyword(line, m_Length)) {
2190  continue;
2191  } else if (x_HasKeyword(line, m_FirstOid)) {
2192  continue;
2193  } else if (x_HasKeyword(line, m_LastOid)) {
2194  continue;
2195  } else if (x_HasKeyword(line, m_GiList)) {
2196  continue;
2197  } else if (x_HasKeyword(line, m_TiList)) {
2198  continue;
2199  } else if (x_HasKeyword(line, m_SeqidList)) {
2200  continue;
2201  }
  // Lines without any keyword: a creation-stamp line must mention
  // kCurrentYear.
2202  if (NStr::Find(line, "Alias file created") != NPOS) {
2203  // this should be enough granularity
2204  const string kCurrentYear =
2206  BOOST_REQUIRE(NStr::Find(line, kCurrentYear) != NPOS);
2207  }
2208  }
2209  }
2210 };
2211 
// Test case: creates a protein alias file over data/writedb_prot restricted
// by a GI list file holding three GIs, then parses the resulting ".pal" file
// and checks its fields: title, DB list, NSEQ "1", LENGTH "232", a GI list
// entry, and no OID range, TI list or Seq-id list.
// NOTE(review): listing line 2215 is absent from this text.
2212 BOOST_AUTO_TEST_CASE(AliasFileGeneration)
2213 {
2214  CDiagRestorer diag_restorer;
2216  CTmpFile tmp_aliasfile, tmp_gifile;
2217  const string kDbName("data/writedb_prot");
2218  const string kTitle("My alias file");
2219  string kAliasFileName(tmp_aliasfile.GetFileName());
2220  string kGiFileName(tmp_gifile.GetFileName());
2221  {
  // Write the GI list, one GI per line.
2222  ofstream gifile(tmp_gifile.GetFileName().c_str());
2223  gifile << "129295" << endl;
2224  gifile << "555" << endl;
2225  gifile << "55" << endl;
2226  gifile.close();
2227  }
2228 
2229  CWriteDB_CreateAliasFile(kAliasFileName, kDbName, CWriteDB::eProtein,
2230  kGiFileName, kTitle);
  // The alias file is looked up under the base name plus ".pal".
2231  kAliasFileName += ".pal";
2232  CFileDeleteAtExit::Add(kAliasFileName);
2233 
2234  BOOST_REQUIRE(CFile(kAliasFileName).Exists());
2235  SAliasFileData alias_file_data(kAliasFileName);
2236 
2237  BOOST_CHECK(alias_file_data.m_Title.Found());
2238  BOOST_CHECK_EQUAL(kTitle, alias_file_data.m_Title.Get());
2239  BOOST_CHECK(alias_file_data.m_DbList.Found());
2240  BOOST_CHECK(NStr::Find(alias_file_data.m_DbList.Get(), kDbName) != NPOS);
2241  BOOST_CHECK(alias_file_data.m_NSeqs.Found());
2242  BOOST_CHECK_EQUAL("1", alias_file_data.m_NSeqs.Get());
2243  BOOST_CHECK(alias_file_data.m_Length.Found());
2244  BOOST_CHECK_EQUAL("232", alias_file_data.m_Length.Get());
2245  BOOST_CHECK(alias_file_data.m_GiList.Found());
2246  BOOST_CHECK(alias_file_data.m_FirstOid.Found() == false);
2247  BOOST_CHECK(alias_file_data.m_LastOid.Found() == false);
2248  BOOST_CHECK(alias_file_data.m_TiList.Found() == false);
2249  BOOST_CHECK(alias_file_data.m_SeqidList.Found() == false);
2250 }
2251 
// Test case: same as AliasFileGeneration, but the list file holds two
// accessions and is passed with eSeqIdList.  The resulting ".pal" file must
// carry a Seq-id list entry and no GI list, TI list or OID range; NSEQ is
// expected to be "1" and LENGTH "232".
// NOTE(review): listing line 2255 is absent from this text.
2252 BOOST_AUTO_TEST_CASE(AliasFileGeneration_SeqIdList)
2253 {
2254  CDiagRestorer diag_restorer;
2256  CTmpFile tmp_aliasfile, tmp_gifile;
2257  const string kDbName("data/writedb_prot");
2258  const string kTitle("My alias file");
2259  string kAliasFileName(tmp_aliasfile.GetFileName());
2260  string kGiFileName(tmp_gifile.GetFileName());
2261  {
  // Despite its name, this file holds accessions here, one per line.
2262  ofstream gifile(tmp_gifile.GetFileName().c_str());
2263  gifile << "P01013.1" << endl; // GI 129295
2264  gifile << "X65215.1" << endl; // GI 555 (shouldn't be found)
2265  gifile.close();
2266  }
2267 
2268  CWriteDB_CreateAliasFile(kAliasFileName, kDbName, CWriteDB::eProtein,
2269  kGiFileName, kTitle, eSeqIdList);
  // The alias file is looked up under the base name plus ".pal".
2270  kAliasFileName += ".pal";
2271  CFileDeleteAtExit::Add(kAliasFileName);
2272 
2273  BOOST_REQUIRE(CFile(kAliasFileName).Exists());
2274  SAliasFileData alias_file_data(kAliasFileName);
2275 
2276  BOOST_CHECK(alias_file_data.m_Title.Found());
2277  BOOST_CHECK_EQUAL(kTitle, alias_file_data.m_Title.Get());
2278  BOOST_CHECK(alias_file_data.m_DbList.Found());
2279  BOOST_CHECK(NStr::Find(alias_file_data.m_DbList.Get(), kDbName) != NPOS);
2280  BOOST_CHECK(alias_file_data.m_NSeqs.Found());
2281  BOOST_CHECK_EQUAL("1", alias_file_data.m_NSeqs.Get());
2282  BOOST_CHECK(alias_file_data.m_Length.Found());
2283  BOOST_CHECK_EQUAL("232", alias_file_data.m_Length.Get());
2284  BOOST_CHECK(alias_file_data.m_SeqidList.Found());
2285  BOOST_CHECK(alias_file_data.m_FirstOid.Found() == false);
2286  BOOST_CHECK(alias_file_data.m_LastOid.Found() == false);
2287  BOOST_CHECK(alias_file_data.m_GiList.Found() == false);
2288  BOOST_CHECK(alias_file_data.m_TiList.Found() == false);
2289 }
2290 
// Test case: creates "nr.pal" through the CWriteDB_CreateAliasFile overload
// that takes a volume count (9) and checks the parsed fields: title, a DB
// list that mentions "nr" and the text "8" but not "9", NSEQ and LENGTH
// present, and no OID range or ID lists.
// NOTE(review): listing line 2294 is absent from this text.
2291 BOOST_AUTO_TEST_CASE(AliasFileGeneration_WithDbListNumVolumes)
2292 {
2293  CDiagRestorer diag_restorer;
2295  CTmpFile tmpfile;
2296  const string kTitle("My alias file");
2297  // nr should have at least two volumes
2298  const unsigned int kNumVols(9);
2299  const string kMyAliasDb("nr");
2300  const string kAliasFileName(kMyAliasDb + ".pal");
2301  CFileDeleteAtExit::Add(kAliasFileName);
2302 
2303  CWriteDB_CreateAliasFile(kMyAliasDb, kNumVols, CWriteDB::eProtein,
2304  kTitle);
2305 
2306  BOOST_REQUIRE(CFile(kAliasFileName).Exists());
2307  SAliasFileData alias_file_data(kAliasFileName);
2308 
2309  BOOST_CHECK(alias_file_data.m_Title.Found());
2310  BOOST_CHECK_EQUAL(kTitle, alias_file_data.m_Title.Get());
2311 
2312  BOOST_CHECK(alias_file_data.m_DbList.Found());
2313  BOOST_CHECK(NStr::Find(alias_file_data.m_DbList.Get(), kMyAliasDb) != NPOS);
  // The highest number looked for is kNumVols - 1; kNumVols itself must
  // not appear anywhere in the DB list text.
2314  BOOST_CHECK(NStr::Find(alias_file_data.m_DbList.Get(), NStr::IntToString(kNumVols-1)) != NPOS);
2315  BOOST_CHECK(NStr::Find(alias_file_data.m_DbList.Get(), NStr::IntToString(kNumVols)) == NPOS);
2316 
2317  BOOST_CHECK(alias_file_data.m_NSeqs.Found());
2318  BOOST_CHECK(alias_file_data.m_Length.Found());
2319  BOOST_CHECK(alias_file_data.m_FirstOid.Found() == false);
2320  BOOST_CHECK(alias_file_data.m_LastOid.Found() == false);
2321  BOOST_CHECK(alias_file_data.m_GiList.Found() == false);
2322  BOOST_CHECK(alias_file_data.m_TiList.Found() == false);
2323  BOOST_CHECK(alias_file_data.m_SeqidList.Found() == false);
2324 }
2325 
// Test case: creates a protein alias file over "nr" restricted to the OID
// range (100, 3500) and checks that the parsed file reports FIRST_OID equal
// to the range's GetFrom(), LAST_OID equal to its GetToOpen(), and no ID
// lists.
// NOTE(review): listing line 2329 is absent from this text.
2326 BOOST_AUTO_TEST_CASE(AliasFileGenerationWithOidRange)
2327 {
2328  CDiagRestorer diag_restorer;
2330  CTmpFile tmp_aliasfile;
2331  const string kDbName("nr");
2332  const string kTitle("My alias file");
2333  string kAliasFileName(tmp_aliasfile.GetFileName());
2334  const TSeqRange oid_range(100, 3500);
2335 
2336  CWriteDB_CreateAliasFile(kAliasFileName, vector<string>(1,kDbName),
2337  CWriteDB::eProtein, oid_range, kTitle);
  // The alias file is looked up under the base name plus ".pal".
2338  kAliasFileName += ".pal";
2339  CFileDeleteAtExit::Add(kAliasFileName);
2340 
2341  BOOST_REQUIRE(CFile(kAliasFileName).Exists());
2342  SAliasFileData alias_file_data(kAliasFileName);
2343 
2344  BOOST_CHECK(alias_file_data.m_Title.Found());
2345  BOOST_CHECK_EQUAL(kTitle, alias_file_data.m_Title.Get());
2346  BOOST_CHECK(alias_file_data.m_DbList.Found());
2347  BOOST_CHECK(NStr::Find(alias_file_data.m_DbList.Get(), kDbName) != NPOS);
2348  BOOST_CHECK(alias_file_data.m_NSeqs.Found());
2349  BOOST_CHECK(alias_file_data.m_Length.Found());
2350  BOOST_CHECK(alias_file_data.m_FirstOid.Found());
2351  BOOST_CHECK_EQUAL(NStr::IntToString(oid_range.GetFrom()),
2352  alias_file_data.m_FirstOid.Get());
2353  BOOST_CHECK(alias_file_data.m_LastOid.Found());
2354  BOOST_CHECK_EQUAL(NStr::IntToString(oid_range.GetToOpen()),
2355  alias_file_data.m_LastOid.Get());
2356  BOOST_CHECK(alias_file_data.m_GiList.Found() == false);
2357  BOOST_CHECK(alias_file_data.m_TiList.Found() == false);
2358  BOOST_CHECK(alias_file_data.m_SeqidList.Found() == false);
2359 }
2360 
// Test case: creates the nucleotide alias file "est.nal" aggregating
// est_human, est_others and est_mouse (with an empty list-file argument) and
// checks that every aggregated name appears in the parsed DB list and that
// no OID range or ID lists are present.
// NOTE(review): listing line 2364 is absent from this text.
2361 BOOST_AUTO_TEST_CASE(AliasFileGeneration_WithDbListAggregateBlastDbs)
2362 {
2363  CDiagRestorer diag_restorer;
2365  CTmpFile tmpfile;
2366  const string kTitle("My alias file");
2367  const string kMyAliasDb("est");
2368  const string kAliasFileName(kMyAliasDb + ".nal");
2369  CFileDeleteAtExit::Add(kAliasFileName);
2370  vector<string> dbs2aggregate;
2371  dbs2aggregate.push_back("est_human");
2372  dbs2aggregate.push_back("est_others");
2373  dbs2aggregate.push_back("est_mouse");
2374 
2375  CWriteDB_CreateAliasFile(kMyAliasDb, dbs2aggregate, CWriteDB::eNucleotide,
2376  kEmptyStr, kTitle);
2377 
2378  BOOST_REQUIRE(CFile(kAliasFileName).Exists());
2379  SAliasFileData alias_file_data(kAliasFileName);
2380 
2381  BOOST_CHECK(alias_file_data.m_Title.Found());
2382  BOOST_CHECK_EQUAL(kTitle, alias_file_data.m_Title.Get());
2383  BOOST_CHECK(alias_file_data.m_DbList.Found());
  // A missing database name aborts the test (REQUIRE, not CHECK).
2384  ITERATE(vector<string>, itr, dbs2aggregate) {
2385  BOOST_REQUIRE(NStr::Find(alias_file_data.m_DbList.Get(), *itr) != NPOS);
2386  }
2387 
2388  BOOST_CHECK(alias_file_data.m_NSeqs.Found());
2389  BOOST_CHECK(alias_file_data.m_Length.Found());
2390  BOOST_CHECK(alias_file_data.m_GiList.Found() == false);
2391  BOOST_CHECK(alias_file_data.m_FirstOid.Found() == false);
2392  BOOST_CHECK(alias_file_data.m_LastOid.Found() == false);
2393  BOOST_CHECK(alias_file_data.m_TiList.Found() == false);
2394  BOOST_CHECK(alias_file_data.m_SeqidList.Found() == false);
2395 }
2396 
// Test case: creating an alias file over the database name "dummy" with the
// list file "gifile.txt" must throw CSeqDBException and must not leave a
// ".pal" file behind.
// NOTE(review): listing line 2411 (one argument of the call, presumably the
// database type) is absent from this text.
2397 BOOST_AUTO_TEST_CASE(InvalidAliasFileGeneration_NonExistentDb)
2398 {
2399  CTmpFile tmpfile;
2400  const string kTitle("My alias file");
2401  const string kMyAliasDb(tmpfile.GetFileName());
2402  const string kAliasFileName(kMyAliasDb + ".pal");
2403  CFileDeleteAtExit::Add(kAliasFileName);
2404 
  // Start from a clean slate so the final existence check is meaningful.
2405  if (CFile(kAliasFileName).Exists()) {
2406  CFile(kAliasFileName).Remove();
2407  }
2408  BOOST_REQUIRE(CFile(kAliasFileName).Exists() == false);
2409 
2410  BOOST_REQUIRE_THROW( CWriteDB_CreateAliasFile(kMyAliasDb, "dummy",
2412  "gifile.txt"),
2413  CSeqDBException);
2414 
2415  BOOST_REQUIRE(CFile(kAliasFileName).Exists() == false);
2416 }
2417 
2418 // All databases exist at NCBI but one, which makes the whole set fail
// with CSeqDBException; no ".pal" file may be left behind.
// NOTE(review): listing line 2440 (one argument of the call, presumably the
// database type) is absent from this text.
2419 BOOST_AUTO_TEST_CASE(InvalidAliasFileGeneration_NonExistentDbAggregation)
2420 {
2421  CTmpFile tmpfile;
2422  const string kTitle("My alias file");
2423  const string kMyAliasDb(tmpfile.GetFileName());
2424  const string kAliasFileName(kMyAliasDb + ".pal");
2425  CFileDeleteAtExit::Add(kAliasFileName);
2426 
  // Start from a clean slate so the final existence check is meaningful.
2427  if (CFile(kAliasFileName).Exists()) {
2428  CFile(kAliasFileName).Remove();
2429  }
2430  BOOST_REQUIRE(CFile(kAliasFileName).Exists() == false);
2431 
2432  vector<string> dbs2aggregate;
2433  dbs2aggregate.push_back("nr");
2434  dbs2aggregate.push_back("pataa");
2435  dbs2aggregate.push_back("env_nr");
  // Presumably the one entry that does not exist.
2436  dbs2aggregate.push_back("dummy!");
2437  dbs2aggregate.push_back("ecoli");
2438 
2439  BOOST_REQUIRE_THROW( CWriteDB_CreateAliasFile(kMyAliasDb, dbs2aggregate,
2441  kEmptyStr,
2442  kTitle),
2443  CSeqDBException);
2444 
2445  BOOST_REQUIRE(CFile(kAliasFileName).Exists() == false);
2446 }
2447 
2448 // Requesting a 10-volume alias file for "ecoli" must throw
// CSeqDBException and must not leave "ecoli.pal" behind (presumably because
// those volumes do not exist -- confirm).
// NOTE(review): listing line 2462 (one argument of the call, presumably the
// database type) is absent from this text.
2449 BOOST_AUTO_TEST_CASE(InvalidAliasFileGeneration_NonExistentMultiVolDbAggregation)
2450 {
2451  const string kTitle("My alias file");
2452  const string kBlastDb("ecoli");
2453  const string kAliasFileName(kBlastDb + ".pal");
2454  CFileDeleteAtExit::Add(kAliasFileName);
2455 
  // Start from a clean slate so the final existence check is meaningful.
2456  if (CFile(kAliasFileName).Exists()) {
2457  CFile(kAliasFileName).Remove();
2458  }
2459  BOOST_REQUIRE(CFile(kAliasFileName).Exists() == false);
2460 
2461  BOOST_REQUIRE_THROW( CWriteDB_CreateAliasFile(kBlastDb, 10,
2463  kTitle),
2464  CSeqDBException);
2465 
2466  BOOST_REQUIRE(CFile(kAliasFileName).Exists() == false);
2467 }
2468 
2469 BOOST_AUTO_TEST_CASE(InvalidAliasFileGeneration_NoGisInBlastDB)
2470 {
2471  CTmpFile tmp_aliasfile, tmp_gifile;
2472  const string kDbName("nr");
2473  const string kTitle("My alias file");
2474  string kAliasFileName(tmp_aliasfile.GetFileName());
2475  string kGiFileName(tmp_gifile.GetFileName());
2476  {
2477  ofstream gifile(tmp_gifile.GetFileName().c_str());
2478  // These are nucleotide GIs
2479  gifile << "556" << endl;
2480  gifile << "555" << endl;
2481  gifile.close();
2482  }
2483 
2484  BOOST_REQUIRE_THROW(
2485  CWriteDB_CreateAliasFile(kAliasFileName, kDbName, CWriteDB::eProtein,
2486  kGiFileName, kTitle),
2487  CSeqDBException);
2488 
2489  kAliasFileName += ".pal";
2490  CFileDeleteAtExit::Add(kAliasFileName);
2491 
2492  BOOST_REQUIRE(!CFile(kAliasFileName).Exists());
2493 }
2494 
// Test case: constructing a CBuildDatabase whose output name is "nul:" must
// throw CFileException and leave `bd` empty.  The follow-up file-existence
// checks are commented out in the source.
// NOTE(review): listing lines 2498 and 2500 are absent from this text, so
// the declarations of `log` and `bd` are not visible here.
2495 BOOST_AUTO_TEST_CASE(CBuildDatabase_WriteToInvalidPathWindows)
2496 {
2497  CTmpFile tmpfile;
2499  const string kOutput("nul:");
2501  BOOST_REQUIRE_THROW(
2502  bd.Reset(new CBuildDatabase(kOutput, "foo", true,
2503  CWriteDB::eDefault, false, &log)),
2504  CFileException);
2505  BOOST_REQUIRE(bd.Empty());
2506 /* temporarily disabled.
2507  CFile f1(kOutput + ".pal"), f2(kOutput + ".pin");
2508  BOOST_REQUIRE(f1.Exists() == false);
2509  BOOST_REQUIRE(f2.Exists() == false);
2510 */
2511 }
2512 
// Test case: constructing a CBuildDatabase whose output name is "/dev/null"
// must throw, leave `bd` empty, and create neither "/dev/null.pal" nor
// "/dev/null.pin".
// NOTE(review): listing lines 2516, 2518 and 2522 are absent from this text,
// so the declarations of `log` and `bd` and the exception type expected by
// BOOST_REQUIRE_THROW are not visible here.
2513 BOOST_AUTO_TEST_CASE(CBuildDatabase_WriteToInvalidPathUnix)
2514 {
2515  CTmpFile tmpfile;
2517  const string kOutput("/dev/null");
2519  BOOST_REQUIRE_THROW(
2520  bd.Reset(new CBuildDatabase(kOutput, "foo", true,
2521  CWriteDB::eDefault, false, &log)),
2523  BOOST_REQUIRE(bd.Empty());
2524  CFile f1(kOutput + ".pal"), f2(kOutput + ".pin");
2525  BOOST_REQUIRE(f1.Exists() == false);
2526  BOOST_REQUIRE(f2.Exists() == false);
2527 }
2528 
// Test case: reads every sequence of data/rabbit_mrna.fsa, runs its deflines
// through a CTaxIdSet constructed with tax ID 9986, writes them to the
// nucleotide database "foo", then reopens the database and requires that
// every OID reports exactly that one tax ID.
// NOTE(review): listing lines 2535, 2544 and 2561 are absent from this text,
// so the definitions of `flags` and `bds` and one trailing statement are not
// visible here.  `gis` is filled but not read in the visible code.
2529 BOOST_AUTO_TEST_CASE(CWriteDB_SetTaxonomy)
2530 {
2531  const TTaxId kTaxId = TAX_ID_CONST(9986);
2532  CTaxIdSet tis(kTaxId);
2533  const string kDbName("foo");
2534  CWriteDB blastdb(kDbName, CWriteDB::eNucleotide, kDbName);
2536  // This file contains TAB characters, which shouldn't create any warnings
2537  CFastaReader reader("data/rabbit_mrna.fsa", flags);
2538  set<TGi> gis;
2539  while (!reader.AtEOF()) {
2540  CRef<CSeq_entry> se = reader.ReadOneSeq();
2541  BOOST_REQUIRE(se.NotEmpty());
2542  BOOST_REQUIRE(se->IsSeq());
2543  CRef<CBioseq> bs(&se->SetSeq());
2545  tis.FixTaxId(bds);
  // The sequence is added first, then its deflines are supplied.
2546  blastdb.AddSequence(*bs);
2547  blastdb.SetDeflines(*bds);
2548  gis.insert(FindGi(bs->GetId()));
2549  }
2550  blastdb.Close();
2551 
2552  CSeqDB db(kDbName, CSeqDB::eNucleotide);
2553  int total=db.GetNumSeqs();
2554  for (int oid=0; oid<total; oid++)
2555  {
2556  vector<TTaxId> taxids;
2557  db.GetTaxIDs(oid, taxids);
2558  BOOST_REQUIRE(taxids.size() == 1);
2559  BOOST_REQUIRE_EQUAL(kTaxId, taxids.front());
2560  }
2562 }
2563 
// Test case: like CWriteDB_SetTaxonomy, but the CTaxIdSet is default
// constructed and loaded from the mapping file data/rabbit_taxidmap.txt.
// After writing and reopening the nucleotide database "foo", every OID must
// report exactly one tax ID, 9986.
// NOTE(review): listing lines 2570, 2581 and 2598 are absent from this text,
// so the definitions of `flags` and `bds` and one trailing statement are not
// visible here.  `gis` is filled but not read in the visible code.
2564 BOOST_AUTO_TEST_CASE(CWriteDB_SetTaxonomyFromMap)
2565 {
2566  const TTaxId kTaxId = TAX_ID_CONST(9986);
2567  CRef<CTaxIdSet> tis(new CTaxIdSet());
2568  const string kDbName("foo");
2569  CWriteDB blastdb(kDbName, CWriteDB::eNucleotide, kDbName);
2571  // This file contains TAB characters, which shouldn't create any warnings
2572  CFastaReader reader("data/rabbit_mrna.fsa", flags);
2573  CNcbiIfstream taxidmap("data/rabbit_taxidmap.txt");
2574  tis->SetMappingFromFile(taxidmap);
2575  set<TGi> gis;
2576  while (!reader.AtEOF()) {
2577  CRef<CSeq_entry> se = reader.ReadOneSeq();
2578  BOOST_REQUIRE(se.NotEmpty());
2579  BOOST_REQUIRE(se->IsSeq());
2580  CRef<CBioseq> bs(&se->SetSeq());
2582  tis->FixTaxId(bds);
  // The sequence is added first, then its deflines are supplied.
2583  blastdb.AddSequence(*bs);
2584  blastdb.SetDeflines(*bds);
2585  gis.insert(FindGi(bs->GetId()));
2586  }
2587  blastdb.Close();
2588 
2589  CSeqDB db(kDbName, CSeqDB::eNucleotide);
2590  int total=db.GetNumSeqs();
2591  for (int oid=0; oid<total; oid++)
2592  {
2593  vector<TTaxId> taxids;
2594  db.GetTaxIDs(oid, taxids);
2595  BOOST_REQUIRE(taxids.size() == 1);
2596  BOOST_REQUIRE_EQUAL(kTaxId, taxids.front());
2597  }
2599 }
2600 
// Test case: protein variant of the mapping-file taxonomy test. Sequences
// come from data/lclseqs.fsa and the mapping from data/lclseqs_taxidmap.txt;
// after writing, every OID must report exactly one taxid, equal to 382.
// NOTE(review): original source lines 2607, 2617 and 2633 are absent from this
// listing; `flags` and `bds` are presumably declared on the first two.
2601 BOOST_AUTO_TEST_CASE(CWriteDB_SetTaxonomyFromMapLclIds)
2602 {
2603  const TTaxId kTaxId = TAX_ID_CONST(382);
2604  CRef<CTaxIdSet> tis(new CTaxIdSet());
2605  const string kDbName("foo");
2606  CWriteDB blastdb(kDbName, CWriteDB::eProtein, kDbName);
2608  // This file contains TAB characters, which shouldn't create any warnings
2609  CFastaReader reader("data/lclseqs.fsa", flags);
  // Load the taxid mapping before any sequence is processed.
2610  CNcbiIfstream taxidmap("data/lclseqs_taxidmap.txt");
2611  tis->SetMappingFromFile(taxidmap);
2612  while (!reader.AtEOF()) {
2613  CRef<CSeq_entry> se = reader.ReadOneSeq();
2614  BOOST_REQUIRE(se.NotEmpty());
2615  BOOST_REQUIRE(se->IsSeq());
2616  CRef<CBioseq> bs(&se->SetSeq());
2618  tis->FixTaxId(bds);
2619  blastdb.AddSequence(*bs);
2620  blastdb.SetDeflines(*bds);
2621  }
2622  blastdb.Close();
2623 
  // Read phase: every OID must carry exactly one taxid, equal to kTaxId.
2624  CSeqDB db(kDbName, CSeqDB::eProtein);
2625  int total=db.GetNumSeqs();
2626  for (int oid=0; oid<total; oid++)
2627  {
2628  vector<TTaxId> taxids;
2629  db.GetTaxIDs(oid, taxids);
2630  BOOST_REQUIRE(taxids.size() == 1);
2631  BOOST_REQUIRE_EQUAL(kTaxId, taxids.front());
2632  }
2634 }
2635 
// Test case: builds a protein database at the nested output path "a/b/c/d"
// and requires a/b/c/d.pin to exist after EndBuild() and to be gone after
// EndBuild(true) (the `erase` argument).
// NOTE(review): original source lines 2639, 2643 and 2645 are absent from this
// listing; `log` and `bd` are presumably declared there.
2636 BOOST_AUTO_TEST_CASE(CBuildDatabase_TestDirectoryCreation)
2637 {
2638  CTmpFile tmpfile;
2640  const string kOutput("a/b/c/d");
  // Register the directories for removal at process exit.
2641  CFileDeleteAtExit::Add("a/b/c");
2642  CFileDeleteAtExit::Add("a/b");
2644 
2646  bd.Reset(new CBuildDatabase(kOutput, "foo", true,
2647  CWriteDB::eNoIndex, false, &log));
2648  //CWriteDB::eDefault, false, &cerr));
2649  CRef<CTaxIdSet> tid(new CTaxIdSet(9301));
2650  bd->SetTaxids(*tid);
2651  bd->StartBuild();
2652  bd->SetSourceDb("data/writedb_prot");
2653  //bd->SetVerbosity(true);
2654  bd->SetUseRemote(true);
  // Copy one sequence, identified by "129295", from the source database.
2655  vector<string> ids(1, "129295");
2656  bd->AddIds(ids);
2657  bd->EndBuild();
2658  CFile f1(kOutput + ".pin");
2659  BOOST_REQUIRE(f1.Exists() == true);
2660 
  // A second EndBuild with erase=true must remove the index file.
2661  bd->EndBuild(true);
2662  BOOST_REQUIRE(f1.Exists() == false);
2663 }
2664 
// Test case: builds protein database "x" holding one sequence pulled from
// data/writedb_prot, requires x.pin to exist after EndBuild() and to be gone
// after EndBuild(true) (the `erase` argument).
// NOTE(review): original source lines 2668 and 2674 are absent from this
// listing; `log` and `bd` are presumably declared there.
2665 BOOST_AUTO_TEST_CASE(CBuildDatabase_TestBasicDatabaseCreation)
2666 {
2667  CTmpFile tmpfile;
2669  const string kOutput("x");
  // Register the output files for removal at process exit.
2670  CFileDeleteAtExit::Add("x.pin");
2671  CFileDeleteAtExit::Add("x.phr");
2672  CFileDeleteAtExit::Add("x.psq");
2673 
2675  bd.Reset(new CBuildDatabase(kOutput, "foo", true,
2676  CWriteDB::eNoIndex, false, &log));
2677  //CWriteDB::eDefault, false, &cerr));
2678  CRef<CTaxIdSet> tid(new CTaxIdSet(9301));
2679  //CRef<CTaxIdSet> tid(new CTaxIdSet(9606));
2680  bd->SetTaxids(*tid);
2681  bd->StartBuild();
2682  bd->SetSourceDb("data/writedb_prot");
2683  //bd->SetVerbosity(true);
2684  bd->SetUseRemote(true);
  // Copy one sequence, identified by "129295", from the source database.
2685  vector<string> ids(1, "129295");
2686  bd->AddIds(ids);
2687  bd->EndBuild();
2688  CFile f1(kOutput + ".pin");
2689  BOOST_REQUIRE(f1.Exists() == true);
2690 
  // A second EndBuild with erase=true must remove the index file.
2691  bd->EndBuild(true);
2692  BOOST_REQUIRE(f1.Exists() == false);
2693 }
2694 
// Test case: exercises the one-call CBuildDatabase::Build(ids, fasta) path
// with two IDs taken from the source database plus the FASTA file
// data/some_prots.fsa. Requires Build() to succeed, x.pin to exist
// afterwards, and x.pin to be gone after EndBuild(true).
// NOTE(review): original source lines 2698 and 2707 are absent from this
// listing; `log` and `bd` are presumably declared there.
2695 BOOST_AUTO_TEST_CASE(CBuildDatabase_TestQuickDatabaseCreation)
2696 {
2697  CTmpFile tmpfile;
2699  const string kOutput("x");
2700  const string title("fuwafuwa"); // it's Japanese...
  // Register the output files for removal at process exit.
2701  CFileDeleteAtExit::Add("x.pin");
2702  CFileDeleteAtExit::Add("x.phr");
2703  CFileDeleteAtExit::Add("x.psq");
2704 
2705  // FASTA file contains 25 sequences.
2706  CNcbiIfstream fasta_file("data/some_prots.fsa");
2708  bd.Reset(new CBuildDatabase(
2709  kOutput,
2710  title,
2711  true, // is_protein
2712  CWriteDB::eNoIndex, // indexing
2713  false, // use_gi_mask
2714  &log
2715  ));
2716  bd->SetSourceDb("data/writedb_prot");
2717 
2718  // These two IDs are NOT in the FASTA file.
2719  vector<string> ids;
2720  ids.push_back("166225656");
2721  ids.push_back("259646160");
2722 
2723  bool success = bd->Build(ids, &fasta_file);
2724  // Created DB should now contain 27 sequences.
  // NOTE(review): the sequence count itself is not asserted here.
2725  BOOST_REQUIRE(success);
2726 
2727  CFile f1(kOutput + ".pin");
2728  BOOST_REQUIRE(f1.Exists() == true);
2729 
  // EndBuild with erase=true must remove the index file.
2730  bd->EndBuild(true);
2731  BOOST_REQUIRE(f1.Exists() == false);
2732 }
2733 
// Test case: same as the quick-creation test but with an empty ID list, so
// Build() receives only the FASTA file data/some_prots.fsa. Requires Build()
// to succeed, x1.pin to exist afterwards, and x1.pin to be gone after
// EndBuild(true).
// NOTE(review): original source lines 2737 and 2746 are absent from this
// listing; `log` and `bd` are presumably declared there.
2734 BOOST_AUTO_TEST_CASE(CBuildDatabase_TestQuickDatabaseCreation_NoIds)
2735 {
2736  CTmpFile tmpfile;
2738  const string kOutput("x1");
2739  const string title("fuwafuwa");
  // Register the output files for removal at process exit.
2740  CFileDeleteAtExit::Add("x1.pin");
2741  CFileDeleteAtExit::Add("x1.phr");
2742  CFileDeleteAtExit::Add("x1.psq");
2743 
2744  // FASTA file contains 25 sequences.
2745  CNcbiIfstream fasta_file("data/some_prots.fsa");
2747  bd.Reset(new CBuildDatabase(
2748  kOutput,
2749  title,
2750  true, // is_protein
2751  CWriteDB::eNoIndex, // indexing
2752  false, // use_gi_mask
2753  &log
2754  ));
2755  bd->SetSourceDb("data/writedb_prot");
2756 
2757  // Not adding any IDs.
2758  vector<string> ids; // empty
2759 
2760  bool success = bd->Build(ids, &fasta_file);
2761  // Created DB should now contain 25 sequences.
  // NOTE(review): the sequence count itself is not asserted here.
2762  BOOST_REQUIRE(success);
2763 
2764  CFile f1(kOutput + ".pin");
2765  BOOST_REQUIRE(f1.Exists() == true);
2766 
  // EndBuild with erase=true must remove the index file.
2767  bd->EndBuild(true);
2768  BOOST_REQUIRE(f1.Exists() == false);
2769 }
2770 
// Helper class that hands out the CBioseq objects contained in a CSeq_entry
// one at a time (used with CBuildDatabase::AddSequences in the tests below).
// NOTE(review): the class header (2771), the constructor signature (2773),
// the accessor signature (2787) and the member declarations (2800-2803) are
// absent from this listing -- confirm details against the repository copy.
2772 public:
  // Constructor body: obtains the object manager, creates a scope on it and
  // keeps the entry; positions m_Bioseq on the first CBioseq, adds every
  // CBioseq found in the entry to the scope, then calls Parentize().
2774  : m_objmgr(CObjectManager::GetInstance()),
2775  m_scope(new CScope(*m_objmgr)),
2776  m_entry(seq_entry)
2777  {
2778  m_Bioseq = Begin(*m_entry);
2779  for (CTypeIterator<CBioseq> it = Begin(*m_entry); it; ++it)
2780  {
2781  m_scope->AddBioseq(*it);
2782  }
2783  m_entry->Parentize();
2784  }
2785 
2786 
  // Accessor body: returns the CBioseq the iterator currently points at and
  // advances the iterator; returns an empty CConstRef once it is exhausted.
2788  {
2789  CConstRef<CBioseq> rv;
2790 
2791  if (m_Bioseq)
2792  {
2793  rv.Reset(&(*m_Bioseq));
2794  ++m_Bioseq;
2795  }
2796  return rv;
2797  }
2798 
2799 private:
2804 };
2805 
// Test case: reads a CSeq_entry in ASN.1 text form from
// data/AXBT01000003.asn, feeds it to a nucleotide CBuildDatabase through
// CSeqEntryGetSource, and requires AddSequences() to return true and x.nin
// to exist after EndBuild().
// NOTE(review): original source lines 2810 and 2816 are absent from this
// listing; `log` and `bd` are presumably declared there.
2806 BOOST_AUTO_TEST_CASE(CBuildDatabase_WGS_gap)
2807 {
2808 
2809  CTmpFile tmpfile;
2811  const string kOutput("x");
  // Register the output files for removal at process exit.
2812  CFileDeleteAtExit::Add("x.nin");
2813  CFileDeleteAtExit::Add("x.nhr");
2814  CFileDeleteAtExit::Add("x.nsq");
2815 
2817  bd.Reset(new CBuildDatabase(kOutput, "foo", false,
2818  CWriteDB::eNoIndex, false, &log));
2819  bd->StartBuild();
2820 
  // Deserialize the whole entry, then expose its bioseqs as a source.
2821  unique_ptr<CObjectIStream> ois
2822  (CObjectIStream::Open(eSerial_AsnText, "data/AXBT01000003.asn"));
2823  CRef<CSeq_entry> entry(new CSeq_entry);
2824  *ois >> *entry;
2825  CSeqEntryGetSource seqentry_source(entry);
2826 
2827  bool status = bd->AddSequences(seqentry_source);
2828  BOOST_REQUIRE(status == true);
2829  bd->EndBuild();
2830  CFile f1(kOutput + ".nin");
2831  BOOST_REQUIRE(f1.Exists() == true);
2832 }
2833 
2834 #ifdef NCBI_INT8_GI
// Test case (only built when NCBI_INT8_GI is defined): writes a numeric ISAM
// index mapping ten GIs starting at 0xC0000000 to OIDs 0..9, reopens it with
// CSeqDBIsam and requires every GI to resolve to the OID it was stored
// under.
// Changes from the previous revision:
//  * the write-loop index is `int` (was Uint4), matching `nrecs` and the
//    read loop, so the loop condition no longer mixes signedness;
//  * `oid` is initialised to -1, so a lookup that does not fill it in fails
//    the `oid == i` check instead of reading an indeterminate value.
// NOTE(review): both catch(...) handlers report "CSeq_id constructor threw
// exception" although their try blocks also cover AddIds / IdToOid and, in
// the read loop, the BOOST_REQUIRE itself; consider narrowing the try blocks.
2835 BOOST_AUTO_TEST_CASE(CSeqDBIsam_32bit_GI)
2836 {
2837  // When process exits, clean up these files if they still exist.
2838  CFileDeleteAtExit::Add("big_gi.00.pni");
2839  CFileDeleteAtExit::Add("big_gi.00.pnd");
2840 
2841  const Int8 big_gi = 0xC0000000; // 3 "billion"
2842  // Write a numeric ISAM DB containing GI/OID records using GIs starting
2843  // with big_gi above.
2844 
2845  CWriteDB_Isam wdb(
2846  eGi,
2847  "big_gi",
2848  true, // is protein?
2849  0, // volume index
2850  1024L, // 1 kiB
2851  false // use sparse mode?
2852  );
2853 
2854  // Set number of records to be written/read
2855  const int nrecs = 10;
2856 
2857  // (Try to) create seqid reference from GI.
2858  // If we succeed, add the seqid to the DB under OID i.
2859  for (int i = 0; i < nrecs; ++i) {
2860  CWriteDB_Isam::TIdList tidlist;
2861  TGi gi = GI_FROM(Int8, (big_gi + i));
2862  try {
2863  CRef<CSeq_id> seqid( new CSeq_id(CSeq_id::e_Gi, gi));
2864  tidlist.push_back(seqid);
2865  wdb.AddIds(i, tidlist);
2866  } catch (...) {
2867  BOOST_FAIL("CSeq_id constructor threw exception");
2868  return;
2869  }
2870  }
2871  // Close database.
2872  wdb.Close();
2873 
2874  // Reopen DB for reading.
2875  CSeqDBAtlas atlas(true);
2876  CSeqDBLockHold lock(atlas);
2877  CRef<CSeqDBIsam> rdb( new CSeqDBIsam( atlas, "big_gi.00", 'p', 'n', eGiId));
2878 
2879  // Read back records and verify.
2880  for (int i = 0; i < nrecs; ++i) {
2881  TGi gi = GI_FROM(Int8, (big_gi + i));
2882  try {
2883  CRef<CSeq_id> seqid( new CSeq_id(CSeq_id::e_Gi, gi));
  // -1 can never equal a loop index, so an unfilled oid fails the check.
2884  int oid = -1;
2885  rdb->IdToOid(GI_TO(Int8, seqid->GetGi()), oid);
2886  BOOST_REQUIRE(oid == i);
2887  } catch (...) {
2888  BOOST_FAIL("CSeq_id constructor threw exception");
2889  return;
2890  }
2891  }
2892 }
2893 #endif
2894 
// Test case: writes a temporary protein FASTA file with one record per entry
// of `fasta_ids` (bare accessions and bar-delimited IDs), builds
// data/bare_id_test_prot from it, and requires the first Seq-id of each OID
// to have the CSeq_id choice listed for that entry.
// The write loop and the check loop iterate the same unmodified
// unordered_map, so both visit the entries in the same order and OID `index`
// corresponds to the index-th visited entry.
// NOTE(review): original source line 2928 is absent from this listing;
// `istr` is presumably opened on the temporary file there.
2895 BOOST_AUTO_TEST_CASE(ReadBareIDProtein)
2896 {
2897  // create a FASTA file with bare and legacy IDs
2898  CTmpFile tmpfile;
2899  CNcbiOfstream ostr(tmpfile.GetFileName().c_str());
2900  string sequence = "MASTQNIVEEVQKMLDTYDTNKDGEITKAEAVEYFKGKKAFNPER";
2901 
  // Defline ID text -> expected CSeq_id type after parsing.
2902  std::unordered_map<string, CSeq_id::E_Choice> fasta_ids = {
2903  {"XP_642131.1", CSeq_id::e_Other},
2904  {"ref|XP_642837.1", CSeq_id::e_Other},
2905  {"BAA06266.1", CSeq_id::e_Ddbj},
2906  {"dbj|GAE97797.1", CSeq_id::e_Ddbj},
2907  {"320460102", CSeq_id::e_Local},
2908  {"gi|716054866", CSeq_id::e_Gi},
2909  {"Q02VU1.1", CSeq_id::e_Swissprot},
2910  {"sp|Q6GIX1.1|CADA_STAAR", CSeq_id::e_Swissprot},
2911  {"EQR80552.1", CSeq_id::e_Genbank},
2912  {"gb|EQS08124.1", CSeq_id::e_Genbank},
2913  {"Somestring", CSeq_id::e_Local},
2914  {"lcl|anotherstring", CSeq_id::e_Local},
2915  {"12AS_A", CSeq_id::e_Pdb},
2916  {"pdb|1I4D|D", CSeq_id::e_Pdb},
2917  {"2209341B", CSeq_id::e_Local},
2918  {"prf||2209335A", CSeq_id::e_Prf},
2919  {"T49736", CSeq_id::e_Local},
2920  {"pir||AI1052", CSeq_id::e_Pir}};
2921 
  // One FASTA record per ID, all sharing the same residue string.
2922  for (auto it: fasta_ids) {
2923  ostr << ">" << it.first << endl << sequence << endl;
2924  }
2925  ostr.close();
2926 
2927  // create a database from the fasta file
2929  BOOST_REQUIRE(istr);
2930  string dbname = "data/bare_id_test_prot";
2931  string title = "Temporary unit test db";
2932  ostringstream log;
2933  CBuildDatabase db(dbname, title, true, false, true, false, &log);
2934 
2935  db.StartBuild();
2936  db.AddFasta(istr);
2937  db.EndBuild();
2938 
  // Register the output files for removal at process exit.
2939  CFileDeleteAtExit::Add(dbname + ".phr");
2940  CFileDeleteAtExit::Add(dbname + ".pin");
2941  CFileDeleteAtExit::Add(dbname + ".psq");
2942  CFileDeleteAtExit::Add(dbname + ".pog");
2943  CFileDeleteAtExit::Add(dbname + ".psd");
2944  CFileDeleteAtExit::Add(dbname + ".psi");
2945 
2946  int index = 0;
2947  CSeqDB seqdb(dbname, CSeqDB::eProtein);
2948 
2949  // check that each sequence id has a correct type
2950  for (auto it: fasta_ids) {
2951  list< CRef<CSeq_id> > ids = seqdb.GetSeqIDs(index++);
2952  BOOST_REQUIRE_MESSAGE(ids.front()->Which() == it.second,
2953  (string)"Sequence id type for " +
2954  it.first + " is " +
2955  NStr::IntToString(ids.front()->Which()) +
2956  " (expected " + NStr::IntToString(it.second)
2957  + ")");
2958  }
2959  BOOST_REQUIRE_EQUAL(index, (int)fasta_ids.size());
2960 }
2961 
2962 
// Test case: writes a single FASTA record whose defline carries every entry
// of `fasta_ids`, separated by '\01', builds data/bare_id_test_prot2 from
// it, and requires OID 0 to expose as many Seq-ids as there are entries,
// each with the expected CSeq_id choice (compared in map iteration order).
// Change from the previous revision: the failure message now reports the
// type of the Seq-id being checked, (*seqdb_id)->Which(), rather than
// always the type of the first Seq-id in the list.
// NOTE(review): original source line 2988 is absent from this listing;
// `istr` is presumably opened on the temporary file there.
2963 BOOST_AUTO_TEST_CASE(ReadMultipleBareIDs)
2964 {
2965  // create a FASTA file with bare and legacy IDs
2966  CTmpFile tmpfile;
2967  CNcbiOfstream ostr(tmpfile.GetFileName().c_str());
2968  string sequence = "MASTQNIVEEVQKMLDTYDTNKDGEITKAEAVEYFKGKKAFNPER";
2969 
  // Defline ID text -> expected CSeq_id type after parsing.
2970  std::unordered_map<string, CSeq_id::E_Choice> fasta_ids = {
2971  {"XP_642131.1", CSeq_id::e_Other},
2972  {"ref|XP_642837.1", CSeq_id::e_Other},
2973  {"BAA06266.1", CSeq_id::e_Ddbj},
2974  {"dbj|GAE97797.1", CSeq_id::e_Ddbj},
2975  {"320460102", CSeq_id::e_Local},
2976  {"gi|716054866", CSeq_id::e_Gi}};
2977 
  // The first ID follows '>', every further ID follows a '\01' separator.
2978  auto it = fasta_ids.begin();
2979  ostr << ">" << it->first << " Some defline";
2980  ++it;
2981  for (; it != fasta_ids.end(); ++it) {
2982  ostr << '\01' << it->first << " Some defline";
2983  }
2984  ostr << endl << sequence << endl;
2985  ostr.close();
2986 
2987  // create a database from the fasta file
2989  BOOST_REQUIRE(istr);
2990  string dbname = "data/bare_id_test_prot2";
2991  string title = "Temporary unit test db";
2992  ostringstream log;
2993  CBuildDatabase db(dbname, title, true, false, true, false, &log);
2994 
2995  db.StartBuild();
2996  db.AddFasta(istr);
2997  db.EndBuild();
2998 
  // Register the output files for removal at process exit.
2999  CFileDeleteAtExit::Add(dbname + ".phr");
3000  CFileDeleteAtExit::Add(dbname + ".pin");
3001  CFileDeleteAtExit::Add(dbname + ".psq");
3002  CFileDeleteAtExit::Add(dbname + ".pog");
3003  CFileDeleteAtExit::Add(dbname + ".psd");
3004  CFileDeleteAtExit::Add(dbname + ".psi");
3005 
3006  CSeqDB seqdb(dbname, CSeqDB::eProtein);
3007 
3008  list< CRef<CSeq_id> > ids = seqdb.GetSeqIDs(0);
3009  BOOST_REQUIRE_EQUAL(ids.size(), fasta_ids.size());
3010 
  // Walk the stored IDs and the expectations in lock step.
3011  auto seqdb_id = ids.begin();
3012  for (auto it: fasta_ids) {
3013  BOOST_REQUIRE_MESSAGE((*seqdb_id)->Which() == it.second,
3014  (string)"Sequence id type for " +
3015  it.first + " is " +
3016  NStr::IntToString((*seqdb_id)->Which()) +
3017  " (expected " + NStr::IntToString(it.second)
3018  + ")");
3019  ++seqdb_id;
3020  }
3021  BOOST_REQUIRE(seqdb_id == ids.end());
3022 }
3023 
3024 
// Test case: nucleotide counterpart of the bare-ID protein test. Writes one
// FASTA record per entry of `fasta_ids`, builds data/bare_id_test_nucl, and
// requires the first Seq-id of each OID to have the expected CSeq_id choice.
// The write loop and the check loop iterate the same unmodified
// unordered_map, so both visit the entries in the same order.
// NOTE(review): original source lines 3050 and 3069 are absent from this
// listing; `istr` and `seqdb` are presumably declared there.
3025 BOOST_AUTO_TEST_CASE(ReadBareIDNucleotide)
3026 {
3027  // create a FASTA file with bare and legacy IDs
3028  CTmpFile tmpfile;
3029  CNcbiOfstream ostr(tmpfile.GetFileName().c_str());
3030  string sequence = "AACTAGTATTAGAGGCACTGCCTGCCCAGTGACAATCGTTAAACGGCCG";
3031 
  // Defline ID text -> expected CSeq_id type after parsing.
3032  std::unordered_map<string, CSeq_id::E_Choice> fasta_ids = {
3033  {"U13103.1", CSeq_id::e_Genbank},
3034  {"gb|U13080.1", CSeq_id::e_Genbank},
3035  {"Z18633.1", CSeq_id::e_Embl},
3036  {"emb|Z18632.1", CSeq_id::e_Embl},
3037  {"NM_176670.2", CSeq_id::e_Other},
3038  {"ref|NM_175822.2", CSeq_id::e_Other},
3039  {"SRR1272186", CSeq_id::e_Local},
3040  {"gnl|SRA|SRR342213.1", CSeq_id::e_General},
3041  {"gi|971149218", CSeq_id::e_Gi},
3042  {"emb|LO018508.1", CSeq_id::e_Embl}};
3043 
  // One FASTA record per ID, all sharing the same base string.
3044  for (auto it: fasta_ids) {
3045  ostr << ">" << it.first << endl << sequence << endl;
3046  }
3047  ostr.close();
3048 
3049  // create a database from the fasta file
3051  BOOST_REQUIRE(istr);
3052  string dbname = "data/bare_id_test_nucl";
3053  string title = "Temporary unit test db";
3054  ostringstream log;
3055  CBuildDatabase db(dbname, title, false, false, true, false, &log);
3056 
3057  db.StartBuild();
3058  db.AddFasta(istr);
3059  db.EndBuild();
3060 
  // Register the output files for removal at process exit.
3061  CFileDeleteAtExit::Add(dbname + ".nhr");
3062  CFileDeleteAtExit::Add(dbname + ".nin");
3063  CFileDeleteAtExit::Add(dbname + ".nsq");
3064  CFileDeleteAtExit::Add(dbname + ".nog");
3065  CFileDeleteAtExit::Add(dbname + ".nsd");
3066  CFileDeleteAtExit::Add(dbname + ".nsi");
3067 
3068  int index = 0;
3070 
3071  // check that each sequence id has a correct type
3072  for (auto it: fasta_ids) {
3073  list< CRef<CSeq_id> > ids = seqdb.GetSeqIDs(index++);
3074  BOOST_REQUIRE_MESSAGE(ids.front()->Which() == it.second,
3075  (string)"Sequence id type for " +
3076  it.first + " is " +
3077  NStr::IntToString(ids.front()->Which()) +
3078  " (expected " + NStr::IntToString(it.second)
3079  + ")");
3080  }
3081  BOOST_REQUIRE_EQUAL(index, (int)fasta_ids.size());
3082 }
3083 
3084 
// Test case: same ID table and checks as the bare-ID protein test, but the
// CBuildDatabase is constructed with one additional trailing `true` argument
// and writes to data/bare_id_test_prot_legacy.
// NOTE(review): the extra constructor argument presumably switches on
// legacy/long Seq-id handling -- confirm against the CBuildDatabase API.
// NOTE(review): original source line 3119 is absent from this listing;
// `istr` is presumably opened on the temporary file there.
3085 BOOST_AUTO_TEST_CASE(ReadMixIDsProtein)
3086 {
3087  // create a FASTA file with bare and legacy IDs
3088  CTmpFile tmpfile;
3089  CNcbiOfstream ostr(tmpfile.GetFileName().c_str());
3090  string sequence = "MASTQNIVEEVQKMLDTYDTNKDGEITKAEAVEYFKGKKAFNPER";
3091 
  // Defline ID text -> expected CSeq_id type after parsing.
3092  std::unordered_map<string, CSeq_id::E_Choice> fasta_ids = {
3093  {"XP_642131.1", CSeq_id::e_Other},
3094  {"ref|XP_642837.1", CSeq_id::e_Other},
3095  {"BAA06266.1", CSeq_id::e_Ddbj},
3096  {"dbj|GAE97797.1", CSeq_id::e_Ddbj},
3097  {"320460102", CSeq_id::e_Local},
3098  {"gi|716054866", CSeq_id::e_Gi},
3099  {"Q02VU1.1", CSeq_id::e_Swissprot},
3100  {"sp|Q6GIX1.1|CADA_STAAR", CSeq_id::e_Swissprot},
3101  {"EQR80552.1", CSeq_id::e_Genbank},
3102  {"gb|EQS08124.1", CSeq_id::e_Genbank},
3103  {"Somestring", CSeq_id::e_Local},
3104  {"lcl|anotherstring", CSeq_id::e_Local},
3105  {"12AS_A", CSeq_id::e_Pdb},
3106  {"pdb|1I4D|D", CSeq_id::e_Pdb},
3107  {"2209341B", CSeq_id::e_Local},
3108  {"prf||2209335A", CSeq_id::e_Prf},
3109  {"T49736", CSeq_id::e_Local},
3110  {"pir||AI1052", CSeq_id::e_Pir}};
3111 
3112 
  // One FASTA record per ID; the check loop below iterates the same
  // unmodified map, so both loops visit the entries in the same order.
3113  for (auto it: fasta_ids) {
3114  ostr << ">" << it.first << endl << sequence << endl;
3115  }
3116  ostr.close();
3117 
3118  // create a database from the fasta file
3120  BOOST_REQUIRE(istr);
3121  string dbname = "data/bare_id_test_prot_legacy";
3122  string title = "Temporary unit test db";
3123  ostringstream log;
3124  CBuildDatabase db(dbname, title, true, false, true, false, &log, true);
3125 
3126  db.StartBuild();
3127  db.AddFasta(istr);
3128  db.EndBuild();
3129 
  // Register the output files for removal at process exit.
3130  CFileDeleteAtExit::Add(dbname + ".phr");
3131  CFileDeleteAtExit::Add(dbname + ".pin");
3132  CFileDeleteAtExit::Add(dbname + ".psq");
3133  CFileDeleteAtExit::Add(dbname + ".pog");
3134  CFileDeleteAtExit::Add(dbname + ".psd");
3135  CFileDeleteAtExit::Add(dbname + ".psi");
3136 
3137  int index = 0;
3138  CSeqDB seqdb(dbname, CSeqDB::eProtein);
3139 
3140  // check that each sequence id has a correct type
3141  for (auto it: fasta_ids) {
3142  list< CRef<CSeq_id> > ids = seqdb.GetSeqIDs(index++);
3143  BOOST_REQUIRE_MESSAGE(ids.front()->Which() == it.second,
3144  (string)"Sequence id type for " +
3145  it.first + " is " +
3146  NStr::IntToString(ids.front()->Which()) +
3147  " (expected " + NStr::IntToString(it.second)
3148  + ")");
3149  }
3150  BOOST_REQUIRE_EQUAL(index, (int)fasta_ids.size());
3151 }
3152 
3153 
// Test case: writes a single FASTA record whose defline carries every entry
// of `fasta_ids`, separated by '\01', builds data/bare_id_test_legacy_prot2
// with the additional trailing `true` CBuildDatabase argument, and requires
// OID 0 to expose as many Seq-ids as there are entries, each with the
// expected CSeq_id choice (compared in map iteration order).
// Change from the previous revision: the failure message now reports the
// type of the Seq-id being checked, (*seqdb_id)->Which(), rather than
// always the type of the first Seq-id in the list.
// NOTE(review): original source line 3179 is absent from this listing;
// `istr` is presumably opened on the temporary file there.
3154 BOOST_AUTO_TEST_CASE(ReadMultipleMixLongIDs)
3155 {
3156  // create a FASTA file with bare and legacy IDs
3157  CTmpFile tmpfile;
3158  CNcbiOfstream ostr(tmpfile.GetFileName().c_str());
3159  string sequence = "MASTQNIVEEVQKMLDTYDTNKDGEITKAEAVEYFKGKKAFNPER";
3160 
  // Defline ID text -> expected CSeq_id type after parsing.
3161  std::unordered_map<string, CSeq_id::E_Choice> fasta_ids = {
3162  {"XP_642131.1", CSeq_id::e_Other},
3163  {"ref|XP_642837.1", CSeq_id::e_Other},
3164  {"BAA06266.1", CSeq_id::e_Ddbj},
3165  {"dbj|GAE97797.1", CSeq_id::e_Ddbj},
3166  {"320460102", CSeq_id::e_Local},
3167  {"gi|716054866", CSeq_id::e_Gi}};
3168 
  // The first ID follows '>', every further ID follows a '\01' separator.
3169  auto it = fasta_ids.begin();
3170  ostr << ">" << it->first << " Some defline";
3171  ++it;
3172  for (; it != fasta_ids.end(); ++it) {
3173  ostr << '\01' << it->first << " Some defline";
3174  }
3175  ostr << endl << sequence << endl;
3176  ostr.close();
3177 
3178  // create a database from the fasta file
3180  BOOST_REQUIRE(istr);
3181  string dbname = "data/bare_id_test_legacy_prot2";
3182  string title = "Temporary unit test db";
3183  ostringstream log;
3184  CBuildDatabase db(dbname, title, true, false, true, false, &log, true);
3185 
3186  db.StartBuild();
3187  db.AddFasta(istr);
3188  db.EndBuild();
3189 
  // Register the output files for removal at process exit.
3190  CFileDeleteAtExit::Add(dbname + ".phr");
3191  CFileDeleteAtExit::Add(dbname + ".pin");
3192  CFileDeleteAtExit::Add(dbname + ".psq");
3193  CFileDeleteAtExit::Add(dbname + ".pog");
3194  CFileDeleteAtExit::Add(dbname + ".psd");
3195  CFileDeleteAtExit::Add(dbname + ".psi");
3196 
3197  CSeqDB seqdb(dbname, CSeqDB::eProtein);
3198 
3199  list< CRef<CSeq_id> > ids = seqdb.GetSeqIDs(0);
3200  BOOST_REQUIRE_EQUAL(ids.size(), fasta_ids.size());
3201 
  // Walk the stored IDs and the expectations in lock step.
3202  auto seqdb_id = ids.begin();
3203  for (auto it: fasta_ids) {
3204  BOOST_REQUIRE_MESSAGE((*seqdb_id)->Which() == it.second,
3205  (string)"Sequence id type for " +
3206  it.first + " is " +
3207  NStr::IntToString((*seqdb_id)->Which()) +
3208  " (expected " + NStr::IntToString(it.second)
3209  + ")");
3210  ++seqdb_id;
3211  }
3212  BOOST_REQUIRE(seqdb_id == ids.end());
3213 }
3214 
3215 
// Test case: nucleotide test with six bare / bar-delimited IDs, built with
// the additional trailing `true` CBuildDatabase argument into
// data/bare_id_test_nucl_legacy. Requires the first Seq-id of each OID to
// have the expected CSeq_id choice.
// The write loop and the check loop iterate the same unmodified
// unordered_map, so both visit the entries in the same order.
// NOTE(review): original source lines 3237 and 3256 are absent from this
// listing; `istr` and `seqdb` are presumably declared there.
3216 BOOST_AUTO_TEST_CASE(ReadMixIDNucleotide)
3217 {
3218  // create a FASTA file with bare and legacy IDs
3219  CTmpFile tmpfile;
3220  CNcbiOfstream ostr(tmpfile.GetFileName().c_str());
3221  string sequence = "AACTAGTATTAGAGGCACTGCCTGCCCAGTGACAATCGTTAAACGGCCG";
3222 
  // Defline ID text -> expected CSeq_id type after parsing.
3223  std::unordered_map<string, CSeq_id::E_Choice> fasta_ids = {
3224  {"U13103.1", CSeq_id::e_Genbank},
3225  {"gb|U13080.1", CSeq_id::e_Genbank},
3226  {"Z18633.1", CSeq_id::e_Embl},
3227  {"emb|Z18632.1", CSeq_id::e_Embl},
3228  {"NM_176670.2", CSeq_id::e_Other},
3229  {"ref|NM_175822.2", CSeq_id::e_Other}};
3230 
  // One FASTA record per ID, all sharing the same base string.
3231  for (auto it: fasta_ids) {
3232  ostr << ">" << it.first << endl << sequence << endl;
3233  }
3234  ostr.close();
3235 
3236  // create a database from the fasta file
3238  BOOST_REQUIRE(istr);
3239  string dbname = "data/bare_id_test_nucl_legacy";
3240  string title = "Temporary unit test db";
3241  ostringstream log;
3242  CBuildDatabase db(dbname, title, false, false, true, false, &log, true);
3243 
3244  db.StartBuild();
3245  db.AddFasta(istr);
3246  db.EndBuild();
3247 
  // Register the output files for removal at process exit.
3248  CFileDeleteAtExit::Add(dbname + ".nhr");
3249  CFileDeleteAtExit::Add(dbname + ".nin");
3250  CFileDeleteAtExit::Add(dbname + ".nsq");
3251  CFileDeleteAtExit::Add(dbname + ".nog");
3252  CFileDeleteAtExit::Add(dbname + ".nsd");
3253  CFileDeleteAtExit::Add(dbname + ".nsi");
3254 
3255  int index = 0;
3257 
3258  // check that each sequence id has a correct type
3259  for (auto it: fasta_ids) {
3260  list< CRef<CSeq_id> > ids = seqdb.GetSeqIDs(index++);
3261  BOOST_REQUIRE_EQUAL(ids.front()->Which(), it.second);
3262  }
3263  BOOST_REQUIRE_EQUAL(index, (int)fasta_ids.size());
3264 }
3265 
// Test case: reads the non-empty lines of data/seqidlist.nucl and checks
// WriteBlastSeqidlistFile / CBlastSeqidlistFile::GetSeqidlist round trips in
// three scenarios: plain list, list written with a database supplied, and a
// list in which every ID appears twice. Each scenario writes to its own
// temporary file and reads it back through a CMemoryFile.
3266 BOOST_AUTO_TEST_CASE(CreateV5Seqidlist)
3267 {
3268  CNcbiIfstream seqidFile("data/seqidlist.nucl");
3269  vector<string> idlist;
  // Collect every non-empty line as one ID.
3270  while (seqidFile) {
3271  string line;
3272  NcbiGetlineEOL(seqidFile, line);
3273  if ( !line.empty() ) {
3274  idlist.push_back(line);
3275  }
3276  }
3277  // Test create seqidlist from text file
3278  {
3279  const string kTitle("Unit Test Seqidlist");
3280  const size_t num_of_ids = 12;
3281  CTmpFile tmpfile;
3282  CNcbiOstream & os = tmpfile.AsOutputFile(CTmpFile::eIfExists_Reset, IOS_BASE::out | IOS_BASE::binary);
3283  SBlastSeqIdListInfo list_info;
3284  vector<CSeqDBGiList::SSiOid> read_idlist;
3285  WriteBlastSeqidlistFile(idlist, os, kTitle);
3286  CMemoryFile mf(tmpfile.GetFileName());
3287  CBlastSeqidlistFile::GetSeqidlist(mf,read_idlist, list_info);
  // Header fields and spot-checked entries must match what was written.
3288  BOOST_REQUIRE_EQUAL(num_of_ids, list_info.num_ids);
3289  BOOST_REQUIRE_EQUAL(num_of_ids, read_idlist.size());
3290  BOOST_REQUIRE_EQUAL(list_info.title, kTitle);
3291  BOOST_REQUIRE_EQUAL(list_info.file_size, (Uint8) mf.GetFileSize());
3292  BOOST_REQUIRE_EQUAL(read_idlist[2].si, "D88758.1");
3293  BOOST_REQUIRE_EQUAL(read_idlist[7].si, "SRA:SRR066117.18823.2");
3294  BOOST_REQUIRE_EQUAL(read_idlist[11].si, "u00001.1");
3295  }
3296  // Test create seqidlist with db lookup
3297  {
3298  const string kTitle("Unit Test Seqidlist w DB");
  // With a database supplied, 9 IDs are expected instead of 12.
3299  const size_t num_of_ids = 9;
3300  CTmpFile tmpfile;
3301  CNcbiOstream & os = tmpfile.AsOutputFile(CTmpFile::eIfExists_Reset, IOS_BASE::out | IOS_BASE::binary);
3302  SBlastSeqIdListInfo list_info;
3303  vector<CSeqDBGiList::SSiOid> read_idlist;
3304  CSeqDB db("data/writedb_nucl_v5", CSeqDB::eNucleotide);
3305  WriteBlastSeqidlistFile(idlist, os, kTitle, &db);
3306  CMemoryFile mf(tmpfile.GetFileName());
3307  CBlastSeqidlistFile::GetSeqidlist(mf,read_idlist, list_info);
3308  BOOST_REQUIRE_EQUAL(num_of_ids, list_info.num_ids);
3309  BOOST_REQUIRE_EQUAL(num_of_ids, read_idlist.size());
3310  BOOST_REQUIRE_EQUAL(list_info.title, kTitle);
3311  BOOST_REQUIRE_EQUAL(list_info.file_size, (Uint8) mf.GetFileSize());
  // The header must also record the database's volume length.
3312  BOOST_REQUIRE_EQUAL(list_info.db_vol_length, db.GetVolumeLength());
3313  BOOST_REQUIRE_EQUAL(read_idlist[2].si, "D88758.1");
3314  BOOST_REQUIRE_EQUAL(read_idlist[6].si, "U00001.1");
3315 
3316  }
3317 
3318  // Test create seqidlist, remove duplicate ids
3319  {
3320  const string kTitle("Unit Test Seqidlist Duplicate");
3321  const size_t num_of_ids = 12;
3322  CTmpFile tmpfile;
3323  CNcbiOstream & os = tmpfile.AsOutputFile(CTmpFile::eIfExists_Reset, IOS_BASE::out | IOS_BASE::binary);
3324  SBlastSeqIdListInfo list_info;
  // dup_list holds idlist twice; the stored list must still have 12 IDs.
3325  vector<string> dup_list;
3326  dup_list.insert(dup_list.begin(), idlist.begin(), idlist.end());
3327  dup_list.insert(dup_list.end(), idlist.begin(), idlist.end());
3328  vector<CSeqDBGiList::SSiOid> read_idlist;
3329  WriteBlastSeqidlistFile(dup_list, os, kTitle);
3330  CMemoryFile mf(tmpfile.GetFileName());
3331  CBlastSeqidlistFile::GetSeqidlist(mf,read_idlist, list_info);
3332  BOOST_REQUIRE_EQUAL(num_of_ids, list_info.num_ids);
3333  BOOST_REQUIRE_EQUAL(num_of_ids, read_idlist.size());
3334  BOOST_REQUIRE_EQUAL(list_info.title, kTitle);
3335  BOOST_REQUIRE_EQUAL(list_info.file_size, (Uint8) mf.GetFileSize());
3336  BOOST_REQUIRE_EQUAL(read_idlist[2].si, "D88758.1");
3337  BOOST_REQUIRE_EQUAL(read_idlist[7].si, "SRA:SRR066117.18823.2");
3338  BOOST_REQUIRE_EQUAL(read_idlist[11].si, "u00001.1");
3339  }
3340 }
3341 
// Test case: builds data/multiseqids from data/WP_009685663.fasta and
// requires OID 0 to expose exactly seven Seq-ids whose type and
// GetSeqIdString(true) text match `fasta_ids`, in array order.
3342 BOOST_AUTO_TEST_CASE(ReadMultiSeqIdsDefline)
3343 {
3344  static const int num_ids = 7;
  // Expected (ID string, CSeq_id type) pairs, in the order they must appear.
3345  pair <string, CSeq_id::E_Choice> fasta_ids[num_ids] = {
3346  make_pair("497371450", CSeq_id::e_Gi),
3347  make_pair("WP_009685663.1", CSeq_id::e_Other),
3348  make_pair("955937162", CSeq_id::e_Gi),
3349  make_pair("KSD99966.1", CSeq_id::e_Genbank),
3350  make_pair("956677830", CSeq_id::e_Gi),
3351  make_pair("KSL27839.1", CSeq_id::e_Genbank),
3352  make_pair("6ES9_A", CSeq_id::e_Pdb)};
3353 
3354  CNcbiIfstream istr("data/WP_009685663.fasta");
3355  string dbname = "data/multiseqids";
3356  string title = "Temporary unit test db";
3357  ostringstream log;
3358  CBuildDatabase db(dbname, title, true, false, true, false, &log, true);
3359 
3360  db.StartBuild();
3361  db.AddFasta(istr);
3362  db.EndBuild();
3363 
  // Register the output files for removal at process exit.
3364  CFileDeleteAtExit::Add(dbname + ".phr");
3365  CFileDeleteAtExit::Add(dbname + ".pin");
3366  CFileDeleteAtExit::Add(dbname + ".psq");
3367  CFileDeleteAtExit::Add(dbname + ".pog");
3368  CFileDeleteAtExit::Add(dbname + ".psd");
3369  CFileDeleteAtExit::Add(dbname + ".psi");
3370  CFileDeleteAtExit::Add(dbname + ".pni");
3371  CFileDeleteAtExit::Add(dbname + ".pnd");
3372 
3373  CSeqDB seqdb(dbname, CSeqDB::eProtein);
3374 
  // The size check guarantees the lock-step walk below stays in range.
3375  list< CRef<CSeq_id> > ids = seqdb.GetSeqIDs(0);
3376  BOOST_REQUIRE_EQUAL(ids.size(), num_ids);
3377 
3378  auto seqdb_id = ids.begin();
3379  for (auto it: fasta_ids) {
3380  BOOST_REQUIRE_EQUAL((*seqdb_id)->Which(),it.second);
3381  BOOST_REQUIRE_EQUAL((*seqdb_id)->GetSeqIdString(true),it.first);
3382  ++seqdb_id;
3383  }
3384 }
3385 
// Test case body: builds the eBDB_Version5 protein database data/pdbs_v5
// from data/pdbs.fasta, requires every Seq-id of every OID to be of type
// e_Pdb, and checks that each accession listed in data/pdbs_ids.ref
// resolves to an OID.
// NOTE(review): the BOOST_AUTO_TEST_CASE header (original source line 3386)
// is absent from this listing -- confirm the test name against the
// repository copy.
3387 {
3388  CNcbiIfstream istr("data/pdbs.fasta");
3389  string dbname = "data/pdbs_v5";
3390  string title = "Temporary unit test db";
3391  ostringstream log;
  // Inner scope: the builder is destroyed before the database is reopened.
3392  {
3393  CNcbiApplication::Instance()->SetEnvironment("BLASTDB_LMDB_MAP_SIZE", "100000");
3394  CBuildDatabase db(dbname, title, true, false, true, false, &log, true, eBDB_Version5);
3395  db.StartBuild();
3396  db.AddFasta(istr);
3397  db.EndBuild();
3398  }
3399 
3400 
  // NOTE(review): db_ids and db_oids are never used in this block.
3401  vector<string> db_ids;
3402  vector<int> db_oids;
3403  CSeqDB seqdb(dbname, CSeqDB::eProtein);
3404  int oid= 0;
  // Visit every OID the database reports; all of its IDs must be PDB IDs.
3405  while (seqdb.CheckOrFindOID(oid)) {
3406  list<CRef<CSeq_id> > seq_ids = seqdb.GetSeqIDs(oid);
3407  ITERATE(list<CRef<CSeq_id> >, itr, seq_ids) {
3408  BOOST_REQUIRE_EQUAL((*itr)->Which(), CSeq_id::e_Pdb);
3409  }
3410  oid++;
3411  }
  // Load the reference accessions, one per line.
3412  CNcbiIfstream ref_ids_file("data/pdbs_ids.ref");
3413  vector<string> ref_ids;
3414  string line;
3415  while (getline(ref_ids_file, line)) {
3416  ref_ids.push_back(line);
3417  }
3418  vector<blastdb::TOid> oids;
3419  seqdb.AccessionsToOids(ref_ids, oids);
3420 
  // BOOST_CHECK (not REQUIRE): every unresolved accession is reported.
3421  ITERATE(vector<blastdb::TOid>, itr, oids){
3422  BOOST_CHECK(*itr != kSeqDBEntryNotFound);
3423  }
3424 
  // NOTE(review): cleanup is registered only here, after the assertions, so
  // a failing BOOST_REQUIRE above exits before the files are registered.
3425  CFileDeleteAtExit::Add(dbname + ".phr");
3426  CFileDeleteAtExit::Add(dbname + ".pin");
3427  CFileDeleteAtExit::Add(dbname + ".psq");
3428  CFileDeleteAtExit::Add(dbname + ".pog");
3429  CFileDeleteAtExit::Add(dbname + ".psd");
3430  CFileDeleteAtExit::Add(dbname + ".psi");
3431  CFileDeleteAtExit::Add(dbname + ".pos");
3432  CFileDeleteAtExit::Add(dbname + ".pot");
3433  CFileDeleteAtExit::Add(dbname + ".ptf");
3434  CFileDeleteAtExit::Add(dbname + ".pto");
3435  CFileDeleteAtExit::Add(dbname + ".pdb");
3436 
3437 }
3438 
// Helper: builds the eBDB_Version5 protein database data/asn1_v5 from one
// ASN.1-text CSeq_entry and verifies it.
//   istr          stream holding the CSeq_entry in ASN.1 text form
//   ref_ids_file  stream with one reference accession per line
//   num_oids      number of OIDs the resulting database must contain
// Checks: every Seq-id of every OID is of type e_Pdb; the OID count equals
// num_oids; the i-th reference accession resolves to OID i.
// Changes from the previous revision: the unused locals db_ids / db_oids are
// removed (original source lines 3456-3457), and the final loop uses a
// signed index so the OID comparison no longer mixes signedness.
// NOTE(review): the index change assumes blastdb::TOid is a signed integer
// type -- confirm. Cleanup is registered only at the end, so a failing
// BOOST_REQUIRE exits before the files are registered for deletion.
3439 void s_TestReadPDBAsn1(CNcbiIfstream & istr, CNcbiIfstream & ref_ids_file, int num_oids)
3440 {
3441  string dbname = "data/asn1_v5";
3442  string title = "Temporary unit test db";
3443  ostringstream log;
  // Inner scope: the builder is destroyed before the database is reopened.
3444  {
3445  CNcbiApplication::Instance()->SetEnvironment("BLASTDB_LMDB_MAP_SIZE", "100000");
3446  CRef<CSeq_entry> seq_entry(new CSeq_entry);
3447  istr >> MSerial_AsnText >> *seq_entry;
3448  CSeqEntryGetSource seq_src(seq_entry);
3449  CBuildDatabase db(dbname, title, true, false, true, false, &log, true, eBDB_Version5);
3450  db.StartBuild();
3451  db.AddSequences(seq_src);
3452  db.EndBuild();
3453  }
3454 
3455 
3458  CSeqDB seqdb(dbname, CSeqDB::eProtein);
3459  int oid= 0;
  // Visit every OID the database reports; all of its IDs must be PDB IDs.
3460  while (seqdb.CheckOrFindOID(oid)) {
3461  list<CRef<CSeq_id> > seq_ids = seqdb.GetSeqIDs(oid);
3462  ITERATE(list<CRef<CSeq_id> >, itr, seq_ids) {
3463  BOOST_REQUIRE_EQUAL((*itr)->Which(), CSeq_id::e_Pdb);
3464  }
3465  oid++;
3466  }
3467 
  // After the walk `oid` equals the number of OIDs visited.
3468  BOOST_REQUIRE_EQUAL(oid, num_oids);
3469 
3470  vector<string> ref_ids;
3471  string line;
3472  while (getline(ref_ids_file, line)) {
3473  ref_ids.push_back(line);
3474  }
3475  vector<blastdb::TOid> oids;
3476  seqdb.AccessionsToOids(ref_ids, oids);
3477 
  // The i-th reference accession must map to OID i.
3478  for(int i=0; i < static_cast<int>(oids.size()); i++){
3479  BOOST_REQUIRE_EQUAL(oids[i], i);
3480  }
3481 
3482  CFileDeleteAtExit::Add(dbname + ".phr");
3483  CFileDeleteAtExit::Add(dbname + ".pin");
3484  CFileDeleteAtExit::Add(dbname + ".psq");
3485  CFileDeleteAtExit::Add(dbname + ".pog");
3486  CFileDeleteAtExit::Add(dbname + ".psd");
3487  CFileDeleteAtExit::Add(dbname + ".psi");
3488  CFileDeleteAtExit::Add(dbname + ".pos");
3489  CFileDeleteAtExit::Add(dbname + ".pot");
3490  CFileDeleteAtExit::Add(dbname + ".ptf");
3491  CFileDeleteAtExit::Add(dbname + ".pto");
3492  CFileDeleteAtExit::Add(dbname + ".pdb");
3493 }
3494 
// Test case body: runs s_TestReadPDBAsn1 on two ASN.1 inputs, each paired
// with its reference ID file and the number of OIDs the resulting database
// must contain (84 and 83).
// NOTE(review): the BOOST_AUTO_TEST_CASE header (original source line 3495)
// is absent from this listing -- confirm the test name against the
// repository copy.
3496 {
  // Each input gets its own scope so its streams are closed before the next.
3497  {
3498  CNcbiIfstream istr("data/a4WZJ.ASN1");
3499  CNcbiIfstream ref_ids_file("data/a4WZJ.ids");
3500  s_TestReadPDBAsn1(istr, ref_ids_file, 84);
3501  }
3502  {
3503  CNcbiIfstream istr("data/a5AJ4.ASN1");
3504  CNcbiIfstream ref_ids_file("data/a5AJ4.ids");
3505  s_TestReadPDBAsn1(istr, ref_ids_file, 83);
3506  }
3507 
3508 
3509 }
3510 
// Test case: writes four copies of the same 8-residue sequence to protein
// database "limit_df", each with the next defline set read from
// data/redundant_deflines.asn, then reopens the database and requires the
// taxid count and defline count of each OID to equal the expected tables.
// NOTE(review): the trailing `true` passed to the CWriteDB constructor
// presumably enables defline limiting -- confirm against the CWriteDB API.
// NOTE(review): original source line 3522 is absent from this listing;
// `df_line_set` is presumably declared there.
3511 BOOST_AUTO_TEST_CASE(LimitProteinDeflines)
3512 {
3513  const int kNumOfDeflines=4;
3514  string dbname="limit_df";
  // Inner scope: the writer is closed and destroyed before the read phase.
3515  {
3516  CNcbiIfstream istr("data/redundant_deflines.asn");
3517  CWriteDB writedb(dbname, CWriteDB::eProtein, "Redundant Deflines", eDefault, true, false,
3518  false, eBDB_Version4,true);
3519  char seq[9]={1,2,3,4,1,2,3,4,'\0'};
3520  for(unsigned int i=0; i < kNumOfDeflines; i++){
  // NOTE(review): `taxids` is not used in the visible lines.
3521  set<int> taxids;
3523  istr >> MSerial_AsnText >> *df_line_set;
3524  writedb.AddSequence(seq);
3525  writedb.SetDeflines(*df_line_set);
3526  }
3527  writedb.Close();
3528  }
3529 
  // Expected number of taxids and deflines stored for OIDs 0..3.
3530  static const int num_taxids[kNumOfDeflines] = {14, 107, 1, 45};
3531  static const int num_deflines[kNumOfDeflines] = {11, 107, 6, 43};
3532  CSeqDB readdb(dbname, CSeqDB::eProtein);
3533  for(unsigned int i=0; i < kNumOfDeflines; i++){
3534  CRef<CBlast_def_line_set> new_set = readdb.GetHdr(i);
3535  set<TTaxId> t;
3536  readdb.GetAllTaxIDs(i, t);
3537  BOOST_REQUIRE_EQUAL(num_taxids[i], t.size());
3538  BOOST_REQUIRE_EQUAL(num_deflines[i], new_set->Set().size());
3539  }
3540 
  // NOTE(review): cleanup is registered only here, after the assertions, so
  // a failing BOOST_REQUIRE above exits before the files are registered.
3541  CFileDeleteAtExit::Add(dbname + ".phr");
3542  CFileDeleteAtExit::Add(dbname + ".pin");
3543  CFileDeleteAtExit::Add(dbname + ".psq");
3544  CFileDeleteAtExit::Add(dbname + ".pog");
3545  CFileDeleteAtExit::Add(dbname + ".psd");
3546  CFileDeleteAtExit::Add(dbname + ".psi");
3547 }
3548 
3549 
3550 
3551 
3553 
3554 #endif /* SKIP_DOXYGEN_PROCESSING */
BOOST_AUTO_TEST_SUITE_END() static int s_GetSegmentFlags(const CBioseq &bioseq)
ncbi::TMaskedQueryRegions mask
Code to build a database given various sources of sequence data.
CB –.
Definition: B.hpp:64
Binary GI or TI List Builder.
Definition: writedb.hpp:456
void Write(const string &fname)
Write the list to a file.
Definition: writedb.cpp:201
@ eTi
Trace id.
Definition: writedb.hpp:470
void AppendId(const Int8 &id)
Add an identifier to the list.
Definition: writedb.hpp:485
CBioseq_Handle –.
`Blob' Class for SeqDB (and WriteDB).
Definition: seqdbblob.hpp:56
@ eNone
Write the string as-is.
Definition: seqdbblob.hpp:234
int WriteString(CTempString str, EStringFormat fmt)
Write string data to the blob.
Definition: seqdbblob.cpp:383
static int GetSeqidlist(CMemoryFile &file, vector< CSeqDBGiList::SSiOid > &idlist, SBlastSeqIdListInfo &list_info)
Get seqidlist from dbv5 seqidlist file.
Build BlastDB format databases from various data sources.
Definition: build_db.hpp:136
bool AddSequences(IBioseqSource &src, bool add_pig=false)
Add sequences from an IBioseqSource object.
Definition: build_db.cpp:794
bool AddFasta(CNcbiIstream &fasta_file)
Add sequences from a file containing FASTA data.
Definition: build_db.cpp:1398
void SetUseRemote(bool use_remote)
Specify whether to use remote fetching for locally absent IDs.
Definition: build_db.hpp:385
void SetSourceDb(const string &src_db_name)
Specify source database(s) via the database name(s).
Definition: build_db.cpp:1250
void SetTaxids(CTaxIdSet &taxids)
Specify a mapping of sequence ids to taxonomic ids.
Definition: build_db.cpp:1216
bool Build(const vector< string > &ids, CNcbiIstream *fasta_file)
Build the database.
Definition: build_db.cpp:1289
void StartBuild()
Start building a new database.
Definition: build_db.cpp:1317
bool EndBuild(bool erase=false)
Finish building a new database.
Definition: build_db.cpp:1423
bool AddIds(const vector< string > &ids)
Add the specified sequences from the source database.
Definition: build_db.cpp:1321
CDiagRestorer –.
Definition: ncbidiag.hpp:2941
CDirEntry –.
Definition: ncbifile.hpp:262
Base class for reading FASTA sequences.
Definition: fasta.hpp:80
CFileException –.
Definition: ncbifile.hpp:136
CFile –.
Definition: ncbifile.hpp:1604
Registry class for the sequence masking/filtering algorithms used to create masks to be added to a CW...
This represents a set of masks for a given sequence.
Definition: writedb.hpp:65
CMemoryFile –.
Definition: ncbifile.hpp:2860
CMultisourceException.
static CNcbiApplication * Instance(void)
Singleton method.
Definition: ncbiapp.cpp:264
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string. Sample usage:
Definition: ncbistre.hpp:802
CObjectManager –.
CScope –.
Definition: scope.hpp:92
CSeqDBAtlas class.
Definition: seqdbatlas.hpp:298
CSeqDBException.
Definition: seqdbcommon.hpp:73
CSeqDBExpert.
Definition: seqdbexpert.hpp:55
void GetRawSeqAndAmbig(int oid, const char **buffer, int *seq_length, int *ambig_length) const
Raw Sequence and Ambiguity Data.
Definition: seqdbexpert.cpp:64
CSeqDBIsam.
Definition: seqdbisam.hpp:127
CSeqDBLockHold.
Definition: seqdbatlas.hpp:167
CSeqDB.
Definition: seqdb.hpp:161
bool OidToPig(int oid, int &pig) const
Translate an OID to a PIG.
Definition: seqdb.cpp:790
void GetGis(int oid, vector< TGi > &gis, bool append=false) const
Gets a list of GIs for an OID.
Definition: seqdb.cpp:1070
int GetNumOIDs() const
Returns the size of the (possibly sparse) OID range.
Definition: seqdb.cpp:680
Uint8 GetVolumeLength() const
Returns the sum of the lengths of all volumes.
Definition: seqdb.cpp:700
bool OidToGi(int oid, TGi &gi) const
Translate an OID to a GI.
Definition: seqdb.cpp:826
list< CRef< CSeq_id > > GetSeqIDs(int oid) const
Gets a list of sequence identifiers.
Definition: seqdb.cpp:765
int GetSeqLength(int oid) const
Returns the sequence length in base pairs or residues.
Definition: seqdb.cpp:400
ESeqType GetSequenceType() const
Returns the type of database opened - protein or nucleotide.
Definition: seqdb.cpp:427
@ eNucleotide
Definition: seqdb.hpp:175
@ eProtein
Definition: seqdb.hpp:174
bool SeqidToOid(const CSeq_id &seqid, int &oid) const
Translate a Seq-id to any matching OID.
Definition: seqdb.cpp:903
CRef< CBioseq > GetBioseq(int oid, TGi target_gi=ZERO_GI, const CSeq_id *target_seq_id=NULL) const
Get a CBioseq for a sequence.
Definition: seqdb.cpp:504
void GetTaxIDs(int oid, map< TGi, TTaxId > &gi_to_taxid, bool persist=false) const
Get taxid for an OID.
Definition: seqdb.cpp:441
void GetTotals(ESummaryType sumtype, int *oid_count, Uint8 *total_length, bool use_approx=true) const
Returns the sum of the sequence lengths.
Definition: seqdb.cpp:1110
string GetTitle() const
Returns the database title.
Definition: seqdb.cpp:630
int GetNumSeqs() const
Returns the number of sequences available.
Definition: seqdb.cpp:670
void GetAllTaxIDs(int oid, set< TTaxId > &taxids) const
Get all tax ids for an oid.
Definition: seqdb.cpp:467
void AccessionToOids(const string &acc, vector< int > &oids) const
Translate an Accession to a list of OIDs.
Definition: seqdb.cpp:870
bool CheckOrFindOID(int &next_oid) const
Find an included OID, incrementing next_oid if necessary.
Definition: seqdb.cpp:728
@ eUnfilteredAll
Sum of all sequences, ignoring GI and OID lists and alias files.
Definition: seqdb.hpp:185
CRef< CBioseq > GiToBioseq(TGi gi) const
Get a CBioseq for a given GI.
Definition: seqdb.cpp:987
CRef< CBlast_def_line_set > GetHdr(int oid) const
Get the ASN.1 header for the sequence.
Definition: seqdb.cpp:418
void AccessionsToOids(const vector< string > &accs, vector< blastdb::TOid > &oids) const
Definition: seqdb.cpp:252
bool GiToOid(TGi gi, int &oid) const
Translate a GI to an OID.
Definition: seqdb.cpp:808
CSeqEntryGetSource(CRef< CSeq_entry > seq_entry)
CTypeIterator< CBioseq > m_Bioseq
CRef< CSeq_entry > m_entry
CRef< CObjectManager > m_objmgr
virtual CConstRef< CBioseq > GetNext()
Get a Bioseq object if there are any more to get.
CSeqVector –.
Definition: seq_vector.hpp:65
Definition: Seq_entry.hpp:56
void Parentize(void)
Definition: Seq_entry.cpp:71
Simple implementation of ILineReader for i(o)streams.
void FixTaxId(CRef< objects::CBlast_def_line_set > deflines)
Check that each defline has the specified taxid; if not, replace the defline and set the taxid.
Definition: taxid_set.cpp:131
void SetMappingFromFile(CNcbiIstream &f)
Definition: taxid_set.cpp:45
CTime –.
Definition: ncbitime.hpp:296
CTmpFile –.
Definition: ncbifile.hpp:2352
CRef< CWriteDB > m_Db
void SetDb(CWriteDB &db)
CWriteDBException.
Builder for BlastDb format column files.
Definition: writedb.hpp:540
void ListFiles(vector< string > &files) const
List Filenames.
CWriteDB_Isam class.
vector< CRef< CSeq_id > > TIdList
Type used for lists of sequence identifiers.
CWriteDB.
Definition: writedb.hpp:92
@ eProtein
Protein database.
Definition: writedb.hpp:97
@ eNucleotide
Nucleotide database.
Definition: writedb.hpp:100
void ListFiles(vector< string > &files)
List Filenames.
Definition: writedb.cpp:146
void SetPig(int pig)
Set the PIG to be used for the sequence.
Definition: writedb.cpp:99
void AddSequence(const CBioseq &bs)
Add a sequence as a CBioseq.
Definition: writedb.cpp:79
void SetMaxVolumeLetters(Uint8 letters)
Set maximum letters for output volumes.
Definition: writedb.cpp:123
EIndexType
Whether and what kind of indices to build.
Definition: writedb.hpp:104
@ eFullIndex
Use several forms of each Seq-id in the string index.
Definition: writedb.hpp:112
@ eAddHash
Add an index from sequence hash to OID.
Definition: writedb.hpp:126
@ eDefault
Like eFullIndex but also build a numeric Trace ID index.
Definition: writedb.hpp:121
@ eNoIndex
Build a database without any indices.
Definition: writedb.hpp:106
@ eFullWithTrace
Like eFullIndex but also build a numeric Trace ID index.
Definition: writedb.hpp:118
void ListVolumes(vector< string > &vols)
List Volumes.
Definition: writedb.cpp:141
static CRef< CBlast_def_line_set > ExtractBioseqDeflines(const CBioseq &bs, bool parse_ids=true, bool long_ids=false, bool scan_bioseq_4_cfastareader_usrobj=false)
Extract Deflines From Bioseq.
Definition: writedb.cpp:129
void SetDeflines(const CBlast_def_line_set &deflines)
Set the deflines to be used for the sequence.
Definition: writedb.cpp:94
void Close()
Close the Database.
Definition: writedb.cpp:104
Interface to a source of Bioseq objects.
Definition: build_db.hpp:54
Definition: set.hpp:45
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
static CMemoryRegistry registry
Definition: cn3d_tools.cpp:81
static uch flags
static const char si[8][64]
Definition: des.c:146
SStaticPair< const char *, const char * > TPair
std::ofstream out("events_result.xml")
main entry point for tests
Operators to edit gaps in sequences.
#define false
Definition: bool.h:36
static const char * str(char *buf, int n)
Definition: stats.c:84
static char tmp[3200]
Definition: utf8.c:42
#define basename(path)
Definition: replacements.h:116
char data[12]
Definition: iconv.c:80
#define TAX_ID_CONST(id)
Definition: ncbimisc.hpp:1112
#define GI_FROM(T, value)
Definition: ncbimisc.hpp:1086
CNcbiEnvironment & SetEnvironment(void)
Get a non-const copy of the application's cached environment.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
#define ZERO_GI
Definition: ncbimisc.hpp:1088
#define GI_TO(T, gi)
Definition: ncbimisc.hpp:1085
@ eDefault
Definition: ncbi_types.h:112
string
Definition: cgiapp.hpp:687
EDiagSev SetDiagPostLevel(EDiagSev post_sev=eDiag_Error)
Set the threshold severity for posting the messages.
Definition: ncbidiag.cpp:6129
@ eDiag_Fatal
Fatal error – guarantees exit(or abort)
Definition: ncbidiag.hpp:655
CNcbiIstream & AsInputFile(EIfExists if_exists, IOS_BASE::openmode mode=IOS_BASE::in)
Create I/O stream on the base of our file.
Definition: ncbifile.cpp:5435
virtual bool Remove(TRemoveFlags flags=eRecursive) const
Remove a directory entry.
Definition: ncbifile.cpp:2595
CNcbiOstream & AsOutputFile(EIfExists if_exists, IOS_BASE::openmode mode=IOS_BASE::out)
Definition: ncbifile.cpp:5455
static void Add(const string &path)
Add the name of a dir entry; it will be deleted on (normal) exit.
Definition: ncbifile.cpp:5367
Int8 GetFileSize(void) const
Get length of the mapped file.
Definition: ncbifile.cpp:5874
const string & GetFileName(void) const
Return used file name (generated or given in the constructor).
Definition: ncbifile.cpp:5429
virtual bool Exists(void) const
Check existence of file.
Definition: ncbifile.hpp:4038
@ eIfExists_Throw
You can make call of AsInputFile/AsOutputFile only once, on each following call throws CFileException...
Definition: ncbifile.hpp:2377
@ eIfExists_Reset
Delete previous stream and return reference to new object.
Definition: ncbifile.hpp:2380
@ eOnlyEmpty
Directory entry only, no other files or subdirectories.
Definition: ncbifile.hpp:727
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
@ eSerial_AsnText
ASN.1 text.
Definition: serialdef.hpp:73
virtual CRef< CSeq_entry > ReadOneSeq(ILineErrorListener *pMessageListener=nullptr)
Read a single effective sequence, which may turn out to be a segmented set.
Definition: fasta.cpp:312
long TFlags
binary OR of EFlags
Definition: fasta.hpp:117
EFlags
Note on fAllSeqIds: some databases (notably nr) have merged identical sequences, joining their deflin...
Definition: fasta.hpp:86
bool AtEOF(void) const
Indicates (negatively) whether there is any more input.
Definition: fasta.hpp:141
virtual bool AtEOF(void) const =0
Indicates (negatively) whether there is any more input.
@ fAssumeNuc
Assume nucs unless accns indicate otherwise.
Definition: fasta.hpp:87
@ fAssumeProt
Assume prots unless accns indicate otherwise.
Definition: fasta.hpp:88
TGi FindGi(const container &ids)
Return gi from id list if exists, return 0 otherwise.
Definition: Seq_id.hpp:1041
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
Definition: Seq_id.cpp:2145
CBeginInfo Begin(C &obj)
Get starting point of object hierarchy.
Definition: iterator.hpp:1004
static CObjectIStream * Open(ESerialDataFormat format, CNcbiIstream &inStream, bool deleteInStream)
Create serial object reader and attach it to an input stream.
Definition: objistr.cpp:195
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
Definition: scope.cpp:530
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry, default priority is higher than for defaults or loaders. Add object to the scope with p...
Definition: scope.cpp:522
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
CConstRef< CBioseq > GetCompleteBioseq(void) const
Get the complete bioseq.
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
TSeqPos size(void) const
Definition: seq_vector.hpp:291
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1439
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
position_type GetToOpen(void) const
Definition: range.hpp:138
CNcbiIstream & NcbiGetlineEOL(CNcbiIstream &is, string &str, string::size_type *count=NULL)
Read from "is" to "str" the next line (taking into account platform specifics of End-of-Line)
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
Definition: ncbistre.hpp:500
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
static string SizetToString(size_t value, TNumToStringFlags flags=0, int base=10)
Convert size_t to string.
Definition: ncbistr.cpp:2751
static string Int8ToString(Int8 value, TNumToStringFlags flags=0, int base=10)
Convert Int8 to string.
Definition: ncbistr.hpp:5159
#define kEmptyStr
Definition: ncbistr.hpp:123
#define NPOS
Definition: ncbistr.hpp:133
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2891
static string UInt8ToString(Uint8 value, TNumToStringFlags flags=0, int base=10)
Convert UInt8 to string.
Definition: ncbistr.hpp:5168
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
EBlast_filter_program
This defines the possible sequence filtering algorithms to be used in a BLAST database.
Tdata & Set(void)
Assign a value to data member.
const Tdata & Get(void) const
Get the member data.
@ eBlast_filter_program_dust
@ eBlast_filter_program_max
@ eBlast_filter_program_repeat
@ eBlast_filter_program_seg
@ eBlast_filter_program_windowmasker
@ eBlast_filter_program_other
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
TGi GetGi(void) const
Get the variant data.
Definition: Seq_id_.hpp:889
bool IsGi(void) const
Check if variant Gi is selected.
Definition: Seq_id_.hpp:883
@ e_Other
for historical reasons, 'other' = 'refseq'
Definition: Seq_id_.hpp:104
@ e_General
for other databases
Definition: Seq_id_.hpp:105
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
@ e_Gi
GenInfo Integrated Database.
Definition: Seq_id_.hpp:106
@ e_Prf
PRF SEQDB.
Definition: Seq_id_.hpp:108
@ e_Local
local use
Definition: Seq_id_.hpp:95
@ e_Pdb
PDB sequence.
Definition: Seq_id_.hpp:109
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:6929
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
FILE * file
char * buf
int i
const std::string kOutput
Command line flag to specify the output.
unsigned int a
Definition: ncbi_localip.c:102
EIPRangeType t
Definition: ncbi_localip.c:101
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
The Object manager core.
static const char * prefix[]
Definition: pcregrep.c:405
static pcre_uint8 * buffer
Definition: pcretest.c:1051
BOOST_AUTO_TEST_SUITE(psiblast_iteration)
bool DeleteBlastDb(const string &dbpath, CSeqDB::ESeqType seq_type)
Deletes all files associated with a BLAST database.
Definition: seqdb.cpp:1542
const blastdb::TOid kSeqDBEntryNotFound
@ eBDB_Version4
Definition: seqdbcommon.hpp:52
@ eBDB_Version5
Definition: seqdbcommon.hpp:53
@ eGiId
Defines `expert' version of CSeqDB interfaces.
ISAM index database access object.
int WriteBlastSeqidlistFile(const vector< string > &idlist, CNcbiOstream &os, const string &title, const CSeqDB *seqdb=NULL)
const string kTaxId
Encapsulates the alias' file key-value pair.
Value(const string &name)
void Set(const string &v)
Auxiliary class to parse the contents of an alias file.
void x_Parse(const string &fname)
Parse the alias file's contents.
bool x_HasKeyword(string line, Value &data)
SAliasFileData(const string &fname)
Structure describing filtered regions created using a particular sequence filtering algorithm.
Blast DB v5 seqid list info.
Template structure SStaticPair is a simplified replacement of STL pair<>. Main reason of introducing this...
Definition: static_set.hpp:60
first_type first
Definition: static_set.hpp:64
second_type second
Definition: static_set.hpp:65
Utility stuff for more convenient using of Boost.Test library.
static const string kTitle
CTraceGlyph inline method implementation.
static bool ambig(char c)
Defines BLAST database construction classes.
void CWriteDB_CreateAliasFile(const string &file_name, const string &db_name, CWriteDB::ESeqType seq_type, const string &gi_file_name, const string &title=string(), EAliasFileFilterType alias_type=eGiList)
Writes an alias file that restricts a database with a gi list.
@ eSeqIdList
Filter a BLAST database via a Seq-id list.
Definition: writedb.hpp:613
Code for database isam construction.
@ eGi
GI Index.
USING_SCOPE(objects)
string s_HexDumpFile(const string &fname, const vector< int > &layout, int base)
void RegisterTooManyVariantsOfSameMaskingAlgorithm(EBlast_filter_program masking_algo, size_t kMaxNumSupportedAlgorithmVariants)
CRef< CSeq_id > s_GiToSeqId(TGi gi)
static void s_DupIdsBioseq(CWriteDB &w, CSeqDB &s, const TIdList &ids, int cutpoint)
vector< CRef< CSeq_id > > TIdList
void s_Unstringify(const string &s, ASNOBJ &a)
CRef< CBioseq > s_FastaStringToBioseq(const string &str, bool protein)
void s_CheckFiles(const vector< string > &files, bool need_hash=false)
void s_Stringify(const ASNOBJ &a, string &s)
static void s_DupIdsRaw(CWriteDB &w, CSeqDBExpert &seqdb, const TIdList &ids)
int g_NuclJ_OidCount
void s_WrapUpColumn(CWriteDB_ColumnBuilder &cb)
static CRef< CScope > s_GetScope()
void s_WrapUpDb(CWriteDB &db)
BOOST_AUTO_TEST_CASE(NuclBioseqDupI)
string s_ExtractLast(const string &data, const string &delim)
#define BOOST_REQUIRE_CUTPOINT(X)
void s_RemoveFile(const string &f)
void s_TestDatabase(CSeqDBExpert &src, const string &name, const string &title)
static void s_DupSequencesTest(const TIdList &ids, bool is_protein, bool raw_data, const string &src_name, const string &dst_name, const string &title, int cutpoint=99)
string s_HexDumpText(const string &raw, const vector< int > &layout, int base)
CRef< ASNOBJ > s_Duplicate(const ASNOBJ &a)
void s_RemoveFiles(const vector< string > &files)
void s_CheckSorted(const string &fname)
CRef< CSeq_id > s_AccToSeqId(const char *acc)
void s_TestReadPDBAsn1(CNcbiIfstream &istr, CNcbiIfstream &ref_ids_file, int num_oids)
void s_WrapUpFiles(const vector< string > &files)
static void s_BuildIds(TIdList &ids, TGi *gis)
void s_FetchRawData(CSeqDBExpert &seqdb, int oid, string &sequence, string &ambig)
USING_NCBI_SCOPE
static void s_NuclBioseqDupSwitch(int cutpoint)
void s_CompareBioseqs(CBioseq &src, CBioseq &dst)
#define W
Definition: crc32.c:85
Modified on Tue May 21 10:57:33 2024 by modify_doxy.py rev. 669887