NCBI C++ ToolKit
seqdb.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seqdb.cpp 101660 2024-01-22 12:28:49Z camacho $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Kevin Bealer
27  *
28  */
29 
30 /// @file seqdb.cpp
31 /// Implementation for the CSeqDB class, the top level class for SeqDB.
32 #include <ncbi_pch.hpp>
35 #include "seqdbimpl.hpp"
37 #include <map>
38 #include <string>
39 
40 #include <serial/objistr.hpp>
41 #include <serial/objostr.hpp>
42 #include <serial/serial.hpp>
43 #include <serial/objostrasnb.hpp>
44 #include <serial/objistrasnb.hpp>
45 
50 
52 
53 const string CSeqDB::kOidNotFound("OID not found");
54 
55 /// Helper function to translate enumerated type to character.
56 ///
57 /// @param seqtype
58 /// The sequence type (eProtein, eNucleotide, or eUnknown).
59 /// @return
60 /// The sequence type as a char ('p', 'n', or '-').
61 
62 static char s_GetSeqTypeChar(CSeqDB::ESeqType seqtype)
63 {
64  switch(seqtype) {
65  case CSeqDB::eProtein:
66  return 'p';
68  return 'n';
69  case CSeqDB::eUnknown:
70  return '-';
71  }
72 
74  eArgErr,
75  "Invalid sequence type specified.");
76 }
77 
78 /// Helper function to build private implementation object.
79 ///
80 /// This method builds and returns the object which implements the
81 /// functionality for the CSeqDB API. If this method is called with
82 /// '-' for the sequence data type, protein will be tried first, then
83 /// nucleotide. The created object will be returned. Either
84 /// kSeqTypeProt for a protein database, kSeqTypeNucl for nucleotide,
85 /// or kSeqTypeUnkn to less this function try one then the other.
86 ///
87 /// @param dbname
88 /// A list of database or alias names, seperated by spaces.
89 /// @param prot_nucl
90 /// Specify whether to use protein, nucleotide, or either.
91 /// @param oid_begin
92 /// Iterator will skip OIDs less than this value. Only OIDs
93 /// found in the OID lists (if any) will be returned.
94 /// @param oid_end
95 /// Iterator will return up to (but not including) this OID.
96 /// @param use_mmap
97 /// If kSeqDBMMap is specified (the default), memory mapping is
98 /// attempted. If kSeqDBNoMMap is specified, or memory mapping
99 /// fails, this platform does not support it, the less efficient
100 /// read and write calls are used instead.
101 /// @param gi_list
102 /// This ID list specifies OIDs and deflines to include.
103 /// @param neg_list
104 /// This negative ID list specifies deflines and OIDs to exclude.
105 /// @param idset
106 /// If set, this specifies IDs to either include or exclude.
107 /// @return
108 /// The CSeqDBImpl object that was created.
109 
110 static CSeqDBImpl *
111 s_SeqDBInit(const string & dbname,
112  char prot_nucl,
113  int oid_begin,
114  int oid_end,
115  bool use_atlas_lock,
116  CSeqDBGiList * gi_list = NULL,
117  CSeqDBNegativeList * neg_list = NULL,
118  CSeqDBIdSet idset = CSeqDBIdSet())
119 {
120  CSeqDBImpl * impl = 0;
121 
122  if (prot_nucl == '-') {
123  try {
124  prot_nucl = 'p';
125  impl = new CSeqDBImpl(dbname,
126  prot_nucl,
127  oid_begin,
128  oid_end,
129  gi_list,
130  neg_list,
131  idset,
132  use_atlas_lock);
133  }
134  catch(CSeqDBException &) {
135  prot_nucl = 'n';
136  }
137  }
138 
139  if (! impl) {
140  impl = new CSeqDBImpl(dbname,
141  prot_nucl,
142  oid_begin,
143  oid_end,
144  gi_list,
145  neg_list,
146  idset,
147  use_atlas_lock);
148  }
149 
150  _ASSERT(impl);
151 
152  return impl;
153 }
154 
155 CSeqDB::CSeqDB(const string & dbname,
156  ESeqType seqtype,
157  CSeqDBGiList * gi_list,
158  bool use_atlas_lock)
159 
160 {
161  if (dbname.size() == 0) {
163  eArgErr,
164  "Database name is required.");
165  }
166 
167  char seq_type = s_GetSeqTypeChar(seqtype);
168 
170  seq_type,
171  0,
172  0,
173  use_atlas_lock,
174  gi_list);
175 
176  ////m_Impl->Verify();
177 }
178 
179 CSeqDB::CSeqDB(const string & dbname,
180  ESeqType seqtype,
181  CSeqDBNegativeList * nlist)
182 {
183  if (dbname.size() == 0) {
185  eArgErr,
186  "Database name is required.");
187  }
188 
189  const bool kUseAtlasLock = true;
191  s_GetSeqTypeChar(seqtype),
192  0,
193  0,
194  kUseAtlasLock,
195  NULL,
196  nlist);
197 
198  ////m_Impl->Verify();
199 }
200 
201 CSeqDB::CSeqDB(const string & dbname,
202  ESeqType seqtype,
203  CSeqDBGiList * gi_list,
204  CSeqDBNegativeList * nlist)
205 {
206  if (dbname.size() == 0) {
208  eArgErr,
209  "Database name is required.");
210  }
211 
212  char seq_type = s_GetSeqTypeChar(seqtype);
213 
215  seq_type,
216  0,
217  0,
218  true,
219  gi_list,
220  nlist);
221 
222  ////m_Impl->Verify();
223 }
224 
225 CSeqDB::CSeqDB(const string & dbname,
226  ESeqType seqtype,
227  int oid_begin,
228  int oid_end,
229  CSeqDBGiList * gi_list,
230  CSeqDBNegativeList * nlist)
231 {
232  if (dbname.size() == 0) {
234  eArgErr,
235  "Database name is required.");
236  }
237 
238  char seq_type = s_GetSeqTypeChar(seqtype);
239 
241  seq_type,
242  oid_begin,
243  oid_end,
244  true,
245  gi_list,
246  nlist);
247 
248  ////m_Impl->Verify();
249 }
250 
251 
252 void CSeqDB::AccessionsToOids(const vector<string>& accs, vector<blastdb::TOid>& oids) const
253 {
254  m_Impl->AccessionsToOids(accs, oids);
255 }
256 
257 void CSeqDB::TaxIdsToOids(set<TTaxId>& tax_ids, vector<blastdb::TOid>& rv) const
258 {
259  m_Impl->TaxIdsToOids(tax_ids, rv);
260 }
261 
262 void CSeqDB::GetDBTaxIds(set<TTaxId> & tax_ids) const
263 {
264  m_Impl->GetDBTaxIds(tax_ids);
265 }
266 
267 void CSeqDB::GetTaxIdsForOids(const vector<blastdb::TOid> & oids, set<TTaxId> & tax_ids) const
268 {
269  m_Impl->GetTaxIdsForOids(oids, tax_ids);
270 }
271 
272 // This could become the primary constructor for SeqDB, and those
273 // taking positive and negative lists could be deprecated. This
274 // implies refactoring of code using SeqDB, addition of the third
275 // (string/Seq-id) type IDs to the IdSet, and changes to client code.
276 // Some non-SeqDB code uses FindOID and other methods of the GI list,
277 // comparable functionality would need to be added to IdSet().
278 //
279 // Before any of that is done, all the SeqDB classes should be made to
280 // use CSeqDBIdSet instead of using positive and negative lists. This
281 // implies widespread changes to CSeqDBIdSet and SeqDB internal code.
282 //
283 // I'll leave those changes for another time -- for now I'll just add
284 // the pieces of framework that seem useful and are implied by the
285 // current design.
286 
287 CSeqDB::CSeqDB(const string & dbname, ESeqType seqtype, CSeqDBIdSet ids)
288 {
289  if (dbname.size() == 0) {
291  eArgErr,
292  "Database name is required.");
293  }
294 
296  CRef<CSeqDBGiList> pos;
297 
298  if (! ids.Blank()) {
299  if (ids.IsPositive()) {
300  pos = ids.GetPositiveList();
301  } else {
302  neg = ids.GetNegativeList();
303  }
304  }
305 
306  const bool kUseAtlasLock = true;
308  s_GetSeqTypeChar(seqtype),
309  0,
310  0,
311  kUseAtlasLock,
312  pos.GetPointerOrNull(),
313  neg.GetPointerOrNull(),
314  ids);
315 
316  ////m_Impl->Verify();
317 }
318 
319 CSeqDB::CSeqDB(const vector<string> & dbs,
320  ESeqType seqtype,
321  CSeqDBGiList * gi_list)
322 {
323  string dbname;
325 
326  if (dbname.size() == 0) {
328  eArgErr,
329  "Database name is required.");
330  }
331 
332  const bool kUseAtlasLock = true;
334  s_GetSeqTypeChar(seqtype),
335  0,
336  0,
337  kUseAtlasLock,
338  gi_list);
339 
340  ////m_Impl->Verify();
341 }
342 
343 CSeqDB::CSeqDB(const string & dbname,
344  ESeqType seqtype,
345  int oid_begin,
346  int oid_end,
347  bool use_mmap,
348  CSeqDBGiList * gi_list)
349 {
350  if (dbname.size() == 0) {
352  eArgErr,
353  "Database name is required.");
354  }
355 
356  const bool kUseAtlasLock = true;
358  s_GetSeqTypeChar(seqtype),
359  oid_begin,
360  oid_end,
361  kUseAtlasLock,
362  gi_list);
363 
364  ////m_Impl->Verify();
365 }
366 
367 CSeqDB::CSeqDB(const vector<string> & dbs,
368  ESeqType seqtype,
369  int oid_begin,
370  int oid_end,
371  bool use_mmap,
372  CSeqDBGiList * gi_list)
373 {
374  string dbname;
376 
377  if (dbname.size() == 0) {
379  eArgErr,
380  "Database name is required.");
381  }
382 
383  const bool kUseAtlasLock = true;
385  s_GetSeqTypeChar(seqtype),
386  oid_begin,
387  oid_end,
388  kUseAtlasLock,
389  gi_list);
390 
391  ////m_Impl->Verify();
392 }
393 
395 {
396  m_Impl = new CSeqDBImpl();
397  ////m_Impl->Verify();
398 }
399 
400 int CSeqDB::GetSeqLength(int oid) const
401 {
402  ////m_Impl->Verify();
403  int length = m_Impl->GetSeqLength(oid);
404  ////m_Impl->Verify();
405 
406  return length;
407 }
408 
409 int CSeqDB::GetSeqLengthApprox(int oid) const
410 {
411  ////m_Impl->Verify();
412  int length = m_Impl->GetSeqLengthApprox(oid);
413  ////m_Impl->Verify();
414 
415  return length;
416 }
417 
419 {
420  ////m_Impl->Verify();
422  ////m_Impl->Verify();
423 
424  return rv;
425 }
426 
428 {
429  switch(m_Impl->GetSeqType()) {
430  case 'p':
431  return eProtein;
432  case 'n':
433  return eNucleotide;
434  }
435 
437  eArgErr,
438  "Internal sequence type is not valid.");
439 }
440 
441 void CSeqDB::GetTaxIDs(int oid,
442  map<TGi, TTaxId> & gi_to_taxid,
443  bool persist) const
444 {
445  ////m_Impl->Verify();
446  typedef map<TGi, TTaxId> TmpMap;
447  TmpMap gi_to_taxid_tmp;
448  m_Impl->GetTaxIDs(oid, gi_to_taxid_tmp, persist);
449  if ( !persist ) {
450  gi_to_taxid.clear();
451  }
452  ITERATE ( TmpMap, it, gi_to_taxid_tmp ) {
453  gi_to_taxid[it->first] = it->second;
454  }
455  ////m_Impl->Verify();
456 }
457 
458 void CSeqDB::GetTaxIDs(int oid,
459  vector<TTaxId> & taxids,
460  bool persist) const
461 {
462  ////m_Impl->Verify();
463  m_Impl->GetTaxIDs(oid, taxids, persist);
464  ////m_Impl->Verify();
465 }
466 
467 void CSeqDB::GetAllTaxIDs(int oid,
468  set<TTaxId> & taxids) const
469 {
470  m_Impl->GetAllTaxIDs(oid, taxids);
471 }
472 
474  int oid,
475  map<TGi, set<TTaxId> >& gi_to_taxid_set,
476  bool persist
477 ) const
478 {
479  ////m_Impl->Verify();
480  typedef map<TGi, set<TTaxId> > TmpMap;
481  TmpMap gi_to_taxid_set_tmp;
482  m_Impl->GetLeafTaxIDs(oid, gi_to_taxid_set_tmp, persist);
483  if ( !persist ) {
484  gi_to_taxid_set.clear();
485  }
486  ITERATE ( TmpMap, it, gi_to_taxid_set_tmp ) {
487  gi_to_taxid_set[it->first] = it->second;
488  }
489  //m_Impl->Verify();
490 }
491 
493  int oid,
494  vector<TTaxId>& taxids,
495  bool persist
496 ) const
497 {
498  //m_Impl->Verify();
499  m_Impl->GetLeafTaxIDs(oid, taxids, persist);
500  //m_Impl->Verify();
501 }
502 
504 CSeqDB::GetBioseq(int oid, TGi target_gi, const CSeq_id * target_id) const
505 {
506  //m_Impl->Verify();
507  CRef<CBioseq> rv = m_Impl->GetBioseq(oid, target_gi, target_id, true);
508  //m_Impl->Verify();
509 
510  return rv;
511 }
512 
514 CSeqDB::GetBioseqNoData(int oid, TGi target_gi, const CSeq_id * target_id) const
515 {
516  //m_Impl->Verify();
517  CRef<CBioseq> rv = m_Impl->GetBioseq(oid, target_gi, target_id, false);
518  //m_Impl->Verify();
519 
520  return rv;
521 }
522 
523 void CSeqDB::RetSequence(const char ** buffer) const
524 {
525  //m_Impl->Verify();
527  //m_Impl->Verify();
528 }
529 
530 int CSeqDB::GetSequence(int oid, const char ** buffer) const
531 {
532  //m_Impl->Verify();
533  int rv = m_Impl->GetSequence(oid, buffer);
534  //m_Impl->Verify();
535 
536  return rv;
537 }
538 
540  TSeqPos begin,
541  TSeqPos end) const
542 {
543  //m_Impl->Verify();
544  CRef<CSeq_data> rv = m_Impl->GetSeqData(oid, begin, end);
545  //m_Impl->Verify();
546 
547  return rv;
548 }
549 
550 int CSeqDB::GetAmbigSeq(int oid, const char ** buffer, int nucl_code) const
551 {
552  //m_Impl->Verify();
553  int rv = m_Impl->GetAmbigSeq(oid,
554  (char **)buffer,
555  nucl_code,
556  0,
557  (ESeqDBAllocType) 0);
558  //m_Impl->Verify();
559 
560  return rv;
561 }
562 
563 void CSeqDB::RetAmbigSeq(const char ** buffer) const
564 {
565  //m_Impl->Verify();
567  //m_Impl->Verify();
568 }
569 
571  const char ** buffer,
572  int nucl_code,
573  int begin_offset,
574  int end_offset) const
575 {
576  //m_Impl->Verify();
577 
578  SSeqDBSlice region(begin_offset, end_offset);
579 
580  int rv = m_Impl->GetAmbigSeq(oid,
581  (char **)buffer,
582  nucl_code,
583  & region,
584  (ESeqDBAllocType) 0);
585 
586  //m_Impl->Verify();
587 
588  return rv;
589 }
590 
592  char ** buffer,
593  int nucl_code,
595  TSequenceRanges *masks) const
596 {
597  //m_Impl->Verify();
598 
599  if ((strategy != eMalloc) && (strategy != eNew)) {
601  eArgErr,
602  "Invalid allocation strategy specified.");
603  }
604 
605  int rv = m_Impl->GetAmbigSeq(oid, buffer, nucl_code, 0, strategy, masks);
606 
607  //m_Impl->Verify();
608 
609  return rv;
610 }
611 
613  char ** buffer,
614  int nucl_code,
616  TSequenceRanges * partial_ranges,
617  TSequenceRanges * masks) const
618 {
619 
620  if ((strategy != eMalloc) && (strategy != eNew)) {
622  eArgErr,
623  "Invalid allocation strategy specified.");
624  }
625 
626  int rv = m_Impl->GetAmbigPartialSeq(oid, buffer, nucl_code, strategy, partial_ranges, masks);
627  return rv;
628 }
629 
630 string CSeqDB::GetTitle() const
631 {
632  return m_Impl->GetTitle();
633 }
634 
635 string CSeqDB::GetDate() const
636 {
637  return m_Impl->GetDate();
638 }
639 
640 CTime
641 CSeqDB::GetDate(const string & dbname,
642  ESeqType seqtype)
643 {
644  vector<string> vols;
645  CSeqDB::FindVolumePaths(dbname, seqtype, vols);
646  string fmt = "b d, Y H:m P";
647  CTime retv;
648  char date[128];
649  ITERATE(vector<string>, vol, vols) {
650  string fn = *vol + ((seqtype == CSeqDB::eProtein)? ".pin" : ".nin");
651  ifstream f(fn.c_str(), ios::in|ios::binary);
652  char s[4]; // size of next chunk
653  if (f.is_open()) {
654  f.seekg(8, ios::beg);
655  f.read(s, 4);
657  f.seekg(offset, ios::cur);
658  f.read(s, 4);
659  offset = SeqDB_GetStdOrd((Uint4 *) s);
660  f.read(date, offset);
661  CTime d(string(date), fmt);
662  if (retv.IsEmpty() || d > retv) {
663  retv = d;
664  }
665  }
666  }
667  return retv;
668 }
669 
671 {
672  return m_Impl->GetNumSeqs();
673 }
674 
676 {
677  return m_Impl->GetNumSeqsStats();
678 }
679 
681 {
682  return m_Impl->GetNumOIDs();
683 }
684 
686 {
687  return m_Impl->GetTotalLength();
688 }
689 
691 {
692  return m_Impl->GetExactTotalLength();
693 }
694 
696 {
697  return m_Impl->GetTotalLengthStats();
698 }
699 
701 {
702  return m_Impl->GetVolumeLength();
703 }
704 
706 {
707  return m_Impl->GetMaxLength();
708 }
709 
711 {
712  return m_Impl->GetMinLength();
713 }
714 
716 {
717  ////m_Impl->Verify();
718 
719  if (m_Impl)
720  delete m_Impl;
721 }
722 
724 {
725  return CSeqDBIter(this, 0);
726 }
727 
728 bool CSeqDB::CheckOrFindOID(int & oid) const
729 {
730  ////m_Impl->Verify();
731  bool rv = m_Impl->CheckOrFindOID(oid);
732  ////m_Impl->Verify();
733 
734  return rv;
735 }
736 
737 
740  int & end,
741  int size,
742  vector<int> & lst,
743  int * state)
744 {
745  ////m_Impl->Verify();
746 
748  m_Impl->GetNextOIDChunk(begin, end, size, lst, state);
749 
750  ////m_Impl->Verify();
751 
752  return rv;
753 }
754 
756 {
758 }
759 
760 const string & CSeqDB::GetDBNameList() const
761 {
762  return m_Impl->GetDBNameList();
763 }
764 
765 list< CRef<CSeq_id> > CSeqDB::GetSeqIDs(int oid) const
766 {
767  ////m_Impl->Verify();
768 
769  list< CRef<CSeq_id> > rv = m_Impl->GetSeqIDs(oid);
770 
771  ////m_Impl->Verify();
772 
773  return rv;
774 }
775 
776 TGi CSeqDB::GetSeqGI(int oid) const
777 {
778  return m_Impl->GetSeqGI(oid);
779 }
780 
781 bool CSeqDB::PigToOid(int pig, int & oid) const
782 {
783  ////m_Impl->Verify();
784  bool rv = m_Impl->PigToOid(pig, oid);
785  ////m_Impl->Verify();
786 
787  return rv;
788 }
789 
790 bool CSeqDB::OidToPig(int oid, int & pig) const
791 {
792  ////m_Impl->Verify();
793  bool rv = m_Impl->OidToPig(oid, pig);
794  ////m_Impl->Verify();
795 
796  return rv;
797 }
798 
799 bool CSeqDB::TiToOid(Int8 ti, int & oid) const
800 {
801  ////m_Impl->Verify();
802  bool rv = m_Impl->TiToOid(ti, oid);
803  ////m_Impl->Verify();
804 
805  return rv;
806 }
807 
808 bool CSeqDB::GiToOid(TGi gi, int & oid) const
809 {
810  ////m_Impl->Verify();
811  bool rv = m_Impl->GiToOid(gi, oid);
812  ////m_Impl->Verify();
813 
814  return rv;
815 }
816 
817 bool CSeqDB::GiToOidwFilterCheck(TGi gi, int & oid) const
818 {
819  ////m_Impl->Verify();
820  bool rv = m_Impl->GiToOidwFilterCheck(gi, oid);
821  ////m_Impl->Verify();
822 
823  return rv;
824 }
825 
826 bool CSeqDB::OidToGi(int oid, TGi & gi) const
827 {
828  ////m_Impl->Verify();
829  TGi gi_tmp;
830  bool rv = m_Impl->OidToGi(oid, gi_tmp);
831  gi = gi_tmp;
832  ////m_Impl->Verify();
833 
834  return rv;
835 }
836 
837 bool CSeqDB::PigToGi(int pig, TGi & gi) const
838 {
839  ////m_Impl->Verify();
840  bool rv = false;
841 
842  int oid(0);
843 
844  if (m_Impl->PigToOid(pig, oid)) {
845  TGi gi_tmp;
846  rv = m_Impl->OidToGi(oid, gi_tmp);
847  gi = gi_tmp;
848  }
849  ////m_Impl->Verify();
850 
851  return rv;
852 }
853 
854 bool CSeqDB::GiToPig(TGi gi, int & pig) const
855 {
856  ////m_Impl->Verify();
857  bool rv = false;
858 
859  int oid(0);
860 
861  if (m_Impl->GiToOid(gi, oid)) {
862  rv = m_Impl->OidToPig(oid, pig);
863  }
864 
865  ////m_Impl->Verify();
866 
867  return rv;
868 }
869 
870 void CSeqDB::AccessionToOids(const string & acc, vector<int> & oids) const
871 {
872  ////m_Impl->Verify();
873  m_Impl->AccessionToOids(acc, oids);
874 
875  // If we have a numeric ID and the search failed, try to look it
876  // up as a GI (but not as a PIG or TI). Due to the presence of
877  // PDB ids like "pdb|1914|a", the faster GitToOid is not done
878  // first (unless the caller does so.)
879 
880  if (oids.empty()) {
881  try {
882  TGi gi = NStr::StringToNumeric<TGi>(acc, NStr::fConvErr_NoThrow);
883  int oid(-1);
884 
885  if (gi > ZERO_GI && m_Impl->GiToOidwFilterCheck(gi, oid)) {
886  oids.push_back(oid);
887  }
888  }
889  catch(...) {
890  }
891  }
892 
893  ////m_Impl->Verify();
894 }
895 
896 void CSeqDB::SeqidToOids(const CSeq_id & seqid, vector<int> & oids) const
897 {
898  ////m_Impl->Verify();
899  m_Impl->SeqidToOids(seqid, oids, true);
900  ////m_Impl->Verify();
901 }
902 
903 bool CSeqDB::SeqidToOid(const CSeq_id & seqid, int & oid) const
904 {
905  ////m_Impl->Verify();
906  bool rv = false;
907 
908  oid = -1;
909 
910  vector<int> oids;
911  m_Impl->SeqidToOids(seqid, oids, false);
912 
913  if (! oids.empty()) {
914  rv = true;
915  oid = oids[0];
916  }
917 
918  ////m_Impl->Verify();
919 
920  return rv;
921 }
922 
923 int CSeqDB::GetOidAtOffset(int first_seq, Uint8 residue) const
924 {
925  ////m_Impl->Verify();
926  int rv = m_Impl->GetOidAtOffset(first_seq, residue);
927  ////m_Impl->Verify();
928 
929  return rv;
930 }
931 
932 CSeqDBIter::CSeqDBIter(const CSeqDB * db, int oid)
933  : m_DB (db),
934  m_OID (oid),
935  m_Data (0),
936  m_Length((int) -1)
937 {
938  if (m_DB->CheckOrFindOID(m_OID)) {
939  x_GetSeq();
940  }
941 }
942 
944  : m_DB (other.m_DB),
945  m_OID (other.m_OID),
946  m_Data (0),
947  m_Length((int) -1)
948 {
949  if (m_DB->CheckOrFindOID(m_OID)) {
950  x_GetSeq();
951  }
952 }
953 
954 /// Copy one iterator to another.
956 {
957  x_RetSeq();
958 
959  m_DB = other.m_DB;
960  m_OID = other.m_OID;
961  m_Data = 0;
962  m_Length = -1;
963 
964  if (m_DB->CheckOrFindOID(m_OID)) {
965  x_GetSeq();
966  }
967 
968  return *this;
969 }
970 
972 {
973  x_RetSeq();
974 
975  ++m_OID;
976 
977  if (m_DB->CheckOrFindOID(m_OID)) {
978  x_GetSeq();
979  } else {
980  m_Length = -1;
981  }
982 
983  return *this;
984 }
985 
988 {
989  ////m_Impl->Verify();
990 
991  CRef<CBioseq> bs;
992  int oid(0);
993 
994  if (m_Impl->GiToOid(gi, oid)) {
995  bs = m_Impl->GetBioseq(oid, gi, NULL, true);
996  }
997 
998  ////m_Impl->Verify();
999 
1000  return bs;
1001 }
1002 
1004 CSeqDB::PigToBioseq(int pig) const
1005 {
1006  ////m_Impl->Verify();
1007 
1008  int oid(0);
1009  CRef<CBioseq> bs;
1010 
1011  if (m_Impl->PigToOid(pig, oid)) {
1012  bs = m_Impl->GetBioseq(oid, ZERO_GI, NULL, true);
1013  }
1014 
1015  ////m_Impl->Verify();
1016 
1017  return bs;
1018 }
1019 
1021 CSeqDB::SeqidToBioseq(const CSeq_id & seqid) const
1022 {
1023  ////m_Impl->Verify();
1024 
1025  vector<int> oids;
1026  CRef<CBioseq> bs;
1027 
1028  m_Impl->SeqidToOids(seqid, oids, false);
1029 
1030  if (! oids.empty()) {
1031  bs = m_Impl->GetBioseq(oids[0], ZERO_GI, &seqid, true);
1032  }
1033 
1034  ////m_Impl->Verify();
1035 
1036  return bs;
1037 }
1038 
1039 void
1041  ESeqType seqtype,
1042  vector<string> & paths,
1043  vector<string> * alias_paths,
1044  bool recursive,
1045  bool expand_links)
1046 {
1047  if (seqtype == CSeqDB::eProtein) {
1048  CSeqDBImpl::FindVolumePaths(dbname, 'p', paths, alias_paths, recursive, expand_links);
1049  } else if (seqtype == CSeqDB::eNucleotide) {
1050  CSeqDBImpl::FindVolumePaths(dbname, 'n', paths, alias_paths, recursive, expand_links);
1051  } else {
1052  try {
1053  CSeqDBImpl::FindVolumePaths(dbname, 'p', paths, alias_paths, recursive, expand_links);
1054  }
1055  catch(...) {
1056  CSeqDBImpl::FindVolumePaths(dbname, 'n', paths, alias_paths, recursive, expand_links);
1057  }
1058  }
1059 }
1060 
1061 void
1062 CSeqDB::FindVolumePaths(vector<string> & paths, bool recursive) const
1063 {
1064  ////m_Impl->Verify();
1065  m_Impl->FindVolumePaths(paths, recursive);
1066  ////m_Impl->Verify();
1067 }
1068 
1069 void
1070 CSeqDB::GetGis(int oid, vector<TGi> & gis, bool append) const
1071 {
1072  ////m_Impl->Verify();
1073 
1074  // This could be done a little faster at a lower level, but not
1075  // necessarily by too much. If this operation is important to
1076  // performance, that decision can be revisited.
1077 
1078  list< CRef<CSeq_id> > seqids = GetSeqIDs(oid);
1079 
1080  if (! append) {
1081  gis.clear();
1082  }
1083 
1084  ITERATE(list< CRef<CSeq_id> >, seqid, seqids) {
1085  if ((**seqid).IsGi()) {
1086  gis.push_back((**seqid).GetGi());
1087  }
1088  }
1089 
1090  ////m_Impl->Verify();
1091 }
1092 
1093 void CSeqDB::SetIterationRange(int oid_begin, int oid_end)
1094 {
1095  m_Impl->SetIterationRange(oid_begin, oid_end);
1096 }
1097 
1099 {
1100  ////m_Impl->Verify();
1101  m_Impl->GetAliasFileValues(afv);
1102  ////m_Impl->Verify();
1103 }
1104 
1106 {
1107  CSeqDBImpl::GetTaxInfo(taxid, info);
1108 }
1109 
1111  int * oid_count,
1112  Uint8 * total_length,
1113  bool use_approx) const
1114 {
1115  ////m_Impl->Verify();
1116  m_Impl->GetTotals(sumtype, oid_count, total_length, use_approx);
1117  ////m_Impl->Verify();
1118 }
1119 
1121 {
1122  return m_Impl->GetGiList();
1123 }
1124 
1126 {
1127  return m_Impl->GetIdSet();
1128 }
1129 
1131  string & output,
1132  TSeqRange range /* = TSeqRange() */) const
1133 {
1137 
1138  GetSequenceAsString(oid, code_to, output, range);
1139 }
1140 
1142  CSeqUtil::ECoding coding,
1143  string & output,
1144  TSeqRange range /* = TSeqRange() */) const
1145 {
1146  output.erase();
1147 
1148  string raw;
1149  const char * buffer = 0;
1150  int length = 0;
1151 
1152  // Protein dbs ignore encodings, always returning ncbistdaa.
1153  if (range.NotEmpty()) {
1154  length = GetAmbigSeq(oid, & buffer, kSeqDBNuclNcbiNA8,
1155  range.GetFrom(), range.GetToOpen());
1156  } else {
1157  length = GetAmbigSeq(oid, & buffer, kSeqDBNuclNcbiNA8);
1158  }
1159 
1160  try {
1161  raw.assign(buffer, length);
1162  }
1163  catch(...) {
1164  RetAmbigSeq(& buffer);
1165  throw;
1166  }
1167  RetAmbigSeq(& buffer);
1168 
1172 
1173  string result;
1174 
1175  if (code_from == coding) {
1176  result.swap(raw);
1177  } else {
1179  code_from,
1180  0,
1181  length,
1182  result,
1183  coding);
1184  }
1185 
1186  output.swap(result);
1187 }
1188 
1189 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
1190  (!defined(NCBI_COMPILER_MIPSPRO)) )
1191 void CSeqDB::ListColumns(vector<string> & titles)
1192 {
1193  m_Impl->ListColumns(titles);
1194 }
1195 
1196 int CSeqDB::GetColumnId(const string & title)
1197 {
1198  return m_Impl->GetColumnId(title);
1199 }
1200 
1201 const map<string,string> &
1203 {
1204  return m_Impl->GetColumnMetaData(column_id);
1205 }
1206 
1207 const string & CSeqDB::GetColumnValue(int column_id, const string & key)
1208 {
1209  static string mt;
1210  return SeqDB_MapFind(GetColumnMetaData(column_id), key, mt);
1211 }
1212 
1213 const map<string,string> &
1215  const string & volname)
1216 {
1217  return m_Impl->GetColumnMetaData(column_id, volname);
1218 }
1219 
1220 void CSeqDB::GetColumnBlob(int col_id,
1221  int oid,
1222  CBlastDbBlob & blob)
1223 {
1224  m_Impl->GetColumnBlob(col_id, oid, true, blob);
1225 }
1226 
1227 void CSeqDB::GetAvailableMaskAlgorithms(vector<int> & algorithms)
1228 {
1229  m_Impl->GetAvailableMaskAlgorithms(algorithms);
1230 }
1231 
1232 int CSeqDB::GetMaskAlgorithmId(const string &algo_name) const
1233 {
1234  return m_Impl->GetMaskAlgorithmId(algo_name);
1235 }
1236 
1238 {
1240 }
1241 
1242 vector<int> CSeqDB::ValidateMaskAlgorithms(const vector<int>& algorithm_ids)
1243 {
1244  vector<int> invalid_algo_ids, available_algo_ids;
1245  GetAvailableMaskAlgorithms(available_algo_ids);
1246  invalid_algo_ids.reserve(algorithm_ids.size());
1247  if (available_algo_ids.empty()) {
1248  copy(algorithm_ids.begin(), algorithm_ids.end(),
1249  back_inserter(invalid_algo_ids));
1250  return invalid_algo_ids;
1251  }
1252 
1253  ITERATE(vector<int>, itr, algorithm_ids) {
1254  vector<int>::const_iterator pos = find(available_algo_ids.begin(),
1255  available_algo_ids.end(), *itr);
1256  if (pos == available_algo_ids.end()) {
1257  invalid_algo_ids.push_back(*itr);
1258  }
1259  }
1260  return invalid_algo_ids;
1261 }
1262 
1263 void CSeqDB::GetMaskAlgorithmDetails(int algorithm_id,
1265  string & program_name,
1266  string & algo_opts)
1267 {
1268  string sid;
1269  m_Impl->GetMaskAlgorithmDetails(algorithm_id, sid, program_name,
1270  algo_opts);
1271  Int4 id(0);
1273  program = (objects::EBlast_filter_program)id;
1274 }
1275 
1276 void CSeqDB::GetMaskAlgorithmDetails(int algorithm_id,
1277  string & program,
1278  string & program_name,
1279  string & algo_opts)
1280 {
1281  m_Impl->GetMaskAlgorithmDetails(algorithm_id, program, program_name,
1282  algo_opts);
1283 }
1284 
1285 void CSeqDB::GetMaskData(int oid,
1286  int algo_id,
1287  TSequenceRanges & ranges)
1288 {
1289  m_Impl->GetMaskData(oid, algo_id, ranges);
1290 }
1291 
1292 #endif
1293 
1294 
1296  const CSeqDB::TRangeList & offset_ranges,
1297  bool append_ranges,
1298  bool cache_data)
1299 {
1300  ////m_Impl->Verify();
1301 
1302  m_Impl->SetOffsetRanges(oid,
1303  offset_ranges,
1304  append_ranges,
1305  cache_data);
1306 
1307  ////m_Impl->Verify();
1308 }
1309 
1311 {
1312  static TRangeList empty;
1313  SetOffsetRanges(oid, empty, false, false);
1314 }
1315 
1317 {
1319 }
1320 
1321 void CSeqDB::SetNumberOfThreads(int num_threads, bool force_mt)
1322 {
1323  ////m_Impl->Verify();
1324 
1325  m_Impl->SetNumberOfThreads(num_threads, force_mt);
1326 }
1327 
1329 {
1330  string retval("Unknown");
1331  switch (type) {
1332  case eProtein: retval.assign("Protein"); break;
1333  case eNucleotide: retval.assign("Nucleotide"); break;
1334  case eUnknown:
1335  default: break;
1336  }
1337  return retval;
1338 }
1339 
1341 {
1343 }
1344 
1346 {
1347  m_Impl->SetVolsMemBit(mbit);
1348 }
1349 
1350 /// Functor class for FindFilesInDir
1352 public:
1354  const string& extn = de.GetPath().substr(de.GetPath().length() - 3, 1);
1356  // rm extension
1357  value.m_BlastDbName = de.GetPath().substr(0, de.GetPath().length() - 4);
1358  CNcbiOstrstream oss;
1359  // Needed for escaping spaces
1360  oss << "\"" << value.m_BlastDbName << "\"";
1361  value.m_BlastDbName = CNcbiOstrstreamToString(oss);
1362  value.m_MoleculeType =
1363  (extn == "n" ? CSeqDB::eNucleotide : CSeqDB::eProtein);
1364  m_DBs.push_back(value);
1365  }
1366 
1367  vector<SSeqDBInitInfo> m_DBs;
1368 
1369  /// Auxiliary function to get the original file name found by this object
1370  string GetFileName(size_t idx) {
1371  SSeqDBInitInfo& info = m_DBs[idx];
1372  string retval = NStr::Replace(info.m_BlastDbName, "\"", kEmptyStr);
1373  if (info.m_MoleculeType == CSeqDB::eNucleotide) {
1374  string alias = retval + ".nal", index = retval + ".nin";
1375  retval = (CFile(alias).Exists() ? alias : index);
1376  } else {
1377  string alias = retval + ".pal", index = retval + ".pin";
1378  retval = (CFile(alias).Exists() ? alias : index);
1379  }
1380  return retval;
1381  }
1382 };
1383 
1384 /** Functor object for s_RemoveAliasComponents where the path name is matched
1385  * in SSeqDBInitInfo */
1386 class PathFinder {
1387 public:
1388  PathFinder(const string& p) : m_Path(p) {}
1389  bool operator() (const SSeqDBInitInfo& value) const {
1390  return (NStr::Find(value.m_BlastDbName, m_Path) != NPOS);
1391  }
1392 
1393 private:
1394  string m_Path;
1395 };
1396 
1398 {
1399  set<string> dbs2remove;
1400  for (size_t i = 0; i < finder.m_DBs.size(); i++) {
1401  string path = finder.GetFileName(i);
1402  if (path[path.size()-1] != 'l') { // not an alias file
1403  continue;
1404  }
1405  CNcbiIfstream in(path.c_str());
1406  if (!in) {
1407  continue;
1408  }
1409  string line;
1410  while (getline(in, line)) {
1411  if (NStr::StartsWith(line, "DBLIST")) {
1412  vector<string> tokens;
1414  for (size_t j = 1; j < tokens.size(); j++) {
1415  dbs2remove.insert(tokens[j]);
1416  }
1417  }
1418  }
1419  }
1420 
1421  ITERATE(set<string>, i, dbs2remove) {
1422  finder.m_DBs.erase(remove_if(finder.m_DBs.begin(), finder.m_DBs.end(),
1423  PathFinder(*i)),
1424  finder.m_DBs.end());
1425  }
1426 }
1427 
1428 vector<SSeqDBInitInfo>
1429 FindBlastDBs(const string& path, const string& dbtype, bool recurse,
1430  bool include_alias_files /* = false */,
1431  bool remove_redundant_dbs /* = false */)
1432 {
1433  // 1. Find every database volume (but not alias files etc).
1434  vector<string> fmasks, dmasks;
1435 
1436  // If the type is 'guess' we do both types of databases.
1437 
1438  if (dbtype != "nucl") {
1439  fmasks.push_back("*.pin");
1440  if (include_alias_files) {
1441  fmasks.push_back("*.pal");
1442  }
1443  }
1444  if (dbtype != "prot") {
1445  fmasks.push_back("*.nin");
1446  if (include_alias_files) {
1447  fmasks.push_back("*.nal");
1448  }
1449  }
1450  dmasks.push_back("*");
1451 
1453  (fFF_File | (recurse ? fFF_Recursive : 0));
1454 
1455  CBlastDbFinder dbfinder;
1456  FindFilesInDir(CDir(path), fmasks, dmasks, dbfinder, flags);
1457  if (remove_redundant_dbs) {
1458  s_RemoveAliasComponents(dbfinder);
1459  }
1460  sort(dbfinder.m_DBs.begin(), dbfinder.m_DBs.end());
1461  return dbfinder.m_DBs;
1462 }
1463 
// Body of CSeqDB::GetDiskUsage(): returns the summed size in bytes of the
// files that make up this BLAST database.
// NOTE(review): the signature line (listing line 1464) is absent from this
// listing.
1465 {
1466  vector<string> paths;
1467  FindVolumePaths(paths);
1468  _ASSERT( !paths.empty() );
1469 
1470  Int8 retval = 0;
1471 
1472  vector<string> extn;
1473  const bool is_protein(GetSequenceType() == CSeqDB::eProtein);
1474  SeqDB_GetFileExtensions(is_protein, extn, GetBlastDbVersion());
 // Directory of the most recently measured volume file; reused below to
 // locate the files that belong to the database as a whole.
1475  string blastdb_dirname;
1476 
 // Per-volume files: try every known extension on every volume path and add
 // up the sizes of the files that exist.
1477  ITERATE(vector<string>, path, paths) {
1478  ITERATE(vector<string>, ext, extn) {
1479  CFile file(*path + "." + *ext);
1480  if (file.Exists()) {
1481  Int8 length = file.GetLength();
 // -1 is treated as "size could not be determined"; such files are reported
 // and left out of the total.
1482  if (length != -1) {
1483  retval += length;
1484  LOG_POST(Trace << "File " << file.GetPath() << " " << length << " bytes");
1485  blastdb_dirname = file.GetDir();
1486  } else {
1487  ERR_POST(Error << "Error retrieving file size for "
1488  << file.GetPath());
1489  }
1490  }
1491  }
1492  }
1493  // For multi-volume databases, take into account files that apply to the
1494  // entire BLASTDB
1495  if (paths.size() > 1) {
1496  _ASSERT( !blastdb_dirname.empty() );
1497  auto dbname = GetDBNameList();
1498  vector<string> dblist;
1499  NStr::Split(dbname, " ", dblist, NStr::fSplit_Tokenize);
 // More than one name in the list means several databases are open at once,
 // which this function does not support.
1500  if (dblist.size() > 1) {
1501  CNcbiOstrstream oss;
1502  oss << "Cannot compute disk usage for multiple BLASTDBs (i.e.: '"
1503  << dbname << "') at once. Please try again using one BLASTDB "
1504  << "at a time.";
 // NOTE(review): the statement that consumes `oss` (listing line 1505) is
 // absent from this listing; presumably it throws with this message -- confirm.
1506  }
1507 
 // Database-wide files live in the volume directory under the database name.
1508  for (const auto& ext: extn) {
1509  CFile file(CDirEntry::MakePath(blastdb_dirname, dbname, ext));
1510  if (file.Exists()) {
1511  Int8 length = file.GetLength();
1512  if (length != -1) {
1513  retval += length;
1514  LOG_POST(Trace << "File " << file.GetPath() << " " << length << " bytes");
1515  } else {
1516  ERR_POST(Error << "Error retrieving file size for "
1517  << file.GetPath());
1518  }
1519  }
1520  }
1521  }
1522  return retval;
1523 }
1524 
1525 
/// Translate a molecule-type string into a CSeqDB sequence type.
///
/// The comparison is a case-insensitive prefix match: strings starting with
/// "prot" give CSeqDB::eProtein, "nucl" gives CSeqDB::eNucleotide and
/// "guess" gives CSeqDB::eUnknown.
///
/// @param s
///   Molecule type string.
/// @return
///   The matching sequence type; for an unrecognized string, whatever value
///   `retval` was initialized with.
// NOTE(review): the return-type line (listing line 1526) and the declaration
// of `retval` (listing line 1529) are absent from this listing.
1527 ParseMoleculeTypeString(const string& s)
1528 {
1530  if (NStr::StartsWith(s, "prot", NStr::eNocase)) {
1531  retval = CSeqDB::eProtein;
1532  } else if (NStr::StartsWith(s, "nucl", NStr::eNocase)) {
1533  retval = CSeqDB::eNucleotide;
1534  } else if (NStr::StartsWith(s, "guess", NStr::eNocase)) {
1535  retval = CSeqDB::eUnknown;
1536  } else {
 // NOTE(review): this compares the address of a string literal with 0, which
 // is always true, so the assertion can never fire. If the intent is to trap
 // unknown input in debug builds, the condition needs to be false here
 // (e.g. negate the literal) -- confirm the intent.
1537  _ASSERT("Unknown molecule for BLAST DB" != 0);
1538  }
1539  return retval;
1540 }
1541 
1542 bool DeleteBlastDb(const string& dbpath, CSeqDB::ESeqType seq_type)
1543 {
1544  int num_files_removed = 0;
1545  vector<string> db_files, alias_files;
1546  bool is_protein = (seq_type == CSeqDB::eProtein);
1547 
1548  vector<string> extn;
1549  SeqDB_GetFileExtensions( is_protein, extn, eBDB_Version4);
1550  vector<string> lmdb_extn;
1551  SeqDB_GetLMDBFileExtensions(is_protein, lmdb_extn);
1552  ITERATE(vector<string>, lmdb, lmdb_extn) {
1553  CNcbiOstrstream oss;
1554  oss << dbpath << "." << *lmdb;
1555  const string fname = CNcbiOstrstreamToString(oss);
1556  if (CFile(fname).Remove()) {
1557  LOG_POST(Info << "Deleted " << fname);
1558  num_files_removed++;
1559  }
1560  else {
1561  unsigned int index = 0;
1562  string vfname = dbpath + "." + NStr::IntToString(index/10) +
1563  NStr::IntToString(index%10) + "." + *lmdb;
1564  while (CFile(vfname).Remove()) {
1565  index++;
1566  vfname = dbpath + "." + NStr::IntToString(index/10) +
1567  NStr::IntToString(index%10) + "." + *lmdb;
1568 
1569  }
1570  }
1571  }
1572 
1573  try { CSeqDB::FindVolumePaths(dbpath, seq_type, db_files, &alias_files); }
1574  catch (...) {} // ignore any errors from the invocation above
1575  ITERATE(vector<string>, f, db_files) {
1576  ITERATE(vector<string>, e, extn) {
1577  CNcbiOstrstream oss;
1578  oss << *f << "." << *e;
1579  const string fname = CNcbiOstrstreamToString(oss);
1580  if (CFile(fname).Remove()) {
1581  LOG_POST(Info << "Deleted " << fname);
1582  num_files_removed++;
1583  }
1584  }
1585  }
1586  ITERATE(vector<string>, f, alias_files) {
1587  if (CFile(*f).Remove()) {
1588  LOG_POST(Info << "Deleted " << *f);
1589  num_files_removed++;
1590  }
1591  }
1592  return static_cast<bool>(num_files_removed != 0);
1593 }
1594 
/// Format string for the date returned by CSeqDB::GetDate.
1595 const char* CSeqDB::kBlastDbDateFormat = "b d, Y H:m P";
1596 
1597 void CSeqDB::DebugDump(CDebugDumpContext ddc, unsigned int depth) const
1598 {
1599  ddc.SetFrame("CSeqDB");
1600  CObject::DebugDump(ddc, depth);
1601  ddc.Log("m_Impl", m_Impl, depth);
1602 }
1603 
// Body of CSeqDB::GetBlastDbVersion(): returns the BLAST database version by
// forwarding to the implementation object.
// NOTE(review): the signature line (listing line 1604) is absent from this
// listing.
1605 {
1606  return m_Impl->GetBlastDbVersion();
1607 }
1608 
1609 
/// Collect the names and sizes of the files that belong to this database.
///
/// @param disk_bytes
///   Output; reset to 0, then receives the summed size of every file found.
/// @param cached_bytes
///   Output; reset to 0, then receives the summed size of the index
///   (.pin/.nin) and sequence (.psq/.nsq) volume files only.
/// @param db_files
///   Output; cleared, then filled with user_path + file name for every file
///   found, and finally sorted.
/// @param user_path
///   Prefix prepended verbatim to each file name (no separator is added).
1610 void CSeqDB::x_GetDBFilesMetaData(Int8 & disk_bytes, Int8 & cached_bytes, vector<string> & db_files, const string & user_path) const
1611 {
1612  vector<string> paths;
1613  vector<string> alias;
1614  m_Impl->FindVolumePaths(paths, alias, true);
1615  _ASSERT( !paths.empty() );
1616 
1617  db_files.clear();
1618  cached_bytes = 0;
1619  disk_bytes = 0;
1620 
 // Alias files: listed and counted towards disk_bytes only.
1621  ITERATE(vector<string>, a, alias) {
1622  CFile af(*a);
1623  if (af.Exists()) {
1624  string fn = user_path + af.GetName();
1625  db_files.push_back(fn);
1626  Int8 afl = af.GetLength();
 // -1 is treated as "size could not be determined"; the file stays in the
 // list but adds nothing to the totals.
1627  if (afl != -1) {
1628  disk_bytes += afl;
1629  } else {
1630  ERR_POST(Error << "Error retrieving file size for " << af.GetPath());
1631  }
1632  }
1633  }
1634 
1635  vector<string> extn;
1636  const bool is_protein(GetSequenceType() == CSeqDB::eProtein);
 // NOTE(review): listing line 1637 is absent from this listing; presumably
 // it is the call that fills `extn`, which is otherwise never populated
 // here -- confirm.
1638 
 // 'p' or 'n' prefix shared by all extensions of this molecule type.
1639  const string kExtnMol(1, is_protein ? 'p' : 'n');
1640  const string index_ext = kExtnMol + "in";
1641  const string seq_ext = kExtnMol + "sq";
1642 
 // Volume files: every extension on every volume path. Only the index and
 // sequence files also count towards cached_bytes.
1643  ITERATE(vector<string>, path, paths) {
1644  ITERATE(vector<string>, ext, extn) {
1645  CFile file(*path + "." + *ext);
1646  if (file.Exists()) {
1647  string f = user_path + file.GetName();
1648  db_files.push_back(f);
1649  Int8 length = file.GetLength();
1650  if (length != -1) {
1651  disk_bytes += length;
1652  if((*ext == index_ext) || (*ext == seq_ext)) {
1653  cached_bytes += length;
1654  }
1655  } else {
1656  ERR_POST(Error << "Error retrieving file size for "
1657  << file.GetPath());
1658  }
1659  }
1660  }
1661  }
1662 
 // NOTE(review): listing line 1663 is absent from this listing; it must open
 // the block that is closed at listing line 1694 (presumably a check for
 // version 5 databases, given the `v5_exts` name below) -- confirm.
1664  vector<string> lmdb_list;
1665  m_Impl->GetLMDBFileNames(lmdb_list);
1666 
 // LMDB files, plus the companion files that sit next to each of them.
1667  ITERATE(vector<string>, l, lmdb_list) {
1668  CFile file(*l);
1669  if (file.Exists()) {
1670  string f = user_path + file.GetName();
1671  db_files.push_back(f);
1672  Int8 length = file.GetLength();
1673  if (length != -1) {
1674  disk_bytes += length;
1675  } else {
1676  ERR_POST(Error << "Error retrieving file size for " << file.GetPath());
1677  }
 // Companion files are looked up as <dir><base>.<p|n><suffix> for each
 // suffix in the NULL-terminated table below.
1678  static const char * v5_exts[]={"os", "ot", "tf", "to", NULL};
1679  for(const char ** p=v5_exts; *p != NULL; p++) {
1680  CFile v(file.GetDir() + file.GetBase() + "." + kExtnMol + (*p));
1681  if (v.Exists()) {
1682  string vf = user_path + v.GetName();
1683  db_files.push_back(vf);
1684  Int8 vl = v.GetLength();
1685  if (vl != -1) {
1686  disk_bytes += vl;
1687  } else {
1688  ERR_POST(Error << "Error retrieving file size for " << v.GetPath());
1689  }
1690  }
1691  }
1692  }
1693  }
1694  }
1695 
1696  // FIXME: increase the version for this new file type
1697  //string ext;
1698  //SeqDB_GetMetadataFileExtension(is_protein, ext);
1699  //const CFile dbfile(paths.front());
1700  //db_files.push_back(user_path + dbfile.GetBase() + "." + ext);
1701 
1702  sort(db_files.begin(), db_files.end());
1703 }
1704 
// Body of CSeqDB::GetDBMetaData(): fills a CBlast_db_metadata object with
// the name, type, version, title, sizes, update date, file list, volume
// count and tax ID count of this database, and returns it.
// NOTE(review): the signature line (listing line 1705) and the declaration
// of `m` (listing line 1707) are absent from this listing.
1706 {
1708  int num_seqs = 0;
1709  Uint8 total_length = 0;
1710 
1711  GetTotals(CSeqDB::eFilteredAll, &num_seqs, &total_length, true);
1712  vector<string> dblist;
 // NOTE(review): the statement that fills `dblist` (listing line 1713) is
 // absent from this listing.
 // Reduce each entry to its final path component.
1714  NON_CONST_ITERATE(vector<string>, itr, dblist) {
1715  size_t off = (*itr).find_last_of(CFile::GetPathSeparator());
1716  if (off != string::npos ) {
1717  (*itr).erase(0, off+1);
1718  }
1719  }
1720 
1721  string dbnames = NStr::Join(dblist, " ");
1722  m->SetDbname(dbnames);
1723 
1724  m->SetDbtype(GetSequenceType() == CSeqDB::eProtein ? "Protein" : "Nucleotide" );
 // Anything other than version 5 is reported as version 4.
1725  m->SetDb_version(GetBlastDbVersion() == EBlastDbVersion::eBDB_Version5?5:4);
1726  m->SetDescription(GetTitle());
1727  m->SetNumber_of_letters(total_length);
1728  m->SetNumber_of_sequences(num_seqs);
1729 
 // NOTE(review): the declaration of `timeFmt` (listing line 1730) is absent
 // from this listing.
 // NOTE(review): this literal duplicates the value of
 // CSeqDB::kBlastDbDateFormat; consider using the constant.
1731  string fmt = "b d, Y H:m P";
1732  CTime date(GetDate(), fmt);
1733  m->SetLast_updated(date.AsString(timeFmt));
1734 
1735  Int8 disk_bytes(0), cached_bytes(0);
1736  x_GetDBFilesMetaData(disk_bytes, cached_bytes, m->SetFiles(), user_path);
1737  m->SetBytes_total(disk_bytes);
1738  m->SetBytes_to_cache(cached_bytes);
1739 
1740  m->SetNumber_of_volumes(m_Impl->GetNumOfVols());
1741 
 // NOTE(review): listing line 1742 is absent from this listing; it must open
 // the block that is closed at listing line 1748.
1743  set<TTaxId> tax_ids;
1744  GetDBTaxIds(tax_ids);
 // The tax ID count is only recorded when the set holds something other than
 // a single tax ID of 0.
1745  if((tax_ids.size() > 1) || ((tax_ids.size() == 1) && (0 != *tax_ids.begin()))){
1746  m->SetNumber_of_taxids(static_cast<int>(tax_ids.size()));
1747  }
1748  }
1749  return m;
1750 }
1751 
/// Get all tax ids for an accession.
///
/// @param accs
///   Accession string.
/// @param taxids
///   Output vector, passed on to CSeqDBImpl::GetTaxIdsForSeqId.
1752 void CSeqDB::GetTaxIdsForAccession(const string & accs, vector<TTaxId> & taxids)
1753 {
 // NOTE(review): the declaration of `seqid` (listing line 1754) is absent
 // from this listing; presumably it is a Seq-id built from `accs` -- confirm.
1755  m_Impl->GetTaxIdsForSeqId(seqid, taxids);
1756 }
1757 
1758 void CSeqDB::GetTaxIdsForSeqId(const CSeq_id & seq_id, vector<TTaxId> & taxids)
1759 {
1760  m_Impl->GetTaxIdsForSeqId(seq_id, taxids);
1761 }
1762 
1763 
1765 
void remove_if(Container &c, Predicate *__pred)
Definition: chainer.hpp:69
`Blob' Class for SeqDB (and WriteDB).
Definition: seqdbblob.hpp:56
Functor class for FindFilesInDir.
Definition: seqdb.cpp:1351
void operator()(CDirEntry &de)
Definition: seqdb.cpp:1353
vector< SSeqDBInitInfo > m_DBs
Definition: seqdb.cpp:1367
string GetFileName(size_t idx)
Auxiliary function to get the original file name found by this object.
Definition: seqdb.cpp:1370
CBlast_db_metadata –.
void SetFrame(const string &frame)
Definition: ddumpable.cpp:137
void Log(const string &name, const char *value, CDebugDumpFormatter::EValueType type=CDebugDumpFormatter::eValue, const string &comment=kEmptyStr)
Definition: ddumpable.cpp:151
CDirEntry –.
Definition: ncbifile.hpp:262
CDir –.
Definition: ncbifile.hpp:1696
CFile –.
Definition: ncbifile.hpp:1605
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
static SIZE_TYPE Convert(const CTempString &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst, TCoding dst_coding)
static const string GenerateSearchPath()
Generate search path.
Definition: seqdbatlas.hpp:510
CSeqDBException.
Definition: seqdbcommon.hpp:73
CSeqDBGiList.
SeqDB ID list for performing boolean set operations.
bool Blank() const
Check if an ID list is blank.
bool IsPositive()
Checks whether a positive GI list was produced.
CRef< CSeqDBNegativeList > GetNegativeList()
Retrieve a negative GI list.
CRef< CSeqDBGiList > GetPositiveList()
Retrieve a positive GI list.
CSeqDBImpl class.
Definition: seqdbimpl.hpp:138
void GetTaxIDs(int oid, map< TGi, TTaxId > &gi_to_taxid, bool persist)
Get gi to taxid map for an OID.
Definition: seqdbimpl.cpp:448
void GetDBTaxIds(set< TTaxId > &tax_ids)
Get all unique tax ids from db.
Definition: seqdbimpl.cpp:1311
char GetSeqType() const
Get the sequence type.
Definition: seqdbimpl.cpp:994
int GetOidAtOffset(int first_seq, Uint8 residue) const
Find the OID corresponding to the offset given in residues, into the database as a whole.
Definition: seqdbimpl.cpp:1467
int GetMinLength() const
Returns the length of the smallest sequence in the database.
Definition: seqdbimpl.cpp:1077
void GetLeafTaxIDs(int oid, map< TGi, set< TTaxId > > &gi_to_taxid_set, bool persist)
Get gi to taxid map for an OID.
Definition: seqdbimpl.cpp:525
void AccessionsToOids(const vector< string > &accs, vector< blastdb::TOid > &oids)
Definition: seqdbimpl.cpp:1351
EBlastDbVersion GetBlastDbVersion() const
Return blast db version.
Definition: seqdbimpl.cpp:2665
int GetMaskAlgorithmId(const string &algo_name)
Get the numeric ID for a algorithm name.
Definition: seqdbimpl.cpp:2296
int GetColumnId(const string &title)
Get an ID number for a given column title.
Definition: seqdbimpl.cpp:1974
bool GiToOidwFilterCheck(TGi gi, int &oid)
GiToOid is meant to simply return the OID for a GI if one exists. This method finds the OID and checks if...
Definition: seqdbimpl.cpp:1194
void GetColumnBlob(int col_id, int oid, bool keep, CBlastDbBlob &blob)
Fetch the data blob for the given column and oid.
Definition: seqdbimpl.cpp:2086
void SetIterationRange(int oid_begin, int oid_end)
Set Iteration Range.
Definition: seqdbimpl.cpp:183
int GetNumOIDs() const
Returns the size of the (possibly sparse) OID range.
Definition: seqdbimpl.cpp:839
void GetMaskAlgorithmDetails(int algorithm_id, string &program, string &program_name, string &algo_opts)
Get information about one type of masking available here.
Definition: seqdbimpl.cpp:2384
void SetNumberOfThreads(int num_threads, bool force_mt=false)
Invoke the garbage collector to free up memory.
Definition: seqdbimpl.cpp:2566
TGi GetSeqGI(int oid)
Look up for the GI of a sequence.
Definition: seqdbimpl.cpp:820
int GetMaxLength() const
Returns the length of the largest sequence in the database.
Definition: seqdbimpl.cpp:1071
void TaxIdsToOids(set< TTaxId > &tax_ids, vector< blastdb::TOid > &rv)
Get Oid list for input tax ids.
Definition: seqdbimpl.cpp:1289
void RetAmbigSeq(const char **buffer) const
Returns any resources associated with the sequence.
Definition: seqdbimpl.cpp:653
void FlushOffsetRangeCache()
Flush all offset ranges cached.
Definition: seqdbimpl.cpp:1850
void GetAllTaxIDs(int oid, set< TTaxId > &taxids)
Get all tax ids (leaf and non-leaf) for an oid.
Definition: seqdbimpl.cpp:509
int GetNumOfVols() const
Definition: seqdbimpl.cpp:2671
Uint8 GetVolumeLength() const
Returns the sum of the lengths of all volumes.
Definition: seqdbimpl.cpp:876
void GetLMDBFileNames(vector< string > &lmdb_list) const
Definition: seqdbimpl.cpp:2676
const string & GetDBNameList() const
Get list of database names.
Definition: seqdbimpl.cpp:1083
CRef< CBioseq > GetBioseq(int oid, TGi target_gi, const CSeq_id *target_seq_id, bool seqdata)
Get a CBioseq for a sequence.
Definition: seqdbimpl.cpp:606
static void FindVolumePaths(const string &dbname, char prot_nucl, vector< string > &paths, vector< string > *alias_paths, bool recursive, bool expand_links)
Find volume paths.
Definition: seqdbimpl.cpp:1522
void ListColumns(vector< string > &titles)
List columns titles found in this database.
Definition: seqdbimpl.cpp:1959
int GetAmbigPartialSeq(int oid, char **buffer, int nucl_code, ESeqDBAllocType alloc_type, CSeqDB::TSequenceRanges *partial_ranges, CSeqDB::TSequenceRanges *masks) const
Definition: seqdbimpl.cpp:776
bool OidToPig(int oid, int &pig) const
Translate an OID to a PIG.
Definition: seqdbimpl.cpp:1129
void GetAliasFileValues(TAliasFileValues &afv)
Get Name/Value Data From Alias Files.
Definition: seqdbimpl.cpp:1552
Uint8 GetExactTotalLength()
Returns the exact sum of the lengths of all available sequences.
Definition: seqdbimpl.cpp:851
void GetTaxIdsForSeqId(const CSeq_id &seq_id, vector< TTaxId > &taxids)
Definition: seqdbimpl.cpp:2707
CRef< CSeq_data > GetSeqData(int oid, TSeqPos begin, TSeqPos end) const
Fetch data as a CSeq_data object.
Definition: seqdbimpl.cpp:736
const map< string, string > & GetColumnMetaData(int column_id)
Get all metadata for the specified column.
Definition: seqdbimpl.cpp:2023
int GetSequence(int oid, const char **buffer) const
Get the sequence data for a sequence.
Definition: seqdbimpl.cpp:718
static void GetTaxInfo(TTaxId taxid, SSeqDBTaxInfo &info)
Get taxonomy information.
Definition: seqdbimpl.cpp:1618
void RetSequence(const char **buffer) const
Returns any resources associated with the sequence.
Definition: seqdbimpl.cpp:632
string GetAvailableMaskAlgorithmDescriptions()
Returns a formatted string with the list of available masking algorithms in this database for display...
Definition: seqdbimpl.cpp:2313
bool CheckOrFindOID(int &next_oid)
Find an included OID, incrementing next_oid if necessary.
Definition: seqdbimpl.cpp:265
void SeqidToOids(const CSeq_id &seqid, vector< int > &oids, bool multi)
Translate a CSeq-id to a list of OIDs.
Definition: seqdbimpl.cpp:1385
int GetNumSeqsStats() const
Returns the number of sequences available.
Definition: seqdbimpl.cpp:833
int GetAmbigSeq(int oid, char **buffer, int nucl_code, SSeqDBSlice *region, ESeqDBAllocType strategy, CSeqDB::TSequenceRanges *masks=NULL) const
Get a pointer to a range of sequence data with ambiguities.
Definition: seqdbimpl.cpp:754
void GetAvailableMaskAlgorithms(vector< int > &algorithms)
Get a list of algorithm IDs for which mask data exists.
Definition: seqdbimpl.cpp:2277
int GetSeqLengthApprox(int oid) const
Get the approximate sequence length.
Definition: seqdbimpl.cpp:429
void GetTaxIdsForOids(const vector< blastdb::TOid > &oids, set< TTaxId > &tax_ids)
Definition: seqdbimpl.cpp:1339
void GetTotals(ESummaryType sumtype, int *oid_count, Uint8 *total_length, bool use_approx)
Returns the sum of the sequence lengths.
Definition: seqdbimpl.cpp:1628
const CSeqDBGiList * GetGiList() const
Get GI list attached to this database.
Definition: seqdbimpl.hpp:580
void GetMaskData(int oid, int algo_id, CSeqDB::TSequenceRanges &ranges)
Get masked ranges of a sequence.
Definition: seqdbimpl.cpp:2512
bool TiToOid(Int8 ti, int &oid)
Translate a TI to an OID.
Definition: seqdbimpl.cpp:1142
Uint8 GetTotalLengthStats() const
Returns the sum of the lengths of all available sequences.
Definition: seqdbimpl.cpp:870
void SetVolsMemBit(int mbit)
Set the membership bit of all volumes.
Definition: seqdbimpl.cpp:2620
bool GiToOid(TGi gi, int &oid) const
Translate a GI to an OID.
Definition: seqdbimpl.cpp:1163
void SetOffsetRanges(int oid, const TRangeList &offset_ranges, bool append_ranges, bool cache_data)
Apply a range of offsets to a database sequence.
Definition: seqdbimpl.cpp:1832
list< CRef< CSeq_id > > GetSeqIDs(int oid)
Gets a list of sequence identifiers.
Definition: seqdbimpl.cpp:797
void AccessionToOids(const string &acc, vector< int > &oids)
Find OIDs matching the specified string.
Definition: seqdbimpl.cpp:1230
CRef< CBlast_def_line_set > GetHdr(int oid)
Get the sequence header data.
Definition: seqdbimpl.cpp:1042
int GetNumSeqs() const
Returns the number of sequences available.
Definition: seqdbimpl.cpp:827
Uint8 GetTotalLength() const
Returns the sum of the lengths of all available sequences.
Definition: seqdbimpl.cpp:845
bool PigToOid(int pig, int &oid) const
Translate a PIG to an OID.
Definition: seqdbimpl.cpp:1115
CSeqDB::EOidListType GetNextOIDChunk(int &begin_chunk, int &end_chunk, int oid_size, vector< int > &oid_list, int *oid_state)
Return a chunk of OIDs, and update the OID bookmark.
Definition: seqdbimpl.cpp:301
string GetDate() const
Returns the construction date of the database.
Definition: seqdbimpl.cpp:1003
string GetTitle() const
Returns the database title.
Definition: seqdbimpl.cpp:988
void ResetInternalChunkBookmark()
Restart chunk iteration at the beginning of the database.
Definition: seqdbimpl.cpp:398
int GetSeqLength(int oid) const
Get the sequence length.
Definition: seqdbimpl.cpp:405
CSeqDBIdSet GetIdSet()
Get IdSet list attached to this database.
Definition: seqdbimpl.cpp:1952
bool OidToGi(int oid, TGi &gi)
Translate an OID to a GI.
Definition: seqdbimpl.cpp:1212
CSeqDBIter.
Definition: seqdb.hpp:77
CSeqDBIter & operator++()
Increment operator.
Definition: seqdb.cpp:971
int m_Length
The length of this OID.
Definition: seqdb.hpp:141
const CSeqDB * m_DB
The CSeqDB object which this object iterates over.
Definition: seqdb.hpp:132
CSeqDBIter & operator=(const CSeqDBIter &)
Copy one iterator to another.
Definition: seqdb.cpp:955
void x_RetSeq()
Release hold on current sequence.
Definition: seqdb.hpp:1654
const char * m_Data
The sequence data for this OID.
Definition: seqdb.hpp:138
int m_OID
The OID this iterator is currently accessing.
Definition: seqdb.hpp:135
void x_GetSeq()
Get data pointer and length for the current sequence.
Definition: seqdb.hpp:1649
CSeqDBIter(const CSeqDBIter &)
Construct one iterator from another.
Definition: seqdb.cpp:943
CSeqDBNegativeList.
CSeqDB.
Definition: seqdb.hpp:161
void GetColumnBlob(int col_id, int oid, CBlastDbBlob &blob)
Fetch the data blob for the given column and oid.
Definition: seqdb.cpp:1220
int GetMinLength() const
Returns the length of the shortest sequence in the database.
Definition: seqdb.cpp:710
void GetDBTaxIds(set< TTaxId > &tax_ids) const
Get all unique tax ids from db.
Definition: seqdb.cpp:262
static void FindVolumePaths(const string &dbname, ESeqType seqtype, vector< string > &paths, vector< string > *alias_paths=NULL, bool recursive=true, bool expand_links=true)
Find volume paths.
Definition: seqdb.cpp:1040
bool OidToPig(int oid, int &pig) const
Translate an OID to a PIG.
Definition: seqdb.cpp:790
void TaxIdsToOids(set< TTaxId > &tax_ids, vector< blastdb::TOid > &rv) const
Get Oid list for input tax ids.
Definition: seqdb.cpp:257
bool GiToOidwFilterCheck(TGi gi, int &oid) const
Translate a GI To an OID with filter check.
Definition: seqdb.cpp:817
Uint8 GetTotalLength() const
Returns the sum of the lengths of all available sequences.
Definition: seqdb.cpp:685
static string ESeqType2String(ESeqType type)
Converts a CSeqDB sequence type into a human readable string.
Definition: seqdb.cpp:1328
void GetGis(int oid, vector< TGi > &gis, bool append=false) const
Gets a list of GIs for an OID.
Definition: seqdb.cpp:1070
bool PigToOid(int pig, int &oid) const
Translate a PIG to an OID.
Definition: seqdb.cpp:781
void SetIterationRange(int oid_begin, int oid_end)
Set Iteration Range.
Definition: seqdb.cpp:1093
void GetSequenceAsString(int oid, CSeqUtil::ECoding coding, string &output, TSeqRange range=TSeqRange()) const
Get a sequence in a given encoding.
Definition: seqdb.cpp:1141
int GetNumOIDs() const
Returns the size of the (possibly sparse) OID range.
Definition: seqdb.cpp:680
static string GenerateSearchPath()
Returns the default BLAST database search path configured for this local installation of BLAST.
Definition: seqdb.cpp:1340
TGi GetSeqGI(int oid) const
Returns the first Gi (if any) of the sequence.
Definition: seqdb.cpp:776
vector< int > ValidateMaskAlgorithms(const vector< int > &algorithm_ids)
Validates the algorithm IDs passed to this function, returning a vector of those algorithm IDs not pr...
Definition: seqdb.cpp:1242
Uint8 GetVolumeLength() const
Returns the sum of the lengths of all volumes.
Definition: seqdb.cpp:700
void GetAvailableMaskAlgorithms(vector< int > &algorithms)
Get a list of algorithm IDs for which mask data exists.
Definition: seqdb.cpp:1227
bool OidToGi(int oid, TGi &gi) const
Translate an OID to a GI.
Definition: seqdb.cpp:826
const string & GetDBNameList() const
Get list of database names.
Definition: seqdb.cpp:760
list< CRef< CSeq_id > > GetSeqIDs(int oid) const
Gets a list of sequence identifiers.
Definition: seqdb.cpp:765
Int8 GetDiskUsage() const
Retrieve the disk usage in bytes for this BLAST database.
Definition: seqdb.cpp:1464
string GetAvailableMaskAlgorithmDescriptions()
Returns a formatted string with the list of available masking algorithms in this database for display...
Definition: seqdb.cpp:1237
void ResetInternalChunkBookmark()
Resets this object's internal chunk bookmark, which is used when the oid_state argument to GetNextOID...
Definition: seqdb.cpp:755
EOidListType
Indicates how block of OIDs was returned.
Definition: seqdb.hpp:167
CRef< CSeq_data > GetSeqData(int oid, TSeqPos begin, TSeqPos end) const
Fetch data as a CSeq_data object.
Definition: seqdb.cpp:539
bool GiToPig(TGi gi, int &pig) const
Translate a GI to a PIG.
Definition: seqdb.cpp:854
void GetAliasFileValues(TAliasFileValues &afv)
Get Name/Value Data From Alias Files.
Definition: seqdb.cpp:1098
void RemoveOffsetRanges(int oid)
Remove any offset ranges for the given OID.
Definition: seqdb.cpp:1310
int GetMaxLength() const
Returns the length of the largest sequence in the database.
Definition: seqdb.cpp:705
int GetSeqLength(int oid) const
Returns the sequence length in base pairs or residues.
Definition: seqdb.cpp:400
bool PigToGi(int pig, TGi &gi) const
Translate a PIG to a GI.
Definition: seqdb.cpp:837
ESeqType GetSequenceType() const
Returns the type of database opened - protein or nucleotide.
Definition: seqdb.cpp:427
const CSeqDBGiList * GetGiList() const
Get GI list attached to this database.
Definition: seqdb.cpp:1120
ESeqType
Sequence types (eUnknown tries protein, then nucleotide).
Definition: seqdb.hpp:173
@ eNucleotide
Definition: seqdb.hpp:175
@ eUnknown
Definition: seqdb.hpp:176
@ eProtein
Definition: seqdb.hpp:174
bool SeqidToOid(const CSeq_id &seqid, int &oid) const
Translate a Seq-id to any matching OID.
Definition: seqdb.cpp:903
void RetAmbigSeq(const char **buffer) const
Returns any resources associated with the sequence.
Definition: seqdb.cpp:563
int GetOidAtOffset(int first_seq, Uint8 residue) const
Find the sequence closest to the given offset into the database.
Definition: seqdb.cpp:923
void SetOffsetRanges(int oid, const TRangeList &offset_ranges, bool append_ranges, bool cache_data)
Apply a range of offsets to a database sequence.
Definition: seqdb.cpp:1295
CRef< CBioseq > GetBioseq(int oid, TGi target_gi=ZERO_GI, const CSeq_id *target_seq_id=NULL) const
Get a CBioseq for a sequence.
Definition: seqdb.cpp:504
int GetAmbigPartialSeq(int oid, char **buffer, int nucl_code, ESeqDBAllocType strategy, TSequenceRanges *partial_ranges, TSequenceRanges *masks=NULL) const
Definition: seqdb.cpp:612
CRef< CBioseq > GetBioseqNoData(int oid, TGi target_gi=ZERO_GI, const CSeq_id *target_seq_id=NULL) const
Get a CBioseq for a sequence without sequence data.
Definition: seqdb.cpp:514
void GetTaxIDs(int oid, map< TGi, TTaxId > &gi_to_taxid, bool persist=false) const
Get taxid for an OID.
Definition: seqdb.cpp:441
void SetVolsMemBit(int mbit)
Set the membership of all volumes.
Definition: seqdb.cpp:1345
void GetTaxIdsForOids(const vector< blastdb::TOid > &oids, set< TTaxId > &tax_ids) const
Definition: seqdb.cpp:267
void GetMaskAlgorithmDetails(int algorithm_id, objects::EBlast_filter_program &program, string &program_name, string &algo_opts)
Get information about one type of masking available here.
Definition: seqdb.cpp:1263
void GetTotals(ESummaryType sumtype, int *oid_count, Uint8 *total_length, bool use_approx=true) const
Returns the sum of the sequence lengths.
Definition: seqdb.cpp:1110
void RetSequence(const char **buffer) const
Returns any resources associated with the sequence.
Definition: seqdb.cpp:523
string GetTitle() const
Returns the database title.
Definition: seqdb.cpp:630
int GetNumSeqs() const
Returns the number of sequences available.
Definition: seqdb.cpp:670
~CSeqDB()
Destructor.
Definition: seqdb.cpp:715
void GetTaxIdsForSeqId(const CSeq_id &seq_id, vector< TTaxId > &taxids)
Get all tax ids for a seq id.
Definition: seqdb.cpp:1758
EOidListType GetNextOIDChunk(int &begin_chunk, int &end_chunk, int oid_size, vector< int > &oid_list, int *oid_state=NULL)
Return a chunk of OIDs, and update the OID bookmark.
Definition: seqdb.cpp:739
void x_GetDBFilesMetaData(Int8 &disk_bytes, Int8 &cached_bytes, vector< string > &db_files, const string &user_path) const
Definition: seqdb.cpp:1610
CRef< CBlast_db_metadata > GetDBMetaData(string user_path=kEmptyStr)
Definition: seqdb.cpp:1705
void GetAllTaxIDs(int oid, set< TTaxId > &taxids) const
Get all tax ids for an oid.
Definition: seqdb.cpp:467
int GetSequence(int oid, const char **buffer) const
Get a pointer to raw sequence data.
Definition: seqdb.cpp:530
void AccessionToOids(const string &acc, vector< int > &oids) const
Translate an Accession to a list of OIDs.
Definition: seqdb.cpp:870
void ListColumns(vector< string > &titles)
List columns titles found in this database.
Definition: seqdb.cpp:1191
void GetTaxIdsForAccession(const string &accs, vector< TTaxId > &taxids)
Get all tax ids for an accession.
Definition: seqdb.cpp:1752
bool CheckOrFindOID(int &next_oid) const
Find an included OID, incrementing next_oid if necessary.
Definition: seqdb.cpp:728
string GetDate() const
Returns the construction date of the database.
Definition: seqdb.cpp:635
int GetNumSeqsStats() const
Returns the number of sequences available.
Definition: seqdb.cpp:675
ESummaryType
Types of summary information available.
Definition: seqdb.hpp:183
@ eFilteredAll
Values from alias files, or summation over all included sequences.
Definition: seqdb.hpp:188
int GetColumnId(const string &title)
Get an ID number for a given column title.
Definition: seqdb.cpp:1196
void SeqidToOids(const CSeq_id &seqid, vector< int > &oids) const
Translate a Seq-id to a list of OIDs.
Definition: seqdb.cpp:896
int GetMaskAlgorithmId(const string &algo_name) const
Get the numeric algorithm ID for a string.
Definition: seqdb.cpp:1232
int GetAmbigSeqAlloc(int oid, char **buffer, int nucl_code, ESeqDBAllocType strategy, TSequenceRanges *masks=NULL) const
Get a pointer to sequence data with ambiguities.
Definition: seqdb.cpp:591
static const string kOidNotFound
String containing the error message in exceptions thrown when a given OID cannot be found.
Definition: seqdb.hpp:316
bool TiToOid(Int8 ti, int &oid) const
Translate a TI to an OID.
Definition: seqdb.cpp:799
CSeqDBIter Begin() const
Returns a sequence iterator.
Definition: seqdb.cpp:723
const string & GetColumnValue(int column_id, const string &key)
Look up the value for a specific column metadata key.
Definition: seqdb.cpp:1207
static const char * kBlastDbDateFormat
Format string for the date returned by CSeqDB::GetDate.
Definition: seqdb.hpp:851
CRef< CBioseq > GiToBioseq(TGi gi) const
Get a CBioseq for a given GI.
Definition: seqdb.cpp:987
class CSeqDBImpl * m_Impl
Implementation details are hidden. (See seqdbimpl.hpp).
Definition: seqdb.hpp:1529
EBlastDbVersion GetBlastDbVersion() const
Return blast db version.
Definition: seqdb.cpp:1604
void SetNumberOfThreads(int num_threads, bool force_mt=false)
Setting the number of threads.
Definition: seqdb.cpp:1321
static void GetTaxInfo(TTaxId taxid, SSeqDBTaxInfo &info)
Get taxonomy information.
Definition: seqdb.cpp:1105
Uint8 GetExactTotalLength()
Returns the exact sum of the lengths of all available sequences.
Definition: seqdb.cpp:690
CRef< CBlast_def_line_set > GetHdr(int oid) const
Get the ASN.1 header for the sequence.
Definition: seqdb.cpp:418
void DebugDump(CDebugDumpContext ddc, unsigned int depth) const
Dump debug information for this object.
Definition: seqdb.cpp:1597
void AccessionsToOids(const vector< string > &accs, vector< blastdb::TOid > &oids) const
Definition: seqdb.cpp:252
Uint8 GetTotalLengthStats() const
Returns the sum of the lengths of all available sequences.
Definition: seqdb.cpp:695
int GetSeqLengthApprox(int oid) const
Returns an unbiased, approximate sequence length.
Definition: seqdb.cpp:409
CRef< CBioseq > SeqidToBioseq(const CSeq_id &seqid) const
Get a CBioseq for a given Seq-id.
Definition: seqdb.cpp:1021
CSeqDB()
No-argument Constructor.
Definition: seqdb.cpp:394
CRef< CBioseq > PigToBioseq(int pig) const
Get a CBioseq for a given PIG.
Definition: seqdb.cpp:1004
int GetAmbigSeq(int oid, const char **buffer, int nucl_code) const
Get a pointer to sequence data with ambiguities.
Definition: seqdb.cpp:550
void GetMaskData(int oid, const vector< int > &algo_ids, TSequenceRanges &ranges)
Get masked ranges of a sequence.
Definition: seqdb.hpp:1408
bool GiToOid(TGi gi, int &oid) const
Translate a GI to an OID.
Definition: seqdb.cpp:808
const map< string, string > & GetColumnMetaData(int column_id)
Get all metadata for the specified column.
Definition: seqdb.cpp:1202
CSeqDBIdSet GetIdSet() const
Get IdSet list attached to this database.
Definition: seqdb.cpp:1125
void FlushOffsetRangeCache()
Flush all offset ranges cached.
Definition: seqdb.cpp:1316
void GetLeafTaxIDs(int oid, map< TGi, set< TTaxId > > &gi_to_taxid_set, bool persist=false) const
Get taxid for an OID.
Definition: seqdb.cpp:473
@ e_Ncbi8na
Definition: sequtil.hpp:52
@ e_Iupacna
Definition: sequtil.hpp:47
@ e_Ncbistdaa
Definition: sequtil.hpp:58
@ e_Iupacaa
Definition: sequtil.hpp:55
CTimeFormat –.
Definition: ncbitime.hpp:131
CTime –.
Definition: ncbitime.hpp:296
Functor object for s_RemoveAliasComponents where the path name is matched in SSeqDBInitInfo.
Definition: seqdb.cpp:1386
string m_Path
Definition: seqdb.cpp:1394
PathFinder(const string &p)
Definition: seqdb.cpp:1388
bool operator()(const SSeqDBInitInfo &value) const
Definition: seqdb.cpp:1389
void clear()
Definition: map.hpp:169
Definition: map.hpp:338
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator begin() const
Definition: set.hpp:135
size_type size() const
Definition: set.hpp:132
static uch flags
static unsigned char depth[2 *(256+1+29)+1]
static void DLIST_NAME() append(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:78
static SQLCHAR output[256]
Definition: print.c:5
int offset
Definition: replacements.h:160
static FILE * f
Definition: readconf.c:23
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
#define ZERO_GI
Definition: ncbimisc.hpp:1088
#define NULL
Definition: ncbistd.hpp:225
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Trace(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1179
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
void FindFilesInDir(const CDir &dir, const vector< string > &masks, const vector< string > &masks_subdir, TFindFunc &find_func, TFindFiles flags=fFF_Default)
Find files in the specified directory.
Definition: ncbifile.hpp:3023
Int8 GetLength(void) const
Get size of file.
Definition: ncbifile.cpp:3204
virtual bool Remove(TRemoveFlags flags=eRecursive) const
Remove a directory entry.
Definition: ncbifile.cpp:2595
static string MakePath(const string &dir=kEmptyStr, const string &base=kEmptyStr, const string &ext=kEmptyStr)
Assemble a path from basic components.
Definition: ncbifile.cpp:413
EFindFiles
File finding flags.
Definition: ncbifile.hpp:3008
static char GetPathSeparator(void)
Get path separator symbol specific for the current platform.
Definition: ncbifile.cpp:433
string GetName(void) const
Get the base entry name with extension (if any).
Definition: ncbifile.hpp:3917
const string & GetPath(void) const
Get entry path.
Definition: ncbifile.hpp:3911
virtual bool Exists(void) const
Check existence of file.
Definition: ncbifile.hpp:4039
@ fFF_Recursive
descend into sub-dirs
Definition: ncbifile.hpp:3013
@ fFF_File
find files
Definition: ncbifile.hpp:3009
@ fParse_RawText
Try to ID raw non-numeric accessions.
Definition: Seq_id.hpp:81
@ fParse_ValidLocal
Treat otherwise unidentified strings as raw accessions, provided that they pass rudimentary validatio...
Definition: Seq_id.hpp:87
virtual void DebugDump(CDebugDumpContext ddc, unsigned int depth) const
Define method for dumping debug information.
Definition: ncbiobj.cpp:988
TObjectType * GetPointerOrNull(void) THROWS_NONE
Get pointer value.
Definition: ncbiobj.hpp:986
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
#define kEmptyStr
Definition: ncbistr.hpp:123
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3452
#define NPOS
Definition: ncbistr.hpp:133
static TNumeric StringToNumeric(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to a numeric value.
Definition: ncbistr.hpp:330
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5078
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2882
static string Join(const TContainer &arr, const CTempString &delim)
Join strings using the specified delimiter.
Definition: ncbistr.hpp:2699
static string & Replace(const string &src, const string &search, const string &replace, string &dst, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3305
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5406
@ fConvErr_NoThrow
Do not throw an exception on error.
Definition: ncbistr.hpp:285
@ fSplit_Truncate
Definition: ncbistr.hpp:2503
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2510
@ fSplit_MergeDelimiters
Merge adjacent delimiters.
Definition: ncbistr.hpp:2500
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
string AsString(const CTimeFormat &format=kEmptyStr, TSeconds out_tz=eCurrentTimeZone) const
Transform time to string.
Definition: ncbitime.cpp:1512
bool IsEmpty(void) const
Is time object empty (date and time)?
Definition: ncbitime.hpp:2377
static CTimeFormat GetPredefined(EPredefined fmt, TFlags flags=fDefault)
Get predefined format.
Definition: ncbitime.cpp:389
@ eISO8601_DateTimeSec
Y-M-DTh:m:s (eg 1997-07-16T19:20:30)
Definition: ncbitime.hpp:196
strategy
Block allocation strategies.
Definition: bmconst.h:146
EBlast_filter_program
This defines the possible sequence filtering algorithms to be used in a BLAST database.
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:6929
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
FILE * file
int i
static MDB_envinfo info
Definition: mdb_load.c:37
range(_Ty, _Ty) -> range< _Ty >
constexpr auto sort(_Init &&init)
constexpr bool empty(list< Ts... >) noexcept
<lmdb++.h> - C++11 wrapper for LMDB.
Definition: lmdb++.h:37
const struct ncbi::grid::netcache::search::fields::SIZE size
const struct ncbi::grid::netcache::search::fields::KEY key
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
unsigned int a
Definition: ncbi_localip.c:102
std::istream & in(std::istream &in_, double &x_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
static uint8_t * buffer
Definition: pcre2test.c:1016
CSeqDB::ESeqType ParseMoleculeTypeString(const string &s)
Convert a string to a CSeqDB ESeqType object.
Definition: seqdb.cpp:1527
static char s_GetSeqTypeChar(CSeqDB::ESeqType seqtype)
Helper function to translate enumerated type to character.
Definition: seqdb.cpp:62
static CSeqDBImpl * s_SeqDBInit(const string &dbname, char prot_nucl, int oid_begin, int oid_end, bool use_atlas_lock, CSeqDBGiList *gi_list=NULL, CSeqDBNegativeList *neg_list=NULL, CSeqDBIdSet idset=CSeqDBIdSet())
Helper function to build private implementation object.
Definition: seqdb.cpp:111
bool DeleteBlastDb(const string &dbpath, CSeqDB::ESeqType seq_type)
Deletes all files associated with a BLAST database.
Definition: seqdb.cpp:1542
vector< SSeqDBInitInfo > FindBlastDBs(const string &path, const string &dbtype, bool recurse, bool include_alias_files, bool remove_redundant_dbs)
Find BLAST DBs in the directory specified.
Definition: seqdb.cpp:1429
static void s_RemoveAliasComponents(CBlastDbFinder &finder)
Definition: seqdb.cpp:1397
Defines BLAST database access classes.
ESeqDBAllocType
Certain methods have an "Alloc" version.
@ eMalloc
void SeqDB_GetLMDBFileExtensions(bool db_is_protein, vector< string > &extn)
Retrieves file extensions for BLAST LMDB files.
const int kSeqDBNuclNcbiNA8
Used to request ambiguities in Ncbi/NA8 format.
void SeqDB_GetFileExtensions(bool db_is_protein, vector< string > &extensions, EBlastDbVersion dbver=eBDB_Version4)
Retrieves a list of all supported file extensions for BLAST databases.
EBlastDbVersion
BLAST database version.
Definition: seqdbcommon.hpp:51
@ eBDB_Version4
Definition: seqdbcommon.hpp:52
@ eBDB_Version5
Definition: seqdbcommon.hpp:53
This file defines several SeqDB utility functions related to byte order and file system portability.
void SeqDB_CombineAndQuote(const vector< string > &dbs, string &dbname)
Combine and quote list of database names.
const U & SeqDB_MapFind(const std::map< T, U > &m, const T &k, const U &dflt)
Find a map value or return a default.
T SeqDB_GetStdOrd(const T *stdord_obj)
Read a network order integer value.
The top level of the private implementation layer for SeqDB.
static SLJIT_INLINE sljit_ins l(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
List of sequence offset ranges.
Definition: seqdb.hpp:236
Structure to define basic information to initialize a BLAST DB.
Definition: seqdb.hpp:1541
OID-Range type to simplify interfaces.
SSeqDBTaxInfo.
Definition: type.c:6
#define _ASSERT
else result
Definition: token2.c:20
Modified on Fri Sep 20 14:57:01 2024 by modify_doxy.py rev. 669887