NCBI C++ ToolKit
seqdb.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef OBJTOOLS_BLAST_SEQDB_READER___SEQDB__HPP
2 #define OBJTOOLS_BLAST_SEQDB_READER___SEQDB__HPP
3 
4 /* $Id: seqdb.hpp 97719 2022-08-18 16:44:30Z fongah2 $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author: Kevin Bealer
30  *
31  */
32 
33 /// @file seqdb.hpp
34 /// Defines BLAST database access classes.
35 ///
36 /// Defines classes:
37 /// CSeqDB
38 /// CSeqDBSequence
39 ///
40 /// Implemented for: UNIX, MS-Windows
41 
42 
50 #include <objects/seq/Bioseq.hpp>
51 #include <objects/seq/Seq_data.hpp>
55 #include <util/sequtil/sequtil.hpp>
56 #include <util/range.hpp>
57 #include <set>
58 #include <objmgr/bioseq_handle.hpp>
59 
61 
62 /// Include definitions from the objects namespace.
64 
65 
66 /// Forward declaration of CSeqDB class
67 class CSeqDB;
68 
69 
70 /// CSeqDBIter
71 ///
72 /// Small class to iterate over a seqdb database.
73 ///
74 /// This serves something of the same role for a CSeqDB object that a
75 /// vector iterator might serve in the standard template library.
76 
78 public:
79  /// Destructor.
/// Calls x_RetSeq() so that the hold on the currently held sequence is
/// released when the iterator is destroyed.
80  virtual ~CSeqDBIter()
81  {
82  x_RetSeq();
83  }
84 
85  /// Increment operator
86  ///
87  /// Returns the currently held sequence and gets pointers to the
88  /// next sequence.
89  CSeqDBIter & operator++();
90 
91  /// Get the OID of the currently held sequence.
/// @return The stored m_OID value.
92  int GetOID()
93  {
94  return m_OID;
95  }
96 
97  /// Get the sequence data for the currently held sequence.
/// @return The stored m_Data pointer; this accessor does not copy the data.
98  const char * GetData()
99  {
100  return m_Data;
101  }
102 
103  /// Get the length (in base pairs) of the currently held sequence.
/// @return The stored m_Length. The bool conversion declared right after
/// this method tests m_Length != -1 to decide whether the iterator is valid.
104  int GetLength()
105  {
106  return m_Length;
107  }
108 
109  /// Returns true if the iterator points to a valid sequence.
110  DECLARE_OPERATOR_BOOL(m_Length != -1);
111 
112  /// Construct one iterator from another.
113  CSeqDBIter(const CSeqDBIter &);
114 
115  /// Copy one iterator to another.
116  CSeqDBIter & operator =(const CSeqDBIter &);
117 
118 private:
119  /// Get data pointer and length for the current sequence.
120  inline void x_GetSeq();
121 
122  /// Release hold on current sequence.
123  inline void x_RetSeq();
124 
125  /// CSeqDB is a friend so it alone can create objects of this type.
126  friend class CSeqDB;
127 
128  /// Build an iterator (called only from CSeqDB).
129  CSeqDBIter(const CSeqDB *, int oid);
130 
131  /// The CSeqDB object which this object iterates over.
132  const CSeqDB * m_DB;
133 
134  /// The OID this iterator is currently accessing.
135  int m_OID;
136 
137  /// The sequence data for this OID.
138  const char * m_Data;
139 
140  /// The length of this OID.
141  int m_Length;
142 };
143 
144 
145 /// Forward declaration of CSeqDBGiList base class.
146 class CSeqDBGiList;
147 
148 /// Forward declaration of CSeqDBIdSet class.
149 class CSeqDBIdSet;
150 
151 
152 /// CSeqDB
153 ///
154 /// User interface class for blast databases.
155 ///
156 /// This class provides the top-level interface class for BLAST
157 /// database users. It defines access to the database component by
158 /// calling methods on objects which represent the various database
159 /// files, such as the index, header, sequence, and alias files.
160 
162 public:
163  /// Import type to allow shorter name.
165 
166  /// Indicates how block of OIDs was returned.
169  eOidRange
170  };
171 
172  /// Sequence types (eUnknown tries protein, then nucleotide).
173  enum ESeqType {
176  eUnknown
177  };
178 
179  /// Converts a CSeqDB sequence type into a human readable string
180  static string ESeqType2String(ESeqType type);
181 
182  /// Types of summary information available.
184  /// Sum of all sequences, ignoring GI and OID lists and alias files.
186 
187  /// Values from alias files, or summation over all included sequences.
189 
190  /// Sum of included sequences with OIDs within the iteration range.
191  eFilteredRange
192  };
193 
194  /// File type for which mmap strategy may be set.
196  /// Index files (name ends with ".pin" or ".nin").
198 
199  /// Sequence files (name ends with ".psq" or ".nsq").
200  eMmap_SequenceFile
201  };
202 
203  /// Permitted mmap strategies.
205  /// Normal, no special behavior (should undo next two options).
207 
208  /// Expect sequential page references.
210 
211  /// Expect access in the near future.
212  eMmap_WillNeed
213  };
214 
215  /// Sequence type accepted and returned for OID indices.
216  typedef int TOID;
217 
218  /// Sequence type accepted and returned for PIG indices.
219  typedef int TPIG;
220 
221  /// Sequence type accepted and returned for GI indices.
222  typedef TGi TGI;
223 
224  /// Structure to represent a range as a pair of offsets (first, second).
225  struct TOffsetPair {
228 
229  /// Default constructor; initializes both offsets to zero.
230  TOffsetPair() : first(0), second(0) {}
231  /// Convenient operator to convert to TSeqRange.
/// The upper bound handed to TSeqRange is second-1, so `second` presumably
/// marks one past the last position -- TODO confirm against TSeqRange.
232  operator TSeqRange() const { return TSeqRange(first, second-1); }
233  };
234 
235  /// List of sequence offset ranges.
237  typedef size_t size_type;
239  typedef const value_type* const_iterator;
240 
241  private:
245 
/// Set the size and capacity to zero and the data pointer to NULL.
/// This only overwrites the three fields; it does not free any memory itself.
246  void x_reset_all() {
247  _size = 0;
248  _capacity = 0;
249  _data = NULL;
250  }
251 
253  static size_t kResizeFactor = 2;
254  if (_size + 1 > _capacity) {
255  reserve((_capacity + 1) * kResizeFactor -1);
256  }
257  }
258 
259  public:
261  x_reset_all();
262  reserve(7); // must reserve at least 1 element
263  }
264 
266  free(_data);
267  x_reset_all();
268  }
269 
/// Drop all elements by resetting the element count to zero; the capacity
/// and the data pointer are left untouched.
270  void clear() { _size = 0; }
271 
/// @return true when the element count is zero.
272  bool empty() const { return _size == 0; }
273 
/// @return The number of elements currently stored (not the capacity).
274  size_type size() const { return _size; }
275 
/// Iterator to the first element.
/// Elements start at _data[1]; slot 0 of the buffer is skipped (what it is
/// used for is not visible in this part of the file).
276  const_iterator begin() const { return const_iterator(&_data[1]); }
277 
/// Iterator one past the last element.
/// Each element advances the _data index by 2, so the end sits at
/// _data[1 + 2*_size].
278  const_iterator end() const { return const_iterator(&_data[1+ 2*_size]); }
279 
/// Access element i, which lives at _data[1 + 2*i].
/// No bounds check is performed on i.
280  value_type& operator[](size_type i) { return (value_type &)_data[1+ 2*i]; }
281 
/// Raw pointer to the start of the buffer (_data itself, i.e. the slot
/// before the one begin() points at), cast to value_type*.
/// Although this method is const, the returned pointer is non-const.
282  value_type * get_data() const { return (value_type *) _data; }
283 
284  /// Reserves capacity for at least num_elements elements.
/// Does nothing when num_elements does not exceed the current capacity, so
/// the buffer is never shrunk here. Otherwise the buffer is resized with
/// realloc() to hold num_elements + 1 value_type-sized slots. _data and
/// _capacity are updated only after realloc() has succeeded.
285  /// @throw CSeqDBException in case of memory allocation failure
286  void reserve(size_t num_elements) {
287  if (num_elements > _capacity) {
288  value_type* reallocation =
289  (value_type*) realloc(_data, (num_elements + 1) *
290  sizeof(value_type));
291  if ( !reallocation ) {
292  string msg("Failed to allocate ");
293  msg += NStr::SizetToString(num_elements + 1) + " elements";
294  NCBI_THROW(CSeqDBException, eMemErr, msg);
295  }
296  _data = (TSeqPos*) reallocation;
297  _capacity = num_elements;
298  }
299  }
300 
301  /// Append extra elements at the end
/// @param src
/// Source buffer; num_elements * sizeof(value_type) bytes are copied from
/// it with memcpy().
/// @param num_elements
/// Number of elements to copy; the element count grows by this amount.
/// NOTE(review): reserve() may realloc the buffer before the copy, so src
/// presumably must not point into this object's own storage -- confirm.
302  void append(const void *src, size_type num_elements) {
303  reserve(_size + num_elements);
304  memcpy(&_data[1+ 2*_size], src, num_elements * sizeof(value_type));
305  _size += num_elements;
306  }
307 
308  /// Append extra element at the end
/// Calls x_reallocate_if_necessary() first and then copies the element in
/// through append().
/// NOTE(review): if `element` refers to an element already stored in this
/// object, a reallocation triggered here could leave the reference dangling
/// before append() copies from it -- confirm whether callers ever do that.
309  void push_back(const value_type& element) {
310  x_reallocate_if_necessary();
311  append(&element, 1);
312  }
313  };
314  /// String containing the error message in exceptions thrown when a given
315  /// OID cannot be found
316  static const string kOidNotFound;
317 
318  /// Short Constructor
319  ///
320  /// This version of the constructor assumes memory mapping and
321  /// that the entire possible OID range will be included. Please
322  /// use quotes ("") around database names that contain space
323  /// characters.
324  ///
325  /// @param dbname
326  /// A list of database or alias names, separated by spaces
327  /// @param seqtype
328  /// Specify eProtein, eNucleotide, or eUnknown.
329  /// @param gilist
330  /// The database will be filtered by this GI list if non-null.
331  /// @param use_atlas_lock
332  /// Enable/disable thread synchronization. If true, a single Atlas mutex
333  /// will be used to protect most of the critical parts of the code.
334  /// If false, CSeqDBAtlas::Lock and CSeqDBAtlas::Unlock functions
335  /// will be no-ops. If each thread accesses
336  /// a different database volume, then setting this parameter to false
337  /// will reduce contention. Otherwise it should be set to true.
338  CSeqDB(const string & dbname, ESeqType seqtype, CSeqDBGiList * gilist = 0,
339  bool use_atlas_lock = true);
340 
341  /// Short Constructor with Negative ID list.
342  ///
343  /// This version of the constructor assumes the entire OID range
344  /// will be included, and applies filtering by a negative ID list.
345  /// Please use quotes ("") around database names that contain
346  /// space characters.
347  ///
348  /// @param dbname
349  /// A list of database or alias names, separated by spaces
350  /// @param seqtype
351  /// Specify eProtein, eNucleotide, or eUnknown.
352  /// @param nlist
353  /// The database will be filtered to not include these GIs or TIs.
354  CSeqDB(const string & dbname,
355  ESeqType seqtype,
356  CSeqDBNegativeList * nlist);
357 
358  /// Short Constructor with Positive and Negative ID list.
359  ///
360  /// This version of the constructor assumes the entire OID range
361  /// will be included, and applies filtering by positive and negative ID lists.
362  /// Please use quotes ("") around database names that contain
363  /// space characters.
364  ///
365  /// @param dbname
366  /// A list of database or alias names, separated by spaces
367  /// @param seqtype
368  /// Specify eProtein, eNucleotide, or eUnknown.
369  /// @param nlist
370  /// The database will be filtered to not include these GIs or TIs.
371  CSeqDB(const string & dbname,
372  ESeqType seqtype,
373  CSeqDBGiList * gilist,
374  CSeqDBNegativeList * nlist);
375 
376  /// Short Constructor with Positive and Negative ID list with oid range.
377  ///
378  /// This version of the constructor assumes the entire OID range
379  /// will be included, and applies filtering by a negative ID list.
380  /// Please use quotes ("") around database names that contain
381  /// space characters.
382  ///
383  /// @param dbname
384  /// A list of database or alias names, separated by spaces
385  /// @param seqtype
386  /// Specify eProtein, eNucleotide, or eUnknown.
387  /// @param nlist
388  /// The database will be filtered to not include these GIs or TIs.
389  CSeqDB(const string & dbname,
390  ESeqType seqtype,
391  int oid_begin,
392  int oid_end,
393  CSeqDBGiList * gilist,
394  CSeqDBNegativeList * nlist);
395 
396  /// Short Constructor with Computed ID list.
397  ///
398  /// This version of the constructor takes a computed CSeqDBIdSet
399  /// list which can be positive or negative. This is equivalent to
400  /// building a positive or negative list from the IdSet object and
401  /// passing it into one of the previous constructors.
402  ///
403  /// @param dbname
404  /// A list of database or alias names, separated by spaces
405  /// @param seqtype
406  /// Specify eProtein, eNucleotide, or eUnknown.
407  /// @param ids
408  /// The database will be filtered by this set of IDs.
409  CSeqDB(const string & dbname, ESeqType seqtype, CSeqDBIdSet ids);
410 
411  /// Short Constructor
412  ///
413  /// This version of the constructor assumes memory mapping and
414  /// that the entire possible OID range will be included.
415  ///
416  /// @param dbs
417  /// A list of database or alias names.
418  /// @param seqtype
419  /// Specify eProtein, eNucleotide, or eUnknown.
420  /// @param gilist
421  /// The database will be filtered by this GI list if non-null.
422  CSeqDB(const vector<string> & dbs,
423  ESeqType seqtype,
424  CSeqDBGiList * gilist = 0);
425 
426  /// Constructor with MMap Flag and OID Range.
427  ///
428  /// If the oid_end value is specified as zero, or as a value
429  /// larger than the number of OIDs, it will be adjusted to the
430  /// number of OIDs in the database. Specifying 0,0 for the start
431  /// and end will cause inclusion of the entire database. This
432  /// version of the constructor is obsolete because the sequence
433  /// type is specified as a character (eventually only the ESeqType
434  /// version will exist). Please use quotes ("") around database
435  /// names that contain space characters.
436  ///
437  /// @param dbname
438  /// A list of database or alias names, separated by spaces.
439  /// @param seqtype
440  /// Specify eProtein, eNucleotide, or eUnknown.
441  /// @param oid_begin
442  /// Iterator will skip OIDs less than this value. Only OIDs
443  /// found in the OID lists (if any) will be returned.
444  /// @param oid_end
445  /// Iterator will return up to (but not including) this OID.
446  /// @param use_mmap
447  /// If kSeqDBMMap is specified (the default), memory mapping is
448  /// attempted. If kSeqDBNoMMap is specified, or memory mapping
449  /// fails, or this platform does not support it, the less efficient
450  /// read and write calls are used instead.
451  /// @param gi_list
452  /// The database will be filtered by this GI list if non-null.
453  CSeqDB(const string & dbname,
454  ESeqType seqtype,
455  int oid_begin,
456  int oid_end,
457  bool use_mmap,
458  CSeqDBGiList * gi_list = 0);
459 
460  /// Constructor with MMap Flag and OID Range.
461  ///
462  /// If the oid_end value is specified as zero, or as a value
463  /// larger than the number of OIDs, it will be adjusted to the
464  /// number of OIDs in the database. Specifying 0,0 for the start
465  /// and end will cause inclusion of the entire database. This
466  /// version of the constructor is obsolete because the sequence
467  /// type is specified as a character (eventually only the ESeqType
468  /// version will exist).
469  ///
470  /// @param dbname
471  /// A list of database or alias names.
472  /// @param seqtype
473  /// Specify eProtein, eNucleotide, or eUnknown.
474  /// @param oid_begin
475  /// Iterator will skip OIDs less than this value. Only OIDs
476  /// found in the OID lists (if any) will be returned.
477  /// @param oid_end
478  /// Iterator will return up to (but not including) this OID.
479  /// @param use_mmap
480  /// If kSeqDBMMap is specified (the default), memory mapping is
481  /// attempted. If kSeqDBNoMMap is specified, or memory mapping
482  /// fails, or this platform does not support it, the less efficient
483  /// read and write calls are used instead.
484  /// @param gi_list
485  /// The database will be filtered by this GI list if non-null.
486  CSeqDB(const vector<string> & dbname,
487  ESeqType seqtype,
488  int oid_begin,
489  int oid_end,
490  bool use_mmap,
491  CSeqDBGiList * gi_list = 0);
492 
493  /// Destructor.
494  ///
495  /// This will return resources acquired by this object, including
496  /// any gotten by the GetSequence() call, whether or not they have
497  /// been returned by RetSequence().
498  ~CSeqDB();
499 
500  /// Returns the default BLAST database search path
501  /// configured for this local installation of BLAST
502  static string GenerateSearchPath();
503 
504  /// Returns the sequence length in base pairs or residues.
505  int GetSeqLength(int oid) const;
506 
507  /// Returns the first Gi (if any) of the sequence. This method does NOT
508  /// check whether the OID in question belongs to the BLAST database after
509  /// all filtering is applied (e.g.: GI list restriction or membership bit).
510  /// If you need those checks, please use GetGis()
511  /// @sa GetGis
512  TGi GetSeqGI(int oid) const;
513 
514  /// Returns an unbiased, approximate sequence length.
515  ///
516  /// For protein DBs, this method is identical to GetSeqLength().
517  /// In the nucleotide case, computing the exact length requires
518  /// examination of the sequence data. This method avoids doing
519  /// that, returning an approximation ranging from L-3 to L+3
520  /// (where L indicates the exact length), and unbiased on average.
521  int GetSeqLengthApprox(int oid) const;
522 
523  /// Get the ASN.1 header for the sequence.
524  ///
525  /// Do not modify the object returned here (e.g. by removing some
526  /// of the deflines), as the object is cached internally and
527  /// future operations on this OID may be affected.
528  ///
529  /// @param oid The ordinal ID of the sequence.
530  /// @return The blast deflines for this sequence.
531  CRef<CBlast_def_line_set> GetHdr(int oid) const;
532 
533  /// Get taxid for an OID.
534  ///
535  /// This finds the leaf-node TAXIDS associated with a given OID and
536  /// computes a mapping from GI to a set of taxids. This mapping is added to the
537  /// map<TGi,set<TTaxId>> provided by the user. If the "persist" flag is
538  /// set to true, the new associations will simply be added to the
539  /// map. If it is false (the default), the map will be cleared
540  /// first.
541  ///
542  /// @param oid
543  /// The ordinal id of the sequence.
544  /// @param gi_to_taxid_set
545  /// A returned mapping from GI to set of taxids.
546  /// @param persist
547  /// If false, the map will be cleared before adding new entries.
548  void GetLeafTaxIDs(
549  int oid,
550  map<TGi, set<TTaxId> >& gi_to_taxid_set,
551  bool persist = false
552  ) const;
553 
554  /// Get taxids for an OID.
555  ///
556  /// This finds the leaf-node TAXIDS associated with a given OID and
557  /// returns them in a vector. If the "persist" flag is set to true, the
558  /// new taxids will simply be appended to the vector. If it is
559  /// false (the default), the vector will be cleared first. One
560  /// advantage of this interface over the map<int,set<int>> version is
561  /// that the vector interface works with databases with local IDs
562  /// but lacking GIs.
563  ///
564  /// @param oid
565  /// The ordinal id of the sequence.
566  /// @param taxids
567  /// A returned vector of taxids.
568  /// @param persist
569  /// If false, the vector will be cleared before adding new entries.
570  void GetLeafTaxIDs(
571  int oid,
572  vector<TTaxId> & taxids,
573  bool persist = false
574  ) const;
575 
576  /// Get taxid for an OID.
577  ///
578  /// This finds the TAXIDS associated with a given OID and computes
579  /// a mapping from GI to taxid. This mapping is added to the
580  /// map<TGi,TTaxId> provided by the user. If the "persist" flag is
581  /// set to true, the new associations will simply be added to the
582  /// map. If it is false (the default), the map will be cleared
583  /// first.
584  ///
585  /// @param oid
586  /// The ordinal id of the sequence.
587  /// @param gi_to_taxid
588  /// A returned mapping from GI to taxid.
589  /// @param persist
590  /// If false, the map will be cleared before adding new entries.
591  void GetTaxIDs(int oid,
592  map<TGi, TTaxId> & gi_to_taxid,
593  bool persist = false) const;
594 
595  /// Get taxids for an OID.
596  ///
597  /// This finds the TAXIDS associated with a given OID and returns
598  /// them in a vector. If the "persist" flag is set to true, the
599  /// new taxids will simply be appended to the vector. If it is
600  /// false (the default), the vector will be cleared first. One
601  /// advantage of this interface over the map<int,int> version is
602  /// that the vector interface works with databases with local IDs
603  /// but lacking GIs.
604  ///
605  /// @param oid
606  /// The ordinal id of the sequence.
607  /// @param taxids
608  /// A returned list of taxids.
609  /// @param persist
610  /// If false, the vector will be cleared before adding new entries.
611  void GetTaxIDs(int oid,
612  vector<TTaxId> & taxids,
613  bool persist = false) const;
614 
615  /// Get all tax ids for an oid
616  ///
617  /// This includes leaf and non-leaf tax ids associated with the oid
618  /// @param oid
619  /// The ordinal id of the sequence.
620  /// @param taxids
621  /// A returned set of taxids.
622  void GetAllTaxIDs(int oid,
623  set<TTaxId> & taxids) const;
624 
625  /// Get a CBioseq for a sequence.
626  ///
627  /// This builds and returns the header and sequence data
628  /// corresponding to the indicated sequence as a CBioseq. If
629  /// target_gi is non-zero or target_seq_id is non-null, the header
630  /// information will be filtered to only include the defline associated
631  /// with that gi/seq_id.
632  ///
633  /// @param oid
634  /// The ordinal id of the sequence.
635  /// @param target_gi
636  /// If nonzero, the target gi to filter the header information by.
637  /// @param target_seq_id
638  /// The target seq_id to filter the header information by.
639  /// @return
640  /// A CBioseq object corresponding to the sequence.
641  CRef<CBioseq> GetBioseq(int oid,
642  TGi target_gi = ZERO_GI,
643  const CSeq_id * target_seq_id = NULL) const;
644 
645  /// Get a CBioseq for a sequence without sequence data.
646  ///
647  /// This builds and returns the data corresponding to the
648  /// indicated sequence as a CBioseq, but without the sequence
649  /// data. It is used when processing large sequences, to avoid
650  /// accessing unused parts of the sequence.
651  ///
652  /// @param oid
653  /// The ordinal id of the sequence.
654  /// @param target_gi
655  /// If nonzero, the target gi to filter the header information by.
656  /// @param target_seq_id
657  /// The target seq_id to filter the header information by.
658  /// @return
659  /// A CBioseq object corresponding to the sequence, but without
660  /// sequence data.
661  CRef<CBioseq> GetBioseqNoData(int oid,
662  TGi target_gi = ZERO_GI,
663  const CSeq_id * target_seq_id = NULL) const;
664 
665  /// Extract a Blast-def-line-set object from a Bioseq retrieved by CSeqDB
666  /// @param bioseq Bioseq retrieved from CSeqDB [in]
668  ExtractBlastDefline(const CBioseq & bioseq);
669  /// Extract a Blast-def-line-set object from a Bioseq_Handle retrieved by
670  /// CSeqDB
671  /// @param handle Bioseq handle retrieved from CSeqDB [in]
673  ExtractBlastDefline(const CBioseq_Handle& handle);
674 
675  /// Get a pointer to raw sequence data.
676  ///
677  /// Get the raw sequence (strand data). When done, resources
678  /// should be returned with RetSequence. This data pointed to
679  /// by *buffer is in read-only memory (where supported).
680  ///
681  /// @param oid
682  /// The ordinal id of the sequence.
683  /// @param buffer
684  /// A returned pointer to the data in the sequence.
685  /// @return
686  /// The return value is the sequence length (in base pairs or
687  /// residues). In case of an error, an exception is thrown.
688  int GetSequence(int oid, const char ** buffer) const;
689 
690  /// Get a pointer to sequence data with ambiguities.
691  ///
692  /// In the protein case, this is identical to GetSequence(). In
693  /// the nucleotide case, it stores 2 bases per byte instead of 4.
694  /// The third parameter indicates the encoding for nucleotide
695  /// data, either kSeqDBNuclNcbiNA8 or kSeqDBNuclBlastNA8, ignored
696  /// if the sequence is a protein sequence. When done, resources
697  /// should be returned with RetSequence.
698  ///
699  /// @param oid
700  /// The ordinal id of the sequence.
701  /// @param buffer
702  /// A returned pointer to the data in the sequence.
703  /// @param nucl_code
704  /// The encoding to use for the returned sequence data.
705  /// @return
706  /// The return value is the sequence length (in base pairs or
707  /// residues). In case of an error, an exception is thrown.
708  int GetAmbigSeq(int oid, const char ** buffer, int nucl_code) const;
709 
710  /// Get a pointer to a range of sequence data with ambiguities.
711  ///
712  /// This is like GetAmbigSeq(), but only a range of the sequence
713  /// data is computed and returned. When done, resources should be
714  /// returned with RetSequence.
715  ///
716  /// @param oid
717  /// The ordinal id of the sequence.
718  /// @param buffer
719  /// A returned pointer to the data in the sequence.
720  /// @param nucl_code
721  /// The encoding to use for the returned sequence data.
722  /// @param begin_offset
723  /// The zero-based offset at which to start translating.
724  /// @param end_offset
725  /// The zero-based offset at which to end translation.
726  /// @return
727  /// The return value is the subsequence length (in base pairs or
728  /// residues). In case of an error, an exception is thrown.
729  int GetAmbigSeq(int oid,
730  const char ** buffer,
731  int nucl_code,
732  int begin_offset,
733  int end_offset) const;
734 
735  /// Get a pointer to sequence data with ambiguities.
736  ///
737  /// This is like GetAmbigSeq(), but the allocated object should be
738  /// deleted by the caller. This is intended for users who are
739  /// going to modify the sequence data, or are going to mix the
740  /// data into a container with other data, and who are mixing data
741  /// from multiple sources and want to free the data in the same
742  /// way. The fourth parameter should be given one of the values
743  /// from EAllocStrategy; the corresponding method should be used
744  /// to delete the object. Note that "delete[]" should be used
745  /// instead of "delete"
746  ///
747  /// @param oid
748  /// Ordinal ID.
749  /// @param buffer
750  /// Address of a char pointer to access the sequence data.
751  /// @param nucl_code
752  /// The NA encoding, kSeqDBNuclNcbiNA8 or kSeqDBNuclBlastNA8.
753  /// @param strategy
754  /// Indicate which allocation strategy to use.
755  /// @param masks
756  /// If not empty, the return sequence will be (hard) masked.
757  /// Masks are cleared on return.
758  /// @return
759  /// The return value is the sequence length (in base pairs or
760  /// residues). In case of an error, an exception is thrown.
761  int GetAmbigSeqAlloc(int oid,
762  char ** buffer,
763  int nucl_code,
765  TSequenceRanges * masks = NULL) const;
766 
767  int GetAmbigPartialSeq(int oid,
768  char ** buffer,
769  int nucl_code,
771  TSequenceRanges * partial_ranges,
772  TSequenceRanges * masks = NULL) const;
773 
774  /// Returns any resources associated with the sequence.
775  ///
776  /// Calls to GetSequence (but not GetBioseq())
777  /// either increment a counter corresponding to a section of the
778  /// database where the sequence data lives, or allocate a buffer
779  /// to return to the user. This method decrements that counter or
780  /// frees the allocated buffer, so that the memory can be used by
781  /// other processes. Each allocating call should be paired with a
782  /// returning call. Note that this does not apply to GetBioseq(),
783  /// or GetHdr(), for example.
784  ///
785  /// @param buffer
786  /// A pointer to the sequence data to release.
787  void RetSequence(const char ** buffer) const;
788 
789  /// Returns any resources associated with the sequence.
790  ///
791  /// Calls to GetAmbigSeq (but not GetBioseq())
792  /// either increment a counter corresponding to a section of the
793  /// database where the sequence data lives, or allocate a buffer
794  /// to return to the user. This method decrements that counter or
795  /// frees the allocated buffer, so that the memory can be used by
796  /// other processes. Each allocating call should be paired with a
797  /// returning call. Note that this does not apply to GetBioseq(),
798  /// or GetHdr(), for example.
799  ///
800  /// @param buffer
801  /// A pointer to the sequence data to release.
802  void RetAmbigSeq(const char ** buffer) const;
803 
804  /// Gets a list of sequence identifiers.
805  ///
806  /// This returns the list of CSeq_id identifiers associated with
807  /// the sequence specified by the given OID.
808  ///
809  /// @param oid
810  /// The oid of the sequence.
811  /// @return
812  /// A list of Seq-id objects for this sequence.
813  list< CRef<CSeq_id> > GetSeqIDs(int oid) const;
814 
815  /// Gets a list of GIs for an OID.
816  ///
817  /// This returns the GIs associated with the sequence specified by
818  /// the given OID. If append is true, gis will be appended to the
819  /// end of the provided vector; otherwise the vector will be
820  /// emptied first.
821  ///
822  /// @param oid
823  /// The oid of the sequence.
824  /// @param gis
825  /// The returned list of gis.
826  /// @param append
827  /// Specify true to append to gis, keeping existing elements.
828  void GetGis(int oid, vector<TGi> & gis, bool append = false) const;
829 
830  /// Returns the type of database opened - protein or nucleotide.
831  ///
832  /// This uses the same constants as the constructor.
833  ESeqType GetSequenceType() const;
834 
835  /// Returns the database title.
836  ///
837  /// This is usually read from database volumes or alias files. If
838  /// multiple databases were passed to the constructor, this will
839  /// be a concatenation of those databases' titles.
840  string GetTitle() const;
841 
842  /// Returns the construction date of the database.
843  ///
844  /// This is encoded in the database. If multiple databases or
845  /// multiple volumes were accessed, the latest date will
846  /// be used.
847  string GetDate() const;
848 
849  /// Format string for the date returned by CSeqDB::GetDate
850  /// @sa CTime
851  static const char* kBlastDbDateFormat;
852 
853  /// Returns the construction date of the database.
854  ///
855  /// @param dbname
856  /// The database name.
857  /// @param seqtype
858  /// The type of database (nucleotide or protein)
859  /// @return
860  /// The latest date
861  static CTime GetDate(const string & dbname,
862  ESeqType seqtype);
863 
864  /// Returns the number of sequences available.
865  int GetNumSeqs() const;
866 
867  /// Returns the number of sequences available.
868  ///
869  /// This may be overridden by the STATS_NSEQ key.
870  int GetNumSeqsStats() const;
871 
872  /// Returns the size of the (possibly sparse) OID range.
873  int GetNumOIDs() const;
874 
875  /// Returns the sum of the lengths of all available sequences.
876  ///
877  /// This uses summary information stored in the database volumes
878  /// or alias files. It provides an approx value without iterating
879  /// over individual sequences for cases when scanning the db is
880  /// the only way to determine the exact total length
881  Uint8 GetTotalLength() const;
882 
883  /// Returns the exact sum of the lengths of all available sequences.
884  ///
885  /// Calling this function may trigger a complete db scan if the
886  /// total length of a db cannot be determined without iterating
887  /// through the sequences, e.g. a db with a GI list.
888  Uint8 GetExactTotalLength();
889 
890 
891  /// Returns the sum of the lengths of all available sequences.
892  ///
893  /// This uses summary information stored in the database volumes
894  /// or alias files. It provides either an exact value or a value
895  /// changed in the alias files by the STATS_TOTLEN key.
896  Uint8 GetTotalLengthStats() const;
897 
898  /// Returns the sum of the lengths of all volumes.
899  ///
900  /// This uses summary information stored in the database volumes
901  /// (but not the alias files). It provides an exact value,
902  /// without iterating over individual sequences. It includes all
903  /// OIDs regardless of inclusion by the filtering mechanisms of
904  /// the alias files.
905  Uint8 GetVolumeLength() const;
906 
907  /// Returns the sum of the sequence lengths.
908  ///
909  /// This uses summary information and iteration to compute the
910  /// total length and number of sequences for some subset of the
911  /// database. If eUnfilteredAll is specified, it uses information
912  /// from the underlying database volumes, without filtering. If
913  /// eFilteredAll is specified, all of the included sequences are
914  /// used, for all possible OIDs. If eFilteredRange is specified,
915  /// the returned values correspond to the sum over only those
916  /// sequences that survive filtering, and are within the iteration
917  /// range. If either of oid_count or total_length is passed NULL,
918  /// that result is not returned. In some cases, the results can
919  /// be computed in constant time; other cases require iteration
920  /// proportional to the length of the database or the included OID
921  /// range (see SetIterationRange()).
922  ///
923  /// @param sumtype
924  /// Specifies the subset of sequences to include.
925  /// @param oid_count
926  /// The returned number of included OIDs.
927  /// @param total_length
928  /// The returned sum of included sequence lengths.
929  /// @param use_approx
930  /// Whether to use approximate lengths for nucleotide.
931  void GetTotals(ESummaryType sumtype,
932  int * oid_count,
933  Uint8 * total_length,
934  bool use_approx = true) const;
935 
936  /// Returns the length of the largest sequence in the database.
937  ///
938  /// This uses summary information stored in the database volumes
939  /// or alias files. This might be used to choose buffer sizes.
940  int GetMaxLength() const;
941 
942  /// Returns the length of the shortest sequence in the database.
943  ///
944  /// This uses summary information stored in the database volumes
945  /// or alias files. This might be used to choose a cutoff score.
946  int GetMinLength() const;
947 
948  /// Returns a sequence iterator.
949  ///
950  /// This gets an iterator designed to allow traversal of the
951  /// database from beginning to end.
952  CSeqDBIter Begin() const;
953 
954  /// Find an included OID, incrementing next_oid if necessary.
955  ///
956  /// If the specified OID is not included in the set (i.e. the OID
957  /// mask), the input parameter is incremented until one is found
958  /// that is. The user will probably want to increment between
959  /// calls, if iterating over the db.
960  ///
961  /// @return
962  /// True if a valid OID was found, false otherwise.
963  bool CheckOrFindOID(int & next_oid) const;
964 
965  /// Return a chunk of OIDs, and update the OID bookmark.
966  ///
967  /// This method allows the caller to iterate over the database by
968  /// fetching batches of OIDs. It will either return a list of OIDs in
969  /// a vector, or set a pair of integers to indicate a range of OIDs.
970  /// The return value will indicate which technique was used. The
971  /// caller sets the number of OIDs to get by setting the size of the
972  /// vector. If eOidRange is returned, the first included oid is
973  /// oid_begin and oid_end is the oid after the last included oid. If
974  /// eOidList is returned, the vector contains the included OIDs, and may
975  /// be resized to a smaller value if fewer entries are available (for
976  /// the last chunk). In some cases it may be desirable to have
977  /// several concurrent, independent iterations over the same database
978  /// object. If this is required, the caller should specify the address
979  /// of an int to the optional parameter oid_state. This should be
980  /// initialized to zero (before the iteration begins) but should
981  /// otherwise not be modified by the calling code (except that it can
982  /// be reset to zero to restart the iteration). For the normal case of
983  /// one iteration per program, this parameter can be omitted.
984  ///
985  /// @param begin_chunk
986  /// First included oid (if eOidRange is returned).
987  /// @param end_chunk
988  /// OID after last included (if eOidRange is returned).
989  /// @param oid_size
990  /// Number of OIDs to retrieve (ignored in MT environment)
991  /// @param oid_list
992  /// An empty list. Will contain oid list if eOidList is returned.
993  /// @param oid_state
994  /// Optional address of a state variable (for concurrent iterations).
995  /// @return
996  /// eOidList in enumeration case, or eOidRange in begin/end range case.
998  GetNextOIDChunk(int & begin_chunk, // out
999  int & end_chunk, // out
1000  int oid_size, // in
1001  vector<int> & oid_list, // out
1002  int * oid_state = NULL); // in+out
1003 
1004  /// Resets this object's internal chunk bookmark, which is used when the
1005  /// oid_state argument to GetNextOIDChunk is NULL. This allows for several
1006  /// iterations to be performed over the same CSeqDB object
1007  void ResetInternalChunkBookmark();
1008 
1009  /// Get list of database names.
1010  ///
1011  /// This returns the database name list used at construction.
1012  /// @return
1013  /// List of database names.
1014  const string & GetDBNameList() const;
1015 
1016  /// Get GI list attached to this database.
1017  ///
1018  /// This returns the GI list attached to this database, or NULL,
1019  /// if no GI list was used. The effects of changing the contents
1020  /// of this GI list are undefined. This method only deals with
1021  /// the GI list passed to the top level CSeqDB constructor; it
1022  /// does not consider volume GI lists.
1023  ///
1024  /// @return A pointer to the attached GI list, or NULL.
1025  const CSeqDBGiList * GetGiList() const;
1026 
1027  /// Get IdSet list attached to this database.
1028  ///
1029  /// This returns the ID set used to filter this database. If a
1030  /// CSeqDBGiList or CSeqDBNegativeList was used instead, then an
1031  /// ID set object will be constructed and returned (and cached
1032  /// here). This method only deals with filtering applied to the
1033  /// top level CSeqDB constructor; it does not consider GI or TI
1034  /// lists attached from alias files. If no filtering was used, a
1035  /// 'blank' list will be returned (an empty negative list).
1036  ///
1037  /// @return The attached ID set (a 'blank' set if no filtering was used).
1038  CSeqDBIdSet GetIdSet() const;
1039 
1040  /// Translate a PIG to an OID.
1041  bool PigToOid(int pig, int & oid) const;
1042 
1043  /// Translate an OID to a PIG.
1044  bool OidToPig(int oid, int & pig) const;
1045 
1046  /// Translate a TI to an OID.
1047  bool TiToOid(Int8 ti, int & oid) const;
1048 
1049  /// Translate an OID to a GI.
1050  bool OidToGi(int oid, TGi & gi) const;
1051 
1052  /// Translate a GI to an OID.
1053  bool GiToOid(TGi gi, int & oid) const;
1054 
1055  /// Translate a GI to an OID with filter check.
1056  bool GiToOidwFilterCheck(TGi gi, int & oid) const;
1057 
1058  /// Translate a GI to a PIG.
1059  bool GiToPig(TGi gi, int & pig) const;
1060 
1061  /// Translate a PIG to a GI.
1062  bool PigToGi(int pig, TGi & gi) const;
1063 
1064  /// Translate an Accession to a list of OIDs.
1065  void AccessionToOids(const string & acc, vector<int> & oids) const;
1066 
1067  void AccessionsToOids(const vector<string>& accs, vector<blastdb::TOid>& oids) const;
1068 
1069  /// Translate a Seq-id to a list of OIDs.
1070  void SeqidToOids(const CSeq_id & seqid, vector<int> & oids) const;
1071 
1072  /// Translate a Seq-id to any matching OID.
1073  bool SeqidToOid(const CSeq_id & seqid, int & oid) const;
1074 
1075  /// Find the sequence closest to the given offset into the database.
1076  ///
1077  /// The database volumes can be viewed as a single array of
1078  /// residues, partitioned into sequences by OID order. The length
1079  /// of this array is given by GetTotalLength(). Given an offset
1080  /// between 0 and this length, this method returns the OID of the
1081  /// sequence at the given offset into the array. It is normally
1082  /// used to split the database into sections with approximately
1083  /// equal numbers of residues.
1084  /// @param first_seq
1085  /// First oid to consider (will always return this or higher).
1086  /// @param residue
1087  /// The approximate residue offset to search for.
1088  /// @return
1089  /// An OID near the specified residue offset.
1090  int GetOidAtOffset(int first_seq, Uint8 residue) const;
1091 
1092  /// Get a CBioseq for a given GI
1093  ///
1094  /// This builds and returns the header and sequence data
1095  /// corresponding to the indicated GI as a CBioseq.
1096  ///
1097  /// @param gi
1098  /// The GI of the sequence.
1099  /// @return
1100  /// A CBioseq object corresponding to the sequence.
1101  CRef<CBioseq> GiToBioseq(TGi gi) const;
1102 
1103  /// Get a CBioseq for a given PIG
1104  ///
1105  /// This builds and returns the header and sequence data
1106  /// corresponding to the indicated PIG (a numeric identifier used
1107  /// for proteins) as a CBioseq.
1108  ///
1109  /// @param pig
1110  /// The protein identifier group id of the sequence.
1111  /// @return
1112  /// A CBioseq object corresponding to the sequence.
1113  CRef<CBioseq> PigToBioseq(int pig) const;
1114 
1115  /// Get a CBioseq for a given Seq-id
1116  ///
1117  /// This builds and returns the header and sequence data
1118  /// corresponding to the indicated Seq-id as a CBioseq. Note that
1119  /// certain forms of Seq-id map to more than one OID. If this is
1120  /// the case for the provided Seq-id, the first matching OID will
1121  /// be used.
1122  ///
1123  /// @param seqid
1124  /// The Seq-id identifier of the sequence.
1125  /// @return
1126  /// A CBioseq object corresponding to the sequence.
1127  CRef<CBioseq> SeqidToBioseq(const CSeq_id & seqid) const;
1128 
1129  /// Find volume paths
1130  ///
1131  /// Find the base names of all volumes (and alias nodes). This
1132  /// method builds an alias hierarchy (which should be much faster
1133  /// than constructing an entire CSeqDB object), and returns the
1134  /// resolved volume/alias file base names from that hierarchy.
1135  ///
1136  /// @param dbname
1137  /// The input name of the database
1138  /// @param seqtype
1139  /// Specify eProtein, eNucleotide, or eUnknown.
1140  /// @param paths
1141  /// The set of resolved database volume file names
1142  /// @param alias_paths
1143  /// The set of resolved database alias file names
1144  /// @param recursive
1145  /// If true, the search will traverse the full alias node tree
1146  /// @param expand_links
1147  /// If true, the search will expand the soft links
1148  static void
1149  FindVolumePaths(const string & dbname,
1150  ESeqType seqtype,
1151  vector<string> & paths,
1152  vector<string> * alias_paths = NULL,
1153  bool recursive = true,
1154  bool expand_links = true);
1155 
1156  /// Find volume paths
1157  ///
1158  /// Find the base names of all volumes. This method returns the
1159  /// resolved base names of all referenced blast database volumes.
1160  ///
1161  /// @param paths
1162  /// The returned set of resolved database path names
1163  /// @param recursive
1164  /// If true, the search will traverse the full alias node tree
1165  void FindVolumePaths(vector<string> & paths, bool recursive=true) const;
1166 
1167  /// Set Iteration Range
1168  ///
1169  /// This method sets the iteration range as a pair of OIDs.
1170  /// Iteration proceeds from begin, up to but not including end.
1171  /// End will be adjusted to the number of OIDs in the case that it
1172  /// is 0, negative, or greater than the number of OIDs.
1173  ///
1174  /// @param oid_begin
1175  /// Iterator will skip OIDs less than this value. Only OIDs
1176  /// found in the OID lists (if any) will be returned.
1177  /// @param oid_end
1178  /// Iterator will return up to (but not including) this OID.
1179  void SetIterationRange(int oid_begin, int oid_end);
1180 
1181  /// Get Name/Value Data From Alias Files
1182  ///
1183  /// SeqDB treats each alias file as a map from a variable name to
1184  /// a value. This method will return a map from the basename of
1185  /// the filename of each alias file, to a vector of maps from
1186  /// variable name to value for each entry in that file. For
1187  /// example, the value of the "DBLIST" entry in the "wgs.nal" file
1188  /// would be values["wgs"][0]["DBLIST"]. The lines returned have
1189  /// been processed somewhat by SeqDB, including normalizing tabs
1190  /// to whitespace, trimming leading and trailing whitespace, and
1191  /// removal of comments and other non-value lines. Care should be
1192  /// taken when using the values returned by this method. SeqDB
1193  /// uses an internal "virtual" alias file entry, which maps from a
1194  /// filename of "-" and contains a single entry mapping "DBLIST"
1195  /// to SeqDB's database name input. This entry is the root of the
1196  /// alias file inclusion tree. Also note that alias files that
1197  /// appear in several places in the alias file inclusion tree may
1198  /// be different -- SeqDB's internal editing distributes GI lists
1199  /// over sub-alias files, which is why the value type of the
1200  /// returned data is a vector.
1201  ///
1202  /// @param afv
1203  /// The alias file contents will be returned here.
1204  void GetAliasFileValues(TAliasFileValues & afv);
1205 
1206  /// Get taxonomy information
1207  ///
1208  /// This method returns taxonomy information for a single taxid.
1209  /// This information does not vary with sequence type (protein
1210  /// vs. nucleotide) and is the same for all blast databases. If
1211  /// the taxonomy database is not available or the taxid is not
1212  /// found, this method will throw an exception.
1213  ///
1214  /// @param taxid
1215  /// An integer identifying the taxid to fetch.
1216  /// @param info
1217  /// A structure containing taxonomic description strings.
1218  static void GetTaxInfo(TTaxId taxid, SSeqDBTaxInfo & info);
1219 
1220  /// Fetch data as a CSeq_data object.
1221  ///
1222  /// All or part of the sequence is fetched in a CSeq_data object.
1223  /// The portion of the sequence returned is specified by begin and
1224  /// end. An exception will be thrown if begin is greater than or
1225  /// equal to end, or if end is greater than or equal to the length
1226  /// of the sequence. Begin and end should be specified in bases;
1227  /// a range like (0,1) specifies 1 base, not 2. Nucleotide data
1228  /// will always be returned in ncbi4na format.
1229  ///
1230  /// @param oid Specifies the sequence to fetch.
1231  /// @param begin Specifies the start of the data to get. [in]
1232  /// @param end Specifies the end of the data to get. [in]
1233  /// @return The sequence data as a Seq-data object.
1234  CRef<CSeq_data> GetSeqData(int oid,
1235  TSeqPos begin,
1236  TSeqPos end) const;
1237 
1238  /// Get a sequence in a given encoding.
1239  ///
1240  /// This method gets the sequence data for the given OID, converts
1241  /// it to the specified encoding, and returns it in a string. It
1242  /// supports all values of the CSeqUtil::ECoding enumeration (but
1243  /// the type must match the database type). This method returns the
1244  /// same data as GetAmbigSeq() (or GetSequence() for protein), but
1245  /// may be less efficient due to the cost of translation and string
1246  /// allocation.
1247  ///
1248  /// @param oid The OID of the sequence to fetch.
1249  /// @param coding The encoding to use for the data.
1250  /// @param output The returned sequence data as a string.
1251  /// @param range The range of the sequence to retrieve, if empty, the
1252  /// entire sequence will be retrieved [in]
1253  void GetSequenceAsString(int oid,
1254  CSeqUtil::ECoding coding,
1255  string & output,
1256  TSeqRange range = TSeqRange()) const;
1257 
1258  /// Get a sequence in a readable text encoding.
1259  ///
1260  /// This method gets the sequence data for an OID, converts it to a
1261  /// human-readable encoding (either Iupacaa for protein, or Iupacna
1262  /// for nucleotide), and returns it in a string. This is equivalent
1263  /// to calling the three-argument versions of this method with those
1264  /// encodings.
1265  ///
1266  /// @param oid The OID of the sequence to fetch.
1267  /// @param output The returned sequence data as a string.
1268  /// @param range The range of the sequence to retrieve, if empty, the
1269  /// entire sequence will be retrieved [in]
1270  void GetSequenceAsString(int oid,
1271  string & output,
1272  TSeqRange range = TSeqRange()) const;
1273 
1274 
1275 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
1276  (!defined(NCBI_COMPILER_MIPSPRO)) )
1277  /// List column titles found in this database.
1278  ///
1279  /// This returns a list of the column titles of all user created
1280  /// (and system generated) columns found in any of this database's
1281  /// volumes. Column titles appearing in more than one volume are
1282  /// only listed here once.
1283  ///
1284  /// @param titles Column titles are returned here. [out]
1285  void ListColumns(vector<string> & titles);
1286 
1287  /// Get an ID number for a given column title.
1288  ///
1289  /// For a given column title, this returns an ID that can be used
1290  /// to access that column in the future. The returned ID number
1291  /// is specific to this instance of SeqDB. If the database does
1292  /// not have a column with this name, -1 will be returned.
1293  ///
1294  /// @param title Column title to search for. [in]
1295  /// @return Column ID number for this column, or -1. [out]
1296  int GetColumnId(const string & title);
1297 
1298  /// Get all metadata for the specified column.
1299  ///
1300  /// Columns may contain user-defined metadata as a list of
1301  /// key-value pairs. For the specified column, this returns that
1302  /// column's metadata in the provided map. If multiple volumes
1303  /// are present, and they define contradictory meta data (this is
1304  /// more common when multiple databases are opened at once), this
1305  /// method returns the first value it finds for each metadata key.
1306  /// If this is unsatisfactory, the two-argument version of this
1307  /// method may be used to get more precise values for specific
1308  /// volumes.
1309  ///
1310  /// @param column_id The column id from GetColumnId. [in]
1311  /// @return The map of metadata for this column. [out]
1312  const map<string,string> & GetColumnMetaData(int column_id);
1313 
1314  /// Look up the value for a specific column metadata key.
1315  ///
1316  /// Columns can contain user-defined metadata as a list of
1317  /// key-value pairs. For the specified column, this returns the
1318  /// value associated with one particular key.
1319  ///
1320  /// @param column_id The column id from GetColumnId. [in]
1321  /// @return The value corresponding to the specified key. [out]
1322  const string & GetColumnValue(int column_id, const string & key);
1323 
1324  /// Get all metadata for the specified column.
1325  ///
1326  /// Columns may contain user-defined metadata as a list of
1327  /// key-value pairs. For the specified database volume and column
1328  /// id, this returns that column's metadata (as defined for that
1329  /// volume) in the provided map. The volume name should match
1330  /// the string returned by FindVolumePaths(vector<string>&).
1331  ///
1332  /// @param column_id The column id from GetColumnId. [in]
1333  /// @param volname The volume to get metadata for. [in]
1334  /// @return The map of metadata for this column + volume. [out]
1335  const map<string,string> &
1336  GetColumnMetaData(int column_id,
1337  const string & volname);
1338 
1339  /// Fetch the data blob for the given column and oid.
1340  /// @param col_id The column to fetch data from. [in]
1341  /// @param oid The OID of the blob. [in]
1342  /// @param blob The data will be returned here. [out]
1343  void GetColumnBlob(int col_id, int oid, CBlastDbBlob & blob);
1344 
1345  // Mask data support.
1346 
1347  /// Get a list of algorithm IDs for which mask data exists.
1348  ///
1349  /// Multiple sources of masking data may be used when building
1350  /// blast databases. This method retrieves a list of the IDs used
1351  /// to identify those types of filtering data to SeqDB. If the
1352  /// blast database volumes used by this instance of SeqDB were
1353  /// built with conflicting algorithm ID definitions, SeqDB will
1354  /// resolve the conflicts by renumbering some of the conflicting
1355  /// descriptions. For this reason, the IDs reported here may not
1356  /// match what was given to WriteDB when the database was created.
1357  ///
1358  /// @param algorithms List of algorithm ids. [out]
1359  void GetAvailableMaskAlgorithms(vector<int> & algorithms);
1360 
1361  /// Get the numeric algorithm ID for a string.
1362  /// @param algo_name The name of the filtering algorithm
1363  int GetMaskAlgorithmId(const string &algo_name) const;
1364 
1365  /// Returns a formatted string with the list of available masking
1366  /// algorithms in this database for display purposes (i.e.: help)
1367  string GetAvailableMaskAlgorithmDescriptions();
1368 
1369  /// Validates the algorithm IDs passed to this function, returning a vector
1370  /// of those algorithm IDs not present in this object
1371  vector<int> ValidateMaskAlgorithms(const vector<int>& algorithm_ids);
1372 
1373  /// Get information about one type of masking available here.
1374  ///
1375  /// For a given algorithm_id, this method fetches information
1376  /// describing the basic algorithm used, as well as options passed
1377  /// to that algorithm to generate the data stored here. Each
1378  /// sequence in the database can provide sequence masking data
1379  /// from one or more sources. There can also be multiple types of
1380  /// masking data from the same algorithm (such as DUST), but
1381  /// generated with different sets of input parameters.
1382  ///
1383  /// @param algorithm_id The ID as from GetAvailableMaskAlgorithms [in]
1384  /// @param program The filtering program used (DUST, SEG, etc.) [out]
1385  /// @param program_name string representation of program [out]
1386  /// @param algo_opts Describes options passed to `program'. [out]
1387  void GetMaskAlgorithmDetails(int algorithm_id,
1389  string & program_name,
1390  string & algo_opts);
1391 
1392  void GetMaskAlgorithmDetails(int algorithm_id,
1393  string & program,
1394  string & program_name,
1395  string & algo_opts);
1396 
1397  /// Get masked ranges of a sequence.
1398  ///
1399  /// For the provided OID and list of algorithm IDs, this method
1400  /// gets the masked areas of that sequence for the first algorithm
1401  /// ID only; the rest of the list is ignored. If algo_ids is empty,
1402  /// nothing is fetched and the ranges parameter is left untouched.
1403  ///
1404  /// @param oid The ordinal ID of the sequence. [in]
1405  /// @param algo_ids Algorithm IDs; only the first is used. [in]
1406  /// @param ranges The list of sequence offset ranges. [out]
1408  void GetMaskData(int oid,
1409  const vector<int> & algo_ids,
1410  TSequenceRanges & ranges)
1411  {
1412  if (!algo_ids.empty()) GetMaskData(oid, algo_ids.front(), ranges);
1413  }
1414 
1415  /// Get masked ranges of a sequence.
1416  ///
1417  /// For the provided OID and algorithm ID, this method
1418  /// gets a list of masked areas of those sequences. The list of
1419  /// masked areas is returned via the ranges parameter.
1420  ///
1421  /// @param oid The ordinal ID of the sequence. [in]
1422  /// @param algo_id The algorithm ID to get data for. [in]
1423  /// @param ranges The list of sequence offset ranges. [out]
1424  void GetMaskData(int oid,
1425  int algo_id,
1426  TSequenceRanges &ranges);
1427 #endif
1428 
1429  /***********************************************************************/
1430  /* BEGIN: support for partial sequence fetching */
1431 
1432  /// List of sequence offset ranges.
1434 
1435  /// Apply a range of offsets to a database sequence.
1436  ///
1437  /// The GetAmbigSeq() method requires an amount of work (and I/O)
1438  /// which is proportional to the size of the sequence data (more
1439  /// if ambiguities are present). In some cases, only certain
1440  /// subranges of this data will be utilized. This method allows
1441  /// the user to specify which parts of a sequence are actually
1442  /// needed by the user. (Care should be taken if one SeqDB object
1443  /// is shared by several program components.) (Note that offsets
1444  /// above the length of the sequence will not generate an error,
1445  /// and are replaced by the sequence length.)
1446  ///
1447  /// If ranges are specified for a sequence, data areas in
1448  /// specified sequences will be accurate, but data outside the
1449  /// specified ranges should not be accessed, and no guarantees are
1450  /// made about what data they will contain. If the append_ranges
1451  /// flag is true, the range will be added to existing ranges. If
1452  /// false, existing ranges will be flushed and replaced by new
1453  /// ranges. To remove ranges, call this method with an empty list
1454  /// of ranges (and append_ranges == false); future calls will then
1455  /// return the complete sequence.
1456  ///
1457  /// If the cache_data flag is set, data for this sequence will be
1458  /// kept for the duration of SeqDB's lifetime. To disable caching
1459  /// (and flush cached data) for this sequence, call the method
1460  /// again, but specify cache_data to be false.
1461  ///
1462  /// @param oid OID of the sequence.
1463  /// @param offset_ranges Ranges of sequence data to return.
1464  /// @param append_ranges Append new ranges to existing list.
1465  /// @param cache_data Keep sequence data for future callers.
1466  void SetOffsetRanges(int oid,
1467  const TRangeList & offset_ranges,
1468  bool append_ranges,
1469  bool cache_data);
1470 
1471  /// Remove any offset ranges for the given OID
1472  /// @param oid OID of the sequence.
1473  void RemoveOffsetRanges(int oid);
1474 
1475  /// Flush all offset ranges cached
1476  void FlushOffsetRangeCache();
1477 
1478  /* END: support for partial sequence fetching */
1479  /***********************************************************************/
1480 
1481  /// Setting the number of threads
1482  ///
1483  /// This should be called by the master thread, before and after
1484  /// multiple threads run.
1485  ///
1486  /// @param num_threads Number of threads
1487  void SetNumberOfThreads(int num_threads, bool force_mt = false);
1488 
1489  /// Retrieve the disk usage in bytes for this BLAST database
1490  Int8 GetDiskUsage() const;
1491 
1492  /// Set the membership of all volumes
1493  void SetVolsMemBit(int mbit);
1494 
1495  /// Dump debug information for this object
1496  /// @sa CDebugDumpable
1497  void DebugDump(CDebugDumpContext ddc, unsigned int depth) const;
1498 
1499  /// Return blast db version
1500  EBlastDbVersion GetBlastDbVersion() const;
1501 
1502  /// Get Oid list for input tax ids
1503  /// @param tax_ids taxonomy ids, return only tax ids found in db
1504  /// @param rv OIDs corresponding to the tax ids
1505  void TaxIdsToOids(set<TTaxId>& tax_ids, vector<blastdb::TOid>& rv) const;
1506 
1507  /// Get all unique tax ids from db
1508  /// @param tax_ids return taxonomy ids in db
1509  void GetDBTaxIds(set<TTaxId> & tax_ids) const;
1510 
1511  void GetTaxIdsForOids(const vector<blastdb::TOid> & oids, set<TTaxId> & tax_ids) const;
1512 
1513  CRef<CBlast_db_metadata> GetDBMetaData(string user_path = kEmptyStr);
1514 
1515 
1516  /// Get all tax ids for an accession
1517  /// @param accs input accession
1518  /// @param taxids taxids for accession
1519  void GetTaxIdsForAccession(const string & accs, vector<TTaxId> & taxids);
1520 
1521 
1522  /// Get all tax ids for a seq id
1523  /// @param seq_id input seq id
1524  /// @param taxids taxids for seq id
1525  void GetTaxIdsForSeqId(const CSeq_id & seq_id, vector<TTaxId> & taxids);
1526 
1527 protected:
1528  /// Implementation details are hidden. (See seqdbimpl.hpp).
1530 
1531  /// No-argument Constructor
1532  ///
1533  /// This version of the constructor is used as an extension by the
1534  /// 'expert' interface in seqdbexpert.hpp.
1535  CSeqDB();
1536 
1537  void x_GetDBFilesMetaData(Int8 & disk_bytes, Int8 & cached_bytes, vector<string> & db_files, const string & user_path) const;
1538 };
1539 
1540 /// Structure to define basic information to initialize a BLAST DB
1542  /// The BLAST DB name
1544  /// The molecule type
1546 
1547  /// Default constructor
1549  m_MoleculeType = CSeqDB::eUnknown;
1550  }
1551 
1552  /// Less-than comparison to support sorting: orders by database name, then by molecule type.
1553  inline bool operator<(const SSeqDBInitInfo& rhs) const {
1554  // Differing names decide the order on their own.
1555  if (m_BlastDbName != rhs.m_BlastDbName) {
1556  return m_BlastDbName < rhs.m_BlastDbName;
1557  }
1558  // Same name: compare the molecule types by their integer values.
1559  return static_cast<int>(m_MoleculeType)
1560  < static_cast<int>(rhs.m_MoleculeType);
1561  }
1562 
1563  /// Create a new CSeqDB instance from this object
1565  return CRef<CSeqDB>(new CSeqDB(m_BlastDbName, m_MoleculeType));
1566  }
1567 };
1568 
1569 /// Find BLAST DBs in the directory specified
1570 /// @param path directory to search BLAST DBs [in]
1571 /// @param dbtype BLAST DB molecule type, allowed values are 'prot', 'nucl',
1572 /// and 'guess' (which means any) [in]
1573 /// @param recurse whether BLAST DBs should be found recursively or not [in]
1574 /// @param include_alias_files Should alias files be included also? [in]
1575 /// @param remove_redundant_dbs Should BLASTDBs that are referenced by other
1576 /// alias files in the return value be removed? [in]
1578 vector<SSeqDBInitInfo>
1579 FindBlastDBs(const string& path, const string& dbtype, bool recurse,
1580  bool include_alias_files = false,
1581  bool remove_redundant_dbs = false);
1582 
1583 /// CSeqDBSequence --
1584 ///
1585  /// Small class to implement RAII for sequences.
1586  ///
1587  /// The CSeqDB class requires that sequences be returned at some point
1588  /// after they are gotten. This class provides that service via the
1589  /// destructor. It also ensures that the database itself stays around
1590  /// for at least the duration of its lifetime, by holding a CRef<> to
1591  /// that object. CSeqDB::GetSequence may be used directly to avoid
1592  /// the small overhead of this class, provided care is taken to call
1593  /// CSeqDB::RetSequence. The data referred to by this object is not
1594  /// modifiable, and is memory mapped (read only) where supported.
1595 
1597 public:
1598  /// Defines the type used to select which sequence to get.
1600 
1601  /// Take a hold on a database sequence: calls db->GetSequence() for oid, which fills m_Data, and stores the returned value in m_Length.
1602  CSeqDBSequence(CSeqDB * db, int oid)
1603  : m_DB (db),
1604  m_Data (0),
1605  m_Length(0)
1606  {
1607  m_Length = m_DB->GetSequence(oid, & m_Data);
1608  }
1609 
1610  /// Destructor, returns the sequence.
1612  {
1613  if (m_Data) {
1614  m_DB->RetSequence(& m_Data);
1615  }
1616  }
1617 
1618  /// Get pointer to sequence data (returns m_Data, which the constructor fills through GetSequence()).
1619  const char * GetData()
1620  {
1621  return m_Data;
1622  }
1623 
1624  /// Get sequence length.
1626  {
1627  return m_Length;
1628  }
1629 
1630 private:
1631  /// Prevent copy construct.
1633 
1634  /// Prevent copy.
1636 
1637  /// The CSeqDB object this sequence is from.
1639 
1640  /// The sequence data for this sequence.
1641  const char * m_Data;
1642 
1643  /// The length of this sequence.
1645 };
1646 
1647 // Inline methods for CSeqDBIter
1648 
1650 {
1652 }
1653 
1655 {
1656  if (m_Data)
1657  m_DB->RetSequence(& m_Data);
1658 }
1659 
1660 /// Convert a string to a CSeqDB ESeqType object
1661 /// @param str string containing the molecule type (e.g.: prot, nucl, guess)
1664 
1665 /// Deletes all files associated with a BLAST database
1666 /// @param dbpath BLAST database file path [in]
1667 /// @param seq_type Sequence type [in]
1668 /// @return true if relevant files were deleted, else false
1670 bool DeleteBlastDb(const string& dbpath, CSeqDB::ESeqType seq_type);
1671 
1673 
1674 #endif // OBJTOOLS_BLAST_SEQDB_READER___SEQDB__HPP
1675 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
bool GetSeqData(ParserPtr pp, const DataBlk &entry, CBioseq &bioseq, Int4 nodetype, unsigned char *seqconv, Uint1 seq_data_type)
Definition: asci_blk.cpp:1674
const size_t kResizeFactor
factor by which these arrays are resized
@ eOidRange
Data is a range of contiguous ordinal ids (indices)
CBioseq_Handle –.
`Blob' Class for SeqDB (and WriteDB).
Definition: seqdbblob.hpp:56
CObject –.
Definition: ncbiobj.hpp:180
CSeqDBException.
Definition: seqdbcommon.hpp:73
CSeqDBGiList.
SeqDB ID list for performing boolean set operations.
CSeqDBImpl class.
Definition: seqdbimpl.hpp:138
CSeqDBIter.
Definition: seqdb.hpp:77
virtual ~CSeqDBIter()
Destructor.
Definition: seqdb.hpp:80
int m_Length
The length of this OID.
Definition: seqdb.hpp:141
const CSeqDB * m_DB
The CSeqDB object which this object iterates over.
Definition: seqdb.hpp:132
void x_RetSeq()
Release hold on current sequence.
Definition: seqdb.hpp:1654
const char * m_Data
The sequence data for this OID.
Definition: seqdb.hpp:138
int GetLength()
Get the length (in base pairs) of the currently held sequence.
Definition: seqdb.hpp:104
int m_OID
The OID this iterator is currently accessing.
Definition: seqdb.hpp:135
void x_GetSeq()
Get data pointer and length for the current sequence.
Definition: seqdb.hpp:1649
const char * GetData()
Get the sequence data for the currently held sequence.
Definition: seqdb.hpp:98
int GetOID()
Get the OID of the currently held sequence.
Definition: seqdb.hpp:92
DECLARE_OPERATOR_BOOL(m_Length !=-1)
Returns true if the iterator points to a valid sequence.
CSeqDBNegativeList.
CSeqDBSequence –.
Definition: seqdb.hpp:1596
CSeqDBSequence(const CSeqDBSequence &)
Prevent copy construct.
const char * GetData()
Get pointer to sequence data.
Definition: seqdb.hpp:1619
CRef< CSeqDB > m_DB
The CSeqDB object this sequence is from.
Definition: seqdb.hpp:1638
~CSeqDBSequence()
Destructor, returns the sequence.
Definition: seqdb.hpp:1611
const char * m_Data
The sequence data for this sequence.
Definition: seqdb.hpp:1641
CSeqDBSequence & operator=(const CSeqDBSequence &)
Prevent copy.
int m_Length
The length of this sequence.
Definition: seqdb.hpp:1644
CSeqDB::TOID TOID
Defines the type used to select which sequence to get.
Definition: seqdb.hpp:1599
CSeqDBSequence(CSeqDB *db, int oid)
Get a hold a database sequence.
Definition: seqdb.hpp:1602
int GetLength()
Get sequence length.
Definition: seqdb.hpp:1625
CSeqDB.
Definition: seqdb.hpp:161
int TOID
Sequence type accepted and returned for OID indices.
Definition: seqdb.hpp:216
int TPIG
Sequence type accepted and returned for PIG indices.
Definition: seqdb.hpp:219
EOidListType
Indicates how block of OIDs was returned.
Definition: seqdb.hpp:167
@ eOidList
Definition: seqdb.hpp:168
ESeqType
Sequence types (eUnknown tries protein, then nucleotide).
Definition: seqdb.hpp:173
@ eNucleotide
Definition: seqdb.hpp:175
@ eUnknown
Definition: seqdb.hpp:176
@ eProtein
Definition: seqdb.hpp:174
void RetSequence(const char **buffer) const
Returns any resources associated with the sequence.
Definition: seqdb.cpp:523
int GetSequence(int oid, const char **buffer) const
Get a pointer to raw sequence data.
Definition: seqdb.cpp:530
ESummaryType
Types of summary information available.
Definition: seqdb.hpp:183
@ eUnfilteredAll
Sum of all sequences, ignoring GI and OID lists and alias files.
Definition: seqdb.hpp:185
@ eFilteredAll
Values from alias files, or summation over all included sequences.
Definition: seqdb.hpp:188
static const string kOidNotFound
String containing the error message in exceptions thrown when a given OID cannot be found.
Definition: seqdb.hpp:316
static const char * kBlastDbDateFormat
Format string for the date returned by CSeqDB::GetDate.
Definition: seqdb.hpp:851
class CSeqDBImpl * m_Impl
Implementation details are hidden. (See seqdbimpl.hpp).
Definition: seqdb.hpp:1529
TGi TGI
Sequence type accepted and returned for GI indices.
Definition: seqdb.hpp:222
EMmapStrategies
Permitted mmap strategies.
Definition: seqdb.hpp:204
@ eMmap_Normal
Normal, no special behavior (should undo next two options).
Definition: seqdb.hpp:206
@ eMmap_Sequential
Expect sequential page references.
Definition: seqdb.hpp:209
TSeqDBAliasFileValues TAliasFileValues
Import type to allow shorter name.
Definition: seqdb.hpp:164
EMmapFileTypes
File type for which mmap strategy may be set.
Definition: seqdb.hpp:195
@ eMmap_IndexFile
Index files (name ends with ".pin" or ".nin").
Definition: seqdb.hpp:197
set< pair< int, int > > TRangeList
List of sequence offset ranges.
Definition: seqdb.hpp:1433
void GetMaskData(int oid, const vector< int > &algo_ids, TSequenceRanges &ranges)
Get masked ranges of a sequence.
Definition: seqdb.hpp:1408
CTime –.
Definition: ncbitime.hpp:296
Definition: map.hpp:338
static unsigned char depth[2 *(256+1+29)+1]
int GetSeqLength(const CBioseq &bioseq)
Definition: cuSequence.cpp:216
Blast defline related defines.
static void DLIST_NAME() append(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:78
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static SQLCHAR output[256]
Definition: print.c:5
static const char * str(char *buf, int n)
Definition: stats.c:84
SBlastSequence GetSequence(const objects::CSeq_loc &sl, EBlastEncoding encoding, objects::CScope *scope, objects::ENa_strand strand=objects::eNa_strand_plus, ESentinelType sentinel=eSentinels, std::string *warnings=NULL)
Retrieves a sequence using the object manager.
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
#define ZERO_GI
Definition: ncbimisc.hpp:1088
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
CBeginInfo Begin(C &obj)
Get starting point of object hierarchy.
Definition: iterator.hpp:1004
NCBI_XOBJUTIL_EXPORT string GetTitle(const CBioseq_Handle &hnd, TGetTitleFlags flags=0)
Definition: seqtitle.cpp:106
#define NCBI_DEPRECATED
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
CRange< TSeqPos > TSeqRange
typedefs for sequence ranges
Definition: range.hpp:419
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static string SizetToString(size_t value, TNumToStringFlags flags=0, int base=10)
Convert size_t to string.
Definition: ncbistr.cpp:2742
#define kEmptyStr
Definition: ncbistr.hpp:123
#define NCBI_XOBJREAD_EXPORT
Definition: ncbi_export.h:1315
strategy
Block allocation strategies.
Definition: bmconst.h:146
EBlast_filter_program
This defines the possible sequence filtering algorithms to be used in a BLAST database.
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:6929
int i
CBioseq_Info & GetBioseq(CTSE_Info &tse, const CBioObjectId &id)
static MDB_envinfo info
Definition: mdb_load.c:37
range(_Ty, _Ty) -> range< _Ty >
double value_type
The numeric datatype used by the parser.
Definition: muParserDef.h:228
const struct ncbi::grid::netcache::search::fields::KEY key
static uint8_t * buffer
Definition: pcre2test.c:1016
USING_SCOPE(objects)
Include definitions from the objects namespace.
CSeqDB::ESeqType ParseMoleculeTypeString(const string &str)
Convert a string to a CSeqDB ESeqType object.
Definition: seqdb.cpp:1527
vector< SSeqDBInitInfo > FindBlastDBs(const string &path, const string &dbtype, bool recurse, bool include_alias_files=false, bool remove_redundant_dbs=false)
Find BLAST DBs in the directory specified.
Definition: seqdb.cpp:1429
bool DeleteBlastDb(const string &dbpath, CSeqDB::ESeqType seq_type)
Deletes all files associated with a BLAST database.
Definition: seqdb.cpp:1542
Defines BlastDb `Blob' class for SeqDB and WriteDB.
Defines exception class and several constants for SeqDB.
ESeqDBAllocType
Certain methods have an "Alloc" version.
EBlastDbVersion
BLAST database version.
Definition: seqdbcommon.hpp:51
Uint4 GetSequenceType(const CBioseq_Handle &bsh)
Return a (corrected) set of flags identifying the sequence type.
Definition: sequtils.cpp:42
static SLJIT_INLINE sljit_ins msg(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
Structure to represent a range.
Definition: seqdb.hpp:225
TOffsetPair()
Default constructor.
Definition: seqdb.hpp:230
List of sequence offset ranges.
Definition: seqdb.hpp:236
void append(const void *src, size_type num_elements)
Append extra elements at the end.
Definition: seqdb.hpp:302
const_iterator end() const
Definition: seqdb.hpp:278
bool empty() const
Definition: seqdb.hpp:272
value_type & operator[](size_type i)
Definition: seqdb.hpp:280
TOffsetPair value_type
Definition: seqdb.hpp:238
const value_type * const_iterator
Definition: seqdb.hpp:239
const_iterator begin() const
Definition: seqdb.hpp:276
void reserve(size_t num_elements)
Reserves capacity for at least num_elements elements.
Definition: seqdb.hpp:286
void push_back(const value_type &element)
Append extra element at the end.
Definition: seqdb.hpp:309
void x_reallocate_if_necessary()
Definition: seqdb.hpp:252
size_type size() const
Definition: seqdb.hpp:274
value_type * get_data() const
Definition: seqdb.hpp:282
Structure to define basic information to initialize a BLAST DB.
Definition: seqdb.hpp:1541
string m_BlastDbName
The BLAST DB name.
Definition: seqdb.hpp:1543
bool operator<(const SSeqDBInitInfo &rhs) const
operator less to support sorting
Definition: seqdb.hpp:1553
CRef< CSeqDB > InitSeqDb() const
Create a new CSeqDB instance from this object.
Definition: seqdb.hpp:1564
CSeqDB::ESeqType m_MoleculeType
The molecule type.
Definition: seqdb.hpp:1545
SSeqDBInitInfo()
Default constructor.
Definition: seqdb.hpp:1548
SSeqDBTaxInfo.
Definition: type.c:6
void free(voidpf ptr)
Modified on Wed Sep 04 15:04:10 2024 by modify_doxy.py rev. 669887