NCBI C++ ToolKit
writedb_impl.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef OBJTOOLS_WRITERS_WRITEDB__WRITEDB_IMPL_HPP
2 #define OBJTOOLS_WRITERS_WRITEDB__WRITEDB_IMPL_HPP
3 
4 /* $Id: writedb_impl.hpp 101152 2023-11-07 15:39:13Z camacho $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author: Kevin Bealer
30  *
31  */
32 
33 /// @file writedb_impl.hpp
34 /// Defines implementation class of WriteDB.
35 ///
36 /// Defines classes:
37 /// CWriteDB_Impl
38 ///
39 /// Implemented for: UNIX, MS-Windows
40 
41 #include <objects/seq/seq__.hpp>
47 #include "writedb_volume.hpp"
48 #include "writedb_gimask.hpp"
49 #include "mask_info_registry.hpp"
50 
51 #include <objmgr/bioseq_handle.hpp>
52 #include <objmgr/seq_vector.hpp>
53 
55 
56 /// Import definitions from the objects namespace.
58 
59 /// CWriteDB_Impl class
60 ///
61 /// This manufactures blast database header files from input data.
62 
64 public:
65  /// Whether and what kind of indices to build.
67 
68  // Setup and control
69 
70  /// Constructor.
71  /// @param dbname Name of the database to create.
72  /// @param protein True for protein, false for nucleotide.
73  /// @param title Title string for volumes and alias file.
74  /// @param indices Type of indexing to do for string IDs.
75  /// @param parse_ids If true generate ISAM files
76  /// @param long_ids If true, assume long sequence ids (database|accession)
77  /// when parsing strings ids
78  /// @param use_gi_mask If true generate GI-based mask files.
79  /// @param scan_bioseq_4_cfastareader_usrobj [in]
80  /// If true, scan the Bioseq objects for a CFastaReader-created User-object
81  /// containing a defline
82  CWriteDB_Impl(const string & dbname,
83  bool protein,
84  const string & title,
85  EIndexType indices,
86  bool parse_ids,
87  bool long_ids,
88  bool use_gi_mask,
90  bool limit_defline = false,
91  Uint8 oid_masks = EOidMaskType::fNone,
92  bool scan_bioseq_4_cfastareader_usrobj = false);
93 
94  /// Destructor.
96 
97  /// Close the file and flush any remaining data to disk.
98  void Close();
99 
100  // Sequence Data
101 
102  /// Add a new sequence as raw sequence and ambiguity data.
103  ///
104  /// A new sequence record is started, and data from any previous
105  /// sequence is combined and written to disk. Each sequence needs
106  /// sequence data and header data. This method takes sequence
107  /// data in the form of separated sequence data and compressed
108  /// ambiguities packed in the blast database disk format. It is
109  /// intended for efficiently copying sequences from sources that
110  /// provide this format, such as CSeqDBExpert(). If this method
111  /// is used for protein data, the ambiguities string should be
112  /// empty. If this method is used, header data must also be
113  /// specified with a call to SetDeflines().
114  ///
115  /// @param sequence Sequence data in blast db disk format.
116  /// @param ambiguities Ambiguity data in blast db disk format.
117  void AddSequence(const CTempString & sequence,
118  const CTempString & ambiguities);
119 
120  /// Add a new sequence as a CBioseq.
121  ///
122  /// A new sequence record is started, and data from any previous
123  /// sequence is combined and written to disk. Each sequence needs
124  /// sequence data and header data. This method can extract both
125  /// from the provided CBioseq. If other header data is preferred,
126  /// SetDeflines() can be called after this method to replace the
127  /// header data from the CBioseq. Note that CBioseqs from some
128  /// sources are not guaranteed to contain sequence data; if this
129  /// might be the case, consider the versions of AddSequence that
130  /// take either CBioseq_Handle or CBioseq and CSeqVector. In
131  /// order to use this method, sequence data should be accessible
132  /// from bs.GetInst().GetSeq_data(). (Note: objects provided to
133  /// WriteDB will be kept alive until the next AddSequence call.)
134  ///
135  /// @param bs Bioseq containing sequence and header data.
136  void AddSequence(const CBioseq & bs);
137 
138  /// Add a new sequence as a CBioseq_Handle.
139  ///
140  /// A new sequence record is started, and data from any previous
141  /// sequence is combined and written to disk. Each sequence needs
142  /// sequence data and header data. This method can extract both
143  /// from the provided CBioseq_Handle. If other header data is
144  /// preferred, SetDeflines() can be called after this method to
145  /// replace the header data from the CBioseq. (Note: objects
146  /// provided to WriteDB will be kept alive until the next
147  /// AddSequence call.)
148  ///
149  /// @param bsh Bioseq_Handle for sequence to add.
150  void AddSequence(const CBioseq_Handle & bsh);
151 
152  /// Add a new sequence as a CBioseq plus a CSeqVector.
153  ///
154  /// A new sequence record is started, and data from any previous
155  /// sequence is combined and written to disk. Each sequence needs
156  /// sequence data and header data. This method will extract
157  /// header data from the provided CBioseq. If the CBioseq
158  /// contains sequence data, it will be used; otherwise sequence
159  /// data will be fetched from the provided CSeqVector. If other
160  /// header data is preferred, SetDeflines() can be called after
161  /// this method. (Note: objects provided to WriteDB will be kept
162  /// alive until the next AddSequence call.)
163  ///
164  /// @param bs Bioseq for header data and, if present, sequence data.
165  /// @param sv CSeqVector used for sequence data when bs has none.
166  void AddSequence(const CBioseq & bs, CSeqVector & sv);
167 
168  /// This method replaces any stored header data for the current
169  /// sequence with the provided CBlast_def_line_set. Header data
170  /// can be constructed directly by the caller, or extracted from
171  /// an existing CBioseq using ExtractBioseqDeflines (see below).
172  /// Once it is in the correct form, it can be attached to the
173  /// sequence with this method. (Note: objects provided to WriteDB
174  /// will be kept alive until the next AddSequence call.)
175  ///
176  /// @param deflines Header data for the most recent sequence.
177  void SetDeflines(const CBlast_def_line_set & deflines);
178 
179  /// Set the PIG identifier of this sequence.
180  ///
181  /// For protein sequences, this sets the PIG identifier. PIG ids
182  /// are per-sequence, so it will only be attached to the first
183  /// defline in the set.
184  ///
185  /// @param pig PIG identifier as an integer.
186  void SetPig(int pig);
187 
188  // Options
189 
190  /// Set the maximum size for any file in the database.
191  ///
192  /// This method sets the maximum size for any file in a database
193  /// volume. If adding a sequence would cause any file in the
194  /// generated database to exceed this size, the current volume is
195  /// ended and a new volume is started. This is not a strict
196  /// limit, inasmuch as it always puts at least one sequence in
197  /// each volume regardless of that sequence's size.
198  ///
199  /// @param sz Maximum file size (in bytes).
200  void SetMaxFileSize(Uint8 sz);
201 
202  /// Set the maximum letters in one volume.
203  ///
204  /// This method sets the maximum number of sequence letters per
205  /// database volume. If adding a sequence would cause the volume
206  /// to have more than this many letters, the current volume is
207  /// ended and a new volume is started. This is not a strict
208  /// limit, inasmuch as it always puts at least one sequence in
209  /// each volume regardless of that sequence's size.
210  ///
211  /// @param sz Maximum sequence letters per volume.
212  void SetMaxVolumeLetters(Uint8 sz);
213 
214  /// Extract deflines from a CBioseq.
215  ///
216  /// Given a CBioseq, this method extracts and returns header info
217  /// as a defline set. The deflines will not be applied to the
218  /// current sequence unless passed to SetDeflines. The expected
219  /// use of this method is in cases where the caller has a CBioseq
220  /// or CBioseq_Handle but wishes to examine and/or change the
221  /// deflines before passing them to CWriteDB. Some elements of
222  /// the CBioseq may be shared by the returned defline set, notably
223  /// the Seq-ids.
224  ///
225  /// @param bs Bioseq from which to construct the defline set.
226  /// @param parse_ids If we should parse seq_ids.
227  /// @param long_seqids If true use long sequence ids (database|accession)
228  /// @param scan_bioseq_4_cfastareader_usrobj [in]
229  /// If true, scan the Bioseq objects for a CFastaReader-created User-object
230  /// containing a defline
231  /// @return The blast defline set.
233  ExtractBioseqDeflines(const CBioseq & bs, bool parse_ids, bool long_seqids,
234  bool scan_bioseq_4_cfastareader_usrobj = false);
235 
236  /// Set bases that should not be used in sequences.
237  ///
238  /// This method specifies nucleotide or protein bases that should
239  /// not be used in the resulting database. The bases in question
240  /// will be replaced with N (for nucleotide) or X (for protein).
241  /// The input data is expected to be specified in the appropriate
242  /// 'alphabetic' encoding (either IUPACAA or IUPACNA).
243  ///
244  /// @param masked Letters to mask, in the alphabetic encoding described above.
245  void SetMaskedLetters(const string & masked);
246 
247  /// List Volumes
248  ///
249  /// Returns the base names of all volumes constructed by this
250  /// class; the returned list may not be complete until Close() has
251  /// been called.
252  ///
253  /// @param vols
254  /// The set of volumes produced by this class.
255  void ListVolumes(vector<string> & vols);
256 
257  /// List Filenames
258  ///
259  /// Returns a list of the files constructed by this class; the
260  /// returned list may not be complete until Close() has been
261  /// called.
262  ///
263  /// @param files
264  /// The set of resolved database path names.
265  void ListFiles(vector<string> & files);
266 
267  /// Register a type of filtering data found in this database.
268  ///
269  /// The BlastDb format supports storage of masking data (lists of
270  /// masked ranges) for each database sequence, as well as an
271  /// indication of the source (or sources) of this masking data (e.g.:
272  /// masking algorithm used to create them).
273  /// This method stores a description of one of these masking data
274  /// sources in this database, including which basic algorithm was
275  /// used, as well as the options passed to that algorithm. Each
276  /// description is associated with a numeric `algorithm id' (return value
277  /// of this method), which identifies that data source when adding data
278  /// with SetMaskData.
279  ///
280  /// @return algorithm ID for the filtering data.
281  /// @param program Program used to produce this masking data. [in]
282  /// @param options Algorithm options provided to the program. [in]
283  /// @param name Name of a GI-based mask [in]
285  const string & options,
286  const string & name = "");
287 
288  /// Register a type of filtering data found in this database.
289  ///
290  /// The BlastDb format supports storage of masking data (lists of
291  /// masked ranges) for each database sequence, as well as an
292  /// indication of the source (or sources) of this masking data (e.g.:
293  /// masking algorithm used to create them).
294  /// This method stores a description of one of these masking data
295  /// sources in this database, including which basic algorithm was
296  /// used, as well as the options passed to that algorithm. Each
297  /// description is associated with a numeric `algorithm id' (return value
298  /// of this method), which identifies that data source when adding data
299  /// with SetMaskData.
300  ///
301  /// @return algorithm ID for the filtering data.
302  /// @param id A string to identify this masking data. [in]
303  /// @param description Details about the masking data. [in]
304  /// @param options Algorithm options provided to the program. [in]
305  int RegisterMaskAlgorithm(const string & id,
306  const string & description,
307  const string & options);
308 
309  /// Set filtering data for a sequence.
310  ///
311  /// This method specifies filtered regions for the sequence. Each
312  /// sequence can have filtering data from various algorithms.
313  ///
314  /// @param ranges Filtered ranges for this sequence and algorithm.
315  /// @param gis The GIs associated with this sequence
316  void SetMaskData(const CMaskedRangesVector & ranges,
317  const vector <TGi> & gis);
318 
319  /// Set up a generic CWriteDB metadata column.
320  ///
321  /// This method creates a column with the specified name (title).
322  /// The name must be unique among names provided to this database.
323  /// An integer column descriptor is returned, which must be used
324  /// to identify this column when applying blob data. This call
325  /// will fail with an exception if too many user defined columns
326  /// have already been created for this database (this limit is due
327  /// to BlastDb file naming conventions). The title identifies
328  /// this column and is also used to access the column with SeqDB.
329  ///
330  /// @param title Name identifying this column.
331  /// @return Column identifier (a positive integer).
332  int CreateColumn(const string & title, bool mbo=false);
333 
334  /// Find an existing column.
335  ///
336  /// This looks for an existing column with the specified title and
337  /// returns the column ID if found.
338  ///
339  /// @param title The column title to look for.
340  /// @return The column ID if this column title is already defined.
341  int FindColumn(const string & title) const;
342 
343  /// Add meta data to a column.
344  ///
345  /// In addition to normal blob data, database columns can store a
346  /// `dictionary' of user-defined metadata in key/value form. This
347  /// method adds one such key/value pair to the column. Specifying
348  /// a key a second time causes replacement of the previous value.
349  /// Using this mechanism to store large amounts of data may have a
350  /// negative impact on performance.
351  ///
352  /// @param col_id Specifies the column to add this metadata to.
353  /// @param key A unique key string.
354  /// @param value A value string.
355  void AddColumnMetaData(int col_id,
356  const string & key,
357  const string & value);
358 
359  /// Get a blob to use for a given column letter.
360  ///
361  /// To add data for a `blob' type column, this method should be
362  /// called to get a reference to a CBlastDbBlob object. Add the
363  /// user-defined blob data to this object. It is not correct to
364  /// call this more than once for the same sequence and column.
365  /// Reading, writing, or otherwise using this object after the
366  /// current sequence is published is an error and has undefined
367  /// consequences. ('Publishing' of a sequence usually occurs
368  /// during the following AddSequence(*) call or during Close().)
369  ///
370  /// @param col_id Indicates the column receiving the blob data.
371  /// @return The user data should be stored in this blob.
372  CBlastDbBlob & SetBlobData(int col_id);
373 
374 private:
375  // Configuration
376 
377  string m_Dbname; ///< Database base name.
378  bool m_Protein; ///< True if DB is protein.
379  string m_Title; ///< Title field of database.
380  string m_Date; ///< Time stamp (for all volumes.)
381  Uint8 m_MaxFileSize; ///< Maximum size of any file.
382  Uint8 m_MaxVolumeLetters; ///< Max letters per volume.
383  EIndexType m_Indices; ///< Indexing mode.
384  bool m_Closed; ///< True if database has been closed.
385  string m_MaskedLetters; ///< Masked protein letters (IUPAC).
386  string m_MaskByte; ///< Byte that replaced masked letters.
387  vector<char> m_MaskLookup; ///< Is (blast-aa) byte masked?
388  int m_MaskDataColumn; ///< Column ID for masking data column.
389  map<int, int> m_MaskAlgoMap; ///< Mapping from algo_id to gi-mask id
390  bool m_ParseIDs; ///< Generate ISAM files
391  bool m_UseGiMask; ///< Generate GI-based mask files
392  EBlastDbVersion m_DbVersion; ///< BLASTDB version
393 
394  /// Column titles.
395  vector<string> m_ColumnTitles;
396 
397 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
398  (!defined(NCBI_COMPILER_MIPSPRO)) )
399  /// Per-column metadata.
401 
402  /// Meta data for all columns.
403  vector< TColumnMeta > m_ColumnMetas;
404 
405  /// Gi-based masks
406  vector< CRef<CWriteDB_GiMask> > m_GiMasks;
407 #endif
408 
409  // Functions
410 
411  /// Flush accumulated sequence data to volume.
412  void x_Publish();
413 
414  /// Compute name of alias file produced.
415  string x_MakeAliasName();
416 
417  /// Produce the alias file. NOTE(review): wording inferred from the method name (the prior text duplicated the x_Publish() comment) -- confirm against the implementation.
418  void x_MakeAlias();
419 
420  /// Clear sequence data from last sequence.
421  void x_ResetSequenceData();
422 
423  /// Convert and compute final data formats.
424  void x_CookData();
425 
426  /// Convert header data into usable forms.
427  void x_CookHeader();
428 
429  /// Collect ids for ISAM files.
430  void x_CookIds();
431 
432  /// Compute the length of the current sequence.
433  int x_ComputeSeqLength();
434 
435  /// Convert sequence data into usable forms.
436  void x_CookSequence();
437 
438  /// Prepare column data to be appended to disk.
439  void x_CookColumns();
440 
441  /// Replace masked input letters with m_MaskByte value.
442  void x_MaskSequence();
443 
444  /// Get binary version of deflines from 'user' data in Bioseq.
445  ///
446  /// Some CBioseq objects (e.g. those from CSeqDB) have an ASN.1
447  /// octet array containing a binary ASN.1 version of the blast
448  /// defline set for the sequence. This method looks for that data
449  /// and returns it if found. If not found, it returns an empty
450  /// string.
451  ///
452  /// @param bioseq Bioseq from which to fetch header. [in]
453  /// @param binhdr Header data as binary ASN.1. [out]
454  static void x_GetBioseqBinaryHeader(const CBioseq & bioseq,
455  string & binhdr);
456 
457  /// Construct deflines from a CBioseq and other meta-data.
458  ///
459  /// This method builds deflines from various data found in the
460  /// Bioseq, along with other meta data (like the PIG and
461  /// membership and linkout lists.)
462  ///
463  /// @param bioseq Defline data will be built from this. [in]
464  /// @param deflines A defline set will be returned here. [out]
465  /// @param membits Membership bits for each defline. [in]
466  /// @param linkout Linkout bits for each defline. [in]
467  /// @param pig PIG to attach to a protein sequence. [in]
468  static void
469  x_BuildDeflinesFromBioseq(const CBioseq & bioseq,
471  const vector< vector<int> > & membits,
472  const vector< vector<int> > & linkout,
473  int pig);
474 
475  /// Extract a defline set from a binary ASN.1 blob.
476  /// @param bin_hdr Binary ASN.1 encoding of defline set. [in]
477  /// @param deflines Defline set. [out]
478  static void
479  x_SetDeflinesFromBinary(const string & bin_hdr,
480  CConstRef<CBlast_def_line_set> & deflines);
481 
482  /// Extract a defline set from a CFastaReader generated CBioseq.
483  ///
484  /// CBioseq objects produced by CFastaReader have an internal
485  /// 'user' field that contains the original FASTA, which can be
486  /// used to build blast deflines. If the original FASTA deflines
487  /// were delimited with control-A characters, then those will be
488  /// found here too. If the caller wishes to accept '>' as an
489  /// alternate delimiter, then accept_gt should be specified.
490  ///
491  /// @param bioseq Bioseq object produced by CFastaReader. [in]
492  /// @param deflines Defline set. [out]
493  /// @param membits Membership bits for each defline. [in]
494  /// @param linkout Linkout bits for each defline. [in]
495  /// @param pig PIG to attach to a protein sequence. [in]
496  /// @param accept_gt Whether greater-than is a delimiter. [in]
497  /// @param parse_ids Whether seq_id should not be parsed. [in]
498  /// @param long_seqids If true, use long sequence ids (database|accession)
499  /// [in]
500  /// @param scan_bioseq_4_cfastareader_usrobj If true, scan the Bioseq objects for a CFastaReader-created User-object containing a defline. [in]
501  static void
502  x_GetFastaReaderDeflines(const CBioseq & bioseq,
504  const vector< vector<int> > & membits,
505  const vector< vector<int> > & linkout,
506  int pig,
507  bool accept_gt,
508  bool parse_ids,
509  bool long_seqids,
510  bool scan_bioseq_4_cfastareader_usrobj = false);
511 
512  /// Returns true if we have unwritten sequence data.
513  bool x_HaveSequence() const;
514 
515  /// Records that we now have unwritten sequence data.
516  void x_SetHaveSequence();
517 
518  /// Records that we no longer have unwritten sequence data.
519  void x_ClearHaveSequence();
520 
521  /// Get deflines from a CBioseq and other meta-data.
522  ///
523  /// This method extracts binary ASN.1 deflines from a CBioseq if
524  /// possible, and otherwise builds deflines from various data
525  /// found in the Bioseq, along with other meta data (like the PIG
526  /// and membership and linkout lists.) It returns the result as
527  /// a blast defline set. If a binary version of the headers is
528  /// computed during this method, it will be returned in bin_hdr.
529  ///
530  /// @param bioseq Defline data will be built from this. [in]
531  /// @param deflines A defline set will be returned here. [out]
532  /// @param bin_hdr Binary header data may be returned here. [out]
533  /// @param membbits Membership bits for each defline. [in]
534  /// @param linkouts Linkout bits for each defline. [in]
535  /// @param pig PIG to attach to a protein sequence. [in]
536  /// @param OID the current OID for local id. [in]
537  /// @param parse_ids whether we should not parse id. [in]
538  static void x_ExtractDeflines(CConstRef<CBioseq> & bioseq,
540  string & bin_hdr,
541  const vector< vector<int> > & membbits,
542  const vector< vector<int> > & linkouts,
543  int pig,
544  set<TTaxId> & tax_ids,
545  int OID=-1,
546  bool parse_ids=true,
547  bool long_seqid=false,
548  bool limit_defline = false,
549  bool scan_bioseq_4_cfastareader_usrobj = false);
550 
551  /// Compute the hash of a (raw) sequence.
552  ///
553  /// The hash of the provided sequence will be computed and
554  /// assigned to the m_Hash member. The sequence and optional
555  /// ambiguities are 'raw', meaning they are packed just as
556  /// sequences are packed in nsq and psq files.
557  ///
558  /// @param sequence The sequence data. [in]
559  /// @param ambiguities Nucleotide ambiguities are provided here. [in]
560  void x_ComputeHash(const CTempString & sequence,
561  const CTempString & ambiguities);
562 
563  /// Compute the hash of a (Bioseq) sequence.
564  ///
565  /// The hash of the provided sequence will be computed and
566  /// assigned to the m_Hash member. The sequence is packed as a
567  /// CBioseq.
568  ///
569  /// @param sequence The sequence as a CBioseq. [in]
570  void x_ComputeHash(const CBioseq & sequence);
571 
572  /// Get the mask data column id.
573  ///
574  /// The mask data column is created if it does not exist, and its
575  /// column ID number is returned.
576  ///
577  /// @return The column ID for the mask data column.
578  int x_GetMaskDataColumnId();
579 
580  //
581  // Accumulated sequence data.
582  //
583 
584  /// Bioseq object for next sequence to write.
586 
587  /// SeqVector for next sequence to write.
589 
590  /// Deflines to write as header.
592 
593  /// Ids for next sequence to write, for use during ISAM construction.
594  vector< CRef<CSeq_id> > m_Ids;
595 
596  /// Linkout bits - outer vector is per-defline, inner is bits.
597  vector< vector<int> > m_Linkouts;
598 
599  /// Membership bits - outer vector is per-defline, inner is bits.
600  vector< vector<int> > m_Memberships;
601 
602  /// PIG to attach to headers for protein sequences.
603  int m_Pig;
604 
605  /// Sequence hash for this sequence.
606  int m_Hash;
607 
608  /// When a sequence is added, this will be populated with the length of that sequence.
610 
611  /// True if we have a sequence to write.
613 
614  // Cooked
615 
616  /// Sequence data in format that will be written to disk.
617  string m_Sequence;
618 
619  /// Ambiguities in format that will be written to disk.
620  string m_Ambig;
621 
622  /// Binary header in format that will be written to disk.
623  string m_BinHdr;
624 
626 
627  // Volumes
628 
629  /// This volume is currently accepting sequences.
631 
632  /// List of all volumes so far, up to and including m_Volume.
633  vector< CRef<CWriteDB_Volume> > m_VolumeList;
634 
635  /// Blob data for the current sequence, indexed by letter.
636  vector< CRef<CBlastDbBlob> > m_Blobs;
637 
638  /// List of blob columns that are active for this sequence.
639  vector<int> m_HaveBlob;
640 
641  /// Registry for masking algorithms in this database.
643 
644  ///Write lmdb handle
646 
647  ///Write tax info handle
649 
650  /// If true, use long sequence id format (database|accession) for all
651  /// accessions
653 
654  ///Current oid to use for lmdb
656 
659 
661 };
662 
664 
665 
666 #endif // OBJTOOLS_WRITERS_WRITEDB__WRITEDB_IMPL_HPP
667 
668 
CBioseq_Handle –.
`Blob' Class for SeqDB (and WriteDB).
Definition: seqdbblob.hpp:56
Registry class for the sequence masking/filtering algorithms used to create masks to be added to a CW...
This represents a set of masks for a given sequence.
Definition: writedb.hpp:65
CSeqVector –.
Definition: seq_vector.hpp:65
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
CWriteDB_Impl class.
CMaskInfoRegistry m_MaskAlgoRegistry
Registry for masking algorithms in this database.
int CreateColumn(const string &title, bool mbo=false)
Set up a generic CWriteDB metadata column.
void x_Publish()
Flush accumulated sequence data to volume.
CSeqVector m_SeqVector
SeqVector for next sequence to write.
void SetMaskedLetters(const string &masked)
Set bases that should not be used in sequences.
void x_MaskSequence()
Replace masked input letters with m_MaskByte value.
void x_CookIds()
Collect ids for ISAM files.
void SetPig(int pig)
Set the PIG identifier of this sequence.
string m_Sequence
Sequence data in format that will be written to disk.
bool m_Protein
True if DB is protein.
vector< vector< int > > m_Memberships
Membership bits - outer vector is per-defline, inner is bits.
void x_CookColumns()
Prepare column data to be appended to disk.
void AddColumnMetaData(int col_id, const string &key, const string &value)
Add meta data to a column.
CWriteDB_Column::TColumnMeta TColumnMeta
Per-column metadata.
void ListFiles(vector< string > &files)
List Filenames.
void x_SetHaveSequence()
Records that we now have unwritten sequence data.
CRef< CWriteDB_Volume > m_Volume
This volume is currently accepting sequences.
bool x_HaveSequence() const
Returns true if we have unwritten sequence data.
void AddSequence(const CTempString &sequence, const CTempString &ambiguities)
Add a new sequence as raw sequence and ambiguity data.
static void x_GetFastaReaderDeflines(const CBioseq &bioseq, CConstRef< CBlast_def_line_set > &deflines, const vector< vector< int > > &membits, const vector< vector< int > > &linkout, int pig, bool accept_gt, bool parse_ids, bool long_seqids, bool scan_bioseq_4_cfastareader_usrobj=false)
Extract a defline set from a CFastaReader generated CBioseq.
vector< char > m_MaskLookup
Is (blast-aa) byte masked?
vector< CRef< CWriteDB_Volume > > m_VolumeList
List of all volumes so far, up to and including m_Volume.
string m_Dbname
Database base name.
string m_Date
Time stamp (for all volumes.)
void x_MakeAlias()
Flush accumulated sequence data to volume.
void x_CookHeader()
Convert header data into usable forms.
void Close()
Close the file and flush any remaining data to disk.
void SetMaxFileSize(Uint8 sz)
Set the maximum size for any file in the database.
set< TTaxId > m_TaxIds
vector< CRef< CSeq_id > > m_Ids
Ids for next sequence to write, for use during ISAM construction.
void x_CookSequence()
Convert sequence data into usable forms.
~CWriteDB_Impl()
Destructor.
EBlastDbVersion m_DbVersion
BLASTDB version.
int m_Pig
PIG to attach to headers for protein sequences.
static CRef< CBlast_def_line_set > ExtractBioseqDeflines(const CBioseq &bs, bool parse_ids, bool long_seqids, bool scan_bioseq_4_cfastareader_usrobj=false)
Extract deflines from a CBioseq.
string m_MaskedLetters
Masked protein letters (IUPAC).
EIndexType m_Indices
Indexing mode.
static void x_BuildDeflinesFromBioseq(const CBioseq &bioseq, CConstRef< CBlast_def_line_set > &deflines, const vector< vector< int > > &membits, const vector< vector< int > > &linkout, int pig)
Construct deflines from a CBioseq and other meta-data.
int x_GetMaskDataColumnId()
Get the mask data column id.
vector< TColumnMeta > m_ColumnMetas
Meta data for all columns.
CConstRef< CBlast_def_line_set > m_Deflines
Deflines to write as header.
void x_ClearHaveSequence()
Records that we no longer have unwritten sequence data.
void SetMaskData(const CMaskedRangesVector &ranges, const vector< TGi > &gis)
Set filtering data for a sequence.
vector< string > m_ColumnTitles
Column titles.
int m_MaskDataColumn
Column ID for masking data column.
int x_ComputeSeqLength()
Compute the length of the current sequence.
vector< vector< int > > m_Linkouts
Linkout bits - outer vector is per-defline, inner is bits.
void x_ResetSequenceData()
Clear sequence data from last sequence.
void SetDeflines(const CBlast_def_line_set &deflines)
This method replaces any stored header data for the current sequence with the provided CBlast_def_lin...
void ListVolumes(vector< string > &vols)
List Volumes.
static void x_SetDeflinesFromBinary(const string &bin_hdr, CConstRef< CBlast_def_line_set > &deflines)
Extract a defline set from a binary ASN.1 blob.
CRef< CWriteDB_LMDB > m_Lmdbdb
Write lmdb handle.
int FindColumn(const string &title) const
Find an existing column.
CRef< CWriteDB_TaxID > m_Taxdb
Write tax info handle.
map< int, int > m_MaskAlgoMap
Mapping from algo_id to gi-mask id.
vector< CRef< CBlastDbBlob > > m_Blobs
Blob data for the current sequence, indexed by letter.
int m_Hash
Sequence hash for this sequence.
bool m_LongSeqId
If true, use long sequence id format (database|accession) for all accessions.
static void x_GetBioseqBinaryHeader(const CBioseq &bioseq, string &binhdr)
Get binary version of deflines from 'user' data in Bioseq.
int RegisterMaskAlgorithm(EBlast_filter_program program, const string &options, const string &name="")
Register a type of filtering data found in this database.
vector< int > m_HaveBlob
List of blob columns that are active for this sequence.
string m_Ambig
Ambiguities in format that will be written to disk.
CWriteDB::EIndexType EIndexType
Whether and what kind of indices to build.
Uint8 m_MaxVolumeLetters
Max letters per volume.
void x_ComputeHash(const CTempString &sequence, const CTempString &ambiguities)
Compute the hash of a (raw) sequence.
int m_LmdbOid
Current oid to use for lmdb.
CWriteDB_Impl(const string &dbname, bool protein, const string &title, EIndexType indices, bool parse_ids, bool long_ids, bool use_gi_mask, EBlastDbVersion dbver=eBDB_Version4, bool limit_defline=false, Uint8 oid_masks=EOidMaskType::fNone, bool scan_bioseq_4_cfastareader_usrobj=false)
Constructor.
void x_CookData()
Convert and compute final data formats.
string m_BinHdr
Binary header in format that will be written to disk.
bool m_UseGiMask
Generate GI-based mask files.
bool m_ScanBioseq4CFastaReaderUsrObjct
Uint8 m_MaxFileSize
Maximum size of any file.
CBlastDbBlob & SetBlobData(int col_id)
Get a blob to use for a given column letter.
bool m_ParseIDs
Generate ISAM files.
bool m_Closed
True if database has been closed.
vector< CRef< CWriteDB_GiMask > > m_GiMasks
Gi-based masks.
int m_SeqLength
When a sequence is added, this will be populated with the length of that sequence.
bool m_HaveSequence
True if we have a sequence to write.
static void x_ExtractDeflines(CConstRef< CBioseq > &bioseq, CConstRef< CBlast_def_line_set > &deflines, string &bin_hdr, const vector< vector< int > > &membbits, const vector< vector< int > > &linkouts, int pig, set< TTaxId > &tax_ids, int OID=-1, bool parse_ids=true, bool long_seqid=false, bool limit_defline=false, bool scan_bioseq_4_cfastareader_usrobj=false)
Get deflines from a CBioseq and other meta-data.
string m_Title
Title field of database.
CConstRef< CBioseq > m_Bioseq
Bioseq object for next sequence to write.
string m_MaskByte
Byte that replaced masked letters.
string x_MakeAliasName()
Compute name of alias file produced.
void SetMaxVolumeLetters(Uint8 sz)
Set the maximum letters in one volume.
EIndexType
Whether and what kind of indices to build.
Definition: writedb.hpp:104
Blast defline related defines.
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
EBlast_filter_program
This defines the possible sequence filtering algorithms to be used in a BLAST database.
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:6929
Declares CMaskInfoRegistry class.
const struct ncbi::grid::netcache::search::fields::KEY key
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
Defines exception class and several constants for SeqDB.
EBlastDbVersion
BLAST database version.
Definition: seqdbcommon.hpp:51
@ eBDB_Version4
Definition: seqdbcommon.hpp:52
@ fNone
Defines BLAST database construction classes.
Code for gi-based database mask file construction.
USING_SCOPE(objects)
Import definitions from the objects namespace.
Defines lmdb implementation of string-key database.
Code for database volume construction.
Modified on Fri Sep 20 14:57:44 2024 by modify_doxy.py rev. 669887
HHS Vulnerability Disclosure