1 #ifndef OBJTOOLS_WRITERS_WRITEDB__WRITEDB_IMPL_HPP
2 #define OBJTOOLS_WRITERS_WRITEDB__WRITEDB_IMPL_HPP
90 bool limit_defline =
false,
92 bool scan_bioseq_4_cfastareader_usrobj =
false);
234 bool scan_bioseq_4_cfastareader_usrobj =
false);
285 const string & options,
286 const string & name =
"");
306 const string & description,
307 const string & options);
317 const vector <TGi> & gis);
357 const string &
value);
397 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
398 (!defined(NCBI_COMPILER_MIPSPRO)) )
471 const vector< vector<int> > & membits,
472 const vector< vector<int> > & linkout,
504 const vector< vector<int> > & membits,
505 const vector< vector<int> > & linkout,
510 bool scan_bioseq_4_cfastareader_usrobj =
false);
541 const vector< vector<int> > & membbits,
542 const vector< vector<int> > & linkouts,
547 bool long_seqid=
false,
548 bool limit_defline =
false,
549 bool scan_bioseq_4_cfastareader_usrobj =
false);
`Blob' Class for SeqDB (and WriteDB).
Registry class for the sequence masking/filtering algorithms used to create masks to be added to a CW...
This represents a set of masks for a given sequence.
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
CMaskInfoRegistry m_MaskAlgoRegistry
Registry for masking algorithms in this database.
int CreateColumn(const string &title, bool mbo=false)
Set up a generic CWriteDB metadata column.
void x_Publish()
Flush accumulated sequence data to volume.
CSeqVector m_SeqVector
SeqVector for next sequence to write.
void SetMaskedLetters(const string &masked)
Set bases that should not be used in sequences.
void x_MaskSequence()
Replace masked input letters with m_MaskByte value.
void x_CookIds()
Collect ids for ISAM files.
void SetPig(int pig)
Set the PIG identifier of this sequence.
string m_Sequence
Sequence data in format that will be written to disk.
bool m_Protein
True if DB is protein.
vector< vector< int > > m_Memberships
Membership bits - outer vector is per-defline, inner is bits.
void x_CookColumns()
Prepare column data to be appended to disk.
void AddColumnMetaData(int col_id, const string &key, const string &value)
Add meta data to a column.
CWriteDB_Column::TColumnMeta TColumnMeta
Per-column metadata.
void ListFiles(vector< string > &files)
List Filenames.
void x_SetHaveSequence()
Records that we now have unwritten sequence data.
CRef< CWriteDB_Volume > m_Volume
This volume is currently accepting sequences.
bool x_HaveSequence() const
Returns true if we have unwritten sequence data.
void AddSequence(const CTempString &sequence, const CTempString &ambiguities)
Add a new sequence as raw sequence and ambiguity data.
static void x_GetFastaReaderDeflines(const CBioseq &bioseq, CConstRef< CBlast_def_line_set > &deflines, const vector< vector< int > > &membits, const vector< vector< int > > &linkout, int pig, bool accept_gt, bool parse_ids, bool long_seqids, bool scan_bioseq_4_cfastareader_usrobj=false)
Extract a defline set from a CFastaReader generated CBioseq.
vector< char > m_MaskLookup
Is (blast-aa) byte masked?
vector< CRef< CWriteDB_Volume > > m_VolumeList
List of all volumes so far, up to and including m_Volume.
string m_Dbname
Database base name.
string m_Date
Time stamp (for all volumes).
void x_MakeAlias()
Write the alias file for the database.
void x_CookHeader()
Convert header data into usable forms.
void Close()
Close the file and flush any remaining data to disk.
void SetMaxFileSize(Uint8 sz)
Set the maximum size for any file in the database.
vector< CRef< CSeq_id > > m_Ids
Ids for next sequence to write, for use during ISAM construction.
void x_CookSequence()
Convert sequence data into usable forms.
~CWriteDB_Impl()
Destructor.
EBlastDbVersion m_DbVersion
BLASTDB version.
int m_Pig
PIG to attach to headers for protein sequences.
static CRef< CBlast_def_line_set > ExtractBioseqDeflines(const CBioseq &bs, bool parse_ids, bool long_seqids, bool scan_bioseq_4_cfastareader_usrobj=false)
Extract deflines from a CBioseq.
string m_MaskedLetters
Masked protein letters (IUPAC).
EIndexType m_Indices
Indexing mode.
static void x_BuildDeflinesFromBioseq(const CBioseq &bioseq, CConstRef< CBlast_def_line_set > &deflines, const vector< vector< int > > &membits, const vector< vector< int > > &linkout, int pig)
Construct deflines from a CBioseq and other meta-data.
int x_GetMaskDataColumnId()
Get the mask data column id.
vector< TColumnMeta > m_ColumnMetas
Meta data for all columns.
CConstRef< CBlast_def_line_set > m_Deflines
Deflines to write as header.
void x_ClearHaveSequence()
Records that we no longer have unwritten sequence data.
void SetMaskData(const CMaskedRangesVector &ranges, const vector< TGi > &gis)
Set filtering data for a sequence.
vector< string > m_ColumnTitles
Column titles.
int m_MaskDataColumn
Column ID for masking data column.
int x_ComputeSeqLength()
Compute the length of the current sequence.
vector< vector< int > > m_Linkouts
Linkout bits - outer vector is per-defline, inner is bits.
void x_ResetSequenceData()
Clear sequence data from last sequence.
void SetDeflines(const CBlast_def_line_set &deflines)
This method replaces any stored header data for the current sequence with the provided CBlast_def_lin...
void ListVolumes(vector< string > &vols)
List Volumes.
static void x_SetDeflinesFromBinary(const string &bin_hdr, CConstRef< CBlast_def_line_set > &deflines)
Extract a defline set from a binary ASN.1 blob.
CRef< CWriteDB_LMDB > m_Lmdbdb
Write lmdb handle.
int FindColumn(const string &title) const
Find an existing column.
CRef< CWriteDB_TaxID > m_Taxdb
Write tax info handle.
map< int, int > m_MaskAlgoMap
Mapping from algo_id to gi-mask id.
vector< CRef< CBlastDbBlob > > m_Blobs
Blob data for the current sequence, indexed by letter.
int m_Hash
Sequence hash for this sequence.
bool m_LongSeqId
If true, use long sequence id format (database|accession) for all accessions.
static void x_GetBioseqBinaryHeader(const CBioseq &bioseq, string &binhdr)
Get binary version of deflines from 'user' data in Bioseq.
int RegisterMaskAlgorithm(EBlast_filter_program program, const string &options, const string &name="")
Register a type of filtering data found in this database.
vector< int > m_HaveBlob
List of blob columns that are active for this sequence.
string m_Ambig
Ambiguities in format that will be written to disk.
CWriteDB::EIndexType EIndexType
Whether and what kind of indices to build.
Uint8 m_MaxVolumeLetters
Max letters per volume.
void x_ComputeHash(const CTempString &sequence, const CTempString &ambiguities)
Compute the hash of a (raw) sequence.
int m_LmdbOid
Current oid to use for lmdb.
CWriteDB_Impl(const string &dbname, bool protein, const string &title, EIndexType indices, bool parse_ids, bool long_ids, bool use_gi_mask, EBlastDbVersion dbver=eBDB_Version4, bool limit_defline=false, Uint8 oid_masks=EOidMaskType::fNone, bool scan_bioseq_4_cfastareader_usrobj=false)
Constructor.
void x_CookData()
Convert and compute final data formats.
string m_BinHdr
Binary header in format that will be written to disk.
bool m_UseGiMask
Generate GI-based mask files.
bool m_ScanBioseq4CFastaReaderUsrObjct
Uint8 m_MaxFileSize
Maximum size of any file.
CBlastDbBlob & SetBlobData(int col_id)
Get a blob to use for a given column letter.
bool m_ParseIDs
Generate ISAM files.
bool m_Closed
True if database has been closed.
vector< CRef< CWriteDB_GiMask > > m_GiMasks
Gi-based masks.
int m_SeqLength
When a sequence is added, this will be populated with the length of that sequence.
bool m_HaveSequence
True if we have a sequence to write.
static void x_ExtractDeflines(CConstRef< CBioseq > &bioseq, CConstRef< CBlast_def_line_set > &deflines, string &bin_hdr, const vector< vector< int > > &membbits, const vector< vector< int > > &linkouts, int pig, set< TTaxId > &tax_ids, int OID=-1, bool parse_ids=true, bool long_seqid=false, bool limit_defline=false, bool scan_bioseq_4_cfastareader_usrobj=false)
Get deflines from a CBioseq and other meta-data.
string m_Title
Title field of database.
CConstRef< CBioseq > m_Bioseq
Bioseq object for next sequence to write.
string m_MaskByte
Byte that replaces masked letters.
string x_MakeAliasName()
Compute name of alias file produced.
void SetMaxVolumeLetters(Uint8 sz)
Set the maximum letters in one volume.
EIndexType
Whether and what kind of indices to build.
uint64_t Uint8
8-byte (64-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
EBlast_filter_program
This defines the possible sequence filtering algorithms to be used in a BLAST database.
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Declares CMaskInfoRegistry class.
const struct ncbi::grid::netcache::search::fields::KEY key
const GenericPointer< typename T::ValueType > T2 value
Defines exception class and several constants for SeqDB.
EBlastDbVersion
BLAST database version.
Defines BLAST database construction classes.
Code for gi-based database mask file construction.
USING_SCOPE(objects)
Import definitions from the objects namespace.
Defines lmdb implementation of string-key database.
Code for database volume construction.