66 bool scan_bioseq_4_cfastareader_usrobj)
71 m_MaxVolumeLetters (0),
74 m_MaskDataColumn (-1),
75 m_ParseIDs (parse_ids),
76 m_UseGiMask (use_gi_mask),
81 m_HaveSequence (
false),
82 m_LongSeqId (long_ids),
84 m_limitDefline (protein? limit_defline:
false),
85 m_OidMasks (oid_masks),
86 m_ScanBioseq4CFastaReaderUsrObjct(scan_bioseq_4_cfastareader_usrobj)
94 t.assign(
t, 1,
t.size() - 1);
131 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
132 (!defined(NCBI_COMPILER_MIPSPRO)) )
169 msg <<
"Invalid molecule type of sequence added ("
171 <<
"); expected " << (
m_Protein ?
"protein" :
"nucleotide");
219 if (next_key >= other.
key) {
242 #ifdef WORDS_BIGENDIAN
252 template <>
inline string
254 #define CHAR_BUFFER_SIZE 256
261 while (*p != 0x02) ++p;
267 if ( (
in.size() == 4)
268 && ((
in[0] -
'0') * (
in[0] -
'9') <= 0) ) {
272 source->read(next_token, 4);
273 source->seekg(-4, ios_base::cur);
274 string next_key(next_token, 4);
276 if (next_key ==
in) {
278 return x_GetNextKey();
289 return (*lhs < *rhs);
297 while (!keys.empty()) {
310 if (keys.find(
key) != keys.end()) {
312 msg <<
"Error: Duplicate seq_id <"
314 <<
"> is found multiple times across volumes.";
353 LOG_POST(
Info <<
"Rename files index to " << num_digits <<
" digits");
383 vector<blastdb::TOid> vol_num_oids(
m_VolumeList.size());
430 ofstream alias(nm.c_str());
432 alias <<
"#\n# Alias file created: " <<
m_Date <<
"\n#\n"
434 <<
"DBLIST " << dblist <<
"\n";
436 if (masklist !=
"") {
437 alias <<
"MASKLIST " << masklist <<
"\n";
444 if (! bin_hdr.empty()) {
455 vector< vector< char >* > bindata;
458 if ((**iter).IsUser()) {
464 const vector< CRef< CUser_field > > &
D = uo.
GetData();
468 D[0]->CanGetLabel() &&
469 D[0]->GetLabel().IsStr() &&
471 D[0]->CanGetData() &&
472 D[0]->GetData().IsOss()) {
474 bindata =
D[0]->GetData().GetOss();
482 if (! bindata.empty()) {
483 if (bindata[0] && (! bindata[0]->
empty())) {
484 vector<char> &
b = *bindata[0];
486 bin_hdr.assign(&
b[0],
b.size());
494 static const int kGenBankLimit = 5;
495 static const int kGenBankScore = 500;
496 if (dfs->
Get().size() <= kGenBankLimit){
502 list<CRef<CBlast_def_line> > & df_set= deflines->
Set();
509 CBlast_def_line_set::Tdata::iterator itr=df_set.
begin();
511 list<CRef<CBlast_def_line> > tmp_gb_list;
512 while (itr != df_set.end()){
516 if (score >= kGenBankScore){
517 size_t orig_size = tax_ids.
size();
519 if (orig_size == tax_ids.
size()){
521 list<CRef<CBlast_def_line> >::iterator tmp_itr = itr;
523 tmp_gb_list.splice(tmp_gb_list.end(), df_set, tmp_itr);
526 itr = df_set.erase(itr);
540 while ((gb_count < kGenBankLimit) && (tmp_gb_list.size() > 0)){
541 df_set.splice(df_set.end(), tmp_gb_list, tmp_gb_list.begin());
575 if (defline->CanGetMemberships() &&
576 defline->GetMemberships().size() == 0) {
578 defline->ResetMemberships();
581 if (defline->CanGetLinks() &&
582 defline->GetLinks().size() == 0) {
584 defline->ResetLinks();
588 deflines.
Reset(bdls);
594 const vector< vector<int> > & membbits,
595 const vector< vector<int> > & linkouts,
602 vector<TTaxId> taxids;
619 else if( desc.
IsOrg()) {
620 org_pt = &(desc.
GetOrg());
628 if ((**dbiter).CanGetDb() &&
629 (**dbiter).GetDb() ==
"taxon") {
648 list< CRef<CSeq_id> > ids = bioseq.
GetId();
650 unsigned taxid_i(0), mship_i(0), links_i(0);
651 bool used_pig(
false);
658 while(! ids.empty()) {
661 defline->SetSeqid() = ids;
694 defline->SetTitle(titles);
696 if (taxid_i < taxids.size()) {
697 defline->SetTaxid(taxids[taxid_i++]);
700 if (mship_i < membbits.size()) {
701 const vector<int> & V = membbits[mship_i++];
702 defline->SetMemberships().assign(V.begin(), V.end());
705 if (links_i < linkouts.size()) {
706 const vector<int> & V = linkouts[mship_i++];
707 defline->SetLinks().assign(V.begin(), V.end());
710 if ((! used_pig) && pig) {
711 defline->SetOther_info().push_back(pig);
715 bdls->
Set().push_back(defline);
728 istringstream iss(bin_hdr);
732 deflines.
Reset(&* bdls);
743 if(bioseq_id ==
NULL ||
748 (!long_seqid && (bioseq_id->
IsPrf() || bioseq_id->
IsPir()))) {
756 if (
id.NotEmpty() && !id->
IsLocal()) {
768 const vector< vector<int> > & membbits,
769 const vector< vector<int> > & linkouts,
776 bool scan_bioseq_4_cfastareader_usrobj)
778 bool use_bin = (deflines.
Empty() && pig == 0);
780 if (! bin_hdr.empty() && OID<0) {
784 if (deflines.
Empty()) {
787 if (bioseq.
Empty()) {
790 "Error: Cannot find CBioseq or deflines.");
802 if (bin_hdr.empty()) {
812 scan_bioseq_4_cfastareader_usrobj);
822 if (bin_hdr.empty() && deflines.
Empty()) {
831 if (bin_hdr.empty() &&
832 (deflines.
Empty() || deflines->
Get().empty())) {
836 "Error: No deflines provided.");
840 const list<int> * L = 0;
842 if (deflines->
Get().front()->CanGetOther_info()) {
843 L = & deflines->
Get().front()->GetOther_info();
851 if ((L == 0) || L->empty()) {
853 bdls->
Set().front()->SetOther_info().push_back(pig);
855 deflines.
Reset(&* bdls);
857 }
else if (L->front() != pig) {
859 bdls->
Set().front()->SetOther_info().front() = pig;
861 deflines.
Reset(&* bdls);
872 bdls->
Set().front()->SetSeqid().front() = gnl_id;
874 deflines.
Reset(&* bdls);
877 if (deflines.
Empty() && (! bin_hdr.empty())) {
887 if (bin_hdr.empty() || OID>=0) {
927 if (!
m_Ids.empty()) {
935 "Error: Cannot find IDs or deflines.");
942 const list< CRef<CSeq_id> > & ids = (**iter).GetSeqid();
949 m_Ids.push_back(*it);
977 "Need sequence data.");
997 "Need sequence data.");
1007 switch(sd.
Which()) {
1033 msg =
"Unable to process sequence for entry [";
1038 if (!
msg.empty()) {
1047 "No sequence data in Bioseq, "
1048 "and no Bioseq_Handle available.");
1062 na8.reserve(sz + 1);
1067 na4.resize((sz + 1) / 2);
1069 for(
int i = 0;
i < sz;
i += 2) {
1070 na4[
i/2] = (na8[
i] << 4) + na8[
i+1];
1075 (
int)
si.GetLength(),
1139 char* map_sz_str = getenv(
"BLASTDB_LMDB_MAP_SIZE");
1200 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
1201 (!defined(NCBI_COMPILER_MIPSPRO)) )
1237 "Cannot write sequence to volume.");
1254 ((
b < 0) ? -
b :
b));
1304 template<
class TWriteSize,
class TRanges>
1309 typedef vector< pair<TSeqPos, TSeqPos> > TPairVector;
1311 Int4 num_written = 0;
1312 TWriteSize::WriteInt(blob,
count);
1314 for (
typename TRanges::const_iterator
r1 = (ranges).begin(),
1315 r1_end = (ranges).end();
1319 if (
r1->offsets.size()) {
1321 TWriteSize::WriteInt(blob,
r1->algorithm_id);
1322 TWriteSize::WriteInt(blob,
r1->offsets.size());
1325 TWriteSize::WriteInt(blob,
r2->first);
1326 TWriteSize::WriteInt(blob,
r2->second);
1336 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
1337 (!defined(NCBI_COMPILER_MIPSPRO)) )
1340 const vector <TGi> & gis)
1356 typedef vector< pair<TSeqPos, TSeqPos> > TPairVector;
1358 int range_list_count = 0;
1359 int offset_pairs_count = 0;
1367 range_list_count ++;
1368 offset_pairs_count +=
r1->offsets.size();
1371 string msg(
"Error: Algorithm IDs must be registered before use.");
1372 msg +=
" Unknown algorithm ID = " +
1379 if ((
r2->first >
r2->second) ||
1380 (
r2->second > seq_length)) {
1384 "Error: Masked data offsets out of bounds.");
1393 if (
r1->offsets.size()) {
1395 ->AddGiMask(gis,
r1->offsets);
1412 if (
r1->offsets.size()) {
1438 const string & options,
1439 const string & name)
1461 return algorithm_id;
1466 const string &description,
1467 const string &options)
1472 string value =
"100:" +
1479 return algorithm_id;
1497 size_t col_id =
m_Blobs.size() / 2;
1507 m_Blobs .push_back(new_blob2);
1523 const string &
value)
1525 if ((col_id < 0) || (col_id >= (
int)
m_ColumnMetas.size())) {
1527 "Error: provided column ID is not valid");
1539 if ((col_id < 0) || (col_id * 2 >= (
int)
m_Blobs.size())) {
1541 "Error: provided column ID is not valid");
1546 "Error: Already have blob for this sequence and column");
1576 bool scan_bioseq_4_cfastareader_usrobj)
1581 string binary_header;
1582 vector< vector<int> > v1,
v2;
1584 const bool kLimitDefline =
false;
1588 long_seqids, kLimitDefline, scan_bioseq_4_cfastareader_usrobj);
1605 "Error: Nucleotide masking not supported.");
1610 if (masked.empty()) {
1634 for (
unsigned i = 0;
i < mask_bytes.size();
i++) {
1635 int ch = ((
int) mask_bytes[
i]) & 0xFF;
1642 string mask_byte =
"X";
1660 vols.push_back((**iter).GetVolumeName());
1669 (**iter).ListFiles(files);
1714 #define TAB_REPLACEMENT " "
1721 const vector< vector<int> > & membits,
1722 const vector< vector<int> > & linkout,
1727 bool scan_bioseq_4_cfastareader_usrobj)
1737 if (scan_bioseq_4_cfastareader_usrobj) {
1752 if (
f.CanGetLabel() &&
1753 f.GetLabel().IsStr() &&
1754 f.GetLabel().GetStr() ==
"DefLine" &&
1756 f.GetData().IsStr()) {
1774 unsigned mship_i(0), links_i(0);
1775 bool used_pig(
false);
1791 defline->SetSeqid().push_back(gnl_id);
1793 string title(fasta, 1, fasta.size());
1798 defline->SetTitle(title);
1800 if (mship_i < membits.size()) {
1801 const vector<int> & V = membits[mship_i++];
1802 defline->SetMemberships().assign(V.begin(), V.end());
1805 if (links_i < linkout.size()) {
1806 const vector<int> & V = linkout[mship_i++];
1807 defline->SetLinks().assign(V.begin(), V.end());
1810 if ((! used_pig) && pig) {
1811 defline->SetOther_info().push_back(pig);
1815 bdls->
Set().push_back(defline);
1820 while(fasta.size()) {
1821 size_t id_start = skip;
1822 size_t pos_title = fasta.find(
" ", skip);
1823 size_t pos_next = fasta.find(
"\001", skip);
1826 if (pos_next == fasta.npos) {
1828 pos_next = fasta.find(
" >");
1836 if (pos_next == fasta.npos) {
1837 pos_next = fasta.size();
1841 if (pos_title == fasta.npos || pos_title >= pos_next) {
1843 pos_title = pos_next;
1846 string ids(fasta, id_start, pos_title - id_start);
1847 if (pos_title == pos_next) pos_title--;
1848 string title(fasta, pos_title + 1, pos_next-pos_title - 1);
1849 string remaining(fasta, pos_next, fasta.size() - pos_next);
1850 fasta.swap(remaining);
1853 list< CRef<CSeq_id> > seqids;
1854 if (ids.find(
'|') !=
NPOS){
1861 string label =
id->GetSeqIdString(
true);
1868 string label =
id->GetSeqIdString(
true);
1872 seqids.push_back(
id);
1878 defline->SetSeqid().swap(seqids);
1879 defline->SetTitle(title);
1881 if (mship_i < membits.size()) {
1882 const vector<int> & V = membits[mship_i++];
1883 defline->SetMemberships().assign(V.begin(), V.end());
1886 if (links_i < linkout.size()) {
1887 const vector<int> & V = linkout[mship_i++];
1888 defline->SetLinks().assign(V.begin(), V.end());
1891 if ((! used_pig) && pig) {
1892 defline->SetOther_info().push_back(pig);
1896 bdls->
Set().push_back(defline);
1903 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
1904 (!defined(NCBI_COMPILER_MIPSPRO)) )
vector< TRangeWithFuzz > TRanges
const CSeq_id * GetNonLocalId() const
Find a non-local ID if present, consulting assembly details if all IDs for the overall sequence are l...
`Blob' Class for SeqDB (and WriteDB).
@ eSimple
Just write NUL bytes until aligned.
void WritePadBytes(int align, EPadding fmt)
Align the offset by writing pad bytes.
void WriteInt2(int x)
Write a 2 byte integer to the blob.
void Clear()
Clear all owned data and reference an empty string.
void WriteInt4(Int4 x)
Write a 4 byte integer to the blob.
void WriteInt1(int x)
Write a 1 byte integer to the blob.
void WriteInt4_LE(Int4 x)
void SortBySeqIdRank(bool is_protein, bool useBlastRank=false)
Sort the deflines according to the toolkit established ranking of Seq-ids.
static Int4 GetInt4(const unsigned char *ptr)
bool IsRegistered(int algo_id) const
Verify whether the provided algorithm ID has been registered with this object.
int Add(EBlast_filter_program program, const string &options=string(), const string &progname=string())
Attempt to register the information about a masking algorithm.
This represents a set of masks for a given sequence.
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
static SIZE_TYPE Convert(const CTempString &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst, TCoding dst_coding)
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
CMaskInfoRegistry m_MaskAlgoRegistry
Registry for masking algorithms in this database.
int CreateColumn(const string &title, bool mbo=false)
Set up a generic CWriteDB metadata column.
void x_Publish()
Flush accumulated sequence data to volume.
CSeqVector m_SeqVector
SeqVector for next sequence to write.
void SetMaskedLetters(const string &masked)
Set bases that should not be used in sequences.
void x_MaskSequence()
Replace masked input letters with m_MaskByte value.
void x_CookIds()
Collect ids for ISAM files.
void SetPig(int pig)
Set the PIG identifier of this sequence.
string m_Sequence
Sequence data in format that will be written to disk.
bool m_Protein
True if DB is protein.
vector< vector< int > > m_Memberships
Membership bits - outer vector is per-defline, inner is bits.
void x_CookColumns()
Prepare column data to be appended to disk.
void AddColumnMetaData(int col_id, const string &key, const string &value)
Add meta data to a column.
CWriteDB_Column::TColumnMeta TColumnMeta
Per-column metadata.
void ListFiles(vector< string > &files)
List Filenames.
void x_SetHaveSequence()
Records that we now have unwritten sequence data.
CRef< CWriteDB_Volume > m_Volume
This volume is currently accepting sequences.
bool x_HaveSequence() const
Returns true if we have unwritten sequence data.
void AddSequence(const CTempString &sequence, const CTempString &ambiguities)
Add a new sequence as raw sequence and ambiguity data.
static void x_GetFastaReaderDeflines(const CBioseq &bioseq, CConstRef< CBlast_def_line_set > &deflines, const vector< vector< int > > &membits, const vector< vector< int > > &linkout, int pig, bool accept_gt, bool parse_ids, bool long_seqids, bool scan_bioseq_4_cfastareader_usrobj=false)
Extract a defline set from a CFastaReader generated CBioseq.
vector< char > m_MaskLookup
Is (blast-aa) byte masked?
vector< CRef< CWriteDB_Volume > > m_VolumeList
List of all volumes so far, up to and including m_Volume.
string m_Dbname
Database base name.
string m_Date
Time stamp (for all volumes.)
void x_MakeAlias()
Write the alias file that lists the database volumes.
void x_CookHeader()
Convert header data into usable forms.
void Close()
Close the file and flush any remaining data to disk.
void SetMaxFileSize(Uint8 sz)
Set the maximum size for any file in the database.
vector< CRef< CSeq_id > > m_Ids
Ids for next sequence to write, for use during ISAM construction.
void x_CookSequence()
Convert sequence data into usable forms.
~CWriteDB_Impl()
Destructor.
EBlastDbVersion m_DbVersion
BLASTDB version.
int m_Pig
PIG to attach to headers for protein sequences.
static CRef< CBlast_def_line_set > ExtractBioseqDeflines(const CBioseq &bs, bool parse_ids, bool long_seqids, bool scan_bioseq_4_cfastareader_usrobj=false)
Extract deflines from a CBioseq.
string m_MaskedLetters
Masked protein letters (IUPAC).
EIndexType m_Indices
Indexing mode.
static void x_BuildDeflinesFromBioseq(const CBioseq &bioseq, CConstRef< CBlast_def_line_set > &deflines, const vector< vector< int > > &membits, const vector< vector< int > > &linkout, int pig)
Construct deflines from a CBioseq and other meta-data.
int x_GetMaskDataColumnId()
Get the mask data column id.
vector< TColumnMeta > m_ColumnMetas
Meta data for all columns.
CConstRef< CBlast_def_line_set > m_Deflines
Deflines to write as header.
void x_ClearHaveSequence()
Records that we no longer have unwritten sequence data.
void SetMaskData(const CMaskedRangesVector &ranges, const vector< TGi > &gis)
Set filtering data for a sequence.
vector< string > m_ColumnTitles
Column titles.
int m_MaskDataColumn
Column ID for masking data column.
int x_ComputeSeqLength()
Compute the length of the current sequence.
vector< vector< int > > m_Linkouts
Linkout bits - outer vector is per-defline, inner is bits.
void x_ResetSequenceData()
Clear sequence data from last sequence.
void SetDeflines(const CBlast_def_line_set &deflines)
This method replaces any stored header data for the current sequence with the provided CBlast_def_lin...
void ListVolumes(vector< string > &vols)
List Volumes.
static void x_SetDeflinesFromBinary(const string &bin_hdr, CConstRef< CBlast_def_line_set > &deflines)
Extract a defline set from a binary ASN.1 blob.
CRef< CWriteDB_LMDB > m_Lmdbdb
Write lmdb handle.
int FindColumn(const string &title) const
Find an existing column.
CRef< CWriteDB_TaxID > m_Taxdb
Write tax info handle.
map< int, int > m_MaskAlgoMap
Mapping from algo_id to gi-mask id.
vector< CRef< CBlastDbBlob > > m_Blobs
Blob data for the current sequence, indexed by letter.
int m_Hash
Sequence hash for this sequence.
bool m_LongSeqId
If true, use long sequence id format (database|accession) for all accessions.
static void x_GetBioseqBinaryHeader(const CBioseq &bioseq, string &binhdr)
Get binary version of deflines from 'user' data in Bioseq.
int RegisterMaskAlgorithm(EBlast_filter_program program, const string &options, const string &name="")
Register a type of filtering data found in this database.
vector< int > m_HaveBlob
List of blob columns that are active for this sequence.
string m_Ambig
Ambiguities in format that will be written to disk.
Uint8 m_MaxVolumeLetters
Max letters per volume.
void x_ComputeHash(const CTempString &sequence, const CTempString &ambiguities)
Compute the hash of a (raw) sequence.
int m_LmdbOid
Current oid to use for lmdb.
CWriteDB_Impl(const string &dbname, bool protein, const string &title, EIndexType indices, bool parse_ids, bool long_ids, bool use_gi_mask, EBlastDbVersion dbver=eBDB_Version4, bool limit_defline=false, Uint8 oid_masks=EOidMaskType::fNone, bool scan_bioseq_4_cfastareader_usrobj=false)
Constructor.
void x_CookData()
Convert and compute final data formats.
string m_BinHdr
Binary header in format that will be written to disk.
bool m_UseGiMask
Generate GI-based mask files.
bool m_ScanBioseq4CFastaReaderUsrObjct
Uint8 m_MaxFileSize
Maximum size of any file.
CBlastDbBlob & SetBlobData(int col_id)
Get a blob to use for a given column letter.
bool m_ParseIDs
Generate ISAM files.
bool m_Closed
True if database has been closed.
vector< CRef< CWriteDB_GiMask > > m_GiMasks
Gi-based masks.
int m_SeqLength
When a sequence is added, this will be populated with the length of that sequence.
bool m_HaveSequence
True if we have a sequence to write.
static void x_ExtractDeflines(CConstRef< CBioseq > &bioseq, CConstRef< CBlast_def_line_set > &deflines, string &bin_hdr, const vector< vector< int > > &membbits, const vector< vector< int > > &linkouts, int pig, set< TTaxId > &tax_ids, int OID=-1, bool parse_ids=true, bool long_seqid=false, bool limit_defline=false, bool scan_bioseq_4_cfastareader_usrobj=false)
Get deflines from a CBioseq and other meta-data.
string m_Title
Title field of database.
CConstRef< CBioseq > m_Bioseq
Bioseq object for next sequence to write.
string m_MaskByte
Byte that replaced masked letters.
string x_MakeAliasName()
Compute name of alias file produced.
void SetMaxVolumeLetters(Uint8 sz)
Set the maximum letters in one volume.
class to support searching for duplicate isam keys
CWriteDB_IsamKey(const string &fn)
bool operator<(const CWriteDB_IsamKey &other) const
bool AdvanceKey(const CWriteDB_IsamKey &other)
This class supports creation of a string accession to integer OID lmdb database.
void InsertVolumesInfo(const vector< string > &vol_names, const vector< blastdb::TOid > &vol_num_oids)
Create volume table. This API should only be called once to create vol info for all vols in the db.
int InsertEntries(const list< CRef< CSeq_id >> &seqids, const blastdb::TOid oid)
Add entries in bulk as fetched from CSeqDB::GetSeqIDs.
This class supports creation of tax id list lookup files.
int InsertEntries(const set< TTaxId > &tax_ids, const blastdb::TOid oid)
Add tax id entries in bulk for each oid. This API needs to be called in sequential order of OIDs. This ...
void RenameSingle()
Rename all volumes files to single-volume names.
void AddColumnMetaData(int col_id, const string &key, const string &value)
Add meta data to a column.
void RenameFileIndex(unsigned int num_digits)
const int & GetOID() const
Get the current OID of the volume.
int CreateColumn(const string &title, const TColumnMeta &meta, Uint8 max_sz, bool mbo=true)
Create a new database column.
bool WriteSequence(const string &seq, const string &ambig, const string &binhdr, const TIdList &ids, int pig, int hash, const TBlobList &blobs, int maskcol_id=-1)
Add a sequence to this volume.
void Close()
Close the volume.
const string & GetVolumeName() const
Get the name of the volume.
EIndexType
Whether and what kind of indices to build.
@ eAddHash
Add an index from sequence hash to OID.
iterator_bool insert(const value_type &val)
const_iterator begin() const
const_iterator end() const
string GetSeqIdString(const CSeq_id &id)
static const char si[8][64]
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
#define TAX_ID_FROM(T, value)
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
void Error(CExceptionArgs_Base &args)
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
const string & GetMsg(void) const
Get message string.
void Info(CExceptionArgs_Base &args)
string GetName(void) const
Get the base entry name with extension (if any).
C & SerialAssign(C &dest, const C &src, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
#define MSerial_AsnBinary
EAccessionInfo
For IdentifyAccession (below)
static SIZE_TYPE ParseFastaIds(CBioseq::TId &ids, const CTempString &s, bool allow_partial_failure=false)
Parse an entire set of |-delimited FASTA-style IDs, appending the results to IDS.
static int BestRank(const CRef< CSeq_id > &id)
static int BlastRank(const CRef< CSeq_id > &id)
@ fParse_RawText
Try to ID raw non-numeric accessions.
@ fParse_ValidLocal
Treat otherwise unidentified strings as raw accessions, provided that they pass rudimentary validatio...
CConstRef< CBioseq > GetCompleteBioseq(void) const
Get the complete bioseq.
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
bool Empty(void) const THROWS_NONE
Check if CConstRef is empty – not pointing to any object which means having a null value.
TObjectType * GetPointer(void) const THROWS_NONE
Get pointer.
void Reset(void)
Reset reference object.
void Reset(void)
Reset reference object.
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
bool NotEmpty(void) const THROWS_NONE
Check if CConstRef is not empty – pointing to an object and has a non-null value.
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
int32_t Int4
4-byte (32-bit) signed integer
uint64_t Uint8
8-byte (64-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
static string ParseEscapes(const CTempString str, EEscSeqRange mode=eEscSeqRange_Standard, char user_char='?')
Parse C-style escape sequences in the specified string.
const char * data(void) const
Return a pointer to the array represented.
static string & Replace(const string &src, const string &search, const string &replace, string &dst, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
static Uint8 StringToUInt8(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to Uint8.
size_type length(void) const
Return the length of the represented array.
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
size_type size(void) const
Return the length of the represented array.
string AsString(const CTimeFormat &format=kEmptyStr, TSeconds out_tz=eCurrentTimeZone) const
Transform time to string.
@ eCurrent
Use current time. See also CCurrentTime.
C::value_type FindBestChoice(const C &container, F score_func)
Find the best choice (lowest score) for values in a container.
static const char label[]
const TOrg & GetOrg(void) const
Get the Org member data.
EBlast_filter_program
This defines the possible sequence filtering algorithms to be used in a BLAST database.
Tdata & Set(void)
Assign a value to data member.
bool CanGet(void) const
Check if it is safe to call Get method.
const Tdata & Get(void) const
Get the member data.
@ eBlast_filter_program_other
bool IsStr(void) const
Check if variant Str is selected.
bool CanGetType(void) const
Check if it is safe to call GetType method.
void SetTag(TTag &value)
Assign a value to Tag data member.
bool IsId(void) const
Check if variant Id is selected.
bool CanGetData(void) const
Check if it is safe to call GetData method.
const TStr & GetStr(void) const
Get the variant data.
const TData & GetData(void) const
Get the Data member data.
const TType & GetType(void) const
Get the Type member data.
void SetDb(const TDb &value)
Assign a value to Db data member.
TId GetId(void) const
Get the variant data.
const TDb & GetDb(void) const
Get the Db member data.
bool CanGetDb(void) const
Check if it is safe to call GetDb method.
TGeneral & SetGeneral(void)
Select the variant.
bool IsPrf(void) const
Check if variant Prf is selected.
E_Choice Which(void) const
Which variant is currently selected.
bool IsLocal(void) const
Check if variant Local is selected.
bool IsPir(void) const
Check if variant Pir is selected.
const TUser & GetUser(void) const
Get the variant data.
const TInst & GetInst(void) const
Get the Inst member data.
bool IsOrg(void) const
Check if variant Org is selected.
bool CanGetMol(void) const
Check if it is safe to call GetMol method.
const TTitle & GetTitle(void) const
Get the variant data.
const TSource & GetSource(void) const
Get the variant data.
bool IsSource(void) const
Check if variant Source is selected.
const TId & GetId(void) const
Get the Id member data.
const Tdata & Get(void) const
Get the member data.
bool CanGetDescr(void) const
Check if it is safe to call GetDescr method.
TLength GetLength(void) const
Get the Length member data.
const TOrg & GetOrg(void) const
Get the variant data.
bool CanGetId(void) const
Check if it is safe to call GetId method.
bool CanGetSeq_data(void) const
Check if it is safe to call GetSeq_data method.
bool IsTitle(void) const
Check if variant Title is selected.
const TDescr & GetDescr(void) const
Get the Descr member data.
bool CanGetInst(void) const
Check if it is safe to call GetInst method.
bool IsUser(void) const
Check if variant User is selected.
E_Choice Which(void) const
Which variant is currently selected.
@ e_Ncbieaa
extended ASCII 1 letter aa codes
@ e_Ncbistdaa
consecutive codes for std aas
@ e_Ncbi2na
2 bit nucleic acid code
@ e_Iupacna
IUPAC 1 letter nuc acid code.
@ e_Ncbi4na
4 bit nucleic acid code
@ e_Iupacaa
IUPAC 1 letter amino acid code.
char * dbname(DBPROCESS *dbproc)
Get name of current database.
unsigned int
A callback function used to compare two keys in a database.
constexpr bool empty(list< Ts... >) noexcept
const struct ncbi::grid::netcache::search::fields::KEY key
const GenericPointer< typename T::ValueType > T2 value
const CharType(& source)[N]
std::istream & in(std::istream &in_, double &x_)
double df(double x_, const double &y_)
string BuildLMDBFileName(const string &basename, bool is_protein, bool use_index=false, unsigned int index=0)
Build the canonical LMDB file name for BLAST databases.
string GetFileNameFromExistingLMDBFile(const string &lmdb_filename, ELMDBFileType file_type)
Defines exception class and several constants for SeqDB.
unsigned SeqDB_SequenceHash(const char *sequence, int length)
Returns a hash of the given sequence data (presumably over `length` bytes — confirm against the SeqDB header).
EBlastDbVersion
BLAST database version.
Defines `expert' version of CSeqDB interfaces.
void SeqDB_UnpackAmbiguities(const CTempString &sequence, const CTempString &ambiguities, string &result)
Unpack an ambiguous nucleotide sequence.
static const sljit_gpr r1
static SLJIT_INLINE sljit_ins l(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
static SLJIT_INLINE sljit_ins msg(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
static const sljit_gpr r2
Comparison function for set<CWriteDB_IsamKey<T> *>
bool operator()(const CWriteDB_IsamKey< T > *lhs, const CWriteDB_IsamKey< T > *rhs) const
static bool ambig(char c)
Data conversion tools for CWriteDB and associated code.
void WriteDB_Ncbi2naToBinary(const CSeq_inst &si, string &seq)
Build blast db nucleotide format from Ncbi2na Seq-inst.
void WriteDB_EaaToBinary(const CSeq_inst &si, string &seq)
Build blast db protein format from Eaa protein Seq-inst.
void WriteDB_IupacaaToBinary(const CSeq_inst &si, string &seq)
Build blast db protein format from Iupacaa protein Seq-inst.
void WriteDB_Ncbi4naToBinary(const CSeq_inst &seqinst, string &seq, string &amb)
Build blast db nucleotide format from Ncbi4na Seq-inst.
void WriteDB_StdaaToBinary(const CSeq_inst &si, string &seq)
Build blast db protein format from Stdaa protein Seq-inst.
void WriteDB_IupacnaToBinary(const CSeq_inst &si, string &seq, string &amb)
Build blast db nucleotide format from Iupacna Seq-inst.
Defines exception class for WriteDB.
int WriteDB_FindSequenceLength(bool protein, const string &seq)
Compute length of sequence from raw packing.
static bool s_UseFastaReaderDeflines(CConstRef< CBioseq > &bioseq, CConstRef< CBlast_def_line_set > &deflines, bool long_seqid)
static CRef< CBlast_def_line_set > s_EditDeflineSet(CConstRef< CBlast_def_line_set > &deflines)
static void s_CheckEmptyLists(CRef< CBlast_def_line_set > &deflines, bool owner)
static const string s_EscapeColon(const string &in)
int s_AbsMax(int a, int b)
USING_SCOPE(std)
Import C++ std namespace.
static void s_CheckDuplicateIds(set< CWriteDB_IsamKey< T > *, CWriteDB_IsamKey_Compare< T > > &keys)
Check for duplicate ids across volumes.
void s_LimitDeflines(CConstRef< CBlast_def_line_set > &dfs)
Defines implementation class of WriteDB.