59 bool scan_bioseq_4_cfastareader_usrobj)
131 bool scan_bioseq_4_cfastareader_usrobj)
151 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
152 (!defined(NCBI_COMPILER_MIPSPRO)) )
155 const string & options,
163 const string & description,
164 const string & options)
170 const vector<TGi> & gis)
204 ofstream outp(fname.c_str(), ios::binary);
218 if ((
id >> 32) != 0) {
228 magic = eight ? -2 : -1;
232 magic = eight ? -4 : -3;
238 "Error: Unsupported ID type specified.");
263 char dbtype(is_protein ?
'p' :
'n');
266 string msg(
"Failed to find ");
267 msg += (is_protein ?
"protein " :
"nucleotide ");
268 msg +=
dbname +
" BLAST database";
292 *num_seqs_found = 0u;
296 _TRACE(
"Attempting to compute length for '" <<
dbname <<
"'");
310 int num_seqs_in_gifile = 0)
312 if ( !gi_file_name.empty() ) {
318 LOG_POST(
"Created " << (is_protein ?
"protein " :
"nucleotide ") <<
319 dbname <<
" BLAST (alias) database with " << num_seqs_found
320 <<
" sequences (out of " << num_seqs_in_gifile <<
" in "
321 << gi_file_name <<
", " << setprecision(0) << fixed <<
322 (num_seqs_found*100.0/num_seqs_in_gifile) <<
"% found)");
324 LOG_POST(
"Created " << (is_protein ?
"protein " :
"nucleotide ") <<
325 "BLAST (alias) database " <<
dbname <<
" with " <<
326 num_seqs_found <<
" sequences");
331 const string& db_name,
333 const string& gi_file_name,
337 vector<string> db(1, db_name);
347 case eGiList: retval =
"GILIST";
break;
348 case eTiList: retval =
"TILIST";
break;
359 const vector<string>& databases,
361 const string& gi_file_name,
370 fnamestr <<
file_name << (is_prot ?
".pal" :
".nal");
373 ofstream
out(fname.c_str());
377 if ( !title.empty() ) {
378 out <<
"TITLE " << title <<
"\n";
381 ITERATE(vector< string >, iter, databases) {
382 out <<
"\"" << *iter <<
"\" ";
385 if ( !gi_file_name.empty() ) {
388 << gi_file_name <<
"\n";
389 }
else if (oid_range) {
390 out <<
"FIRST_OID " << oid_range->GetFrom() <<
"\n"
391 <<
"LAST_OID " << oid_range->GetToOpen() <<
"\n";
397 _TRACE(
"Deleting " << fname);
398 string msg(
"BLASTDB alias file creation failed. Some referenced files may be missing");
403 _TRACE(
"Deleting " << fname);
406 <<
" in BLAST database";
412 out <<
"NSEQ " << num_seqs <<
"\n";
413 out <<
"LENGTH " << dbsize <<
"\n";
427 return (num_digits >2) ? num_digits: 2;
432 unsigned int num_volumes,
437 string concatenated_blastdb_name;
438 vector<string> volume_names(num_volumes,
kEmptyStr);
440 for (
unsigned int i = 0;
i < num_volumes;
i++) {
442 oss <<
file_name <<
"." << setfill(
'0') << setw(num_digits) <<
i;
445 volume_names.push_back(vol_name);
446 concatenated_blastdb_name += vol_name +
" ";
454 fname <<
file_name << (is_prot ?
".pal" :
".nal");
460 if ( !title.empty() ) {
461 out <<
"TITLE " << title <<
"\n";
465 ITERATE(vector<string>, itr, volume_names) {
469 out <<
"NSEQ " << num_seqs <<
"\n";
470 out <<
"LENGTH " << dbsize <<
"\n";
476 const vector<string>& databases,
478 const string& gi_file_name,
486 const vector<string>& db_names,
489 const string& title )
497 bool delete_source_alias_files )
499 if (alias_files.empty()) {
502 "No alias files available to create group alias file.");
511 ITERATE(list<string>, itr, alias_files) {
512 ifstream
in(itr->c_str());
514 LOG_POST(
Warning << *itr <<
" does not exist, omitting from group alias file");
519 while (getline(
in, line)) {
529 if (delete_source_alias_files) {
530 ITERATE(list<string>, itr, alias_files) {
532 _TRACE(
"Deleting " << *itr);
540 list<string> alias_files;
548 const string & output_db,
551 const string & title)
565 if(vols.size() == 0) {
570 string out_ext = is_protein?
".pal":
".nal";
572 ofstream ofs(output_db + out_ext);
573 ofs <<
"TITLE " << title <<endl;
575 for (
unsigned int i = 0;
i < vols.size();
i++) {
579 string DBList =
"DBLIST " + v_basename;
580 string OidList =
"OIDLIST ";
583 string full_path = vols[
i] + ex_model_ext;
588 OidList +=
f.GetName();
591 if (vols.size() > 1) {
592 oss << output_db <<
"." << setfill(
'0') << setw(num_digits) <<
i << out_ext;
594 ovs << DBList << endl;
595 ovs << OidList << endl;
596 ovs <<
"OID_MASK_TYPE " << oid_mask_type << endl;
599 ofs << DBList << endl;
600 ofs << OidList << endl;
601 ofs <<
"OID_MASK_TYPE " << oid_mask_type << endl;
605 if (vols.size() > 1) {
608 for (
unsigned int i = 0;
i < vols.size();
i++) {
609 oss <<
" " << output_db <<
"." << setfill(
'0') << setw(num_digits) <<
i;
615 Uint8 total_length = 0;
621 ofs <<
"NSEQ " << num_seqs << endl;
622 ofs <<
"LENGTH " << total_length << endl;
TContainerType m_Ids
List of identifiers to use.
CBinaryListBuilder(EIdType id_type)
Construct a list of a given type.
void Write(const string &fname)
Write the list to a file.
EIdType m_IdType
Whether to use GIs or TIs.
`Blob' Class for SeqDB (and WriteDB).
This represents a set of masks for a given sequence.
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string. Sample usage:
CSeqDB_Substring FindBaseName() const
Returns the portion of this path containing the base name.
void GetString(string &s) const
Return the data by assigning it to a string.
static void FindVolumePaths(const string &dbname, ESeqType seqtype, vector< string > &paths, vector< string > *alias_paths=NULL, bool recursive=true, bool expand_links=true)
Find volume paths.
Uint8 GetTotalLength() const
Returns the sum of the lengths of all available sequences.
ESeqType
Sequence types (eUnknown tries protein, then nucleotide).
void GetTotals(ESummaryType sumtype, int *oid_count, Uint8 *total_length, bool use_approx=true) const
Returns the sum of the sequence lengths.
int GetNumSeqs() const
Returns the number of sequences available.
@ eFilteredAll
Values from alias files, or summation over all included sequences.
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
int CreateColumn(const string &title, bool mbo=false)
Set up a generic CWriteDB metadata column.
void SetMaskedLetters(const string &masked)
Set bases that should not be used in sequences.
void SetPig(int pig)
Set the PIG identifier of this sequence.
void AddColumnMetaData(int col_id, const string &key, const string &value)
Add meta data to a column.
void ListFiles(vector< string > &files)
List Filenames.
void AddSequence(const CTempString &sequence, const CTempString &ambiguities)
Add a new sequence as raw sequence and ambiguity data.
void Close()
Close the file and flush any remaining data to disk.
void SetMaxFileSize(Uint8 sz)
Set the maximum size for any file in the database.
static CRef< CBlast_def_line_set > ExtractBioseqDeflines(const CBioseq &bs, bool parse_ids, bool long_seqids, bool scan_bioseq_4_cfastareader_usrobj=false)
Extract deflines from a CBioseq.
void SetMaskData(const CMaskedRangesVector &ranges, const vector< TGi > &gis)
Set filtering data for a sequence.
void SetDeflines(const CBlast_def_line_set &deflines)
This method replaces any stored header data for the current sequence with the provided CBlast_def_line_set.
void ListVolumes(vector< string > &vols)
List Volumes.
int FindColumn(const string &title) const
Find an existing column.
int RegisterMaskAlgorithm(EBlast_filter_program program, const string &options, const string &name="")
Register a type of filtering data found in this database.
CBlastDbBlob & SetBlobData(int col_id)
Get a blob to use for a given column letter.
void SetMaxVolumeLetters(Uint8 sz)
Set the maximum letters in one volume.
int CreateUserColumn(const string &title)
Set up a user-defined CWriteDB column.
void SetMaxFileSize(Uint8 sz)
Set maximum size for output files.
@ eProtein
Protein database.
int RegisterMaskAlgorithm(EBlast_filter_program program, const string &options=string(), const string &name=string())
Register a type of filtering data found in this database.
CWriteDB(const string &dbname, ESeqType seqtype, const string &title, int itype=eDefault, bool parse_ids=true, bool long_ids=false, bool use_gi_mask=false, EBlastDbVersion dbver=eBDB_Version4, bool limit_defline=false, Uint8 oid_masks=EOidMaskType::fNone, bool scan_bioseq_4_cfastareader_usrobj=false)
Constructor.
void AddColumnMetaData(int col_id, const string &key, const string &value)
Add meta data to a user-defined column.
int FindColumn(const string &title) const
Find an existing column.
void ListFiles(vector< string > &files)
List Filenames.
CBlastDbBlob & SetBlobData(int column_id)
Add blob data to a user-defined column.
void SetMaskData(const CMaskedRangesVector &ranges, const vector< TGi > &gis)
Set filtering data for a sequence.
void SetPig(int pig)
Set the PIG to be used for the sequence.
void AddSequence(const CBioseq &bs)
Add a sequence as a CBioseq.
void SetMaxVolumeLetters(Uint8 letters)
Set maximum letters for output volumes.
CWriteDB_Impl * m_Impl
Implementation object.
EIndexType
Whether and what kind of indices to build.
void SetMaskedLetters(const string &masked)
Set letters that should not be used in sequences.
void ListVolumes(vector< string > &vols)
List Volumes.
static CRef< CBlast_def_line_set > ExtractBioseqDeflines(const CBioseq &bs, bool parse_ids=true, bool long_ids=false, bool scan_bioseq_4_cfastareader_usrobj=false)
Extract Deflines From Bioseq.
void SetDeflines(const CBlast_def_line_set &deflines)
Set the deflines to be used for the sequence.
void Close()
Close the Database.
std::ofstream out("events_result.xml")
main entry point for tests
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NCBI_CURRENT_FUNCTION
Get current function name.
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
void Warning(CExceptionArgs_Base &args)
void FindFiles(TPathIterator path_begin, TPathIterator path_end, const vector< string > &masks, TFindFunc &find_func, TFindFiles flags=fFF_Default)
Generic algorithm for file search.
virtual bool Remove(TRemoveFlags flags=eRecursive) const
Remove a directory entry.
static string GetCwd(void)
Get the current working directory.
string GetName(void) const
Get the base entry name with extension (if any).
void Reset(void)
Reset reference object.
int32_t Int4
4-byte (32-bit) signed integer
int64_t Int8
8-byte (64-bit) signed integer
uint64_t Uint8
8-byte (64-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
const char * data(void) const
Return a pointer to the array represented.
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
size_type length(void) const
Return the length of the represented array.
string AsString(const CTimeFormat &format=kEmptyStr, TSeconds out_tz=eCurrentTimeZone) const
Transform time to string.
@ eCurrent
Use current time. See also CCurrentTime.
EBlast_filter_program
This defines the possible sequence filtering algorithms to be used in a BLAST database.
char * dbname(DBPROCESS *dbproc)
Get name of current database.
constexpr auto sort(_Init &&init)
const struct ncbi::grid::netcache::search::fields::KEY key
std::istream & in(std::istream &in_, double &x_)
double f(double x_, const double &y_)
Defines BLAST database access classes.
const string kSeqDBGroupAliasFileName
The name of the group alias file expected at each directory. For more documentation,...
const string SeqDB_GetOidMaskFileExt(bool db_is_protein, EOidMaskType t)
EBlastDbVersion
BLAST database version.
string SeqDB_ResolveDbPathNoExtension(const string &filename, char dbtype='-')
Resolve a file path using SeqDB's path algorithms.
This file defines several SeqDB utility functions related to byte order and file system portability.
static bool ambig(char c)
static string s_AliasFileFilterTypeToString(EAliasFileFilterType e)
Auxiliary function to convert the enumeration into a string.
int s_GetNumOfDigits(int n)
static bool s_ComputeNumSequencesAndDbLength(const string &dbname, bool is_prot, Uint8 *dbsize, int *num_seqs_found)
Computes the number of sequences and (alias) database length for alias files.
void CWriteDB_CreateOidMaskDB(const string &input_db, const string &output_db, CWriteDB::ESeqType seq_type, int oid_mask_type, const string &title)
static void s_PrintAliasFileCreationLog(const string &dbname, bool is_protein, int num_seqs_found, const string &gi_file_name=kEmptyStr, int num_seqs_in_gifile=0)
void CWriteDB_CreateAliasFile(const string &file_name, const string &db_name, CWriteDB::ESeqType seq_type, const string &gi_file_name, const string &title, EAliasFileFilterType alias_type)
void CWriteDB_ConsolidateAliasFiles(const list< string > &alias_files, bool delete_source_alias_files)
static void s_CreateAliasFilePriv(const string &file_name, const vector< string > &databases, CWriteDB::ESeqType seq_type, const string &gi_file_name, const string &title, EAliasFileFilterType alias_type, const TSeqRange *oid_range=NULL)
static bool s_DoesBlastDbExist(const string &dbname, bool is_protein)
Returns true if the BLAST DB exists, otherwise throws a CSeqDBException.
Defines BLAST database construction classes.
EAliasFileFilterType
Defines the possible filtering types that can be applied to an alias file.
@ eTiList
Filter a BLAST database via TIs (Trace IDs)
@ eSeqIdList
Filter a BLAST database via a Seq-id list.
@ eTaxIdList
Filter a BLAST database via Taxonomy Id list.
@ eGiList
Filter a BLAST database via GIs.
@ eNoAliasFilterType
Sentinel value.
Data conversion tools for CWriteDB and associated code.
void s_WriteInt8BE(ostream &str, Uint8 x)
Write an eight byte integer to a stream in big-endian format.
void s_WriteInt4(ostream &str, int x)
Write a four byte integer to a stream in big endian format.
Defines implementation class of WriteDB.