59 bool scan_bioseq_4_cfastareader_usrobj)
131 bool scan_bioseq_4_cfastareader_usrobj)
151 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
152 (!defined(NCBI_COMPILER_MIPSPRO)) )
155 const string & options,
163 const string & description,
164 const string & options)
170 const vector<TGi> & gis)
204 ofstream outp(fname.c_str(), ios::binary);
218 if ((
id >> 32) != 0) {
228 magic = eight ? -2 : -1;
232 magic = eight ? -4 : -3;
238 "Error: Unsupported ID type specified.");
263 char dbtype(is_protein ?
'p' :
'n');
266 string msg(
"Failed to find ");
267 msg += (is_protein ?
"protein " :
"nucleotide ");
292 *num_seqs_found = 0u;
296 _TRACE(
"Attempting to compute length for '" <<
dbname <<
"'");
310 int num_seqs_in_gifile = 0)
312 if ( !gi_file_name.empty() ) {
318 LOG_POST(
"Created " << (is_protein ?
"protein " :
"nucleotide ") <<
319 dbname <<
" BLAST (alias) database with " << num_seqs_found
320 <<
" sequences (out of " << num_seqs_in_gifile <<
" in "
321 << gi_file_name <<
", " << setprecision(0) << fixed <<
322 (num_seqs_found*100.0/num_seqs_in_gifile) <<
"% found)");
324 LOG_POST(
"Created " << (is_protein ?
"protein " :
"nucleotide ") <<
325 "BLAST (alias) database " <<
dbname <<
" with " <<
326 num_seqs_found <<
" sequences");
331 const string& db_name,
333 const string& gi_file_name,
348 case eGiList: retval =
"GILIST";
break;
349 case eTiList: retval =
"TILIST";
break;
360 const vector<string>& databases,
362 const string& gi_file_name,
371 fnamestr <<
file_name << (is_prot ?
".pal" :
".nal");
374 ofstream
out(fname.c_str());
378 if ( !title.empty() ) {
379 out <<
"TITLE " << title <<
"\n";
382 ITERATE(vector< string >, iter, databases) {
383 out <<
"\"" << *iter <<
"\" ";
386 if ( !gi_file_name.empty() ) {
389 << gi_file_name <<
"\n";
390 }
else if (oid_range) {
391 out <<
"FIRST_OID " << oid_range->GetFrom() <<
"\n"
392 <<
"LAST_OID " << oid_range->GetToOpen() <<
"\n";
398 _TRACE(
"Deleting " << fname);
399 string msg(
"BLASTDB alias file creation failed. Some referenced files may be missing");
404 _TRACE(
"Deleting " << fname);
407 <<
" in BLAST database";
413 out <<
"NSEQ " << num_seqs <<
"\n";
414 out <<
"LENGTH " << dbsize <<
"\n";
428 return (num_digits >2) ? num_digits: 2;
433 unsigned int num_volumes,
438 string concatenated_blastdb_name;
439 vector<string> volume_names(num_volumes,
kEmptyStr);
441 for (
unsigned int i = 0;
i < num_volumes;
i++) {
443 oss <<
file_name <<
"." << setfill(
'0') << setw(num_digits) <<
i;
446 volume_names.push_back(vol_name);
447 concatenated_blastdb_name += vol_name +
" ";
455 fname <<
file_name << (is_prot ?
".pal" :
".nal");
461 if ( !title.empty() ) {
462 out <<
"TITLE " << title <<
"\n";
466 ITERATE(vector<string>, itr, volume_names) {
470 out <<
"NSEQ " << num_seqs <<
"\n";
471 out <<
"LENGTH " << dbsize <<
"\n";
477 const vector<string>& databases,
479 const string& gi_file_name,
487 const vector<string>& db_names,
490 const string& title )
498 bool delete_source_alias_files )
500 if (alias_files.empty()) {
503 "No alias files available to create group alias file.");
512 ITERATE(list<string>, itr, alias_files) {
513 ifstream
in(itr->c_str());
515 LOG_POST(
Warning << *itr <<
" does not exist, omitting from group alias file");
520 while (getline(
in, line)) {
530 if (delete_source_alias_files) {
531 ITERATE(list<string>, itr, alias_files) {
533 _TRACE(
"Deleting " << *itr);
541 list<string> alias_files;
549 const string & output_db,
552 const string & title)
566 if(vols.size() == 0) {
571 string out_ext = is_protein?
".pal":
".nal";
573 ofstream ofs(output_db + out_ext);
574 ofs <<
"TITLE " << title <<endl;
576 for (
unsigned int i = 0;
i < vols.size();
i++) {
580 string DBList =
"DBLIST " + v_basename;
581 string OidList =
"OIDLIST ";
584 string full_path = vols[
i] + ex_model_ext;
589 OidList +=
f.GetName();
592 if (vols.size() > 1) {
593 oss << output_db <<
"." << setfill(
'0') << setw(num_digits) <<
i << out_ext;
595 ovs << DBList << endl;
596 ovs << OidList << endl;
597 ovs <<
"OID_MASK_TYPE " << oid_mask_type << endl;
600 ofs << DBList << endl;
601 ofs << OidList << endl;
602 ofs <<
"OID_MASK_TYPE " << oid_mask_type << endl;
606 if (vols.size() > 1) {
609 for (
unsigned int i = 0;
i < vols.size();
i++) {
610 oss <<
" " << output_db <<
"." << setfill(
'0') << setw(num_digits) <<
i;
616 Uint8 total_length = 0;
622 ofs <<
"NSEQ " << num_seqs << endl;
623 ofs <<
"LENGTH " << total_length << endl;
TContainerType m_Ids
List of identifiers to use.
CBinaryListBuilder(EIdType id_type)
Construct a list of a given type.
void Write(const string &fname)
Write the list to a file.
EIdType m_IdType
Whether to use GIs or TIs.
`Blob' Class for SeqDB (and WriteDB).
This represents a set of masks for a given sequence.
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string. Sample usage:
CSeqDB_Substring FindBaseName() const
Returns the portion of this path containing the base name.
void GetString(string &s) const
Return the data by assigning it to a string.
static void FindVolumePaths(const string &dbname, ESeqType seqtype, vector< string > &paths, vector< string > *alias_paths=NULL, bool recursive=true, bool expand_links=true)
Find volume paths.
Uint8 GetTotalLength() const
Returns the sum of the lengths of all available sequences.
ESeqType
Sequence types (eUnknown tries protein, then nucleotide).
void GetTotals(ESummaryType sumtype, int *oid_count, Uint8 *total_length, bool use_approx=true) const
Returns the sum of the sequence lengths.
int GetNumSeqs() const
Returns the number of sequences available.
@ eFilteredAll
Values from alias files, or summation over all included sequences.
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
int CreateColumn(const string &title, bool mbo=false)
Set up a generic CWriteDB metadata column.
void SetMaskedLetters(const string &masked)
Set bases that should not be used in sequences.
void SetPig(int pig)
Set the PIG identifier of this sequence.
void AddColumnMetaData(int col_id, const string &key, const string &value)
Add meta data to a column.
void ListFiles(vector< string > &files)
List Filenames.
void AddSequence(const CTempString &sequence, const CTempString &ambiguities)
Add a new sequence as raw sequence and ambiguity data.
void Close()
Close the file and flush any remaining data to disk.
void SetMaxFileSize(Uint8 sz)
Set the maximum size for any file in the database.
static CRef< CBlast_def_line_set > ExtractBioseqDeflines(const CBioseq &bs, bool parse_ids, bool long_seqids, bool scan_bioseq_4_cfastareader_usrobj=false)
Extract deflines from a CBioseq.
void SetMaskData(const CMaskedRangesVector &ranges, const vector< TGi > &gis)
Set filtering data for a sequence.
void SetDeflines(const CBlast_def_line_set &deflines)
This method replaces any stored header data for the current sequence with the provided CBlast_def_lin...
void ListVolumes(vector< string > &vols)
List Volumes.
int FindColumn(const string &title) const
Find an existing column.
int RegisterMaskAlgorithm(EBlast_filter_program program, const string &options, const string &name="")
Register a type of filtering data found in this database.
CBlastDbBlob & SetBlobData(int col_id)
Get a blob to use for a given column letter.
void SetMaxVolumeLetters(Uint8 sz)
Set the maximum letters in one volume.
int CreateUserColumn(const string &title)
Set up a user-defined CWriteDB column.
void SetMaxFileSize(Uint8 sz)
Set maximum size for output files.
@ eProtein
Protein database.
int RegisterMaskAlgorithm(EBlast_filter_program program, const string &options=string(), const string &name=string())
Register a type of filtering data found in this database.
CWriteDB(const string &dbname, ESeqType seqtype, const string &title, int itype=eDefault, bool parse_ids=true, bool long_ids=false, bool use_gi_mask=false, EBlastDbVersion dbver=eBDB_Version4, bool limit_defline=false, Uint8 oid_masks=EOidMaskType::fNone, bool scan_bioseq_4_cfastareader_usrobj=false)
Constructor.
void AddColumnMetaData(int col_id, const string &key, const string &value)
Add meta data to a user-defined column.
int FindColumn(const string &title) const
Find an existing column.
void ListFiles(vector< string > &files)
List Filenames.
CBlastDbBlob & SetBlobData(int column_id)
Add blob data to a user-defined column.
void SetMaskData(const CMaskedRangesVector &ranges, const vector< TGi > &gis)
Set filtering data for a sequence.
void SetPig(int pig)
Set the PIG to be used for the sequence.
void AddSequence(const CBioseq &bs)
Add a sequence as a CBioseq.
void SetMaxVolumeLetters(Uint8 letters)
Set maximum letters for output volumes.
CWriteDB_Impl * m_Impl
Implementation object.
EIndexType
Whether and what kind of indices to build.
void SetMaskedLetters(const string &masked)
Set letters that should not be used in sequences.
void ListVolumes(vector< string > &vols)
List Volumes.
static CRef< CBlast_def_line_set > ExtractBioseqDeflines(const CBioseq &bs, bool parse_ids=true, bool long_ids=false, bool scan_bioseq_4_cfastareader_usrobj=false)
Extract Deflines From Bioseq.
void SetDeflines(const CBlast_def_line_set &deflines)
Set the deflines to be used for the sequence.
void Close()
Close the Database.
std::ofstream out("events_result.xml")
main entry point for tests
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NCBI_CURRENT_FUNCTION
Get current function name.
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
void Warning(CExceptionArgs_Base &args)
void FindFiles(TPathIterator path_begin, TPathIterator path_end, const vector< string > &masks, TFindFunc &find_func, TFindFiles flags=fFF_Default)
Generic algorithm for file search.
virtual bool Remove(TRemoveFlags flags=eRecursive) const
Remove a directory entry.
static string GetCwd(void)
Get the current working directory.
string GetName(void) const
Get the base entry name with extension (if any).
void Reset(void)
Reset reference object.
int32_t Int4
4-byte (32-bit) signed integer
int64_t Int8
8-byte (64-bit) signed integer
uint64_t Uint8
8-byte (64-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string (in-place)
const char * data(void) const
Return a pointer to the array represented.
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
size_type length(void) const
Return the length of the represented array.
string AsString(const CTimeFormat &format=kEmptyStr, TSeconds out_tz=eCurrentTimeZone) const
Transform time to string.
@ eCurrent
Use current time. See also CCurrentTime.
EBlast_filter_program
This defines the possible sequence filtering algorithms to be used in a BLAST database.
char * dbname(DBPROCESS *dbproc)
Get name of current database.
constexpr auto sort(_Init &&init)
const struct ncbi::grid::netcache::search::fields::KEY key
const GenericPointer< typename T::ValueType > T2 value
std::istream & in(std::istream &in_, double &x_)
Defines BLAST database access classes.
const string kSeqDBGroupAliasFileName
The name of the group alias file expected in each directory. For more documentation,...
const string SeqDB_GetOidMaskFileExt(bool db_is_protein, EOidMaskType t)
EBlastDbVersion
BLAST database version.
string SeqDB_ResolveDbPathNoExtension(const string &filename, char dbtype='-')
Resolve a file path using SeqDB's path algorithms.
This file defines several SeqDB utility functions related to byte order and file system portability.
static SLJIT_INLINE sljit_ins msg(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
static bool ambig(char c)
static string s_AliasFileFilterTypeToString(EAliasFileFilterType e)
Auxiliary function to convert the enumeration into a string.
int s_GetNumOfDigits(int n)
static bool s_ComputeNumSequencesAndDbLength(const string &dbname, bool is_prot, Uint8 *dbsize, int *num_seqs_found)
Computes the number of sequences and (alias) database length for alias files.
void CWriteDB_CreateOidMaskDB(const string &input_db, const string &output_db, CWriteDB::ESeqType seq_type, int oid_mask_type, const string &title)
static void s_PrintAliasFileCreationLog(const string &dbname, bool is_protein, int num_seqs_found, const string &gi_file_name=kEmptyStr, int num_seqs_in_gifile=0)
void CWriteDB_CreateAliasFile(const string &file_name, const string &db_name, CWriteDB::ESeqType seq_type, const string &gi_file_name, const string &title, EAliasFileFilterType alias_type)
void CWriteDB_ConsolidateAliasFiles(const list< string > &alias_files, bool delete_source_alias_files)
static void s_CreateAliasFilePriv(const string &file_name, const vector< string > &databases, CWriteDB::ESeqType seq_type, const string &gi_file_name, const string &title, EAliasFileFilterType alias_type, const TSeqRange *oid_range=NULL)
static bool s_DoesBlastDbExist(const string &dbname, bool is_protein)
Returns true if the BLAST DB exists, otherwise throws a CSeqDBException.
Defines BLAST database construction classes.
EAliasFileFilterType
Defines the possible filtering types that can be applied to an alias file.
@ eTiList
Filter a BLAST database via TIs (Trace IDs)
@ eSeqIdList
Filter a BLAST database via a Seq-id list.
@ eTaxIdList
Filter a BLAST database via Taxonomy Id list.
@ eGiList
Filter a BLAST database via GIs.
@ eNoAliasFilterType
Sentinel value.
Data conversion tools for CWriteDB and associated code.
void s_WriteInt8BE(ostream &str, Uint8 x)
Write an eight byte integer to a stream in big-endian format.
void s_WriteInt4(ostream &str, int x)
Write a four byte integer to a stream in big endian format.
Defines implementation class of WriteDB.