60 #include "../blast/blast_app_util.hpp"
67 #ifndef SKIP_DOXYGEN_PROCESSING
118 return (
a->GetId() <
b->GetId());
123 return (
a->GetRefSeqOid() <
b->GetRefSeqOid());
138 vector<SBlastDbMaskData> & mask_range,
139 vector<int> & column_ids,
140 vector<CTempString> & column_blobs);
142 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
143 (!defined(NCBI_COMPILER_MIPSPRO)) )
164 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
165 (!defined(NCBI_COMPILER_MIPSPRO)) )
175 : m_Source(source_db), m_Clusters(cluster), m_CurrentCluster(0)
177 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
178 (!defined(NCBI_COMPILER_MIPSPRO)) )
183 string algo_opts, algo_name;
214 vector<SBlastDbMaskData> & mask_range,
215 vector<int> & column_ids,
216 vector<CTempString> & column_blobs)
232 const char * seq_ptr;
233 int slength(0), alength(0);
238 ambiguities =
CTempString(seq_ptr + slength, alength);
252 if (ref_seqid.
Match(**seqid)) {
261 const vector<CRef<CClusterSeq> > & mem_seqs = cluster->
GetMemSeqs();
262 if (mem_seqs.size() > 0) {
263 vector<blastdb::TOid> mem_oids;
264 for (
unsigned int i=0;
i < mem_seqs.size();
i++) {
265 int64_t mem_oid = mem_seqs[
i]->GetOid();
270 mem_oids.push_back(mem_oid);
272 std::sort(mem_oids.begin(), mem_oids.end());
277 vector<CBlast_def_line::TTaxid> diff_ts;
278 diff_ts.resize(taxids.
size());
279 vector<CBlast_def_line::TTaxid>::iterator diff_ts_itr;
281 diff_ts_itr = std::set_difference(taxids.
begin(), taxids.
end(), ref_ts.
begin(), ref_ts.
end(), diff_ts.begin());
282 diff_ts.resize(diff_ts_itr - diff_ts.begin());
283 if (diff_ts.size() > 0) {
289 bf->SetLeafTaxIds(leaf_ts);
293 deflines->
Set().push_back(bf);
295 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
296 (!defined(NCBI_COMPILER_MIPSPRO)) )
307 mask_data.
offsets.push_back(pair<TSeqPos, TSeqPos>(
range->first,
range->second));
310 mask_range.push_back(mask_data);
316 column_blobs.resize(column_ids.size());
317 m_Blobs.resize(column_ids.size());
319 for(
int i = 0;
i < (
int)column_ids.size();
i++) {
393 arg_desc->SetUsageContext(
GetArguments().GetProgramBasename(),
394 "Application to create BLAST databases, version "
397 arg_desc->SetCurrentGroup(
"Input options");
398 arg_desc->AddDefaultKey(
kInput,
"input_file",
401 arg_desc->AddDefaultKey(
kArgDb,
"source_db",
405 arg_desc->AddDefaultKey(
kArgDbType,
"molecule_type",
410 arg_desc->SetCurrentGroup(
"Configuration options");
411 arg_desc->AddOptionalKey(
kArgDbTitle,
"database_title",
412 "Title for BLAST database\n",
415 arg_desc->SetCurrentGroup(
"Output options");
416 arg_desc->AddOptionalKey(
kOutput,
"database_name",
417 "Name of BLAST database to be created\n",
419 arg_desc->AddDefaultKey(
"max_file_sz",
"number_of_bytes",
420 "Maximum file size for BLAST database files",
422 arg_desc->AddOptionalKey(
"metadata_output_prefix",
"",
424 arg_desc->AddOptionalKey(
"logfile",
"File_Name",
425 "File to which the program log should be redirected",
428 arg_desc->AddFlag(
"verbose",
"Produce verbose output",
true);
439 static string kMods =
"KMGTPEZY";
442 for(
i = 0;
i < kMods.size();
i++) {
443 if (v <
Uint8(minprec)*1024) {
451 rv.append(kMods,
i, 1);
463 unsigned int cluster_id = 0;
464 while (input_stream) {
465 getline(input_stream, line);
466 if(line.empty() || (line.find_first_not_of(
' ') == std::string::npos)) {
472 if (cols.size() < 3) {
475 string ref_id(cols[0]);
484 string mem_id(cols[1]);
485 if (ref_id != mem_id) {
492 LOG_POST(
Info <<
"Num of Reference Seqs: " << cluster_id);
500 vector<blastdb::TOid> oids;
507 accs.push_back((*itr)->GetId());
527 bool is_protein = (args[
kArgDbType].AsString() ==
"prot");
535 m_LogFile = & (args[
"logfile"].HasValue() ? args[
"logfile"].AsOutputFile() : cout);
541 bool long_seqids =
true;
542 bool limit_defline =
false;
546 if (args[
"verbose"]) {
552 static const Uint8 MAX_VOL_FILE_SIZE = 0x100000000;
553 if (bytes >= MAX_VOL_FILE_SIZE) {
568 #ifdef METADATA_CLUSTERDB
573 string output_prefix = args[
"metadata_output_prefix"]
574 ? args[
"metadata_output_prefix"].AsString() :
kEmptyStr;
582 string metadata_filename = new_db +
"." + extn;
583 ofstream
out(metadata_filename.c_str());
586 json_out->PreserveKeyNames();
588 json_out->WriteObject(obj_info);
618 #ifndef SKIP_DOXYGEN_PROCESSING
619 int main(
int argc,
const char* argv[] )
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Declares singleton objects to store the version and reference for the BLAST engine.
#define CATCH_ALL(exit_code)
Standard catch statement for all BLAST command line programs.
Code to build a database given various sources of sequence data.
void AddParam(EUsageParams p, int val)
Keeps track of the version of the BLAST engine in the NCBI C++ toolkit.
Build BlastDB format databases from various data sources.
bool AddSequences(IBioseqSource &src, bool add_pig=false)
Add sequences from an IBioseqSource object.
void SetVerbosity(bool v)
Specify level of output verbosity.
int RegisterMaskingAlgorithm(EBlast_filter_program program, const string &options, const string &name="")
Define a masking algorithm.
string GetOutputDbName() const
bool EndBuild(bool erase=false)
Finish building a new database.
void SetMaxFileSize(Uint8 max_file_size)
Set the maximum size of database component files.
void SetLeafTaxIds(const TIdToLeafs &taxids, bool keep_taxids)
Specify a leaf-taxids object.
virtual bool GetNext(CTempString &sequence, CTempString &ambiguities, CRef< CBlast_def_line_set > &deflines, vector< SBlastDbMaskData > &mask_range, vector< int > &column_ids, vector< CTempString > &column_blobs)
uint64_t m_CurrentCluster
virtual void GetColumnNames(vector< string > &names)
Get the names of all columns defined by this sequence source.
CRef< CSeqDBExpert > m_Source
vector< string > m_ColumnNames
CClusterDBSource(CRef< CSeqDBExpert > &source_db, vector< CRef< CCluster > > &clusters, CBuildDatabase *outdb)
virtual ~CClusterDBSource()
vector< CBlastDbBlob > m_Blobs
virtual int GetColumnId(const string &name)
Get the column ID for a column mentioned by name.
virtual const map< string, string > & GetColumnMetaData(int id)
Get metadata for the column with the specified Column ID.
vector< int > m_ColumnIds
vector< CRef< CCluster > > & m_Clusters
map< int, int > m_MaskIdMap
const string & GetId() const
CRef< CCluster > & GetCluster()
CClusterSeq(CRef< CCluster > cluster, const string &id, bool is_refseq)
CRef< CCluster > m_Cluster
void AddMemSeq(CRef< CClusterSeq > &m)
const string & GetRefSeqId()
void SetRefSeq(CRef< CClusterSeq > &r)
CCluster(unsigned int cluster_id)
const vector< CRef< CClusterSeq > > & GetMemSeqs()
CRef< CClusterSeq > m_RefSeq
unsigned int GetClusterId()
vector< CRef< CClusterSeq > > m_MemSeqs
CRef< CClusterSeq > & GetRefSeq()
Defines invalid user input exceptions.
The main application class.
vector< CRef< CCluster > > m_Clusters
CRef< CBuildDatabase > m_DB
void x_ProcessInputData(const string &source_db, bool is_protein)
vector< CRef< CClusterSeq > > m_ClusterSeqs
CMakeClusterDBApp()
@inheritDoc
virtual void Init()
@inheritDoc
virtual int Run()
@inheritDoc
void x_ProcessInputFile(const string &input_file)
CRef< CSeqDBExpert > m_SourceDB
CBlastUsageReport m_UsageReport
void GetRawSeqAndAmbig(int oid, const char **buffer, int *seq_length, int *ambig_length) const
Raw Sequence and Ambiguity Data.
void GetColumnBlob(int col_id, int oid, CBlastDbBlob &blob)
Fetch the data blob for the given column and oid.
void GetAvailableMaskAlgorithms(vector< int > &algorithms)
Get a list of algorithm IDs for which mask data exists.
ESeqType
Sequence types (eUnknown tries protein, then nucleotide).
void GetTaxIdsForOids(const vector< blastdb::TOid > &oids, set< TTaxId > &tax_ids) const
void GetMaskAlgorithmDetails(int algorithm_id, objects::EBlast_filter_program &program, string &program_name, string &algo_opts)
Get information about one type of masking available here.
void RetSequence(const char **buffer) const
Returns any resources associated with the sequence.
CRef< CBlast_db_metadata > GetDBMetaData(string user_path=kEmptyStr)
void ListColumns(vector< string > &titles)
List columns titles found in this database.
bool CheckOrFindOID(int &next_oid) const
Find an included OID, incrementing next_oid if necessary.
int GetColumnId(const string &title)
Get an ID number for a given column title.
CRef< CBlast_def_line_set > GetHdr(int oid) const
Get the ASN.1 header for the sequence.
void AccessionsToOids(const vector< string > &accs, vector< blastdb::TOid > &oids) const
void GetMaskData(int oid, const vector< int > &algo_ids, TSequenceRanges &ranges)
Get masked ranges of a sequence.
const map< string, string > & GetColumnMetaData(int column_id)
Get all metadata for the specified column.
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Interface to a source of raw sequence data.
iterator_bool insert(const value_type &val)
const_iterator begin() const
const_iterator end() const
const string kArgDbTitle
Title for the BLAST database.
const string kArgDbType
BLAST database molecule type.
const string kArgDb
BLAST database name.
void Print(const CCompactSAMApplication::AlignInfo &ai)
API (CDeflineGenerator) for computing sequences' titles ("definitions").
static const struct name_t names[]
std::ofstream out("events_result.xml")
main entry point for tests
void SetFullVersion(CRef< CVersionAPI > version)
Set version data for the program.
void HideStdArgs(THideStdArgs hide_mask)
Set the hide mask for the Hide Std Flags.
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
CVersionInfo GetVersion(void) const
Get the program version information.
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
@ fHideXmlHelp
Hide XML help description.
@ fHideFullVersion
Hide full version description.
@ fHideDryRun
Hide dryrun description.
@ fHideConffile
Hide configuration file description.
@ eNoOwnership
No ownership is assumed.
@ fAppend
Append to end-of-file; for eOutputFile or eIOFile.
@ eInputFile
Name of file (must exist and be readable)
@ eString
An arbitrary string.
@ eOutputFile
Name of file (must be writable)
void SetDiagPostPrefix(const char *prefix)
Specify a string to prefix all subsequent error postings with.
EDiagSev SetDiagPostLevel(EDiagSev post_sev=eDiag_Error)
Set the threshold severity for posting the messages.
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
@ eDiag_Warning
Warning message.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
void Warning(CExceptionArgs_Base &args)
void Info(CExceptionArgs_Base &args)
static char GetPathSeparator(void)
Get path separator symbol specific for the current platform.
bool Match(const CSeq_id &sid2) const
Match() - TRUE if SeqIds are equivalent.
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
TObjectType * GetPointer(void) THROWS_NONE
Get pointer.
void Reset(void)
Reset reference object.
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
uint64_t Uint8
8-byte (64-bit) unsigned integer
bool IsEnabled(void)
Indicates whether application usage statistics collection is enabled for a current reporter instance.
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static Uint8 StringToUInt8_DataSize(const CTempString str, TStringToNumFlags flags=0)
Convert string that can contain "software" qualifiers to Uint8.
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
static string UInt8ToString(Uint8 value, TNumToStringFlags flags=0, int base=10)
Convert UInt8 to string.
double Elapsed(void) const
Return time elapsed since first Start() or last Restart() call (in seconds).
void Start(void)
Start the timer.
EBlast_filter_program
This defines the possible sequence filtering algorithms to be used in a BLAST database.
Tdata & Set(void)
Assign a value to data member.
list< CRef< CBlast_def_line > > Tdata
@ eBlast_filter_program_other
unsigned int
A callback function used to compare two keys in a database.
static const string kOutput("out")
Command line flag to represent the output.
static string Uint8ToString_DataSize(Uint8 v, unsigned minprec=10)
Converts a Uint8 into a string which contains a data size (converse to NStr::StringToUInt8_DataSize)
int main(int argc, const char *argv[])
bool SortClusterSeqs(CRef< CClusterSeq > &a, CRef< CClusterSeq > &b)
static const string kInputSeparators(" ")
Defines token separators when multiple inputs are present.
static const string kInput("in")
Command line flag to represent the input.
bool SortCluster(CRef< CCluster > &a, CRef< CCluster > &b)
range(_Ty, _Ty) -> range< _Ty >
constexpr auto sort(_Init &&init)
constexpr bool empty(list< Ts... >) noexcept
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
Defines BLAST database access classes.
Defines exception class and several constants for SeqDB.
Int4 TOid
Ordinal ID in BLAST databases.
void SeqDB_GetMetadataFileExtension(bool db_is_protein, string &extn)
unsigned __int64 uint64_t
List of sequence offset ranges.
Structure describing filtered regions created using a particular sequence filtering algorithm.
int algorithm_id
Identifies the algorithm used.
vector< pair< TSeqPos, TSeqPos > > offsets
Start and end offsets of the filtered area.
Defines BLAST database construction classes.
Defines exception class for WriteDB.