43 #include "../libindexdb_new/sequence_istream_fasta.hpp"
44 #include "../libindexdb_new/sequence_istream_bdb.hpp"
45 #include "../libindexdb_new/dbindex.hpp"
62 "Create a BLAST database index.";
68 arg_desc->SetUsageContext(
69 GetArguments().GetProgramBasename(), USAGE_LINE );
70 arg_desc->AddOptionalKey(
71 "input",
"input_file_name",
"input file name",
73 arg_desc->AddOptionalKey(
74 "output",
"output_file_name",
"output file name",
76 arg_desc->AddDefaultKey(
77 "verbosity",
"reporting_level",
"how much to report",
79 arg_desc->AddOptionalKey(
80 "iformat",
"input_format",
81 "type of input used (default is \"blastdb\" for new style index, "
82 "\"fasta\" for old style index)",
84 arg_desc->AddDefaultKey(
85 "legacy",
"use_legacy_index_format",
86 "use legacy (0-terminated offset lists) dbindex format",
88 arg_desc->AddDefaultKey(
89 "idmap",
"generate_idmap",
90 "generate id map for the sequences in the index",
92 arg_desc->AddOptionalKey(
93 "db_mask",
"filtering_algorithm",
94 "use the specified filtering algorithm from BLAST DB",
98 "show the info about available database filtering algorithms"
101 arg_desc->AddOptionalKey(
103 "length of the indexed words",
105 arg_desc->AddOptionalKey(
106 "ws_hint",
"word_size_hint",
107 "most likely word size used in searches",
109 arg_desc->AddOptionalKey(
110 "volsize",
"volume_size",
"size of an index volume in MB",
112 arg_desc->AddOptionalKey(
113 "stat",
"statistics_file",
114 "write index statistics into file with that name "
115 "(for testing and debugging purposes only).",
117 arg_desc->AddOptionalKey(
119 "distance between stored database positions",
121 arg_desc->AddDefaultKey(
122 "old_style_index",
"boolean",
123 "Use old style index (deprecated)",
125 arg_desc->SetConstraint(
128 arg_desc->SetConstraint(
131 arg_desc->SetConstraint(
134 arg_desc->SetConstraint(
137 arg_desc->SetConstraint(
140 arg_desc->SetConstraint(
143 arg_desc->SetDependency(
145 arg_desc->SetDependency(
147 SetupArgDescriptions( arg_desc.release() );
157 bool old_style(
GetArgs()[
"old_style_index"].AsBoolean() );
183 "legacy index creation" );
191 "legacy index creation" );
194 unsigned long ws_hint =
GetArgs()[
"ws_hint"].AsInteger();
199 "to the minimum value of " << ws_hint );
206 unsigned int vol_num = 0;
215 string iformat(
GetArgs()[
"iformat"] ?
GetArgs()[
"iformat"].AsString()
216 : old_style ?
"fasta" :
"blastdb" );
218 if( !old_style && iformat ==
"fasta" ) {
219 ERR_POST(
Error <<
"new style index requires input format 'blastdb'" );
223 if( iformat ==
"fasta" ) {
231 (
GetArgs()[
"input"].AsString() ) );
234 }
else if( iformat ==
"blastdb" ) {
236 if(
GetArgs()[
"show_filters"] ) {
238 GetArgs()[
"input"].AsString() ) << endl;
245 GetArgs()[
"input"].AsString(),
true,
246 GetArgs()[
"db_mask"].AsString() );
250 GetArgs()[
"input"].AsString(),
false, 0 );
255 ERR_POST(
Error <<
"input format 'blastdb' requires -input option" );
262 if( iformat !=
"blastdb" &&
264 GetArgs()[
"db_mask"].AsString() !=
"" ) {
265 ERR_POST(
Error <<
"option 'db_mask' requires input format 'blastdb'" );
269 if( !old_style && iformat ==
"blastdb" ) {
272 "option 'output' is ignored for new style indices" );
275 typedef std::vector< std::string > TStrVec;
285 bool enable_mask(
GetArgs()[
"db_mask"] );
286 string filter( enable_mask ?
GetArgs()[
"db_mask"].AsString() :
"" );
288 ITERATE( TStrVec, dbvi, db_vols ) {
292 Uint4 vol_num_seq( 0 );
299 Uint4 num_seq( 0 ), num_vol( 0 );
311 os << dbv_name <<
"." << setfill(
'0' ) << setw( 2 )
312 << vol_num++ <<
".idx";
313 cerr <<
"creating " << os.str() <<
"..." << flush;
315 *seqstream, os.str(), start, stop, options );
316 num_seq += (stop - start);
318 if( start == stop ) cerr <<
"removed (empty)" << endl;
321 cerr <<
"done" << endl;
323 "generated index volume with OIDs: " <<
324 start <<
"--" << stop );
327 while( start != stop );
329 if( num_seq != vol_num_seq ) {
331 "number of sequence reported by BLAST database"
332 " volume (" << vol_num_seq <<
") is not the same"
333 " as in the index (" << num_seq <<
")" );
340 shdr.Save( dbv_name +
".shd" );
342 "index generated for BLAST database volume " <<
343 dbv_name <<
" with " << num_seq <<
" sequences" );
350 Uint4 num_seq( 0 ), num_vol( 0 );
358 os << ofname_base <<
"." << setfill(
'0' ) << setw( 2 )
359 << vol_num++ <<
".idx";
360 cerr <<
"creating " << os.str() <<
"..." << flush;
363 os.str(), start, stop, options );
364 num_seq += (stop - start);
366 if( start == stop ) cerr <<
"removed (empty)" << endl;
367 else{ ++num_vol; cerr <<
"done" << endl; }
368 }
while( start != stop );
374 shdr.Save( ofname_base +
".shd" );
static SOptions DefaultSOptions()
Creates an SOptions instance initialized with default values.
static void MakeIndex(const std::string &fname, const std::string &oname, TSeqNum start, TSeqNum start_chunk, TSeqNum &stop, TSeqNum &stop_chunk, const SOptions &options)
Create an index object.
CSequenceIStream::TStreamPos TSeqNum
Type used to enumerate sequences in the index.
static const char *const USAGE_LINE
String containing program usage information.
virtual int Run()
Application main procedure.
virtual void Init()
Application initialization.
static void FindVolumePaths(const string &dbname, ESeqType seqtype, vector< string > &paths, vector< string > *alias_paths=NULL, bool recursive=true, bool expand_links=true)
Find volume paths.
int GetNumOIDs() const
Returns the size of the (possibly sparse) OID range.
Sequence stream that reads BLAST nucleotide databases.
static string ShowSupportedFilters(const string &dbname)
Report on supported subject filter algorithms.
Sequence stream for reading FASTA formatted files.
Class used to abstract reading nucleotide sequences from various sources.
const unsigned long REPORT_QUIET
No progress reporting.
const unsigned long REPORT_VERBOSE
Verbose reporting.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
@ eRequires
One argument requires another.
@ eExcludes
One argument excludes another.
@ eBoolean
{'true', 't', 'false', 'f'}, case-insensitive
@ eString
An arbitrary string.
@ eInteger
Convertible into an integer number (int or Int8)
EDiagSev SetDiagPostLevel(EDiagSev post_sev=eDiag_Error)
Set the threshold severity for posting the messages.
#define ERR_POST(message)
Error posting with file and line number information, but without error codes.
@ eDiag_Warning
Warning message.
void Error(CExceptionArgs_Base &args)
void Warning(CExceptionArgs_Base &args)
void Info(CExceptionArgs_Base &args)
uint32_t Uint4
4-byte (32-bit) unsigned integer
USING_SCOPE(blastdbindex)
#define ASSERT
Macro for assert.
#define GetArgs
Avoid preprocessor name clash with the NCBI C Toolkit.
Simple record type used to specify index creation parameters.
bool legacy
Indicator of the legacy index format.
unsigned long report_level
Verbose index creation.
unsigned long max_index_size
Maximum index size in megabytes.
std::string stat_file_name
File to write index statistics into.
unsigned long ws_hint
Most likely word size to use for searches.
bool idmap
Indicator of the index map creation.
unsigned long hkey_width
Width of the hash key in bits.
unsigned long stride
Stride to use for stored database locations.