NCBI C++ ToolKit
mkindex_app.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: mkindex_app.cpp 91979 2020-12-17 15:26:53Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Aleksandr Morgulis
27  *
28  * File Description:
29  * Implementation of class CMkIndexApplication.
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 
35 #include "mkindex_app.hpp"
36 
37 #include <memory>
38 #include <string>
39 #include <sstream>
40 
41 #ifdef LOCAL_SVN
42 
43 #include "../libindexdb_new/sequence_istream_fasta.hpp"
44 #include "../libindexdb_new/sequence_istream_bdb.hpp"
45 #include "../libindexdb_new/dbindex.hpp"
46 
47 #else
48 
52 
53 #endif
54 
55 using namespace std;
56 
58 USING_SCOPE( blastdbindex );
59 
60 //------------------------------------------------------------------------------
// One-line description of the program; it is handed to
// CArgDescriptions::SetUsageContext() in the argument setup code below.
61 const char * const CMkIndexApplication::USAGE_LINE =
62  "Create a BLAST database index.";
63 
64 //------------------------------------------------------------------------------
66 {
    // Declares the command line interface of the application: creates a
    // CArgDescriptions object, registers every key/flag, attaches value
    // constraints and inter-argument dependencies, and finally passes the
    // object to SetupArgDescriptions().
    //
    // NOTE(review): this listing is an extract. The embedded line numbers
    // skip (65, 72, 75, 83, 87, 95, 104, 108, 111, 116, 120), so the function
    // signature and the last line of several Add*Key() calls (presumably the
    // value type and closing parenthesis) are not visible here -- confirm
    // against the real source before editing.
67  unique_ptr< CArgDescriptions > arg_desc( new CArgDescriptions );
68  arg_desc->SetUsageContext(
69  GetArguments().GetProgramBasename(), USAGE_LINE );
70  arg_desc->AddOptionalKey(
71  "input", "input_file_name", "input file name",
73  arg_desc->AddOptionalKey(
74  "output", "output_file_name", "output file name",
76  arg_desc->AddDefaultKey(
77  "verbosity", "reporting_level", "how much to report",
78  CArgDescriptions::eString, "normal" );
79  arg_desc->AddOptionalKey(
80  "iformat", "input_format",
81  "type of input used (default is \"blastdb\" for new style index, "
82  "\"fasta\" for old style index)",
84  arg_desc->AddDefaultKey(
85  "legacy", "use_legacy_index_format",
86  "use legacy (0-terminated offset lists) dbindex format",
88  arg_desc->AddDefaultKey(
89  "idmap", "generate_idmap",
90  "generate id map for the sequences in the index",
91  CArgDescriptions::eBoolean, "false" );
92  arg_desc->AddOptionalKey(
93  "db_mask", "filtering_algorithm",
94  "use the specified filtering algorithm from BLAST DB",
96  arg_desc->AddFlag(
97  "show_filters",
98  "show the info about available database filtering algorithms"
99  " and exit",
100  true );
101  arg_desc->AddOptionalKey(
102  "nmer", "nmer_size",
103  "length of the indexed words",
105  arg_desc->AddOptionalKey(
106  "ws_hint", "word_size_hint",
107  "most likely word size used in searches",
109  arg_desc->AddOptionalKey(
110  "volsize", "volume_size", "size of an index volume in MB",
112  arg_desc->AddOptionalKey(
113  "stat", "statistics_file",
114  "write index statistics into file with that name "
115  "(for testing and debugging purposes only).",
117  arg_desc->AddOptionalKey(
118  "stride", "stride",
119  "distance between stored database positions",
121  arg_desc->AddDefaultKey(
122  "old_style_index", "boolean",
123  "Use old style index (deprecated)",
124  CArgDescriptions::eBoolean, "false" );
    // Value constraints: "verbosity" and "iformat" accept only the listed
    // strings; "volsize", "stride" and "ws_hint" must lie in [1, kMax_Int];
    // "nmer" must lie in [8, 15].
125  arg_desc->SetConstraint(
126  "verbosity",
127  &(*new CArgAllow_Strings, "quiet", "normal", "verbose") );
128  arg_desc->SetConstraint(
129  "iformat",
130  &(*new CArgAllow_Strings, "fasta", "blastdb") );
131  arg_desc->SetConstraint(
132  "volsize",
133  new CArgAllow_Integers( 1, kMax_Int ) );
134  arg_desc->SetConstraint(
135  "stride",
136  new CArgAllow_Integers( 1, kMax_Int ) );
137  arg_desc->SetConstraint(
138  "ws_hint",
139  new CArgAllow_Integers( 1, kMax_Int ) );
140  arg_desc->SetConstraint(
141  "nmer",
142  new CArgAllow_Integers( 8, 15 ) );
    // Dependencies: "show_filters" may not be combined with "output", and
    // "db_mask" can only be given together with "input".
143  arg_desc->SetDependency(
144  "show_filters", CArgDescriptions::eExcludes, "output" );
145  arg_desc->SetDependency(
146  "db_mask", CArgDescriptions::eRequires, "input" );
    // release() gives up the unique_ptr's ownership; the raw pointer is
    // passed on (presumably SetupArgDescriptions() takes ownership -- confirm).
147  SetupArgDescriptions( arg_desc.release() );
148 }
149 
150 //------------------------------------------------------------------------------
152 {
    // Main procedure of the application (presumably CMkIndexApplication::Run;
    // the signature on listing line 151 is not part of this extract).
    // Copies the parsed command line arguments into `options`, picks the
    // input stream type, then writes one or more "<base>.NN.idx" index
    // volumes. Returns 0 on success and 1 when the number of indexed
    // sequences disagrees with the BLAST database volume; several argument
    // errors terminate the process through exit( 1 ) instead of returning.
    //
    // NOTE(review): listing lines 153-154, 237, 314, 337-338, 361 and 371-372
    // are absent from this extract. Among them is the declaration of
    // `options` and the statements that start the index-creation calls and
    // the `shdr` objects -- confirm against the real source.
155  std::string verbosity = GetArgs()["verbosity"].AsString();
156 
157  bool old_style( GetArgs()["old_style_index"].AsBoolean() );
158 
    // Only "quiet" and "verbose" change report_level; for "normal" the value
    // already stored in `options` is kept.
159  if( verbosity == "quiet" ) {
160  options.report_level = REPORT_QUIET;
161  }else if( verbosity == "verbose" ) {
162  options.report_level = REPORT_VERBOSE;
163  }
164 
    // Optional arguments override the corresponding option only when given.
165  if( GetArgs()["volsize"] ) {
166  options.max_index_size = GetArgs()["volsize"].AsInteger();
167  }
168 
169  if( GetArgs()["stat"] ) {
170  options.stat_file_name = GetArgs()["stat"].AsString();
171  }
172 
173  if( GetArgs()["nmer"] ) {
174  options.hkey_width = GetArgs()["nmer"].AsInteger();
175  }
176 
177  options.legacy = GetArgs()["legacy"].AsBoolean();
178  options.idmap = GetArgs()["idmap"].AsBoolean();
179 
    // -stride and -ws_hint are ignored (with a warning) for the legacy format.
180  if( GetArgs()["stride"] ) {
181  if( options.legacy ) {
182  ERR_POST( Warning << "-stride has no effect upon "
183  "legacy index creation" );
184  }
185  else options.stride = GetArgs()["stride"].AsInteger();
186  }
187 
188  if( GetArgs()["ws_hint"] ) {
189  if( options.legacy ) {
190  ERR_POST( Warning << "-ws_hint has no effect upon "
191  "legacy index creation" );
192  }
193  else {
194  unsigned long ws_hint = GetArgs()["ws_hint"].AsInteger();
195 
    // Raise a too-small hint to hkey_width + stride - 1 and warn about it.
196  if( ws_hint < options.hkey_width + options.stride - 1 ) {
197  ws_hint = options.hkey_width + options.stride - 1;
198  ERR_POST( Warning << "-ws_hint requested is too low. Setting "
199  "to the minimum value of " << ws_hint );
200  }
201 
202  options.ws_hint = ws_hint;
203  }
204  }
205 
206  unsigned int vol_num = 0;
207 
    // These three are used by the loop at the end of the function; the
    // per-volume loop further down declares its own variables with the same
    // names, which shadow these.
208  CDbIndex::TSeqNum start, orig_stop( kMax_UI4 ), stop = 0;
209  /*
210  string ofname_base =
211  GetArgs()["show_filters"] ? "" : GetArgs()["output"].AsString();
212  string odir_name( CFile( ofname_base ).GetDir() );
213  */
214  CSequenceIStream * seqstream = 0;
    // When -iformat is not given, the input format defaults to "fasta" for
    // old style indices and to "blastdb" otherwise.
215  string iformat( GetArgs()["iformat"] ? GetArgs()["iformat"].AsString()
216  : old_style ? "fasta" : "blastdb" );
217 
218  if( !old_style && iformat == "fasta" ) {
219  ERR_POST( Error << "new style index requires input format 'blastdb'" );
220  exit( 1 );
221  }
222 
223  if( iformat == "fasta" ) {
224  if( GetArgs()["db_mask"] ) {
225  ERR_POST( Error << "-db_mask requires -iformat blastdb" );
226  exit( 1 );
227  }
228 
    // FASTA input comes from the named file, or from NcbiCin when no -input
    // was given.
229  if( GetArgs()["input"] ) {
230  seqstream = new CSequenceIStreamFasta(
231  ( GetArgs()["input"].AsString() ) );
232  }
233  else seqstream = new CSequenceIStreamFasta( NcbiCin );
234  }else if( iformat == "blastdb" ) {
235  if( GetArgs()["input"] ) {
    // With -show_filters something is written to a stream (the start of the
    // statement, listing line 237, is absent) and the function returns 0
    // without building an index.
236  if( GetArgs()["show_filters"] ) {
238  GetArgs()["input"].AsString() ) << endl;
239  return 0;
240  }
241 
    // Only the old style path creates its stream here; for new style indices
    // seqstream stays null at this point and a stream is created per database
    // volume further down.
242  if( old_style ) {
243  if( GetArgs()["db_mask"] ) {
244  seqstream = new CSequenceIStreamBlastDB(
245  GetArgs()["input"].AsString(), true,
246  GetArgs()["db_mask"].AsString() );
247  }
248  else {
249  seqstream = new CSequenceIStreamBlastDB(
250  GetArgs()["input"].AsString(), false, 0 );
251  }
252  }
253  }
254  else {
255  ERR_POST( Error << "input format 'blastdb' requires -input option" );
256  exit( 1 );
257  }
258  }else {
259  ASSERT( 0 );
260  }
261 
    // NOTE(review): this looks redundant -- a "fasta" input combined with
    // -db_mask has already caused exit( 1 ) in the fasta branch above.
262  if( iformat != "blastdb" &&
263  GetArgs()["db_mask"] &&
264  GetArgs()["db_mask"].AsString() != "" ) {
265  ERR_POST( Error << "option 'db_mask' requires input format 'blastdb'" );
266  exit( 1 );
267  }
268 
    // New style index: one set of index volumes is written next to each
    // BLAST database volume; this branch always returns.
269  if( !old_style && iformat == "blastdb" ) {
270  if( GetArgs()["output"] ) {
271  ERR_POST( Warning <<
272  "option 'output' is ignored for new style indices" );
273  }
274 
275  typedef std::vector< std::string > TStrVec;
276  TStrVec db_vols;
277 
278  // Enumerate BLAST database volumes.
279  {
280  std::string ifname( GetArgs()["input"].AsString() );
281  CSeqDB db( ifname, CSeqDB::eNucleotide, 0, 0, false );
282  db.FindVolumePaths( db_vols, true );
283  }
284 
285  bool enable_mask( GetArgs()["db_mask"] );
286  string filter( enable_mask ? GetArgs()["db_mask"].AsString() : "" );
287 
288  ITERATE( TStrVec, dbvi, db_vols ) {
    // A fresh stream is allocated for every volume and deleted at the end of
    // the iteration.
    // NOTE(review): the early "return 1" below leaves this iteration without
    // reaching that delete.
289  seqstream =
290  new CSequenceIStreamBlastDB( *dbvi, enable_mask, filter );
291  CDbIndex::TSeqNum start, orig_stop( kMax_UI4 ), stop = 0;
292  Uint4 vol_num_seq( 0 );
293 
    // Number of OIDs reported by the database volume; compared with the
    // number of indexed sequences after the loop.
294  {
295  CSeqDB db( *dbvi, CSeqDB::eNucleotide, 0, 0, false );
296  vol_num_seq = db.GetNumOIDs();
297  }
298 
299  Uint4 num_seq( 0 ), num_vol( 0 );
300  vol_num = 0;
301  /*
302  std::string dbv_name(
303  CFile::ConcatPath( odir_name, CFile( *dbvi ).GetName() ) );
304  */
305  std::string dbv_name( *dbvi );
306 
    // Each pass names the next volume "<dbv_name>.NN.idx" (NN zero-padded to
    // two digits) and continues until a pass ends with start == stop. The
    // call that begins on the absent listing line 314 presumably creates the
    // volume and updates `stop` -- confirm.
307  do {
308  start = stop;
309  stop = orig_stop;
310  ostringstream os;
311  os << dbv_name << "." << setfill( '0' ) << setw( 2 )
312  << vol_num++ << ".idx";
313  cerr << "creating " << os.str() << "..." << flush;
315  *seqstream, os.str(), start, stop, options );
316  num_seq += (stop - start);
317 
318  if( start == stop ) cerr << "removed (empty)" << endl;
319  else{
320  ++num_vol;
321  cerr << "done" << endl;
322  ERR_POST( Info <<
323  "generated index volume with OIDs: " <<
324  start << "--" << stop );
325  }
326  }
327  while( start != stop );
328 
329  if( num_seq != vol_num_seq ) {
330  ERR_POST( Error <<
331  "number of sequence reported by BLAST database"
332  " volume (" << vol_num_seq << ") is not the same"
333  " as in the index (" << num_seq << ")" );
334  return 1;
335  }
336 
    // Listing lines 337-338 are absent; they presumably begin the
    // construction of `shdr`, which is then saved as "<dbv_name>.shd".
339  num_seq, num_vol );
340  shdr.Save( dbv_name + ".shd" );
341  ERR_POST( Info <<
342  "index generated for BLAST database volume " <<
343  dbv_name << " with " << num_seq << " sequences" );
344  delete seqstream;
345  }
346 
347  return 0;
348  }
349 
    // Remaining path: index volumes are named after -output.
    // NOTE(review): -output is an optional key; what AsString() does when it
    // was not supplied is not visible here.
    // NOTE(review): seqstream is never deleted on this path.
350  Uint4 num_seq( 0 ), num_vol( 0 );
351  string ofname_base =
352  GetArgs()["show_filters"] ? "" : GetArgs()["output"].AsString();
353 
    // Same volume loop as in the per-volume branch above, using the variables
    // declared near the top of the function; the call itself starts on the
    // absent listing line 361.
354  do {
355  start = stop;
356  stop = orig_stop;
357  ostringstream os;
358  os << ofname_base << "." << setfill( '0' ) << setw( 2 )
359  << vol_num++ << ".idx";
360  cerr << "creating " << os.str() << "..." << flush;
362  *seqstream,
363  os.str(), start, stop, options );
364  num_seq += (stop - start);
365 
366  if( start == stop ) cerr << "removed (empty)" << endl;
367  else{ ++num_vol; cerr << "done" << endl; }
368  }while( start != stop );
369 
    // NOTE(review): this condition appears to be always false here. With
    // !old_style, "fasta" input has exited and "blastdb" input has returned
    // above, and "iformat" is constrained to those two values -- confirm.
370  if( !old_style ) {
373  num_seq, num_vol );
374  shdr.Save( ofname_base + ".shd" );
375  }
376 
377  return 0;
378 }
Int32 verbosity
Definition: bzip2.c:183
CArgAllow_Integers –.
Definition: ncbiargs.hpp:1751
CArgAllow_Strings –.
Definition: ncbiargs.hpp:1641
CArgDescriptions –.
Definition: ncbiargs.hpp:541
static SOptions DefaultSOptions()
Creates an SOptions instance initialized with default values.
Definition: dbindex.cpp:383
static void MakeIndex(const std::string &fname, const std::string &oname, TSeqNum start, TSeqNum start_chunk, TSeqNum &stop, TSeqNum &stop_chunk, const SOptions &options)
Create an index object.
CSequenceIStream::TStreamPos TSeqNum
Type used to enumerate sequences in the index.
Definition: dbindex.hpp:484
static const Uint4 INDEX_FORMAT_VERSION_1
Old style index with superheader.
Definition: dbindex.hpp:129
Superheader derived classes parametrized by index format version.
Definition: dbindex.hpp:213
static const char *const USAGE_LINE
String containing program usage information.
Definition: mkindex_app.hpp:46
virtual int Run()
Application main procedure.
virtual void Init()
Application initialization.
Definition: mkindex_app.cpp:65
CSeqDB.
Definition: seqdb.hpp:161
static void FindVolumePaths(const string &dbname, ESeqType seqtype, vector< string > &paths, vector< string > *alias_paths=NULL, bool recursive=true, bool expand_links=true)
Find volume paths.
Definition: seqdb.cpp:1040
int GetNumOIDs() const
Returns the size of the (possibly sparse) OID range.
Definition: seqdb.cpp:680
@ eNucleotide
Definition: seqdb.hpp:175
Sequence stream that reads BLAST nucleotide databases.
static string ShowSupportedFilters(const string &dbname)
Report on supported subject filter algorithms.
Sequence stream for reading FASTA formatted files.
Class used to abstract reading nucleotide sequences from various sources.
const unsigned long REPORT_QUIET
No progress reporting.
Definition: dbindex.hpp:61
const unsigned long REPORT_VERBOSE
Verbose reporting.
Definition: dbindex.hpp:63
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
@ eRequires
One argument requires another.
Definition: ncbiargs.hpp:956
@ eExcludes
One argument excludes another.
Definition: ncbiargs.hpp:957
@ eBoolean
{'true', 't', 'false', 'f'}, case-insensitive
Definition: ncbiargs.hpp:590
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
string
Definition: cgiapp.hpp:687
EDiagSev SetDiagPostLevel(EDiagSev post_sev=eDiag_Error)
Set the threshold severity for posting the messages.
Definition: ncbidiag.cpp:6129
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define kMax_Int
Definition: ncbi_limits.h:184
#define kMax_UI4
Definition: ncbi_limits.h:219
#define NcbiCout
Definition: ncbistre.hpp:543
#define NcbiCin
Definition: ncbistre.hpp:542
exit(2)
USING_NCBI_SCOPE
Definition: mkindex_app.cpp:57
USING_SCOPE(blastdbindex)
#define ASSERT
macro for assert.
Definition: ncbi_std.h:107
#define GetArgs
Avoid preprocessor name clash with the NCBI C Toolkit.
Definition: ncbiapp_api.hpp:53
Simple record type used to specify index creation parameters.
Definition: dbindex.hpp:468
bool legacy
Indicator of the legacy index format.
Definition: dbindex.hpp:470
unsigned long report_level
Verbose index creation.
Definition: dbindex.hpp:477
unsigned long max_index_size
Maximum index size in megabytes.
Definition: dbindex.hpp:478
std::string stat_file_name
File to write index statistics into.
Definition: dbindex.hpp:480
unsigned long ws_hint
Most likely word size to use for searches.
Definition: dbindex.hpp:472
bool idmap
Indicator of the index map creation.
Definition: dbindex.hpp:469
unsigned long hkey_width
Width of the hash key in bits.
Definition: dbindex.hpp:473
unsigned long stride
Stride to use for stored database locations.
Definition: dbindex.hpp:471
Modified on Sat Dec 02 09:20:51 2023 by modify_doxy.py rev. 669887