NCBI C++ ToolKit
mkkblastindex.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: mkkblastindex.cpp 92025 2020-12-17 15:27:50Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Tom Madden
27  *
28  * File Description:
29  * Produce database for kmer (minhash) search.
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbiapp.hpp>
37 
38 #ifdef _OPENMP
39 #include <omp.h>
40 #endif
41 
44 
45 /////////////////////////////////////////////////////////////////////////////
46 // CBlastKmerBuildIndexApplication::
47 
48 
50 {
51 private:
52  virtual void Init(void);
53  virtual int Run(void);
54  virtual void Exit(void);
55 };
56 
57 /////////////////////////////////////////////////////////////////////////////
58 // Init: describe the command-line arguments accepted by the application
59 
60 
62 {
64 
65  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
66 
67  arg_desc->AddKey("db", "database_name",
68  "BLAST database to read and create",
70 
71  arg_desc->AddDefaultKey("samples", "buhler_samples",
72  "Number of samples to check",
74  "30");
75 
76  arg_desc->AddDefaultKey("kmer", "kmerNum",
77  "kmer size", CArgDescriptions::eInteger, "5");
78 
79  arg_desc->AddDefaultKey("width", "width_data",
80  "Width of data arrays. 4 for an int, 2 for short(recommmended), 1 for byte",
82 
83  arg_desc->AddDefaultKey("threads", "number_threads",
84  "Number of threads to use.",
86 
87  arg_desc->AddDefaultKey("hashes", "number_hashes",
88  "Number of Hash functions to use.",
90 
91  arg_desc->AddDefaultKey("alphabet", "alphabet_choice",
92  "0 (zero) for 15 letter alphabet, 1 for 10 letter alphabet",
94 
95  arg_desc->AddDefaultKey("kversion", "index_version",
96  "1 for older LSH, 2 for Buhler LSH, 3 for one hash method",
98 
99  arg_desc->AddOptionalKey("output", "Outputfile",
100  "Index files", CArgDescriptions::eString);
101 
102  arg_desc->AddDefaultKey("logfile",
103  "LogInformation",
104  "File for logging errors",
106  "mkkblastindex.log",
108 
109  arg_desc->SetUsageContext("mkkblastindex", "Index for protein kmer search");
110 
111  SetupArgDescriptions(arg_desc.release());
112 }
113 
114 /////////////////////////////////////////////////////////////////////////////
115 // Build a KMER index.
116 //
117 
119 {
120 
122  SetDiagPostPrefix("mkkblastindex");
123 
124  int retval = 0;
125  // blast database of sequences
126  CRef<CSeqDB> seqdb(new CSeqDB(GetArgs()["db"].AsString(), CSeqDB::eProtein));
127 
128  int samples = GetArgs()["samples"].AsInteger();
129 
130  int kmerNum = GetArgs()["kmer"].AsInteger();
131 
132  int dataWidth = GetArgs()["width"].AsInteger();
133 
134  int numThreads = GetArgs()["threads"].AsInteger();
135 
136  int numHashes = GetArgs()["hashes"].AsInteger();
137 
138  int alphabet = GetArgs()["alphabet"].AsInteger();
139 
140  int version = GetArgs()["kversion"].AsInteger();
141 
142  CNcbiOstream * logFile = & (GetArgs()["logfile"].HasValue()
143  ? GetArgs()["logfile"].AsOutputFile()
144  : cout);
145 
146  *logFile << CTime(CTime::eCurrent).AsString() << ": Producing indices for " << GetArgs()["db"].AsString() << " using " << numThreads << " threads" << endl;
147 
148  try {
149  CBlastKmerBuildIndex build_index(seqdb, kmerNum, numHashes, samples, dataWidth, alphabet, version);
150  build_index.Build(numThreads);
151  }
152  catch (const CSeqDBException& e) {
153  *logFile << "CSeqDB Database error: " << e.GetMsg() << endl;
154  retval = 1;
155  }
156  catch (const blast::CMinHashException& e) {
157  *logFile << "CMinHash error: " << e.GetMsg() << endl;
158  retval = 1;
159  }
160  catch (const CFileException& e) {
161  *logFile << "File error: " << e.GetMsg() << endl;
162  retval = 1;
163  }
164  catch (const std::ios::failure&) {
165  *logFile << "mkkblastindex failed to write output" << endl;
166  retval = 1;
167  }
168  catch (const std::bad_alloc&) {
169  *logFile << "mkkblastindex ran out of memory" << endl;
170  retval = 1;
171  }
172  catch (...) {
173  *logFile << "Unknown error" << endl;
174  retval = 1;
175  }
176  return retval;
177 }
178 
179 
180 /////////////////////////////////////////////////////////////////////////////
181 // Cleanup
182 
183 
185 {
186  SetDiagStream(0);
187 }
188 
189 
190 /////////////////////////////////////////////////////////////////////////////
191 // MAIN
192 
193 
194 #ifndef SKIP_DOXYGEN_PROCESSING
195 int main(int argc, const char* argv[])
196 {
197  // Construct a temporary application object and hand the command line to AppMain(); the value AppMain() returns is returned from main()
198  return CBlastKmerBuildIndexApplication().AppMain(argc, argv);
199 }
200 #endif /* SKIP_DOXYGEN_PROCESSING */
CArgDescriptions –.
Definition: ncbiargs.hpp:541
virtual void Exit(void)
Cleanup on application exit.
virtual void Init(void)
Initialize the application.
virtual int Run(void)
Run the application.
void Build(int numThreads=1)
Build the index.
CFileException –.
Definition: ncbifile.hpp:136
CSeqDBException.
Definition: seqdbcommon.hpp:73
CSeqDB.
Definition: seqdb.hpp:161
@ eProtein
Definition: seqdb.hpp:174
CTime –.
Definition: ncbitime.hpp:296
static int failure
Definition: t0019.c:11
void HideStdArgs(THideStdArgs hide_mask)
Set the hide mask for the Hide Std Flags.
Definition: ncbiapp.cpp:1325
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:832
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1208
@ fHideXmlHelp
Hide XML help description.
@ fHideFullVersion
Hide full version description.
@ fHideDryRun
Hide dryrun description.
@ fHideConffile
Hide configuration file description.
@ fHideVersion
Hide version description.
@ fAppend
Append to end-of-file; for eOutputFile or eIOFile.
Definition: ncbiargs.hpp:622
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eOutputFile
Name of file (must be writable)
Definition: ncbiargs.hpp:596
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
void SetDiagPostPrefix(const char *prefix)
Specify a string to prefix all subsequent error postings with.
Definition: ncbidiag.cpp:6100
EDiagSev SetDiagPostLevel(EDiagSev post_sev=eDiag_Error)
Set the threshold severity for posting the messages.
Definition: ncbidiag.cpp:6132
void SetDiagStream(CNcbiOstream *os, bool quick_flush=true, FDiagCleanup cleanup=0, void *cleanup_data=0, const string &stream_name="")
Set diagnostic stream.
Definition: ncbidiag.cpp:8086
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
string AsString(const CTimeFormat &format=kEmptyStr, TSeconds out_tz=eCurrentTimeZone) const
Transform time to string.
Definition: ncbitime.cpp:1512
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
USING_SCOPE(objects)
int main(int argc, const char *argv[])
USING_NCBI_SCOPE
const string version
version string
Definition: variables.hpp:66
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Modified on Fri Sep 20 14:57:54 2024 by modify_doxy.py rev. 669887