NCBI C++ ToolKit
blastn_vdb.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blastn_vdb.cpp 101102 2023-10-30 13:07:22Z fongah2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Amelia Fong
27  *
28  */
29 
30 /// @file blastn_vdb.cpp
31 /// BLASTN command line application that searches the vdb databases.
32 ///
33 /// - Based on the standard BLASTN command line application located in
34 /// app/blast/blastn_app.cpp.
35 /// - Modifications include using the VDB Blast utils class to
36 /// initialize VDB-specific BlastSeqSrc and BlastSeqInfoSrc objects.
37 /// - Does not support remote searches
38 
39 #include <ncbi_pch.hpp>
40 #include <corelib/ncbiapp.hpp>
46 
51 
52 #include "blast_vdb_app_util.hpp"
53 #include "../blast/blast_app_util.hpp"
54 
55 #include "CBlastVdbVersion.hpp" // CBlastVdbVersion
56 
58 USING_SCOPE(blast);
60 
61 // ==========================================================================//
62 
63 /// Exit code for successful completion of the Blast search (the initial value of `status` in Run()).
64 #define BLAST_SRA_EXIT_SUCCESS 0
65 /// Exit code for a Blast input error (in query/options); Run() sets it in its CInputException and CArgException handlers.
66 #define BLAST_SRA_INPUT_ERROR 1
67 /// Exit code for a Blast engine-related error; Run() sets it in its CBlastException handler, but that handler then calls abort(), so Run() does not return normally with this value.
68 #define BLAST_SRA_ENGINE_ERROR 2
69 /// Exit code for all other errors and unknown exceptions (the CException, exception and catch-all handlers in Run()).
70 #define BLAST_SRA_UNKNOWN_ERROR 255
71 
72 // ==========================================================================//
73 
74 /// CVDBBlastnApp
75 ///
76 /// Blastn application that searches the SRA databases.
77 
79 {
80 public:
81  /// Constructor, sets up the version info.
82  CVDBBlastnApp();
83 
85 
86 private:
87  /// Initialize the application.
88  virtual void Init(void);
89  /// Run the application.
90  virtual int Run(void);
91  /// Cleanup on application exit.
92  virtual void Exit(void);
93 
94  void x_SetupLocalVDBSearch();
95  void x_FillVDBInfo(CBlastFormatUtil::SDbInfo & vecDbInfo);
96 
98  void x_GetFullVDBPaths(void);
99 
100 private:
101  string m_dbAllNames;
103 
104  /// This application's command line arguments.
108 };
109 
110 // ==========================================================================//
111 // Various helper functions
112 
115 }
116 
117 
119 {
120  // Read the SRA-related input
121  CArgs args = GetArgs();
122  unsigned int num_threads = (unsigned int) (args[kArgNumThreads].AsInteger());
124  CLocalVDBBlast::PreprocessDBs(m_localVDBStruct, m_dbAllNames, num_threads, search_mode);
125 }
126 
/// Fill one database-info record describing the set of VDB databases being searched.
/// Run() calls this on the single element of the SDbInfo vector that it then
/// passes to the CBlastFormat constructor.
/// @param dbInfo  Record to populate; only the five fields assigned below are written.
127 void CVDBBlastnApp::x_FillVDBInfo(CBlastFormatUtil::SDbInfo & dbInfo )
128 {
129  dbInfo.is_protein = false; // this is the BLASTN (nucleotide) application
130  dbInfo.name = m_dbAllNames; // space-joined VDB paths produced by x_GetFullVDBPaths()
131  dbInfo.definition = dbInfo.name; // definition is simply a copy of the name
132  dbInfo.total_length = m_localVDBStruct.total_length; // presumably filled in by CLocalVDBBlast::PreprocessDBs() - TODO confirm
133  dbInfo.number_seqs = m_localVDBStruct.total_num_seqs; // presumably filled in by CLocalVDBBlast::PreprocessDBs() - TODO confirm
134 }
135 
136 // ==========================================================================//
137 // Initialization
138 
140 {
142  version->SetVersionInfo(new CBlastVdbVersion);
144  m_StopWatch.Start();
145  if (m_UsageReport.IsEnabled()) {
147  }
148 
149 }
150 
152 {
154 
155  // read the command line
156 
159  SetEnvironment("CSRA_CLIP_BY_QUALITY", "true");
160  SetEnvironment("CSRA_PATH_IN_ID", "false");
161  //SetDiagTrace(eDT_Enable);
162 
163 }
164 
165 // ==========================================================================//
166 // Run the search
167 
169 {
171  vector<string> paths;
172  CVDBAliasUtil::FindVDBPaths(dbs, false, paths, NULL, NULL, true, true, false);
173  m_dbAllNames = NStr::Join(paths, " ");
174 }
175 
177 {
178  int status = BLAST_SRA_EXIT_SUCCESS;
179  try
180  {
181  // Allow the fasta reader to complain on invalid sequence input
183  SetDiagPostPrefix("blastn_vdb");
184 
185  // Get the arguments
186  const CArgs& args = GetArgs();
187 
188  // Get and validate the Blast options
189  CRef<CBlastOptionsHandle> optsHandle;
191  optsHandle.Reset(&*m_CmdLineArgs->SetOptionsForSavedStrategy(args));
192  }
193  else {
194  if(!(args.Exist(kArgDb) && args[kArgDb]))
195  NCBI_THROW(CInputException, eInvalidInput,
196  "Must specify at least one SRA/WGS database");
197 
198  optsHandle.Reset(&*m_CmdLineArgs->SetOptions(args));
199  }
200  const CBlastOptions& opt = optsHandle->GetOptions();
201 
202  // Get the query sequence(s)
203  CRef<CQueryOptionsArgs> query_opts =
205  SDataLoaderConfig dlconfig(query_opts->QueryIsProtein());
207  CBlastInputSourceConfig iconfig(dlconfig, query_opts->GetStrand(),
208  query_opts->UseLowercaseMasks(),
209  query_opts->GetParseDeflines(),
210  query_opts->GetRange(),
212  iconfig.SetQueryLocalIdMode();
214  CBlastInput input(&fasta);
215 
216  // Resolve all vdb paths first
218 
219  // Initialize the object manager and the scope object
221 
223  // Setup for local vdb search
225  // Create the DBInfo entries for dbs being searched
226  vector< CBlastFormatUtil::SDbInfo > vecDbInfo(1);
227  x_FillVDBInfo(vecDbInfo[0]);
228 
229  // Get the formatting options and initialize the formatter
231  CBlastFormat formatter(opt, vecDbInfo,
232  fmt_args->GetFormattedOutputChoice(),
233  query_opts->GetParseDeflines(),
235  fmt_args->GetNumDescriptions(),
236  fmt_args->GetNumAlignments(),
237  *scope,
238  fmt_args->ShowGis(),
239  fmt_args->DisplayHtmlOutput(),
241  fmt_args->GetCustomOutputFormatSpec(),
242  true,
244 
245  // Begin Blast output
246  formatter.SetQueryRange(query_opts->GetRange());
247  formatter.SetLineLength(fmt_args->GetLineLength());
248  if(UseXInclude(*fmt_args, args[kArgOutput].AsString())){
249  formatter.SetBaseFile(args[kArgOutput].AsString());
250  }
251  formatter.PrintProlog();
252 
254  int batch_size = m_CmdLineArgs->GetQueryBatchSize();
255  if (batch_size) {
256  input.SetBatchSize(batch_size);
257  } else {
258  Int8 total_len = formatter.GetDbTotalLength();
259  if (total_len > 0) {
260  /* the optimal hits per batch scales with total db size */
261  Int4 target_hits = (total_len/3000) < 2000000 ? 2000000: total_len/3000;
262  mixer.SetTargetHits(target_hits);
263  }
264  input.SetBatchSize(mixer.GetBatchSize());
265  }
266  bool bIncludeFilteredReads = args[kArgIncludeFilteredReads].AsBoolean();
267 
268  // Process the input
269  for (; !input.End(); formatter.ResetScopeHistory()) {
270  CRef<CBlastQueryVector> query_batch(input.GetNextSeqBatch(*scope));
271  CRef<IQueryFactory> queries(new CObjMgr_QueryFactory(*query_batch));
272  SaveSearchStrategy(args, m_CmdLineArgs, queries, optsHandle);
273 
274  // Run local Blast
275  CRef<CSearchResultSet> results;
276  CLocalVDBBlast local_vdb_blast(query_batch, optsHandle, m_localVDBStruct, bIncludeFilteredReads);
277  results = local_vdb_blast.Run();
278  if (!batch_size)
279  input.SetBatchSize(mixer.GetBatchSize(local_vdb_blast.GetNumExtensions()));
280 
281  if (fmt_args->ArchiveFormatRequested(args)){
282  formatter.WriteArchive(*queries, *optsHandle, *results);
283  }
284  else {
285  //CScope::TBioseqHandles handles;
286  //SortAndFetchSeqData(*results, scope, handles);
287  // Output the results
288  ITERATE(CSearchResultSet, result, *results) {
289  formatter.PrintOneResultSet(**result, query_batch);
290  }
291  }
292  }
293  // End Blast output
294  formatter.PrintEpilog(opt);
297 
298  // Optional debug output
300  optsHandle->GetOptions().DebugDumpText(NcbiCerr, "BLAST options", 1);
301  }
303  }
304  catch (const CInputException& e) {
305  cerr << "BLAST query/options error: " << e.GetMsg() << endl;
306  status = BLAST_SRA_INPUT_ERROR;
307  }
308  catch (const CArgException& e) {
309  cerr << "Command line argument error: " << e.GetMsg() << endl;
310  status = BLAST_SRA_INPUT_ERROR;
311  }
312  catch (const CBlastException& e) {
313  cerr << "BLAST engine error: " << e.GetMsg() << endl;
314  status = BLAST_SRA_ENGINE_ERROR;
315  // Temporary fix to avoid vdb core dump during cleanup SB-1170
316  abort();
317  }
318  catch (const CException& e) {
319  cerr << "Error: " << e.GetMsg() << endl;
320  status = BLAST_SRA_UNKNOWN_ERROR;
321  }
322  catch (const exception& e) {
323  cerr << "Error: " << e.what() << endl;
324  status = BLAST_SRA_UNKNOWN_ERROR;
325  }
326  catch (...) {
327  cerr << "Unknown exception occurred" << endl;
328  status = BLAST_SRA_UNKNOWN_ERROR;
329  }
330 
333  m_UsageReport.AddParam(CBlastUsageReport::eProgram, (string) "blastn_vdb");
334  return status;
335 }
336 
337 // ==========================================================================//
338 // Cleanup
339 
341 {
342  SetDiagStream(0);
343 }
344 
345 // ==========================================================================//
346 // Main
347 
348 #ifndef SKIP_DOXYGEN_PROCESSING
/// Program entry point.
/// Constructs a temporary CVDBBlastnApp and returns whatever its AppMain() returns
/// as the process exit status.
/// @param argc  Number of command-line arguments.
/// @param argv  Command-line argument strings.
/// @return The value returned by AppMain().
349 int main(int argc, const char* argv[])
350 {
351  // Execute main application function
352  return CVDBBlastnApp().AppMain(argc, argv, 0, eDS_Default, ""); // positional args: envp = 0, diag = eDS_Default, conf = ""
353 }
354 #endif /* SKIP_DOXYGEN_PROCESSING */
355 
356 // ==========================================================================//
Produce formatted blast output for command line applications.
void SaveSearchStrategy(const CArgs &args, blast::CBlastAppArgs *cmdline_args, CRef< blast::IQueryFactory > queries, CRef< blast::CBlastOptionsHandle > opts_hndl, CRef< objects::CPssmWithParameters > pssm, unsigned int num_iters)
Save the search strategy corresponding to the current command line search.
bool RecoverSearchStrategy(const CArgs &args, blast::CBlastAppArgs *cmdline_args)
Recover search strategy from input file.
string GetCmdlineArgs(const CNcbiArguments &a)
bool UseXInclude(const CFormattingArgs &f, const string &s)
void LogQueryInfo(CBlastUsageReport &report, const CBlastInput &q_info)
Interface for reading SRA sequences into blast input.
CRef< CScope > GetVDBScope(string dbAllNames)
Utility functions for BLAST VDB command line applications.
USING_SCOPE(blast)
#define BLAST_SRA_UNKNOWN_ERROR
Exit code for all other errors and unknown exceptions.
Definition: blastn_vdb.cpp:70
#define BLAST_SRA_EXIT_SUCCESS
Exit code for successful completion of the Blast search.
Definition: blastn_vdb.cpp:64
#define BLAST_SRA_ENGINE_ERROR
Exit code for a Blast engine-related error.
Definition: blastn_vdb.cpp:68
#define BLAST_SRA_INPUT_ERROR
Exit code for a Blast input error (in query/options).
Definition: blastn_vdb.cpp:66
int main(int argc, const char *argv[])
Definition: blastn_vdb.cpp:349
USING_NCBI_SCOPE
Definition: blastn_vdb.cpp:57
Main argument class for BLASTN_VDB application.
const string kArgSRASearchMode
const string kArgIncludeFilteredReads
CArgException –.
Definition: ncbiargs.hpp:120
CArgs –.
Definition: ncbiargs.hpp:379
Class to mix batch size for BLAST runs.
void SetTargetHits(Int4 target)
Int4 GetBatchSize(Int4 hits=-1)
CRef< CBlastOptionsHandle > SetOptionsForSavedStrategy(const CArgs &args)
Combine the command line arguments into a CBlastOptions object recovered from saved search strategy.
virtual CNcbiIstream & GetInputStream()
Get the input stream.
size_t GetNumThreads() const
Get the number of threads to spawn.
CRef< CBlastOptionsHandle > SetOptions(const CArgs &args)
Extract the command line arguments into a CBlastOptionsHandle object.
CRef< CBlastDatabaseArgs > GetBlastDatabaseArgs() const
Get the BLAST database arguments.
CArgDescriptions * SetCommandLine()
Set the command line arguments.
bool ExecuteRemotely() const
Determine whether the search should be executed remotely or not.
CRef< CQueryOptionsArgs > GetQueryOptionsArgs() const
Get the options for the query sequence(s)
CRef< CFormattingArgs > GetFormattingArgs() const
Get the formatting options.
bool ProduceDebugOutput() const
Return whether debug (verbose) output should be produced on remote searches (only available when comp...
virtual CNcbiOstream & GetOutputStream()
Get the output stream.
string GetDatabaseName() const
Get the BLAST database name.
Definition: blast_args.hpp:931
Defines BLAST error codes (user errors included)
Class representing a text file containing sequences in fasta format.
This class formats the BLAST results for command line applications.
void LogBlastSearchInfo(blast::CBlastUsageReport &report)
void PrintOneResultSet(const blast::CSearchResults &results, CConstRef< blast::CBlastQueryVector > queries, unsigned int itr_num=numeric_limits< unsigned int >::max(), blast::CPsiBlastIterationState::TSeqIds prev_seqids=blast::CPsiBlastIterationState::TSeqIds(), bool is_deltablast_domain_result=false)
Print all alignment information for a single query sequence along with any errors or warnings (errors...
void PrintEpilog(const blast::CBlastOptions &options)
Print the footer of the blast report.
void SetBaseFile(string base)
For use by XML2 only.
void ResetScopeHistory()
Resets the scope history for some output formats.
void SetLineLength(size_t len)
Set Alignment Length.
void WriteArchive(blast::IQueryFactory &queries, blast::CBlastOptionsHandle &options_handle, const blast::CSearchResultSet &results, unsigned int num_iters=0, const list< CRef< objects::CBlast4_error > > &msg=list< CRef< objects::CBlast4_error > >())
Writes out the query and results as an "archive" format.
Int8 GetDbTotalLength()
Get total length of the database.
void PrintProlog()
Print the header of the blast report.
void SetQueryRange(const TSeqRange &query_range)
Set query range.
Class that centralizes the configuration data for sequences to be converted.
Definition: blast_input.hpp:48
void SetQueryLocalIdMode()
Append query-specific prefix codes to all generated local ids.
Generalized converter from an abstract source of biological sequence data to collections of blast inp...
Encapsulates ALL the BLAST algorithm's options.
void AddParam(EUsageParams p, int val)
Handles command line arguments for blastn binary.
virtual int GetQueryBatchSize() const
@inheritDoc
void DebugDumpText(ostream &out, const string &bundle, unsigned int depth) const
Definition: ddumpable.cpp:56
virtual bool ArchiveFormatRequested(const CArgs &args) const
string GetCustomOutputFormatSpec() const
Retrieve the string that specifies the custom output format for tabular and comma-separated value output.
EOutputFormat GetFormattedOutputChoice() const
Get the choice of formatted output.
TSeqPos GetNumAlignments() const
Number of alignments to show in traditional BLAST output.
bool ShowGis() const
Display the NCBI GIs in formatted output?
TSeqPos GetNumDescriptions() const
Number of one-line descriptions to show in traditional BLAST output.
size_t GetLineLength() const
bool DisplayHtmlOutput() const
Display HTML output?
Defines user input exceptions.
static string PreprocessDBs(CLocalVDBBlast::SLocalVDBStruct &local_vdb, const string db_names, unsigned int num_threads=kDisableThreadedSearch, ESRASearchMode seach_mode=eAligned)
CRef< CSearchResultSet > Run()
NCBI C++ Object Manager dependent implementation of IQueryFactory.
objects::ENa_strand GetStrand() const
Get strand to search in query sequence(s)
Definition: blast_args.hpp:800
bool GetParseDeflines() const
Should the defline be parsed?
Definition: blast_args.hpp:804
bool QueryIsProtein() const
Is the query sequence protein?
Definition: blast_args.hpp:807
TSeqRange GetRange() const
Get query sequence range restriction.
Definition: blast_args.hpp:796
bool UseLowercaseMasks() const
Use lowercase masking in FASTA input?
Definition: blast_args.hpp:802
Search Results for All Queries.
CStopWatch –.
Definition: ncbitime.hpp:1938
static void FindVDBPaths(const string &dbname, bool isProtein, vector< string > &paths, vector< string > *db_alias_list=NULL, vector< string > *vdb_alias_list=NULL, bool recursive=true, bool expand_links=true, bool verify_dbs=true)
Get the list of vdb names.
Definition: vdbalias.cpp:620
static Uint4 SetupVDBManager()
*Note* Call this in main thread first, if you are going to instantiate this object or use any of the ...
static void ReleaseVDBManager()
Call this to release the VDB manager if SetupVDBManager has been explicitly called in the main thread.
CVDBBlastnApp.
Definition: blastn_vdb.cpp:79
CStopWatch m_StopWatch
Definition: blastn_vdb.cpp:107
CBlastUsageReport m_UsageReport
Definition: blastn_vdb.cpp:106
CLocalVDBBlast::SLocalVDBStruct m_localVDBStruct
Definition: blastn_vdb.cpp:102
void x_SetupLocalVDBSearch()
Definition: blastn_vdb.cpp:118
CVDBBlastnApp()
Constructor, sets up the version info.
Definition: blastn_vdb.cpp:139
void x_GetFullVDBPaths(void)
Definition: blastn_vdb.cpp:168
CRef< CBlastnVdbAppArgs > m_CmdLineArgs
This application's command line arguments.
Definition: blastn_vdb.cpp:105
CRef< CScope > x_GetScope(void)
virtual int Run(void)
Run the application.
Definition: blastn_vdb.cpp:176
virtual void Exit(void)
Cleanup on application exit.
Definition: blastn_vdb.cpp:340
virtual void Init(void)
Initialize the application.
Definition: blastn_vdb.cpp:151
string m_dbAllNames
Definition: blastn_vdb.cpp:101
void x_FillVDBInfo(CBlastFormatUtil::SDbInfo &vecDbInfo)
Definition: blastn_vdb.cpp:127
const string kArgOutput
Output file name.
const string kArgDb
BLAST database name.
const string kArgNumThreads
Argument to determine the number of threads to use when running BLAST.
void Print(const CCompactSAMApplication::AlignInfo &ai)
size_t SplitQuery_GetChunkSize(EProgram program)
Returns the optimal chunk size for a given task.
Definition: local_blast.cpp:54
EProgram GetProgram() const
Accessors/Mutators for individual options.
const CBlastOptions & GetOptions() const
Return the object which this object is a handle for.
void SetFullVersion(CRef< CVersionAPI > version)
Set version data for the program.
Definition: ncbiapp.cpp:1154
void HideStdArgs(THideStdArgs hide_mask)
Set the hide mask for the Hide Std Flags.
Definition: ncbiapp.cpp:1292
CNcbiEnvironment & SetEnvironment(void)
Get a non-const copy of the application's cached environment.
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:285
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:799
CVersionInfo GetVersion(void) const
Get the program version information.
Definition: ncbiapp.cpp:1164
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1175
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
@ fHideXmlHelp
Hide XML help description.
@ fHideLogfile
Hide log file description.
@ fHideFullVersion
Hide full version description.
@ fHideDryRun
Hide dryrun description.
@ fHideConffile
Hide configuration file description.
bool Exist(const string &name) const
Check existence of argument description.
Definition: ncbiargs.cpp:1813
#define NULL
Definition: ncbistd.hpp:225
void SetDiagPostPrefix(const char *prefix)
Specify a string to prefix all subsequent error postings with.
Definition: ncbidiag.cpp:6097
EDiagSev SetDiagPostLevel(EDiagSev post_sev=eDiag_Error)
Set the threshold severity for posting the messages.
Definition: ncbidiag.cpp:6129
void SetDiagStream(CNcbiOstream *os, bool quick_flush=true, FDiagCleanup cleanup=0, void *cleanup_data=0, const string &stream_name="")
Set diagnostic stream.
Definition: ncbidiag.cpp:8083
@ eDS_Default
Try standard log file (app.name + ".log") in /log/, use stderr on failure.
Definition: ncbidiag.hpp:1790
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
Definition: ncbiexpt.cpp:342
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
bool IsEnabled(void)
Indicates whether application usage statistics collection is enabled for a current reporter instance.
#define NcbiCerr
Definition: ncbistre.hpp:544
static string Join(const TContainer &arr, const CTempString &delim)
Join strings using the specified delimiter.
Definition: ncbistr.hpp:2697
double Elapsed(void) const
Return time elapsed since first Start() or last Restart() call (in seconds).
Definition: ncbitime.hpp:2776
void Start(void)
Start the timer.
Definition: ncbitime.hpp:2765
#define CVersion
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
static int input()
Main class to perform a BLAST search on the local machine.
static int version
Definition: mdb_load.c:29
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
void abort()
NOTE: This file contains work in progress and the APIs are likely to change, please do not rely on th...
Declares the CRemoteBlast class.
Configuration structure for the CBlastScopeSource.
void OptimizeForWholeLargeSequenceRetrieval(bool value=true)
Configures the BLAST database data loader to optimize the retrieval of *entire* large sequences.
else result
Definition: token2.c:20
Defines database alias file access classes.
Declares the CLocalVDBBlast class.
Modified on Sat Dec 09 04:47:18 2023 by modify_doxy.py rev. 669887