NCBI C++ ToolKit
seedtop_app.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seedtop_app.cpp 92126 2020-12-22 16:37:26Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Ning Ma
27  *
28  */
29 
30 /** @file seedtop_app.cpp
31  * SEEDTOP command line application
32  */
33 
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbiapp.hpp>
41 #include <objmgr/util/sequence.hpp>
42 #include "blast_app_util.hpp"
43 
44 #ifndef SKIP_DOXYGEN_PROCESSING
46 USING_SCOPE(blast);
48 #endif
49 
/** A single named pattern read from a seedtop pattern file. */
struct SSeedTopPattern
{
    string name;     ///< Text that follows "ID " on the identification line
    string pattern;  ///< Concatenated, cleaned-up contents of the "PA " lines
};

/** Read one seedtop pattern from a pattern input stream.
 *
 * The pattern input file format is unique to seedtop. Each pattern consists
 * of one line starting with "ID " that identifies the pattern, followed by
 * one or more lines starting with "PA " that hold the actual pattern in
 * ProSite syntax. A long pattern may be split over several PA lines; their
 * contents are concatenated. A file may contain several patterns as long as
 * they are separated by a line holding a single forward slash (/). For
 * testing purposes, patterns can be run against refseq protein records such
 * as YP_471346.1, YP_575330.1, or YP_564843.1.
 *
 * Each PA line is cleaned up before being appended:
 *  - trailing blanks are removed;
 *  - a trailing '>' is replaced by '-';
 *  - otherwise a trailing '.' is dropped.
 *
 * Reading stops at the first line that does not start with "PA " (that line
 * is consumed) or at end of input.
 *
 * @param in  Stream positioned at the "ID " line of the next pattern.
 * @return    The pattern that was read. Both fields are empty when the
 *            stream is exhausted or the first line read does not start
 *            with "ID "; `pattern` is empty when no usable PA line follows
 *            the ID line.
 */
static struct SSeedTopPattern s_ReadPattern(CNcbiIstream& in)
{
    const string kIdTag("ID ");
    const string kPaTag("PA ");

    struct SSeedTopPattern retv;
    string line;

    // Whole-line reads: no fixed buffer to overflow, and a final line
    // without a terminating newline is returned intact.
    if ( !std::getline(in, line) ) {
        return retv;
    }
    // Tolerate files with CRLF line endings.
    if (!line.empty()  &&  line.back() == '\r') {
        line.pop_back();
    }
    if (line.compare(0, kIdTag.size(), kIdTag) != 0) {
        return retv;
    }
    retv.name = line.substr(kIdTag.size());

    while (std::getline(in, line)) {
        if (!line.empty()  &&  line.back() == '\r') {
            line.pop_back();
        }
        // Anything that is not a PA line (e.g. the "/" separator) ends
        // the current pattern.
        if (line.compare(0, kPaTag.size(), kPaTag) != 0) {
            break;
        }

        string piece = line.substr(kPaTag.size());
        const string::size_type last = piece.find_last_not_of(' ');
        if (last == string::npos) {
            // PA line holding nothing but blanks: contributes nothing.
            continue;
        }
        piece.erase(last + 1);

        if (piece.back() == '>') {
            piece.back() = '-';
        } else if (piece.back() == '.') {
            piece.pop_back();
        }
        retv.pattern += piece;
    }
    return retv;
}
88 
90 {
91 public:
92  /** @inheritDoc */
95  version->SetVersionInfo(new CBlastVersion());
97  }
98 private:
99  /** @inheritDoc */
100  virtual void Init();
101  /** @inheritDoc */
102  virtual int Run();
103 };
104 
/** Names of the command-line arguments accepted by the application. */
/// File containing the patterns to be searched
static const string kPattern = "pattern";
/// Name of the BLAST database to be searched
static const string kDb = "db";
/// File containing the subject sequences in FASTA format
static const string kSubject = "subject";
/// Output file that receives the search results
static const string kOutput = "out";
109 
111 {
113 
114  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
115 
116  arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
117  "Application to find pattern in BLAST databases or subject sequences, version "
118  + CBlastVersion().Print());
119 
120  arg_desc->AddDefaultKey(kPattern, "input_file",
121  "File containing the patterns to be searched",
123 
124  arg_desc->AddOptionalKey(kDb, "database_name",
125  "Name of BLAST database to be searched",
127 
128  arg_desc->AddOptionalKey(kSubject, "input_file",
129  "File containing the subject sequences in FASTA format",
131 
132  arg_desc->AddDefaultKey(kOutput, "output_file",
133  "Output file to include results of the search",
135 
136  arg_desc->SetDependency(kDb, CArgDescriptions::eExcludes, kSubject);
137 
138  SetupArgDescriptions(arg_desc.release());
139 }
140 
142 {
143  int status = BLAST_EXIT_SUCCESS;
144 
145  try {
146 
147  // Allow the fasta reader to complain on invalid sequence input
149  SetDiagPostPrefix("seedtop");
150 
151  /*** Get the BLAST options ***/
152  const CArgs& args = GetArgs();
153 
154  CNcbiIstream& f_pattern = args[kPattern].AsInputFile();
155  CNcbiOstream& f_output = args[kOutput].AsOutputFile();
156 
158  CRef<CLocalDbAdapter> db_adapter;
159 
160  if (args.Exist(kSubject) && args[kSubject]) {
161  CNcbiIstream& f_subject = args[kSubject].AsInputFile();
162  //TSeqRange subj_range;
163  SDataLoaderConfig dlconfig(true);
165  CBlastInputSourceConfig iconfig(dlconfig);
166  CBlastFastaInputSource fasta(f_subject, iconfig);
167  CBlastInput input(&fasta);
168  CRef<blast::CBlastQueryVector> subjects(input.GetAllSeqs(*scope));
169  CRef<IQueryFactory> qf(new blast::CObjMgr_QueryFactory(*subjects));
170  CRef<CBlastOptionsHandle> opts_hndl
172  db_adapter.Reset(new CLocalDbAdapter(qf, opts_hndl));
173 
174  } else if (args.Exist(kDb) && args[kDb]) {
175 
176  CRef<CSearchDatabase> db(new CSearchDatabase(args[kDb].AsString(),
178  CRef<CSeqDB> seqdb = db->GetSeqDb();
179  db_adapter.Reset(new CLocalDbAdapter(*db));
180  scope->AddDataLoader(RegisterOMDataLoader(seqdb));
181 
182  } else {
183  NCBI_THROW(CInputException, eInvalidInput,
184  "Either a BLAST database or subject sequence(s) must be specified");
185  }
186  _ASSERT(db_adapter);
187 
188  while (true) {
189 
190  struct SSeedTopPattern pattern = s_ReadPattern(f_pattern);
191  if (pattern.pattern == "") break;
192 
193  CSeedTop seed_top(pattern.pattern);
194  CSeedTop::TSeedTopResults results = seed_top.Run(db_adapter);
195  CConstRef<CSeq_id> old_id(new CSeq_id());
197  const CSeq_id *sid = (*it)->GetId();
198  const CBioseq_Handle& bhl = scope->GetBioseqHandle(*sid);
199 
200  if (sid->AsFastaString() != old_id->AsFastaString()) {
201  const CBioseq_Handle::TId ids = bhl.GetId();
202  f_output << endl << '>';
203  ITERATE(CBioseq_Handle::TId, id, ids) {
204  string idst((*id).AsString());
205  int index = idst.find_last_not_of('|');
206  f_output << string(idst, 0, index + 1) << "|" ;
207  }
208 
209  f_output << sequence::CDeflineGenerator().GenerateDefline(bhl) << endl << endl;
210  f_output << "ID " << pattern.name << endl;;
211  f_output << "PA " << pattern.pattern << endl;
212  old_id.Reset(sid);
213  }
214 
215  f_output << "HI";
216  ITERATE(CPacked_seqint_Base::Tdata, range, (*it)->GetPacked_int().Get()) {
217  static const ESeqLocExtremes ex = eExtreme_Positional;
218  f_output << " (" << (*range)->GetStart(ex)+1 << " "
219  << (*range)->GetStop(ex)+1 << ")";
220  }
221  f_output << endl;
222  CSeqVector sv = bhl.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
223  string sq;
224  CSeq_loc::TRange tot_range = (*it)->GetTotalRange();
225  sv.GetSeqData(tot_range.GetFrom(), tot_range.GetTo()+1, sq);
226  f_output << "SQ " << sq << endl;
227  }
228 
229  db_adapter->ResetBlastSeqSrcIteration();
230  }
231 
232  } CATCH_ALL(status)
233  return status;
234 }
235 
236 #ifndef SKIP_DOXYGEN_PROCESSING
237 int main(int argc, const char* argv[] /*, const char* envp[]*/)
238 {
239  return CSeedTopApp().AppMain(argc, argv);
240 }
241 #endif /* SKIP_DOXYGEN_PROCESSING */
Produce formatted blast output for command line applications.
string RegisterOMDataLoader(CRef< CSeqDB > db_handle)
Register the BLAST database data loader using the already initialized CSeqDB object.
Utility functions for BLAST command line applications.
#define BLAST_EXIT_SUCCESS
Command line binary exit code: success.
Interface for reading SRA sequences into blast input.
@ eBlastp
Protein-Protein.
Definition: blast_types.hpp:59
Main argument class for BLASTP application.
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
CBioseq_Handle –.
Class representing a text file containing sequences in fasta format.
Class that centralizes the configuration data for sequences to be converted.
Definition: blast_input.hpp:48
Generalized converter from an abstract source of biological sequence data to collections of blast inp...
Keeps track of the version of the BLAST engine in the NCBI C++ toolkit.
Definition: version.hpp:53
Defines user input exceptions.
Interface to create a BlastSeqSrc suitable for use in CORE BLAST from a variety of BLAST database/s...
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
Blast Search Subject.
virtual void Init()
@inheritDoc
virtual int Run()
@inheritDoc
CSeedTopApp()
@inheritDoc
Definition: seedtop_app.cpp:93
void Print(const CCompactSAMApplication::AlignInfo &ai)
TSeedTopResults Run(CRef< CLocalDbAdapter > db)
Definition: seedtop.cpp:102
CRef< CSeqDB > GetSeqDb() const
Obtain a reference to the database.
static CBlastOptionsHandle * Create(EProgram program, EAPILocality locality=CBlastOptions::eLocal)
Creates an options handle object configured with default options for the requested program,...
vector< CConstRef< CSeq_loc > > TSeedTopResults
Definition: seedtop.hpp:140
@ eBlastDbIsProtein
protein
void SetFullVersion(CRef< CVersionAPI > version)
Set version data for the program.
Definition: ncbiapp.cpp:1187
void HideStdArgs(THideStdArgs hide_mask)
Set the hide mask for the Hide Std Flags.
Definition: ncbiapp.cpp:1325
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1208
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
@ fHideXmlHelp
Hide XML help description.
@ fHideLogfile
Hide log file description.
@ fHideFullVersion
Hide full version description.
@ fHideDryRun
Hide dryrun description.
@ fHideConffile
Hide configuration file description.
bool Exist(const string &name) const
Check existence of argument description.
Definition: ncbiargs.cpp:1813
@ eExcludes
One argument excludes another.
Definition: ncbiargs.hpp:957
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eOutputFile
Name of file (must be writable)
Definition: ncbiargs.hpp:596
string
Definition: cgiapp.hpp:690
void SetDiagPostPrefix(const char *prefix)
Specify a string to prefix all subsequent error postings with.
Definition: ncbidiag.cpp:6100
EDiagSev SetDiagPostLevel(EDiagSev post_sev=eDiag_Error)
Set the threshold severity for posting the messages.
Definition: ncbidiag.cpp:6132
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const string AsFastaString(void) const
Definition: Seq_id.cpp:2266
void AddDataLoader(const string &loader_name, TPriority pri=kPriority_Default)
Add data loader by name.
Definition: scope.cpp:510
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
vector< CSeq_id_Handle > TId
const TId & GetId(void) const
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
#define CVersion
static int input()
int len
const string version
version string
Definition: variables.hpp:66
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
std::istream & in(std::istream &in_, double &x_)
NOTE: This file contains work in progress and the APIs are likely to change, please do not rely on th...
static int * results[]
Declares the CSeedTop class.
static struct SSeedTopPattern s_ReadPattern(CNcbiIstream &in)
Definition: seedtop_app.cpp:58
USING_SCOPE(blast)
static const string kOutput("out")
static const string kSubject("subject")
static const string kPattern("pattern")
USING_NCBI_SCOPE
Definition: seedtop_app.cpp:45
static const string kDb("db")
Configuration structure for the CBlastScopeSource.
void OptimizeForWholeLargeSequenceRetrieval(bool value=true)
Configures the BLAST database data loader to optimize the retrieval of *entire* large sequences.
#define _ASSERT
Modified on Fri Sep 20 14:57:53 2024 by modify_doxy.py rev. 669887