NCBI C++ ToolKit
blastdbcmd.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blastdbcmd.cpp 101027 2023-10-17 19:19:18Z camacho $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Christiam Camacho
27  *
28  */
29 
30 /** @file blastdbcmd.cpp
31  * Command line tool to examine the contents of BLAST databases. This is the
32  * successor to fastacmd from the C toolkit
33  */
34 
35 #include <ncbi_pch.hpp>
36 #include <corelib/ncbiapp.hpp>
37 #include <serial/objostrjson.hpp>
47 #include <objtools/blast/seqdb_reader/tax4blastsqlite.hpp> // for taxid to their descendant taxids lookup
50 #include <serial/objostrjson.hpp>
51 #include "../blast/blast_app_util.hpp"
52 #include <iomanip>
53 
54 
55 #ifndef SKIP_DOXYGEN_PROCESSING
57 USING_SCOPE(blast);
58 #endif
59 
60 static const string NA = "N/A";
61 
62 /// The application class
63 class CBlastDBCmdApp : public CNcbiApplication
64 {
65 public:
66  /** @inheritDoc */
    // NOTE(review): the signature lines of the member(s) defined here are not
    // visible in this view. The visible statements install a CBlastVersion
    // and, when usage reporting is enabled, record the program name
    // "blastdbcmd" as a report parameter.
69  version->SetVersionInfo(new CBlastVersion());
72  if (m_UsageReport.IsEnabled()) {
74  m_UsageReport.AddParam(CBlastUsageReport::eProgram, (string) "blastdbcmd");
75  }
76  }
79  }
80 private:
81  /** @inheritDoc */
82  virtual void Init();
83  /** @inheritDoc */
84  virtual int Run();
85 
86  /// Handle to BLAST database
88  /// Is the database protein
90  /// output is FASTA
91  bool m_FASTA;
92  /// output is ASN.1 defline
94  /// should we find duplicate entries?
96  /// should we output target sequence only?
98 
100 
102 
105 
106  /// Initializes Blast DB
107  void x_InitBlastDB();
109 
    /// Reads the retrieval-related command line arguments into member state
    /// @return the output format string that Run() hands to the sequence
    /// formatter
110  string x_InitSearchRequest();
111 
112  /// Prints the BLAST database information (e.g.: handles -info command line
113  /// option)
115 
116  /// Processes all requests except printing the BLAST database information
117  /// @return 0 on success; 1 if some sequences were not retrieved
119 
120  /// Process batch entry with range, strand and filter id
121  /// @param args program input args
122  /// @param seq_fmt sequence formatter object
123  /// @return 0 on success; 1 if some queries were not processed
125 
127 
128  /// Process entry with range, strand and filter id
129  /// @param args program input args
130  /// @param seq_fmt sequence formatter object
131  /// @return 0 on success; 1 if some queries were not processed
133 
135 
137 
    /// Looks up the OIDs for one identifier (numeric GI or accession)
    /// @param acc identifier to look up
    /// @param oids receives the OIDs that were found
    /// @return true if oids is non-empty after the lookup, false otherwise
138  bool x_GetOids(const string & acc, vector<int> & oids);
139 
    /// Applies the per-entry specifier text of a batch input line to m_Config
    /// @param config specifier text taken from the batch input line
    /// @return 0 on success; a non-zero value makes the batch caller skip the
    /// entry
140  int x_ModifyConfigForBatchEntry(const string & config);
141 
    /// @return true if long sequence ids were requested, either with the
    /// long_seqids argument or with LONG_SEQID = 1 in the BLAST section of
    /// the application configuration
142  bool x_UseLongSeqIds();
143 
145 
147 
    /// NOTE(review): the definition is not visible in this view; presumably
    /// registers the command line options - confirm.
148  void x_AddCmdOptions();
149 };
150 
151 
/// Normalizes an identifier before it is looked up in the database.
///
/// Identifiers that contain '|' or '_' are examined through a CSeq_id:
/// PIR and PRF ids are returned as FASTA strings, PDB ids as their seq-id
/// string, and every other parsed id as GetSeqIdString(true). Identifiers
/// that do not take that path (including those for which no CSeq_id is
/// available) are returned upper-cased.
///
/// NOTE(review): the statement inside the try block is missing from this
/// view; presumably it builds seqid from id - confirm.
///
/// @param id identifier as supplied by the caller
/// @return the normalized identifier
152 string s_PreProcessAccessionsForDBv5(const string & id)
153 {
154  string rv = id;
155  if ((id.find('|') != NPOS) || (id.find('_') != NPOS)) {
156 
157  CRef<CSeq_id> seqid;
158  try {
160  }
    // Any failure in the try block is ignored on purpose: seqid then stays
    // empty and the identifier falls through to the upper-casing below.
161  catch(...) {
162  }
163 
164  if(seqid.NotEmpty()) {
165  if(seqid->IsPir() || seqid->IsPrf()) {
166  return seqid->AsFastaString();
167  }
168  else if (seqid->IsPdb()) {
169  string tmp = seqid->GetSeqIdString();
    // Joins the first four characters with the remainder; for a tmp of
    // at least four characters the result equals tmp.
170  rv = tmp.substr(0,4);
171  rv += tmp.substr(4);
172  return (rv);
173  }
174  return seqid->GetSeqIdString(true);
175  }
176  }
177 
178  return NStr::ToUpper(rv);
179 
180 }
181 
182 
/// Resolves one identifier to the OIDs it refers to.
///
/// The identifier is first converted to a number and tried as a GI; when no
/// OID is found for it, or the text is not numeric, it is looked up as an
/// accession instead.
///
/// NOTE(review): two lines following `string acc = id;` are missing from this
/// view (only their closing brace is left), so any preprocessing applied to
/// acc cannot be described here.
///
/// @param id identifier to resolve
/// @param oids receives the matching OIDs
/// @return true if oids is non-empty on exit; false, after posting an
/// "Entry not found" error, if it is empty
183 bool
184 CBlastDBCmdApp::x_GetOids(const string & id, vector<int> & oids)
185 {
186  string acc = id;
189  }
    // With fConvErr_NoThrow a failed conversion is detected through errno
    // below instead of through an exception.
190  TGi num_id = NStr::StringToNumeric<TGi>(acc, NStr::fConvErr_NoThrow);
191  if(!errno) {
192  int gi_oid = -1;
193  m_BlastDb->GiToOidwFilterCheck(num_id, gi_oid);
    // gi_oid still negative: nothing found for the GI, so fall back to an
    // accession lookup with the same text.
194  if(gi_oid < 0) {
195  m_BlastDb->AccessionToOids(acc, oids);
196  }
197  else {
198  oids.push_back(gi_oid);
199  }
200 
201  }
202  else {
203  m_BlastDb->AccessionToOids(acc, oids);
204  }
205  if(oids.empty()) {
206  ERR_POST(Error << "Entry not found: " << acc);
207  return false;
208  }
209  return true;
210 }
211 
/// Handles the "ipg" and "entry" retrieval requests.
///
/// NOTE(review): the line carrying this function's name and parameters is
/// missing from this view, as is one statement after args is fetched.
/// Presumably this is CBlastDBCmdApp::x_ProcessEntry and fmt is its
/// formatter parameter - confirm.
///
/// For "entry" the value is split on ',' and each piece is resolved with
/// x_GetOids(); every OID found is written through fmt.
///
/// @return 0 if every query was resolved, 1 if at least one was not
/// @throw CInputException if none of the "entry" queries could be resolved
212 int
214 {
215  unsigned int err_found = 0;
216  const CArgs& args = GetArgs();
218 
219  if (args["ipg"].HasValue()) {
    // NOTE(review): oid is not initialised here and the outcome of
    // PigToOid() is not checked before writing, whereas the "ipg_batch"
    // loop elsewhere in this file tests for oid == -1 first.
220  CSeqDB::TOID oid;
221  m_BlastDb->PigToOid(args["ipg"].AsInteger(),oid);
222  fmt.Write(oid, m_Config);
223  } else if (args["entry"].HasValue()) {
224  static const string kDelim(",");
225  const string& entry = args["entry"].AsString();
226 
227  vector<string> queries;
228  if (entry.find(kDelim[0]) != string::npos) {
229  NStr::Split(entry, kDelim, queries);
230  } else {
231  queries.resize(1);
232  queries[0] = entry;
233  }
234  for(unsigned int i=0; i < queries.size(); i++) {
235  vector<CSeqDB::TOID> oids;
236  if(x_GetOids(queries[i], oids)) {
237  for(unsigned int j=0; j < oids.size(); j++) {
    // In target-only mode the query text is passed along to Write().
238  if(m_TargetOnly) {
239  fmt.Write(oids[j], m_Config, queries[i]);
240  }
241  else {
242  fmt.Write(oids[j], m_Config);
243  }
244  }
245  }
246  else {
247  err_found ++;
248  }
249  }
    // Partial failures only affect the return value; a total failure is
    // raised as an exception.
250  if(err_found == queries.size()) {
251  NCBI_THROW(CInputException, eInvalidInput,
252  "Entry or entries not found in BLAST database");
253  }
254  }
255  return (err_found) ? 1:0;
256 }
257 
258 bool s_IsMaskAlgoIdValid(CSeqDB & blastdb, int id)
259 {
260  if (id >= 0) {
261  vector<int> algo_id(1, id);
262  vector<int> invalid_algo_ids = blastdb.ValidateMaskAlgorithms(algo_id);
263  if ( !invalid_algo_ids.empty()) {
264  ERR_POST(Error << "Invalid filtering algorithm ID: " << NStr::IntToString(id));
265  return false;
266  }
267  }
268  return true;
269 }
270 
272 {
    // NOTE(review): the signature and most statements of this function are
    // missing from this view; presumably it is
    // CBlastDBCmdApp::x_ModifyConfigForBatchEntry and `format` is its string
    // parameter - confirm. What is visible: status starts at 0,
    // m_Config.m_FiltAlgoId is reset to -1, and when format is non-empty each
    // element of tmp is classified as containing '-', as the word "minus"
    // (case-insensitive, non-protein databases only), or as something else.
    // status is set to 1 inside that last branch and is the return value.
273  int status = 0;
274  if (!m_DbIsProtein) {
276  }
278  m_Config.m_FiltAlgoId = -1;
279  if(!format.empty()) {
280  vector<string> tmp;
282  for(unsigned int i=0; i < tmp.size(); i++) {
283  if(tmp[i].find('-')!= string::npos) {
    // Whatever the (missing) statement in this try block throws is
    // swallowed without changing status.
284  try {
286  } catch (...) {
287  }
288  }
289  else if (!m_DbIsProtein && NStr::EqualNocase(tmp[i].c_str(), "minus")) {
291  }
292  else {
295  status = 1;
296  }
297  }
298  }
299  }
300  return status;
301 }
302 
/// Writes every sequence collected in oids through fmt.
///
/// NOTE(review): the signature line and the statement that fills oids are
/// missing from this view; presumably this is
/// CBlastDBCmdApp::x_ProcessTaxIdList and oids is filled from the taxonomy
/// ID list - confirm.
///
/// @return 0 after writing all OIDs; 1, after posting an error, if oids is
/// empty
303 int
305 {
306  vector<blastdb::TOid> oids;
308  if(oids.size() == 0) {
309  ERR_POST (Error << "No seq found in db for taxonomy list");
310  return 1;
311  }
312  for(unsigned i=0; i < oids.size(); i++) {
313  fmt.Write(oids[i], m_Config);
314  }
315  return 0;
316 }
317 
318 
/// Collects the requested taxonomy IDs into m_TaxIdList and opens the BLAST
/// database.
///
/// NOTE(review): the line carrying this function's name is missing from this
/// view.
///
/// The IDs come from the comma-delimited kArgTaxIdList argument if it has a
/// value, otherwise from the non-empty lines of the kArgTaxIdListFile input
/// file. Unless kArgNoTaxIdExpansion is set, each ID is also expanded with
/// the tax IDs returned by ITaxonomy4Blast::GetLeafNodeTaxids(). In
/// target-only mode the database is opened with the tax ID list attached;
/// otherwise it is opened without it.
319 void
321 {
322  const CArgs& args = GetArgs();
323  vector<string> ids;
324  if(args[kArgTaxIdList].HasValue()) {
325  string input = args[kArgTaxIdList].AsString();
326  NStr::Split(input, ",", ids);
327  }
328  else {
329  CNcbiIstream& input = args[kArgTaxIdListFile].AsInputFile();
330  while (input) {
331  string line;
332  NcbiGetlineEOL(input, line);
333  if ( !line.empty() ) {
334  ids.push_back(line);
335  }
336  }
337  }
338 
    // tb stays null when expansion is disabled or when constructing the
    // taxonomy lookup object throws; in the latter case only a warning is
    // logged and the IDs are used unexpanded.
339  unique_ptr<ITaxonomy4Blast> tb;
340  if( ! args[kArgNoTaxIdExpansion].AsBoolean() ){
341  try{
342  tb.reset(new CTaxonomy4BlastSQLite());
343  }
344  catch(CException &){
345  LOG_POST(Warning << "The -taxids command line option requires additional data files. Please see the section 'Taxonomic filtering for BLAST databases' in https://www.ncbi.nlm.nih.gov/books/NBK569839/ for details.");
346  }
347  }
348 
349  for (auto id : ids) {
    // NOTE(review): no fConvErr_NoThrow flag is passed here, so a
    // non-numeric ID presumably raises an exception - confirm.
350  auto taxid = NStr::StringToNumeric<TTaxId>(id, NStr::fAllowLeadingSpaces | NStr::fAllowTrailingSpaces);
351  m_TaxIdList.insert(taxid);
352  if (tb) {
353  vector<int> descendants;
354  tb->GetLeafNodeTaxids(taxid, descendants);
355  for (auto d: descendants)
356  m_TaxIdList.insert(static_cast<TTaxId>(d));
357  }
358  }
359 
360  CSeqDB::ESeqType seqtype = ParseMoleculeTypeString(args[kArgDbType].AsString());
361  m_DbIsProtein = static_cast<bool>(seqtype == CSeqDB::eProtein);
362  m_TargetOnly = args["target_only"];
363  if(m_TargetOnly) {
364  CRef<CSeqDBGiList> taxid_list(new CSeqDBGiList());
365  taxid_list->AddTaxIds(m_TaxIdList);
366  m_BlastDb.Reset(new CSeqDBExpert(args[kArgDb].AsString(), seqtype, taxid_list.GetPointer()));
367  }
368  else {
369  m_BlastDb.Reset(new CSeqDBExpert(args[kArgDb].AsString(), seqtype));
370  }
371 }
372 
373 
/// Processes the "entry_batch" input file by resolving all ids in one
/// AccessionsToOids() call and writing one sequence per id.
///
/// NOTE(review): the signature line, the statement that splits each input
/// line into id and format, and the condition guarding the
/// s_PreProcessAccessionsForDBv5() loop are missing from this view.
/// Presumably this is CBlastDBCmdApp::x_ProcessBatchEntry_NoDup - confirm.
///
/// @return 0 if every entry was written, 1 if at least one was skipped
374 int
376 {
377  int err_found = 0;
378  const CArgs& args = GetArgs();
379  CNcbiIstream& input = args["entry_batch"].AsInputFile();
    // ids and formats are kept parallel: formats[i] belongs to ids[i].
380  vector<string> ids, formats;
381  vector<CSeqDB::TOID> oids;
382  while (input) {
383  string line;
384  NcbiGetlineEOL(input, line);
385  if ( !line.empty() ) {
386  string id, format;
388  if(id.empty()) {
389  continue;
390  }
391  ids.push_back(id);
392  formats.push_back(format);
393  }
394  }
395 
397  for(unsigned int i=0; i < ids.size(); i++) {
398  ids[i] = s_PreProcessAccessionsForDBv5(ids[i]);
399  }
400  }
    // A CSeqDBException whose message contains "DB contains no accession
    // info" is tolerated; any other one is rethrown.
    // NOTE(review): if the tolerated exception leaves oids without an element
    // per id, the oids[i] accesses in the loop below go out of range -
    // confirm what AccessionsToOids() guarantees in that case.
401  try {
402  m_BlastDb->AccessionsToOids(ids, oids);
403  }
404  catch (CSeqDBException & e) {
405  if (e.GetMsg().find("DB contains no accession info") == NPOS){
406  NCBI_RETHROW_SAME(e, e.GetMsg());
407  }
408  }
409  for(unsigned i=0; i < ids.size(); i++) {
410  if(oids[i] == kSeqDBEntryNotFound) {
    // Not found as an accession: retry the same text as a numeric GI.
411  TGi num_id = NStr::StringToNumeric<TGi>(ids[i], NStr::fConvErr_NoThrow);
412  if(!errno) {
413  int gi_oid = -1;
414  m_BlastDb->GiToOidwFilterCheck(num_id, gi_oid);
415  if(gi_oid >= 0) {
416  oids[i] = gi_oid;
417  }
418  }
419  if(oids[i] == kSeqDBEntryNotFound) {
420  err_found ++;
421  ERR_POST (Error << "Skipped " << ids[i]);
422  continue;
423  }
424  }
    // A non-zero result means the per-entry specifiers were rejected.
425  if(x_ModifyConfigForBatchEntry(formats[i])) {
426  err_found ++;
427  ERR_POST (Error << "Skipped " << ids[i]);
428  continue;
429  }
430  if(m_TargetOnly) {
431  fmt.Write(oids[i], m_Config, ids[i]);
432  }
433  else {
434  fmt.Write(oids[i], m_Config);
435  }
436  }
437  return (err_found) ? 1 : 0;
438 }
439 
/// Processes the "entry_batch" input file one line at a time, resolving each
/// id with x_GetOids().
///
/// NOTE(review): the signature line, the statement that splits each input
/// line into id and format, and the condition that opens the first "Skipped"
/// branch are missing from this view. Presumably this is
/// CBlastDBCmdApp::x_ProcessBatchEntry - confirm.
///
/// @return 0 if every entry was written, 1 if at least one was skipped
440 int
442 {
443  int err_found = 0;
444  const CArgs& args = GetArgs();
445  CNcbiIstream& input = args["entry_batch"].AsInputFile();
446 
447  while (input) {
448  string line;
449  NcbiGetlineEOL(input, line);
450  if ( !line.empty() ) {
451  string id, format;
453  if(id.empty()) {
454  continue;
455  }
457  err_found ++;
458  ERR_POST (Error << "Skipped " << id);
459  continue;
460  }
461  vector<int> oids;
462  if(!x_GetOids(id, oids)) {
463  err_found ++;
464  ERR_POST (Error << "Skipped " << id);
465  continue;
466  }
467 
    // With duplicates requested every OID found for the id is written;
    // otherwise only the first one is.
468  if (m_GetDuplicates) {
469  for(unsigned int j=0; j < oids.size(); j++) {
470  fmt.Write(oids[j], m_Config);
471  }
472  }
473  else {
474  if(m_TargetOnly) {
475  fmt.Write(oids[0], m_Config, id);
476  }
477  else {
478  fmt.Write(oids[0], m_Config);
479  }
480  }
481  }
482  }
483  return (err_found) ? 1 : 0;
484 }
485 
486 
/// Processes the "ipg_batch" input file one line at a time, converting each
/// IPG to an OID and writing that sequence.
///
/// NOTE(review): the signature line, the statement that splits each input
/// line into id and format, the condition that opens the first "Skipped IPG"
/// branch and the declaration of pig are missing from this view. Presumably
/// this is CBlastDBCmdApp::x_ProcessBatchPig - confirm.
///
/// @return 0 if every entry was written, 1 if at least one was skipped
487 int
489 {
490  int err_found = 0;
491  const CArgs& args = GetArgs();
492  CNcbiIstream& input = args["ipg_batch"].AsInputFile();
493 
494  while (input) {
495  string line;
496  NcbiGetlineEOL(input, line);
497  if ( !line.empty() ) {
498  string id, format;
500  if(id.empty()) {
501  continue;
502  }
504  err_found ++;
505  ERR_POST (Error << "Skipped IPG : " << id);
506  continue;
507  }
508  int oid;
510  m_BlastDb->PigToOid(pig,oid);
    // An oid of -1 after the lookup is treated as "IPG not found".
511  if (oid == -1) {
512  err_found ++;
513  ERR_POST (Error << "Skipped IPG: " << id);
514  continue;
515  }
516 
517  fmt.Write(oid, m_Config);
518  }
519  }
520  return (err_found) ? 1 : 0;
521 }
522 
/// Opens the BLAST database named by kArgDb with the molecule type given by
/// kArgDbType, then records in m_DbIsProtein whether the opened database
/// reports itself as protein.
///
/// NOTE(review): the line carrying this function's name is missing from this
/// view; presumably CBlastDBCmdApp::x_InitBlastDB - confirm.
523 void
525 {
526  const CArgs& args = GetArgs();
527 
528  CSeqDB::ESeqType seqtype = ParseMoleculeTypeString(args[kArgDbType].AsString());
529  m_BlastDb.Reset(new CSeqDBExpert(args[kArgDb].AsString(), seqtype));
    // The type is read back from the opened database rather than taken from
    // the argument.
530  m_DbIsProtein = static_cast<bool>(m_BlastDb->GetSequenceType() == CSeqDB::eProtein);
531 }
532 
/// Prints a summary of the open BLAST database to the kArgOutput stream:
/// title, date, database format version ("5" or "4") and the list of volume
/// paths.
///
/// NOTE(review): the signature line and several statements are missing from
/// this view, including the ones that print the sequence count, the total
/// and longest lengths, and the filtering algorithms. Presumably this is
/// CBlastDBCmdApp::x_PrintBlastDatabaseInformation - confirm.
533 void
535 {
    // Unit word used in the messages, chosen by database type.
538  const string kLetters = m_DbIsProtein ? "residues" : "bases";
539  const string kVersion = (m_BlastDb->GetBlastDbVersion() == EBlastDbVersion::eBDB_Version5) ? "5":"4";
540  const CArgs& args = GetArgs();
541 
542  CNcbiOstream& out = args[kArgOutput].AsOutputFile();
543 
544  // Print basic database information
545  out << "Database: " << m_BlastDb->GetTitle() << endl
547  << " sequences; ";
548  if(args["exact_length"])
550  else
552  out << " total " << kLetters << endl << endl
553  << "Date: " << m_BlastDb->GetDate()
554  << "\tLongest sequence: "
556  << kLetters << endl << endl;
557 
558  out << "BLASTDB Version: " << kVersion << endl;
559 
560 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
561  (!defined(NCBI_COMPILER_MIPSPRO)) )
562  // Print filtering algorithms supported
564 #endif
565 
566  // Print volume names
567  vector<string> volumes;
568  m_BlastDb->FindVolumePaths(volumes,false);
569  out << endl << "Volumes:" << endl;
570  ITERATE(vector<string>, file_name, volumes) {
571  out << "\t" << *file_name << endl;
572  }
573 }
574 
576 {
    // Formatter for taxonomy records. The constructor turns a '%'-style
    // format string into a list of fields (m_Fields) plus the literal text
    // printed before each field (m_Seperators); PrintEntry() then writes one
    // record per call.
    //
    // NOTE(review): the class header, the enumerator names, some data members
    // (m_Out, m_NeedTaxInfoLookup and m_NeedNumSeqs are used below without a
    // visible declaration), the constructor signature and one accessor are
    // missing from this view. The class name is presumably CPrintTaxFields
    // and fmt the constructor's format-string parameter - confirm.
577 private:
578  enum {
585  eMaxFields
586  };
    // Field codes in output order; m_Seperators[i] is printed before
    // m_Fields[i] and the last element of m_Seperators after the final field.
588  vector<int> m_Fields;
589  vector<string> m_Seperators;
592 public:
    // NOTE(review): `fields` is not used by any visible statement.
594  vector<string> fields;
    // Literal text accumulated since the last field specifier.
595  string sp = kEmptyStr;
596  for (unsigned int i = 0; i < fmt.size(); i++) {
597  if (fmt[i] == '%') {
    // NOTE(review): for "%%" one '%' is appended but the second '%' is
    // not skipped, so it is examined again on the next iteration as the
    // possible start of a specifier - confirm this is intended.
598  if (fmt[i+1] == '%') {
599  sp += fmt[i];
600  continue;
601  }
602  i++;
    // NOTE(review): when '%' is the last character, fmt[i] is read at
    // index fmt.size() here and appended by the default branch - confirm.
603  switch (fmt[i]) {
604  case 'f' :
    // 'f' expands to every field from eTaxID up to eMaxFields except
    // eNumSeqs, separated by tabs. The inner loop variable shadows the
    // outer i.
    // NOTE(review): sp is pushed here and again after the switch, so
    // this case adds one more separator than it adds fields - confirm
    // the alignment when 'f' is not the last specifier.
605  m_NeedTaxInfoLookup = true;
606  m_Seperators.push_back(sp);
607  for(unsigned int i=eTaxID; i < eMaxFields; i++){
608  if ( i == eNumSeqs ) continue;
609  m_Fields.push_back(i);
610  m_Seperators.push_back("\t");
611  }
612  break;
613  case 'T' :
614  m_Fields.push_back(eTaxID);
615  break;
616  case 'S' :
617  m_Fields.push_back(eSciName);
618  m_NeedTaxInfoLookup = true;
619  break;
620  case 'L' :
621  m_Fields.push_back(eCommonName);
622  m_NeedTaxInfoLookup = true;
623  break;
624  case 'K' :
625  m_Fields.push_back(eSuperKingdom);
626  m_NeedTaxInfoLookup = true;
627  break;
628  case 'B' :
629  m_Fields.push_back(eBlastName);
630  m_NeedTaxInfoLookup = true;
631  break;
632  case 'n' :
633  m_Fields.push_back(eNumSeqs);
634  m_NeedNumSeqs = true;
635  break;
636  default:
    // Unknown specifier: keep both the '%' and the character as
    // literal text and go on scanning.
637  sp += fmt[i-1];
638  sp += fmt[i];
639  continue;
640  break;
641  }
    // A field was recognised: store the text that precedes it and start
    // collecting the next separator.
642  m_Seperators.push_back(sp);
643  sp = kEmptyStr;
644  }
645  else {
646  sp += fmt[i];
647  }
648  }
    // Text after the last specifier; PrintEntry() prints it via back().
649  m_Seperators.push_back(sp);
650 
651  if(m_Fields.empty()) {
652  NCBI_THROW(CInputException, eInvalidInput,
653  "Invalid format options for tax_info.");
654  }
655  }
656 
    /// Writes one record to m_Out: each configured field preceded by its
    /// separator text, then the trailing separator and a newline.
    /// @param t taxonomy data for the record
    /// @param num_seqs value printed for the eNumSeqs field
    /// @throw CInputException if m_Fields holds an unknown field code
657  void PrintEntry(const SSeqDBTaxInfo & t, int num_seqs){
658  for(unsigned int i=0; i < m_Fields.size(); i++) {
659  m_Out << m_Seperators[i];
660  switch (m_Fields[i]){
661  case eTaxID:
662  m_Out << t.taxid;
663  break;
664  case eSciName:
665  m_Out << t.scientific_name;
666  break;
667  case eCommonName:
668  m_Out << t.common_name;
669  break;
670  case eSuperKingdom:
671  m_Out << t.s_kingdom;
672  break;
673  case eBlastName:
674  m_Out << t.blast_name;
675  break;
676  case eNumSeqs:
677  m_Out << num_seqs;
678  break;
679  default:
680  NCBI_THROW(CInputException, eInvalidInput,
681  "Invalid format options for tax_info.");
682  break;
683  }
684  }
685  m_Out << m_Seperators.back();
686  m_Out << "\n";
687  }
    /// @return true if the format string contained the 'n' specifier
689  bool NeedNumSeqs(){return m_NeedNumSeqs;}
690 };
691 
692 
/// Prints one formatted record for every taxonomy ID in the BLAST database
/// to the kArgOutput stream, preceded by the number of tax IDs found.
///
/// NOTE(review): the signature line, the declaration of info and the call
/// that fills it when tax names are needed are missing from this view.
/// Presumably this is CBlastDBCmdApp::x_PrintBlastDatabaseTaxInformation -
/// confirm.
693 void
695 {
697  const CArgs& args = GetArgs();
698 
699  CNcbiOstream& out = args[kArgOutput].AsOutputFile();
700  const string& kFmt = args["outfmt"].AsString();
701  CPrintTaxFields tf(out, kFmt);
702  set<TTaxId> tax_ids;
703  m_BlastDb->GetDBTaxIds(tax_ids);
704 
705  // Print basic database information
706  out << "# of Tax IDs in Database: " << tax_ids.size() << endl;
708  ITERATE(set<TTaxId>, itr, tax_ids) {
710  if(tf.NeedTaxNames()){
    // A taxid still equal to ZERO_TAX_ID at this point is filled in with
    // the ID itself and the "N/A" placeholder for every name field.
712  if(info.taxid == ZERO_TAX_ID){
713  info.taxid = *itr;
714  info.scientific_name = NA;
715  info.common_name = NA;
716  info.blast_name = NA;
717  info.s_kingdom = NA;
718  }
719  }
720  else {
721  info.taxid = *itr;
722  }
    // The sequence count is only computed when the format asks for it; it
    // costs one TaxIdsToOids() call per tax ID.
723  int num_seqs = 0;
724  if(tf.NeedNumSeqs()) {
725  vector<blastdb::TOid> rv;
726  set<TTaxId> ti;
727  ti.insert(*itr);
728  m_BlastDb->TaxIdsToOids(ti, rv);
729  num_seqs = rv.size();
730  }
731  tf.PrintEntry(info, num_seqs);
732  }
733 }
734 
735 
/// Reads the retrieval-related command line arguments into member state
/// (m_GetDuplicates, m_TargetOnly, m_FASTA, m_Asn1Bioseq and fields of
/// m_Config) and returns the output format string.
///
/// NOTE(review): the signature line, the two statements that store the
/// strand, and the two conditions that guard the CInvalidDataException
/// throws are missing from this view. Presumably this is
/// CBlastDBCmdApp::x_InitSearchRequest - confirm.
///
/// @return the "outfmt" value after normalization: reduced to "%b", "%f" or
/// "%d" when one of those is present, and with the digits following
/// "%m" removed; empty if "outfmt" has no value
/// @throw CInputException for conflicting format specifiers, a "%m" without
/// an algorithm ID, or an unsupported strand value
/// @throw CInvalidDataException for an invalid filtering algorithm ID
736 string
738 {
739  const CArgs& args = GetArgs();
740  m_GetDuplicates = args["get_dups"];
741  m_TargetOnly = args["target_only"];
742 
743  string outfmt = kEmptyStr;
744  if (args["outfmt"].HasValue()) {
745  outfmt = args["outfmt"].AsString();
746  m_FASTA = false;
747  m_Asn1Bioseq = false;
748 
    // At most one of %f, %b and %d may appear in the format string.
749  if ((outfmt.find("%f") != string::npos &&
750  (outfmt.find("%b") != string::npos || outfmt.find("%d") != string::npos)) ||
751  (outfmt.find("%b") != string::npos && outfmt.find("%d") != string::npos)) {
752  NCBI_THROW(CInputException, eInvalidInput,
753  "The %f, %b, %d output format options cannot be specified together.");
754  }
755 
    // If "%b" is found within outfmt, discard everything else
756  if (outfmt.find("%b") != string::npos) {
757  outfmt = "%b";
758  m_Asn1Bioseq = true;
759  }
760 
761  // If "%f" is found within outfmt, discard everything else
762  if (outfmt.find("%f") != string::npos) {
763  outfmt = "%f";
764  m_FASTA = true;
765  }
766 
    // If "%d" is found within outfmt, discard everything else
767  if (outfmt.find("%d") != string::npos) {
768  outfmt = "%d";
769  }
770 
771  if (outfmt.find("%m") != string::npos) {
    // Parse the decimal algorithm ID that directly follows "%m". Each
    // digit is erased from outfmt as it is consumed, so i keeps pointing
    // at the next candidate character.
772  int algo_id = 0;
773  size_t i = outfmt.find("%m") + 2;
774  bool found = false;
775  while (i < outfmt.size() && outfmt[i] >= '0' && outfmt[i] <= '9') {
776  algo_id = algo_id * 10 + (outfmt[i] - '0');
777  outfmt.erase(i, 1);
778  found = true;
779  }
780  if (!found) {
781  NCBI_THROW(CInputException, eInvalidInput,
782  "The option '-outfmt %m' is not followed by a masking algo ID.");
783  }
784  m_Config.m_FmtAlgoId = algo_id;
786  NCBI_THROW(CInvalidDataException, eInvalidInput,
787  "Invalid filtering algorithm ID for outfmt %m.");
788  }
789  }
790  }
791 
    // The strand argument is only looked at for non-protein databases.
792  if (args["strand"].HasValue() && !m_DbIsProtein) {
793  if (args["strand"].AsString() == "plus") {
795  } else if (args["strand"].AsString() == "minus") {
797  } else {
798  NCBI_THROW(CInputException, eInvalidInput,
799  "Both strands is not supported");
800  }
801  }
802  m_Config.m_UseCtrlA = args["ctrl_a"];
803  if (args["mask_sequence_with"].HasValue()) {
    // NOTE(review): this -1 is overwritten by the very next statement.
804  m_Config.m_FiltAlgoId = -1;
    // The value is first read as an integer ID; if that conversion sets
    // errno, the same text is passed to GetMaskAlgorithmId() instead.
805  m_Config.m_FiltAlgoId = NStr::StringToInt(args["mask_sequence_with"].AsString(), NStr::fConvErr_NoThrow);
806  if(errno) {
807  m_Config.m_FiltAlgoId = m_BlastDb->GetMaskAlgorithmId(args["mask_sequence_with"].AsString());
808  }
810  NCBI_THROW(CInvalidDataException, eInvalidInput,
811  "Invalid filtering algorithm ID for mask_sequence_with.");
812  }
813  }
814  if (args["range"].HasValue()) {
815  m_Config.m_SeqRange = ParseSequenceRangeOpenEnd(args["range"].AsString());
816  }
817  return outfmt;
818 }
819 
/// Dispatches the retrieval request to the handler that matches the command
/// line arguments, in this order: "entry" equal to "all" (full dump),
/// "entry_batch", "entry" or "ipg", "ipg_batch", then the taxonomy ID list
/// arguments.
///
/// NOTE(review): the line carrying this function's name and parameters is
/// missing from this view; presumably CBlastDBCmdApp::x_ProcessSearchType
/// with fmt as its formatter parameter - confirm.
///
/// @return the selected handler's result; 0 after a full dump
/// @throw CInputException if none of the recognised arguments has a value
820 int
822 {
823  const CArgs& args = GetArgs();
824  if (args["entry"].HasValue() && args["entry"].AsString() == "all") {
825  fmt.DumpAll(m_Config);
826  }
827  else if (args["entry_batch"].HasValue()) {
828  if(m_GetDuplicates) {
829  return x_ProcessBatchEntry(fmt);
830  }
831  else {
832  return x_ProcessBatchEntry_NoDup(fmt);
833  }
834  }
835  else if (args["entry"].HasValue() || args["ipg"].HasValue()) {
836  return x_ProcessEntry(fmt);
837  }
838  else if (args["ipg_batch"].HasValue()) {
839  return x_ProcessBatchPig(fmt);
840  }
841  else if(args[kArgTaxIdList].HasValue()||
842  args[kArgTaxIdListFile].HasValue()) {
843  return x_ProcessTaxIdList(fmt);
844  }
845  else {
    // NOTE(review): the message below names 'pig', while the arguments
    // tested above are "ipg" and "ipg_batch", and the taxonomy ID list
    // arguments are not mentioned - the wording looks outdated.
846  NCBI_THROW(CInputException, eInvalidInput,
847  "Must specify query type: one of 'entry', 'entry_batch', or 'pig'");
848  }
849  return 0;
850 }
851 
853 {
    // Returns true if long sequence ids were requested, either through the
    // "long_seqids" argument or through LONG_SEQID = 1 in the BLAST section
    // of the application registry; false otherwise.
    // NOTE(review): the signature line and the declaration of app are missing
    // from this view; presumably this is CBlastDBCmdApp::x_UseLongSeqIds -
    // confirm.
854  const CArgs& args = GetArgs();
855  if (args["long_seqids"].AsBoolean()) {
856  return true;
857  }
859  if (app) {
860  const CNcbiRegistry& registry = app->GetConfig();
861  if (registry.Get("BLAST", "LONG_SEQID") == "1") {
862  return true;
863  }
864  }
865  return false;
866 }
867 
/// Runs a retrieval request: initialises the search settings, picks the
/// formatter that matches them (FASTA, ASN.1 Bioseq, or the generic
/// format-string formatter) and hands it to x_ProcessSearchType().
///
/// Exceptions are not propagated: they are posted as errors and turned into
/// a return value of 1.
///
/// NOTE(review): the line carrying this function's name is missing from this
/// view.
///
/// @return the result of x_ProcessSearchType(), or 1 if an exception was
/// caught
868 int
870 {
871  int err_found = 0;
872  try {
873  const CArgs& args = GetArgs();
874  CNcbiOstream& out = args[kArgOutput].AsOutputFile();
875  string outfmt = x_InitSearchRequest();
876  /* Special case: full db dump when no range and mask data is specified */
877  if (m_FASTA) {
878  CBlastDB_FastaFormatter fasta_fmt(*m_BlastDb, out, args["line_length"].AsInteger(), x_UseLongSeqIds());
879  err_found = x_ProcessSearchType(fasta_fmt);
880  }
881  else if (m_Asn1Bioseq) {
882  CBlastDB_BioseqFormatter bioseq_fmt(*m_BlastDb, out);
883  err_found = x_ProcessSearchType(bioseq_fmt);
884  }
885  else {
886  CBlastDB_SeqFormatter seq_fmt(outfmt, *m_BlastDb, out);
887  err_found = x_ProcessSearchType(seq_fmt);
888  }
889  }
890  catch (const CException& e) {
891  ERR_POST(Error << e.GetMsg());
892  err_found = 1;
893  } catch (...) {
894  ERR_POST(Error << "Failed to retrieve requested item");
895  err_found = 1;
896  }
897  return err_found;
898 }
899 
900 
902 {
904 
905  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
906 
907  // Specify USAGE context
908  arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
909  "BLAST database client, version " + CBlastVersion().Print());
910 
911  arg_desc->SetCurrentGroup("BLAST database options");
912  arg_desc->AddDefaultKey(kArgDb, "dbname", "BLAST database name",
914 
915  arg_desc->AddDefaultKey(kArgDbType, "molecule_type",
916  "Molecule type stored in BLAST database",
917  CArgDescriptions::eString, "guess");
918  arg_desc->SetConstraint(kArgDbType, &(*new CArgAllow_Strings,
919  "nucl", "prot", "guess"));
920 
921  arg_desc->SetCurrentGroup("Retrieval options");
922  arg_desc->AddOptionalKey("entry", "sequence_identifier",
923  "Comma-delimited search string(s) of sequence identifiers"
924  ":\n\te.g.: 555, AC147927, 'gnl|dbname|tag', or 'all' "
925  "to select all\n\tsequences in the database",
927 
928  arg_desc->AddOptionalKey("entry_batch", "input_file",
929  "Input file for batch processing (Format: one entry per line, seq id \n"
930  "followed by optional space-delimited specifier(s) [range|strand|mask_algo_id]",
932  arg_desc->SetDependency("entry_batch", CArgDescriptions::eExcludes, "entry");
933  arg_desc->SetDependency("entry_batch", CArgDescriptions::eExcludes, "range");
934  arg_desc->SetDependency("entry_batch", CArgDescriptions::eExcludes, "strand");
935  arg_desc->SetDependency("entry_batch", CArgDescriptions::eExcludes, "mask_sequence_with");
936 
937  arg_desc->AddOptionalKey("ipg", "IPG", "IPG to retrieve",
939  arg_desc->SetConstraint("ipg", new CArgAllowValuesGreaterThanOrEqual(0));
940  arg_desc->SetDependency("ipg", CArgDescriptions::eExcludes, "entry");
941  arg_desc->SetDependency("ipg", CArgDescriptions::eExcludes, "entry_batch");
942  arg_desc->SetDependency("ipg", CArgDescriptions::eExcludes, "target_only");
943  arg_desc->SetDependency("ipg", CArgDescriptions::eExcludes, "ipg_batch");
944 
945  arg_desc->AddOptionalKey("ipg_batch", "input_file",
946  "Input file for batch processing (Format: one entry per line, IPG \n"
947  "followed by optional space-delimited specifier(s) [range|strand|mask_algo_id]",
949  arg_desc->SetDependency("ipg_batch", CArgDescriptions::eExcludes, "entry");
950  arg_desc->SetDependency("ipg_batch", CArgDescriptions::eExcludes, "entry_batch");
951  arg_desc->SetDependency("ipg_batch", CArgDescriptions::eExcludes, "range");
952  arg_desc->SetDependency("ipg_batch", CArgDescriptions::eExcludes, "strand");
953  arg_desc->SetDependency("ipg_batch", CArgDescriptions::eExcludes, "mask_sequence_with");
954 
955  arg_desc->AddFlag("info", "Print BLAST database information", true);
956  // All other options to this program should be here
957  const char* exclusions[] = { "entry", "entry_batch", "outfmt", "strand",
958  "target_only", "ctrl_a", "get_dups", "pig", "range",
959  "mask_sequence", "list", "remove_redundant_dbs", "recursive",
960  "list_outfmt", "metadata", "metadata_output_prefix", kArgTaxIdListFile.c_str(), kArgTaxIdList.c_str(),
961  kArgNoTaxIdExpansion.c_str()};
962  for (size_t i = 0; i < sizeof(exclusions)/sizeof(*exclusions); i++) {
963  arg_desc->SetDependency("info", CArgDescriptions::eExcludes,
964  string(exclusions[i]));
965  }
966 
967  arg_desc->AddFlag("metadata", "Generate BLAST database metadata", true);
968  // All other options to this program should be here
969  const char* exclusions_m[] = { "entry", "entry_batch", "outfmt", "strand",
970  "target_only", "ctrl_a", "get_dups", "pig", "range",
971  "mask_sequence", "list", "remove_redundant_dbs", "recursive",
972  "list_outfmt", "info", kArgTaxIdListFile.c_str(), kArgTaxIdList.c_str(),
973  kArgNoTaxIdExpansion.c_str()};
974  for (size_t i = 0; i < sizeof(exclusions_m)/sizeof(*exclusions_m); i++) {
975  arg_desc->SetDependency("metadata", CArgDescriptions::eExcludes,
976  string(exclusions_m[i]));
977  }
978 
979  arg_desc->AddOptionalKey("metadata_output_prefix", "",
980  "Path prefix for location of database files in metadata", CArgDescriptions::eString);
981  arg_desc->SetDependency("metadata_output_prefix", CArgDescriptions::eRequires, "metadata");
982 
983  arg_desc->AddFlag("tax_info",
984  "Print taxonomic information contained in this BLAST database.\n"
985  "Use -outfmt to customize output. Format specifiers supported are:\n"
986  "\t\t%T means taxid\n"
987  "\t\t%L means common taxonomic name\n"
988  "\t\t%S means scientific name\n"
989  "\t\t%K means taxonomic super kingdom\n"
990  "\t\t%B means BLAST name\n"
991  "\t\t%n means num of seqs\n"
992  "By default it prints: '%T %S %L %K %B'\n", true);
993  // All other options to this program should be here
994  const char* tax_info_exclusions[] = { "info", "entry", "entry_batch", "strand",
995  "target_only", "ctrl_a", "get_dups", "pig", "range",
996  "mask_sequence", "list", "remove_redundant_dbs", "recursive",
997  "list_outfmt", kArgTaxIdListFile.c_str(), kArgTaxIdList.c_str(),
998  kArgNoTaxIdExpansion.c_str() };
999  for (size_t i = 0; i < sizeof(tax_info_exclusions)/sizeof(*tax_info_exclusions); i++) {
1000  arg_desc->SetDependency("tax_info", CArgDescriptions::eExcludes,
1001  string(tax_info_exclusions[i]));
1002  }
1003 
1004  arg_desc->SetCurrentGroup("Sequence retrieval configuration options");
1005  arg_desc->AddOptionalKey("range", "numbers",
1006  "Range of sequence to extract in 1-based offsets "
1007  "(Format: start-stop, for start to end of sequence use start - )",
1009 
1010  arg_desc->AddDefaultKey("strand", "strand",
1011  "Strand of nucleotide sequence to extract",
1012  CArgDescriptions::eString, "plus");
1013  arg_desc->SetConstraint("strand", &(*new CArgAllow_Strings, "minus",
1014  "plus"));
1015 
1016  arg_desc->AddOptionalKey("mask_sequence_with", "mask_algo_id",
1017  "Produce lower-case masked FASTA using the "
1018  "algorithm ID specified",
1020 
1021  // Some additional tax related grouping
1022  arg_desc->SetCurrentGroup("Taxonomic filtering options");
1023  arg_desc->AddOptionalKey(kArgTaxIdList, "taxonomy_ids",
1024  "Comma-delimited taxonomy identifiers", CArgDescriptions::eString);
1025  arg_desc->SetDependency(kArgTaxIdList, CArgDescriptions::eExcludes, "entry");
1026  arg_desc->SetDependency(kArgTaxIdList, CArgDescriptions::eExcludes, "entry_batch");
1027  arg_desc->SetDependency(kArgTaxIdList, CArgDescriptions::eExcludes, "pig");
1028 
1029  arg_desc->AddOptionalKey(kArgTaxIdListFile, "input_file",
1030  "Input file for taxonomy identifiers", CArgDescriptions::eInputFile);
1031  arg_desc->SetDependency(kArgTaxIdListFile, CArgDescriptions::eExcludes, "entry");
1032  arg_desc->SetDependency(kArgTaxIdListFile, CArgDescriptions::eExcludes, "entry_batch");
1033  arg_desc->SetDependency(kArgTaxIdListFile, CArgDescriptions::eExcludes, "pig");
1034  arg_desc->SetDependency(kArgTaxIdListFile, CArgDescriptions::eExcludes, kArgTaxIdList);
1035 
1036  // Disable Tax ID resolution to the descendants SB-3791/SB-3779
1037  arg_desc->AddFlag(kArgNoTaxIdExpansion, "Do not expand the taxonomy IDs provided to their descendant taxonomy IDs ", true);
1038  arg_desc->SetDependency(kArgNoTaxIdExpansion, CArgDescriptions::eExcludes, "entry");
1039  arg_desc->SetDependency(kArgNoTaxIdExpansion, CArgDescriptions::eExcludes, "entry_batch");
1040  arg_desc->SetDependency(kArgNoTaxIdExpansion, CArgDescriptions::eExcludes, "pig");
1041  arg_desc->SetCurrentGroup("Output configuration options");
1042  arg_desc->AddDefaultKey(kArgOutput, "output_file", "Output file name",
1044 
1045  // The format specifiers below should be handled in
1046  // CSeqFormatter::x_Builder
1047  arg_desc->AddDefaultKey("outfmt", "format",
1048  "Output format, where the available format specifiers are:\n"
1049  "\t\t%f means sequence in FASTA format\n"
1050  "\t\t%s means sequence data (without defline)\n"
1051  "\t\t%a means accession\n"
1052  "\t\t%g means gi\n"
1053  "\t\t%o means ordinal id (OID)\n"
1054  "\t\t%i means sequence id\n"
1055  "\t\t%t means sequence title\n"
1056  "\t\t%l means sequence length\n"
1057  "\t\t%h means sequence hash value\n"
1058  "\t\t%T means taxid\n"
1059  "\t\t%X means leaf-node taxids\n"
1060  "\t\t%e means membership integer\n"
1061  "\t\t%L means common taxonomic name\n"
1062  "\t\t%C means common taxonomic names for leaf-node taxids\n"
1063  "\t\t%S means scientific name\n"
1064  "\t\t%N means scientific names for leaf-node taxids\n"
1065  "\t\t%B means BLAST name\n" /* Is this useful outside NCBI? */
1066 #if _DEBUG
1067  "\t\t%n means a list of links integers separated by ';'\n"
1068 #endif /* _DEBUG */
1069  "\t\t%K means taxonomic super kingdom\n"
1070  "\t\t%P means PIG\n"
1071 #if _DEBUG
1072  "\t\t%d means defline in text ASN.1 format\n"
1073  "\t\t%b means Bioseq in text ASN.1 format\n"
1074 #endif /* _DEBUG */
1075  "\t\t%m means sequence masking data.\n"
1076  "\t\t Masking data will be displayed as a series of 'N-M' values\n"
1077  "\t\t separated by ';' or the word 'none' if none are available.\n"
1078 #if _DEBUG
1079  "\tIf '%f' or '%d' are specified, all other format specifiers are ignored.\n"
1080  "\tFor every format except '%f' and '%d', each line of output will "
1081 #else
1082  "\tIf '%f' is specified, all other format specifiers are ignored.\n"
1083  "\tFor every format except '%f', each line of output will "
1084 #endif /* _DEBUG */
1085  "correspond\n\tto a sequence.\n",
1087 
1088  //arg_desc->AddDefaultKey("target_only", "value",
1089  // "Definition line should contain target gi only",
1090  // CArgDescriptions::eBoolean, "false");
1091  arg_desc->AddFlag("target_only",
1092  "Definition line should contain target entry only", true);
1093 
1094  //arg_desc->AddDefaultKey("get_dups", "value",
1095  // "Retrieve duplicate accessions",
1096  // CArgDescriptions::eBoolean, "false");
1097  arg_desc->AddFlag("get_dups", "Retrieve duplicate accessions", true);
1098  arg_desc->SetDependency("get_dups", CArgDescriptions::eExcludes,
1099  "target_only");
1100 
1101  arg_desc->SetCurrentGroup("Output configuration options for FASTA format");
1102  arg_desc->AddDefaultKey("line_length", "number", "Line length for output",
1104  NStr::IntToString(80));
1105  arg_desc->SetConstraint("line_length",
1107 
1108  arg_desc->AddFlag("ctrl_a",
1109  "Use Ctrl-A as the non-redundant defline separator",true);
1110 
1111  const char* exclusions_discovery[] = { "entry", "entry_batch", "outfmt",
1112  "strand", "target_only", "ctrl_a", "get_dups", "pig", "range", kArgDb.c_str(),
1113  "info", "mask_sequence", "line_length" };
1114  arg_desc->SetCurrentGroup("BLAST database configuration and discovery options");
1115  arg_desc->AddFlag("show_blastdb_search_path",
1116  "Displays the default BLAST database search paths", true);
1117  arg_desc->AddOptionalKey("list", "directory",
1118  "List BLAST databases in the specified directory",
1120  arg_desc->AddFlag("remove_redundant_dbs",
1121  "Remove the databases that are referenced by another "
1122  "alias file in the directory in question", true);
1123  arg_desc->AddFlag("recursive",
1124  "Recursively traverse the directory structure to list "
1125  "available BLAST databases", true);
1126  arg_desc->AddDefaultKey("list_outfmt", "format",
1127  "Output format for the list option, where the available format specifiers are:\n"
1128  "\t\t%f means the BLAST database absolute file name path\n"
1129  "\t\t%p means the BLAST database molecule type\n"
1130  "\t\t%t means the BLAST database title\n"
1131  "\t\t%d means the date of last update of the BLAST database\n"
1132  "\t\t%l means the number of bases/residues in the BLAST database\n"
1133  "\t\t%n means the number of sequences in the BLAST database\n"
1134  "\t\t%U means the number of bytes used by the BLAST database\n"
1135  "\t\t%v means the BLAST database format version\n"
1136  "\tFor every format each line of output will "
1137  "correspond to a BLAST database.\n",
1138  CArgDescriptions::eString, "%f %p");
1139  for (size_t i = 0; i <
1140  sizeof(exclusions_discovery)/sizeof(*exclusions_discovery); i++) {
1141  arg_desc->SetDependency("list", CArgDescriptions::eExcludes,
1142  string(exclusions_discovery[i]));
1143  arg_desc->SetDependency("recursive", CArgDescriptions::eExcludes,
1144  string(exclusions_discovery[i]));
1145  arg_desc->SetDependency("remove_redundant_dbs", CArgDescriptions::eExcludes,
1146  string(exclusions_discovery[i]));
1147  arg_desc->SetDependency("list_outfmt", CArgDescriptions::eExcludes,
1148  string(exclusions_discovery[i]));
1149  arg_desc->SetDependency("show_blastdb_search_path", CArgDescriptions::eExcludes,
1150  string(exclusions_discovery[i]));
1151  }
1152  arg_desc->SetDependency("show_blastdb_search_path", CArgDescriptions::eExcludes,
1153  "list");
1154  arg_desc->SetDependency("show_blastdb_search_path", CArgDescriptions::eExcludes,
1155  "recursive");
1156  arg_desc->SetDependency("show_blastdb_search_path", CArgDescriptions::eExcludes,
1157  "list_outfmt");
1158  arg_desc->SetDependency("show_blastdb_search_path", CArgDescriptions::eExcludes,
1159  "remove_redundant_dbs");
1160 
1161  arg_desc->AddFlag("exact_length", "Get exact length for db info", true);
1162  arg_desc->SetDependency("exact_length", CArgDescriptions::eRequires,
1163  "info");
1164  arg_desc->AddFlag("long_seqids", "Use long seq id for fasta deflines", true);
1165  arg_desc->SetDependency("long_seqids", CArgDescriptions::eExcludes, "info");
1166  SetupArgDescriptions(arg_desc.release());
1167 }
1168 
// Runs the blastdbcmd application: inspects the parsed command-line arguments,
// performs the single requested action and returns the resulting status code
// (0 unless a later step changes it).
//
// Visible dispatch order:
//   1. "show_blastdb_search_path" - early return.
//   2. "list"                     - enumerate BLAST databases in a directory,
//                                   early return.
//   3. "info" / "metadata" / "tax_info" / taxid list / everything else -
//      operate on an opened BLAST database, then call x_AddCmdOptions().
//
// NOTE(review): this listing skips embedded line numbers 1176, 1182, 1204,
// 1225, 1229 and 1240, so some statements of this function are not shown
// here; branches that look empty below are incomplete in the listing, not
// necessarily in the real source. Confirm against the repository copy.
1169 int CBlastDBCmdApp::Run(void)
1170 {
1171  int status = 0;
1172  const CArgs& args = GetArgs();
1173 
1174  // Silences warning in CSeq_id for CSeq_id::fParse_PartialOK
1175  SetDiagFilter(eDiagFilter_Post, "!(1306.10)");
1177  SetDiagPostPrefix("blastdbcmd");
1178 
1179  try {
  // All output of every mode goes to the stream selected by the "out" argument.
1180  CNcbiOstream& out = args["out"].AsOutputFile();
1181  if (args["show_blastdb_search_path"]) {
  // (listing line 1182 is not shown; the statement that does the work of
  // this branch is missing from this view)
1183  return status;
1184  } else if (args["list"]) {
  // Database discovery mode: gather the listing options, find the
  // databases and print one formatted line per database.
1185  const string& blastdb_dir = args["list"].AsString();
1186  const bool recurse = args["recursive"];
1187  const bool remove_redundant_dbs = args["remove_redundant_dbs"];
  // Fall back to the literal "guess" when no database type was supplied.
1188  const string dbtype = args[kArgDbType]
1189  ? args[kArgDbType].AsString()
1190  : "guess";
1191  const string& kOutFmt = args["list_outfmt"].AsString();
  // The literal 'true' is FindBlastDBs' include_alias_files argument.
1192  const vector<SSeqDBInitInfo> dbs =
1193  FindBlastDBs(blastdb_dir, dbtype, recurse, true,
1194  remove_redundant_dbs);
1195  CBlastDbFormatter blastdb_fmt(kOutFmt);
1196  ITERATE(vector<SSeqDBInitInfo>, db, dbs) {
1197  out << blastdb_fmt.Write(*db) << NcbiEndl;
1198  }
  // Early return: x_AddCmdOptions() below is not reached in this mode.
1199  return status;
1200  }
1201 
1202  if (args["info"]) {
1203  x_InitBlastDB();
  // (listing line 1204 is not shown)
1205  }
1206  else if (args["metadata"]) {
  // Metadata mode: serialize the database metadata object as JSON.
1207  x_InitBlastDB();
1208  string output_prefix = args["metadata_output_prefix"]
1209  ? args["metadata_output_prefix"].AsString()
1210  : kEmptyStr;
  // A non-empty prefix is made to end with the platform path separator;
  // an empty prefix is passed through unchanged.
1211  if (!output_prefix.empty() && (output_prefix.back() != CFile::GetPathSeparator()))
1212  output_prefix += CFile::GetPathSeparator();
1213  CRef<CBlast_db_metadata> m = m_BlastDb->GetDBMetaData(output_prefix);
  // eNoOwnership: the JSON writer wraps 'out' without taking ownership of it.
1214  unique_ptr<CObjectOStreamJson> json_out(new CObjectOStreamJson(out, eNoOwnership));
1215  json_out->SetDefaultStringEncoding(eEncoding_Ascii);
1216  json_out->PreserveKeyNames();
1217  CConstObjectInfo obj_info(m, m->GetTypeInfo());
1218  json_out->WriteObject(obj_info);
  // Flush the serializer, then the underlying stream, before appending the
  // final end-of-line to the stream directly.
1219  json_out->Flush();
1220  out.flush();
1221  out << NcbiEndl;
1222  }
1223  else if (args["tax_info"]) {
1224  x_InitBlastDB();
  // (listing line 1225 is not shown)
1226  }
1227  else if(args[kArgTaxIdList].HasValue() ||
1228  args[kArgTaxIdListFile].HasValue()) {
  // (listing line 1229 is not shown; note that x_InitBlastDB() is not
  // called in the visible part of this branch, unlike the others)
1230  status = x_ProcessSearchRequest();
1231  }
1232  else {
  // Default mode: open the database and run the search request.
1233  x_InitBlastDB();
1234  status = x_ProcessSearchRequest();
1235  }
  // Reached only by the database modes above (the two early returns skip it).
1236  x_AddCmdOptions();
1237 
  // CATCH_ALL is the shared catch block of the BLAST command-line programs;
  // presumably it stores an error code in 'status' - confirm in the macro's
  // definition, which is not part of this listing.
1238  } CATCH_ALL(status)
1239 
  // (listing line 1240 is not shown)
1241  return status;
1242 }
1243 
// Body of a member function whose signature line (listing line 1244) is not
// shown; presumably CBlastDBCmdApp::x_AddCmdOptions(), which Run() calls -
// confirm against the repository copy.
//
// Visible behaviour: inspects the parsed arguments to classify the request
// (info / tax_info / taxid list / ipg / entry), records the "outfmt" value in
// m_UsageReport, and builds a space-separated string of database names with
// their directory components removed.
//
// NOTE(review): the statements inside most branches (listing lines 1248,
// 1251, 1254, 1257, 1260, 1262, 1265), the statement that fills db_list
// (1273) and the statements that consume db_name (1286-1289) are missing from
// this listing.
1245 {
1246  const CArgs & args = GetArgs();
  // Mutually exclusive classification of the request type; the action taken
  // in each branch is not shown in this listing.
1247  if (args["info"]) {
1249  }
1250  else if (args["tax_info"]) {
1252  }
1253  else if(args[kArgTaxIdList].HasValue() || args[kArgTaxIdListFile].HasValue()) {
1255  }
1256  else if(args["ipg"].HasValue() || args["ipg_batch"].HasValue()) {
1258  }
1259  else if(args["entry"].HasValue() || args["entry_batch"].HasValue()) {
  // "-entry all" is distinguished from every other entry / entry_batch request.
1261  if (args["entry"].HasValue() && args["entry"].AsString() == "all") {
1263  }
1264  else {
1266  }
1267  }
  // Independent of the chain above: report the output format when one was given.
1268  if(args["outfmt"].HasValue()) {
1269  m_UsageReport.AddParam(CBlastUsageReport::eOutputFmt, args["outfmt"].AsString());
1270  }
1271 
1272  vector<string> db_list;
  // (listing line 1273, which presumably populates db_list, is not shown)
  // Strip everything up to and including the last path separator from each
  // entry, leaving only the final path component.
1274  NON_CONST_ITERATE(vector<string>, itr, db_list) {
  // NOTE(review): find_last_of() returns an unsigned size type; storing it
  // in 'int' and comparing with -1 relies on npos narrowing to -1. Using
  // the size type and comparing against NPOS would state the intent directly.
1275  int off = (*itr).find_last_of(CFile::GetPathSeparator());
1276  if (off != -1) {
1277  (*itr).erase(0, off+1);
1278  }
1279  }
1280  string db_name = NStr::Join(db_list, " ");
1281 
  // NOTE(review): every element was already stripped of path separators in
  // the loop above, so this second strip looks like a no-op - verify before
  // removing.
1282  int off = db_name.find_last_of(CFile::GetPathSeparator());
1283  if (off != -1) {
1284  db_name.erase(0, off+1);
1285  }
1290 }
1291 
1292 
1293 
1294 #ifndef SKIP_DOXYGEN_PROCESSING
// Program entry point: constructs a temporary CBlastDBCmdApp and returns the
// value of its AppMain(argc, argv) call as the process exit code. The
// environment pointer (envp) is deliberately left out of the signature and is
// not forwarded to AppMain.
1295 int main(int argc, const char* argv[] /*, const char* envp[]*/)
1296 {
1297  return CBlastDBCmdApp().AppMain(argc, argv);
1298 }
1299 #endif /* SKIP_DOXYGEN_PROCESSING */
Declares singleton objects to store the version and reference for the BLAST engine.
#define CATCH_ALL(exit_code)
Standard catch statement for all BLAST command line programs.
Declares the BLAST exception class.
Interface for converting sources of sequence data into blast sequence input.
Auxiliary classes/functions for BLAST input library.
TSeqRange ParseSequenceRangeOpenEnd(const string &range_str, const char *error_prefix=NULL)
Parse and extract a sequence range from argument provided to this function.
static const NStr::TNumToStringFlags kFlags
Definition of an identifier for a sequence in a BLAST database.
Definition of a customizable BLAST DB information formatter interface.
USING_SCOPE(blast)
static const string NA
Definition: blastdbcmd.cpp:60
string s_PreProcessAccessionsForDBv5(const string &id)
Definition: blastdbcmd.cpp:152
int main(int argc, const char *argv[])
USING_NCBI_SCOPE
Definition: blastdbcmd.cpp:56
bool s_IsMaskAlgoIdValid(CSeqDB &blastdb, int id)
Definition: blastdbcmd.cpp:258
Class to constrain the values of an argument to those greater than or equal to the value specified in...
CArgAllow_Strings –.
Definition: ncbiargs.hpp:1641
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
The application class.
void x_PrintBlastDatabaseInformation()
Prints the BLAST database information (e.g.
Definition: blastdbcmd.cpp:534
void x_InitBlastDB()
Initializes Blast DB.
Definition: blastdbcmd.cpp:524
int x_ProcessBatchEntry(CBlastDB_Formatter &seq_fmt)
Process batch entry with range, strand and filter id.
Definition: blastdbcmd.cpp:441
virtual void Init()
@inheritDoc
void x_PrintBlastDatabaseTaxInformation()
Definition: blastdbcmd.cpp:694
bool m_Asn1Bioseq
output is ASN.1 defline
Definition: blastdbcmd.cpp:93
bool x_UseLongSeqIds()
Definition: blastdbcmd.cpp:852
int x_ProcessBatchEntry_NoDup(CBlastDB_Formatter &fmt)
Definition: blastdbcmd.cpp:375
bool x_GetOids(const string &acc, vector< int > &oids)
Definition: blastdbcmd.cpp:184
bool m_TargetOnly
should we output target sequence only?
Definition: blastdbcmd.cpp:97
virtual int Run()
@inheritDoc
string x_InitSearchRequest()
Definition: blastdbcmd.cpp:737
CBlastDBCmdApp()
@inheritDoc
Definition: blastdbcmd.cpp:67
virtual void Init()
@inheritDoc
bool m_DbIsProtein
Is the database protein.
Definition: blastdbcmd.cpp:89
CRef< CSeqDBExpert > m_BlastDb
Handle to BLAST database.
Definition: blastdbcmd.cpp:87
int x_ProcessSearchRequest()
Processes all requests except printing the BLAST database information.
Definition: blastdbcmd.cpp:869
bool m_FASTA
output is FASTA
Definition: blastdbcmd.cpp:91
CStopWatch m_StopWatch
Definition: blastdbcmd.cpp:104
void x_AddCmdOptions()
int x_ProcessTaxIdList(CBlastDB_Formatter &fmt)
Definition: blastdbcmd.cpp:304
bool m_GetDuplicates
should we find duplicate entries?
Definition: blastdbcmd.cpp:95
int x_ProcessBatchPig(CBlastDB_Formatter &fmt)
Definition: blastdbcmd.cpp:488
int x_ProcessEntry(CBlastDB_Formatter &fmt)
Process entry with range, strand and filter id.
Definition: blastdbcmd.cpp:213
int x_ModifyConfigForBatchEntry(const string &config)
Definition: blastdbcmd.cpp:271
CBlastDB_FormatterConfig m_Config
Definition: blastdbcmd.cpp:99
CBlastUsageReport m_UsageReport
Definition: blastdbcmd.cpp:103
set< TTaxId > m_TaxIdList
Definition: blastdbcmd.cpp:101
void x_InitBlastDB_TaxIdList()
Definition: blastdbcmd.cpp:320
int x_ProcessSearchType(CBlastDB_Formatter &fmt)
Definition: blastdbcmd.cpp:821
virtual int Run()
@inheritDoc
Fasta formatter interface.
virtual int Write(CSeqDB::TOID oid, const CBlastDB_FormatterConfig &config, string target_id=kEmptyStr)=0
virtual void DumpAll(const CBlastDB_FormatterConfig &config)=0
Customizable sequence formatter interface.
Customizable BLAST DB information formatter interface.
void AddParam(EUsageParams p, int val)
Keeps track of the version of the BLAST engine in the NCBI C++ toolkit.
Definition: version.hpp:53
CConstObjectInfo –.
Definition: objectinfo.hpp:421
Defines user input exceptions.
Defines invalid user input exceptions.
static CNcbiApplication * Instance(void)
Singleton method.
Definition: ncbiapp.cpp:264
CNcbiRegistry –.
Definition: ncbireg.hpp:913
CObjectOStreamJson –.
Definition: objostrjson.hpp:54
bool m_NeedTaxInfoLookup
Definition: blastdbcmd.cpp:590
void PrintEntry(const SSeqDBTaxInfo &t, int num_seqs)
Definition: blastdbcmd.cpp:657
vector< int > m_Fields
Definition: blastdbcmd.cpp:588
CNcbiOstream & m_Out
Definition: blastdbcmd.cpp:587
CPrintTaxFields(CNcbiOstream &out, const string &fmt)
Definition: blastdbcmd.cpp:593
vector< string > m_Seperators
Definition: blastdbcmd.cpp:589
CRef –.
Definition: ncbiobj.hpp:618
CSeqDBException.
Definition: seqdbcommon.hpp:73
CSeqDBExpert.
Definition: seqdbexpert.hpp:55
CSeqDBGiList.
void AddTaxIds(const set< TTaxId > &tax_ids)
static bool GetTaxNames(TTaxId tax_id, SSeqDBTaxInfo &info)
Get the taxonomy names for a given tax id.
Definition: seqdbtax.cpp:219
CSeqDB.
Definition: seqdb.hpp:161
int TOID
Sequence type accepted and returned for OID indices.
Definition: seqdb.hpp:216
void GetDBTaxIds(set< TTaxId > &tax_ids) const
Get all unique tax ids from db.
Definition: seqdb.cpp:262
static void FindVolumePaths(const string &dbname, ESeqType seqtype, vector< string > &paths, vector< string > *alias_paths=NULL, bool recursive=true, bool expand_links=true)
Find volume paths.
Definition: seqdb.cpp:1040
void TaxIdsToOids(set< TTaxId > &tax_ids, vector< blastdb::TOid > &rv) const
Get Oid list for input tax ids.
Definition: seqdb.cpp:257
bool GiToOidwFilterCheck(TGi gi, int &oid) const
Translate a GI To an OID with filter check.
Definition: seqdb.cpp:817
Uint8 GetTotalLength() const
Returns the sum of the lengths of all available sequences.
Definition: seqdb.cpp:685
bool PigToOid(int pig, int &oid) const
Translate a PIG to an OID.
Definition: seqdb.cpp:781
static string GenerateSearchPath()
Returns the default BLAST database search path configured for this local installation of BLAST.
Definition: seqdb.cpp:1340
vector< int > ValidateMaskAlgorithms(const vector< int > &algorithm_ids)
Validates the algorithm IDs passed to this function, returning a vector of those algorithm IDs not pr...
Definition: seqdb.cpp:1242
const string & GetDBNameList() const
Get list of database names.
Definition: seqdb.cpp:760
string GetAvailableMaskAlgorithmDescriptions()
Returns a formatted string with the list of available masking algorithms in this database for display...
Definition: seqdb.cpp:1237
int GetMaxLength() const
Returns the length of the largest sequence in the database.
Definition: seqdb.cpp:705
ESeqType GetSequenceType() const
Returns the type of database opened - protein or nucleotide.
Definition: seqdb.cpp:427
ESeqType
Sequence types (eUnknown tries protein, then nucleotide).
Definition: seqdb.hpp:173
@ eProtein
Definition: seqdb.hpp:174
string GetTitle() const
Returns the database title.
Definition: seqdb.cpp:630
int GetNumSeqs() const
Returns the number of sequences available.
Definition: seqdb.cpp:670
CRef< CBlast_db_metadata > GetDBMetaData(string user_path=kEmptyStr)
Definition: seqdb.cpp:1705
void AccessionToOids(const string &acc, vector< int > &oids) const
Translate an Accession to a list of OIDs.
Definition: seqdb.cpp:870
string GetDate() const
Returns the construction date of the database.
Definition: seqdb.cpp:635
int GetMaskAlgorithmId(const string &algo_name) const
Get the numeric algorithm ID for a string.
Definition: seqdb.cpp:1232
EBlastDbVersion GetBlastDbVersion() const
Return blast db version.
Definition: seqdb.cpp:1604
Uint8 GetExactTotalLength()
Returns the exact sum of the lengths of all available sequences.
Definition: seqdb.cpp:690
void AccessionsToOids(const vector< string > &accs, vector< blastdb::TOid > &oids) const
Definition: seqdb.cpp:252
CStopWatch –.
Definition: ncbitime.hpp:1938
Class to retrieve taxonomic information for filtering BLASTDBs.
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
size_type size() const
Definition: set.hpp:132
const string kArgOutput
Output file name.
const string kArgDbType
BLAST database molecule type.
const string kArgTaxIdListFile
Argument to specify file with taxonomy ids for filtering.
const string kArgNoTaxIdExpansion
Argument to not resolve TaxIds to their descendants.
const string kArgDb
BLAST database name.
const string kArgTaxIdList
Argument to specify taxonomy ids for filtering.
static CMemoryRegistry registry
Definition: cn3d_tools.cpp:81
void Print(const CCompactSAMApplication::AlignInfo &ai)
const char * file_name[]
std::ofstream out("events_result.xml")
main entry point for tests
#define false
Definition: bool.h:36
static char tmp[3200]
Definition: utf8.c:42
void SetFullVersion(CRef< CVersionAPI > version)
Set version data for the program.
Definition: ncbiapp.cpp:1174
void HideStdArgs(THideStdArgs hide_mask)
Set the hide mask for the Hide Std Flags.
Definition: ncbiapp.cpp:1312
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
const CNcbiRegistry & GetConfig(void) const
Get the application's cached configuration parameters (read-only).
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:819
CVersionInfo GetVersion(void) const
Get the program version information.
Definition: ncbiapp.cpp:1184
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1195
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
@ fHideXmlHelp
Hide XML help description.
@ fHideFullVersion
Hide full version description.
@ fHideDryRun
Hide dryrun description.
@ fHideConffile
Hide configuration file description.
@ eNoOwnership
No ownership is assumed.
Definition: ncbi_types.h:135
@ eRequires
One argument requires another.
Definition: ncbiargs.hpp:956
@ eExcludes
One argument excludes another.
Definition: ncbiargs.hpp:957
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eOutputFile
Name of file (must be writable)
Definition: ncbiargs.hpp:596
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
void SetDiagFilter(EDiagFilter what, const char *filter_str)
Set diagnostic filter.
Definition: ncbidiag.cpp:7670
void SetDiagPostPrefix(const char *prefix)
Specify a string to prefix all subsequent error postings with.
Definition: ncbidiag.cpp:6097
EDiagSev SetDiagPostLevel(EDiagSev post_sev=eDiag_Error)
Set the threshold severity for posting the messages.
Definition: ncbidiag.cpp:6129
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
@ eDiagFilter_Post
for all non-TRACE, non-FATAL
Definition: ncbidiag.hpp:2530
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
#define NCBI_RETHROW_SAME(prev_exception, message)
Generic macro to re-throw the same exception.
Definition: ncbiexpt.hpp:749
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
static char GetPathSeparator(void)
Get path separator symbol specific for the current platform.
Definition: ncbifile.cpp:433
const string AsFastaString(void) const
Definition: Seq_id.cpp:2266
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
Definition: Seq_id.cpp:2145
@ fParse_RawText
Try to ID raw non-numeric accessions.
Definition: Seq_id.hpp:81
@ fParse_PartialOK
Warn rather than throwing an exception when a FASTA-style ID set contains unparsable portions,...
Definition: Seq_id.hpp:80
@ fParse_AnyLocal
Treat otherwise unidentified strings as local accessions as long as they don't resemble FASTA-style I...
Definition: Seq_id.hpp:90
TObjectType * GetPointer(void) THROWS_NONE
Get pointer,.
Definition: ncbiobj.hpp:998
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
static TThisType GetEmpty(void)
Definition: range.hpp:306
virtual const string & Get(const string &section, const string &name, TFlags flags=0) const
Get the parameter value.
Definition: ncbireg.cpp:262
bool IsEnabled(void)
Indicates whether application usage statistics collection is enabled for a current reporter instance.
#define NcbiEndl
Definition: ncbistre.hpp:548
CNcbiIstream & NcbiGetlineEOL(CNcbiIstream &is, string &str, string::size_type *count=NULL)
Read from "is" to "str" the next line (taking into account platform specifics of End-of-Line)
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
static int StringToNonNegativeInt(const CTempString str, TStringToNumFlags flags=0)
Convert string to non-negative integer value.
Definition: ncbistr.cpp:457
#define kEmptyStr
Definition: ncbistr.hpp:123
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
int TNumToStringFlags
Bitwise OR of "ENumToStringFlags".
Definition: ncbistr.hpp:266
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
#define NPOS
Definition: ncbistr.hpp:133
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
static string Join(const TContainer &arr, const CTempString &delim)
Join strings using the specified delimiter.
Definition: ncbistr.hpp:2697
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
Definition: ncbistr.cpp:3554
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5353
static string & ToUpper(string &str)
Convert string to upper case – string& version.
Definition: ncbistr.cpp:424
static string UInt8ToString(Uint8 value, TNumToStringFlags flags=0, int base=10)
Convert UInt8 to string.
Definition: ncbistr.hpp:5168
@ eEncoding_Ascii
Definition: ncbistr.hpp:202
@ fAllowTrailingSpaces
Ignore trailing space characters.
Definition: ncbistr.hpp:297
@ fConvErr_NoThrow
Do not throw an exception on error.
Definition: ncbistr.hpp:285
@ fAllowLeadingSpaces
Ignore leading spaces in converted string.
Definition: ncbistr.hpp:294
@ fSplit_Truncate
Definition: ncbistr.hpp:2501
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2508
@ fSplit_MergeDelimiters
Merge adjacent delimiters.
Definition: ncbistr.hpp:2498
@ fWithCommas
Use commas as thousands separator.
Definition: ncbistr.hpp:254
double Elapsed(void) const
Return time elapsed since first Start() or last Restart() call (in seconds).
Definition: ncbitime.hpp:2776
void Start(void)
Start the timer.
Definition: ncbitime.hpp:2765
#define CVersion
bool IsPrf(void) const
Check if variant Prf is selected.
Definition: Seq_id_.hpp:916
bool IsPdb(void) const
Check if variant Pdb is selected.
Definition: Seq_id_.hpp:922
bool IsPir(void) const
Check if variant Pir is selected.
Definition: Seq_id_.hpp:853
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
static int input()
int i
static MDB_envinfo info
Definition: mdb_load.c:37
static int version
Definition: mdb_load.c:29
constexpr bool empty(list< Ts... >) noexcept
EIPRangeType t
Definition: ncbi_localip.c:101
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
static Format format
Definition: njn_ioutil.cpp:53
static string kVersion
Definition of a blastdb formatter interfaces.
CSeqDB::ESeqType ParseMoleculeTypeString(const string &str)
Convert a string to a CSeqDB ESeqType object.
Definition: seqdb.cpp:1527
vector< SSeqDBInitInfo > FindBlastDBs(const string &path, const string &dbtype, bool recurse, bool include_alias_files=false, bool remove_redundant_dbs=false)
Find BLAST DBs in the directory specified.
Definition: seqdb.cpp:1429
Defines exception class and several constants for SeqDB.
const blastdb::TOid kSeqDBEntryNotFound
@ eBDB_Version5
Definition: seqdbcommon.hpp:53
Defines `expert' version of CSeqDB interfaces.
Defines database volume access classes.
Configuration object for CBlastDB_Formatter classes.
int m_FmtAlgoId
Filtering algorithm ID for outfmt m.
bool m_UseCtrlA
Determines whether Ctrl-A characters should be used as defline separators.
int m_FiltAlgoId
Filtering algorithm ID to mask the FASTA.
TSeqRange m_SeqRange
The range of the sequence to retrieve; if empty, the entire sequence will be retrieved.
objects::ENa_strand m_Strand
All SeqLoc types will have this strand assigned; If set to 'other', the strand will be set to 'unknow...
SSeqDBTaxInfo.
#define _DEBUG
#define _ASSERT
Modified on Wed Apr 17 13:08:08 2024 by modify_doxy.py rev. 669887