NCBI C++ ToolKit
makeblastdb.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: makeblastdb.cpp 101152 2023-11-07 15:39:13Z camacho $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Christiam Camacho
27  *
28  */
29 
30 /** @file makeblastdb.cpp
31  * Command line tool to create BLAST databases. This is the successor to
32  * formatdb from the C toolkit
33  */
34 
35 #include <ncbi_pch.hpp>
36 #include <serial/objostrjson.hpp>
39 #include <corelib/ncbiapp.hpp>
40 
41 #include <serial/iterator.hpp>
43 #include <objmgr/util/sequence.hpp>
44 
45 #include <objects/seq/Seqdesc.hpp>
47 
55 #include <util/format_guess.hpp>
56 #include <util/util_exception.hpp>
58 
59 #include <serial/objostrjson.hpp>
61 #include "../blast/blast_app_util.hpp"
62 #include "masked_range_set.hpp"
63 
64 #include <iostream>
65 #include <sstream>
66 #include <fstream>
67 
68 #ifndef SKIP_DOXYGEN_PROCESSING
70 USING_SCOPE(blast);
72 #endif /* SKIP_DOXYGEN_PROCESSING */
73 
74 /// The main application class
76 public:
77  /// Convenience typedef
79 
85  eUnsupported = 256
86  };
87 
88  /** @inheritDoc */
90  : m_LogFile(NULL),
92  {
94  version->SetVersionInfo(new CBlastVersion());
97  if (m_UsageReport.IsEnabled()) {
99  m_UsageReport.AddParam(CBlastUsageReport::eProgram, (string) "makeblastdb");
100  }
101  }
104  }
105 
106 private:
107  /** @inheritDoc */
108  virtual void Init();
109  /** @inheritDoc */
110  virtual int Run();
111 
113 
114  vector<ESupportedInputFormats>
115  x_GuessInputType(const vector<CTempString>& filenames,
116  vector<string>& blastdbs);
121 
122  void x_BuildDatabase();
123 
124  void x_AddFasta(CNcbiIstream & data);
125 
126  void x_AddSeqEntries(CNcbiIstream & data, TFormat fmt);
127 
128  void x_ProcessMaskData();
129 
130  void x_ProcessInputData(const string & paths, bool is_protein);
131 
132  bool x_ShouldParseSeqIds(void);
133 
134  void x_VerifyInputFilesType(const vector<CTempString>& filenames,
136 
137  void x_AddCmdOptions();
138 
139  // Data
140 
142 
144 
146 
148 
152 };
153 
154 /// Reads an object defined in a NCBI ASN.1 spec from a stream in multiple
155 /// formats: binary and text ASN.1 and XML
156 /// @param file stream to read the object from [in]
157 /// @param fmt specifies the format in which the object is encoded [in]
158 /// @param obj on input is an empty CRef<> object, on output it's populated
159 /// with the object read [in|out]
160 /// @param msg error message to display if reading fails [in]
161 template<class TObj>
164  CRef<TObj> & obj,
165  const string & msg)
166 {
167  obj.Reset(new TObj);
168 
169  switch (fmt) {
171  file >> MSerial_AsnBinary >> *obj;
172  break;
173 
175  file >> MSerial_AsnText >> *obj;
176  break;
177 
178  default:
179  NCBI_THROW(CInvalidDataException, eInvalidInput, string("Unknown encoding for ") + msg);
180  }
181 }
182 
183 /// Overloaded version of s_ReadObject which uses CFormatGuess to determine
184 /// the encoding of the object in the file
185 /// @param file stream to read the object from [in]
186 /// @param obj on input is an empty CRef<> object, on output it's populated
187 /// with the object read [in|out]
188 /// @param msg error message to display if reading fails [in]
189 template<class TObj>
191  CRef<TObj> & obj,
192  const string & msg)
193 {
195  fg.GetFormatHints().AddPreferredFormat(CFormatGuess::eBinaryASN);
196  fg.GetFormatHints().AddPreferredFormat(CFormatGuess::eTextASN);
197  fg.GetFormatHints().DisableAllNonpreferred();
198  s_ReadObject(file, fg.GuessFormat(), obj, msg);
199 }
200 
201 /// Command line flag to represent the input
202 static const string kInput("in");
203 /// Defines token separators when multiple inputs are present
204 static const string kInputSeparators(" ");
205 /// Command line flag to represent the output
206 static const string kOutput("out");
207 
209 {
211 
212  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
213 
214  // Specify USAGE context
215  arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
216  "Application to create BLAST databases, version "
217  + CBlastVersion().Print());
218 
219  string dflt("Default = input file name provided to -");
220  dflt += kInput + " argument";
221 
222  arg_desc->SetCurrentGroup("Input options");
223  arg_desc->AddDefaultKey(kInput, "input_file",
224  "Input file/database name",
226  arg_desc->AddDefaultKey("input_type", "type",
227  "Type of the data specified in input_file",
228  CArgDescriptions::eString, "fasta");
229  arg_desc->SetConstraint("input_type", &(*new CArgAllow_Strings,
230  "fasta", "blastdb",
231  "asn1_bin",
232  "asn1_txt"));
233 
234  arg_desc->AddKey(kArgDbType, "molecule_type",
235  "Molecule type of target db", CArgDescriptions::eString);
236  arg_desc->SetConstraint(kArgDbType, &(*new CArgAllow_Strings,
237  "nucl", "prot"));
238 
239  arg_desc->SetCurrentGroup("Configuration options");
240  arg_desc->AddOptionalKey(kArgDbTitle, "database_title",
241  "Title for BLAST database\n" + dflt,
243 
244  arg_desc->AddFlag("parse_seqids",
245  "Option to parse seqid for FASTA input if set, for all other input types seqids are parsed automatically", true);
246 
247  arg_desc->AddFlag("hash_index",
248  "Create index of sequence hash values.",
249  true);
250 
251 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
252  (!defined(NCBI_COMPILER_MIPSPRO)) )
253  arg_desc->SetCurrentGroup("Sequence masking options");
254  arg_desc->AddOptionalKey("mask_data", "mask_data_files",
255  "Comma-separated list of input files containing "
256  "masking data as produced by NCBI masking "
257  "applications (e.g. dustmasker, segmasker, "
258  "windowmasker)",
260 
261  arg_desc->AddOptionalKey("mask_id", "mask_algo_ids",
262  "Comma-separated list of strings to uniquely "
263  "identify the masking algorithm",
265 
266  arg_desc->AddOptionalKey("mask_desc", "mask_algo_descriptions",
267  "Comma-separated list of free form strings to "
268  "describe the masking algorithm details",
270 
271  arg_desc->SetDependency("mask_id", CArgDescriptions::eRequires, "mask_data");
272  arg_desc->SetDependency("mask_desc", CArgDescriptions::eRequires, "mask_id");
273 
274  arg_desc->AddFlag("gi_mask",
275  "Create GI indexed masking data.", true);
276  arg_desc->SetDependency("gi_mask", CArgDescriptions::eExcludes, "mask_id");
277  arg_desc->SetDependency("gi_mask", CArgDescriptions::eRequires, "parse_seqids");
278 
279  arg_desc->AddOptionalKey("gi_mask_name", "gi_based_mask_names",
280  "Comma-separated list of masking data output files.",
282  arg_desc->SetDependency("gi_mask_name", CArgDescriptions::eRequires, "mask_data");
283  arg_desc->SetDependency("gi_mask_name", CArgDescriptions::eRequires, "gi_mask");
284 
285 #endif
286 
287  arg_desc->SetCurrentGroup("Output options");
288  arg_desc->AddOptionalKey(kOutput, "database_name",
289  "Name of BLAST database to be created\n" + dflt +
290  "Required if multiple file(s)/database(s) are "
291  "provided as input",
293  arg_desc->AddDefaultKey("blastdb_version", "version",
294  "Version of BLAST database to be created",
296  NStr::NumericToString(static_cast<int>(eBDB_Version5)));
297  arg_desc->SetConstraint("blastdb_version",
299  arg_desc->AddDefaultKey("max_file_sz", "number_of_bytes",
300  "Maximum file size for BLAST database files",
302  arg_desc->AddOptionalKey("metadata_output_prefix", "",
303  "Path prefix for location of database files in metadata", CArgDescriptions::eString);
304  arg_desc->AddOptionalKey("logfile", "File_Name",
305  "File to which the program log should be redirected",
308 #if _BLAST_DEBUG
309  arg_desc->AddFlag("verbose", "Produce verbose output", true);
310  arg_desc->AddFlag("limit_defline", "limit_defline", true);
311 #endif /* _BLAST_DEBUG */
312 
313  arg_desc->SetCurrentGroup("Taxonomy options");
314  arg_desc->AddOptionalKey("taxid", "TaxID",
315  "Taxonomy ID to assign to all sequences",
317  arg_desc->SetConstraint("taxid", new CArgAllowValuesGreaterThanOrEqual(0));
318  arg_desc->SetDependency("taxid", CArgDescriptions::eExcludes, "taxid_map");
319 
320  arg_desc->AddOptionalKey("taxid_map", "TaxIDMapFile",
321  "Text file mapping sequence IDs to taxonomy IDs.\n"
322  "Format:<SequenceId> <TaxonomyId><newline>",
324  arg_desc->SetDependency("taxid_map", CArgDescriptions::eRequires, "parse_seqids");
325 
326  arg_desc->AddOptionalKey("oid_masks", "oid_masks",
327  "0x01 Exclude Model", CArgDescriptions::eInteger);
328 
329  SetupArgDescriptions(arg_desc.release());
330 }
331 
332 /// Converts a Uint8 into a string which contains a data size (converse to
333 /// NStr::StringToUInt8_DataSize)
334 /// @param v value to convert [in]
335 /// @param minprec minimum precision [in]
336 static string Uint8ToString_DataSize(Uint8 v, unsigned minprec = 10)
337 {
338  static string kMods = "KMGTPEZY";
339 
340  size_t i(0);
341  for(i = 0; i < kMods.size(); i++) {
342  if (v < Uint8(minprec)*1024) {
343  v /= 1024;
344  }
345  }
346 
347  string rv = NStr::UInt8ToString(v);
348 
349  if (i) {
350  rv.append(kMods, i, 1);
351  rv.append("B");
352  }
353 
354  return rv;
355 }
356 
358 {
359  m_DB->AddFasta(data);
360 }
361 
362 static TTaxId s_GetTaxId(const CBioseq & bio)
363 {
364  CSeq_entry * p_ptr = bio.GetParentEntry();
365  while (p_ptr != NULL)
366  {
367  if(p_ptr->IsSetDescr())
368  {
369  ITERATE(CSeq_descr::Tdata, it, p_ptr->GetDescr().Get()) {
370  const CSeqdesc& desc = **it;
371  if(desc.IsSource()) {
372  return desc.GetSource().GetOrg().GetTaxId();
373  }
374 
375  if(desc.IsOrg()) {
376  return desc.GetOrg().GetTaxId();
377  }
378  }
379  }
380 
381  p_ptr = p_ptr->GetParentEntry();
382  }
383  return ZERO_TAX_ID;
384 }
385 
386 static bool s_HasTitle(const CBioseq & bio)
387 {
388  if (! bio.CanGetDescr()) {
389  return false;
390  }
391 
392  ITERATE(list< CRef< CSeqdesc > >, iter, bio.GetDescr().Get()) {
393  const CSeqdesc & desc = **iter;
394 
395  if (desc.IsTitle()) {
396  return true;
397  }
398  }
399 
400  return false;
401 }
402 
404 public:
405  /// Convenience typedef
407 
408  bool IsUnverified(const CSeq_descr& descr) {
409 
410  const string unv("Unverified");
411 
412  const CSeq_descr::Tdata& da = descr.Get();
413  ITERATE(CSeq_descr::Tdata, da_iter, da) {
414  CRef<CSeqdesc> dai = *da_iter;
415  if (dai->IsUser()) {
416  const CSeqdesc::TUser& du = dai->GetUser();
417  if (du.IsSetType()) {
418  const CUser_object::TType& ty = du.GetType();
419  if (ty.IsStr()) {
420  const CObject_id::TStr& str = ty.GetStr();
421  if (NStr::CompareNocase(str, unv) == 0) {
422  return true;
423  }
424  }
425  }
426  }
427  }
428  return false;
429 
430  }
431 
432  CSeqEntrySource(CNcbiIstream & is, TFormat fmt, bool skip_unver)
433  :m_objmgr(CObjectManager::GetInstance()),
434  m_scope(new CScope(*m_objmgr)),
435  m_entry(new CSeq_entry),
436  m_SkipUnver(skip_unver)
437  {
438  char ch = is.peek();
439 
440  // Get rid of white spaces
441  while (!is.eof()
442  &&
443  (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r')) {
444  is.read(&ch, 1);
445  ch = is.peek();
446  }
447 
448  if (is.eof())
449  return;
450 
451  // If input is a Bioseq_set
452  if (ch == 'B' || ch == '0') {
454  s_ReadObject(is, fmt, obj, "bioseq");
455  m_entry->SetSet(*obj);
456  } else {
457  // If not, it should be a Seq-entry.
458  s_ReadObject(is, fmt, m_entry, "bioseq");
459  }
460 
461  CTypeIterator<CBioseq_set> it_bio_set;
462  CTypeIterator<CBioseq> it_bio;
463 
464  // Step through Seq-entry, picking out Bioseq-set objects.
465  for (it_bio_set = Begin(*m_entry); it_bio_set; ++it_bio_set) {
466  // If Bioseq-set is of the 'nuc-prot' class,
467  // and it's marked "Unverified", skip ALL of the
468  // Bioseq objects it contains.
469  if (it_bio_set->GetClass() == CBioseq_set::eClass_nuc_prot) {
470  if (it_bio_set->CanGetDescr() && IsUnverified(it_bio_set->GetDescr())) {
471  for (it_bio = Begin(*it_bio_set); it_bio; ++it_bio) {
472  m_bio_skipped.insert(&(*it_bio));
473  }
474  }
475  }
476  }
477 
478  // Step through Seq-entry, picking out Bioseq objects.
479  for (it_bio = Begin(*m_entry); it_bio; ++it_bio) {
480  // If Bioseq is marked as "Unverified", skip it.
481  if (it_bio->CanGetDescr() && IsUnverified(it_bio->GetDescr())) {
482  // Because m_bio_skipped is an STL set container,
483  // inserting an item that's already in the set will leave
484  // the set unaltered (i.e. no duplicate items).
485  m_bio_skipped.insert(&(*it_bio));
486  }
487  }
488 
489  m_bio = Begin(*m_entry);
490  m_entry->Parentize();
492  }
493 
495  {
497 
498  if (m_bio) {
499 
500  // If skipping of "unverified" entries is enabled...
501  if (m_SkipUnver) {
502  // If the the address of the current Bioseq object (*m_bio)
503  // is in the skip set, advance to the next Bioseq and
504  // try again.
505  while (m_bio_skipped.find(&(*m_bio)) != m_bio_skipped.end()) {
506  ++m_bio; // will be null when incremented past end
507  if (!m_bio) return rv;
508  }
509  }
510 
511  const sequence::CDeflineGenerator::TUserFlags flags = sequence::CDeflineGenerator::fUseAutoDef;
512  sequence::CDeflineGenerator gen;
513  const string & title = gen.GenerateDefline(*m_bio , *m_scope, flags);
514  string old_title;
515  if (s_HasTitle(*m_bio)) {
516  for (auto& i: m_bio->SetDescr().Set()) {
517  if (i->IsTitle()) {
518  old_title = i->GetTitle();
519  i->SetTitle(title);
520  }
521  }
522  } else {
524  des->SetTitle(title);
525  CSeq_descr& desr(m_bio->SetDescr());
526  desr.Set().push_back(des);
527  }
528 
529  if (ZERO_TAX_ID == m_bio->GetTaxId()) {
530  TTaxId taxid = s_GetTaxId(*m_bio);
531  if (ZERO_TAX_ID != taxid) {
533  des->SetOrg().SetTaxId(taxid);
534  m_bio->SetDescr().Set().push_back(des);
535  }
536  }
537 
538  rv.Reset(&(*m_bio));
539  ++m_bio; // will be null when incremented past end
540  }
541 
542  return rv;
543  }
544 
545 private:
552 };
553 
555 {
556  bool found = false;
557  try {
558  while(!data.eof())
559  {
560  CSeqEntrySource src(data, fmt, m_SkipUnver);
561  found = found || m_DB->AddSequences(src);
562  }
563  } catch (const CEofException& e) {
564  if (e.GetErrCode() == CEofException::eEof) {
565  /* ignore */
566  } else {
567  throw e;
568  }
569  }
570  if (!found) {
571  ERR_POST(Warning << "No sequences written");
572  }
573 }
574 
576 public:
577  CRawSeqDBSource(const string & name, bool protein, CBuildDatabase * outdb);
578 
580  {
581  if (m_Sequence) {
583  m_Sequence = NULL;
584  }
585  }
586 
587  virtual bool GetNext(CTempString & sequence,
588  CTempString & ambiguities,
589  CRef<CBlast_def_line_set> & deflines,
590  vector<SBlastDbMaskData> & mask_range,
591  vector<int> & column_ids,
592  vector<CTempString> & column_blobs);
593 
594 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
595  (!defined(NCBI_COMPILER_MIPSPRO)) )
596  virtual void GetColumnNames(vector<string> & names)
597  {
599  }
600 
601  virtual int GetColumnId(const string & name)
602  {
603  return m_Source->GetColumnId(name);
604  }
605 
606  virtual const map<string,string> & GetColumnMetaData(int id)
607  {
608  return m_Source->GetColumnMetaData(id);
609  }
610 #endif
611 
613  {
614  if (m_Sequence) {
617  }
618  }
619 
620 private:
622  const char * m_Sequence;
623  int m_Oid;
624 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
625  (!defined(NCBI_COMPILER_MIPSPRO)) )
626  vector<CBlastDbBlob> m_Blobs;
627  vector<int> m_ColumnIds;
628  vector<string> m_ColumnNames;
629  vector<int> m_MaskIds;
631 #endif
632 };
633 
634 CRawSeqDBSource::CRawSeqDBSource(const string & name, bool protein, CBuildDatabase * outdb)
635  : m_Sequence(NULL), m_Oid(0)
636 {
637  CSeqDB::ESeqType seqtype =
639 
640  m_Source.Reset(new CSeqDBExpert(name, seqtype));
641 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
642  (!defined(NCBI_COMPILER_MIPSPRO)) )
643  // Process mask meta data
645  ITERATE(vector<int>, algo_id, m_MaskIds) {
647  string algo_opts, algo_name;
648  m_Source->GetMaskAlgorithmDetails(*algo_id, algo, algo_name, algo_opts);
650  algo_name += NStr::IntToString(*algo_id);
651  }
652  m_MaskIdMap[*algo_id] = outdb->RegisterMaskingAlgorithm(algo, algo_opts, algo_name);
653  }
654  // Process columns
656  for(int i = 0; i < (int)m_ColumnNames.size(); i++) {
658  }
659 #endif
660 }
661 
662 bool
664  CTempString & ambiguities,
665  CRef<CBlast_def_line_set> & deflines,
666  vector<SBlastDbMaskData> & mask_range,
667  vector<int> & column_ids,
668  vector<CTempString> & column_blobs)
669 {
670  if (! m_Source->CheckOrFindOID(m_Oid))
671  return false;
672 
673  if (m_Sequence) {
675  m_Sequence = NULL;
676  }
677 
678  int slength(0), alength(0);
679 
680  m_Source->GetRawSeqAndAmbig(m_Oid, & m_Sequence, & slength, & alength);
681 
682  sequence = CTempString(m_Sequence, slength);
683  ambiguities = CTempString(m_Sequence + slength, alength);
684 
685  deflines = m_Source->GetHdr(m_Oid);
686 
687 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
688  (!defined(NCBI_COMPILER_MIPSPRO)) )
689  // process masks
690  ITERATE(vector<int>, algo_id, m_MaskIds) {
691 
693  m_Source->GetMaskData(m_Oid, *algo_id, ranges);
694 
695  SBlastDbMaskData mask_data;
696  mask_data.algorithm_id = m_MaskIdMap[*algo_id];
697 
699  mask_data.offsets.push_back(pair<TSeqPos, TSeqPos>(range->first, range->second));
700  }
701 
702  mask_range.push_back(mask_data);
703  }
704 
705  // The column IDs will be the same each time; another approach is
706  // to only send back the IDs for those columns that are non-empty.
707  column_ids = m_ColumnIds;
708  column_blobs.resize(column_ids.size());
709  m_Blobs.resize(column_ids.size());
710 
711  for(int i = 0; i < (int)column_ids.size(); i++) {
712  m_Source->GetColumnBlob(column_ids[i], m_Oid, m_Blobs[i]);
713  column_blobs[i] = m_Blobs[i].Str();
714  }
715 #endif
716 
717  m_Oid ++;
718 
719  return true;
720 }
721 
724  fmt)
725 {
727  switch (fmt) {
728  case eFasta: retval = CFormatGuess::eFasta; break;
729  case eBinaryASN: retval = CFormatGuess::eBinaryASN; break;
730  case eTextASN: retval = CFormatGuess::eTextASN; break;
731  default: break;
732  }
733  return retval;
734 }
735 
738 {
740  switch (fmt) {
741  case CFormatGuess::eFasta: retval = eFasta; break;
742  case CFormatGuess::eBinaryASN: retval = eBinaryASN; break;
743  case CFormatGuess::eTextASN: retval = eTextASN; break;
744  default: break;
745  }
746  return retval;
747 }
748 
751 {
753  const CArgs& args = GetArgs();
754  if (args["input_type"].HasValue()) {
755  const string& input_type = args["input_type"].AsString();
756  if (input_type == "fasta") {
757  retval = eFasta;
758  } else if (input_type == "asn1_bin") {
759  retval = eBinaryASN;
760  } else if (input_type == "asn1_txt") {
761  retval = eTextASN;
762  } else if (input_type == "blastdb") {
763  retval = eBlastDb;
764  } else {
765  // need to add supported type to list of constraints!
766  _ASSERT(false);
767  }
768  }
769  return retval;
770 }
771 
772 void
775 {
776  //Let other part of the program deal with blastdb input
777  if(eBlastDb == input_type)
778  return;
779 
780  // Guess the input data type
781  for (size_t i = 0; i < filenames.size(); i++) {
782  string seq_file = filenames[i];
783 
784  CFile input_file(seq_file);
785  if ( !input_file.Exists() ) {
786  string error_msg = "File " + seq_file + " does not exist";
787  NCBI_THROW(CInvalidDataException, eInvalidInput, error_msg);
788  }
789  if (input_file.GetLength() == 0) {
790  string error_msg = "File " + seq_file + " is empty";
791  NCBI_THROW(CInvalidDataException, eInvalidInput, error_msg);
792  }
793 
794  CNcbiIfstream f(seq_file.c_str(), ios::binary);
795  if(input_type == eFasta && x_ConvertToSupportedType(x_GuessFileType(f)) != eFasta) {
796  string msg = "\nInput file " + seq_file + " does NOT appear to be FASTA (processing anyway).\n" \
797  + "Advise validating database with 'blastdbcheck -dbtype [prot|nucl] -db ${DBNAME}'\n";
798  ERR_POST(Warning << msg);
799  }
800  }
801  return;
802 }
803 
806 {
808  fg.GetFormatHints().AddPreferredFormat(CFormatGuess::eBinaryASN);
809  fg.GetFormatHints().AddPreferredFormat(CFormatGuess::eTextASN);
810  fg.GetFormatHints().AddPreferredFormat(CFormatGuess::eFasta);
811  fg.GetFormatHints().DisableAllNonpreferred();
812  return fg.GuessFormat();
813 }
814 
817 {
818  switch(fmt) {
820  x_AddFasta(input);
821  break;
822 
825  x_AddSeqEntries(input, fmt);
826  break;
827 
828  default:
829  string msg("Input format not supported (");
830  msg += string(CFormatGuess::GetFormatName(fmt)) + " format). ";
831  msg += "Use -input_type to specify the input type being used.";
832  NCBI_THROW(CInvalidDataException, eInvalidInput, msg);
833  }
834 }
835 
836 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
837  (!defined(NCBI_COMPILER_MIPSPRO)) )
838 
840 {
841  const CArgs & args = GetArgs();
842 
843  const CArgValue & files = args["mask_data"];
844  const CArgValue & ids = args["mask_id"];
845  const CArgValue & descs = args["mask_desc"];
846  const CArgValue & gi_names = args["gi_mask_name"];
847 
848  vector<string> file_list;
849  vector<string> id_list;
850  vector<string> desc_list;
851  vector<string> gi_mask_names;
852 
853  if (! files.HasValue()) return;
854  NStr::Split(NStr::TruncateSpaces(files.AsString()), ",", file_list);
855  if (! file_list.size()) {
856  NCBI_THROW(CInvalidDataException, eInvalidInput,
857  "mask_data option found, but no files were specified.");
858  }
859 
860  if (ids.HasValue()) {
861  NStr::Split(NStr::TruncateSpaces(ids.AsString()), ",", id_list);
862  if (file_list.size() != id_list.size()) {
863  NCBI_THROW(CInvalidDataException, eInvalidInput,
864  "the size of mask_id does not match that of mask_data.");
865  }
866  // make sure this is not a numeric id
867  for (unsigned int i = 0; i < id_list.size(); ++i) {
868  Int4 nid(-1);
869  if (NStr::StringToNumeric(id_list[i], &nid, NStr::fConvErr_NoThrow, 10)) {
870  NCBI_THROW(CInvalidDataException, eInvalidInput,
871  "mask_id can not be numeric.");
872  }
873  }
874  }
875 
876  if (descs.HasValue()) {
877  NStr::Split(NStr::TruncateSpaces(descs.AsString()), ",", desc_list);
878  if (file_list.size() != desc_list.size()) {
879  NCBI_THROW(CInvalidDataException, eInvalidInput,
880  "the size of mask_desc does not match that of mask_data.");
881  }
882  } else {
883  // description is optional
884  vector<string> default_desc(id_list.size(), "");
885  desc_list.swap(default_desc);
886  }
887 
888  if (gi_names.HasValue()) {
889  NStr::Split(NStr::TruncateSpaces(gi_names.AsString()), ",", gi_mask_names);
890  if (file_list.size() != gi_mask_names.size()) {
891  NCBI_THROW(CInvalidDataException, eInvalidInput,
892  "the size of gi_mask_name does not match that of mask_data.");
893  }
894  }
895 
896  for (unsigned int i = 0; i < file_list.size(); ++i) {
897  if ( !CFile(file_list[i]).Exists() ) {
898  ERR_POST(Error << "Ignoring mask file '" << file_list[i]
899  << "' as it does not exist.");
900  continue;
901  }
902 
903  CNcbiIfstream mask_file(file_list[i].c_str(), ios::binary);
905  {{
906  CFormatGuess fg(mask_file);
907  fg.GetFormatHints().AddPreferredFormat(CFormatGuess::eBinaryASN);
908  fg.GetFormatHints().AddPreferredFormat(CFormatGuess::eTextASN);
909  fg.GetFormatHints().DisableAllNonpreferred();
910  mask_file_format = fg.GuessFormat();
911  }}
912 
913  int algo_id = -1;
914  while (true) {
915  CRef<CBlast_db_mask_info> first_obj;
916 
917  try {
918  s_ReadObject(mask_file, mask_file_format, first_obj, "mask data in '" + file_list[i] + "'");
919  }
920  catch (CEofException&) {
921  // must be end of file
922  break;
923  }
924 
925  if (algo_id < 0) {
926  *m_LogFile << "Mask file: " << file_list[i] << endl;
927  string opts = first_obj->GetAlgo_options();
928  if (id_list.size()) {
929  algo_id = m_DB->RegisterMaskingAlgorithm(id_list[i], desc_list[i], opts);
930  } else {
931  EBlast_filter_program prog_id =
932  static_cast<EBlast_filter_program>(first_obj->GetAlgo_program());
933  string name = gi_mask_names.size() ? gi_mask_names[i] : file_list[i];
934  algo_id = m_DB->RegisterMaskingAlgorithm(prog_id, opts, name);
935  }
936  }
937 
938  CRef<CBlast_mask_list> masks(& first_obj->SetMasks());
939  first_obj.Reset();
940 
941  while(1) {
942  if (m_Ranges.Empty() && ! masks->GetMasks().empty()) {
945  }
946 
947  ITERATE(CBlast_mask_list::TMasks, iter, masks->GetMasks()) {
948  CConstRef<CSeq_id> seqid((**iter).GetId());
949 
950  if (seqid.Empty()) {
951  NCBI_THROW(CInvalidDataException, eInvalidInput,
952  "Cannot get masked range Seq-id");
953  }
954 
955  m_Ranges->Insert(algo_id, *seqid, **iter);
956  }
957 
958  if (! masks->GetMore())
959  break;
960 
961  s_ReadObject(mask_file, mask_file_format, masks, "mask data (continuation)");
962  }
963  }
964  }
965 }
966 #endif
967 
969 {
970  const CArgs& args = GetArgs();
971  if ("fasta" != args["input_type"].AsString())
972  return true;
973  else if (args["parse_seqids"])
974  return true;
975 
976  return false;
977 }
978 
979 void CMakeBlastDBApp::x_ProcessInputData(const string & paths,
980  bool is_protein)
981 {
982  vector<CTempString> names;
983  SeqDB_SplitQuoted(paths, names);
984  vector<string> blastdb;
985  TIdToLeafs leafTaxIds;
986 
989  if(eBlastDb != input_fmt) {
990  if (names[0] == "-") {
991  x_AddSequenceData(cin, build_fmt);
992  }
993  else {
994  x_VerifyInputFilesType(names, input_fmt);
995  for (size_t i = 0; i < names.size(); i++) {
996  string seq_file = names[i];
997  CNcbiIfstream f(seq_file.c_str(), ios::binary);
998  x_AddSequenceData(f, build_fmt);
999  }
1000  }
1001  }
1002  else {
1003  vector<string> blastdb;
1004  copy(names.begin(), names.end(), back_inserter(blastdb));
1005  CSeqDB::ESeqType seqtype = (is_protein ? CSeqDB::eProtein : CSeqDB::eNucleotide);
1006 
1007  vector<string> final_blastdb;
1008 
1009  if (m_IsModifyMode) {
1010  ASSERT(blastdb.size()==1);
1011  CSeqDB db(blastdb[0], seqtype);
1012  vector<string> paths;
1013  db.FindVolumePaths(paths);
1014  // if paths.size() == 1, we will happily take it to be the same
1015  // case as a single volume database and recreate a new db
1016  if (paths.size() > 1) {
1017  NCBI_THROW(CInvalidDataException, eInvalidInput,
1018  "Modifying an alias BLAST db is currently not supported.");
1019  }
1020  final_blastdb.push_back(blastdb[0]);
1021  } else {
1022  ITERATE(vector<string>, iter, blastdb) {
1023  const string & s = *iter;
1024 
1025  try {
1026  CSeqDB db(s, seqtype);
1027  }
1028  catch(const CSeqDBException &) {
1029  ERR_POST(Error << "Unable to open input "
1030  << s << " as BLAST db");
1031  }
1032  final_blastdb.push_back(s);
1033  }
1034  }
1035 
1036  if (final_blastdb.size()) {
1037  string quoted;
1038  SeqDB_CombineAndQuote(final_blastdb, quoted);
1039 
1040  CRef<CSeqDB> indb(new CSeqDB(
1041  quoted,
1042  is_protein ? CSeqDB::eProtein : CSeqDB::eNucleotide
1043  ));
1044  const int numoids = indb->GetNumOIDs();
1045  for (int oid = 0; oid < numoids; ++oid) {
1046  CRef<CBlast_def_line_set> hdr = indb->GetHdr(oid);
1047  ITERATE(CBlast_def_line_set::Tdata, itr, hdr->Get()) {
1048  CRef<CBlast_def_line> bdl = *itr;
1049  CBlast_def_line::TTaxIds leafs = bdl->GetLeafTaxIds();
1050  if (!leafs.empty()) {
1051  const string id =
1052  bdl->GetSeqid().front()->AsFastaString();
1053  set<TTaxId> ids = leafTaxIds[id];
1054  ids.insert(leafs.begin(), leafs.end());
1055  leafTaxIds[id] = ids;
1056  }
1057  }
1058  }
1059 
1060  m_DB->SetLeafTaxIds(leafTaxIds, true);
1062  new CRawSeqDBSource(quoted, is_protein, m_DB)
1063  );
1064  m_DB->AddSequences(*raw);
1065  } else {
1066  NCBI_THROW(CInvalidDataException, eInvalidInput,
1067  "No valid input FASTA file or BLAST db is found.");
1068  }
1069  }
1070 }
1071 
1073 {
1074  const string env_skip =
1075  GetEnvironment().Get("NCBI_MAKEBLASTDB_SKIP_UNVERIFIED_BIOSEQS");
1076  m_SkipUnver = (env_skip.empty() == false);
1077 
1078  const string dont_scan_bioseq =
1079  GetEnvironment().Get("NCBI_MAKEBLASTDB_DONT_SCAN_BIOSEQ_FOR_CFASTAREADER_USER_OBJECTS");
1080  const bool scan_bioseq_4_cfastareader_usrobj = static_cast<bool>(dont_scan_bioseq.empty());
1081 
1082  const CArgs & args = GetArgs();
1083 
1084  // Get arguments to the CBuildDatabase constructor.
1085 
1086  bool is_protein = (args[kArgDbType].AsString() == "prot");
1087 
1088  // 1. database name option if present
1089  // 2. else, kInput
1090  string dbname = (args[kOutput].HasValue()
1091  ? args[kOutput]
1092  : args[kInput]).AsString();
1094  if (input_fmt == eBlastDb && dbname == args[kInput].AsString()) {
1095  NCBI_THROW(CInvalidDataException, eInvalidInput,
1096  "Cannot create a BLAST database from an existing one without "
1097  "changing the output name, please provide a (different) database name "
1098  "using -" + kOutput);
1099  }
1100 
1101  vector<string> input_files;
1102  NStr::Split(dbname, kInputSeparators, input_files);
1103  if (dbname == "-" || input_files.size() > 1) {
1104  NCBI_THROW(CInvalidDataException, eInvalidInput,
1105  "Please provide a database name using -" + kOutput);
1106  }
1107 
1108  if (args[kInput].AsString() == dbname) {
1109  m_IsModifyMode = true;
1110  }
1111 
1112  // 1. title option if present
1113  // 2. otherwise, kInput, UNLESS
1114  // 3. input is a BLAST database, in which we use that title
1115  string title = (args[kArgDbTitle].HasValue()
1116  ? args[kArgDbTitle]
1117  : args[kInput]).AsString();
1118  if (!args[kArgDbTitle].HasValue() && input_fmt == eBlastDb) {
1119  vector<CTempString> names;
1120  SeqDB_SplitQuoted(args[kInput].AsString(), names);
1121  if (names.size() > 1) {
1122  NCBI_THROW(CInvalidDataException, eInvalidInput,
1123  "Please provide a title using -title");
1124  }
1125  CRef<CSeqDB> dbhandle(new CSeqDB(names.front(),
1126  (is_protein ? CSeqDB::eProtein : CSeqDB::eNucleotide)));
1127  title = dbhandle->GetTitle();
1128  }
1129 
1130 
1131  // N.B.: Source database(s) in the current working directory will
1132  // be overwritten (as in formatdb)
1133 
1134  if (title == "-") {
1135  NCBI_THROW(CInvalidDataException, eInvalidInput,
1136  "Please provide a title using -title");
1137  }
1138 
1139  m_LogFile = & (args["logfile"].HasValue()
1140  ? args["logfile"].AsOutputFile()
1141  : cout);
1142 
1143  bool parse_seqids = x_ShouldParseSeqIds();
1144  bool hash_index = args["hash_index"];
1145  bool use_gi_mask = args["gi_mask"];
1146 
1148  indexing |= (hash_index ? CWriteDB::eAddHash : 0);
1149  indexing |= (parse_seqids ? CWriteDB::eFullIndex : 0);
1150 
1151  bool long_seqids = false;
1153  if (app) {
1154  const CNcbiRegistry& registry = app->GetConfig();
1155  long_seqids = (registry.Get("BLAST", "LONG_SEQID") == "1");
1156  }
1157 
1158  const EBlastDbVersion dbver =
1159  static_cast<EBlastDbVersion>(args["blastdb_version"].AsInteger());
1160 
1161  bool limit_defline = false;
1162 #if _BLAST_DEBUG
1163  if(args["limit_defline"]) {
1164  limit_defline = true;
1165  }
1166 #endif
1167  Uint8 oid_masks = 0;
1168  if(args["oid_masks"]) {
1169  oid_masks = args["oid_masks"].AsInteger();
1170  }
1172  title,
1173  is_protein,
1174  indexing,
1175  use_gi_mask,
1176  m_LogFile,
1177  long_seqids,
1178  dbver,
1179  limit_defline,
1180  oid_masks,
1181  scan_bioseq_4_cfastareader_usrobj));
1182 
1183 #if _BLAST_DEBUG
1184  if (args["verbose"]) {
1185  m_DB->SetVerbosity(true);
1186  }
1187 #endif /* _BLAST_DEBUG */
1188 
1189  // Should we keep the linkout and membership bits? Sure.
1190 
1191  // Create empty linkout bit table in order to call these methods;
1192  // however, in the future it would probably be good to populate
1193  // this from a user provided option as multisource does. Also, it
1194  // might be wasteful to copy membership bits, as the resulting
1195  // database will most likely not have corresponding mask files;
1196  // but until there is a way to configure membership bits with this
1197  // tool, I think it is better to keep, than to toss.
1198 
1199  TLinkoutMap no_bits;
1200 
1201 // m_DB->SetLinkouts(no_bits, true); // DEPRECATED
1202  m_DB->SetMembBits(no_bits, true);
1203 
1204 
1205  // Max file size
1206 
1207  Uint8 bytes = NStr::StringToUInt8_DataSize(args["max_file_sz"].AsString());
1208  static const Uint8 MAX_VOL_FILE_SIZE = 0x100000000;
1209  if (bytes >= MAX_VOL_FILE_SIZE) {
1210  NCBI_THROW(CInvalidDataException, eInvalidInput,
1211  "max_file_sz must be < 4 GiB");
1212  }
1213  *m_LogFile << "Maximum file size: "
1214  << Uint8ToString_DataSize(bytes) << endl;
1215 
1216  m_DB->SetMaxFileSize(bytes);
1217 
1218  if (args["taxid"].HasValue()) {
1219  _ASSERT( !args["taxid_map"].HasValue() );
1220  CRef<CTaxIdSet> taxids(new CTaxIdSet(TAX_ID_FROM(int, args["taxid"].AsInteger())));
1221  m_DB->SetTaxids(*taxids);
1222  } else if (args["taxid_map"].HasValue()) {
1223  _ASSERT( !args["taxid"].HasValue() );
1224  CRef<CTaxIdSet> taxids(new CTaxIdSet());
1225  taxids->SetMappingFromFile(args["taxid_map"].AsInputFile());
1226  m_DB->SetTaxids(*taxids);
1227  }
1228 
1229 
1230 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
1231  (!defined(NCBI_COMPILER_MIPSPRO)) )
1233 #endif
1234  x_ProcessInputData(args[kInput].AsString(), is_protein);
1235 
1236  bool success = m_DB->EndBuild();
1237  if(success) {
1238  string new_db = m_DB->GetOutputDbName();
1240  CSeqDB sdb(new_db, t);
1241  string output_prefix = args["metadata_output_prefix"]
1242  ? args["metadata_output_prefix"].AsString()
1243  : kEmptyStr;
1244  if (!output_prefix.empty() && (output_prefix.back() != CFile::GetPathSeparator()))
1245  output_prefix += CFile::GetPathSeparator();
1246  CRef<CBlast_db_metadata> m = sdb.GetDBMetaData(output_prefix);
1247  string extn (kEmptyStr);
1248  SeqDB_GetMetadataFileExtension(is_protein, extn);
1249  string metadata_filename = new_db + "." + extn;
1250  ofstream out(metadata_filename.c_str());
1251  unique_ptr<CObjectOStreamJson> json_out(new CObjectOStreamJson(out, eNoOwnership));
1252  json_out->SetDefaultStringEncoding(eEncoding_Ascii);
1253  json_out->PreserveKeyNames();
1254  CConstObjectInfo obj_info(m, m->GetTypeInfo());
1255  json_out->WriteObject(obj_info);
1256  json_out->Flush();
1257  out.flush();
1258  out << NcbiEndl;
1259  }
1260 }
1261 
1263 {
      // Body of the application's Run() method: builds the database and
      // returns an exit status.
      // NOTE(review): the signature line and listing lines 1264 and 1271 are
      // not present in this extract; confirm against the original source.

      // Prefix all subsequent diagnostic postings with the tool name.
1265  SetDiagPostPrefix("makeblastdb");
1266 
      // Exit status handed back to the caller; stays 0 unless changed below.
1267  int status = 0;
      // CATCH_ALL is the standard catch statement shared by the BLAST
      // command line programs; it receives `status` as its exit-code argument
      // (presumably to record a failure code there -- confirm in its
      // definition).
1268  try { x_BuildDatabase(); }
1269  CATCH_ALL(status)
      // Invoked after the try/catch, so it runs whether or not the build
      // threw an exception that CATCH_ALL handled.
1270  x_AddCmdOptions();
1272  return status;
1273 }
1274 
1276 {
      // Adds selected command-line options to m_UsageReport.
      // NOTE(review): the signature line is not present in this extract;
      // presumably this is CMakeBlastDBApp::x_AddCmdOptions() -- confirm.
      // NOTE(review): the bodies of several branches below (listing lines
      // 1282, 1285, 1291, 1294) are missing from this extract, so what they
      // report cannot be stated from here.
1277  const CArgs & args = GetArgs();
      // Report the user-supplied input type, if one was given.
1278  if (args["input_type"].HasValue()) {
1279  m_UsageReport.AddParam(CBlastUsageReport::eInputType, args["input_type"].AsString());
1280  }
      // Database molecule type (branch body not visible here).
1281  if (args[kArgDbType].HasValue()) {
1283  }
      // Either taxonomy option present (branch body not visible here).
1284  if(args["taxid"].HasValue() || args["taxid_map"].HasValue()) {
1286  }
      // Report the boolean value of -parse_seqids.
1287  if(args["parse_seqids"].HasValue()) {
1288  m_UsageReport.AddParam(CBlastUsageReport::eParseSeqIDs, args["parse_seqids"].AsBoolean());
1289  }
      // gi_mask is tested first; mask_data is only considered when gi_mask
      // has no value (branch bodies not visible here).
1290  if (args["gi_mask"].HasValue()) {
1292  }
1293  else if(args["mask_data"].HasValue()) {
1295  }
1296 }
1297 
1298 
1299 #ifndef SKIP_DOXYGEN_PROCESSING
/// Program entry point.
///
/// Constructs a temporary CMakeBlastDBApp and passes the command line to its
/// AppMain(), the main entry point for an NCBI application.
///
/// @param argc Number of command-line arguments.
/// @param argv Command-line argument strings.
/// @return The value returned by AppMain().
1300 int main(int argc, const char* argv[] /*, const char* envp[]*/)
1301 {
1302  return CMakeBlastDBApp().AppMain(argc, argv);
1303 }
1304 #endif /* SKIP_DOXYGEN_PROCESSING */
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Declares singleton objects to store the version and reference for the BLAST engine.
#define CATCH_ALL(exit_code)
Standard catch statement for all BLAST command line programs.
Interface for converting sources of sequence data into blast sequence input.
Auxiliary classes/functions for BLAST input library.
#define false
Definition: bool.h:36
Code to build a database given various sources of sequence data.
Class to constrain the values of an argument to those greater than or equal to the value specified in...
CArgAllow_Integers –.
Definition: ncbiargs.hpp:1751
CArgAllow_Strings –.
Definition: ncbiargs.hpp:1641
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgValue –.
Definition: ncbiargs.hpp:184
CArgs –.
Definition: ncbiargs.hpp:379
CSeq_entry * GetParentEntry(void) const
Definition: Bioseq.hpp:174
TTaxId GetTaxId() const
Determine the tax-id for this bioseq.
Definition: Bioseq.cpp:177
void AddParam(EUsageParams p, int val)
Keeps track of the version of the BLAST engine in the NCBI C++ toolkit.
Definition: version.hpp:53
Build BlastDB format databases from various data sources.
Definition: build_db.hpp:136
bool AddSequences(IBioseqSource &src, bool add_pig=false)
Add sequences from an IBioseqSource object.
Definition: build_db.cpp:794
bool AddFasta(CNcbiIstream &fasta_file)
Add sequences from a file containing FASTA data.
Definition: build_db.cpp:1398
void SetVerbosity(bool v)
Specify level of output verbosity.
Definition: build_db.hpp:392
void SetMaskDataSource(IMaskDataSource &ranges)
Specify an object mapping Seq-id to subject masking data.
Definition: build_db.cpp:1609
void SetMembBits(const TLinkoutMap &membbits, bool keep_mbits)
Specify a membership bit lookup object.
Definition: build_db.cpp:1270
int RegisterMaskingAlgorithm(EBlast_filter_program program, const string &options, const string &name="")
Define a masking algorithm.
Definition: build_db.cpp:1584
void SetTaxids(CTaxIdSet &taxids)
Specify a mapping of sequence ids to taxonomic ids.
Definition: build_db.cpp:1216
string GetOutputDbName() const
Definition: build_db.hpp:465
bool EndBuild(bool erase=false)
Finish building a new database.
Definition: build_db.cpp:1423
void SetMaxFileSize(Uint8 max_file_size)
Set the maximum size of database component files.
Definition: build_db.cpp:1578
void SetLeafTaxIds(const TIdToLeafs &taxids, bool keep_taxids)
Specify a leaf-taxids object.
Definition: build_db.cpp:1278
CConstObjectInfo –.
Definition: objectinfo.hpp:421
CFile –.
Definition: ncbifile.hpp:1604
Class implements different ad-hoc unreliable file format identifications.
EFormat
The formats are checked in the same order as declared here.
@ eBinaryASN
Binary ASN.1.
@ eFasta
FASTA format sequence record, CFastaReader.
@ eUnknown
unknown format
@ eTextASN
Text ASN.1.
static const char * GetFormatName(EFormat format)
Defines invalid user input exceptions.
The main application class.
Definition: makeblastdb.cpp:75
CRef< CMaskedRangeSet > m_Ranges
void x_ProcessMaskData()
void x_ProcessInputData(const string &paths, bool is_protein)
void x_AddSeqEntries(CNcbiIstream &data, TFormat fmt)
CFormatGuess::EFormat TFormat
Convenience typedef.
Definition: makeblastdb.cpp:78
CStopWatch m_StopWatch
void x_AddFasta(CNcbiIstream &data)
void x_AddSequenceData(CNcbiIstream &input, TFormat fmt)
virtual void Init()
@inheritDoc
CNcbiOstream * m_LogFile
CRef< CBuildDatabase > m_DB
bool x_ShouldParseSeqIds(void)
CMakeBlastDBApp()
@inheritDoc
Definition: makeblastdb.cpp:89
virtual int Run()
@inheritDoc
ESupportedInputFormats x_GetUserInputTypeHint(void)
CBlastUsageReport m_UsageReport
TFormat x_ConvertToCFormatGuessType(ESupportedInputFormats fmt)
ESupportedInputFormats x_ConvertToSupportedType(TFormat fmt)
vector< ESupportedInputFormats > x_GuessInputType(const vector< CTempString > &filenames, vector< string > &blastdbs)
void x_VerifyInputFilesType(const vector< CTempString > &filenames, CMakeBlastDBApp::ESupportedInputFormats input_type)
TFormat x_GuessFileType(CNcbiIstream &input)
void Insert(int algo_id, const CSeq_id &id, const CSeq_loc &v)
static CNcbiApplication * Instance(void)
Singleton method.
Definition: ncbiapp.cpp:244
CNcbiRegistry –.
Definition: ncbireg.hpp:913
CObjectManager –.
CObjectOStreamJson –.
Definition: objostrjson.hpp:54
TTaxId GetTaxId() const
Definition: Org_ref.cpp:72
virtual ~CRawSeqDBSource()
virtual int GetColumnId(const string &name)
Get the column ID for a column mentioned by name.
virtual const map< string, string > & GetColumnMetaData(int id)
Get metadata for the column with the specified Column ID.
CRef< CSeqDBExpert > m_Source
CRawSeqDBSource(const string &name, bool protein, CBuildDatabase *outdb)
vector< CBlastDbBlob > m_Blobs
const char * m_Sequence
vector< int > m_MaskIds
virtual bool GetNext(CTempString &sequence, CTempString &ambiguities, CRef< CBlast_def_line_set > &deflines, vector< SBlastDbMaskData > &mask_range, vector< int > &column_ids, vector< CTempString > &column_blobs)
vector< string > m_ColumnNames
vector< int > m_ColumnIds
virtual void GetColumnNames(vector< string > &names)
Get the names of all columns defined by this sequence source.
map< int, int > m_MaskIdMap
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
CSeqDBException.
Definition: seqdbcommon.hpp:73
CSeqDBExpert.
Definition: seqdbexpert.hpp:55
void GetRawSeqAndAmbig(int oid, const char **buffer, int *seq_length, int *ambig_length) const
Raw Sequence and Ambiguity Data.
Definition: seqdbexpert.cpp:64
CSeqDB.
Definition: seqdb.hpp:161
void GetColumnBlob(int col_id, int oid, CBlastDbBlob &blob)
Fetch the data blob for the given column and oid.
Definition: seqdb.cpp:1220
static void FindVolumePaths(const string &dbname, ESeqType seqtype, vector< string > &paths, vector< string > *alias_paths=NULL, bool recursive=true, bool expand_links=true)
Find volume paths.
Definition: seqdb.cpp:1040
int GetNumOIDs() const
Returns the size of the (possibly sparse) OID range.
Definition: seqdb.cpp:680
void GetAvailableMaskAlgorithms(vector< int > &algorithms)
Get a list of algorithm IDs for which mask data exists.
Definition: seqdb.cpp:1227
ESeqType
Sequence types (eUnknown tries protein, then nucleotide).
Definition: seqdb.hpp:173
@ eNucleotide
Definition: seqdb.hpp:175
@ eProtein
Definition: seqdb.hpp:174
void GetMaskAlgorithmDetails(int algorithm_id, objects::EBlast_filter_program &program, string &program_name, string &algo_opts)
Get information about one type of masking available here.
Definition: seqdb.cpp:1263
void RetSequence(const char **buffer) const
Returns any resources associated with the sequence.
Definition: seqdb.cpp:523
string GetTitle() const
Returns the database title.
Definition: seqdb.cpp:630
CRef< CBlast_db_metadata > GetDBMetaData(string user_path=kEmptyStr)
Definition: seqdb.cpp:1673
void ListColumns(vector< string > &titles)
List columns titles found in this database.
Definition: seqdb.cpp:1191
bool CheckOrFindOID(int &next_oid) const
Find an included OID, incrementing next_oid if necessary.
Definition: seqdb.cpp:728
int GetColumnId(const string &title)
Get an ID number for a given column title.
Definition: seqdb.cpp:1196
CRef< CBlast_def_line_set > GetHdr(int oid) const
Get the ASN.1 header for the sequence.
Definition: seqdb.cpp:418
void GetMaskData(int oid, const vector< int > &algo_ids, TSequenceRanges &ranges)
Get masked ranges of a sequence.
Definition: seqdb.hpp:1408
const map< string, string > & GetColumnMetaData(int column_id)
Get all metadata for the specified column.
Definition: seqdb.cpp:1202
virtual CConstRef< CBioseq > GetNext()
Get a Bioseq object if there are any more to get.
CRef< CObjectManager > m_objmgr
set< CBioseq * > m_bio_skipped
CTypeIterator< CBioseq > m_bio
CRef< CScope > m_scope
CSeqEntrySource(CNcbiIstream &is, TFormat fmt, bool skip_unver)
CRef< CSeq_entry > m_entry
CFormatGuess::EFormat TFormat
Convenience typedef.
bool IsUnverified(const CSeq_descr &descr)
Seq_descr.hpp: User-defined methods of the data storage class.
Definition: Seq_descr.hpp:55
Definition: Seq_entry.hpp:56
const CSeq_descr & GetDescr(void) const
Definition: Seq_entry.cpp:120
void Parentize(void)
Definition: Seq_entry.cpp:71
bool IsSetDescr(void) const
Definition: Seq_entry.cpp:106
CSeq_entry * GetParentEntry(void) const
Definition: Seq_entry.hpp:131
CStopWatch –.
Definition: ncbitime.hpp:1938
void SetMappingFromFile(CNcbiIstream &f)
Definition: taxid_set.cpp:45
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
Template class for iteration on objects of class C.
Definition: iterator.hpp:673
@ eFullIndex
Use several forms of each Seq-id in the string index.
Definition: writedb.hpp:112
@ eAddHash
Add an index from sequence hash to OID.
Definition: writedb.hpp:126
@ eNoIndex
Build a database without any indices.
Definition: writedb.hpp:106
int TIndexType
Bitwise OR of "EIndexType".
Definition: writedb.hpp:128
Interface to a source of Bioseq objects.
Definition: build_db.hpp:54
Interface to a source of raw sequence data.
Definition: build_db.hpp:70
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator begin() const
Definition: set.hpp:135
bool empty() const
Definition: set.hpp:133
const_iterator find(const key_type &key) const
Definition: set.hpp:137
const_iterator end() const
Definition: set.hpp:136
const string kArgDbTitle
Title for the BLAST database.
const string kArgDbType
BLAST database molecule type.
static CMemoryRegistry registry
Definition: cn3d_tools.cpp:81
void Print(const CCompactSAMApplication::AlignInfo &ai)
API (CDeflineGenerator) for computing sequences' titles ("definitions").
static uch flags
static FILE * input_file
Definition: common.c:35
static const struct name_t names[]
std::ofstream out("events_result.xml")
main entry point for tests
void SetFullVersion(CRef< CVersionAPI > version)
Set version data for the program.
Definition: ncbiapp.cpp:1154
void HideStdArgs(THideStdArgs hide_mask)
Set the hide mask for the Hide Std Flags.
Definition: ncbiapp.cpp:1292
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
const CNcbiEnvironment & GetEnvironment(void) const
Get the application's cached environment.
const CNcbiRegistry & GetConfig(void) const
Get the application's cached configuration parameters (read-only).
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:285
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:799
CVersionInfo GetVersion(void) const
Get the program version information.
Definition: ncbiapp.cpp:1164
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1175
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
#define TAX_ID_FROM(T, value)
Definition: ncbimisc.hpp:1111
@ fHideXmlHelp
Hide XML help description.
@ fHideFullVersion
Hide full version description.
@ fHideDryRun
Hide dryrun description.
@ fHideConffile
Hide configuration file description.
@ eNoOwnership
No ownership is assumed.
Definition: ncbi_types.h:135
virtual const string & AsString(void) const =0
Get the argument's string value.
virtual bool HasValue(void) const =0
Check if argument holds a value.
@ fAppend
Append to end-of-file; for eOutputFile or eIOFile.
Definition: ncbiargs.hpp:622
@ eRequires
One argument requires another.
Definition: ncbiargs.hpp:956
@ eExcludes
One argument excludes another.
Definition: ncbiargs.hpp:957
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eOutputFile
Name of file (must be writable)
Definition: ncbiargs.hpp:596
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
void SetDiagPostPrefix(const char *prefix)
Specify a string to prefix all subsequent error postings with.
Definition: ncbidiag.cpp:6097
EDiagSev SetDiagPostLevel(EDiagSev post_sev=eDiag_Error)
Set the threshold severity for posting the messages.
Definition: ncbidiag.cpp:6129
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
const string & Get(const string &name, bool *found=NULL) const
Get environment value by name.
Definition: ncbienv.cpp:109
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
TErrCode GetErrCode(void) const
Get error code.
Definition: ncbiexpt.cpp:453
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
static char GetPathSeparator(void)
Get path separator symbol specific for the current platform.
Definition: ncbifile.cpp:433
#define MSerial_AsnBinary
Definition: serialbase.hpp:697
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
CBeginInfo Begin(C &obj)
Get starting point of object hierarchy.
Definition: iterator.hpp:1004
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry; default priority is higher than for defaults or loaders. Add object to the scope with p...
Definition: scope.cpp:522
bool Empty(void) const THROWS_NONE
Check if CConstRef is empty – not pointing to any object which means having a null value.
Definition: ncbiobj.hpp:1385
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1439
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
virtual const string & Get(const string &section, const string &name, TFlags flags=0) const
Get the parameter value.
Definition: ncbireg.cpp:262
bool IsEnabled(void)
Indicates whether application usage statistics collection is enabled for a current reporter instance.
#define NcbiEndl
Definition: ncbistre.hpp:548
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
#define kEmptyStr
Definition: ncbistr.hpp:123
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3457
static Uint8 StringToUInt8_DataSize(const CTempString str, TStringToNumFlags flags=0)
Convert string that can contain "software" qualifiers to Uint8.
Definition: ncbistr.cpp:1539
static TNumeric StringToNumeric(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to a numeric value.
Definition: ncbistr.hpp:330
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5083
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
Definition: ncbistr.cpp:3182
static string UInt8ToString(Uint8 value, TNumToStringFlags flags=0, int base=10)
Convert UInt8 to string.
Definition: ncbistr.hpp:5167
@ eEncoding_Ascii
Definition: ncbistr.hpp:202
@ fConvErr_NoThrow
Do not throw an exception on error.
Definition: ncbistr.hpp:285
double Elapsed(void) const
Return time elapsed since first Start() or last Restart() call (in seconds).
Definition: ncbitime.hpp:2776
void Start(void)
Start the timer.
Definition: ncbitime.hpp:2765
#define CVersion
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
list< CRef< CSeq_loc > > TMasks
const TMasks & GetMasks(void) const
Get the Masks member data.
EBlast_filter_program
This defines the possible sequence filtering algorithms to be used in a BLAST database.
const Tdata & Get(void) const
Get the member data.
list< CRef< CBlast_def_line > > Tdata
TMore GetMore(void) const
Get the More member data.
@ eBlast_filter_program_other
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
bool IsSetType(void) const
type of object within class Check if a value has been assigned to Type data member.
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
const TType & GetType(void) const
Get the Type member data.
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
@ eClass_nuc_prot
nuc acid and coded proteins
Definition: Bioseq_set_.hpp:99
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
const TUser & GetUser(void) const
Get the variant data.
Definition: Seqdesc_.cpp:384
bool IsOrg(void) const
Check if variant Org is selected.
Definition: Seqdesc_.hpp:1046
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
bool IsSource(void) const
Check if variant Source is selected.
Definition: Seqdesc_.hpp:1190
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
bool CanGetDescr(void) const
Check if it is safe to call GetDescr method.
Definition: Bioseq_.hpp:309
const TOrg & GetOrg(void) const
Get the variant data.
Definition: Seqdesc_.cpp:240
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
Tdata & Set(void)
Assign a value to data member.
Definition: Seq_descr_.hpp:172
bool IsTitle(void) const
Check if variant Title is selected.
Definition: Seqdesc_.hpp:1026
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
bool IsUser(void) const
Check if variant User is selected.
Definition: Seqdesc_.hpp:1122
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:6929
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
FILE * file
static int input()
int i
static TTaxId s_GetTaxId(const CBioseq &bio)
static bool s_HasTitle(const CBioseq &bio)
USING_SCOPE(blast)
static const string kOutput("out")
Command line flag to represent the output.
static string Uint8ToString_DataSize(Uint8 v, unsigned minprec=10)
Converts a Uint8 into a string which contains a data size (converse to NStr::StringToUInt8_DataSize)
void s_ReadObject(CNcbiIstream &file, CFormatGuess::EFormat fmt, CRef< TObj > &obj, const string &msg)
Reads an object defined in a NCBI ASN.1 spec from a stream in multiple formats: binary and text ASN....
int main(int argc, const char *argv[])
static const string kInputSeparators(" ")
Defines token separators when multiple inputs are present.
USING_NCBI_SCOPE
Definition: makeblastdb.cpp:69
static const string kInput("in")
Command line flag to represent the input.
static int version
Definition: mdb_load.c:29
range(_Ty, _Ty) -> range< _Ty >
EIPRangeType t
Definition: ncbi_localip.c:101
ESERV_Algo algo
#define ASSERT
macro for assert.
Definition: ncbi_std.h:107
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
static int filenames
Definition: pcregrep.c:172
Defines BLAST database access classes.
Defines exception class and several constants for SeqDB.
EBlastDbVersion
BLAST database version.
Definition: seqdbcommon.hpp:51
@ eBDB_Version4
Definition: seqdbcommon.hpp:52
@ eBDB_Version5
Definition: seqdbcommon.hpp:53
void SeqDB_GetMetadataFileExtension(bool db_is_protein, string &extn)
void SeqDB_SplitQuoted(const string &dbname, vector< CSeqDB_Substring > &dbs, bool keep_quote=false)
Split a (possibly quoted) list of database names into individual names.
void SeqDB_CombineAndQuote(const vector< string > &dbs, string &dbname)
Combine and quote list of database names.
static const char * str(char *buf, int n)
Definition: stats.c:84
List of sequence offset ranges.
Definition: seqdb.hpp:236
Structure describing filtered regions created using a particular sequence filtering algorithm.
int algorithm_id
Identifies the algorithm used.
vector< pair< TSeqPos, TSeqPos > > offsets
Start and end offsets of the filtered area.
static void des(const char *src, const char *out)
Definition: challenge.c:132
#define _ASSERT
Defines BLAST database construction classes.
Defines exception class for WriteDB.
Modified on Sat Dec 09 04:49:40 2023 by modify_doxy.py rev. 669887