NCBI C++ ToolKit
makeclusterdb.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: makeclusterdb.cpp 98613 2022-12-12 18:58:48Z fongah2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Amelia Fong
27  *
28  */
29 
30 /** @file makeclusterdb.cpp
31  * Command line tool to create cluster databases.
32  */
33 
34 #include <ncbi_pch.hpp>
35 #include <serial/objostrjson.hpp>
38 #include <corelib/ncbiapp.hpp>
39 
40 #include <serial/iterator.hpp>
42 #include <objmgr/util/sequence.hpp>
43 
44 #include <objects/seq/Seqdesc.hpp>
46 
54 #include <util/format_guess.hpp>
55 #include <util/util_exception.hpp>
57 
58 #include <serial/objostrjson.hpp>
60 #include "../blast/blast_app_util.hpp"
61 #include "masked_range_set.hpp"
62 
63 #include <iostream>
64 #include <sstream>
65 #include <fstream>
66 
67 #ifndef SKIP_DOXYGEN_PROCESSING
69 USING_SCOPE(blast);
71 #endif /* SKIP_DOXYGEN_PROCESSING */
72 
73 class CCluster;
74 
// A single sequence that belongs to a cluster. It stores the sequence id,
// a flag saying whether it is the cluster's reference sequence, and an OID
// that is assigned after construction through SetOid().
// NOTE(review): the visible initializer list does not set m_Oid, so GetOid()
// called before SetOid() may return an indeterminate value unless the member
// declaration (not visible in this view) supplies a default -- confirm.
// NOTE(review): this object keeps a CRef to its CCluster while CCluster keeps
// CRefs back to its sequences; if both are owning references this forms a
// cycle -- confirm the intended lifetime.
75 class CClusterSeq : public CObject {
76 public:
// Stores the owning cluster, the id string and the reference-sequence flag.
77  CClusterSeq(CRef<CCluster> cluster, const string & id, bool is_refseq) :
78  m_Cluster(cluster), m_Id(id), m_IsRefSeq(is_refseq) { }
// Id string exactly as passed to the constructor.
80  const string & GetId() const { return m_Id; }
// True when this sequence was created as the cluster's reference sequence.
81  bool IsRefSeq() const { return m_IsRefSeq; }
// Records the OID for this sequence.
// NOTE(review): the parameter is int64_t; the member type is declared on a
// line not visible here -- check for a narrowing conversion.
82  void SetOid(int64_t oid) { m_Oid = oid; }
// Returns the OID stored by SetOid().
83  int64_t GetOid() const { return m_Oid; }
84 private:
86  string m_Id;
87  bool m_IsRefSeq;
89 };
90 
// A cluster: a numeric cluster id, one optional reference sequence and a
// list of member sequences. Several member-function headers and one data
// member declaration are missing from this view; only the visible bodies are
// described below.
91 class CCluster : public CObject {
92 public:
// Creates an empty cluster carrying the given id.
93  CCluster (unsigned int cluster_id) : m_ClusterId(cluster_id) {}
// Id passed to the constructor.
94  unsigned int GetClusterId() { return m_ClusterId; }
// Id of the reference sequence, or kEmptyStr when no reference is set.
96  const string & GetRefSeqId() { return(m_RefSeq.Empty() ? kEmptyStr : m_RefSeq->GetId()); }
// (header not visible) Replaces the reference sequence with r.
98  m_RefSeq.Reset(r); }
// Member sequences in the order they were appended.
99  const vector<CRef<CClusterSeq> > & GetMemSeqs() { return m_MemSeqs; }
// (header not visible) Appends m to the member list.
101  m_MemSeqs.push_back(m);
102  }
// (header not visible) Yields the reference sequence's OID, or -1 when the
// cluster has no reference sequence.
104  if(m_RefSeq.NotEmpty()) {
105  return m_RefSeq->GetOid();
106  }
107  return -1;
108  }
109 private:
110  unsigned int m_ClusterId;
112  vector<CRef<CClusterSeq> > m_MemSeqs;
113 
114 };
115 
117 {
// Ordering predicate: a precedes b when a's id string compares less than b's.
118  return (a->GetId() < b->GetId());
119 }
120 
122 {
// Ordering predicate: clusters are ordered by ascending reference-sequence
// OID. GetRefSeqOid() yields -1 for a cluster without a reference sequence,
// so such clusters order before any cluster with a non-negative OID.
123  return (a->GetRefSeqOid() < b->GetRefSeqOid());
124 }
125 
126 
128 public:
// Sequence source that feeds one entry per cluster to a CBuildDatabase,
// reading the data from an existing source database. The class header and a
// few member declarations are not visible in this view.
// The constructor receives the source database, the cluster list and the
// output database builder.
129  CClusterDBSource(CRef<CSeqDBExpert> & source_db, vector<CRef<CCluster> > & clusters, CBuildDatabase * outdb);
130 
// (header not visible) Empty body.
132  {
133  }
134 
// Produces the next cluster entry; returns false when no further entry is
// available. All six arguments are outputs.
135  virtual bool GetNext(CTempString & sequence,
136  CTempString & ambiguities,
137  CRef<CBlast_def_line_set> & deflines,
138  vector<SBlastDbMaskData> & mask_range,
139  vector<int> & column_ids,
140  vector<CTempString> & column_blobs);
141 
142 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
143  (!defined(NCBI_COMPILER_MIPSPRO)) )
// Column name listing; the body's single statement is not visible here.
144  virtual void GetColumnNames(vector<string> & names)
145  {
147  }
148 
// Forwards the column-id lookup to the source database.
149  virtual int GetColumnId(const string & name)
150  {
151  return m_Source->GetColumnId(name);
152  }
153 
// Forwards the column metadata lookup to the source database.
154  virtual const map<string,string> & GetColumnMetaData(int id)
155  {
156  return m_Source->GetColumnMetaData(id);
157  }
158 #endif
159 
160 private:
// Held by reference: the vector handed to the constructor must outlive this
// object.
162  vector<CRef<CCluster> > & m_Clusters;
164 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
165  (!defined(NCBI_COMPILER_MIPSPRO)) )
// Blob storage reused by GetNext(); the CTempStrings it hands out are built
// from these blobs.
166  vector<CBlastDbBlob> m_Blobs;
167  vector<int> m_ColumnIds;
168  vector<string> m_ColumnNames;
169  vector<int> m_MaskIds;
171 #endif
172 };
173 
175  : m_Source(source_db), m_Clusters(cluster), m_CurrentCluster(0)
176 {
// Constructor body. Registers the source database's masking algorithms with
// the output database, prepares column information, then advances
// m_CurrentCluster past clusters whose reference sequence has a negative OID.
// Throws CSeqDBException (eArgErr) when every cluster is skipped.
// Several lines (signature, loop headers, some statements) are not visible.
177 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
178  (!defined(NCBI_COMPILER_MIPSPRO)) )
179  // Process mask meta data
// For each mask algorithm id, fetch its details from the source database and
// remember the id returned by the output database's registration call.
181  ITERATE(vector<int>, algo_id, m_MaskIds) {
183  string algo_opts, algo_name;
184  m_Source->GetMaskAlgorithmDetails(*algo_id, algo, algo_name, algo_opts);
// The condition guarding this append is on a line not visible here.
186  algo_name += NStr::IntToString(*algo_id);
187  }
188  m_MaskIdMap[*algo_id] = outdb->RegisterMaskingAlgorithm(algo, algo_opts, algo_name);
189  }
190  // Process columns
// The loop body is not visible in this view.
192  for(int i = 0; i < (int)m_ColumnNames.size(); i++) {
194  }
195 #endif
// The enclosing loop header is not visible; presumably it advances
// m_CurrentCluster over m_Clusters -- confirm. A warning is logged for each
// cluster whose reference OID is negative; the first non-negative one stops
// the scan.
197  if(m_Clusters[m_CurrentCluster]->GetRefSeqOid() < 0) {
198  LOG_POST(Warning << m_Clusters[m_CurrentCluster]->GetRefSeqId() + " not in source db");
199  }
200  else {
201  break;
202  }
203  }
204 
// Every cluster was skipped: nothing can be written.
205  if(m_CurrentCluster == m_Clusters.size()) {
206  NCBI_THROW(CSeqDBException, eArgErr, "No valid cluster");
207  }
208 }
209 
// Produces the output entry for the current cluster.
// Fills sequence/ambiguities with the raw data of the cluster's reference
// sequence, deflines with a one-element set holding the reference defline,
// and (when the compiler guard allows) mask_range, column_ids and
// column_blobs with the reference sequence's mask and column data.
// Returns false when m_CurrentCluster is past the end of m_Clusters or when
// CheckOrFindOID() fails; throws CSeqDBException (eArgErr) when
// CheckOrFindOID() changes the OID. Several lines of this function are not
// visible in this view.
210 bool
212  CTempString & ambiguities,
213  CRef<CBlast_def_line_set> & deflines,
214  vector<SBlastDbMaskData> & mask_range,
215  vector<int> & column_ids,
216  vector<CTempString> & column_blobs)
217 {
218  if(m_CurrentCluster >= m_Clusters.size()) {
219  return false;
220  }
221 
// `cluster` is declared on a line not visible here; presumably it is
// m_Clusters[m_CurrentCluster] -- confirm.
223  blastdb::TOid ref_oid = cluster->GetRefSeqOid();
224  if (! m_Source->CheckOrFindOID(ref_oid)){
225  return false;
226  }
227 
// CheckOrFindOID() takes the OID by reference; a changed value means the
// requested OID itself was not accepted.
228  if (ref_oid != cluster->GetRefSeqOid()) {
229  NCBI_THROW(CSeqDBException, eArgErr, "Oid not found");
230  }
231 
232  const char * seq_ptr;
233  int slength(0), alength(0);
234 
235  m_Source->GetRawSeqAndAmbig(ref_oid, &seq_ptr, & slength, & alength);
236 
// The ambiguity data is taken from the same buffer, directly after the
// slength bytes of sequence data.
237  sequence = CTempString(seq_ptr, slength);
238  ambiguities = CTempString(seq_ptr + slength, alength);
239 
240  CRef<CBlast_def_line_set> ref_defline_set = m_Source->GetHdr(ref_oid);
// Copies the defline list by value.
241  CBlast_def_line_set::Tdata ref_deflines = ref_defline_set->Set();
244 
245  CRef<CClusterSeq> ref_seq = cluster->GetRefSeq();
246  CSeq_id ref_seqid(ref_seq->GetId());
// taxids, ref_ts and bf are declared on lines not visible here. Every
// defline's tax ids go into taxids; the defline carrying a Seq-id that
// matches the reference id becomes bf and its tax ids go into ref_ts.
248  NON_CONST_ITERATE(CBlast_def_line_set::Tdata, itr, ref_deflines) {
249  CBlast_def_line::TTaxIds ts = (*itr)->GetTaxIds();
250  taxids.insert(ts.begin(), ts.end());
251  ITERATE(list< CRef<CSeq_id> >, seqid, (*itr)->GetSeqid()) {
252  if (ref_seqid.Match(**seqid)) {
253  bf.Reset(*itr);
254  ref_ts.insert(ts.begin(), ts.end());
255  break;
256  }
257  }
258  }
259 
// NOTE(review): bf is dereferenced below; if _ASSERT is compiled out in
// release builds, an unmatched reference id would leave bf empty -- confirm.
260  _ASSERT(bf.NotEmpty());
261  const vector<CRef<CClusterSeq> > & mem_seqs = cluster->GetMemSeqs();
262  if (mem_seqs.size() > 0) {
// Collect the OIDs of member sequences, warning about and skipping members
// with a negative OID, then add their tax ids to taxids.
263  vector<blastdb::TOid> mem_oids;
264  for (unsigned int i=0; i < mem_seqs.size(); i++) {
265  int64_t mem_oid = mem_seqs[i]->GetOid();
266  if (mem_oid < 0) {
267  LOG_POST(Warning << mem_seqs[i]->GetId() + " not in source db");
268  continue;
269  }
270  mem_oids.push_back(mem_oid);
271  }
272  std::sort(mem_oids.begin(), mem_oids.end());
273  set<TTaxId> mem_ts;
274  m_Source->GetTaxIdsForOids(mem_oids, mem_ts);
275  taxids.insert(mem_ts.begin(), mem_ts.end());
276  }
// diff_ts = taxids minus ref_ts. The vector is sized for the worst case and
// trimmed to the number of elements actually written.
// NOTE(review): std::set_difference needs sorted inputs; taxids and ref_ts
// are presumably ordered sets -- their declarations are not visible.
277  vector<CBlast_def_line::TTaxid> diff_ts;
278  diff_ts.resize(taxids.size());
279  vector<CBlast_def_line::TTaxid>::iterator diff_ts_itr;
280 
281  diff_ts_itr = std::set_difference(taxids.begin(), taxids.end(), ref_ts.begin(), ref_ts.end(), diff_ts.begin());
282  diff_ts.resize(diff_ts_itr - diff_ts.begin());
283  if (diff_ts.size() > 0) {
// Merge the extra tax ids with any leaf tax ids already on the reference
// defline and store the union back on it.
284  CBlast_def_line::TTaxIds leaf_ts(diff_ts.begin(), diff_ts.end());
285  const CBlast_def_line::TTaxIds& tx = bf->GetLeafTaxIds();
286  if(tx.size() > 0) {
287  leaf_ts.insert(tx.begin(), tx.end());
288  }
289  bf->SetLeafTaxIds(leaf_ts);
290  }
291 
// Only the reference defline is emitted for the cluster.
292  deflines.Reset(new CBlast_def_line_set());
293  deflines->Set().push_back(bf);
294 
295 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
296  (!defined(NCBI_COMPILER_MIPSPRO)) )
297  // process masks
298  ITERATE(vector<int>, algo_id, m_MaskIds) {
299 
// `ranges` is declared on a line not visible here.
301  m_Source->GetMaskData(ref_oid, *algo_id, ranges);
302 
// Translate the source algorithm id into the id stored in m_MaskIdMap.
303  SBlastDbMaskData mask_data;
304  mask_data.algorithm_id = m_MaskIdMap[*algo_id];
305 
// The loop header over `ranges` is not visible here.
307  mask_data.offsets.push_back(pair<TSeqPos, TSeqPos>(range->first, range->second));
308  }
309 
310  mask_range.push_back(mask_data);
311  }
312 
313  // The column IDs will be the same each time; another approach is
314  // to only send back the IDs for those columns that are non-empty.
315  column_ids = m_ColumnIds;
316  column_blobs.resize(column_ids.size());
317  m_Blobs.resize(column_ids.size());
318 
// The returned CTempStrings are built from m_Blobs, a member of this object.
319  for(int i = 0; i < (int)column_ids.size(); i++) {
320  m_Source->GetColumnBlob(column_ids[i], ref_oid, m_Blobs[i]);
321  column_blobs[i] = m_Blobs[i].Str();
322  }
323 #endif
324 
// NOTE(review): `sequence` and `ambiguities` were built from seq_ptr, which
// is handed back to the source here before returning -- confirm the buffer
// stays valid for the caller.
325  m_Source->RetSequence(&seq_ptr);
// One statement is not visible here; presumably it advances
// m_CurrentCluster -- confirm.
327 
328  return true;
329 }
330 
331 
332 /// The main application class
334 public:
335 
336  /** @inheritDoc */
// Constructor (header not visible): starts with no log stream, installs the
// BLAST version information, starts the stop watch and, when usage reporting
// is enabled, records the program name "makeclusterdb".
338  : m_LogFile(NULL)
339  {
341  version->SetVersionInfo(new CBlastVersion());
343  m_StopWatch.Start();
344  if (m_UsageReport.IsEnabled()) {
346  m_UsageReport.AddParam(CBlastUsageReport::eProgram, (string) "makeclusterdb");
347  }
348  }
// (header and body of the following member are not visible)
351  }
352 
353 private:
354  /** @inheritDoc */
355  virtual void Init();
356  /** @inheritDoc */
357  virtual int Run();
358 
// Runs the whole build: reads arguments, loads the cluster file, resolves
// OIDs, and writes the output database.
359  void x_BuildDatabase();
360 
// Parses the cluster input file into m_Clusters and m_ClusterSeqs.
361  void x_ProcessInputFile(const string & input_file);
// Opens the source database and assigns an OID to every parsed sequence.
362  void x_ProcessInputData(const string & source_db, bool is_protein);
363 
364  void x_AddCmdOptions();
365 
366  // Data
// One entry per cluster read from the input file.
370  vector<CRef<CCluster> > m_Clusters;
// Every reference and member sequence, in the order they were read.
371  vector<CRef<CClusterSeq> > m_ClusterSeqs;
372 
375 };
376 
377 
378 
379 /// Command line flag to represent the input
380 static const string kInput("in");
381 /// Defines token separators when multiple inputs are present
// NOTE(review): no use of kInputSeparators is visible in this file view --
// confirm whether it is still needed.
382 static const string kInputSeparators(" ");
383 /// Command line flag to represent the output
384 static const string kOutput("out");
385 
387 {
// Declares the command-line interface: input options (cluster file, source
// database, molecule type), a configuration option (database title) and
// output options (database name, maximum file size, metadata path prefix,
// log file, verbosity). The final argument line of several calls is not
// visible in this view.
389 
390  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
391 
392  // Specify USAGE context
393  arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
394  "Application to create BLAST databases, version "
395  + CBlastVersion().Print());
396 
397  arg_desc->SetCurrentGroup("Input options");
398  arg_desc->AddDefaultKey(kInput, "input_file",
399  "Input file",
401  arg_desc->AddDefaultKey(kArgDb, "source_db",
402  "Source DB",
404 
// The molecule type defaults to "prot" and is restricted to "nucl" / "prot".
405  arg_desc->AddDefaultKey(kArgDbType, "molecule_type",
406  "Molecule type of target db", CArgDescriptions::eString, "prot");
407  arg_desc->SetConstraint(kArgDbType, &(*new CArgAllow_Strings,
408  "nucl", "prot"));
409 
410  arg_desc->SetCurrentGroup("Configuration options");
411  arg_desc->AddOptionalKey(kArgDbTitle, "database_title",
412  "Title for BLAST database\n",
414 
415  arg_desc->SetCurrentGroup("Output options");
416  arg_desc->AddOptionalKey(kOutput, "database_name",
417  "Name of BLAST database to be created\n",
419  arg_desc->AddDefaultKey("max_file_sz", "number_of_bytes",
420  "Maximum file size for BLAST database files",
422  arg_desc->AddOptionalKey("metadata_output_prefix", "",
423  "Path prefix for location of database files in metadata", CArgDescriptions::eString);
424  arg_desc->AddOptionalKey("logfile", "File_Name",
425  "File to which the program log should be redirected",
428  arg_desc->AddFlag("verbose", "Produce verbose output", true);
429 
// Ownership of the descriptions is released to SetupArgDescriptions().
430  SetupArgDescriptions(arg_desc.release());
431 }
432 
433 /// Converts a Uint8 into a string which contains a data size (converse to
434 /// NStr::StringToUInt8_DataSize)
435 /// @param v value to convert [in]
436 /// @param minprec minimum precision [in]
437 static string Uint8ToString_DataSize(Uint8 v, unsigned minprec = 10)
438 {
439  static string kMods = "KMGTPEZY";
440 
441  size_t i(0);
442  for(i = 0; i < kMods.size(); i++) {
443  if (v < Uint8(minprec)*1024) {
444  v /= 1024;
445  }
446  }
447 
448  string rv = NStr::UInt8ToString(v);
449 
450  if (i) {
451  rv.append(kMods, i, 1);
452  rv.append("B");
453  }
454 
455  return rv;
456 }
457 
459 {
// Reads the cluster definition file line by line. Each usable line is split
// on spaces/tabs; column 0 is the reference sequence id and column 1 a member
// sequence id. A new cluster is started whenever the reference id differs
// from that of the cluster currently being filled, so lines of one cluster
// are grouped only when they are consecutive in the file.
460  CNcbiIfstream input_stream(input_file);
461  string line = kEmptyStr;
462  CRef<CCluster> current_cluster;
463  unsigned int cluster_id = 0;
// NOTE(review): the stream state is tested before getline() rather than on
// its result. If the last line has no trailing newline, the following
// getline() may fail without clearing `line`, so the final record could be
// processed a second time -- confirm; `while (getline(input_stream, line))`
// would avoid this.
464  while (input_stream) {
465  getline(input_stream, line);
// Skip empty lines and lines consisting only of spaces.
466  if(line.empty() || (line.find_first_not_of(' ') == std::string::npos)) {
467  continue;
468  }
469 
470  vector<string> cols;
471  NStr::Split(line, " \t", cols);
// Lines with fewer than three columns are ignored, although only the first
// two columns are read below.
472  if (cols.size() < 3) {
473  continue;
474  }
475  string ref_id(cols[0]);
476  if(current_cluster.Empty() || (current_cluster->GetRefSeqId() != ref_id)) {
// New reference id: open a new cluster and register its reference sequence.
477  current_cluster.Reset(new CCluster(cluster_id));
478  cluster_id ++;
479  CRef<CClusterSeq> r_seq(new CClusterSeq(current_cluster, ref_id, true));
480  current_cluster->SetRefSeq(r_seq);
481  m_Clusters.push_back(current_cluster);
482  m_ClusterSeqs.push_back(r_seq);
483  }
484  string mem_id(cols[1]);
// A line whose member id equals the reference id adds no member.
485  if (ref_id != mem_id) {
486  CRef<CClusterSeq> m(new CClusterSeq(current_cluster, mem_id, false));
487  current_cluster->AddMemSeq(m);
488  m_ClusterSeqs.push_back(m);
489  }
490  }
491 
// cluster_id equals the number of clusters created above.
492  LOG_POST(Info <<"Num of Reference Seqs: " << cluster_id);
493  LOG_POST(Info <<"Num of Cluster Seqs: " << m_ClusterSeqs.size());
495 }
496 
// Opens the source database and resolves every parsed sequence id to an OID.
// @param source_db name of the source database to open
// @param is_protein true to open the database as protein, false as nucleotide
// Throws CSeqDBException (eArgErr) when the lookup does not return exactly
// one OID per sequence. On return each entry of m_ClusterSeqs carries its OID
// and m_Clusters is sorted with SortCluster.
497 void CMakeClusterDBApp::x_ProcessInputData(const string & source_db, bool is_protein)
498 {
499  vector<string> accs;
500  vector<blastdb::TOid> oids;
501  accs.reserve(m_ClusterSeqs.size());
502  oids.reserve(m_ClusterSeqs.size());
503  CSeqDB::ESeqType seq_type = is_protein ? CSeqDB::eProtein : CSeqDB::eNucleotide;
504  m_SourceDB.Reset(new CSeqDBExpert(source_db, seq_type));
505 
// The loop header is not visible here; presumably it iterates over
// m_ClusterSeqs so that accs[i] corresponds to m_ClusterSeqs[i] -- confirm.
507  accs.push_back((*itr)->GetId());
508  }
509  m_SourceDB->AccessionsToOids(accs, oids);
510 
511  if (oids.size() != m_ClusterSeqs.size()) {
512  NCBI_THROW(CSeqDBException, eArgErr, " Accessions to Oids look up error");
513  }
514 
// Positional mapping: the i-th OID belongs to the i-th cluster sequence.
515  for (uint64_t i=0; i < oids.size(); i++) {
516  m_ClusterSeqs[i]->SetOid(oids[i]);
517  }
// Sorting must follow OID assignment because SortCluster compares the
// reference sequences' OIDs.
518  std::sort(m_Clusters.begin(), m_Clusters.end(), SortCluster);
519 }
520 
522 {
// Drives the whole run: reads the arguments, parses the cluster file,
// resolves OIDs against the source database, creates the output database
// builder, feeds it the cluster entries and finishes the build. When
// METADATA_CLUSTERDB is defined and the build succeeded, a JSON metadata file
// is written next to the new database. Some lines are not visible here.
523  const CArgs & args = GetArgs();
524 
525  // Get arguments to the CBuildDatabase constructor.
526 
527  bool is_protein = (args[kArgDbType].AsString() == "prot");
528 
529  // 1. database name option if present
530  // 2. else, kInput
531  string out_dbname = (args[kOutput].HasValue() ? args[kOutput] : args[kInput]).AsString();
532 
// Default title is "Cluster " followed by the source database name.
533  string title = args[kArgDbTitle].HasValue() ? args[kArgDbTitle].AsString() : "Cluster " + args[kArgDb].AsString();
534 
// Log to the requested file, or to cout when no log file was given.
535  m_LogFile = & (args["logfile"].HasValue() ? args["logfile"].AsOutputFile() : cout);
536 
537  x_ProcessInputFile(args[kInput].AsString());
538 
539  x_ProcessInputData(args[kArgDb].AsString(), is_protein);
540 
541  bool long_seqids = true;
542  bool limit_defline = false;
543  m_DB.Reset(new CBuildDatabase(out_dbname, title, is_protein, false, true, false, m_LogFile,
544  long_seqids, eBDB_Version5, limit_defline, 0));
545 
546  if (args["verbose"]) {
547  m_DB->SetVerbosity(true);
548  }
549 
550  // Max file size
// Values of 0x100000000 bytes (4 GiB) or more are rejected.
551  Uint8 bytes = NStr::StringToUInt8_DataSize(args["max_file_sz"].AsString());
552  static const Uint8 MAX_VOL_FILE_SIZE = 0x100000000;
553  if (bytes >= MAX_VOL_FILE_SIZE) {
554  NCBI_THROW(CInvalidDataException, eInvalidInput, "max_file_sz must be < 4 GiB");
555  }
556  *m_LogFile << "Maximum file size: " << Uint8ToString_DataSize(bytes) << endl;
557 
558  m_DB->SetMaxFileSize(bytes);
// `empty` is declared on a line not visible here.
560  m_DB->SetLeafTaxIds(empty, true);
561 
// `raw` is created on a line not visible here; presumably it is the
// CClusterDBSource built from m_SourceDB and m_Clusters -- confirm.
563  m_DB->AddSequences(*raw);
564 
// NOTE(review): in the visible code `success` and this `new_db` are only
// consumed inside the METADATA_CLUSTERDB section (which declares its own
// `new_db`), so a failed EndBuild() is otherwise not reported -- confirm.
565  bool success = m_DB->EndBuild();
566  string new_db = m_DB->GetOutputDbName();
567 
568 #ifdef METADATA_CLUSTERDB
569  if(success) {
570  string new_db = m_DB->GetOutputDbName();
// `t` is declared on a line not visible here.
572  CSeqDB sdb(new_db, t);
573  string output_prefix = args["metadata_output_prefix"]
574  ? args["metadata_output_prefix"].AsString() : kEmptyStr;
575 
// Make sure a non-empty prefix ends with the platform's path separator.
576  if (!output_prefix.empty() && (output_prefix.back() != CFile::GetPathSeparator())) {
577  output_prefix += CFile::GetPathSeparator();
578  }
579  CRef<CBlast_db_metadata> m = sdb.GetDBMetaData(output_prefix);
580  string extn (kEmptyStr);
581  SeqDB_GetMetadataFileExtension(is_protein, extn);
582  string metadata_filename = new_db + "." + extn;
// Serialize the metadata object as JSON into <new_db>.<extn>.
583  ofstream out(metadata_filename.c_str());
584  unique_ptr<CObjectOStreamJson> json_out(new CObjectOStreamJson(out, eNoOwnership));
585  json_out->SetDefaultStringEncoding(eEncoding_Ascii);
586  json_out->PreserveKeyNames();
587  CConstObjectInfo obj_info(m, m->GetTypeInfo());
588  json_out->WriteObject(obj_info);
589  json_out->Flush();
590  out.flush();
591  out << NcbiEndl;
592  }
593 #endif
594 }
595 
597 {
// Application body: sets the diagnostics prefix, runs the database build and
// returns the resulting status (0 unless CATCH_ALL assigns another value).
// Two statements of this function are not visible in this view.
599  SetDiagPostPrefix("makeclusterdb");
600 
601  int status = 0;
// CATCH_ALL is the shared BLAST command-line catch block; it receives
// `status` so that it can record a failure code.
602  try { x_BuildDatabase(); }
603  CATCH_ALL(status)
// Runs after the build whether or not an exception was caught above.
604  x_AddCmdOptions();
606  return status;
607 }
608 
610 {
// Acts only when the molecule-type argument has a value; the statement
// inside the condition is not visible in this view.
611  const CArgs & args = GetArgs();
612  if (args[kArgDbType].HasValue()) {
614  }
615 }
616 
617 
618 #ifndef SKIP_DOXYGEN_PROCESSING
619 int main(int argc, const char* argv[] /*, const char* envp[]*/)
620 {
621  return CMakeClusterDBApp().AppMain(argc, argv);
622 }
623 #endif /* SKIP_DOXYGEN_PROCESSING */
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Declares singleton objects to store the version and reference for the BLAST engine.
#define CATCH_ALL(exit_code)
Standard catch statement for all BLAST command line programs.
Interface for converting sources of sequence data into blast sequence input.
Auxiliary classes/functions for BLAST input library.
Code to build a database given various sources of sequence data.
CArgAllow_Strings –.
Definition: ncbiargs.hpp:1641
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
void AddParam(EUsageParams p, int val)
Keeps track of the version of the BLAST engine in the NCBI C++ toolkit.
Definition: version.hpp:53
Build BlastDB format databases from various data sources.
Definition: build_db.hpp:136
bool AddSequences(IBioseqSource &src, bool add_pig=false)
Add sequences from an IBioseqSource object.
Definition: build_db.cpp:794
void SetVerbosity(bool v)
Specify level of output verbosity.
Definition: build_db.hpp:392
int RegisterMaskingAlgorithm(EBlast_filter_program program, const string &options, const string &name="")
Define a masking algorithm.
Definition: build_db.cpp:1584
string GetOutputDbName() const
Definition: build_db.hpp:465
bool EndBuild(bool erase=false)
Finish building a new database.
Definition: build_db.cpp:1423
void SetMaxFileSize(Uint8 max_file_size)
Set the maximum size of database component files.
Definition: build_db.cpp:1578
void SetLeafTaxIds(const TIdToLeafs &taxids, bool keep_taxids)
Specify a leaf-taxids object.
Definition: build_db.cpp:1278
virtual bool GetNext(CTempString &sequence, CTempString &ambiguities, CRef< CBlast_def_line_set > &deflines, vector< SBlastDbMaskData > &mask_range, vector< int > &column_ids, vector< CTempString > &column_blobs)
uint64_t m_CurrentCluster
virtual void GetColumnNames(vector< string > &names)
Get the names of all columns defined by this sequence source.
CRef< CSeqDBExpert > m_Source
vector< string > m_ColumnNames
CClusterDBSource(CRef< CSeqDBExpert > &source_db, vector< CRef< CCluster > > &clusters, CBuildDatabase *outdb)
virtual ~CClusterDBSource()
vector< CBlastDbBlob > m_Blobs
virtual int GetColumnId(const string &name)
Get the column ID for a column mentioned by name.
vector< int > m_MaskIds
virtual const map< string, string > & GetColumnMetaData(int id)
Get metadata for the column with the specified Column ID.
vector< int > m_ColumnIds
vector< CRef< CCluster > > & m_Clusters
map< int, int > m_MaskIdMap
const string & GetId() const
int64_t GetOid() const
void SetOid(int64_t oid)
blastdb::TOid m_Oid
CRef< CCluster > & GetCluster()
CClusterSeq(CRef< CCluster > cluster, const string &id, bool is_refseq)
bool IsRefSeq() const
CRef< CCluster > m_Cluster
void AddMemSeq(CRef< CClusterSeq > &m)
unsigned int m_ClusterId
const string & GetRefSeqId()
void SetRefSeq(CRef< CClusterSeq > &r)
CCluster(unsigned int cluster_id)
const vector< CRef< CClusterSeq > > & GetMemSeqs()
CRef< CClusterSeq > m_RefSeq
unsigned int GetClusterId()
vector< CRef< CClusterSeq > > m_MemSeqs
int64_t GetRefSeqOid()
CRef< CClusterSeq > & GetRefSeq()
CConstObjectInfo –.
Definition: objectinfo.hpp:421
Defines invalid user input exceptions.
The main application class.
vector< CRef< CCluster > > m_Clusters
CNcbiOstream * m_LogFile
CRef< CBuildDatabase > m_DB
CStopWatch m_StopWatch
void x_ProcessInputData(const string &source_db, bool is_protein)
vector< CRef< CClusterSeq > > m_ClusterSeqs
CMakeClusterDBApp()
@inheritDoc
virtual void Init()
@inheritDoc
virtual int Run()
@inheritDoc
void x_ProcessInputFile(const string &input_file)
CRef< CSeqDBExpert > m_SourceDB
CBlastUsageReport m_UsageReport
CObjectOStreamJson –.
Definition: objostrjson.hpp:54
CObject –.
Definition: ncbiobj.hpp:180
CSeqDBException.
Definition: seqdbcommon.hpp:73
CSeqDBExpert.
Definition: seqdbexpert.hpp:55
void GetRawSeqAndAmbig(int oid, const char **buffer, int *seq_length, int *ambig_length) const
Raw Sequence and Ambiguity Data.
Definition: seqdbexpert.cpp:64
CSeqDB.
Definition: seqdb.hpp:161
void GetColumnBlob(int col_id, int oid, CBlastDbBlob &blob)
Fetch the data blob for the given column and oid.
Definition: seqdb.cpp:1220
void GetAvailableMaskAlgorithms(vector< int > &algorithms)
Get a list of algorithm IDs for which mask data exists.
Definition: seqdb.cpp:1227
ESeqType
Sequence types (eUnknown tries protein, then nucleotide).
Definition: seqdb.hpp:173
@ eNucleotide
Definition: seqdb.hpp:175
@ eProtein
Definition: seqdb.hpp:174
void GetTaxIdsForOids(const vector< blastdb::TOid > &oids, set< TTaxId > &tax_ids) const
Definition: seqdb.cpp:267
void GetMaskAlgorithmDetails(int algorithm_id, objects::EBlast_filter_program &program, string &program_name, string &algo_opts)
Get information about one type of masking available here.
Definition: seqdb.cpp:1263
void RetSequence(const char **buffer) const
Returns any resources associated with the sequence.
Definition: seqdb.cpp:523
CRef< CBlast_db_metadata > GetDBMetaData(string user_path=kEmptyStr)
Definition: seqdb.cpp:1673
void ListColumns(vector< string > &titles)
List columns titles found in this database.
Definition: seqdb.cpp:1191
bool CheckOrFindOID(int &next_oid) const
Find an included OID, incrementing next_oid if necessary.
Definition: seqdb.cpp:728
int GetColumnId(const string &title)
Get an ID number for a given column title.
Definition: seqdb.cpp:1196
CRef< CBlast_def_line_set > GetHdr(int oid) const
Get the ASN.1 header for the sequence.
Definition: seqdb.cpp:418
void AccessionsToOids(const vector< string > &accs, vector< blastdb::TOid > &oids) const
Definition: seqdb.cpp:252
void GetMaskData(int oid, const vector< int > &algo_ids, TSequenceRanges &ranges)
Get masked ranges of a sequence.
Definition: seqdb.hpp:1408
const map< string, string > & GetColumnMetaData(int column_id)
Get all metadata for the specified column.
Definition: seqdb.cpp:1202
CStopWatch –.
Definition: ncbitime.hpp:1938
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
Interface to a source of raw sequence data.
Definition: build_db.hpp:70
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator begin() const
Definition: set.hpp:135
size_type size() const
Definition: set.hpp:132
const_iterator end() const
Definition: set.hpp:136
const string kArgDbTitle
Title for the BLAST database.
const string kArgDbType
BLAST database molecule type.
const string kArgDb
BLAST database name.
void Print(const CCompactSAMApplication::AlignInfo &ai)
API (CDeflineGenerator) for computing sequences' titles ("definitions").
static FILE * input_file
Definition: common.c:35
static const struct name_t names[]
std::ofstream out("events_result.xml")
main entry point for tests
void SetFullVersion(CRef< CVersionAPI > version)
Set version data for the program.
Definition: ncbiapp.cpp:1154
void HideStdArgs(THideStdArgs hide_mask)
Set the hide mask for the Hide Std Flags.
Definition: ncbiapp.cpp:1292
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:285
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:799
CVersionInfo GetVersion(void) const
Get the program version information.
Definition: ncbiapp.cpp:1164
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1175
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
@ fHideXmlHelp
Hide XML help description.
@ fHideFullVersion
Hide full version description.
@ fHideDryRun
Hide dryrun description.
@ fHideConffile
Hide configuration file description.
@ eNoOwnership
No ownership is assumed.
Definition: ncbi_types.h:135
@ fAppend
Append to end-of-file; for eOutputFile or eIOFile.
Definition: ncbiargs.hpp:622
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eOutputFile
Name of file (must be writable)
Definition: ncbiargs.hpp:596
#define NULL
Definition: ncbistd.hpp:225
void SetDiagPostPrefix(const char *prefix)
Specify a string to prefix all subsequent error postings with.
Definition: ncbidiag.cpp:6097
EDiagSev SetDiagPostLevel(EDiagSev post_sev=eDiag_Error)
Set the threshold severity for posting the messages.
Definition: ncbidiag.cpp:6129
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
static char GetPathSeparator(void)
Get path separator symbol specific for the current platform.
Definition: ncbifile.cpp:433
bool Match(const CSeq_id &sid2) const
Match() - TRUE if SeqIds are equivalent.
Definition: Seq_id.hpp:1033
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
TObjectType * GetPointer(void) THROWS_NONE
Get pointer,.
Definition: ncbiobj.hpp:998
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
bool IsEnabled(void)
Indicates whether application usage statistics collection is enabled for a current reporter instance.
#define NcbiEndl
Definition: ncbistre.hpp:548
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
#define kEmptyStr
Definition: ncbistr.hpp:123
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3457
static Uint8 StringToUInt8_DataSize(const CTempString str, TStringToNumFlags flags=0)
Convert string that can contain "software" qualifiers to Uint8.
Definition: ncbistr.cpp:1539
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5083
static string UInt8ToString(Uint8 value, TNumToStringFlags flags=0, int base=10)
Convert UInt8 to string.
Definition: ncbistr.hpp:5167
@ eEncoding_Ascii
Definition: ncbistr.hpp:202
double Elapsed(void) const
Return time elapsed since first Start() or last Restart() call (in seconds).
Definition: ncbitime.hpp:2776
void Start(void)
Start the timer.
Definition: ncbitime.hpp:2765
#define CVersion
EBlast_filter_program
This defines the possible sequence filtering algorithms to be used in a BLAST database.
Tdata & Set(void)
Assign a value to data member.
list< CRef< CBlast_def_line > > Tdata
@ eBlast_filter_program_other
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
USING_SCOPE(blast)
static const string kOutput("out")
Command line flag to represent the output.
static string Uint8ToString_DataSize(Uint8 v, unsigned minprec=10)
Converts a Uint8 into a string which contains a data size (converse to NStr::StringToUInt8_DataSize)
int main(int argc, const char *argv[])
bool SortClusterSeqs(CRef< CClusterSeq > &a, CRef< CClusterSeq > &b)
static const string kInputSeparators(" ")
Defines token separators when multiple inputs are present.
USING_NCBI_SCOPE
static const string kInput("in")
Command line flag to represent the input.
bool SortCluster(CRef< CCluster > &a, CRef< CCluster > &b)
static int version
Definition: mdb_load.c:29
range(_Ty, _Ty) -> range< _Ty >
constexpr auto sort(_Init &&init)
constexpr bool empty(list< Ts... >) noexcept
unsigned int a
Definition: ncbi_localip.c:102
EIPRangeType t
Definition: ncbi_localip.c:101
ESERV_Algo algo
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
Defines BLAST database access classes.
Defines exception class and several constants for SeqDB.
Int4 TOid
Ordinal ID in BLAST databases.
Definition: seqdbcommon.hpp:58
@ eBDB_Version5
Definition: seqdbcommon.hpp:53
void SeqDB_GetMetadataFileExtension(bool db_is_protein, string &extn)
signed __int64 int64_t
Definition: stdint.h:135
unsigned __int64 uint64_t
Definition: stdint.h:136
List of sequence offset ranges.
Definition: seqdb.hpp:236
Structure describing filtered regions created using a particular sequence filtering algorithm.
int algorithm_id
Identifies the algorithm used.
vector< pair< TSeqPos, TSeqPos > > offsets
Start and end offsets of the filtered area.
#define _ASSERT
Defines BLAST database construction classes.
Defines exception class for WriteDB.
Modified on Sat Dec 09 04:45:24 2023 by modify_doxy.py rev. 669887