NCBI C++ ToolKit
writedb.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: writedb.cpp 101152 2023-11-07 15:39:13Z camacho $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Kevin Bealer
27  *
28  */
29 
30 /// @file writedb.cpp
31 /// Implementation for the CWriteDB class, the top level class for WriteDB.
32 #include <ncbi_pch.hpp>
36 #include "writedb_impl.hpp"
38 #include <iostream>
39 
41 
42 using namespace std;
43 
44 // Impl
45 
46 
47 // CWriteDB
48 
49 CWriteDB::CWriteDB(const string & dbname,
50  CWriteDB::ESeqType seqtype,
51  const string & title,
52  int indices,
53  bool parse_ids,
54  bool long_ids,
55  bool use_gi_mask,
56  EBlastDbVersion dbver,
57  bool limit_defline,
58  Uint8 oid_masks,
59  bool scan_bioseq_4_cfastareader_usrobj)
60  : m_Impl(0)
61 {
63  seqtype == eProtein,
64  title,
65  (EIndexType)indices,
66  parse_ids,
67  long_ids,
68  use_gi_mask,
69  dbver,
70  limit_defline,
71  oid_masks);
72 }
73 
75 {
76  delete m_Impl;
77 }
78 
80 {
81  m_Impl->AddSequence(bs);
82 }
83 
85 {
86  m_Impl->AddSequence(bsh);
87 }
88 
90 {
91  m_Impl->AddSequence(bs, sv);
92 }
93 
95 {
96  m_Impl->SetDeflines(deflines);
97 }
98 
99 void CWriteDB::SetPig(int pig)
100 {
101  m_Impl->SetPig(pig);
102 }
103 
105 {
106  m_Impl->Close();
107 }
108 
109 void CWriteDB::AddSequence(const CTempString & sequence,
110  const CTempString & ambig)
111 {
112  string s(sequence.data(), sequence.length());
113  string a(ambig.data(), ambig.length());
114 
115  m_Impl->AddSequence(s, a);
116 }
117 
119 {
120  m_Impl->SetMaxFileSize(sz);
121 }
122 
124 {
126 }
127 
129 CWriteDB::ExtractBioseqDeflines(const CBioseq & bs, bool parse_ids,
130  bool long_ids,
131  bool scan_bioseq_4_cfastareader_usrobj)
132 {
133  return CWriteDB_Impl::ExtractBioseqDeflines(bs, parse_ids, long_ids, scan_bioseq_4_cfastareader_usrobj);
134 }
135 
136 void CWriteDB::SetMaskedLetters(const string & masked)
137 {
138  m_Impl->SetMaskedLetters(masked);
139 }
140 
141 void CWriteDB::ListVolumes(vector<string> & vols)
142 {
143  m_Impl->ListVolumes(vols);
144 }
145 
146 void CWriteDB::ListFiles(vector<string> & files)
147 {
148  m_Impl->ListFiles(files);
149 }
150 
151 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
152  (!defined(NCBI_COMPILER_MIPSPRO)) )
153 int CWriteDB::
155  const string & options,
156  const string & name)
157 {
158  return m_Impl->RegisterMaskAlgorithm(program, options, name);
159 }
160 
161 int CWriteDB::
162 RegisterMaskAlgorithm(const string & id,
163  const string & description,
164  const string & options)
165 {
166  return m_Impl->RegisterMaskAlgorithm(id, description, options);
167 }
168 
170  const vector<TGi> & gis)
171 {
172  m_Impl->SetMaskData(ranges, gis);
173 }
174 
175 int CWriteDB::FindColumn(const string & title) const
176 {
177  return m_Impl->FindColumn(title);
178 }
179 
180 int CWriteDB::CreateUserColumn(const string & title)
181 {
182  return m_Impl->CreateColumn(title);
183 }
184 
185 void CWriteDB::AddColumnMetaData(int col_id, const string & key, const string & value)
186 {
187  m_Impl->AddColumnMetaData(col_id, key, value);
188 }
189 
191 {
192  return m_Impl->SetBlobData(col_id);
193 }
194 #endif
195 
197  : m_IdType(id_type)
198 {
199 }
200 
201 void CBinaryListBuilder::Write(const string & fname)
202 {
203  // Create a binary stream.
204  ofstream outp(fname.c_str(), ios::binary);
205  Write(outp);
206 }
207 
209 {
210  // Header; first check for 8 byte ids.
211 
212  bool eight = false;
213 
214  ITERATE(vector<Int8>, iter, m_Ids) {
215  Int8 id = *iter;
216  _ASSERT(id > 0);
217 
218  if ((id >> 32) != 0) {
219  eight = true;
220  break;
221  }
222  }
223 
224  Int4 magic = 0;
225 
226  switch(m_IdType) {
227  case eGi:
228  magic = eight ? -2 : -1;
229  break;
230 
231  case eTi:
232  magic = eight ? -4 : -3;
233  break;
234 
235  default:
237  eArgErr,
238  "Error: Unsupported ID type specified.");
239  }
240 
241  s_WriteInt4(outp, magic);
242  s_WriteInt4(outp, (int)m_Ids.size());
243 
244  sort(m_Ids.begin(), m_Ids.end());
245 
246  if (eight) {
247  ITERATE(vector<Int8>, iter, m_Ids) {
248  s_WriteInt8BE(outp, *iter);
249  }
250  } else {
251  ITERATE(vector<Int8>, iter, m_Ids) {
252  s_WriteInt4(outp, (int)*iter);
253  }
254  }
255 }
256 
257 /// Returns true if the BLAST DB exists, otherwise throws a CSeqDBException
258 /// @param dbname name of BLAST DB [in]
259 /// @param is_prot is the BLAST DB protein? [in]
260 static bool
261 s_DoesBlastDbExist(const string& dbname, bool is_protein)
262 {
263  char dbtype(is_protein ? 'p' : 'n');
264  string path = SeqDB_ResolveDbPathNoExtension(dbname, dbtype);
265  if (path.empty()) {
266  string msg("Failed to find ");
267  msg += (is_protein ? "protein " : "nucleotide ");
268  msg += dbname + " BLAST database";
269  NCBI_THROW(CSeqDBException, eFileErr, msg);
270  }
271  return true;
272 }
273 
274 /// Computes the number of sequences and (alias) database length for alias
275 /// files
276 /// @param dbname Name of the BLAST database over which the alias file is being
277 /// created [in]
278 /// @param is_prot is the BLAST database protein? [in]
279 /// @param dbsize (Approximate) number of letters in the BLAST DB [out]
280 /// @param num_seqs_found Number of sequences found in the dbname, or the
281 /// number of sequences in the intersection between the dbname and the GIs in
282 /// the gi_file_name (if applicable) [out]
283 static bool
285  bool is_prot,
286  Uint8* dbsize,
287  int* num_seqs_found)
288 {
289  _ASSERT((dbsize != NULL));
290  _ASSERT(num_seqs_found != NULL);
291  *dbsize = 0u;
292  *num_seqs_found = 0u;
293 
295  try {
296  _TRACE("Attempting to compute length for '" << dbname << "'");
297  CRef<CSeqDB> dbhandle(new CSeqDB(dbname, dbtype));
298  dbhandle->GetTotals(CSeqDB::eFilteredAll, num_seqs_found, dbsize, false);
299  } catch(...) {
300  return false;
301  }
302  return true;
303 }
304 
305 static void
307  bool is_protein,
308  int num_seqs_found,
309  const string& gi_file_name = kEmptyStr,
310  int num_seqs_in_gifile = 0)
311 {
312  if ( !gi_file_name.empty() ) {
313  /* This won't work if the target directory is not the current working directory
314  CRef<CSeqDBFileGiList> gilist;
315  gilist.Reset(new CSeqDBFileGiList(gi_file_name));
316  num_seqs_in_gifile = gilist->Size();
317  } */
318  LOG_POST("Created " << (is_protein ? "protein " : "nucleotide ") <<
319  dbname << " BLAST (alias) database with " << num_seqs_found
320  << " sequences (out of " << num_seqs_in_gifile << " in "
321  << gi_file_name << ", " << setprecision(0) << fixed <<
322  (num_seqs_found*100.0/num_seqs_in_gifile) << "% found)");
323  } else {
324  LOG_POST("Created " << (is_protein ? "protein " : "nucleotide ") <<
325  "BLAST (alias) database " << dbname << " with " <<
326  num_seqs_found << " sequences");
327  }
328 }
329 
331  const string& db_name,
332  CWriteDB::ESeqType seq_type,
333  const string& gi_file_name,
334  const string& title,
335  EAliasFileFilterType alias_type /*= eGiList*/)
336 {
337  vector<string> db(1, db_name);
338  CWriteDB_CreateAliasFile(file_name, db, seq_type, gi_file_name, title,
339  alias_type);
340 }
341 
342 /// Auxiliary function to convert the enumeration into a string
344 {
345  string retval;
346  switch (e) {
347  case eGiList: retval = "GILIST"; break;
348  case eTiList: retval = "TILIST"; break;
349  case eSeqIdList: retval = "SEQIDLIST"; break;
350  case eTaxIdList: retval = "TAXIDLIST"; break;
351  case eNoAliasFilterType: break;
352  default: _ASSERT(false); /* Need to add a type here? */
353  }
354  return retval;
355 }
356 
357 static void
359  const vector<string>& databases,
360  CWriteDB::ESeqType seq_type,
361  const string& gi_file_name,
362  const string& title,
363  EAliasFileFilterType alias_type,
364  const TSeqRange* oid_range = NULL)
365 {
366  bool is_prot(seq_type == CWriteDB::eProtein ? true : false);
367  Uint8 dbsize = 0;
368  int num_seqs = 0;
369  CNcbiOstrstream fnamestr;
370  fnamestr << file_name << (is_prot ? ".pal" : ".nal");
371  string fname = CNcbiOstrstreamToString(fnamestr);
372 
373  ofstream out(fname.c_str());
374  out << "#\n# Alias file created " << CTime(CTime::eCurrent).AsString()
375  << "\n#\n";
376 
377  if ( !title.empty() ) {
378  out << "TITLE " << title << "\n";
379  }
380  out << "DBLIST ";
381  ITERATE(vector< string >, iter, databases) {
382  out << "\"" << *iter << "\" ";
383  }
384  out << "\n";
385  if ( !gi_file_name.empty() ) {
386  _ASSERT(alias_type != eNoAliasFilterType);
387  out << s_AliasFileFilterTypeToString(alias_type) << " "
388  << gi_file_name << "\n";
389  } else if (oid_range) {
390  out << "FIRST_OID " << oid_range->GetFrom() << "\n"
391  << "LAST_OID " << oid_range->GetToOpen() << "\n";
392  }
393  out.close();
394 
395  if (!s_ComputeNumSequencesAndDbLength(file_name, is_prot, &dbsize, &num_seqs)){
396  CDirEntry(fname).Remove();
397  _TRACE("Deleting " << fname);
398  string msg("BLASTDB alias file creation failed. Some referenced files may be missing");
399  NCBI_THROW(CSeqDBException, eArgErr, msg);
400  };
401  if (num_seqs == 0) {
402  CDirEntry(fname).Remove();
403  _TRACE("Deleting " << fname);
404  CNcbiOstrstream oss;
405  oss << "No seqs in " << s_AliasFileFilterTypeToString(alias_type) << " were found"
406  << " in BLAST database";
407  string msg = CNcbiOstrstreamToString(oss);
408  NCBI_THROW(CSeqDBException, eArgErr, msg);
409  }
410 
411  out.open(fname.c_str(), ios::out|ios::app);
412  out << "NSEQ " << num_seqs << "\n";
413  out << "LENGTH " << dbsize << "\n";
414  out.close();
415 
416  s_PrintAliasFileCreationLog(file_name, is_prot, num_seqs);
417 }
418 
/// Counts the decimal digits of @a n, with a floor of two.
///
/// The result is used as a zero-padded field width for volume numbers, so
/// values with fewer than two digits (including 0) still yield 2.
/// @param n Number whose digits are counted [in]
/// @return Number of decimal digits in n, but never less than 2.
int s_GetNumOfDigits(int n)
{
    int num_digits = 0;
    for ( ; n != 0; n /= 10) {
        ++num_digits;
    }
    return (num_digits > 2) ? num_digits : 2;
}
429 
430 
432  unsigned int num_volumes,
433  CWriteDB::ESeqType seq_type,
434  const string& title)
435 {
436  bool is_prot(seq_type == CWriteDB::eProtein ? true : false);
437  string concatenated_blastdb_name;
438  vector<string> volume_names(num_volumes, kEmptyStr);
439  int num_digits = s_GetNumOfDigits(num_volumes);
440  for (unsigned int i = 0; i < num_volumes; i++) {
441  CNcbiOstrstream oss;
442  oss << file_name << "." << setfill('0') << setw(num_digits) << i;
443  const string vol_name((string)CNcbiOstrstreamToString(oss));
444  s_DoesBlastDbExist(vol_name, is_prot);
445  volume_names.push_back(vol_name);
446  concatenated_blastdb_name += vol_name + " ";
447  }
448 
449  Uint8 dbsize = 0;
450  int num_seqs = 0;
451  s_ComputeNumSequencesAndDbLength(concatenated_blastdb_name, is_prot,
452  &dbsize, &num_seqs);
453  CNcbiOstrstream fname;
454  fname << file_name << (is_prot ? ".pal" : ".nal");
455 
456  ofstream out(((string)CNcbiOstrstreamToString(fname)).c_str());
457  out << "#\n# Alias file created " << CTime(CTime::eCurrent).AsString()
458  << "\n#\n";
459 
460  if ( !title.empty() ) {
461  out << "TITLE " << title << "\n";
462  }
463 
464  out << "DBLIST ";
465  ITERATE(vector<string>, itr, volume_names) {
466  out << CDirEntry(*itr).GetName() << " ";
467  }
468  out << "\n";
469  out << "NSEQ " << num_seqs << "\n";
470  out << "LENGTH " << dbsize << "\n";
471  out.close();
472  s_PrintAliasFileCreationLog(concatenated_blastdb_name, is_prot, num_seqs);
473 }
474 
476  const vector<string>& databases,
477  CWriteDB::ESeqType seq_type,
478  const string& gi_file_name,
479  const string& title,
480  EAliasFileFilterType alias_type /*= eGiList*/)
481 {
482  s_CreateAliasFilePriv(file_name, databases, seq_type, gi_file_name, title, alias_type);
483 }
484 
486  const vector<string>& db_names,
487  CWriteDB::ESeqType seq_type,
488  const TSeqRange& oid_range,
489  const string& title /*= string()*/)
490 {
491  s_CreateAliasFilePriv(file_name, db_names, seq_type, kEmptyStr, title,
492  eNoAliasFilterType, &oid_range);
493 }
494 
495 void
496 CWriteDB_ConsolidateAliasFiles(const list<string>& alias_files,
497  bool delete_source_alias_files /* = false */)
498 {
499  if (alias_files.empty()) {
501  eArgErr,
502  "No alias files available to create group alias file.");
503  }
504 
505  ofstream out(kSeqDBGroupAliasFileName.c_str());
506  out << "# Alias file index for " << CDir::GetCwd() << endl;
507  out << "# Generated on " << CTime(CTime::eCurrent).AsString() << " by "
508  << NCBI_CURRENT_FUNCTION << endl;
509  out << "#" << endl;
510 
511  ITERATE(list<string>, itr, alias_files) {
512  ifstream in(itr->c_str());
513  if ( !in ) {
514  LOG_POST(Warning << *itr << " does not exist, omitting from group alias file");
515  continue;
516  }
517  out << "ALIAS_FILE " << CFile(*itr).GetName() << endl;
518  string line;
519  while (getline(in, line)) {
521  if (line.empty() || NStr::StartsWith(line, "#")) {
522  continue;
523  }
524  out << line << endl;
525  }
526  out << endl;
527  }
528 
529  if (delete_source_alias_files) {
530  ITERATE(list<string>, itr, alias_files) {
531  CFile(*itr).Remove(); // ignore errors
532  _TRACE("Deleting " << *itr);
533  }
534  }
535 }
536 
537 void
538 CWriteDB_ConsolidateAliasFiles(bool delete_source_alias_files /* = false */)
539 {
540  list<string> alias_files;
541  // Using "*.[pn]al" as pattern doesn't work
542  FindFiles("*.nal", alias_files, fFF_File);
543  FindFiles("*.pal", alias_files, fFF_File);
544  CWriteDB_ConsolidateAliasFiles(alias_files, delete_source_alias_files);
545 }
546 
547 void CWriteDB_CreateOidMaskDB(const string& input_db,
548  const string & output_db,
549  CWriteDB::ESeqType seq_type,
550  int oid_mask_type,
551  const string & title)
552 {
553  CRef<CSeqDB> seqdb;
554  bool is_protein = seq_type == CWriteDB::eProtein ? true : false;
556  try {
557  seqdb.Reset(new CSeqDB(input_db, t));
558  }
559  catch(CException & e) {
560  NCBI_THROW(CSeqDBException, eArgErr, "Invalid input db");
561  }
562 
563  vector<string> vols;
564  seqdb->FindVolumePaths(vols);
565  if(vols.size() == 0) {
566  NCBI_THROW(CSeqDBException, eArgErr, "no vol found for " + input_db);
567  }
568  seqdb.Reset();
569 
570  string out_ext = is_protein? ".pal":".nal";
571  int num_digits = s_GetNumOfDigits(vols.size());
572  ofstream ofs(output_db + out_ext);
573  ofs << "TITLE " << title <<endl;
574 
575  for (unsigned int i = 0; i < vols.size(); i++) {
576  CSeqDB_Path v_path(vols[i]);
577  string v_basename;
578  v_path.FindBaseName().GetString(v_basename);
579  string DBList = "DBLIST " + v_basename;
580  string OidList = "OIDLIST ";
581  if (oid_mask_type & EOidMaskType::fExcludeModel) {
582  string ex_model_ext = "." + SeqDB_GetOidMaskFileExt(is_protein, EOidMaskType::fExcludeModel);
583  string full_path = vols[i] + ex_model_ext;
584  CFile f(full_path);
585  if (!f.Exists()) {
586  NCBI_THROW(CSeqDBException, eArgErr, "Exclude oid mask file not found for " + vols[i]);
587  }
588  OidList += f.GetName();
589  }
590  CNcbiOstrstream oss;
591  if (vols.size() > 1) {
592  oss << output_db << "." << setfill('0') << setw(num_digits) << i << out_ext;
593  ofstream ovs((string)CNcbiOstrstreamToString(oss));
594  ovs << DBList << endl;
595  ovs << OidList << endl;
596  ovs << "OID_MASK_TYPE " << oid_mask_type << endl;
597  }
598  else {
599  ofs << DBList << endl;
600  ofs << OidList << endl;
601  ofs << "OID_MASK_TYPE " << oid_mask_type << endl;
602  }
603  }
604 
605  if (vols.size() > 1) {
606  CNcbiOstrstream oss;
607  oss << "DBLIST";
608  for (unsigned int i = 0; i < vols.size(); i++) {
609  oss << " " << output_db << "." << setfill('0') << setw(num_digits) << i;
610  }
611  ofs << (string) CNcbiOstrstreamToString(oss) << endl;
612  }
613 
614 
615  Uint8 total_length = 0;
616  int num_seqs = 0;
617  CSeqDB new_db(output_db, t);
618  num_seqs = new_db.GetNumSeqs();
619  total_length = new_db.GetTotalLength();
620 
621  ofs << "NSEQ " << num_seqs << endl;
622  ofs << "LENGTH " << total_length << endl;
623 }
624 
626 
#define true
Definition: bool.h:35
TContainerType m_Ids
List of identifiers to use.
Definition: writedb.hpp:515
CBinaryListBuilder(EIdType id_type)
Construct a list of a given type.
Definition: writedb.cpp:196
void Write(const string &fname)
Write the list to a file.
Definition: writedb.cpp:201
EIdType m_IdType
Whether to use GIs or TIs.
Definition: writedb.hpp:518
EIdType
Identifier types.
Definition: writedb.hpp:465
@ eTi
Trace id.
Definition: writedb.hpp:470
@ eGi
Genomic id.
Definition: writedb.hpp:467
CBioseq_Handle –.
`Blob' Class for SeqDB (and WriteDB).
Definition: seqdbblob.hpp:56
CDirEntry –.
Definition: ncbifile.hpp:262
CFile –.
Definition: ncbifile.hpp:1604
This represents a set of masks for a given sequence.
Definition: writedb.hpp:65
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
CSeqDBException.
Definition: seqdbcommon.hpp:73
CSeqDB_Path.
CSeqDB_Substring FindBaseName() const
Returns the portion of this path containing the base name.
void GetString(string &s) const
Return the data by assigning it to a string.
CSeqDB.
Definition: seqdb.hpp:161
static void FindVolumePaths(const string &dbname, ESeqType seqtype, vector< string > &paths, vector< string > *alias_paths=NULL, bool recursive=true, bool expand_links=true)
Find volume paths.
Definition: seqdb.cpp:1040
Uint8 GetTotalLength() const
Returns the sum of the lengths of all available sequences.
Definition: seqdb.cpp:685
ESeqType
Sequence types (eUnknown tries protein, then nucleotide).
Definition: seqdb.hpp:173
@ eNucleotide
Definition: seqdb.hpp:175
@ eProtein
Definition: seqdb.hpp:174
void GetTotals(ESummaryType sumtype, int *oid_count, Uint8 *total_length, bool use_approx=true) const
Returns the sum of the sequence lengths.
Definition: seqdb.cpp:1110
int GetNumSeqs() const
Returns the number of sequences available.
Definition: seqdb.cpp:670
@ eFilteredAll
Values from alias files, or summation over all included sequences.
Definition: seqdb.hpp:188
CSeqVector –.
Definition: seq_vector.hpp:65
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
CTime –.
Definition: ncbitime.hpp:296
CWriteDBException.
CWriteDB_Impl class.
int CreateColumn(const string &title, bool mbo=false)
Set up a generic CWriteDB metadata column.
void SetMaskedLetters(const string &masked)
Set bases that should not be used in sequences.
void SetPig(int pig)
Set the PIG identifier of this sequence.
void AddColumnMetaData(int col_id, const string &key, const string &value)
Add meta data to a column.
void ListFiles(vector< string > &files)
List Filenames.
void AddSequence(const CTempString &sequence, const CTempString &ambiguities)
Add a new sequence as raw sequence and ambiguity data.
void Close()
Close the file and flush any remaining data to disk.
void SetMaxFileSize(Uint8 sz)
Set the maximum size for any file in the database.
static CRef< CBlast_def_line_set > ExtractBioseqDeflines(const CBioseq &bs, bool parse_ids, bool long_seqids, bool scan_bioseq_4_cfastareader_usrobj=false)
Extract deflines from a CBioseq.
void SetMaskData(const CMaskedRangesVector &ranges, const vector< TGi > &gis)
Set filtering data for a sequence.
void SetDeflines(const CBlast_def_line_set &deflines)
This method replaces any stored header data for the current sequence with the provided CBlast_def_lin...
void ListVolumes(vector< string > &vols)
List Volumes.
int FindColumn(const string &title) const
Find an existing column.
int RegisterMaskAlgorithm(EBlast_filter_program program, const string &options, const string &name="")
Register a type of filtering data found in this database.
CBlastDbBlob & SetBlobData(int col_id)
Get a blob to use for a given column letter.
void SetMaxVolumeLetters(Uint8 sz)
Set the maximum letters in one volume.
int CreateUserColumn(const string &title)
Set up a user-defined CWriteDB column.
Definition: writedb.cpp:180
void SetMaxFileSize(Uint8 sz)
Set maximum size for output files.
Definition: writedb.cpp:118
ESeqType
Sequence types.
Definition: writedb.hpp:95
@ eProtein
Protein database.
Definition: writedb.hpp:97
int RegisterMaskAlgorithm(EBlast_filter_program program, const string &options=string(), const string &name=string())
Register a type of filtering data found in this database.
~CWriteDB()
Destructor.
Definition: writedb.cpp:74
CWriteDB(const string &dbname, ESeqType seqtype, const string &title, int itype=eDefault, bool parse_ids=true, bool long_ids=false, bool use_gi_mask=false, EBlastDbVersion dbver=eBDB_Version4, bool limit_defline=false, Uint8 oid_masks=EOidMaskType::fNone, bool scan_bioseq_4_cfastareader_usrobj=false)
Constructor.
Definition: writedb.cpp:49
void AddColumnMetaData(int col_id, const string &key, const string &value)
Add meta data to a user-defined column.
Definition: writedb.cpp:185
int FindColumn(const string &title) const
Find an existing column.
Definition: writedb.cpp:175
void ListFiles(vector< string > &files)
List Filenames.
Definition: writedb.cpp:146
CBlastDbBlob & SetBlobData(int column_id)
Add blob data to a user-defined column.
Definition: writedb.cpp:190
void SetMaskData(const CMaskedRangesVector &ranges, const vector< TGi > &gis)
Set filtering data for a sequence.
Definition: writedb.cpp:169
void SetPig(int pig)
Set the PIG to be used for the sequence.
Definition: writedb.cpp:99
void AddSequence(const CBioseq &bs)
Add a sequence as a CBioseq.
Definition: writedb.cpp:79
void SetMaxVolumeLetters(Uint8 letters)
Set maximum letters for output volumes.
Definition: writedb.cpp:123
CWriteDB_Impl * m_Impl
Implementation object.
Definition: writedb.hpp:447
EIndexType
Whether and what kind of indices to build.
Definition: writedb.hpp:104
void SetMaskedLetters(const string &masked)
Set letters that should not be used in sequences.
Definition: writedb.cpp:136
void ListVolumes(vector< string > &vols)
List Volumes.
Definition: writedb.cpp:141
static CRef< CBlast_def_line_set > ExtractBioseqDeflines(const CBioseq &bs, bool parse_ids=true, bool long_ids=false, bool scan_bioseq_4_cfastareader_usrobj=false)
Extract Deflines From Bioseq.
Definition: writedb.cpp:129
void SetDeflines(const CBlast_def_line_set &deflines)
Set the deflines to be used for the sequence.
Definition: writedb.cpp:94
void Close()
Close the Database.
Definition: writedb.cpp:104
char value[7]
Definition: config.c:431
const char * file_name[]
std::ofstream out("events_result.xml")
main entry point for tests
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
#define _TRACE(message)
Definition: ncbidbg.hpp:122
#define NCBI_CURRENT_FUNCTION
Get current function name.
Definition: ncbidiag.hpp:142
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
void FindFiles(TPathIterator path_begin, TPathIterator path_end, const vector< string > &masks, TFindFunc &find_func, TFindFiles flags=fFF_Default)
Generic algorithm for file search.
Definition: ncbifile.hpp:3145
virtual bool Remove(TRemoveFlags flags=eRecursive) const
Remove a directory entry.
Definition: ncbifile.cpp:2595
static string GetCwd(void)
Get the current working directory.
Definition: ncbifile.cpp:3708
string GetName(void) const
Get the base entry name with extension (if any).
Definition: ncbifile.hpp:3916
@ fFF_File
find files
Definition: ncbifile.hpp:3008
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
#define kEmptyStr
Definition: ncbistr.hpp:123
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3197
const char * data(void) const
Return a pointer to the array represented.
Definition: tempstr.hpp:313
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5411
size_type length(void) const
Return the length of the represented array.
Definition: tempstr.hpp:320
string AsString(const CTimeFormat &format=kEmptyStr, TSeconds out_tz=eCurrentTimeZone) const
Transform time to string.
Definition: ncbitime.cpp:1511
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
EBlast_filter_program
This defines the possible sequence filtering algorithms to be used in a BLAST database.
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:6929
int i
yy_size_t n
constexpr auto sort(_Init &&init)
const struct ncbi::grid::netcache::search::fields::KEY key
unsigned int a
Definition: ncbi_localip.c:102
EIPRangeType t
Definition: ncbi_localip.c:101
std::istream & in(std::istream &in_, double &x_)
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
Defines BLAST database access classes.
const string kSeqDBGroupAliasFileName
The name of the group alias file name expected at each directory For more documentation,...
const string SeqDB_GetOidMaskFileExt(bool db_is_protein, EOidMaskType t)
EBlastDbVersion
BLAST database version.
Definition: seqdbcommon.hpp:51
string SeqDB_ResolveDbPathNoExtension(const string &filename, char dbtype='-')
Resolve a file path using SeqDB's path algorithms.
@ fExcludeModel
This file defines several SeqDB utility functions related to byte order and file system portability.
#define _ASSERT
static bool ambig(char c)
static string s_AliasFileFilterTypeToString(EAliasFileFilterType e)
Auxiliary function to convert the enumeration into a string.
Definition: writedb.cpp:343
int s_GetNumOfDigits(int n)
Definition: writedb.cpp:419
static bool s_ComputeNumSequencesAndDbLength(const string &dbname, bool is_prot, Uint8 *dbsize, int *num_seqs_found)
Computes the number of sequences and (alias) database length for alias files.
Definition: writedb.cpp:284
void CWriteDB_CreateOidMaskDB(const string &input_db, const string &output_db, CWriteDB::ESeqType seq_type, int oid_mask_type, const string &title)
Definition: writedb.cpp:547
static void s_PrintAliasFileCreationLog(const string &dbname, bool is_protein, int num_seqs_found, const string &gi_file_name=kEmptyStr, int num_seqs_in_gifile=0)
Definition: writedb.cpp:306
void CWriteDB_CreateAliasFile(const string &file_name, const string &db_name, CWriteDB::ESeqType seq_type, const string &gi_file_name, const string &title, EAliasFileFilterType alias_type)
Definition: writedb.cpp:330
void CWriteDB_ConsolidateAliasFiles(const list< string > &alias_files, bool delete_source_alias_files)
Definition: writedb.cpp:496
static void s_CreateAliasFilePriv(const string &file_name, const vector< string > &databases, CWriteDB::ESeqType seq_type, const string &gi_file_name, const string &title, EAliasFileFilterType alias_type, const TSeqRange *oid_range=NULL)
Definition: writedb.cpp:358
static bool s_DoesBlastDbExist(const string &dbname, bool is_protein)
Returns true if the BLAST DB exists, otherwise throws a CSeqDBException.
Definition: writedb.cpp:261
Defines BLAST database construction classes.
EAliasFileFilterType
Defines the possible filtering types that can be applied to an alias file.
Definition: writedb.hpp:609
@ eTiList
Filter a BLAST database via TIs (Trace IDs)
Definition: writedb.hpp:612
@ eSeqIdList
Filter a BLAST database via a Seq-id list.
Definition: writedb.hpp:613
@ eTaxIdList
Filter a BLAST database via Taxonomy Id list.
Definition: writedb.hpp:614
@ eGiList
Filter a BLAST database via GIs.
Definition: writedb.hpp:611
@ eNoAliasFilterType
Sentinel value.
Definition: writedb.hpp:610
Data conversion tools for CWriteDB and associated code.
void s_WriteInt8BE(ostream &str, Uint8 x)
Write an eight byte integer to a stream in big-endian format.
void s_WriteInt4(ostream &str, int x)
Write a four byte integer to a stream in big endian format.
Defines implementation class of WriteDB.
Modified on Tue Nov 28 02:29:25 2023 by modify_doxy.py rev. 669887