NCBI C++ ToolKit
writedb.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: writedb.cpp 101963 2024-03-12 13:43:08Z zaretska $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Kevin Bealer
27  *
28  */
29 
30 /// @file writedb.cpp
31 /// Implementation for the CWriteDB class, the top level class for WriteDB.
32 #include <ncbi_pch.hpp>
36 #include "writedb_impl.hpp"
38 #include <iostream>
39 
41 
42 using namespace std;
43 
44 // Impl
45 
46 
47 // CWriteDB
48 
49 CWriteDB::CWriteDB(const string & dbname,
50  CWriteDB::ESeqType seqtype,
51  const string & title,
52  int indices,
53  bool parse_ids,
54  bool long_ids,
55  bool use_gi_mask,
56  EBlastDbVersion dbver,
57  bool limit_defline,
58  Uint8 oid_masks,
59  bool scan_bioseq_4_cfastareader_usrobj)
60  : m_Impl(0)
61 {
63  seqtype == eProtein,
64  title,
65  (EIndexType)indices,
66  parse_ids,
67  long_ids,
68  use_gi_mask,
69  dbver,
70  limit_defline,
71  oid_masks);
72 }
73 
75 {
76  delete m_Impl;
77 }
78 
80 {
81  m_Impl->AddSequence(bs);
82 }
83 
85 {
86  m_Impl->AddSequence(bsh);
87 }
88 
90 {
91  m_Impl->AddSequence(bs, sv);
92 }
93 
95 {
96  m_Impl->SetDeflines(deflines);
97 }
98 
99 void CWriteDB::SetPig(int pig)
100 {
101  m_Impl->SetPig(pig);
102 }
103 
105 {
106  m_Impl->Close();
107 }
108 
109 void CWriteDB::AddSequence(const CTempString & sequence,
110  const CTempString & ambig)
111 {
112  string s(sequence.data(), sequence.length());
113  string a(ambig.data(), ambig.length());
114 
115  m_Impl->AddSequence(s, a);
116 }
117 
119 {
120  m_Impl->SetMaxFileSize(sz);
121 }
122 
124 {
126 }
127 
129 CWriteDB::ExtractBioseqDeflines(const CBioseq & bs, bool parse_ids,
130  bool long_ids,
131  bool scan_bioseq_4_cfastareader_usrobj)
132 {
133  return CWriteDB_Impl::ExtractBioseqDeflines(bs, parse_ids, long_ids, scan_bioseq_4_cfastareader_usrobj);
134 }
135 
136 void CWriteDB::SetMaskedLetters(const string & masked)
137 {
138  m_Impl->SetMaskedLetters(masked);
139 }
140 
141 void CWriteDB::ListVolumes(vector<string> & vols)
142 {
143  m_Impl->ListVolumes(vols);
144 }
145 
146 void CWriteDB::ListFiles(vector<string> & files)
147 {
148  m_Impl->ListFiles(files);
149 }
150 
151 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
152  (!defined(NCBI_COMPILER_MIPSPRO)) )
153 int CWriteDB::
155  const string & options,
156  const string & name)
157 {
158  return m_Impl->RegisterMaskAlgorithm(program, options, name);
159 }
160 
161 int CWriteDB::
162 RegisterMaskAlgorithm(const string & id,
163  const string & description,
164  const string & options)
165 {
166  return m_Impl->RegisterMaskAlgorithm(id, description, options);
167 }
168 
170  const vector<TGi> & gis)
171 {
172  m_Impl->SetMaskData(ranges, gis);
173 }
174 
175 int CWriteDB::FindColumn(const string & title) const
176 {
177  return m_Impl->FindColumn(title);
178 }
179 
180 int CWriteDB::CreateUserColumn(const string & title)
181 {
182  return m_Impl->CreateColumn(title);
183 }
184 
185 void CWriteDB::AddColumnMetaData(int col_id, const string & key, const string & value)
186 {
187  m_Impl->AddColumnMetaData(col_id, key, value);
188 }
189 
191 {
192  return m_Impl->SetBlobData(col_id);
193 }
194 #endif
195 
197  : m_IdType(id_type)
198 {
199 }
200 
201 void CBinaryListBuilder::Write(const string & fname)
202 {
203  // Create a binary stream.
204  ofstream outp(fname.c_str(), ios::binary);
205  Write(outp);
206 }
207 
209 {
210  // Header; first check for 8 byte ids.
211 
212  bool eight = false;
213 
214  ITERATE(vector<Int8>, iter, m_Ids) {
215  Int8 id = *iter;
216  _ASSERT(id > 0);
217 
218  if ((id >> 32) != 0) {
219  eight = true;
220  break;
221  }
222  }
223 
224  Int4 magic = 0;
225 
226  switch(m_IdType) {
227  case eGi:
228  magic = eight ? -2 : -1;
229  break;
230 
231  case eTi:
232  magic = eight ? -4 : -3;
233  break;
234 
235  default:
237  eArgErr,
238  "Error: Unsupported ID type specified.");
239  }
240 
241  s_WriteInt4(outp, magic);
242  s_WriteInt4(outp, (int)m_Ids.size());
243 
244  sort(m_Ids.begin(), m_Ids.end());
245 
246  if (eight) {
247  ITERATE(vector<Int8>, iter, m_Ids) {
248  s_WriteInt8BE(outp, *iter);
249  }
250  } else {
251  ITERATE(vector<Int8>, iter, m_Ids) {
252  s_WriteInt4(outp, (int)*iter);
253  }
254  }
255 }
256 
257 /// Returns true if the BLAST DB exists, otherwise throws a CSeqDBException
258 /// @param dbname name of BLAST DB [in]
259 /// @param is_prot is the BLAST DB protein? [in]
260 static bool
261 s_DoesBlastDbExist(const string& dbname, bool is_protein)
262 {
263  char dbtype(is_protein ? 'p' : 'n');
264  string path = SeqDB_ResolveDbPathNoExtension(dbname, dbtype);
265  if (path.empty()) {
266  string msg("Failed to find ");
267  msg += (is_protein ? "protein " : "nucleotide ");
268  msg += dbname + " BLAST database";
269  NCBI_THROW(CSeqDBException, eFileErr, msg);
270  }
271  return true;
272 }
273 
274 /// Computes the number of sequences and (alias) database length for alias
275 /// files
276 /// @param dbname Name of the BLAST database over which the alias file is being
277 /// created [in]
278 /// @param is_prot is the BLAST database protein? [in]
279 /// @param dbsize (Approximate) number of letters in the BLAST DB [out]
280 /// @param num_seqs_found Number of sequences found in the dbname, or the
281 /// number of sequences in the intersection between the dbname and the GIs in
282 /// the gi_file_name (if applicable) [out]
283 static bool
285  bool is_prot,
286  Uint8* dbsize,
287  int* num_seqs_found)
288 {
289  _ASSERT((dbsize != NULL));
290  _ASSERT(num_seqs_found != NULL);
291  *dbsize = 0u;
292  *num_seqs_found = 0u;
293 
295  try {
296  _TRACE("Attempting to compute length for '" << dbname << "'");
297  CRef<CSeqDB> dbhandle(new CSeqDB(dbname, dbtype));
298  dbhandle->GetTotals(CSeqDB::eFilteredAll, num_seqs_found, dbsize, false);
299  } catch(...) {
300  return false;
301  }
302  return true;
303 }
304 
305 static void
307  bool is_protein,
308  int num_seqs_found,
309  const string& gi_file_name = kEmptyStr,
310  int num_seqs_in_gifile = 0)
311 {
312  if ( !gi_file_name.empty() ) {
313  /* This won't work if the target directory is not the current working directory
314  CRef<CSeqDBFileGiList> gilist;
315  gilist.Reset(new CSeqDBFileGiList(gi_file_name));
316  num_seqs_in_gifile = gilist->Size();
317  } */
318  LOG_POST("Created " << (is_protein ? "protein " : "nucleotide ") <<
319  dbname << " BLAST (alias) database with " << num_seqs_found
320  << " sequences (out of " << num_seqs_in_gifile << " in "
321  << gi_file_name << ", " << setprecision(0) << fixed <<
322  (num_seqs_found*100.0/num_seqs_in_gifile) << "% found)");
323  } else {
324  LOG_POST("Created " << (is_protein ? "protein " : "nucleotide ") <<
325  "BLAST (alias) database " << dbname << " with " <<
326  num_seqs_found << " sequences");
327  }
328 }
329 
331  const string& db_name,
332  CWriteDB::ESeqType seq_type,
333  const string& gi_file_name,
334  const string& title,
335  EAliasFileFilterType alias_type /*= eGiList*/)
336 {
337  vector<string> db;
338  NStr::Split(db_name," ",db);
339  CWriteDB_CreateAliasFile(file_name, db, seq_type, gi_file_name, title,
340  alias_type);
341 }
342 
343 /// Auxiliary function to convert the enumeration into a string
345 {
346  string retval;
347  switch (e) {
348  case eGiList: retval = "GILIST"; break;
349  case eTiList: retval = "TILIST"; break;
350  case eSeqIdList: retval = "SEQIDLIST"; break;
351  case eTaxIdList: retval = "TAXIDLIST"; break;
352  case eNoAliasFilterType: break;
353  default: _ASSERT(false); /* Need to add a type here? */
354  }
355  return retval;
356 }
357 
358 static void
360  const vector<string>& databases,
361  CWriteDB::ESeqType seq_type,
362  const string& gi_file_name,
363  const string& title,
364  EAliasFileFilterType alias_type,
365  const TSeqRange* oid_range = NULL)
366 {
367  bool is_prot(seq_type == CWriteDB::eProtein ? true : false);
368  Uint8 dbsize = 0;
369  int num_seqs = 0;
370  CNcbiOstrstream fnamestr;
371  fnamestr << file_name << (is_prot ? ".pal" : ".nal");
372  string fname = CNcbiOstrstreamToString(fnamestr);
373 
374  ofstream out(fname.c_str());
375  out << "#\n# Alias file created " << CTime(CTime::eCurrent).AsString()
376  << "\n#\n";
377 
378  if ( !title.empty() ) {
379  out << "TITLE " << title << "\n";
380  }
381  out << "DBLIST ";
382  ITERATE(vector< string >, iter, databases) {
383  out << "\"" << *iter << "\" ";
384  }
385  out << "\n";
386  if ( !gi_file_name.empty() ) {
387  _ASSERT(alias_type != eNoAliasFilterType);
388  out << s_AliasFileFilterTypeToString(alias_type) << " "
389  << gi_file_name << "\n";
390  } else if (oid_range) {
391  out << "FIRST_OID " << oid_range->GetFrom() << "\n"
392  << "LAST_OID " << oid_range->GetToOpen() << "\n";
393  }
394  out.close();
395 
396  if (!s_ComputeNumSequencesAndDbLength(file_name, is_prot, &dbsize, &num_seqs)){
397  CDirEntry(fname).Remove();
398  _TRACE("Deleting " << fname);
399  string msg("BLASTDB alias file creation failed. Some referenced files may be missing");
400  NCBI_THROW(CSeqDBException, eArgErr, msg);
401  };
402  if (num_seqs == 0) {
403  CDirEntry(fname).Remove();
404  _TRACE("Deleting " << fname);
405  CNcbiOstrstream oss;
406  oss << "No seqs in " << s_AliasFileFilterTypeToString(alias_type) << " were found"
407  << " in BLAST database";
408  string msg = CNcbiOstrstreamToString(oss);
409  NCBI_THROW(CSeqDBException, eArgErr, msg);
410  }
411 
412  out.open(fname.c_str(), ios::out|ios::app);
413  out << "NSEQ " << num_seqs << "\n";
414  out << "LENGTH " << dbsize << "\n";
415  out.close();
416 
417  s_PrintAliasFileCreationLog(file_name, is_prot, num_seqs);
418 }
419 
421 {
422  int num_digits = 0;
423  while (n) {
424  n/=10;
425  num_digits ++;
426  }
427 
428  return (num_digits >2) ? num_digits: 2;
429 }
430 
431 
433  unsigned int num_volumes,
434  CWriteDB::ESeqType seq_type,
435  const string& title)
436 {
437  bool is_prot(seq_type == CWriteDB::eProtein ? true : false);
438  string concatenated_blastdb_name;
439  vector<string> volume_names(num_volumes, kEmptyStr);
440  int num_digits = s_GetNumOfDigits(num_volumes);
441  for (unsigned int i = 0; i < num_volumes; i++) {
442  CNcbiOstrstream oss;
443  oss << file_name << "." << setfill('0') << setw(num_digits) << i;
444  const string vol_name((string)CNcbiOstrstreamToString(oss));
445  s_DoesBlastDbExist(vol_name, is_prot);
446  volume_names.push_back(vol_name);
447  concatenated_blastdb_name += vol_name + " ";
448  }
449 
450  Uint8 dbsize = 0;
451  int num_seqs = 0;
452  s_ComputeNumSequencesAndDbLength(concatenated_blastdb_name, is_prot,
453  &dbsize, &num_seqs);
454  CNcbiOstrstream fname;
455  fname << file_name << (is_prot ? ".pal" : ".nal");
456 
457  ofstream out(((string)CNcbiOstrstreamToString(fname)).c_str());
458  out << "#\n# Alias file created " << CTime(CTime::eCurrent).AsString()
459  << "\n#\n";
460 
461  if ( !title.empty() ) {
462  out << "TITLE " << title << "\n";
463  }
464 
465  out << "DBLIST ";
466  ITERATE(vector<string>, itr, volume_names) {
467  out << CDirEntry(*itr).GetName() << " ";
468  }
469  out << "\n";
470  out << "NSEQ " << num_seqs << "\n";
471  out << "LENGTH " << dbsize << "\n";
472  out.close();
473  s_PrintAliasFileCreationLog(concatenated_blastdb_name, is_prot, num_seqs);
474 }
475 
477  const vector<string>& databases,
478  CWriteDB::ESeqType seq_type,
479  const string& gi_file_name,
480  const string& title,
481  EAliasFileFilterType alias_type /*= eGiList*/)
482 {
483  s_CreateAliasFilePriv(file_name, databases, seq_type, gi_file_name, title, alias_type);
484 }
485 
487  const vector<string>& db_names,
488  CWriteDB::ESeqType seq_type,
489  const TSeqRange& oid_range,
490  const string& title /*= string()*/)
491 {
492  s_CreateAliasFilePriv(file_name, db_names, seq_type, kEmptyStr, title,
493  eNoAliasFilterType, &oid_range);
494 }
495 
496 void
497 CWriteDB_ConsolidateAliasFiles(const list<string>& alias_files,
498  bool delete_source_alias_files /* = false */)
499 {
500  if (alias_files.empty()) {
502  eArgErr,
503  "No alias files available to create group alias file.");
504  }
505 
506  ofstream out(kSeqDBGroupAliasFileName.c_str());
507  out << "# Alias file index for " << CDir::GetCwd() << endl;
508  out << "# Generated on " << CTime(CTime::eCurrent).AsString() << " by "
509  << NCBI_CURRENT_FUNCTION << endl;
510  out << "#" << endl;
511 
512  ITERATE(list<string>, itr, alias_files) {
513  ifstream in(itr->c_str());
514  if ( !in ) {
515  LOG_POST(Warning << *itr << " does not exist, omitting from group alias file");
516  continue;
517  }
518  out << "ALIAS_FILE " << CFile(*itr).GetName() << endl;
519  string line;
520  while (getline(in, line)) {
522  if (line.empty() || NStr::StartsWith(line, "#")) {
523  continue;
524  }
525  out << line << endl;
526  }
527  out << endl;
528  }
529 
530  if (delete_source_alias_files) {
531  ITERATE(list<string>, itr, alias_files) {
532  CFile(*itr).Remove(); // ignore errors
533  _TRACE("Deleting " << *itr);
534  }
535  }
536 }
537 
538 void
539 CWriteDB_ConsolidateAliasFiles(bool delete_source_alias_files /* = false */)
540 {
541  list<string> alias_files;
542  // Using "*.[pn]al" as pattern doesn't work
543  FindFiles("*.nal", alias_files, fFF_File);
544  FindFiles("*.pal", alias_files, fFF_File);
545  CWriteDB_ConsolidateAliasFiles(alias_files, delete_source_alias_files);
546 }
547 
548 void CWriteDB_CreateOidMaskDB(const string& input_db,
549  const string & output_db,
550  CWriteDB::ESeqType seq_type,
551  int oid_mask_type,
552  const string & title)
553 {
554  CRef<CSeqDB> seqdb;
555  bool is_protein = seq_type == CWriteDB::eProtein ? true : false;
557  try {
558  seqdb.Reset(new CSeqDB(input_db, t));
559  }
560  catch(CException & e) {
561  NCBI_THROW(CSeqDBException, eArgErr, "Invalid input db");
562  }
563 
564  vector<string> vols;
565  seqdb->FindVolumePaths(vols);
566  if(vols.size() == 0) {
567  NCBI_THROW(CSeqDBException, eArgErr, "no vol found for " + input_db);
568  }
569  seqdb.Reset();
570 
571  string out_ext = is_protein? ".pal":".nal";
572  int num_digits = s_GetNumOfDigits(vols.size());
573  ofstream ofs(output_db + out_ext);
574  ofs << "TITLE " << title <<endl;
575 
576  for (unsigned int i = 0; i < vols.size(); i++) {
577  CSeqDB_Path v_path(vols[i]);
578  string v_basename;
579  v_path.FindBaseName().GetString(v_basename);
580  string DBList = "DBLIST " + v_basename;
581  string OidList = "OIDLIST ";
582  if (oid_mask_type & EOidMaskType::fExcludeModel) {
583  string ex_model_ext = "." + SeqDB_GetOidMaskFileExt(is_protein, EOidMaskType::fExcludeModel);
584  string full_path = vols[i] + ex_model_ext;
585  CFile f(full_path);
586  if (!f.Exists()) {
587  NCBI_THROW(CSeqDBException, eArgErr, "Exclude oid mask file not found for " + vols[i]);
588  }
589  OidList += f.GetName();
590  }
591  CNcbiOstrstream oss;
592  if (vols.size() > 1) {
593  oss << output_db << "." << setfill('0') << setw(num_digits) << i << out_ext;
594  ofstream ovs((string)CNcbiOstrstreamToString(oss));
595  ovs << DBList << endl;
596  ovs << OidList << endl;
597  ovs << "OID_MASK_TYPE " << oid_mask_type << endl;
598  }
599  else {
600  ofs << DBList << endl;
601  ofs << OidList << endl;
602  ofs << "OID_MASK_TYPE " << oid_mask_type << endl;
603  }
604  }
605 
606  if (vols.size() > 1) {
607  CNcbiOstrstream oss;
608  oss << "DBLIST";
609  for (unsigned int i = 0; i < vols.size(); i++) {
610  oss << " " << output_db << "." << setfill('0') << setw(num_digits) << i;
611  }
612  ofs << (string) CNcbiOstrstreamToString(oss) << endl;
613  }
614 
615 
616  Uint8 total_length = 0;
617  int num_seqs = 0;
618  CSeqDB new_db(output_db, t);
619  num_seqs = new_db.GetNumSeqs();
620  total_length = new_db.GetTotalLength();
621 
622  ofs << "NSEQ " << num_seqs << endl;
623  ofs << "LENGTH " << total_length << endl;
624 }
625 
627 
TContainerType m_Ids
List of identifiers to use.
Definition: writedb.hpp:515
CBinaryListBuilder(EIdType id_type)
Construct a list of a given type.
Definition: writedb.cpp:196
void Write(const string &fname)
Write the list to a file.
Definition: writedb.cpp:201
EIdType m_IdType
Whether to use GIs or TIs.
Definition: writedb.hpp:518
EIdType
Identifier types.
Definition: writedb.hpp:465
@ eTi
Trace id.
Definition: writedb.hpp:470
@ eGi
Genomic id.
Definition: writedb.hpp:467
CBioseq_Handle –.
`Blob' Class for SeqDB (and WriteDB).
Definition: seqdbblob.hpp:56
CDirEntry –.
Definition: ncbifile.hpp:262
CFile –.
Definition: ncbifile.hpp:1605
This represents a set of masks for a given sequence.
Definition: writedb.hpp:65
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string. Sample usage:
Definition: ncbistre.hpp:802
CSeqDBException.
Definition: seqdbcommon.hpp:73
CSeqDB_Path.
CSeqDB_Substring FindBaseName() const
Returns the portion of this path containing the base name.
void GetString(string &s) const
Return the data by assigning it to a string.
CSeqDB.
Definition: seqdb.hpp:161
static void FindVolumePaths(const string &dbname, ESeqType seqtype, vector< string > &paths, vector< string > *alias_paths=NULL, bool recursive=true, bool expand_links=true)
Find volume paths.
Definition: seqdb.cpp:1040
Uint8 GetTotalLength() const
Returns the sum of the lengths of all available sequences.
Definition: seqdb.cpp:685
ESeqType
Sequence types (eUnknown tries protein, then nucleotide).
Definition: seqdb.hpp:173
@ eNucleotide
Definition: seqdb.hpp:175
@ eProtein
Definition: seqdb.hpp:174
void GetTotals(ESummaryType sumtype, int *oid_count, Uint8 *total_length, bool use_approx=true) const
Returns the sum of the sequence lengths.
Definition: seqdb.cpp:1110
int GetNumSeqs() const
Returns the number of sequences available.
Definition: seqdb.cpp:670
@ eFilteredAll
Values from alias files, or summation over all included sequences.
Definition: seqdb.hpp:188
CSeqVector –.
Definition: seq_vector.hpp:65
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
CTime –.
Definition: ncbitime.hpp:296
CWriteDBException.
CWriteDB_Impl class.
int CreateColumn(const string &title, bool mbo=false)
Set up a generic CWriteDB metadata column.
void SetMaskedLetters(const string &masked)
Set bases that should not be used in sequences.
void SetPig(int pig)
Set the PIG identifier of this sequence.
void AddColumnMetaData(int col_id, const string &key, const string &value)
Add meta data to a column.
void ListFiles(vector< string > &files)
List Filenames.
void AddSequence(const CTempString &sequence, const CTempString &ambiguities)
Add a new sequence as raw sequence and ambiguity data.
void Close()
Close the file and flush any remaining data to disk.
void SetMaxFileSize(Uint8 sz)
Set the maximum size for any file in the database.
static CRef< CBlast_def_line_set > ExtractBioseqDeflines(const CBioseq &bs, bool parse_ids, bool long_seqids, bool scan_bioseq_4_cfastareader_usrobj=false)
Extract deflines from a CBioseq.
void SetMaskData(const CMaskedRangesVector &ranges, const vector< TGi > &gis)
Set filtering data for a sequence.
void SetDeflines(const CBlast_def_line_set &deflines)
This method replaces any stored header data for the current sequence with the provided CBlast_def_lin...
void ListVolumes(vector< string > &vols)
List Volumes.
int FindColumn(const string &title) const
Find an existing column.
int RegisterMaskAlgorithm(EBlast_filter_program program, const string &options, const string &name="")
Register a type of filtering data found in this database.
CBlastDbBlob & SetBlobData(int col_id)
Get a blob to use for a given column letter.
void SetMaxVolumeLetters(Uint8 sz)
Set the maximum letters in one volume.
int CreateUserColumn(const string &title)
Set up a user-defined CWriteDB column.
Definition: writedb.cpp:180
void SetMaxFileSize(Uint8 sz)
Set maximum size for output files.
Definition: writedb.cpp:118
ESeqType
Sequence types.
Definition: writedb.hpp:95
@ eProtein
Protein database.
Definition: writedb.hpp:97
int RegisterMaskAlgorithm(EBlast_filter_program program, const string &options=string(), const string &name=string())
Register a type of filtering data found in this database.
~CWriteDB()
Destructor.
Definition: writedb.cpp:74
CWriteDB(const string &dbname, ESeqType seqtype, const string &title, int itype=eDefault, bool parse_ids=true, bool long_ids=false, bool use_gi_mask=false, EBlastDbVersion dbver=eBDB_Version4, bool limit_defline=false, Uint8 oid_masks=EOidMaskType::fNone, bool scan_bioseq_4_cfastareader_usrobj=false)
Constructor.
Definition: writedb.cpp:49
void AddColumnMetaData(int col_id, const string &key, const string &value)
Add meta data to a user-defined column.
Definition: writedb.cpp:185
int FindColumn(const string &title) const
Find an existing column.
Definition: writedb.cpp:175
void ListFiles(vector< string > &files)
List Filenames.
Definition: writedb.cpp:146
CBlastDbBlob & SetBlobData(int column_id)
Add blob data to a user-defined column.
Definition: writedb.cpp:190
void SetMaskData(const CMaskedRangesVector &ranges, const vector< TGi > &gis)
Set filtering data for a sequence.
Definition: writedb.cpp:169
void SetPig(int pig)
Set the PIG to be used for the sequence.
Definition: writedb.cpp:99
void AddSequence(const CBioseq &bs)
Add a sequence as a CBioseq.
Definition: writedb.cpp:79
void SetMaxVolumeLetters(Uint8 letters)
Set maximum letters for output volumes.
Definition: writedb.cpp:123
CWriteDB_Impl * m_Impl
Implementation object.
Definition: writedb.hpp:447
EIndexType
Whether and what kind of indices to build.
Definition: writedb.hpp:104
void SetMaskedLetters(const string &masked)
Set letters that should not be used in sequences.
Definition: writedb.cpp:136
void ListVolumes(vector< string > &vols)
List Volumes.
Definition: writedb.cpp:141
static CRef< CBlast_def_line_set > ExtractBioseqDeflines(const CBioseq &bs, bool parse_ids=true, bool long_ids=false, bool scan_bioseq_4_cfastareader_usrobj=false)
Extract Deflines From Bioseq.
Definition: writedb.cpp:129
void SetDeflines(const CBlast_def_line_set &deflines)
Set the deflines to be used for the sequence.
Definition: writedb.cpp:94
void Close()
Close the Database.
Definition: writedb.cpp:104
const char * file_name[]
std::ofstream out("events_result.xml")
main entry point for tests
#define true
Definition: bool.h:35
static FILE * f
Definition: readconf.c:23
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
string
Definition: cgiapp.hpp:690
#define NULL
Definition: ncbistd.hpp:225
#define _TRACE(message)
Definition: ncbidbg.hpp:122
#define NCBI_CURRENT_FUNCTION
Get current function name.
Definition: ncbidiag.hpp:142
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
void FindFiles(TPathIterator path_begin, TPathIterator path_end, const vector< string > &masks, TFindFunc &find_func, TFindFiles flags=fFF_Default)
Generic algorithm for file search.
Definition: ncbifile.hpp:3146
virtual bool Remove(TRemoveFlags flags=eRecursive) const
Remove a directory entry.
Definition: ncbifile.cpp:2595
static string GetCwd(void)
Get the current working directory.
Definition: ncbifile.cpp:3708
string GetName(void) const
Get the base entry name with extension (if any).
Definition: ncbifile.hpp:3917
@ fFF_File
find files
Definition: ncbifile.hpp:3009
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
#define kEmptyStr
Definition: ncbistr.hpp:123
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3452
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string (in-place)
Definition: ncbistr.cpp:3192
const char * data(void) const
Return a pointer to the array represented.
Definition: tempstr.hpp:313
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5406
size_type length(void) const
Return the length of the represented array.
Definition: tempstr.hpp:320
string AsString(const CTimeFormat &format=kEmptyStr, TSeconds out_tz=eCurrentTimeZone) const
Transform time to string.
Definition: ncbitime.cpp:1512
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
EBlast_filter_program
This defines the possible sequence filtering algorithms to be used in a BLAST database.
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:6929
int i
yy_size_t n
constexpr auto sort(_Init &&init)
const struct ncbi::grid::netcache::search::fields::KEY key
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
unsigned int a
Definition: ncbi_localip.c:102
EIPRangeType t
Definition: ncbi_localip.c:101
std::istream & in(std::istream &in_, double &x_)
Defines BLAST database access classes.
const string kSeqDBGroupAliasFileName
The name of the group alias file name expected at each directory For more documentation,...
const string SeqDB_GetOidMaskFileExt(bool db_is_protein, EOidMaskType t)
EBlastDbVersion
BLAST database version.
Definition: seqdbcommon.hpp:51
string SeqDB_ResolveDbPathNoExtension(const string &filename, char dbtype='-')
Resolve a file path using SeqDB's path algorithms.
@ fExcludeModel
This file defines several SeqDB utility functions related to byte order and file system portability.
static SLJIT_INLINE sljit_ins msg(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
#define _ASSERT
static bool ambig(char c)
static string s_AliasFileFilterTypeToString(EAliasFileFilterType e)
Auxiliary function to convert the enumeration into a string.
Definition: writedb.cpp:344
int s_GetNumOfDigits(int n)
Definition: writedb.cpp:420
static bool s_ComputeNumSequencesAndDbLength(const string &dbname, bool is_prot, Uint8 *dbsize, int *num_seqs_found)
Computes the number of sequences and (alias) database length for alias files.
Definition: writedb.cpp:284
void CWriteDB_CreateOidMaskDB(const string &input_db, const string &output_db, CWriteDB::ESeqType seq_type, int oid_mask_type, const string &title)
Definition: writedb.cpp:548
static void s_PrintAliasFileCreationLog(const string &dbname, bool is_protein, int num_seqs_found, const string &gi_file_name=kEmptyStr, int num_seqs_in_gifile=0)
Definition: writedb.cpp:306
void CWriteDB_CreateAliasFile(const string &file_name, const string &db_name, CWriteDB::ESeqType seq_type, const string &gi_file_name, const string &title, EAliasFileFilterType alias_type)
Definition: writedb.cpp:330
void CWriteDB_ConsolidateAliasFiles(const list< string > &alias_files, bool delete_source_alias_files)
Definition: writedb.cpp:497
static void s_CreateAliasFilePriv(const string &file_name, const vector< string > &databases, CWriteDB::ESeqType seq_type, const string &gi_file_name, const string &title, EAliasFileFilterType alias_type, const TSeqRange *oid_range=NULL)
Definition: writedb.cpp:359
static bool s_DoesBlastDbExist(const string &dbname, bool is_protein)
Returns true if the BLAST DB exists, otherwise throws a CSeqDBException.
Definition: writedb.cpp:261
Defines BLAST database construction classes.
EAliasFileFilterType
Defines the possible filtering types that can be applied to an alias file.
Definition: writedb.hpp:609
@ eTiList
Filter a BLAST database via TIs (Trace IDs)
Definition: writedb.hpp:612
@ eSeqIdList
Filter a BLAST database via a Seq-id list.
Definition: writedb.hpp:613
@ eTaxIdList
Filter a BLAST database via Taxonomy Id list.
Definition: writedb.hpp:614
@ eGiList
Filter a BLAST database via GIs.
Definition: writedb.hpp:611
@ eNoAliasFilterType
Sentinel value.
Definition: writedb.hpp:610
Data conversion tools for CWriteDB and associated code.
void s_WriteInt8BE(ostream &str, Uint8 x)
Write an eight byte integer to a stream in big-endian format.
void s_WriteInt4(ostream &str, int x)
Write a four byte integer to a stream in big endian format.
Defines implementation class of WriteDB.
Modified on Fri Sep 20 14:58:04 2024 by modify_doxy.py rev. 669887