1 /* $Id: writedb.cpp 101963 2024-03-12 13:43:08Z zaretska $
2  * ===========================================================================
3  *
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Kevin Bealer
27  *
28  */
30 /// @file writedb.cpp
31 /// Implementation for the CWriteDB class, the top level class for WriteDB.
32 #include <ncbi_pch.hpp>
36 #include "writedb_impl.hpp"
38 #include <iostream>
42 using namespace std;
44 // Impl
47 // CWriteDB
49 CWriteDB::CWriteDB(const string & dbname,
50  CWriteDB::ESeqType seqtype,
51  const string & title,
52  int indices,
53  bool parse_ids,
54  bool long_ids,
55  bool use_gi_mask,
56  EBlastDbVersion dbver,
57  bool limit_defline,
58  Uint8 oid_masks,
59  bool scan_bioseq_4_cfastareader_usrobj)
60  : m_Impl(0)
61 {
63  seqtype == eProtein,
64  title,
65  (EIndexType)indices,
66  parse_ids,
67  long_ids,
68  use_gi_mask,
69  dbver,
70  limit_defline,
71  oid_masks);
72 }
75 {
76  delete m_Impl;
77 }
80 {
81  m_Impl->AddSequence(bs);
82 }
85 {
86  m_Impl->AddSequence(bsh);
87 }
90 {
91  m_Impl->AddSequence(bs, sv);
92 }
95 {
96  m_Impl->SetDeflines(deflines);
97 }
99 void CWriteDB::SetPig(int pig)
100 {
101  m_Impl->SetPig(pig);
102 }
105 {
106  m_Impl->Close();
107 }
109 void CWriteDB::AddSequence(const CTempString & sequence,
110  const CTempString & ambig)
111 {
112  string s(, sequence.length());
113  string a(, ambig.length());
115  m_Impl->AddSequence(s, a);
116 }
119 {
120  m_Impl->SetMaxFileSize(sz);
121 }
124 {
126 }
129 CWriteDB::ExtractBioseqDeflines(const CBioseq & bs, bool parse_ids,
130  bool long_ids,
131  bool scan_bioseq_4_cfastareader_usrobj)
132 {
133  return CWriteDB_Impl::ExtractBioseqDeflines(bs, parse_ids, long_ids, scan_bioseq_4_cfastareader_usrobj);
134 }
136 void CWriteDB::SetMaskedLetters(const string & masked)
137 {
138  m_Impl->SetMaskedLetters(masked);
139 }
141 void CWriteDB::ListVolumes(vector<string> & vols)
142 {
143  m_Impl->ListVolumes(vols);
144 }
146 void CWriteDB::ListFiles(vector<string> & files)
147 {
148  m_Impl->ListFiles(files);
149 }
151 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
152  (!defined(NCBI_COMPILER_MIPSPRO)) )
153 int CWriteDB::
155  const string & options,
156  const string & name)
157 {
158  return m_Impl->RegisterMaskAlgorithm(program, options, name);
159 }
161 int CWriteDB::
162 RegisterMaskAlgorithm(const string & id,
163  const string & description,
164  const string & options)
165 {
166  return m_Impl->RegisterMaskAlgorithm(id, description, options);
167 }
170  const vector<TGi> & gis)
171 {
172  m_Impl->SetMaskData(ranges, gis);
173 }
175 int CWriteDB::FindColumn(const string & title) const
176 {
177  return m_Impl->FindColumn(title);
178 }
180 int CWriteDB::CreateUserColumn(const string & title)
181 {
182  return m_Impl->CreateColumn(title);
183 }
185 void CWriteDB::AddColumnMetaData(int col_id, const string & key, const string & value)
186 {
187  m_Impl->AddColumnMetaData(col_id, key, value);
188 }
191 {
192  return m_Impl->SetBlobData(col_id);
193 }
194 #endif
197  : m_IdType(id_type)
198 {
199 }
201 void CBinaryListBuilder::Write(const string & fname)
202 {
203  // Create a binary stream.
204  ofstream outp(fname.c_str(), ios::binary);
205  Write(outp);
206 }
209 {
210  // Header; first check for 8 byte ids.
212  bool eight = false;
214  ITERATE(vector<Int8>, iter, m_Ids) {
215  Int8 id = *iter;
216  _ASSERT(id > 0);
218  if ((id >> 32) != 0) {
219  eight = true;
220  break;
221  }
222  }
224  Int4 magic = 0;
226  switch(m_IdType) {
227  case eGi:
228  magic = eight ? -2 : -1;
229  break;
231  case eTi:
232  magic = eight ? -4 : -3;
233  break;
235  default:
237  eArgErr,
238  "Error: Unsupported ID type specified.");
239  }
241  s_WriteInt4(outp, magic);
242  s_WriteInt4(outp, (int)m_Ids.size());
244  sort(m_Ids.begin(), m_Ids.end());
246  if (eight) {
247  ITERATE(vector<Int8>, iter, m_Ids) {
248  s_WriteInt8BE(outp, *iter);
249  }
250  } else {
251  ITERATE(vector<Int8>, iter, m_Ids) {
252  s_WriteInt4(outp, (int)*iter);
253  }
254  }
255 }
257 /// Returns true if the BLAST DB exists, otherwise throws a CSeqDBException
258 /// @param dbname name of BLAST DB [in]
259 /// @param is_prot is the BLAST DB protein? [in]
260 static bool
261 s_DoesBlastDbExist(const string& dbname, bool is_protein)
262 {
263  char dbtype(is_protein ? 'p' : 'n');
264  string path = SeqDB_ResolveDbPathNoExtension(dbname, dbtype);
265  if (path.empty()) {
266  string msg("Failed to find ");
267  msg += (is_protein ? "protein " : "nucleotide ");
268  msg += dbname + " BLAST database";
269  NCBI_THROW(CSeqDBException, eFileErr, msg);
270  }
271  return true;
272 }
274 /// Computes the number of sequences and (alias) database length for alias
275 /// files
276 /// @param dbname Name of the BLAST database over which the alias file is being
277 /// created [in]
278 /// @param is_prot is the BLAST database protein? [in]
279 /// @param dbsize (Approximate) number of letters in the BLAST DB [out]
280 /// @param num_seqs_found Number of sequences found in the dbname, or the
281 /// number of sequences in the intersection between the dbname and the GIs in
282 /// the gi_file_name (if applicable) [out]
283 static bool
285  bool is_prot,
286  Uint8* dbsize,
287  int* num_seqs_found)
288 {
289  _ASSERT((dbsize != NULL));
290  _ASSERT(num_seqs_found != NULL);
291  *dbsize = 0u;
292  *num_seqs_found = 0u;
295  try {
296  _TRACE("Attempting to compute length for '" << dbname << "'");
297  CRef<CSeqDB> dbhandle(new CSeqDB(dbname, dbtype));
298  dbhandle->GetTotals(CSeqDB::eFilteredAll, num_seqs_found, dbsize, false);
299  } catch(...) {
300  return false;
301  }
302  return true;
303 }
305 static void
307  bool is_protein,
308  int num_seqs_found,
309  const string& gi_file_name = kEmptyStr,
310  int num_seqs_in_gifile = 0)
311 {
312  if ( !gi_file_name.empty() ) {
313  /* This won't work if the target directory is not the current working directory
314  CRef<CSeqDBFileGiList> gilist;
315  gilist.Reset(new CSeqDBFileGiList(gi_file_name));
316  num_seqs_in_gifile = gilist->Size();
317  } */
318  LOG_POST("Created " << (is_protein ? "protein " : "nucleotide ") <<
319  dbname << " BLAST (alias) database with " << num_seqs_found
320  << " sequences (out of " << num_seqs_in_gifile << " in "
321  << gi_file_name << ", " << setprecision(0) << fixed <<
322  (num_seqs_found*100.0/num_seqs_in_gifile) << "% found)");
323  } else {
324  LOG_POST("Created " << (is_protein ? "protein " : "nucleotide ") <<
325  "BLAST (alias) database " << dbname << " with " <<
326  num_seqs_found << " sequences");
327  }
328 }
331  const string& db_name,
332  CWriteDB::ESeqType seq_type,
333  const string& gi_file_name,
334  const string& title,
335  EAliasFileFilterType alias_type /*= eGiList*/)
336 {
337  vector<string> db;
338  NStr::Split(db_name," ",db);
339  CWriteDB_CreateAliasFile(file_name, db, seq_type, gi_file_name, title,
340  alias_type);
341 }
343 /// Auxiliary function to convert the enumeration into a string
345 {
346  string retval;
347  switch (e) {
348  case eGiList: retval = "GILIST"; break;
349  case eTiList: retval = "TILIST"; break;
350  case eSeqIdList: retval = "SEQIDLIST"; break;
351  case eTaxIdList: retval = "TAXIDLIST"; break;
352  case eNoAliasFilterType: break;
353  default: _ASSERT(false); /* Need to add a type here? */
354  }
355  return retval;
356 }
358 static void
360  const vector<string>& databases,
361  CWriteDB::ESeqType seq_type,
362  const string& gi_file_name,
363  const string& title,
364  EAliasFileFilterType alias_type,
365  const TSeqRange* oid_range = NULL)
366 {
367  bool is_prot(seq_type == CWriteDB::eProtein ? true : false);
368  Uint8 dbsize = 0;
369  int num_seqs = 0;
370  CNcbiOstrstream fnamestr;
371  fnamestr << file_name << (is_prot ? ".pal" : ".nal");
372  string fname = CNcbiOstrstreamToString(fnamestr);
374  ofstream out(fname.c_str());
375  out << "#\n# Alias file created " << CTime(CTime::eCurrent).AsString()
376  << "\n#\n";
378  if ( !title.empty() ) {
379  out << "TITLE " << title << "\n";
380  }
381  out << "DBLIST ";
382  ITERATE(vector< string >, iter, databases) {
383  out << "\"" << *iter << "\" ";
384  }
385  out << "\n";
386  if ( !gi_file_name.empty() ) {
387  _ASSERT(alias_type != eNoAliasFilterType);
388  out << s_AliasFileFilterTypeToString(alias_type) << " "
389  << gi_file_name << "\n";
390  } else if (oid_range) {
391  out << "FIRST_OID " << oid_range->GetFrom() << "\n"
392  << "LAST_OID " << oid_range->GetToOpen() << "\n";
393  }
394  out.close();
396  if (!s_ComputeNumSequencesAndDbLength(file_name, is_prot, &dbsize, &num_seqs)){
397  CDirEntry(fname).Remove();
398  _TRACE("Deleting " << fname);
399  string msg("BLASTDB alias file creation failed. Some referenced files may be missing");
400  NCBI_THROW(CSeqDBException, eArgErr, msg);
401  };
402  if (num_seqs == 0) {
403  CDirEntry(fname).Remove();
404  _TRACE("Deleting " << fname);
405  CNcbiOstrstream oss;
406  oss << "No seqs in " << s_AliasFileFilterTypeToString(alias_type) << " were found"
407  << " in BLAST database";
408  string msg = CNcbiOstrstreamToString(oss);
409  NCBI_THROW(CSeqDBException, eArgErr, msg);
410  }
412, ios::out|ios::app);
413  out << "NSEQ " << num_seqs << "\n";
414  out << "LENGTH " << dbsize << "\n";
415  out.close();
417  s_PrintAliasFileCreationLog(file_name, is_prot, num_seqs);
418 }
421 {
422  int num_digits = 0;
423  while (n) {
424  n/=10;
425  num_digits ++;
426  }
428  return (num_digits >2) ? num_digits: 2;
429 }
433  unsigned int num_volumes,
434  CWriteDB::ESeqType seq_type,
435  const string& title)
436 {
437  bool is_prot(seq_type == CWriteDB::eProtein ? true : false);
438  string concatenated_blastdb_name;
439  vector<string> volume_names(num_volumes, kEmptyStr);
440  int num_digits = s_GetNumOfDigits(num_volumes);
441  for (unsigned int i = 0; i < num_volumes; i++) {
442  CNcbiOstrstream oss;
443  oss << file_name << "." << setfill('0') << setw(num_digits) << i;
444  const string vol_name((string)CNcbiOstrstreamToString(oss));
445  s_DoesBlastDbExist(vol_name, is_prot);
446  volume_names.push_back(vol_name);
447  concatenated_blastdb_name += vol_name + " ";
448  }
450  Uint8 dbsize = 0;
451  int num_seqs = 0;
452  s_ComputeNumSequencesAndDbLength(concatenated_blastdb_name, is_prot,
453  &dbsize, &num_seqs);
454  CNcbiOstrstream fname;
455  fname << file_name << (is_prot ? ".pal" : ".nal");
457  ofstream out(((string)CNcbiOstrstreamToString(fname)).c_str());
458  out << "#\n# Alias file created " << CTime(CTime::eCurrent).AsString()
459  << "\n#\n";
461  if ( !title.empty() ) {
462  out << "TITLE " << title << "\n";
463  }
465  out << "DBLIST ";
466  ITERATE(vector<string>, itr, volume_names) {
467  out << CDirEntry(*itr).GetName() << " ";
468  }
469  out << "\n";
470  out << "NSEQ " << num_seqs << "\n";
471  out << "LENGTH " << dbsize << "\n";
472  out.close();
473  s_PrintAliasFileCreationLog(concatenated_blastdb_name, is_prot, num_seqs);
474 }
477  const vector<string>& databases,
478  CWriteDB::ESeqType seq_type,
479  const string& gi_file_name,
480  const string& title,
481  EAliasFileFilterType alias_type /*= eGiList*/)
482 {
483  s_CreateAliasFilePriv(file_name, databases, seq_type, gi_file_name, title, alias_type);
484 }
487  const vector<string>& db_names,
488  CWriteDB::ESeqType seq_type,
489  const TSeqRange& oid_range,
490  const string& title /*= string()*/)
491 {
492  s_CreateAliasFilePriv(file_name, db_names, seq_type, kEmptyStr, title,
493  eNoAliasFilterType, &oid_range);
494 }
/// Merge several alias files into one group alias file.
///
/// The output file is named by kSeqDBGroupAliasFileName. It starts with a
/// short '#' header, followed by one section per input file: an
/// "ALIAS_FILE <name>" line, then every line of that file that is neither
/// empty nor starts with '#', then a blank separator line.
///
/// @param alias_files paths of the alias files to merge; an empty list is
///        treated as an error
/// @param delete_source_alias_files when true, Remove() is called on every
///        listed file after the group file has been written
496 void
497 CWriteDB_ConsolidateAliasFiles(const list<string>& alias_files,
498  bool delete_source_alias_files /* = false */)
499 {
500  if (alias_files.empty()) {
 // NOTE(review): only the trailing arguments of the error-raising
 // statement (error code and message) are visible in this view; the
 // opening of that statement appears to be missing. Confirm against
 // the complete file.
502  eArgErr,
503  "No alias files available to create group alias file.");
504  }
 // Header of the group file: working directory, timestamp and the name
 // of this function.
506  ofstream out(kSeqDBGroupAliasFileName.c_str());
507  out << "# Alias file index for " << CDir::GetCwd() << endl;
508  out << "# Generated on " << CTime(CTime::eCurrent).AsString() << " by "
509  << NCBI_CURRENT_FUNCTION << endl;
510  out << "#" << endl;
512  ITERATE(list<string>, itr, alias_files) {
513  ifstream in(itr->c_str());
 // Any failure to open the file is reported as "does not exist" and the
 // file is skipped; the remaining files are still processed.
514  if ( !in ) {
515  LOG_POST(Warning << *itr << " does not exist, omitting from group alias file");
516  continue;
517  }
 // Only the file name (no directory part) is recorded for the section.
518  out << "ALIAS_FILE " << CFile(*itr).GetName() << endl;
519  string line;
520  while (getline(in, line)) {
 // Drop blank lines and '#' comment lines; copy everything else verbatim.
522  if (line.empty() || NStr::StartsWith(line, "#")) {
523  continue;
524  }
525  out << line << endl;
526  }
 // Blank line separates consecutive sections.
527  out << endl;
528  }
 // Removal is attempted for every listed file, including any that could
 // not be opened above.
530  if (delete_source_alias_files) {
531  ITERATE(list<string>, itr, alias_files) {
532  CFile(*itr).Remove(); // ignore errors
533  _TRACE("Deleting " << *itr);
534  }
535  }
536 }
538 void
539 CWriteDB_ConsolidateAliasFiles(bool delete_source_alias_files /* = false */)
540 {
541  list<string> alias_files;
542  // Using "*.[pn]al" as pattern doesn't work
543  FindFiles("*.nal", alias_files, fFF_File);
544  FindFiles("*.pal", alias_files, fFF_File);
545  CWriteDB_ConsolidateAliasFiles(alias_files, delete_source_alias_files);
546 }
/// Create alias file(s) for @p output_db that refer to the volumes of
/// @p input_db together with per-volume OID mask information.
///
/// A top-level alias file "<output_db>.pal" / ".nal" is always written and
/// starts with a TITLE line. For a single-volume input, the DBLIST, OIDLIST
/// and OID_MASK_TYPE lines go directly into that file. For a multi-volume
/// input, each volume gets its own zero-padded alias file
/// ("<output_db>.NN.pal/.nal") holding those three lines, and the
/// top-level file gets one DBLIST line naming all of them. Finally the new
/// database is opened and NSEQ / LENGTH lines are appended to the
/// top-level file.
///
/// @param input_db      database whose volumes are enumerated
/// @param output_db     base name of the alias file(s) to create
/// @param seq_type      CWriteDB::eProtein selects ".pal"; anything else
///                      selects ".nal"
/// @param oid_mask_type bit mask; written verbatim as OID_MASK_TYPE and
///                      tested against EOidMaskType::fExcludeModel
/// @param title         text written on the TITLE line
/// @throws CSeqDBException (eArgErr) if the input database cannot be
///         opened, has no volumes, or a required exclude-mask file is
///         missing
548 void CWriteDB_CreateOidMaskDB(const string& input_db,
549  const string & output_db,
550  CWriteDB::ESeqType seq_type,
551  int oid_mask_type,
552  const string & title)
553 {
554  CRef<CSeqDB> seqdb;
555  bool is_protein = seq_type == CWriteDB::eProtein ? true : false;
 // NOTE(review): `t`, used below when constructing CSeqDB, is not declared
 // in the lines visible here; presumably the sequence type derived from
 // is_protein. Confirm against the complete file.
557  try {
558  seqdb.Reset(new CSeqDB(input_db, t));
559  }
 // Any CException raised while opening is replaced by a generic argument
 // error; the original exception's details are not propagated.
560  catch(CException & e) {
561  NCBI_THROW(CSeqDBException, eArgErr, "Invalid input db");
562  }
564  vector<string> vols;
565  seqdb->FindVolumePaths(vols);
566  if(vols.size() == 0) {
567  NCBI_THROW(CSeqDBException, eArgErr, "no vol found for " + input_db);
568  }
 // The input handle is only needed for the volume list; release it now.
569  seqdb.Reset();
571  string out_ext = is_protein? ".pal":".nal";
572  int num_digits = s_GetNumOfDigits(vols.size());
 // The top-level alias file is created here, before the per-volume checks
 // below; if one of them throws, this file is left behind partially
 // written.
573  ofstream ofs(output_db + out_ext);
574  ofs << "TITLE " << title <<endl;
576  for (unsigned int i = 0; i < vols.size(); i++) {
 // DBLIST refers to the volume by base name only.
577  CSeqDB_Path v_path(vols[i]);
578  string v_basename;
579  v_path.FindBaseName().GetString(v_basename);
580  string DBList = "DBLIST " + v_basename;
 // OIDLIST stays without a file name unless the exclude-model bit is set.
581  string OidList = "OIDLIST ";
582  if (oid_mask_type & EOidMaskType::fExcludeModel) {
583  string ex_model_ext = "." + SeqDB_GetOidMaskFileExt(is_protein, EOidMaskType::fExcludeModel);
584  string full_path = vols[i] + ex_model_ext;
585  CFile f(full_path);
586  if (!f.Exists()) {
587  NCBI_THROW(CSeqDBException, eArgErr, "Exclude oid mask file not found for " + vols[i]);
588  }
 // Only the file name is recorded, not its directory.
589  OidList += f.GetName();
590  }
591  CNcbiOstrstream oss;
592  if (vols.size() > 1) {
 // Multi-volume: one alias file per volume, index zero-padded to
 // num_digits.
593  oss << output_db << "." << setfill('0') << setw(num_digits) << i << out_ext;
594  ofstream ovs((string)CNcbiOstrstreamToString(oss));
595  ovs << DBList << endl;
596  ovs << OidList << endl;
597  ovs << "OID_MASK_TYPE " << oid_mask_type << endl;
598  }
599  else {
 // Single volume: the entries go straight into the top-level file.
600  ofs << DBList << endl;
601  ofs << OidList << endl;
602  ofs << "OID_MASK_TYPE " << oid_mask_type << endl;
603  }
604  }
 // Multi-volume: the top-level file lists the per-volume alias names
 // generated above (same naming scheme, without the extension).
606  if (vols.size() > 1) {
607  CNcbiOstrstream oss;
608  oss << "DBLIST";
609  for (unsigned int i = 0; i < vols.size(); i++) {
610  oss << " " << output_db << "." << setfill('0') << setw(num_digits) << i;
611  }
612  ofs << (string) CNcbiOstrstreamToString(oss) << endl;
613  }
 // Open the database just described to obtain its totals. Every line
 // written so far ended with endl, so the stream has been flushed;
 // presumably CSeqDB reads the alias file from disk at this point.
616  Uint8 total_length = 0;
617  int num_seqs = 0;
618  CSeqDB new_db(output_db, t);
619  num_seqs = new_db.GetNumSeqs();
620  total_length = new_db.GetTotalLength();
622  ofs << "NSEQ " << num_seqs << endl;
623  ofs << "LENGTH " << total_length << endl;
624 }
