NCBI C++ ToolKit
writedb_impl.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: writedb_impl.cpp 101152 2023-11-07 15:39:13Z camacho $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Kevin Bealer
27  *
28  */
29 
30 /// @file writedb_impl.cpp
31 /// Implementation for the CWriteDB_Impl class.
32 /// class for WriteDB.
33 #include <ncbi_pch.hpp>
40 #include <objects/blastdb/defline_extra.hpp> // for kAsnDeflineObjLabel
41 #include <serial/typeinfo.hpp>
42 #include <corelib/ncbi_bswap.hpp>
43 
44 #include "writedb_impl.hpp"
46 
47 #include <iostream>
48 #include <sstream>
49 #include <cmath>
50 
52 
53 /// Import C++ std namespace.
55 
57  bool protein,
58  const string & title,
59  EIndexType indices,
60  bool parse_ids,
61  bool long_ids,
62  bool use_gi_mask,
63  EBlastDbVersion dbver,
64  bool limit_defline,
65  Uint8 oid_masks,
66  bool scan_bioseq_4_cfastareader_usrobj)
67  : m_Dbname (dbname),
68  m_Protein (protein),
69  m_Title (title),
70  m_MaxFileSize (0),
71  m_MaxVolumeLetters (0),
72  m_Indices (indices),
73  m_Closed (false),
74  m_MaskDataColumn (-1),
75  m_ParseIDs (parse_ids),
76  m_UseGiMask (use_gi_mask),
77  m_DbVersion (dbver),
78  m_Pig (0),
79  m_Hash (0),
80  m_SeqLength (0),
81  m_HaveSequence (false),
82  m_LongSeqId (long_ids),
83  m_LmdbOid (0),
84  m_limitDefline (protein? limit_defline: false),
85  m_OidMasks (oid_masks),
86  m_ScanBioseq4CFastaReaderUsrObjct(scan_bioseq_4_cfastareader_usrobj)
87 {
89 
90  m_Date = now.AsString("b d, Y ");
91  string t = now.AsString("H:m P");
92 
93  if (t[0] == '0') {
94  t.assign(t, 1, t.size() - 1);
95  }
96 
97  m_Date += t;
98 }
99 
101 {
102  try {
103  Close();
104  } catch (const CWriteDBException& e) {
105  ERR_POST(Error << "BLAST Database creation error: " << e.GetMsg());
106  }
107 
108 }
109 
111 {
112  m_Bioseq.Reset();
114  m_Deflines.Reset();
115  m_Ids.clear();
116  m_Linkouts.clear();
117  m_Memberships.clear();
118  m_Pig = 0;
119  m_Hash = 0;
120  m_SeqLength = 0;
121 
122  m_Sequence.erase();
123  m_Ambig.erase();
124  m_BinHdr.erase();
125 
126  m_TaxIds.clear();
127 
128  NON_CONST_ITERATE(vector<int>, iter, m_HaveBlob) {
129  *iter = 0;
130  }
131 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
132  (!defined(NCBI_COMPILER_MIPSPRO)) )
133  NON_CONST_ITERATE(vector< CRef<CBlastDbBlob> >, iter, m_Blobs) {
134  (**iter).Clear();
135  }
136 #endif
137 }
138 
140  const CTempString & ambig)
141 {
142  // Publish previous sequence (if any)
143  x_Publish();
144 
145  // Blank slate for new sequence.
147 
148  m_Sequence.assign(seq.data(), seq.length());
149  m_Ambig.assign(ambig.data(), ambig.length());
150 
152  x_ComputeHash(seq, ambig);
153  }
154 
156 }
157 
159 {
160  // Publish previous sequence
161  x_Publish();
162 
163  // Blank slate for new sequence.
165 
166  m_Bioseq.Reset(& bs);
167  if (m_Bioseq->GetInst().CanGetMol() && (m_Bioseq->IsAa() != m_Protein)) {
169  msg << "Invalid molecule type of sequence added ("
170  << (m_Bioseq->IsAa() ? "protein" : "nucleotide")
171  << "); expected " << (m_Protein ? "protein" : "nucleotide");
173  }
174 
176  x_ComputeHash(bs);
177  }
178 
180 }
181 
183 {
184  AddSequence(bs);
185  m_SeqVector = sv;
186 }
187 
189 {
190  CSeqVector sv(bsh);
191  AddSequence(*bsh.GetCompleteBioseq(), sv);
192 }
193 
194 
195 /// class to support searching for duplicate isam keys
196 template <class T>
198 
199  public:
200  // data member
203 
204  // constructor
205  CWriteDB_IsamKey(const string &fn) {
206  source = new CNcbiIfstream(fn.c_str(),
207  IOS_BASE::in | IOS_BASE::binary);
208  key = x_GetNextKey();
209  };
210 
212  delete source;
213  };
214 
215  // advance key to catch up other
216  bool AdvanceKey(const CWriteDB_IsamKey & other) {
217  while (!source->eof()) {
218  T next_key = x_GetNextKey();
219  if (next_key >= other.key) {
220  key = next_key;
221  return true;
222  }
223  }
224  return false;
225  };
226 
227  // less_than, used for sorting
228  bool operator <(const CWriteDB_IsamKey &other) const {
229  return (key < other.key);
230  };
231 
232  private:
233  // read in the next key, for numeric id
235 #define INT4_SIZE 4
236  char s[INT4_SIZE] = { '\0' };
237  source->read(s, INT4_SIZE);
238  if ((source->gcount() != INT4_SIZE) || source->eof()) {
239  return T();
240  }
241  source->seekg(INT4_SIZE, ios_base::cur);
242 #ifdef WORDS_BIGENDIAN
243  Int4 next_key = (Int4) *((Int4 *) s);
244 #else
245  Int4 next_key = CByteSwap::GetInt4((const unsigned char *)s);
246 #endif
247  return next_key;
248  };
249 };
250 
251 // customized string file reading
252 template <> inline string
254 #define CHAR_BUFFER_SIZE 256
255  char s[CHAR_BUFFER_SIZE] = { '\0' };
256  source->getline(s, CHAR_BUFFER_SIZE);
257  if ((source->gcount() == 0) || source->eof()) {
258  return kEmptyStr;
259  }
260  char * p = s;
 // Find the 0x02 byte that ends the key. The scan also stops at the NUL
 // terminator: `s` is zero-initialised and getline() always NUL-terminates,
 // so a line without a 0x02 byte can no longer walk p past the buffer.
261  while ((*p != 0x02) && (*p != '\0')) ++p;
262  string in(s, p);
263 
264  // check if the current key is PDB-like,
265  // if so, advance for the next
266  // PDB key must be [0-9]...
267  if ( (in.size() == 4)
268  && ((in[0] - '0') * (in[0] - '9') <= 0) ) {
269 
270  // probing the next key to make sure this is pdb id
271  char next_token[4];
272  source->read(next_token, 4);
273  source->seekg(-4, ios_base::cur);
274  string next_key(next_token, 4);
275 
276  if (next_key == in) {
277  // automatically advance to next key
278  return x_GetNextKey();
279  }
280  }
281  return in;
282 };
283 
284 /// Comparison function for set<CWriteDB_IsamKey<T> *>
285 template <class T>
288  const CWriteDB_IsamKey<T> * rhs) const {
289  return (*lhs < *rhs);
290  }
291 };
292 
293 /// Check for duplicate ids across volumes
294 template <class T>
296  CWriteDB_IsamKey_Compare<T> > & keys) {
297  while (!keys.empty()) {
298  // pick the smallest key
299  CWriteDB_IsamKey<T> * key = *(keys.begin());
300 
301  keys.erase(key);
302 
303  if (keys.empty()) {
304  delete key;
305  return;
306  }
307 
308  const CWriteDB_IsamKey<T> * next = *(keys.begin());
309  if (key->AdvanceKey(*next)) {
310  if (keys.find(key) != keys.end()) {
312  msg << "Error: Duplicate seq_id <"
313  << key->key
314  << "> is found multiple times across volumes.";
316  }
317  keys.insert(key);
318  } else {
319  delete key;
320  }
321  }
322 };
323 
325 {
326  if (m_Closed)
327  return;
328 
329  m_Closed = true;
330 
331  x_Publish();
332  m_Sequence.erase();
333  m_Ambig.erase();
334 
335  if (! m_Volume.Empty()) {
336  m_Volume->Close();
337 
338  if (m_UseGiMask) {
339  for (unsigned int i=0; i<m_GiMasks.size(); ++i) {
340  m_GiMasks[i]->Close();
341  }
342  }
343 
344  if (m_VolumeList.size() == 1) {
346  }
347  else if(m_VolumeList.size() > 100){
348  unsigned int num_digits = log10(m_VolumeList.size()) +1;
349  for(unsigned i=0; i < m_VolumeList.size(); i++) {
351  v->RenameFileIndex(num_digits);
352  }
353  LOG_POST(Info << "Rename files index to " << num_digits << " digits");
354  }
355 
356  // disable the check for duplicate ids across volumes
357  /*
358  else if (m_Indices != CWriteDB::eNoIndex) {
359  set<CWriteDB_IsamKey<string> *, CWriteDB_IsamKey_Compare<string> > sids;
360  ITERATE(vector< CRef<CWriteDB_Volume> >, iter, m_VolumeList) {
361  string fn = (*iter)->GetVolumeName() + (m_Protein ? ".psd" : ".nsd");
362  if (CFile(fn).Exists()) {
363  sids.insert(new CWriteDB_IsamKey<string>(fn));
364  }
365  }
366  s_CheckDuplicateIds(sids);
367 
368  set<CWriteDB_IsamKey<Int4> *, CWriteDB_IsamKey_Compare<Int4> > nids;
369  ITERATE(vector< CRef<CWriteDB_Volume> >, iter, m_VolumeList) {
370  string fn = (*iter)->GetVolumeName() + (m_Protein ? ".pnd" : ".nnd");
371  if (CFile(fn).Exists()) {
372  nids.insert(new CWriteDB_IsamKey<Int4>(fn));
373  }
374  }
375  s_CheckDuplicateIds(nids);
376  } */
377 
378  if (m_VolumeList.size() > 1 || m_UseGiMask) {
379  x_MakeAlias();
380  }
381  if ((m_DbVersion == eBDB_Version5) && m_Lmdbdb) {
382  vector<string> vol_names(m_VolumeList.size());
383  vector<blastdb::TOid> vol_num_oids(m_VolumeList.size());
384  for(unsigned i=0; i < m_VolumeList.size(); i++) {
386  vol_names[i] = CDirEntry(v->GetVolumeName()).GetName();
387  vol_num_oids[i] = v->GetOID();
388  }
389  m_Lmdbdb->InsertVolumesInfo(vol_names, vol_num_oids);
390  m_Lmdbdb.Reset();
391  m_Taxdb.Reset();
392  }
393 
394  m_Volume.Reset();
395  }
396 }
397 
399 {
400  return m_Dbname + (m_Protein ? ".pal" : ".nal");
401 }
402 
404 {
405  string dblist;
406  if (m_VolumeList.size() > 1) {
407  for(unsigned i = 0; i < m_VolumeList.size(); i++) {
408  if (dblist.size())
409  dblist += " ";
410 
412  dblist += CDirEntry(v->GetVolumeName()).GetName();
413  }
414  } else {
415  dblist = m_Dbname;
416  }
417 
418  string masklist("");
419  if (m_UseGiMask) {
420  for (unsigned i = 0; i < m_GiMasks.size(); i++) {
421  const string & x = m_GiMasks[i]->GetName();
422  if (x != "") {
423  masklist += x + " ";
424  }
425  }
426  }
427 
428  string nm = x_MakeAliasName();
429 
430  ofstream alias(nm.c_str());
431 
432  alias << "#\n# Alias file created: " << m_Date << "\n#\n"
433  << "TITLE " << m_Title << "\n"
434  << "DBLIST " << dblist << "\n";
435 
436  if (masklist != "") {
437  alias << "MASKLIST " << masklist << "\n";
438  }
439 }
440 
442  string & bin_hdr)
443 {
444  if (! bin_hdr.empty()) {
445  return;
446  }
447 
448  if (! bioseq.CanGetDescr()) {
449  return;
450  }
451 
452  // Getting the binary headers, when they exist, is probably faster
453  // than building new deflines from the 'visible' CBioseq parts.
454 
455  vector< vector< char >* > bindata;
456 
457  ITERATE(list< CRef< CSeqdesc > >, iter, bioseq.GetDescr().Get()) {
458  if ((**iter).IsUser()) {
459  const CUser_object & uo = (**iter).GetUser();
460  const CObject_id & oi = uo.GetType();
461 
462  if (oi.IsStr() && oi.GetStr() == kAsnDeflineObjLabel) {
463  if (uo.CanGetData()) {
464  const vector< CRef< CUser_field > > & D = uo.GetData();
465 
466  if (D.size() &&
467  D[0].NotEmpty() &&
468  D[0]->CanGetLabel() &&
469  D[0]->GetLabel().IsStr() &&
470  D[0]->GetLabel().GetStr() == kAsnDeflineObjLabel &&
471  D[0]->CanGetData() &&
472  D[0]->GetData().IsOss()) {
473 
474  bindata = D[0]->GetData().GetOss();
475  break;
476  }
477  }
478  }
479  }
480  }
481 
482  if (! bindata.empty()) {
483  if (bindata[0] && (! bindata[0]->empty())) {
484  vector<char> & b = *bindata[0];
485 
486  bin_hdr.assign(& b[0], b.size());
487  }
488  }
489 }
490 
/// Prune a large defline set in place.
///
/// Sets with at most kGenBankLimit deflines are left untouched, as are sets
/// whose first defline (after SortBySeqIdRank) has a local best id.
/// Otherwise the list is walked while accumulating every tax id seen so far.
/// A defline whose best-id BlastRank score is >= kGenBankScore and which
/// contributes no new tax id is taken out of the list; some of those are
/// appended back at the end until gb_count reaches kGenBankLimit.
///
/// `dfs` is held through a const reference, but the set it points to is
/// modified (constness is cast away below).
491 void
493 {
494  static const int kGenBankLimit = 5;
495  static const int kGenBankScore = 500;
 // Small sets are kept as they are.
 // NOTE(review): size() is unsigned while kGenBankLimit is a signed int.
496  if (dfs->Get().size() <= kGenBankLimit){
497  return;
498  }
499 
 // Edit the caller's set directly rather than a copy.
500  CBlast_def_line_set * deflines = const_cast<CBlast_def_line_set*>(dfs.GetPointer());
501  deflines->SortBySeqIdRank(true, true);
502  list<CRef<CBlast_def_line> > & df_set= deflines->Set();
503 
 // Nothing is pruned when the first defline's best id is a local id.
504  if(FindBestChoice(df_set.front()->GetSeqid(), CSeq_id::BlastRank)->IsLocal()){
505  return;
506  }
 // NOTE(review): `id` is not used anywhere in the visible code.
507  string id =FindBestChoice(df_set.front()->GetSeqid(), CSeq_id::BlastRank)->AsFastaString();
 // Union of the tax ids of all deflines visited so far.
508  CBlast_def_line::TTaxIds tax_ids;
509  CBlast_def_line_set::Tdata::iterator itr=df_set.begin();
 // Counts high-score deflines that are kept in the list.
510  int gb_count = 0;
 // High-score deflines that added no new tax id, parked for possible re-use.
511  list<CRef<CBlast_def_line> > tmp_gb_list;
512  while (itr != df_set.end()){
513  CBlast_def_line & df= **itr;
514  int score = CSeq_id::BlastRank(FindBestChoice(df.GetSeqid(), CSeq_id::BlastRank));
515  CBlast_def_line::TTaxIds df_taxids= df.GetTaxIds();
516  if (score >= kGenBankScore){
 // An unchanged set size after the insert means this defline brought
 // no tax id that was not already present.
517  size_t orig_size = tax_ids.size();
518  tax_ids.insert(df_taxids.begin(), df_taxids.end());
519  if (orig_size == tax_ids.size()){
 // NOTE(review): the literal 5 matches kGenBankLimit; the constant is
 // presumably what was meant here.
520  if(gb_count < 5){
 // Advance first: splice() moves the node at tmp_itr out of df_set.
521  list<CRef<CBlast_def_line> >::iterator tmp_itr = itr;
522  itr++;
523  tmp_gb_list.splice(tmp_gb_list.end(), df_set, tmp_itr);
524  }
525  else {
526  itr = df_set.erase(itr);
527  }
 // itr already points at the next element in both branches.
528  continue;
529  }
530  else {
531  gb_count ++;
532  }
533  }
534  else {
 // Lower-score deflines are always kept; only their tax ids are recorded.
535  tax_ids.insert(df_taxids.begin(), df_taxids.end());
536  }
537  itr++;
538  }
539 
 // Top the list back up with parked deflines until the limit is reached.
540  while ((gb_count < kGenBankLimit) && (tmp_gb_list.size() > 0)){
541  df_set.splice(df_set.end(), tmp_gb_list, tmp_gb_list.begin());
542  gb_count++;
543  }
 // Whatever is still parked is dropped.
544  tmp_gb_list.clear();
545 }
546 
547 
548 static void
549 s_CheckEmptyLists(CRef<CBlast_def_line_set> & deflines, bool owner);
550 
553 {
555  SerialAssign(*bdls, *deflines);
556  s_CheckEmptyLists(bdls, true);
557  return bdls;
558 }
559 
/// Reset the membership and link lists of every defline in the set when
/// those lists are present but empty.
///
/// With `owner` true the set referenced by `deflines` is modified in place
/// and `deflines` is re-pointed at the same object. With `owner` false the
/// work is delegated to s_EditDeflineSet(), which operates on a copy.
560 static void
562 {
563  CBlast_def_line_set * bdls = 0;
564  CConstRef<CBlast_def_line_set> here(&*deflines);
565 
566  if (! owner) {
 // NOTE(review): the edited copy is assigned only to the local `here`;
 // `deflines` is not updated before returning, so the caller does not see
 // the result in the visible code. Every call in this chunk passes
 // owner == true.
567  here = s_EditDeflineSet(here);
568  return;
569  }
570 
 // The caller owns the set, so dropping constness to edit it is acceptable.
571  bdls = const_cast<CBlast_def_line_set*>(here.GetPointer());
572 
573  NON_CONST_ITERATE(list< CRef< CBlast_def_line > >, iter, bdls->Set()) {
574  CRef<CBlast_def_line> defline = *iter;
 // A present-but-empty memberships list is reset.
575  if (defline->CanGetMemberships() &&
576  defline->GetMemberships().size() == 0) {
577 
578  defline->ResetMemberships();
579  }
580 
 // Same treatment for a present-but-empty links list.
581  if (defline->CanGetLinks() &&
582  defline->GetLinks().size() == 0) {
583 
584  defline->ResetLinks();
585  }
586  }
587 
588  deflines.Reset(bdls);
589 }
590 
591 void
594  const vector< vector<int> > & membbits,
595  const vector< vector<int> > & linkouts,
596  int pig)
597 {
598  if (! (bioseq.CanGetDescr() && bioseq.CanGetId())) {
599  return;
600  }
601 
602  vector<TTaxId> taxids;
603  string titles;
604 
605  // Scan the CBioseq for taxids and the title string.
606 
607  ITERATE(list< CRef< CSeqdesc > >, iter, bioseq.GetDescr().Get()) {
608  const CSeqdesc & desc = **iter;
609 
610  if (desc.IsTitle()) {
611  //defline->SetTitle((**iter)->GetTitle());
612  titles = (**iter).GetTitle();
613  }
614  else {
615  const COrg_ref * org_pt = NULL;
616  if (desc.IsSource()) {
617  org_pt = &(desc.GetSource().GetOrg());
618  }
619  else if( desc.IsOrg()) {
620  org_pt = &(desc.GetOrg());
621  }
622 
623  if((NULL != org_pt) && org_pt->CanGetDb()) {
624  ITERATE(vector< CRef< CDbtag > >,
625  dbiter,
626  org_pt->GetDb()) {
627 
628  if ((**dbiter).CanGetDb() &&
629  (**dbiter).GetDb() == "taxon") {
630 
631  const CObject_id & oi = (**dbiter).GetTag();
632 
633  if (oi.IsId()) {
634  //defline->SetTaxid(oi.GetId());
635  taxids.push_back(TAX_ID_FROM(CObject_id::TId, oi.GetId()));
636  }
637  }
638  }
639  }
640  }
641  }
642 
643  // The bioseq has a field containing the ids for the first
644  // defline. The title string contains the title for the first
645  // defline, plus all the other defline titles and ids. This code
646  // unpacks them and builds a normal blast defline set.
647 
648  list< CRef<CSeq_id> > ids = bioseq.GetId();
649 
650  unsigned taxid_i(0), mship_i(0), links_i(0);
651  bool used_pig(false);
652 
653  // Build the deflines.
654 
656  CRef<CBlast_def_line> defline;
657 
658  while(! ids.empty()) {
659  defline.Reset(new CBlast_def_line);
660 
661  defline->SetSeqid() = ids;
662  ids.clear();
663 
664  /*
665  size_t pos = titles.find(" >");
666  string T;
667 
668  if (pos != titles.npos) {
669  T.assign(titles, 0, pos);
670  titles.erase(0, pos + 2);
671 
672  pos = titles.find(" ");
673  string nextid;
674 
675  if (pos != titles.npos) {
676  nextid.assign(titles, 0, pos);
677  titles.erase(0, pos + 1);
678  } else {
679  nextid.swap(titles);
680  }
681 
682  // Parse '|' separated ids.
683  if ( nextid.find('|') == NPOS
684  || !isalpha((unsigned char)(nextid[0]))) {
685  ids.push_back(CRef<CSeq_id> (new CSeq_id(CSeq_id::e_Local, nextid)));
686  } else {
687  CSeq_id::ParseFastaIds(ids, nextid);
688  }
689  } else {
690  T = titles;
691  }
692 
693  */
694  defline->SetTitle(titles);
695 
696  if (taxid_i < taxids.size()) {
697  defline->SetTaxid(taxids[taxid_i++]);
698  }
699 
700  if (mship_i < membbits.size()) {
701  const vector<int> & V = membbits[mship_i++];
702  defline->SetMemberships().assign(V.begin(), V.end());
703  }
704 
705  if (links_i < linkouts.size()) {
 // Index linkouts with its own cursor. Using mship_i here picked the
 // element after the one matching this defline (the memberships branch
 // has already advanced mship_i), could read past the end of linkouts,
 // and left links_i stuck at zero.
706  const vector<int> & V = linkouts[links_i++];
707  defline->SetLinks().assign(V.begin(), V.end());
708  }
709 
710  if ((! used_pig) && pig) {
711  defline->SetOther_info().push_back(pig);
712  used_pig = true;
713  }
714 
715  bdls->Set().push_back(defline);
716  }
717 
718  s_CheckEmptyLists(bdls, true);
719  deflines = bdls;
720 }
721 
723 x_SetDeflinesFromBinary(const string & bin_hdr,
725 {
727 
728  istringstream iss(bin_hdr);
729  iss >> MSerial_AsnBinary >> *bdls;
730 
731  s_CheckEmptyLists(bdls, true);
732  deflines.Reset(&* bdls);
733 }
734 
735 
736 static bool s_UseFastaReaderDeflines(CConstRef<CBioseq> & bioseq, CConstRef<CBlast_def_line_set> & deflines, bool long_seqid)
737 {
738  if(deflines.Empty())
739  return false;
740 
741  const CSeq_id * bioseq_id = bioseq->GetNonLocalId();
742 
743  if(bioseq_id == NULL ||
744  // For bare pir and prf ids go with the one from defline.
745  // This is to parse bare ids as local ones. The bare pdb ids are pdb in
746  // bioseq (parsed by CFastaReader), but local in deflines (parsed by
747  // CSeq_id).
748  (!long_seqid && (bioseq_id->IsPrf() || bioseq_id->IsPir()))) {
749  return true;
750  }
751 
752  // Bioseq has non-local id, make sure at least one id is non-local from CFastaReader
753  // defline
754  ITERATE(list< CRef<CBlast_def_line> >, iter, deflines->Get()) {
755  CRef<CSeq_id> id = FindBestChoice((**iter).GetSeqid(), &CSeq_id::BestRank);
756  if (id.NotEmpty() && !id->IsLocal()) {
757  return true;
758  }
759  }
760  return false;
761 
762 }
763 
764 void
767  string & bin_hdr,
768  const vector< vector<int> > & membbits,
769  const vector< vector<int> > & linkouts,
770  int pig,
771  set<TTaxId> & tax_ids,
772  int OID,
773  bool parse_ids,
774  bool long_ids,
775  bool limit_defline,
776  bool scan_bioseq_4_cfastareader_usrobj)
777 {
778  bool use_bin = (deflines.Empty() && pig == 0);
779 
780  if (! bin_hdr.empty() && OID<0) {
781  return;
782  }
783 
784  if (deflines.Empty()) {
785  // Use bioseq if deflines are not provided.
786 
787  if (bioseq.Empty()) {
789  eArgErr,
790  "Error: Cannot find CBioseq or deflines.");
791  }
792 
793  // CBioseq objects from SeqDB have binary headers embedded in
794  // them. If these are found, we try to use them. However,
795  // using binary headers may not help us much if we also want
796  // lists of sequence identifiers (for building ISAM files).
797 
798  if (use_bin) {
799  x_GetBioseqBinaryHeader(*bioseq, bin_hdr);
800  }
801 
802  if (bin_hdr.empty()) {
803  try {
804  x_GetFastaReaderDeflines(*bioseq,
805  deflines,
806  membbits,
807  linkouts,
808  pig,
809  false,
810  parse_ids,
811  long_ids,
812  scan_bioseq_4_cfastareader_usrobj);
813  } catch ( const CSeqIdException&e ) {
814  //LOG_POST(Info << "x_GetFastaReaderDeflines " << e.GetMsg() );
815  }
816  }
817 
818  if(!s_UseFastaReaderDeflines(bioseq, deflines, long_ids)) {
819  deflines.Reset();
820  }
821 
822  if (bin_hdr.empty() && deflines.Empty()) {
824  deflines,
825  membbits,
826  linkouts,
827  pig);
828  }
829  }
830 
831  if (bin_hdr.empty() &&
832  (deflines.Empty() || deflines->Get().empty())) {
833 
835  eArgErr,
836  "Error: No deflines provided.");
837  }
838 
839  if (pig != 0) {
840  const list<int> * L = 0;
841 
842  if (deflines->Get().front()->CanGetOther_info()) {
843  L = & deflines->Get().front()->GetOther_info();
844  }
845 
846  // If the pig does not agree with the current value, set the
847  // new value and force a rebuild of the binary headers. If
848  // there is more than one value in the list, leave the others
849  // in place.
850 
851  if ((L == 0) || L->empty()) {
853  bdls->Set().front()->SetOther_info().push_back(pig);
854 
855  deflines.Reset(&* bdls);
856  bin_hdr.erase();
857  } else if (L->front() != pig) {
859  bdls->Set().front()->SetOther_info().front() = pig;
860 
861  deflines.Reset(&* bdls);
862  bin_hdr.erase();
863  }
864  }
865 
866  if (OID>=0) {
867  // Re-inject the BL_ORD_ID
868  CRef<CSeq_id> gnl_id(new CSeq_id);
869  gnl_id->SetGeneral().SetDb("BL_ORD_ID");
870  gnl_id->SetGeneral().SetTag().SetId(OID);
872  bdls->Set().front()->SetSeqid().front() = gnl_id;
873 
874  deflines.Reset(&* bdls);
875  }
876 
877  if (deflines.Empty() && (! bin_hdr.empty())) {
878  // Uncompress the deflines from binary.
879  x_SetDeflinesFromBinary(bin_hdr, deflines);
880  }
881 
882  if (limit_defline) {
883  s_LimitDeflines(deflines);
884  bin_hdr.clear();
885  }
886 
887  if (bin_hdr.empty() || OID>=0) {
888  // Compress the deflines to binary.
889 
890  CNcbiOstrstream oss;
891  oss << MSerial_AsnBinary << *deflines;
892  bin_hdr = CNcbiOstrstreamToString(oss);
893  }
894 
895  if ((! deflines.Empty()) && deflines->CanGet()) {
896  ITERATE(list< CRef<CBlast_def_line> >, defline, deflines->Get()) {
897  CBlast_def_line::TTaxIds taxid_set = (*defline)->GetTaxIds();
898  tax_ids.insert(taxid_set.begin(), taxid_set.end());
899  }
900  }
901 }
902 
904 {
905  int OID = -1;
906  if (! m_ParseIDs) {
907  OID = (m_Volume ) ? m_Volume->GetOID() : 0;
908  }
910  m_Deflines,
911  m_BinHdr,
913  m_Linkouts,
914  m_Pig,
915  m_TaxIds,
916  OID,
917  m_ParseIDs,
918  m_LongSeqId,
921 
922  x_CookIds();
923 }
924 
926 {
927  if (! m_Ids.empty()) {
928  return;
929  }
930 
931  if (m_Deflines.Empty()) {
932  if (m_BinHdr.empty()) {
934  eArgErr,
935  "Error: Cannot find IDs or deflines.");
936  }
937 
939  }
940 
941  ITERATE(list< CRef<CBlast_def_line> >, iter, m_Deflines->Get()) {
942  const list< CRef<CSeq_id> > & ids = (**iter).GetSeqid();
943  // m_Ids.insert(m_Ids.end(), ids.begin(), ids.end());
944  // Spelled out for WorkShop. :-/
945  // ID-6757 : STL containers have efficient internal memory maintenance,
946  // the following line is, on the contrary, very inefficient.
947  // m_Ids.reserve(m_Ids.size() + ids.size());
948  ITERATE (list<CRef<CSeq_id> >, it, ids) {
949  m_Ids.push_back(*it);
950  }
951  }
952 }
953 
955 {
956  // Scan and mask the sequence itself.
957  for(unsigned i = 0; i < m_Sequence.size(); i++) {
958  if (m_MaskLookup[m_Sequence[i] & 0xFF] != 0) {
959  m_Sequence[i] = m_MaskByte[0];
960  }
961  }
962 }
963 
965 {
966  if (! m_SeqLength) {
967  if (! m_Sequence.empty()) {
969  } else if (m_SeqVector.size()) {
971  } else if (! (m_Bioseq &&
972  m_Bioseq->CanGetInst() &&
973  m_Bioseq->GetInst().GetLength())) {
974 
976  eArgErr,
977  "Need sequence data.");
978  }
979 
980  if (m_Bioseq.NotEmpty()) {
981  const CSeq_inst & si = m_Bioseq->GetInst();
982  m_SeqLength = si.GetLength();
983  }
984  }
985 
986  return m_SeqLength;
987 }
988 
990 {
991  if (! m_Sequence.empty())
992  return;
993 
994  if (! (m_Bioseq.NotEmpty() && m_Bioseq->CanGetInst())) {
996  eArgErr,
997  "Need sequence data.");
998  }
999 
1000  const CSeq_inst & si = m_Bioseq->GetInst();
1001 
1002  if (m_Bioseq->GetInst().CanGetSeq_data()) {
1003  const CSeq_data & sd = si.GetSeq_data();
1004 
1005  string msg;
1006 
1007  switch(sd.Which()) {
1010  break;
1011 
1012  case CSeq_data::e_Ncbieaa:
1014  break;
1015 
1016  case CSeq_data::e_Iupacaa:
1018  break;
1019 
1020  case CSeq_data::e_Ncbi2na:
1022  break;
1023 
1024  case CSeq_data::e_Ncbi4na:
1026  break;
1027 
1028  case CSeq_data::e_Iupacna:
1030  break;
1031 
1032  default:
1033  msg = "Unable to process sequence for entry [";
1034  msg += (m_Bioseq->GetId().front())->GetSeqIdString(false);
1035  msg += "].";
1036  }
1037 
1038  if (! msg.empty()) {
1039  NCBI_THROW(CWriteDBException, eArgErr, msg);
1040  }
1041  } else {
1042  int sz = m_SeqVector.size();
1043 
1044  if (sz == 0) {
1046  eArgErr,
1047  "No sequence data in Bioseq, "
1048  "and no Bioseq_Handle available.");
1049  }
1050 
1051  if (m_Protein) {
1052  // I add one to the string length to allow the "i+1" in
1053  // the loop to be done safely.
1054 
1055  m_Sequence.reserve(sz);
1057  } else {
1058  // I add one to the string length to allow the "i+1" in the
1059  // loop to be done safely.
1060 
1061  string na8;
1062  na8.reserve(sz + 1);
1063  m_SeqVector.GetSeqData(0, sz, na8);
1064  na8.resize(sz + 1);
1065 
1066  string na4;
1067  na4.resize((sz + 1) / 2);
1068 
1069  for(int i = 0; i < sz; i += 2) {
1070  na4[i/2] = (na8[i] << 4) + na8[i+1];
1071  }
1072 
1073  WriteDB_Ncbi4naToBinary(na4.data(),
1074  (int) na4.size(),
1075  (int) si.GetLength(),
1076  m_Sequence,
1077  m_Ambig);
1078  }
1079  }
1080 }
1081 
1083 {
1084 }
1085 
1086 // The CPU should be kept at 190 degrees for 10 minutes.
1088 {
1089  // We need sequence, ambiguity, and binary deflines. If any of
1090  // these is missing, it is created from other data if possible.
1091 
1092  // For now I am disabling binary headers, because in normal usage
1093  // I would expect to see sequences from ID1 or similar, and the
1094  // non-binary case is slightly more complex.
1095 
1096  x_CookHeader();
1097  x_CookSequence();
1098  x_CookColumns();
1099 
1100  if (m_Protein && m_MaskedLetters.size()) {
1101  x_MaskSequence();
1102  }
1103 }
1104 
1106 {
1107  return m_HaveSequence;
1108 }
1109 
1111 {
1113  m_HaveSequence = true;
1114 }
1115 
1117 {
1119  m_HaveSequence = false;
1120 }
1121 
1123 {
1124  // This test should fail only on the first call, or if an
1125  // exception was thrown.
1126 
1127  if (x_HaveSequence()) {
1128  _ASSERT(! (m_Bioseq.Empty() && m_Sequence.empty()));
1129 
1131  } else {
1132  return;
1133  }
1134 
1135 
1136  if(m_DbVersion == eBDB_Version5 && m_Lmdbdb.Empty()) {
1137  const string lmdb_fname_w_path = BuildLMDBFileName(m_Dbname, m_Protein);
1138  Uint8 map_size = 0;
1139  char* map_sz_str = getenv("BLASTDB_LMDB_MAP_SIZE");
1140  if (map_sz_str) {
1141  map_size = NStr::StringToUInt8(map_sz_str);
1142  }
1143  if(map_size > 0){
1144  m_Lmdbdb.Reset(new CWriteDB_LMDB(lmdb_fname_w_path, map_size));
1147  map_size));
1148  }
1149  else {
1150  m_Lmdbdb.Reset(new CWriteDB_LMDB(lmdb_fname_w_path));
1153  }
1154  }
1155 
1156  x_CookData();
1157 
1158  bool done = false;
1159 
1160  if (! m_Volume.Empty()) {
1162  m_Ambig,
1163  m_BinHdr,
1164  m_Ids,
1165  m_Pig,
1166  m_Hash,
1167  m_Blobs,
1169  if (done && (m_DbVersion == eBDB_Version5) && m_Lmdbdb) {
1170  if (m_ParseIDs) {
1172  }
1174  m_LmdbOid++;
1175  }
1176  }
1177 
1178  if (! done) {
1179  int index = (int) m_VolumeList.size();
1180 
1181  if (m_Volume.NotEmpty()) {
1182  m_Volume->Close();
1183  }
1184 
1185  {
1186 
1188  m_Protein,
1189  m_Title,
1190  m_Date,
1191  index,
1192  m_MaxFileSize,
1194  m_Indices,
1195  m_DbVersion,
1196  m_OidMasks));
1197 
1198  m_VolumeList.push_back(m_Volume);
1199 
1200 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
1201  (!defined(NCBI_COMPILER_MIPSPRO)) )
1202  _ASSERT(m_Blobs.size() == m_ColumnTitles.size() * 2);
1203  _ASSERT(m_Blobs.size() == m_ColumnMetas.size() * 2);
1204  _ASSERT(m_Blobs.size() == m_HaveBlob.size() * 2);
1205 
1206  for(size_t i = 0; i < m_ColumnTitles.size(); i++) {
1208  m_ColumnMetas[i],
1209  m_MaxFileSize);
1210  }
1211 #endif
1212  }
1213 
1214  // need to reset OID, hence recalculate the header and id
1215  x_CookHeader();
1216 
1218  m_Ambig,
1219  m_BinHdr,
1220  m_Ids,
1221  m_Pig,
1222  m_Hash,
1223  m_Blobs,
1225 
1226  if (done && (m_DbVersion == eBDB_Version5) && m_Lmdbdb) {
1227  if (m_ParseIDs){
1229  }
1231  m_LmdbOid++;
1232  }
1233 
1234  if (! done) {
1236  eArgErr,
1237  "Cannot write sequence to volume.");
1238  }
1239  }
1240 }
1241 
1243 {
1245  bdls(const_cast<CBlast_def_line_set*>(& deflines));
1246 
1247  s_CheckEmptyLists(bdls, true);
1248  m_Deflines = bdls;
1249 }
1250 
/// Return the larger of the two magnitudes |a| and |b|.
///
/// Magnitudes are obtained by plain negation of negative inputs.
inline int s_AbsMax(int a, int b)
{
    const int mag_a = (a < 0) ? -a : a;
    const int mag_b = (b < 0) ? -b : b;

    return (mag_a < mag_b) ? mag_b : mag_a;
}
1256 
1257 // Filtering data format on disk:
1258 //
1259 // Size of integer type for this blob (1, 2, or 4) (4 bytes).
1260 //
1261 // Array of filtering types:
1262 // Filter-type (enumeration)
1263 // Array of offsets:
1264 // Start Offset
1265 // End Offset
1266 //
1267 // The isize is one of 1, 2, or 4, written in the first byte, and
1268 // followed by 0, 1, or 3 NUL bytes to align the data offset to a
1269 // multiple of `isize'.
1270 //
1271 // All other integer values in this array use isize bytes, including
1272 // array counts and the `type' enumerations. After all the offsets are
1273 // written, the blob is aligned to a multiple of 4 using the `eSimple'
1274 // method.
1275 //
1276 // Each array is an element count followed by that many elements.
1277 
1278 #if 0
1279 
1280 // I think this is a better approach; but it needs more testing,
1281 // particularly with regard to platform portability.
1282 
/// Writer policy for the TWriteSize parameter of s_WriteRanges(): forwards
/// each integer to CBlastDbBlob::WriteInt1(). Part of the disabled (#if 0)
/// alternative implementation.
1283 struct SWriteInt1 {
 /// Append `value` to `blob` via WriteInt1().
1284  static void WriteInt(CBlastDbBlob & blob, int value)
1285  {
1286  blob.WriteInt1(value);
1287  }
1288 };
1289 
/// Writer policy for the TWriteSize parameter of s_WriteRanges(): forwards
/// each integer to CBlastDbBlob::WriteInt2(). Part of the disabled (#if 0)
/// alternative implementation.
1290 struct SWriteInt2 {
 /// Append `value` to `blob` via WriteInt2().
1291  static void WriteInt(CBlastDbBlob & blob, int value)
1292  {
1293  blob.WriteInt2(value);
1294  }
1295 };
1296 
/// Writer policy for the TWriteSize parameter of s_WriteRanges(): forwards
/// each integer to CBlastDbBlob::WriteInt4(). Part of the disabled (#if 0)
/// alternative implementation.
1297 struct SWriteInt4 {
 /// Append `value` to `blob` via WriteInt4().
1298  static void WriteInt(CBlastDbBlob & blob, int value)
1299  {
1300  blob.WriteInt4(value);
1301  }
1302 };
1303 
/// Serialize masked ranges into a blob, using TWriteSize::WriteInt() for
/// every integer. Part of the disabled (#if 0) alternative implementation.
///
/// Layout written: `count`, then for each element of `ranges` with a
/// non-empty `offsets` list: its algorithm_id, the number of offset pairs,
/// and each pair as (first, second).
///
/// @param blob    Destination blob.
/// @param count   Number of range lists to announce; asserted to equal the
///                number of elements that actually have offsets.
/// @param ranges  Container whose elements expose `algorithm_id` and
///                `offsets`.
1304 template<class TWriteSize, class TRanges>
1305 void s_WriteRanges(CBlastDbBlob & blob,
1306  int count,
1307  const TRanges & ranges)
1308 {
1309  typedef vector< pair<TSeqPos, TSeqPos> > TPairVector;
1310 
 // Tracks how many range lists were emitted, for the final sanity check.
1311  Int4 num_written = 0;
1312  TWriteSize::WriteInt(blob, count);
1313 
1314  for ( typename TRanges::const_iterator r1 = (ranges).begin(),
1315  r1_end = (ranges).end();
1316  r1 != r1_end;
1317  ++r1 ) {
1318 
 // Elements without offsets are skipped and not counted.
1319  if (r1->offsets.size()) {
1320  num_written ++;
1321  TWriteSize::WriteInt(blob, r1->algorithm_id);
1322  TWriteSize::WriteInt(blob, r1->offsets.size());
1323 
1324  ITERATE(TPairVector, r2, r1->offsets) {
1325  TWriteSize::WriteInt(blob, r2->first);
1326  TWriteSize::WriteInt(blob, r2->second);
1327  }
1328  }
1329  }
1330 
 // The announced count must match what was actually written.
1331  _ASSERT(num_written == count);
1332 }
1333 
1334 #endif
1335 
1336 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
1337  (!defined(NCBI_COMPILER_MIPSPRO)) )
1338 
1340  const vector <TGi> & gis)
1341 {
    // SetMaskData(ranges, gis): validates the masked ranges for the current
    // sequence and stores them, either in the per-algorithm GI masks (when
    // m_UseGiMask is set) or in the two blobs of the mask-data column.
    // An unregistered algorithm id is reported as CWriteDBException/eArgErr;
    // an offset pair that is reversed or ends past the sequence length is
    // reported with the "out of bounds" message below.
1342  // No GI is found for the sequence
1343  // TODO should we generate a warning?
1344  if (m_UseGiMask && !gis.size()) {
1345  return;
1346  }
1347
1348  TSeqPos seq_length = x_ComputeSeqLength();
1349
    // NOTE(review): the comment below talks about choosing a numeric size,
    // but the visible code writes every integer with WriteInt4 /
    // WriteInt4_LE -- the comment looks stale; confirm.
1350  // Check validity of data and determine maximum integer value
1351  // stored here before writing anything. The best numeric_size
1352  // will be selected; this numeric size is applied uniformly to all
1353  // integers in this blob (except for the first one, which is the
1354  // integer size itself, and which is always a single byte.)
1355
1356  typedef vector< pair<TSeqPos, TSeqPos> > TPairVector;
1357
1358  int range_list_count = 0;
    // NOTE(review): offset_pairs_count is accumulated but never read in the
    // code visible here.
1359  int offset_pairs_count = 0;
1360
1361
    // Validation pass: nothing is written until every list has been checked.
1362  ITERATE(CMaskedRangesVector, r1, ranges) {
1363  if (r1->empty()) {
1364  continue;
1365  }
1366
1367  range_list_count ++;
1368  offset_pairs_count += r1->offsets.size();
1369
1370  if ( !m_MaskAlgoRegistry.IsRegistered(r1->algorithm_id) ) {
1371  string msg("Error: Algorithm IDs must be registered before use.");
1372  msg += " Unknown algorithm ID = " +
1373  NStr::IntToString((int)r1->algorithm_id);
1374  NCBI_THROW(CWriteDBException, eArgErr, msg);
1375  }
1376
1377
1378  ITERATE(TPairVector, r2, r1->offsets) {
1379  if ((r2->first > r2->second) ||
1380  (r2->second > seq_length)) {
1381
1383  eArgErr,
1384  "Error: Masked data offsets out of bounds.");
1385  }
1386  }
1387  }
1388
1389
1390  // Gi-based masks
1391  if (m_UseGiMask) {
1392  ITERATE(CMaskedRangesVector, r1, ranges) {
1393  if (r1->offsets.size()) {
        // m_MaskAlgoMap translates the algorithm id into an index of m_GiMasks.
1394  m_GiMasks[m_MaskAlgoMap[r1->algorithm_id]]
1395  ->AddGiMask(gis, r1->offsets);
1396  }
1397  }
1398  return;
1399  }
1400
1401  // OID-based masks
1402  const int col_id = x_GetMaskDataColumnId();
1403  CBlastDbBlob & blob = SetBlobData(col_id);
1404  blob.Clear();
1405  blob.WriteInt4(range_list_count);
1406
    // Second request for the same column: SetBlobData counts requests per
    // column and hands out the column's second blob this time.
1407  CBlastDbBlob & blob2 = SetBlobData(col_id);
1408  blob2.Clear();
1409  blob2.WriteInt4(range_list_count);
1410
1411  ITERATE(CMaskedRangesVector, r1, ranges) {
1412  if (r1->offsets.size()) {
1413  blob.WriteInt4(r1->algorithm_id);
1414  blob.WriteInt4(r1->offsets.size());
1415  blob2.WriteInt4(r1->algorithm_id);
1416  blob2.WriteInt4(r1->offsets.size());
1417
        // Only the offset values differ between the blobs: WriteInt4 for
        // blob, WriteInt4_LE for blob2.
1418  ITERATE(TPairVector, r2, r1->offsets) {
1419  blob.WriteInt4(r2->first);
1420  blob.WriteInt4(r2->second);
1421  blob2.WriteInt4_LE(r2->first);
1422  blob2.WriteInt4_LE(r2->second);
1423  }
1424  }
1425  }
1426
1429 }
1430 
1431 static const string s_EscapeColon(const string &in) {
1432  const char l = 0x1;
1433  return NStr::Replace(in, ":", string(l,1));
1434 }
1435 
1438  const string & options,
1439  const string & name)
1440 {
    // RegisterMaskAlgorithm(program, options, name): registers a masking
    // algorithm with m_MaskAlgoRegistry and builds a ':'-separated
    // description string for it; returns the id handed out by the registry.
1441  int algorithm_id = m_MaskAlgoRegistry.Add(program, options, name);
1442
1443  string key = NStr::IntToString(algorithm_id);
1444  string value;
    // NOTE(review): the condition choosing between the two value formats
    // below is not visible in this view.  Both start with the numeric
    // program id; the first also appends the escaped name and a trailing ':'.
1446  value = NStr::IntToString((int)program) + ":" +
1447  s_EscapeColon(options) + ":" +
1448  s_EscapeColon(name) + ":";
1449  } else {
1450  value = NStr::IntToString((int)program) + ":" + s_EscapeColon(options);
1451  }
1452
1453  if (m_UseGiMask) {
    // Remember which m_GiMasks slot belongs to this algorithm id; the new
    // mask object is appended right after, so its index is the current size.
1454  m_MaskAlgoMap[algorithm_id] = m_GiMasks.size();
1456  (new CWriteDB_GiMask(name, value, m_MaxFileSize)));
1457  } else {
    // NOTE(review): the statement of this branch is not visible here;
    // presumably key/value are stored as metadata of the mask-data column --
    // confirm.
1459  }
1460
1461  return algorithm_id;
1462 }
1463 
1465 RegisterMaskAlgorithm(const string &id,
1466  const string &description,
1467  const string &options)
1468 {
    // Overload taking a string id: registers `id` with m_MaskAlgoRegistry
    // and builds the description "100:<options>:<id>:<description>", with
    // ':' inside each field replaced by s_EscapeColon.  Returns the id
    // handed out by the registry.
1469  int algorithm_id = m_MaskAlgoRegistry.Add(id);
1470
1471  string key = NStr::IntToString(algorithm_id);
1472  string value = "100:" +
1473  s_EscapeColon(options) + ":" +
1474  s_EscapeColon(id) + ":" +
1475  s_EscapeColon(description);
1476
    // NOTE(review): the statement that consumes key/value is not visible in
    // this view.
1478
1479  return algorithm_id;
1480 }
1481 
1482 int CWriteDB_Impl::FindColumn(const string & title) const
1483 {
1484  for(int i = 0; i < (int) m_ColumnTitles.size(); i++) {
1485  if (title == m_ColumnTitles[i]) {
1486  return i;
1487  }
1488  }
1489 
1490  return -1;
1491 }
1492 
1493 int CWriteDB_Impl::CreateColumn(const string & title, bool mbo)
1494 {
1495  _ASSERT(FindColumn(title) == -1);
1496 
1497  size_t col_id = m_Blobs.size() / 2;
1498 
1499  _ASSERT(m_HaveBlob.size() == col_id);
1500  _ASSERT(m_ColumnTitles.size() == col_id);
1501  _ASSERT(m_ColumnMetas.size() == col_id);
1502 
1503  CRef<CBlastDbBlob> new_blob(new CBlastDbBlob);
1504  CRef<CBlastDbBlob> new_blob2(new CBlastDbBlob);
1505 
1506  m_Blobs .push_back(new_blob);
1507  m_Blobs .push_back(new_blob2);
1508  m_HaveBlob .push_back(0);
1509  m_ColumnTitles.push_back(title);
1510  m_ColumnMetas .push_back(TColumnMeta());
1511 
1512  if (m_Volume.NotEmpty()) {
1513  size_t id2 = m_Volume->CreateColumn(title, m_ColumnMetas.back(), mbo);
1514  _ASSERT(id2 == col_id);
1515  (void)id2; // get rid of compiler warning
1516  }
1517 
1518  return col_id;
1519 }
1520 
1522  const string & key,
1523  const string & value)
1524 {
    // AddColumnMetaData(col_id, key, value): stores one key/value pair in the
    // metadata of an existing column and mirrors it to the open volume, if
    // there is one.  An invalid col_id raises CWriteDBException (eArgErr).
1525  if ((col_id < 0) || (col_id >= (int) m_ColumnMetas.size())) {
1526  NCBI_THROW(CWriteDBException, eArgErr,
1527  "Error: provided column ID is not valid");
1528  }
1529
    // Assignment overwrites any value stored earlier under the same key.
1530  m_ColumnMetas[col_id][key] = value;
1531
1532  if (m_Volume.NotEmpty()) {
1533  m_Volume->AddColumnMetaData(col_id, key, value);
1534  }
1535 }
1536 
1538 {
    // SetBlobData(col_id): returns the next unused blob of a column.  A
    // column owns two entries of m_Blobs (col_id*2 and col_id*2 + 1);
    // m_HaveBlob[col_id] counts how many have been handed out.  An invalid
    // col_id raises CWriteDBException (eArgErr).
1539  if ((col_id < 0) || (col_id * 2 >= (int) m_Blobs.size())) {
1540  NCBI_THROW(CWriteDBException, eArgErr,
1541  "Error: provided column ID is not valid");
1542  }
1543
    // Both blobs of this column were already requested.
1544  if (m_HaveBlob[col_id] > 1) {
1545  NCBI_THROW(CWriteDBException, eArgErr,
1546  "Error: Already have blob for this sequence and column");
1547  }
1548
1549  ++m_HaveBlob[col_id];
1550
1551  // Blobs are reused to reduce buffer reallocation; a missing blob
1552  // means the corresponding column does not exist.
1553
    // The counter is now 1 or 2, selecting the first or the second blob.
1554  return *m_Blobs[col_id * 2 + m_HaveBlob[col_id] - 1];
1555 }
1556 #endif
1557 
1559 {
    // SetPig(pig): record the PIG identifier in m_Pig.
1560  m_Pig = pig;
1561 }
1562 
1564 {
    // SetMaxFileSize(sz): record the maximum file size in m_MaxFileSize.
1565  m_MaxFileSize = sz;
1566 }
1567 
1569 {
    // SetMaxVolumeLetters(sz): record the per-volume letter limit in
    // m_MaxVolumeLetters.
1570  m_MaxVolumeLetters = sz;
1571 }
1572 
1575  bool long_seqids,
1576  bool scan_bioseq_4_cfastareader_usrobj)
1577 {
    // ExtractBioseqDeflines(bs, parse_ids, long_seqids, scan_...): runs
    // x_ExtractDeflines on the Bioseq with no membership bits, no linkouts,
    // pig 0 and OID -1, and returns the resulting defline set through a
    // non-const reference.
1578  // Get information
1579
1581  string binary_header;
    // NOTE(review): v1 is declared but unused; v2 (empty) is passed for both
    // the membership and the linkout argument below.
1582  vector< vector<int> > v1, v2;
1583  set<TTaxId> t;
1584  const bool kLimitDefline = false;
1585
1586  CConstRef<CBioseq> bsref(& bs);
1587  x_ExtractDeflines(bsref, deflines, binary_header, v2, v2, 0, t, -1, parse_ids,
1588  long_seqids, kLimitDefline, scan_bioseq_4_cfastareader_usrobj);
1589
1590  // Convert to return type
1591
    // The const_cast drops constness from the extracted defline set so it
    // can be held by the returned reference.
1593  bdls.Reset(const_cast<CBlast_def_line_set*>(&*deflines));
1594
1595  return bdls;
1596 }
1597 
// Set the letters that are to be masked in sequences.
//
// Stores `masked` in m_MaskedLetters and builds m_MaskLookup, a 256 entry
// table holding 1 for every byte of the converted letter set.  An empty
// `masked` empties the table.  m_MaskByte is filled with the converted "X"
// the first time a non-empty set is given.
1598 void CWriteDB_Impl::SetMaskedLetters(const string & masked)
1599 {
1600  // Only supported for protein.
1601
1602  if (! m_Protein) {
1604  eArgErr,
1605  "Error: Nucleotide masking not supported.");
1606  }
1607
1608  m_MaskedLetters = masked;
1609
1610  if (masked.empty()) {
    // Swapping with an empty vector discards the table contents.
1611  vector<char> none;
1612  m_MaskLookup.swap(none);
1613  return;
1614  }
1615
1616  // Convert set of masked letters to stdaa, use the result to build
1617  // a lookup table.
1618
1619  string mask_bytes;
1622  0,
1623  (int) m_MaskedLetters.size(),
1624  mask_bytes,
1626
1627  _ASSERT(mask_bytes.size() == m_MaskedLetters.size());
1628
1629  // Build a table of character-to-bool.
1630  // (Bool is represented by char 0 and 1.)
1631
    // NOTE(review): resize() leaves existing entries untouched, so flags set
    // by an earlier call with a different non-empty letter set stay set --
    // confirm whether that is intended.
1632  m_MaskLookup.resize(256, (char)0);
1633
1634  for (unsigned i = 0; i < mask_bytes.size(); i++) {
    // Mask to 0..255 so a negative char cannot index out of range.
1635  int ch = ((int) mask_bytes[i]) & 0xFF;
1636  m_MaskLookup[ch] = (char)1;
1637  }
1638
1639  // Convert the masking character - always 'X' - to stdaa.
1640
1641  if (m_MaskByte.empty()) {
1642  string mask_byte = "X";
1643
1644  CSeqConvert::Convert(mask_byte,
1646  0,
1647  1,
1648  m_MaskByte,
1650
1651  _ASSERT(m_MaskByte.size() == 1);
1652  }
1653 }
1654 
1655 void CWriteDB_Impl::ListVolumes(vector<string> & vols)
1656 {
1657  vols.clear();
1658 
1659  ITERATE(vector< CRef<CWriteDB_Volume> >, iter, m_VolumeList) {
1660  vols.push_back((**iter).GetVolumeName());
1661  }
1662 }
1663 
1664 void CWriteDB_Impl::ListFiles(vector<string> & files)
1665 {
1666  files.clear();
1667 
1668  ITERATE(vector< CRef<CWriteDB_Volume> >, iter, m_VolumeList) {
1669  (**iter).ListFiles(files);
1670  }
1671 
1672  if (m_VolumeList.size() > 1) {
1673  files.push_back(x_MakeAliasName());
1674  }
1675  if (m_DbVersion == eBDB_Version5) {
1676  files.push_back(BuildLMDBFileName(m_Dbname, m_Protein));
1677  }
1678 }
1679 
1680 /// Compute the hash of a (raw) sequence.
1681 ///
1682 /// The hash of the provided sequence will be computed and assigned to
1683 /// the m_Hash field. For protein, the sequence is in the Ncbistdaa
1684 /// format. For nucleotide, the sequence and optional ambiguities are
1685 /// in 'raw' format, meaning they are packed just as sequences are
1686 /// packed in nsq files.
1687 ///
1688 /// @param sequence The sequence data. [in]
1689 /// @param ambiguities Nucleotide ambiguities are provided here. [in]
1691  const CTempString & ambig)
1692 {
1693  if (m_Protein) {
    // Protein data is hashed as given.
1694  m_Hash = SeqDB_SequenceHash(sequence.data(), sequence.size());
1695  } else {
    // Nucleotide data is first expanded together with its ambiguities
    // into na8; the hash is taken over that expanded string.
1696  string na8;
1697  SeqDB_UnpackAmbiguities(sequence, ambig, na8);
1698  m_Hash = SeqDB_SequenceHash(na8.data(), na8.size());
1699  }
1700 }
1701 
1702 /// Compute the hash of a (Bioseq) sequence.
1703 ///
1704 /// The hash of the provided sequence will be computed and
1705 /// assigned to the m_Hash member. The sequence is packed as a
1706 /// CBioseq.
1707 ///
1708 /// @param sequence The sequence as a CBioseq. [in]
1710 {
    // Delegates to the SeqDB_SequenceHash overload that accepts the Bioseq.
1711  m_Hash = SeqDB_SequenceHash(sequence);
1712 }
1713 
1714 #define TAB_REPLACEMENT " "
1715 
1716 
1717 
1719 x_GetFastaReaderDeflines(const CBioseq & bioseq,
1720  CConstRef<CBlast_def_line_set> & deflines,
1721  const vector< vector<int> > & membits,
1722  const vector< vector<int> > & linkout,
1723  int pig,
1724  bool accept_gt,
1725  bool parse_ids,
1726  bool long_seqids,
1727  bool scan_bioseq_4_cfastareader_usrobj)
1728 {
1729  if (! bioseq.CanGetDescr()) {
1730  return;
1731  }
1732 
1733  string fasta;
1734 
1735  // Scan the CBioseq for the CFastaReader user object.
1736 
1737  if (scan_bioseq_4_cfastareader_usrobj) {
1738  ITERATE(list< CRef< CSeqdesc > >, iter, bioseq.GetDescr().Get()) {
1739  const CSeqdesc & desc = **iter;
1740 
1741  if (desc.IsUser() &&
1742  desc.GetUser().CanGetType() &&
1743  desc.GetUser().GetType().IsStr() &&
1744  desc.GetUser().GetType().GetStr() == "CFastaReader" &&
1745  desc.GetUser().CanGetData()) {
1746 
1747  const vector< CRef< CUser_field > > & D = desc.GetUser().GetData();
1748 
1749  ITERATE(vector< CRef< CUser_field > >, iter, D) {
1750  const CUser_field & f = **iter;
1751 
1752  if (f.CanGetLabel() &&
1753  f.GetLabel().IsStr() &&
1754  f.GetLabel().GetStr() == "DefLine" &&
1755  f.CanGetData() &&
1756  f.GetData().IsStr()) {
1757  fasta = NStr::Replace(f.GetData().GetStr(), "\\t", TAB_REPLACEMENT);
1758  fasta = NStr::ParseEscapes(fasta);
1759  break;
1760  }
1761  }
1762  }
1763  }
1764 
1765  }
1766  if (fasta.empty())
1767  return;
1768 
1769  // The bioseq has a field contianing the ids for the first
1770  // defline. The title string contains the title for the first
1771  // defline, plus all the other defline titles and ids. This code
1772  // unpacks them and builds a normal blast defline set.
1773 
1774  unsigned mship_i(0), links_i(0);
1775  bool used_pig(false);
1776 
1777  // Build the deflines.
1778 
1780  CRef<CBlast_def_line> defline;
1781 
1782  if (!parse_ids) {
1783 
1784  // Generate an BL_ORD_ID in case no parse is needed
1785  CRef<CSeq_id> gnl_id(new CSeq_id());
1786  gnl_id->SetGeneral().SetDb("BL_ORD_ID");
1787  gnl_id->SetGeneral().SetTag().SetId(0); // will be filled later
1788 
1789  // Build the local defline.
1790  defline.Reset(new CBlast_def_line);
1791  defline->SetSeqid().push_back(gnl_id);
1792 
1793  string title(fasta, 1, fasta.size());
1794  // Replace ^A with space
1795  NStr::ReplaceInPlace(title, "\001", " ");
1796  // Replace tabs with three spaces
1797  NStr::ReplaceInPlace(title, "\t", TAB_REPLACEMENT);
1798  defline->SetTitle(title);
1799 
1800  if (mship_i < membits.size()) {
1801  const vector<int> & V = membits[mship_i++];
1802  defline->SetMemberships().assign(V.begin(), V.end());
1803  }
1804 
1805  if (links_i < linkout.size()) {
1806  const vector<int> & V = linkout[mship_i++];
1807  defline->SetLinks().assign(V.begin(), V.end());
1808  }
1809 
1810  if ((! used_pig) && pig) {
1811  defline->SetOther_info().push_back(pig);
1812  used_pig = true;
1813  }
1814 
1815  bdls->Set().push_back(defline);
1816 
1817  } else {
1818 
1819  int skip = 1;
1820  while(fasta.size()) {
1821  size_t id_start = skip;
1822  size_t pos_title = fasta.find(" ", skip);
1823  size_t pos_next = fasta.find("\001", skip);
1824  skip = 1;
1825 
1826  if (pos_next == fasta.npos) {
1827  if (accept_gt) {
1828  pos_next = fasta.find(" >");
1829  skip = 2;
1830  }
1831  } else {
1832  // If there is a ^A, turn off GT checking.
1833  accept_gt = false;
1834  }
1835 
1836  if (pos_next == fasta.npos) {
1837  pos_next = fasta.size();
1838  skip = 0;
1839  }
1840 
1841  if (pos_title == fasta.npos || pos_title >= pos_next) {
1842  // title field is missing
1843  pos_title = pos_next;
1844  }
1845 
1846  string ids(fasta, id_start, pos_title - id_start);
1847  if (pos_title == pos_next) pos_title--;
1848  string title(fasta, pos_title + 1, pos_next-pos_title - 1);
1849  string remaining(fasta, pos_next, fasta.size() - pos_next);
1850  fasta.swap(remaining);
1851 
1852  // Parse ids. They may or may not be bar-separated.
1853  list< CRef<CSeq_id> > seqids;
1854  if (ids.find('|') != NPOS){
1855  CSeq_id::ParseFastaIds(seqids, ids);
1856  }
1857  else {
1859  if ((id->Which() == CSeq_id::e_Prf) ||
1860  (id->Which() == CSeq_id::e_Pir)){
1861  string label = id->GetSeqIdString(true);
1862  id.Reset(new CSeq_id(CSeq_id::e_Local, label));
1863  }
1864 
1865  CSeq_id::EAccessionInfo info = id->IdentifyAccession();
1867  (bioseq.IsAa() == !!(info & CSeq_id::fAcc_nuc))) {
1868  string label = id->GetSeqIdString(true);
1869  id.Reset(new CSeq_id(CSeq_id::e_Local, label));
1870  }
1871 
1872  seqids.push_back(id);
1873  }
1874 
1875  // Build the actual defline.
1876 
1877  defline.Reset(new CBlast_def_line);
1878  defline->SetSeqid().swap(seqids);
1879  defline->SetTitle(title);
1880 
1881  if (mship_i < membits.size()) {
1882  const vector<int> & V = membits[mship_i++];
1883  defline->SetMemberships().assign(V.begin(), V.end());
1884  }
1885 
1886  if (links_i < linkout.size()) {
1887  const vector<int> & V = linkout[mship_i++];
1888  defline->SetLinks().assign(V.begin(), V.end());
1889  }
1890 
1891  if ((! used_pig) && pig) {
1892  defline->SetOther_info().push_back(pig);
1893  used_pig = true;
1894  }
1895 
1896  bdls->Set().push_back(defline);
1897  }
1898  }
1899  s_CheckEmptyLists(bdls, true);
1900  deflines = bdls;
1901 }
1902 
1903 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
1904  (!defined(NCBI_COMPILER_MIPSPRO)) )
1906 {
    // x_GetMaskDataColumnId(): returns the id of the "BlastDb/MaskData"
    // column, creating the column on first use (m_MaskDataColumn == -1
    // means it has not been created yet).
1907  if (m_MaskDataColumn == -1) {
1908  m_MaskDataColumn = CreateColumn("BlastDb/MaskData", true);
1909  }
1910  return m_MaskDataColumn;
1911 }
1912 #endif
1913 
1915 
1916 
vector< TRangeWithFuzz > TRanges
Definition: Seq_loc.cpp:4277
CBioseq_Handle –.
const CSeq_id * GetNonLocalId() const
Find a non-local ID if present, consulting assembly details if all IDs for the overall sequence are l...
Definition: Bioseq.cpp:292
bool IsAa(void) const
Definition: Bioseq.cpp:350
`Blob' Class for SeqDB (and WriteDB).
Definition: seqdbblob.hpp:56
@ eSimple
Just write NUL bytes until aligned.
Definition: seqdbblob.hpp:271
void WritePadBytes(int align, EPadding fmt)
Align the offset by writing pad bytes.
Definition: seqdbblob.cpp:562
void WriteInt2(int x)
Write a 2 byte integer to the blob.
Definition: seqdbblob.cpp:313
void Clear()
Clear all owned data and reference an empty string.
Definition: seqdbblob.cpp:58
void WriteInt4(Int4 x)
Write a 4 byte integer to the blob.
Definition: seqdbblob.cpp:323
void WriteInt1(int x)
Write a 1 byte integer to the blob.
Definition: seqdbblob.cpp:303
void WriteInt4_LE(Int4 x)
Definition: seqdbblob.cpp:363
void SortBySeqIdRank(bool is_protein, bool useBlastRank=false)
Sort the deflines according to the toolkit established ranking of Seq-ids.
static Int4 GetInt4(const unsigned char *ptr)
Definition: ncbi_bswap.hpp:121
CDirEntry –.
Definition: ncbifile.hpp:262
bool IsRegistered(int algo_id) const
Verify whether the provided algorithm ID has been registered with this object.
int Add(EBlast_filter_program program, const string &options=string(), const string &progname=string())
Attempt to register the information about a masking algorithm.
This represents a set of masks for a given sequence.
Definition: writedb.hpp:65
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
static SIZE_TYPE Convert(const CTempString &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst, TCoding dst_coding)
CSeqIdException –.
Definition: Seq_id.hpp:1001
@ e_Ncbistdaa
Definition: sequtil.hpp:58
@ e_Iupacaa
Definition: sequtil.hpp:55
CSeqVector –.
Definition: seq_vector.hpp:65
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
CTime –.
Definition: ncbitime.hpp:296
CWriteDBException.
CWriteDB_GiMask class.
CMaskInfoRegistry m_MaskAlgoRegistry
Registry for masking algorithms in this database.
int CreateColumn(const string &title, bool mbo=false)
Set up a generic CWriteDB metadata column.
void x_Publish()
Flush accumulated sequence data to volume.
CSeqVector m_SeqVector
SeqVector for next sequence to write.
void SetMaskedLetters(const string &masked)
Set bases that should not be used in sequences.
void x_MaskSequence()
Replace masked input letters with m_MaskByte value.
void x_CookIds()
Collect ids for ISAM files.
void SetPig(int pig)
Set the PIG identifier of this sequence.
string m_Sequence
Sequence data in format that will be written to disk.
bool m_Protein
True if DB is protein.
vector< vector< int > > m_Memberships
Membership bits - outer vector is per-defline, inner is bits.
void x_CookColumns()
Prepare column data to be appended to disk.
void AddColumnMetaData(int col_id, const string &key, const string &value)
Add meta data to a column.
CWriteDB_Column::TColumnMeta TColumnMeta
Per-column metadata.
void ListFiles(vector< string > &files)
List Filenames.
void x_SetHaveSequence()
Records that we now have unwritten sequence data.
CRef< CWriteDB_Volume > m_Volume
This volume is currently accepting sequences.
bool x_HaveSequence() const
Returns true if we have unwritten sequence data.
void AddSequence(const CTempString &sequence, const CTempString &ambiguities)
Add a new sequence as raw sequence and ambiguity data.
static void x_GetFastaReaderDeflines(const CBioseq &bioseq, CConstRef< CBlast_def_line_set > &deflines, const vector< vector< int > > &membits, const vector< vector< int > > &linkout, int pig, bool accept_gt, bool parse_ids, bool long_seqids, bool scan_bioseq_4_cfastareader_usrobj=false)
Extract a defline set from a CFastaReader generated CBioseq.
vector< char > m_MaskLookup
Is (blast-aa) byte masked?
vector< CRef< CWriteDB_Volume > > m_VolumeList
List of all volumes so far, up to and including m_Volume.
string m_Dbname
Database base name.
string m_Date
Time stamp (for all volumes.)
void x_MakeAlias()
Flush accumulated sequence data to volume.
void x_CookHeader()
Convert header data into usable forms.
void Close()
Close the file and flush any remaining data to disk.
void SetMaxFileSize(Uint8 sz)
Set the maximum size for any file in the database.
set< TTaxId > m_TaxIds
vector< CRef< CSeq_id > > m_Ids
Ids for next sequence to write, for use during ISAM construction.
void x_CookSequence()
Convert sequence data into usable forms.
~CWriteDB_Impl()
Destructor.
EBlastDbVersion m_DbVersion
BLASTDB version.
int m_Pig
PIG to attach to headers for protein sequences.
static CRef< CBlast_def_line_set > ExtractBioseqDeflines(const CBioseq &bs, bool parse_ids, bool long_seqids, bool scan_bioseq_4_cfastareader_usrobj=false)
Extract deflines from a CBioseq.
string m_MaskedLetters
Masked protein letters (IUPAC).
EIndexType m_Indices
Indexing mode.
static void x_BuildDeflinesFromBioseq(const CBioseq &bioseq, CConstRef< CBlast_def_line_set > &deflines, const vector< vector< int > > &membits, const vector< vector< int > > &linkout, int pig)
Construct deflines from a CBioseq and other meta-data.
int x_GetMaskDataColumnId()
Get the mask data column id.
vector< TColumnMeta > m_ColumnMetas
Meta data for all columns.
CConstRef< CBlast_def_line_set > m_Deflines
Deflines to write as header.
void x_ClearHaveSequence()
Records that we no longer have unwritten sequence data.
void SetMaskData(const CMaskedRangesVector &ranges, const vector< TGi > &gis)
Set filtering data for a sequence.
vector< string > m_ColumnTitles
Column titles.
int m_MaskDataColumn
Column ID for masking data column.
int x_ComputeSeqLength()
Compute the length of the current sequence.
vector< vector< int > > m_Linkouts
Linkout bits - outer vector is per-defline, inner is bits.
void x_ResetSequenceData()
Clear sequence data from last sequence.
void SetDeflines(const CBlast_def_line_set &deflines)
This method replaces any stored header data for the current sequence with the provided CBlast_def_lin...
void ListVolumes(vector< string > &vols)
List Volumes.
static void x_SetDeflinesFromBinary(const string &bin_hdr, CConstRef< CBlast_def_line_set > &deflines)
Extract a defline set from a binary ASN.1 blob.
CRef< CWriteDB_LMDB > m_Lmdbdb
Write lmdb handle.
int FindColumn(const string &title) const
Find an existing column.
CRef< CWriteDB_TaxID > m_Taxdb
Write tax info handle.
map< int, int > m_MaskAlgoMap
Mapping from algo_id to gi-mask id.
vector< CRef< CBlastDbBlob > > m_Blobs
Blob data for the current sequence, indexed by letter.
int m_Hash
Sequence hash for this sequence.
bool m_LongSeqId
If true, use long sequence id format (database|accession) for all accessions.
static void x_GetBioseqBinaryHeader(const CBioseq &bioseq, string &binhdr)
Get binary version of deflines from 'user' data in Bioseq.
int RegisterMaskAlgorithm(EBlast_filter_program program, const string &options, const string &name="")
Register a type of filtering data found in this database.
vector< int > m_HaveBlob
List of blob columns that are active for this sequence.
string m_Ambig
Ambiguities in format that will be written to disk.
Uint8 m_MaxVolumeLetters
Max letters per volume.
void x_ComputeHash(const CTempString &sequence, const CTempString &ambiguities)
Compute the hash of a (raw) sequence.
int m_LmdbOid
Current oid to use for lmdb.
CWriteDB_Impl(const string &dbname, bool protein, const string &title, EIndexType indices, bool parse_ids, bool long_ids, bool use_gi_mask, EBlastDbVersion dbver=eBDB_Version4, bool limit_defline=false, Uint8 oid_masks=EOidMaskType::fNone, bool scan_bioseq_4_cfastareader_usrobj=false)
Constructor.
void x_CookData()
Convert and compute final data formats.
string m_BinHdr
Binary header in format that will be written to disk.
bool m_UseGiMask
Generate GI-based mask files.
bool m_ScanBioseq4CFastaReaderUsrObjct
Uint8 m_MaxFileSize
Maximum size of any file.
CBlastDbBlob & SetBlobData(int col_id)
Get a blob to use for a given column letter.
bool m_ParseIDs
Generate ISAM files.
bool m_Closed
True if database has been closed.
vector< CRef< CWriteDB_GiMask > > m_GiMasks
Gi-based masks.
int m_SeqLength
When a sequence is added, this will be populated with the length of that sequence.
bool m_HaveSequence
True if we have a sequence to write.
static void x_ExtractDeflines(CConstRef< CBioseq > &bioseq, CConstRef< CBlast_def_line_set > &deflines, string &bin_hdr, const vector< vector< int > > &membbits, const vector< vector< int > > &linkouts, int pig, set< TTaxId > &tax_ids, int OID=-1, bool parse_ids=true, bool long_seqid=false, bool limit_defline=false, bool scan_bioseq_4_cfastareader_usrobj=false)
Get deflines from a CBioseq and other meta-data.
string m_Title
Title field of database.
CConstRef< CBioseq > m_Bioseq
Bioseq object for next sequence to write.
string m_MaskByte
Byte that replaced masked letters.
string x_MakeAliasName()
Compute name of alias file produced.
void SetMaxVolumeLetters(Uint8 sz)
Set the maximum letters in one volume.
class to support searching for duplicate isam keys
CWriteDB_IsamKey(const string &fn)
bool operator<(const CWriteDB_IsamKey &other) const
CNcbiIfstream * source
bool AdvanceKey(const CWriteDB_IsamKey &other)
This class supports creation of a string accession to integer OID lmdb database.
void InsertVolumesInfo(const vector< string > &vol_names, const vector< blastdb::TOid > &vol_num_oids)
Create volume table This api should only be called once to create vol info for all vols in the db.
int InsertEntries(const list< CRef< CSeq_id >> &seqids, const blastdb::TOid oid)
Add entries in bulk as fetched from CSeqDB::GetSeqIDs.
This class supports creation of tax id list lookup files.
int InsertEntries(const set< TTaxId > &tax_ids, const blastdb::TOid oid)
Add tax id entries in bulk for each oid This api needs to be called in sequential order of OIDs This ...
CWriteDB_Volume class.
void RenameSingle()
Rename all volumes files to single-volume names.
void AddColumnMetaData(int col_id, const string &key, const string &value)
Add meta data to a column.
void RenameFileIndex(unsigned int num_digits)
const int & GetOID() const
Get the current OID of the volume.
int CreateColumn(const string &title, const TColumnMeta &meta, Uint8 max_sz, bool mbo=true)
Create a new database column.
bool WriteSequence(const string &seq, const string &ambig, const string &binhdr, const TIdList &ids, int pig, int hash, const TBlobList &blobs, int maskcol_id=-1)
Add a sequence to this volume.
void Close()
Close the volume.
const string & GetVolumeName() const
Get the name of the volume.
EIndexType
Whether and what kind of indices to build.
Definition: writedb.hpp:104
@ eAddHash
Add an index from sequence hash to OID.
Definition: writedb.hpp:126
Definition: set.hpp:45
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator begin() const
Definition: set.hpp:135
void clear()
Definition: set.hpp:153
size_type size() const
Definition: set.hpp:132
const_iterator end() const
Definition: set.hpp:136
string GetSeqIdString(const CSeq_id &id)
Definition: compartp.cpp:100
#define T(s)
Definition: common.h:230
static const char si[8][64]
Definition: des.c:146
Blast defline related defines.
#define kAsnDeflineObjLabel
#define false
Definition: bool.h:36
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define TAX_ID_FROM(T, value)
Definition: ncbimisc.hpp:1111
#define NULL
Definition: ncbistd.hpp:225
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
string GetName(void) const
Get the base entry name with extension (if any).
Definition: ncbifile.hpp:3917
const CVect2< U > & v2
Definition: globals.hpp:440
C & SerialAssign(C &dest, const C &src, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
Definition: serialbase.hpp:482
#define MSerial_AsnBinary
Definition: serialbase.hpp:697
EAccessionInfo
For IdentifyAccession (below)
Definition: Seq_id.hpp:220
static SIZE_TYPE ParseFastaIds(CBioseq::TId &ids, const CTempString &s, bool allow_partial_failure=false)
Parse an entire set of |-delimited FASTA-style IDs, appending the results to IDS.
Definition: Seq_id.cpp:2603
static int BestRank(const CRef< CSeq_id > &id)
Definition: Seq_id.hpp:774
static int BlastRank(const CRef< CSeq_id > &id)
Definition: Seq_id.hpp:782
@ fAcc_prot
Definition: Seq_id.hpp:252
@ fAcc_nuc
Definition: Seq_id.hpp:251
@ fParse_RawText
Try to ID raw non-numeric accessions.
Definition: Seq_id.hpp:81
@ fParse_ValidLocal
Treat otherwise unidentified strings as raw accessions, provided that they pass rudimentary validatio...
Definition: Seq_id.hpp:87
CConstRef< CBioseq > GetCompleteBioseq(void) const
Get the complete bioseq.
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
TSeqPos size(void) const
Definition: seq_vector.hpp:291
bool Empty(void) const THROWS_NONE
Check if CConstRef is empty – not pointing to any object which means having a null value.
Definition: ncbiobj.hpp:1385
TObjectType * GetPointer(void) const THROWS_NONE
Get pointer,.
Definition: ncbiobj.hpp:1684
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1439
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool NotEmpty(void) const THROWS_NONE
Check if CConstRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:1392
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
#define kEmptyStr
Definition: ncbistr.hpp:123
#define NPOS
Definition: ncbistr.hpp:133
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5086
static string ParseEscapes(const CTempString str, EEscSeqRange mode=eEscSeqRange_Standard, char user_char='?')
Parse C-style escape sequences in the specified string.
Definition: ncbistr.cpp:4784
const char * data(void) const
Return a pointer to the array represented.
Definition: tempstr.hpp:313
static string & Replace(const string &src, const string &search, const string &replace, string &dst, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3305
static Uint8 StringToUInt8(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to Uint8.
Definition: ncbistr.cpp:871
size_type length(void) const
Return the length of the represented array.
Definition: tempstr.hpp:320
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3396
size_type size(void) const
Return the length of the represented array.
Definition: tempstr.hpp:327
string AsString(const CTimeFormat &format=kEmptyStr, TSeconds out_tz=eCurrentTimeZone) const
Transform time to string.
Definition: ncbitime.cpp:1512
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
C::value_type FindBestChoice(const C &container, F score_func)
Find the best choice (lowest score) for values in a container.
Definition: ncbiutil.hpp:250
static const char label[]
const TOrg & GetOrg(void) const
Get the Org member data.
Definition: BioSource_.hpp:509
EBlast_filter_program
This defines the possible sequence filtering algorithms to be used in a BLAST database.
Tdata & Set(void)
Assign a value to data member.
bool CanGet(void) const
Check if it is safe to call Get method.
const Tdata & Get(void) const
Get the member data.
@ eBlast_filter_program_other
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
bool CanGetType(void) const
Check if it is safe to call GetType method.
void SetTag(TTag &value)
Assign a value to Tag data member.
Definition: Dbtag_.cpp:66
bool IsId(void) const
Check if variant Id is selected.
Definition: Object_id_.hpp:264
bool CanGetData(void) const
Check if it is safe to call GetData method.
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
const TData & GetData(void) const
Get the Data member data.
const TType & GetType(void) const
Get the Type member data.
void SetDb(const TDb &value)
Assign a value to Db data member.
Definition: Dbtag_.hpp:229
TId GetId(void) const
Get the variant data.
Definition: Object_id_.hpp:270
const TDb & GetDb(void) const
Get the Db member data.
Definition: Org_ref_.hpp:491
bool CanGetDb(void) const
Check if it is safe to call GetDb method.
Definition: Org_ref_.hpp:485
TGeneral & SetGeneral(void)
Select the variant.
Definition: Seq_id_.cpp:375
bool IsPrf(void) const
Check if variant Prf is selected.
Definition: Seq_id_.hpp:916
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_id_.hpp:746
bool IsLocal(void) const
Check if variant Local is selected.
Definition: Seq_id_.hpp:775
bool IsPir(void) const
Check if variant Pir is selected.
Definition: Seq_id_.hpp:853
@ e_Prf
PRF SEQDB.
Definition: Seq_id_.hpp:108
@ e_Local
local use
Definition: Seq_id_.hpp:95
const TUser & GetUser(void) const
Get the variant data.
Definition: Seqdesc_.cpp:384
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
bool IsOrg(void) const
Check if variant Org is selected.
Definition: Seqdesc_.hpp:1046
bool CanGetMol(void) const
Check if it is safe to call GetMol method.
Definition: Seq_inst_.hpp:599
const TTitle & GetTitle(void) const
Get the variant data.
Definition: Seqdesc_.hpp:1032
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
bool IsSource(void) const
Check if variant Source is selected.
Definition: Seqdesc_.hpp:1190
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
bool CanGetDescr(void) const
Check if it is safe to call GetDescr method.
Definition: Bioseq_.hpp:309
TLength GetLength(void) const
Get the Length member data.
Definition: Seq_inst_.hpp:659
const TOrg & GetOrg(void) const
Get the variant data.
Definition: Seqdesc_.cpp:240
bool CanGetId(void) const
Check if it is safe to call GetId method.
Definition: Bioseq_.hpp:284
bool CanGetSeq_data(void) const
Check if it is safe to call GetSeq_data method.
Definition: Seq_inst_.hpp:811
bool IsTitle(void) const
Check if variant Title is selected.
Definition: Seqdesc_.hpp:1026
const TDescr & GetDescr(void) const
Get the Descr member data.
Definition: Bioseq_.hpp:315
bool CanGetInst(void) const
Check if it is safe to call GetInst method.
Definition: Bioseq_.hpp:330
bool IsUser(void) const
Check if variant User is selected.
Definition: Seqdesc_.hpp:1122
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_data_.hpp:475
@ e_Ncbieaa
extended ASCII 1 letter aa codes
Definition: Seq_data_.hpp:111
@ e_Ncbistdaa
consecutive codes for std aas
Definition: Seq_data_.hpp:113
@ e_Ncbi2na
2 bit nucleic acid code
Definition: Seq_data_.hpp:106
@ e_Iupacna
IUPAC 1 letter nuc acid code.
Definition: Seq_data_.hpp:104
@ e_Ncbi4na
4 bit nucleic acid code
Definition: Seq_data_.hpp:107
@ e_Iupacaa
IUPAC 1 letter amino acid code.
Definition: Seq_data_.hpp:105
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:6929
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
static MDB_envinfo info
Definition: mdb_load.c:37
constexpr bool empty(list< Ts... >) noexcept
const struct ncbi::grid::netcache::search::fields::KEY key
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
const CharType(& source)[N]
Definition: pointer.h:1149
unsigned int a
Definition: ncbi_localip.c:102
EIPRangeType t
Definition: ncbi_localip.c:101
T max(T x_, T y_)
T log10(T x_)
std::istream & in(std::istream &in_, double &x_)
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
double df(double x_, const double &y_)
Definition: njn_root.hpp:189
#define count
@ eTaxId2Offsets
Definition: seqdb_lmdb.hpp:125
string BuildLMDBFileName(const string &basename, bool is_protein, bool use_index=false, unsigned int index=0)
Build the canonical LMDB file name for BLAST databases.
Definition: seqdb_lmdb.cpp:749
string GetFileNameFromExistingLMDBFile(const string &lmdb_filename, ELMDBFileType file_type)
Definition: seqdb_lmdb.cpp:763
Defines exception class and several constants for SeqDB.
unsigned SeqDB_SequenceHash(const char *sequence, int length)
Computes a hash value for the given sequence data.
Definition: seqdbobj.cpp:146
EBlastDbVersion
BLAST database version.
Definition: seqdbcommon.hpp:51
@ eBDB_Version5
Definition: seqdbcommon.hpp:53
Defines the 'expert' version of the CSeqDB interfaces.
void SeqDB_UnpackAmbiguities(const CTempString &sequence, const CTempString &ambiguities, string &result)
Unpack an ambiguous nucleotide sequence.
Definition: seqdbvol.cpp:1698
#define D(d)
static const sljit_gpr r1
static SLJIT_INLINE sljit_ins l(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
static SLJIT_INLINE sljit_ins msg(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
static const sljit_gpr r2
Comparison function for set<CWriteDB_IsamKey<T> *>
bool operator()(const CWriteDB_IsamKey< T > *lhs, const CWriteDB_IsamKey< T > *rhs) const
#define _ASSERT
done
Definition: token1.c:1
static bool ambig(char c)
Data conversion tools for CWriteDB and associated code.
void WriteDB_Ncbi2naToBinary(const CSeq_inst &si, string &seq)
Build blast db nucleotide format from Ncbi2na Seq-inst.
void WriteDB_EaaToBinary(const CSeq_inst &si, string &seq)
Build blast db protein format from Eaa protein Seq-inst.
void WriteDB_IupacaaToBinary(const CSeq_inst &si, string &seq)
Build blast db protein format from Iupacaa protein Seq-inst.
void WriteDB_Ncbi4naToBinary(const CSeq_inst &seqinst, string &seq, string &amb)
Build blast db nucleotide format from Ncbi4na Seq-inst.
void WriteDB_StdaaToBinary(const CSeq_inst &si, string &seq)
Build blast db protein format from Stdaa protein Seq-inst.
void WriteDB_IupacnaToBinary(const CSeq_inst &si, string &seq, string &amb)
Build blast db nucleotide format from Iupacna Seq-inst.
Defines exception class for WriteDB.
int WriteDB_FindSequenceLength(bool protein, const string &seq)
Compute length of sequence from raw packing.
static bool s_UseFastaReaderDeflines(CConstRef< CBioseq > &bioseq, CConstRef< CBlast_def_line_set > &deflines, bool long_seqid)
#define CHAR_BUFFER_SIZE
static CRef< CBlast_def_line_set > s_EditDeflineSet(CConstRef< CBlast_def_line_set > &deflines)
#define TAB_REPLACEMENT
static void s_CheckEmptyLists(CRef< CBlast_def_line_set > &deflines, bool owner)
static const string s_EscapeColon(const string &in)
#define INT4_SIZE
int s_AbsMax(int a, int b)
USING_SCOPE(std)
Import C++ std namespace.
static void s_CheckDuplicateIds(set< CWriteDB_IsamKey< T > *, CWriteDB_IsamKey_Compare< T > > &keys)
Check for duplicate ids across volumes.
void s_LimitDeflines(CConstRef< CBlast_def_line_set > &dfs)
Defines implementation class of WriteDB.
Modified on Wed Sep 04 15:01:45 2024 by modify_doxy.py rev. 669887