33 #include <ncbi_pch.hpp>
40 #include <objects/blastdb/defline_extra.hpp> // for kAsnDeflineObjLabel
41 #include <serial/typeinfo.hpp>
42 #include <corelib/ncbi_bswap.hpp>
44 #include "writedb_impl.hpp"
47 #include <iostream>
48 #include <sstream>
49 #include <cmath>
53 /// Import C++ std namespace.
57  bool protein,
58  const string & title,
59  EIndexType indices,
60  bool parse_ids,
61  bool long_ids,
62  bool use_gi_mask,
63  EBlastDbVersion dbver,
64  bool limit_defline,
65  Uint8 oid_masks,
66  bool scan_bioseq_4_cfastareader_usrobj)
67  : m_Dbname (dbname),
68  m_Protein (protein),
69  m_Title (title),
70  m_MaxFileSize (0),
71  m_MaxVolumeLetters (0),
72  m_Indices (indices),
73  m_Closed (false),
74  m_MaskDataColumn (-1),
75  m_ParseIDs (parse_ids),
76  m_UseGiMask (use_gi_mask),
77  m_DbVersion (dbver),
78  m_Pig (0),
79  m_Hash (0),
80  m_SeqLength (0),
81  m_HaveSequence (false),
82  m_LongSeqId (long_ids),
83  m_LmdbOid (0),
84  m_limitDefline (protein? limit_defline: false),
85  m_OidMasks (oid_masks),
86  m_ScanBioseq4CFastaReaderUsrObjct(scan_bioseq_4_cfastareader_usrobj)
87 {
90  m_Date = now.AsString("b d, Y ");
91  string t = now.AsString("H:m P");
93  if (t[0] == '0') {
94  t.assign(t, 1, t.size() - 1);
95  }
97  m_Date += t;
98 }
101 {
102  try {
103  Close();
104  } catch (const CWriteDBException& e) {
105  ERR_POST(Error << "BLAST Database creation error: " << e.GetMsg());
106  }
108 }
111 {
112  m_Bioseq.Reset();
114  m_Deflines.Reset();
115  m_Ids.clear();
116  m_Linkouts.clear();
117  m_Memberships.clear();
118  m_Pig = 0;
119  m_Hash = 0;
120  m_SeqLength = 0;
122  m_Sequence.erase();
123  m_Ambig.erase();
124  m_BinHdr.erase();
126  m_TaxIds.clear();
128  NON_CONST_ITERATE(vector<int>, iter, m_HaveBlob) {
129  *iter = 0;
130  }
131 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
132  (!defined(NCBI_COMPILER_MIPSPRO)) )
133  NON_CONST_ITERATE(vector< CRef<CBlastDbBlob> >, iter, m_Blobs) {
134  (**iter).Clear();
135  }
136 #endif
137 }
140  const CTempString & ambig)
141 {
142  // Publish previous sequence (if any)
143  x_Publish();
145  // Blank slate for new sequence.
148  m_Sequence.assign(, seq.length());
149  m_Ambig.assign(, ambig.length());
152  x_ComputeHash(seq, ambig);
153  }
156 }
159 {
160  // Publish previous sequence
161  x_Publish();
163  // Blank slate for new sequence.
166  m_Bioseq.Reset(& bs);
167  if (m_Bioseq->GetInst().CanGetMol() && (m_Bioseq->IsAa() != m_Protein)) {
169  msg << "Invalid molecule type of sequence added ("
170  << (m_Bioseq->IsAa() ? "protein" : "nucleotide")
171  << "); expected " << (m_Protein ? "protein" : "nucleotide");
173  }
176  x_ComputeHash(bs);
177  }
180 }
183 {
184  AddSequence(bs);
185  m_SeqVector = sv;
186 }
189 {
190  CSeqVector sv(bsh);
191  AddSequence(*bsh.GetCompleteBioseq(), sv);
192 }
195 /// class to support searching for duplicate isam keys
196 template <class T>
199  public:
200  // data member
204  // constructor
205  CWriteDB_IsamKey(const string &fn) {
206  source = new CNcbiIfstream(fn.c_str(),
207  IOS_BASE::in | IOS_BASE::binary);
208  key = x_GetNextKey();
209  };
212  delete source;
213  };
215  // advance key to catch up other
216  bool AdvanceKey(const CWriteDB_IsamKey & other) {
217  while (!source->eof()) {
218  T next_key = x_GetNextKey();
219  if (next_key >= other.key) {
220  key = next_key;
221  return true;
222  }
223  }
224  return false;
225  };
227  // less_than, used for sorting
228  bool operator <(const CWriteDB_IsamKey &other) const {
229  return (key < other.key);
230  };
232  private:
233  // read in the next key, for numeric id
235 #define INT4_SIZE 4
236  char s[INT4_SIZE] = { '\0' };
237  source->read(s, INT4_SIZE);
238  if ((source->gcount() != INT4_SIZE) || source->eof()) {
239  return T();
240  }
241  source->seekg(INT4_SIZE, ios_base::cur);
243  Int4 next_key = (Int4) *((Int4 *) s);
244 #else
245  Int4 next_key = CByteSwap::GetInt4((const unsigned char *)s);
246 #endif
247  return next_key;
248  };
249 };
251 // customized string file reading
252 template <> inline string
254 #define CHAR_BUFFER_SIZE 256
255  char s[CHAR_BUFFER_SIZE] = { '\0' };
256  source->getline(s, CHAR_BUFFER_SIZE);
257  if ((source->gcount() == 0) || source->eof()) {
258  return kEmptyStr;
259  }
260  char * p = s;
261  while (*p != 0x02) ++p;
262  string in(s, p);
264  // check if the current key is PDB-like,
265  // if so, advance for the next
266  // PDB key must be [0-9]...
267  if ( (in.size() == 4)
268  && ((in[0] - '0') * (in[0] - '9') <= 0) ) {
270  // probing the next key to make sure this is pdb id
271  char next_token[4];
272  source->read(next_token, 4);
273  source->seekg(-4, ios_base::cur);
274  string next_key(next_token, 4);
276  if (next_key == in) {
277  // automatically advance to next key
278  return x_GetNextKey();
279  }
280  }
281  return in;
282 };
284 /// Comparison function for set<CWriteDB_IsamKey<T> *>
285 template <class T>
288  const CWriteDB_IsamKey<T> * rhs) const {
289  return (*lhs < *rhs);
290  }
291 };
293 /// Check for duplicate ids across volumes
294 template <class T>
296  CWriteDB_IsamKey_Compare<T> > & keys) {
297  while (!keys.empty()) {
298  // pick the smallest key
299  CWriteDB_IsamKey<T> * key = *(keys.begin());
301  keys.erase(key);
303  if (keys.empty()) {
304  delete key;
305  return;
306  }
308  const CWriteDB_IsamKey<T> * next = *(keys.begin());
309  if (key->AdvanceKey(*next)) {
310  if (keys.find(key) != keys.end()) {
312  msg << "Error: Duplicate seq_id <"
313  << key->key
314  << "> is found multiple times across volumes.";
316  }
317  keys.insert(key);
318  } else {
319  delete key;
320  }
321  }
322 };
325 {
326  if (m_Closed)
327  return;
329  m_Closed = true;
331  x_Publish();
332  m_Sequence.erase();
333  m_Ambig.erase();
335  if (! m_Volume.Empty()) {
336  m_Volume->Close();
338  if (m_UseGiMask) {
339  for (unsigned int i=0; i<m_GiMasks.size(); ++i) {
340  m_GiMasks[i]->Close();
341  }
342  }
344  if (m_VolumeList.size() == 1) {
346  }
347  else if(m_VolumeList.size() > 100){
348  unsigned int num_digits = log10(m_VolumeList.size()) +1;
349  for(unsigned i=0; i < m_VolumeList.size(); i++) {
351  v->RenameFileIndex(num_digits);
352  }
353  LOG_POST(Info << "Rename files index to " << num_digits << " digits");
354  }
356  // disable the check for duplicate ids across volumes
357  /*
358  else if (m_Indices != CWriteDB::eNoIndex) {
359  set<CWriteDB_IsamKey<string> *, CWriteDB_IsamKey_Compare<string> > sids;
360  ITERATE(vector< CRef<CWriteDB_Volume> >, iter, m_VolumeList) {
361  string fn = (*iter)->GetVolumeName() + (m_Protein ? ".psd" : ".nsd");
362  if (CFile(fn).Exists()) {
363  sids.insert(new CWriteDB_IsamKey<string>(fn));
364  }
365  }
366  s_CheckDuplicateIds(sids);
368  set<CWriteDB_IsamKey<Int4> *, CWriteDB_IsamKey_Compare<Int4> > nids;
369  ITERATE(vector< CRef<CWriteDB_Volume> >, iter, m_VolumeList) {
370  string fn = (*iter)->GetVolumeName() + (m_Protein ? ".pnd" : ".nnd");
371  if (CFile(fn).Exists()) {
372  nids.insert(new CWriteDB_IsamKey<Int4>(fn));
373  }
374  }
375  s_CheckDuplicateIds(nids);
376  } */
378  if (m_VolumeList.size() > 1 || m_UseGiMask) {
379  x_MakeAlias();
380  }
381  if ((m_DbVersion == eBDB_Version5) && m_Lmdbdb) {
382  vector<string> vol_names(m_VolumeList.size());
383  vector<blastdb::TOid> vol_num_oids(m_VolumeList.size());
384  for(unsigned i=0; i < m_VolumeList.size(); i++) {
386  vol_names[i] = CDirEntry(v->GetVolumeName()).GetName();
387  vol_num_oids[i] = v->GetOID();
388  }
389  m_Lmdbdb->InsertVolumesInfo(vol_names, vol_num_oids);
390  m_Lmdbdb.Reset();
391  m_Taxdb.Reset();
392  }
394  m_Volume.Reset();
395  }
396 }
399 {
400  return m_Dbname + (m_Protein ? ".pal" : ".nal");
401 }
404 {
405  string dblist;
406  if (m_VolumeList.size() > 1) {
407  for(unsigned i = 0; i < m_VolumeList.size(); i++) {
408  if (dblist.size())
409  dblist += " ";
412  dblist += CDirEntry(v->GetVolumeName()).GetName();
413  }
414  } else {
415  dblist = m_Dbname;
416  }
418  string masklist("");
419  if (m_UseGiMask) {
420  for (unsigned i = 0; i < m_GiMasks.size(); i++) {
421  const string & x = m_GiMasks[i]->GetName();
422  if (x != "") {
423  masklist += x + " ";
424  }
425  }
426  }
428  string nm = x_MakeAliasName();
430  ofstream alias(nm.c_str());
432  alias << "#\n# Alias file created: " << m_Date << "\n#\n"
433  << "TITLE " << m_Title << "\n"
434  << "DBLIST " << dblist << "\n";
436  if (masklist != "") {
437  alias << "MASKLIST " << masklist << "\n";
438  }
439 }
442  string & bin_hdr)
443 {
444  if (! bin_hdr.empty()) {
445  return;
446  }
448  if (! bioseq.CanGetDescr()) {
449  return;
450  }
452  // Getting the binary headers, when they exist, is probably faster
453  // than building new deflines from the 'visible' CBioseq parts.
455  vector< vector< char >* > bindata;
457  ITERATE(list< CRef< CSeqdesc > >, iter, bioseq.GetDescr().Get()) {
458  if ((**iter).IsUser()) {
459  const CUser_object & uo = (**iter).GetUser();
460  const CObject_id & oi = uo.GetType();
462  if (oi.IsStr() && oi.GetStr() == kAsnDeflineObjLabel) {
463  if (uo.CanGetData()) {
464  const vector< CRef< CUser_field > > & D = uo.GetData();
466  if (D.size() &&
467  D[0].NotEmpty() &&
468  D[0]->CanGetLabel() &&
469  D[0]->GetLabel().IsStr() &&
470  D[0]->GetLabel().GetStr() == kAsnDeflineObjLabel &&
471  D[0]->CanGetData() &&
472  D[0]->GetData().IsOss()) {
474  bindata = D[0]->GetData().GetOss();
475  break;
476  }
477  }
478  }
479  }
480  }
482  if (! bindata.empty()) {
483  if (bindata[0] && (! bindata[0]->empty())) {
484  vector<char> & b = *bindata[0];
486  bin_hdr.assign(& b[0], b.size());
487  }
488  }
489 }
491 void
493 {
494  static const int kGenBankLimit = 5;
495  static const int kGenBankScore = 500;
496  if (dfs->Get().size() <= kGenBankLimit){
497  return;
498  }
500  CBlast_def_line_set * deflines = const_cast<CBlast_def_line_set*>(dfs.GetPointer());
501  deflines->SortBySeqIdRank(true, true);
502  list<CRef<CBlast_def_line> > & df_set= deflines->Set();
504  if(FindBestChoice(df_set.front()->GetSeqid(), CSeq_id::BlastRank)->IsLocal()){
505  return;
506  }
507  string id =FindBestChoice(df_set.front()->GetSeqid(), CSeq_id::BlastRank)->AsFastaString();
508  CBlast_def_line::TTaxIds tax_ids;
509  CBlast_def_line_set::Tdata::iterator itr=df_set.begin();
510  int gb_count = 0;
511  list<CRef<CBlast_def_line> > tmp_gb_list;
512  while (itr != df_set.end()){
513  CBlast_def_line & df= **itr;
514  int score = CSeq_id::BlastRank(FindBestChoice(df.GetSeqid(), CSeq_id::BlastRank));
515  CBlast_def_line::TTaxIds df_taxids= df.GetTaxIds();
516  if (score >= kGenBankScore){
517  size_t orig_size = tax_ids.size();
518  tax_ids.insert(df_taxids.begin(), df_taxids.end());
519  if (orig_size == tax_ids.size()){
520  if(gb_count < 5){
521  list<CRef<CBlast_def_line> >::iterator tmp_itr = itr;
522  itr++;
523  tmp_gb_list.splice(tmp_gb_list.end(), df_set, tmp_itr);
524  }
525  else {
526  itr = df_set.erase(itr);
527  }
528  continue;
529  }
530  else {
531  gb_count ++;
532  }
533  }
534  else {
535  tax_ids.insert(df_taxids.begin(), df_taxids.end());
536  }
537  itr++;
538  }
540  while ((gb_count < kGenBankLimit) && (tmp_gb_list.size() > 0)){
541  df_set.splice(df_set.end(), tmp_gb_list, tmp_gb_list.begin());
542  gb_count++;
543  }
544  tmp_gb_list.clear();
545 }
548 static void
549 s_CheckEmptyLists(CRef<CBlast_def_line_set> & deflines, bool owner);
553 {
555  SerialAssign(*bdls, *deflines);
556  s_CheckEmptyLists(bdls, true);
557  return bdls;
558 }
560 static void
562 {
563  CBlast_def_line_set * bdls = 0;
564  CConstRef<CBlast_def_line_set> here(&*deflines);
566  if (! owner) {
567  here = s_EditDeflineSet(here);
568  return;
569  }
571  bdls = const_cast<CBlast_def_line_set*>(here.GetPointer());
573  NON_CONST_ITERATE(list< CRef< CBlast_def_line > >, iter, bdls->Set()) {
574  CRef<CBlast_def_line> defline = *iter;
575  if (defline->CanGetMemberships() &&
576  defline->GetMemberships().size() == 0) {
578  defline->ResetMemberships();
579  }
581  if (defline->CanGetLinks() &&
582  defline->GetLinks().size() == 0) {
584  defline->ResetLinks();
585  }
586  }
588  deflines.Reset(bdls);
589 }
591 void
594  const vector< vector<int> > & membbits,
595  const vector< vector<int> > & linkouts,
596  int pig)
597 {
598  if (! (bioseq.CanGetDescr() && bioseq.CanGetId())) {
599  return;
600  }
602  vector<TTaxId> taxids;
603  string titles;
605  // Scan the CBioseq for taxids and the title string.
607  ITERATE(list< CRef< CSeqdesc > >, iter, bioseq.GetDescr().Get()) {
608  const CSeqdesc & desc = **iter;
610  if (desc.IsTitle()) {
611  //defline->SetTitle((**iter)->GetTitle());
612  titles = (**iter).GetTitle();
613  }
614  else {
615  const COrg_ref * org_pt = NULL;
616  if (desc.IsSource()) {
617  org_pt = &(desc.GetSource().GetOrg());
618  }
619  else if( desc.IsOrg()) {
620  org_pt = &(desc.GetOrg());
621  }
623  if((NULL != org_pt) && org_pt->CanGetDb()) {
624  ITERATE(vector< CRef< CDbtag > >,
625  dbiter,
626  org_pt->GetDb()) {
628  if ((**dbiter).CanGetDb() &&
629  (**dbiter).GetDb() == "taxon") {
631  const CObject_id & oi = (**dbiter).GetTag();
633  if (oi.IsId()) {
634  //defline->SetTaxid(oi.GetId());
635  taxids.push_back(TAX_ID_FROM(CObject_id::TId, oi.GetId()));
636  }
637  }
638  }
639  }
640  }
641  }
643  // The bioseq has a field containing the ids for the first
644  // defline. The title string contains the title for the first
645  // defline, plus all the other defline titles and ids. This code
646  // unpacks them and builds a normal blast defline set.
648  list< CRef<CSeq_id> > ids = bioseq.GetId();
650  unsigned taxid_i(0), mship_i(0), links_i(0);
651  bool used_pig(false);
653  // Build the deflines.
656  CRef<CBlast_def_line> defline;
658  while(! ids.empty()) {
659  defline.Reset(new CBlast_def_line);
661  defline->SetSeqid() = ids;
662  ids.clear();
664  /*
665  size_t pos = titles.find(" >");
666  string T;
668  if (pos != titles.npos) {
669  T.assign(titles, 0, pos);
670  titles.erase(0, pos + 2);
672  pos = titles.find(" ");
673  string nextid;
675  if (pos != titles.npos) {
676  nextid.assign(titles, 0, pos);
677  titles.erase(0, pos + 1);
678  } else {
679  nextid.swap(titles);
680  }
682  // Parse '|' seperated ids.
683  if ( nextid.find('|') == NPOS
684  || !isalpha((unsigned char)(nextid[0]))) {
685  ids.push_back(CRef<CSeq_id> (new CSeq_id(CSeq_id::e_Local, nextid)));
686  } else {
687  CSeq_id::ParseFastaIds(ids, nextid);
688  }
689  } else {
690  T = titles;
691  }
693  */
694  defline->SetTitle(titles);
696  if (taxid_i < taxids.size()) {
697  defline->SetTaxid(taxids[taxid_i++]);
698  }
700  if (mship_i < membbits.size()) {
701  const vector<int> & V = membbits[mship_i++];
702  defline->SetMemberships().assign(V.begin(), V.end());
703  }
705  if (links_i < linkouts.size()) {
706  const vector<int> & V = linkouts[mship_i++];
707  defline->SetLinks().assign(V.begin(), V.end());
708  }
710  if ((! used_pig) && pig) {
711  defline->SetOther_info().push_back(pig);
712  used_pig = true;
713  }
715  bdls->Set().push_back(defline);
716  }
718  s_CheckEmptyLists(bdls, true);
719  deflines = bdls;
720 }
723 x_SetDeflinesFromBinary(const string & bin_hdr,
725 {
728  istringstream iss(bin_hdr);
729  iss >> MSerial_AsnBinary >> *bdls;
731  s_CheckEmptyLists(bdls, true);
732  deflines.Reset(&* bdls);
733 }
736 static bool s_UseFastaReaderDeflines(CConstRef<CBioseq> & bioseq, CConstRef<CBlast_def_line_set> & deflines, bool long_seqid)
737 {
738  if(deflines.Empty())
739  return false;
741  const CSeq_id * bioseq_id = bioseq->GetNonLocalId();
743  if(bioseq_id == NULL ||
744  // For bare pir and prf ids go with the one from defline.
745  // This is to parse bare ids as local ones. The bare pdb ids are pdb in
746  // bioseq (parsed by CFastaReader), but local in deflines (parsed by
747  // CSeq_id).
748  (!long_seqid && (bioseq_id->IsPrf() || bioseq_id->IsPir()))) {
749  return true;
750  }
752  // Bioseq has non-local id, make sure at least one id is non-local from CFastaReader
753  // defline
754  ITERATE(list< CRef<CBlast_def_line> >, iter, deflines->Get()) {
755  CRef<CSeq_id> id = FindBestChoice((**iter).GetSeqid(), &CSeq_id::BestRank);
756  if (id.NotEmpty() && !id->IsLocal()) {
757  return true;
758  }
759  }
760  return false;
762 }
764 void
767  string & bin_hdr,
768  const vector< vector<int> > & membbits,
769  const vector< vector<int> > & linkouts,
770  int pig,
771  set<TTaxId> & tax_ids,
772  int OID,
773  bool parse_ids,
774  bool long_ids,
775  bool limit_defline,
776  bool scan_bioseq_4_cfastareader_usrobj)
777 {
778  bool use_bin = (deflines.Empty() && pig == 0);
780  if (! bin_hdr.empty() && OID<0) {
781  return;
782  }
784  if (deflines.Empty()) {
785  // Use bioseq if deflines are not provided.
787  if (bioseq.Empty()) {
789  eArgErr,
790  "Error: Cannot find CBioseq or deflines.");
791  }
793  // CBioseq objects from SeqDB have binary headers embedded in
794  // them. If these are found, we try to use them. However,
795  // using binary headers may not help us much if we also want
796  // lists of sequence identifiers (for building ISAM files).
798  if (use_bin) {
799  x_GetBioseqBinaryHeader(*bioseq, bin_hdr);
800  }
802  if (bin_hdr.empty()) {
803  try {
804  x_GetFastaReaderDeflines(*bioseq,
805  deflines,
806  membbits,
807  linkouts,
808  pig,
809  false,
810  parse_ids,
811  long_ids,
812  scan_bioseq_4_cfastareader_usrobj);
813  } catch ( const CSeqIdException&e ) {
814  //LOG_POST(Info << "x_GetFastaReaderDeflines " << e.GetMsg() );
815  }
816  }
818  if(!s_UseFastaReaderDeflines(bioseq, deflines, long_ids)) {
819  deflines.Reset();
820  }
822  if (bin_hdr.empty() && deflines.Empty()) {
824  deflines,
825  membbits,
826  linkouts,
827  pig);
828  }
829  }
831  if (bin_hdr.empty() &&
832  (deflines.Empty() || deflines->Get().empty())) {
835  eArgErr,
836  "Error: No deflines provided.");
837  }
839  if (pig != 0) {
840  const list<int> * L = 0;
842  if (deflines->Get().front()->CanGetOther_info()) {
843  L = & deflines->Get().front()->GetOther_info();
844  }
846  // If the pig does not agree with the current value, set the
847  // new value and force a rebuild of the binary headers. If
848  // there is more than one value in the list, leave the others
849  // in place.
851  if ((L == 0) || L->empty()) {
853  bdls->Set().front()->SetOther_info().push_back(pig);
855  deflines.Reset(&* bdls);
856  bin_hdr.erase();
857  } else if (L->front() != pig) {
859  bdls->Set().front()->SetOther_info().front() = pig;
861  deflines.Reset(&* bdls);
862  bin_hdr.erase();
863  }
864  }
866  if (OID>=0) {
867  // Re-inject the BL_ORD_ID
868  CRef<CSeq_id> gnl_id(new CSeq_id);
869  gnl_id->SetGeneral().SetDb("BL_ORD_ID");
870  gnl_id->SetGeneral().SetTag().SetId(OID);
872  bdls->Set().front()->SetSeqid().front() = gnl_id;
874  deflines.Reset(&* bdls);
875  }
877  if (deflines.Empty() && (! bin_hdr.empty())) {
878  // Uncompress the deflines from binary.
879  x_SetDeflinesFromBinary(bin_hdr, deflines);
880  }
882  if (limit_defline) {
883  s_LimitDeflines(deflines);
884  bin_hdr.clear();
885  }
887  if (bin_hdr.empty() || OID>=0) {
888  // Compress the deflines to binary.
890  CNcbiOstrstream oss;
891  oss << MSerial_AsnBinary << *deflines;
892  bin_hdr = CNcbiOstrstreamToString(oss);
893  }
895  if ((! deflines.Empty()) && deflines->CanGet()) {
896  ITERATE(list< CRef<CBlast_def_line> >, defline, deflines->Get()) {
897  CBlast_def_line::TTaxIds taxid_set = (*defline)->GetTaxIds();
898  tax_ids.insert(taxid_set.begin(), taxid_set.end());
899  }
900  }
901 }
904 {
905  int OID = -1;
906  if (! m_ParseIDs) {
907  OID = (m_Volume ) ? m_Volume->GetOID() : 0;
908  }
910  m_Deflines,
911  m_BinHdr,
913  m_Linkouts,
914  m_Pig,
915  m_TaxIds,
916  OID,
917  m_ParseIDs,
918  m_LongSeqId,
922  x_CookIds();
923 }
926 {
927  if (! m_Ids.empty()) {
928  return;
929  }
931  if (m_Deflines.Empty()) {
932  if (m_BinHdr.empty()) {
934  eArgErr,
935  "Error: Cannot find IDs or deflines.");
936  }
939  }
941  ITERATE(list< CRef<CBlast_def_line> >, iter, m_Deflines->Get()) {
942  const list< CRef<CSeq_id> > & ids = (**iter).GetSeqid();
943  // m_Ids.insert(m_Ids.end(), ids.begin(), ids.end());
944  // Spelled out for WorkShop. :-/
945  // ID-6757 : STL containers have efficient internal memory maintenance,
946  // the following line is, on the contrary, very inefficient.
947  // m_Ids.reserve(m_Ids.size() + ids.size());
948  ITERATE (list<CRef<CSeq_id> >, it, ids) {
949  m_Ids.push_back(*it);
950  }
951  }
952 }
955 {
956  // Scan and mask the sequence itself.
957  for(unsigned i = 0; i < m_Sequence.size(); i++) {
958  if (m_MaskLookup[m_Sequence[i] & 0xFF] != 0) {
959  m_Sequence[i] = m_MaskByte[0];
960  }
961  }
962 }
965 {
966  if (! m_SeqLength) {
967  if (! m_Sequence.empty()) {
969  } else if (m_SeqVector.size()) {
971  } else if (! (m_Bioseq &&
972  m_Bioseq->CanGetInst() &&
973  m_Bioseq->GetInst().GetLength())) {
976  eArgErr,
977  "Need sequence data.");
978  }
980  if (m_Bioseq.NotEmpty()) {
981  const CSeq_inst & si = m_Bioseq->GetInst();
982  m_SeqLength = si.GetLength();
983  }
984  }
986  return m_SeqLength;
987 }
990 {
991  if (! m_Sequence.empty())
992  return;
994  if (! (m_Bioseq.NotEmpty() && m_Bioseq->CanGetInst())) {
996  eArgErr,
997  "Need sequence data.");
998  }
1000  const CSeq_inst & si = m_Bioseq->GetInst();
1002  if (m_Bioseq->GetInst().CanGetSeq_data()) {
1003  const CSeq_data & sd = si.GetSeq_data();
1005  string msg;
1007  switch(sd.Which()) {
1010  break;
1012  case CSeq_data::e_Ncbieaa:
1014  break;
1016  case CSeq_data::e_Iupacaa:
1018  break;
1020  case CSeq_data::e_Ncbi2na:
1022  break;
1024  case CSeq_data::e_Ncbi4na:
1026  break;
1028  case CSeq_data::e_Iupacna:
1030  break;
1032  default:
1033  msg = "Unable to process sequence for entry [";
1034  msg += (m_Bioseq->GetId().front())->GetSeqIdString(false);
1035  msg += "].";
1036  }
1038  if (! msg.empty()) {
1039  NCBI_THROW(CWriteDBException, eArgErr, msg);
1040  }
1041  } else {
1042  int sz = m_SeqVector.size();
1044  if (sz == 0) {
1046  eArgErr,
1047  "No sequence data in Bioseq, "
1048  "and no Bioseq_Handle available.");
1049  }
1051  if (m_Protein) {
1052  // I add one to the string length to allow the "i+1" in
1053  // the loop to be done safely.
1055  m_Sequence.reserve(sz);
1057  } else {
1058  // I add one to the string length to allow the "i+1" in the
1059  // loop to be done safely.
1061  string na8;
1062  na8.reserve(sz + 1);
1063  m_SeqVector.GetSeqData(0, sz, na8);
1064  na8.resize(sz + 1);
1066  string na4;
1067  na4.resize((sz + 1) / 2);
1069  for(int i = 0; i < sz; i += 2) {
1070  na4[i/2] = (na8[i] << 4) + na8[i+1];
1071  }
1073  WriteDB_Ncbi4naToBinary(,
1074  (int) na4.size(),
1075  (int) si.GetLength(),
1076  m_Sequence,
1077  m_Ambig);
1078  }
1079  }
1080 }
1083 {
1084 }
1086 // The CPU should be kept at 190 degrees for 10 minutes.
1088 {
1089  // We need sequence, ambiguity, and binary deflines. If any of
1090  // these is missing, it is created from other data if possible.
1092  // For now I am disabling binary headers, because in normal usage
1093  // I would expect to see sequences from ID1 or similar, and the
1094  // non-binary case is slightly more complex.
1096  x_CookHeader();
1097  x_CookSequence();
1098  x_CookColumns();
1100  if (m_Protein && m_MaskedLetters.size()) {
1101  x_MaskSequence();
1102  }
1103 }
1106 {
1107  return m_HaveSequence;
1108 }
1111 {
1113  m_HaveSequence = true;
1114 }
1117 {
1119  m_HaveSequence = false;
1120 }
1123 {
1124  // This test should fail only on the first call, or if an
1125  // exception was thrown.
1127  if (x_HaveSequence()) {
1128  _ASSERT(! (m_Bioseq.Empty() && m_Sequence.empty()));
1131  } else {
1132  return;
1133  }
1136  if(m_DbVersion == eBDB_Version5 && m_Lmdbdb.Empty()) {
1137  const string lmdb_fname_w_path = BuildLMDBFileName(m_Dbname, m_Protein);
1138  Uint8 map_size = 0;
1139  char* map_sz_str = getenv("BLASTDB_LMDB_MAP_SIZE");
1140  if (map_sz_str) {
1141  map_size = NStr::StringToUInt8(map_sz_str);
1142  }
1143  if(map_size > 0){
1144  m_Lmdbdb.Reset(new CWriteDB_LMDB(lmdb_fname_w_path, map_size));
1147  map_size));
1148  }
1149  else {
1150  m_Lmdbdb.Reset(new CWriteDB_LMDB(lmdb_fname_w_path));
1153  }
1154  }
1156  x_CookData();
1158  bool done = false;
1160  if (! m_Volume.Empty()) {
1162  m_Ambig,
1163  m_BinHdr,
1164  m_Ids,
1165  m_Pig,
1166  m_Hash,
1167  m_Blobs,
1169  if (done && (m_DbVersion == eBDB_Version5) && m_Lmdbdb) {
1170  if (m_ParseIDs) {
1172  }
1174  m_LmdbOid++;
1175  }
1176  }
1178  if (! done) {
1179  int index = (int) m_VolumeList.size();
1181  if (m_Volume.NotEmpty()) {
1182  m_Volume->Close();
1183  }
1185  {
1188  m_Protein,
1189  m_Title,
1190  m_Date,
1191  index,
1192  m_MaxFileSize,
1194  m_Indices,
1195  m_DbVersion,
1196  m_OidMasks));
1198  m_VolumeList.push_back(m_Volume);
1200 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
1201  (!defined(NCBI_COMPILER_MIPSPRO)) )
1202  _ASSERT(m_Blobs.size() == m_ColumnTitles.size() * 2);
1203  _ASSERT(m_Blobs.size() == m_ColumnMetas.size() * 2);
1204  _ASSERT(m_Blobs.size() == m_HaveBlob.size() * 2);
1206  for(size_t i = 0; i < m_ColumnTitles.size(); i++) {
1208  m_ColumnMetas[i],
1209  m_MaxFileSize);
1210  }
1211 #endif
1212  }
1214  // need to reset OID, hence recalculate the header and id
1215  x_CookHeader();
1218  m_Ambig,
1219  m_BinHdr,
1220  m_Ids,
1221  m_Pig,
1222  m_Hash,
1223  m_Blobs,
1226  if (done && (m_DbVersion == eBDB_Version5) && m_Lmdbdb) {
1227  if (m_ParseIDs){
1229  }
1231  m_LmdbOid++;
1232  }
1234  if (! done) {
1236  eArgErr,
1237  "Cannot write sequence to volume.");
1238  }
1239  }
1240 }
1243 {
1245  bdls(const_cast<CBlast_def_line_set*>(& deflines));
1247  s_CheckEmptyLists(bdls, true);
1248  m_Deflines = bdls;
1249 }
/// Return the larger of the absolute values of @a a and @a b.
///
/// Note: as with any plain negation of an int, passing the most negative
/// representable value overflows.
inline int s_AbsMax(int a, int b)
{
    const int abs_a = (a >= 0) ? a : -a;
    const int abs_b = (b >= 0) ? b : -b;

    return (abs_a < abs_b) ? abs_b : abs_a;
}
1257 // Filtering data format on disk:
1258 //
1259 // Size of integer type for this blob (1, 2, or 4) (4 bytes).
1260 //
1261 // Array of filtering types:
1262 // Filter-type (enumeration)
1263 // Array of offsets:
1264 // Start Offset
1265 // End Offset
1266 //
1267 // The isize is one of 1, 2, or 4, written in the first byte, and
1268 // followed by 0, 1, or 3 NUL bytes to align the data offset to a
1269 // multiple of `isize'.
1270 //
1271 // All other integer values in this array use isize bytes, including
1272 // array counts and the `type' enumerations. After all the offsets are
1273 // written, the blob is aligned to a multiple of 4 using the `eSimple'
1274 // method.
1275 //
1276 // Each array is an element count followed by that many elements.
1278 #if 0
1280 // I think this is a better approach; but it needs more testing,
1281 // particularly with regard to platform portability.
1283 struct SWriteInt1 {
1284  static void WriteInt(CBlastDbBlob & blob, int value)
1285  {
1286  blob.WriteInt1(value);
1287  }
1288 };
1290 struct SWriteInt2 {
1291  static void WriteInt(CBlastDbBlob & blob, int value)
1292  {
1293  blob.WriteInt2(value);
1294  }
1295 };
1297 struct SWriteInt4 {
1298  static void WriteInt(CBlastDbBlob & blob, int value)
1299  {
1300  blob.WriteInt4(value);
1301  }
1302 };
1304 template<class TWriteSize, class TRanges>
1305 void s_WriteRanges(CBlastDbBlob & blob,
1306  int count,
1307  const TRanges & ranges)
1308 {
1309  typedef vector< pair<TSeqPos, TSeqPos> > TPairVector;
1311  Int4 num_written = 0;
1312  TWriteSize::WriteInt(blob, count);
1314  for ( typename TRanges::const_iterator r1 = (ranges).begin(),
1315  r1_end = (ranges).end();
1316  r1 != r1_end;
1317  ++r1 ) {
1319  if (r1->offsets.size()) {
1320  num_written ++;
1321  TWriteSize::WriteInt(blob, r1->algorithm_id);
1322  TWriteSize::WriteInt(blob, r1->offsets.size());
1324  ITERATE(TPairVector, r2, r1->offsets) {
1325  TWriteSize::WriteInt(blob, r2->first);
1326  TWriteSize::WriteInt(blob, r2->second);
1327  }
1328  }
1329  }
1331  _ASSERT(num_written == count);
1332 }
1334 #endif
1336 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
1337  (!defined(NCBI_COMPILER_MIPSPRO)) )
1340  const vector <TGi> & gis)
1341 {
1342  // No GI is found for the sequence
1343  // TODO should we generate a warning?
1344  if (m_UseGiMask && !gis.size()) {
1345  return;
1346  }
1348  TSeqPos seq_length = x_ComputeSeqLength();
1350  // Check validity of data and determine maximum integer value
1351  // stored here before writing anything. The best numeric_size
1352  // will be selected; this numeric size is applied uniformly to all
1353  // integers in this blob (except for the first one, which is the
1354  // integer size itself, and which is always a single byte.)
1356  typedef vector< pair<TSeqPos, TSeqPos> > TPairVector;
1358  int range_list_count = 0;
1359  int offset_pairs_count = 0;
1362  ITERATE(CMaskedRangesVector, r1, ranges) {
1363  if (r1->empty()) {
1364  continue;
1365  }
1367  range_list_count ++;
1368  offset_pairs_count += r1->offsets.size();
1370  if ( !m_MaskAlgoRegistry.IsRegistered(r1->algorithm_id) ) {
1371  string msg("Error: Algorithm IDs must be registered before use.");
1372  msg += " Unknown algorithm ID = " +
1373  NStr::IntToString((int)r1->algorithm_id);
1374  NCBI_THROW(CWriteDBException, eArgErr, msg);
1375  }
1378  ITERATE(TPairVector, r2, r1->offsets) {
1379  if ((r2->first > r2->second) ||
1380  (r2->second > seq_length)) {
1383  eArgErr,
1384  "Error: Masked data offsets out of bounds.");
1385  }
1386  }
1387  }
1390  // Gi-based masks
1391  if (m_UseGiMask) {
1392  ITERATE(CMaskedRangesVector, r1, ranges) {
1393  if (r1->offsets.size()) {
1394  m_GiMasks[m_MaskAlgoMap[r1->algorithm_id]]
1395  ->AddGiMask(gis, r1->offsets);
1396  }
1397  }
1398  return;
1399  }
1401  // OID-based masks
1402  const int col_id = x_GetMaskDataColumnId();
1403  CBlastDbBlob & blob = SetBlobData(col_id);
1404  blob.Clear();
1405  blob.WriteInt4(range_list_count);
1407  CBlastDbBlob & blob2 = SetBlobData(col_id);
1408  blob2.Clear();
1409  blob2.WriteInt4(range_list_count);
1411  ITERATE(CMaskedRangesVector, r1, ranges) {
1412  if (r1->offsets.size()) {
1413  blob.WriteInt4(r1->algorithm_id);
1414  blob.WriteInt4(r1->offsets.size());
1415  blob2.WriteInt4(r1->algorithm_id);
1416  blob2.WriteInt4(r1->offsets.size());
1418  ITERATE(TPairVector, r2, r1->offsets) {
1419  blob.WriteInt4(r2->first);
1420  blob.WriteInt4(r2->second);
1421  blob2.WriteInt4_LE(r2->first);
1422  blob2.WriteInt4_LE(r2->second);
1423  }
1424  }
1425  }
1429 }
1431 static const string s_EscapeColon(const string &in) {
1432  const char l = 0x1;
1433  return NStr::Replace(in, ":", string(l,1));
1434 }
1438  const string & options,
1439  const string & name)
1440 {
1441  int algorithm_id = m_MaskAlgoRegistry.Add(program, options, name);
1443  string key = NStr::IntToString(algorithm_id);
1444  string value;
1446  value = NStr::IntToString((int)program) + ":" +
1447  s_EscapeColon(options) + ":" +
1448  s_EscapeColon(name) + ":";
1449  } else {
1450  value = NStr::IntToString((int)program) + ":" + s_EscapeColon(options);
1451  }
1453  if (m_UseGiMask) {
1454  m_MaskAlgoMap[algorithm_id] = m_GiMasks.size();
1456  (new CWriteDB_GiMask(name, value, m_MaxFileSize)));
1457  } else {
1459  }
1461  return algorithm_id;
1462 }
1465 RegisterMaskAlgorithm(const string &id,
1466  const string &description,
1467  const string &options)
1468 {
1469  int algorithm_id = m_MaskAlgoRegistry.Add(id);
1471  string key = NStr::IntToString(algorithm_id);
1472  string value = "100:" +
1473  s_EscapeColon(options) + ":" +
1474  s_EscapeColon(id) + ":" +
1475  s_EscapeColon(description);
1479  return algorithm_id;
1480 }
1482 int CWriteDB_Impl::FindColumn(const string & title) const
1483 {
1484  for(int i = 0; i < (int) m_ColumnTitles.size(); i++) {
1485  if (title == m_ColumnTitles[i]) {
1486  return i;
1487  }
1488  }
1490  return -1;
1491 }
1493 int CWriteDB_Impl::CreateColumn(const string & title, bool mbo)
1494 {
1495  _ASSERT(FindColumn(title) == -1);
1497  size_t col_id = m_Blobs.size() / 2;
1499  _ASSERT(m_HaveBlob.size() == col_id);
1500  _ASSERT(m_ColumnTitles.size() == col_id);
1501  _ASSERT(m_ColumnMetas.size() == col_id);
1503  CRef<CBlastDbBlob> new_blob(new CBlastDbBlob);
1504  CRef<CBlastDbBlob> new_blob2(new CBlastDbBlob);
1506  m_Blobs .push_back(new_blob);
1507  m_Blobs .push_back(new_blob2);
1508  m_HaveBlob .push_back(0);
1509  m_ColumnTitles.push_back(title);
1510  m_ColumnMetas .push_back(TColumnMeta());
1512  if (m_Volume.NotEmpty()) {
1513  size_t id2 = m_Volume->CreateColumn(title, m_ColumnMetas.back(), mbo);
1514  _ASSERT(id2 == col_id);
1515  (void)id2; // get rid of compiler warning
1516  }
1518  return col_id;
1519 }
1522  const string & key,
1523  const string & value)
1524 {
      // Reject ids that do not refer to an existing column; throws
      // CWriteDBException (eArgErr).
1525  if ((col_id < 0) || (col_id >= (int) m_ColumnMetas.size())) {
1526  NCBI_THROW(CWriteDBException, eArgErr,
1527  "Error: provided column ID is not valid");
1528  }
      // Record the pair locally (an existing key is overwritten) ...
1530  m_ColumnMetas[col_id][key] = value;
      // ... and forward it to the currently open volume, if there is one.
1532  if (m_Volume.NotEmpty()) {
1533  m_Volume->AddColumnMetaData(col_id, key, value);
1534  }
1535 }
1538 {
      // m_Blobs holds two entries per column (indices col_id*2 and
      // col_id*2 + 1), hence the "* 2" in the range check.  Throws
      // CWriteDBException (eArgErr) for an unknown column id.
1539  if ((col_id < 0) || (col_id * 2 >= (int) m_Blobs.size())) {
1540  NCBI_THROW(CWriteDBException, eArgErr,
1541  "Error: provided column ID is not valid");
1542  }
      // m_HaveBlob[col_id] counts how many blobs of this column have
      // already been handed out; a third request throws.
1544  if (m_HaveBlob[col_id] > 1) {
1545  NCBI_THROW(CWriteDBException, eArgErr,
1546  "Error: Already have blob for this sequence and column");
1547  }
1549  ++m_HaveBlob[col_id];
1551  // Blobs are reused to reduce buffer reallocation; a missing blob
1552  // means the corresponding column does not exist.
      // After the increment the counter is 1 or 2, so the first call
      // returns the column's first blob and the second call its second.
1554  return *m_Blobs[col_id * 2 + m_HaveBlob[col_id] - 1];
1555 }
1556 #endif
1559 {
      // Store the supplied value in m_Pig; no validation is performed.
1560  m_Pig = pig;
1561 }
1564 {
      // Store the supplied limit in m_MaxFileSize; no validation is
      // performed.
1565  m_MaxFileSize = sz;
1566 }
1569 {
      // Store the supplied limit in m_MaxVolumeLetters; no validation is
      // performed.
1570  m_MaxVolumeLetters = sz;
1571 }
1575  bool long_seqids,
1576  bool scan_bioseq_4_cfastareader_usrobj)
1577 {
1578  // Get information
      // Scratch arguments required by x_ExtractDeflines(); of its
      // results only `deflines` is used below.  The literal arguments
      // 0 and -1 are fixed values chosen for this call.
      // NOTE(review): v2 is passed for two consecutive arguments and v1
      // is never used.  Both vectors are empty, so this is presumably
      // harmless - confirm whether v1 was intended for one of them.
1581  string binary_header;
1582  vector< vector<int> > v1, v2;
1583  set<TTaxId> t;
      // Defline limiting is always disabled for this extraction.
1584  const bool kLimitDefline = false;
1586  CConstRef<CBioseq> bsref(& bs);
1587  x_ExtractDeflines(bsref, deflines, binary_header, v2, v2, 0, t, -1, parse_ids,
1588  long_seqids, kLimitDefline, scan_bioseq_4_cfastareader_usrobj);
1590  // Convert to return type
      // The const_cast strips constness from the extracted defline set
      // so it can be returned through the non-const handle `bdls`.
1593  bdls.Reset(const_cast<CBlast_def_line_set*>(&*deflines));
1595  return bdls;
1596 }
/// Set the letters that are to be masked in protein sequences.
///
/// Stores the letters in m_MaskedLetters and builds m_MaskLookup, a
/// 256-entry table indexed by converted residue byte (1 = masked).
/// An empty string releases the lookup table.  On first use the
/// replacement character 'X' is converted and cached in m_MaskByte.
///
/// @param masked  Letters to mask; empty clears the table. [in]
/// @throw Raised via NCBI_THROW with eArgErr if this database is not
///        protein.
1598 void CWriteDB_Impl::SetMaskedLetters(const string & masked)
1599 {
1600  // Only supported for protein.
1602  if (! m_Protein) {
1604  eArgErr,
1605  "Error: Nucleotide masking not supported.");
1606  }
1608  m_MaskedLetters = masked;
1610  if (masked.empty()) {
      // Swapping with an empty vector drops the table's storage too.
1611  vector<char> none;
1612  m_MaskLookup.swap(none);
1613  return;
1614  }
1616  // Convert set of masked letters to stdaa, use the result to build
1617  // a lookup table.
1619  string mask_bytes;
1622  0,
1623  (int) m_MaskedLetters.size(),
1624  mask_bytes,
1627  _ASSERT(mask_bytes.size() == m_MaskedLetters.size());
1629  // Build a table of character-to-bool.
1630  // (Bool is represented by char 0 and 1.)
      // NOTE(review): resize() only initialises newly added elements.
      // If the table already has 256 entries from an earlier call with
      // a different non-empty letter set, the old entries stay set -
      // confirm whether the table should be reset first.
1632  m_MaskLookup.resize(256, (char)0);
1634  for (unsigned i = 0; i < mask_bytes.size(); i++) {
      // Mask to 0..255 so a negative char cannot index out of range.
1635  int ch = ((int) mask_bytes[i]) & 0xFF;
1636  m_MaskLookup[ch] = (char)1;
1637  }
1639  // Convert the masking character - always 'X' - to stdaa.
      // Done once only; later calls reuse the cached m_MaskByte.
1641  if (m_MaskByte.empty()) {
1642  string mask_byte = "X";
1644  CSeqConvert::Convert(mask_byte,
1646  0,
1647  1,
1648  m_MaskByte,
1651  _ASSERT(m_MaskByte.size() == 1);
1652  }
1653 }
1655 void CWriteDB_Impl::ListVolumes(vector<string> & vols)
1656 {
1657  vols.clear();
1659  ITERATE(vector< CRef<CWriteDB_Volume> >, iter, m_VolumeList) {
1660  vols.push_back((**iter).GetVolumeName());
1661  }
1662 }
1664 void CWriteDB_Impl::ListFiles(vector<string> & files)
1665 {
1666  files.clear();
1668  ITERATE(vector< CRef<CWriteDB_Volume> >, iter, m_VolumeList) {
1669  (**iter).ListFiles(files);
1670  }
1672  if (m_VolumeList.size() > 1) {
1673  files.push_back(x_MakeAliasName());
1674  }
1675  if (m_DbVersion == eBDB_Version5) {
1676  files.push_back(BuildLMDBFileName(m_Dbname, m_Protein));
1677  }
1678 }
1680 /// Compute the hash of a (raw) sequence.
1681 ///
1682 /// The hash of the provided sequence will be computed and assigned to
1683 /// the m_Hash field. For protein, the sequence is in the Ncbistdaa
1684 /// format. For nucleotide, the sequence and optional ambiguities are
1685 /// in 'raw' format, meaning they are packed just as sequences are
1686 /// packed in nsq files.
1687 ///
1688 /// @param sequence The sequence data. [in]
1689 /// @param ambig Nucleotide ambiguities are provided here. [in]
1691  const CTempString & ambig)
1692 {
1693  if (m_Protein) {
      // Protein: the hash is taken over sequence.size() bytes.
1694  m_Hash = SeqDB_SequenceHash(, sequence.size());
1695  } else {
      // Nucleotide: combine the packed sequence and its ambiguity
      // data into na8 first, then hash na8.size() bytes of that.
1696  string na8;
1697  SeqDB_UnpackAmbiguities(sequence, ambig, na8);
1698  m_Hash = SeqDB_SequenceHash(, na8.size());
1699  }
1700 }
1702 /// Compute the hash of a (Bioseq) sequence.
1703 ///
1704 /// The hash of the provided sequence will be computed and
1705 /// assigned to the m_Hash member. The sequence is packed as a
1706 /// CBioseq.
1707 ///
1708 /// @param sequence The sequence as a CBioseq. [in]
1710 {
      // The whole sequence object is passed to SeqDB_SequenceHash();
      // no unpacking is done here.
1711  m_Hash = SeqDB_SequenceHash(sequence);
1712 }
1714 #define TAB_REPLACEMENT " "
1719 x_GetFastaReaderDeflines(const CBioseq & bioseq,
1720  CConstRef<CBlast_def_line_set> & deflines,
1721  const vector< vector<int> > & membits,
1722  const vector< vector<int> > & linkout,
1723  int pig,
1724  bool accept_gt,
1725  bool parse_ids,
1726  bool long_seqids,
1727  bool scan_bioseq_4_cfastareader_usrobj)
1728 {
1729  if (! bioseq.CanGetDescr()) {
1730  return;
1731  }
1733  string fasta;
1735  // Scan the CBioseq for the CFastaReader user object.
1737  if (scan_bioseq_4_cfastareader_usrobj) {
1738  ITERATE(list< CRef< CSeqdesc > >, iter, bioseq.GetDescr().Get()) {
1739  const CSeqdesc & desc = **iter;
1741  if (desc.IsUser() &&
1742  desc.GetUser().CanGetType() &&
1743  desc.GetUser().GetType().IsStr() &&
1744  desc.GetUser().GetType().GetStr() == "CFastaReader" &&
1745  desc.GetUser().CanGetData()) {
1747  const vector< CRef< CUser_field > > & D = desc.GetUser().GetData();
1749  ITERATE(vector< CRef< CUser_field > >, iter, D) {
1750  const CUser_field & f = **iter;
1752  if (f.CanGetLabel() &&
1753  f.GetLabel().IsStr() &&
1754  f.GetLabel().GetStr() == "DefLine" &&
1755  f.CanGetData() &&
1756  f.GetData().IsStr()) {
1757  fasta = NStr::Replace(f.GetData().GetStr(), "\\t", TAB_REPLACEMENT);
1758  fasta = NStr::ParseEscapes(fasta);
1759  break;
1760  }
1761  }
1762  }
1763  }
1765  }
1766  if (fasta.empty())
1767  return;
1769  // The bioseq has a field contianing the ids for the first
1770  // defline. The title string contains the title for the first
1771  // defline, plus all the other defline titles and ids. This code
1772  // unpacks them and builds a normal blast defline set.
1774  unsigned mship_i(0), links_i(0);
1775  bool used_pig(false);
1777  // Build the deflines.
1780  CRef<CBlast_def_line> defline;
1782  if (!parse_ids) {
1784  // Generate an BL_ORD_ID in case no parse is needed
1785  CRef<CSeq_id> gnl_id(new CSeq_id());
1786  gnl_id->SetGeneral().SetDb("BL_ORD_ID");
1787  gnl_id->SetGeneral().SetTag().SetId(0); // will be filled later
1789  // Build the local defline.
1790  defline.Reset(new CBlast_def_line);
1791  defline->SetSeqid().push_back(gnl_id);
1793  string title(fasta, 1, fasta.size());
1794  // Replace ^A with space
1795  NStr::ReplaceInPlace(title, "\001", " ");
1796  // Replace tabs with three spaces
1797  NStr::ReplaceInPlace(title, "\t", TAB_REPLACEMENT);
1798  defline->SetTitle(title);
1800  if (mship_i < membits.size()) {
1801  const vector<int> & V = membits[mship_i++];
1802  defline->SetMemberships().assign(V.begin(), V.end());
1803  }
1805  if (links_i < linkout.size()) {
1806  const vector<int> & V = linkout[mship_i++];
1807  defline->SetLinks().assign(V.begin(), V.end());
1808  }
1810  if ((! used_pig) && pig) {
1811  defline->SetOther_info().push_back(pig);
1812  used_pig = true;
1813  }
1815  bdls->Set().push_back(defline);
1817  } else {
1819  int skip = 1;
1820  while(fasta.size()) {
1821  size_t id_start = skip;
1822  size_t pos_title = fasta.find(" ", skip);
1823  size_t pos_next = fasta.find("\001", skip);
1824  skip = 1;
1826  if (pos_next == fasta.npos) {
1827  if (accept_gt) {
1828  pos_next = fasta.find(" >");
1829  skip = 2;
1830  }
1831  } else {
1832  // If there is a ^A, turn off GT checking.
1833  accept_gt = false;
1834  }
1836  if (pos_next == fasta.npos) {
1837  pos_next = fasta.size();
1838  skip = 0;
1839  }
1841  if (pos_title == fasta.npos || pos_title >= pos_next) {
1842  // title field is missing
1843  pos_title = pos_next;
1844  }
1846  string ids(fasta, id_start, pos_title - id_start);
1847  if (pos_title == pos_next) pos_title--;
1848  string title(fasta, pos_title + 1, pos_next-pos_title - 1);
1849  string remaining(fasta, pos_next, fasta.size() - pos_next);
1850  fasta.swap(remaining);
1852  // Parse ids. They may or may not be bar-separated.
1853  list< CRef<CSeq_id> > seqids;
1854  if (ids.find('|') != NPOS){
1855  CSeq_id::ParseFastaIds(seqids, ids);
1856  }
1857  else {
1859  if ((id->Which() == CSeq_id::e_Prf) ||
1860  (id->Which() == CSeq_id::e_Pir)){
1861  string label = id->GetSeqIdString(true);
1862  id.Reset(new CSeq_id(CSeq_id::e_Local, label));
1863  }
1865  CSeq_id::EAccessionInfo info = id->IdentifyAccession();
1867  (bioseq.IsAa() == !!(info & CSeq_id::fAcc_nuc))) {
1868  string label = id->GetSeqIdString(true);
1869  id.Reset(new CSeq_id(CSeq_id::e_Local, label));
1870  }
1872  seqids.push_back(id);
1873  }
1875  // Build the actual defline.
1877  defline.Reset(new CBlast_def_line);
1878  defline->SetSeqid().swap(seqids);
1879  defline->SetTitle(title);
1881  if (mship_i < membits.size()) {
1882  const vector<int> & V = membits[mship_i++];
1883  defline->SetMemberships().assign(V.begin(), V.end());
1884  }
1886  if (links_i < linkout.size()) {
1887  const vector<int> & V = linkout[mship_i++];
1888  defline->SetLinks().assign(V.begin(), V.end());
1889  }
1891  if ((! used_pig) && pig) {
1892  defline->SetOther_info().push_back(pig);
1893  used_pig = true;
1894  }
1896  bdls->Set().push_back(defline);
1897  }
1898  }
1899  s_CheckEmptyLists(bdls, true);
1900  deflines = bdls;
1901 }
1903 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
1904  (!defined(NCBI_COMPILER_MIPSPRO)) )
1906 {
      // Create the "BlastDb/MaskData" column the first time it is
      // requested (-1 means not created yet) and cache its id in
      // m_MaskDataColumn; later calls return the cached id.
1907  if (m_MaskDataColumn == -1) {
1908  m_MaskDataColumn = CreateColumn("BlastDb/MaskData", true);
1909  }
1910  return m_MaskDataColumn;
1911 }
1912 #endif
