NCBI C++ ToolKit
build_db.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: build_db.cpp 101152 2023-11-07 15:39:13Z camacho $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Kevin Bealer
27 *
28 */
29 
30 /** @file build_db.cpp
31  Code to build a database given various sources of sequence data.
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbienv.hpp>
35 
36 // Blast databases
37 
41 
42 // Object Manager
43 
45 #include <objmgr/scope.hpp>
46 #include <objmgr/seq_vector.hpp>
47 #include <objtools/readers/reader_exception.hpp> // for CObjReaderParseException
48 
49 // Other utilities
50 
52 
53 // Local
54 
57 
58 #ifndef SKIP_DOXYGEN_PROCESSING
61 #endif
62 
/// File-wide diagnostic level; the code in this file emits its extra
/// trace logging only when this value is greater than 5.
int debug_mode(0);
64 
66 {
67  CScope::TIds ids = x_GetScope().GetIds(*seqid);
68 
69  bool have_seqid = false;
70  bool have_gi = false;
71 
72  gi = ZERO_GI;
73 
74  ITERATE(CScope::TIds, iter, ids) {
75  CConstRef<CSeq_id> id = iter->GetSeqId();
76  if (debug_mode > 5)
77  m_LogFile << "Seq-id " << seqid->AsFastaString()
78  << " contains id " << id->AsFastaString() << endl;
79 
80  if (id->IsGi()) {
81  if (gi > ZERO_GI) {
82  if (debug_mode > 5)
83  m_LogFile << "WARNING: multiple GIs discovered; gi[0] = "
84  << gi << endl;
85  } else {
86  if (debug_mode > 5)
87  m_LogFile << "Seq-id " << seqid->AsFastaString()
88  << " resolved to "
89  << id->GetGi() << endl;
90  gi = id->GetGi();
91  have_gi = true;
92  }
93  } else if ((! have_seqid) && (id->Which() == seqid->Which())) {
94  m_LogFile << "Remote: Resolving <" << seqid->AsFastaString()
95  << "> to <" << id->AsFastaString() << ">" << endl;
96 
97  if (id->GetTextseq_Id() == NULL ||
98  id->GetTextseq_Id()->IsSetVersion() == false) {
99 
100  m_LogFile
101  << "Warning: Resolution still does not provide version."
102  << endl;
103  } else {
104  seqid.Reset(const_cast<CSeq_id*>(id.GetPointer()));
105  have_seqid = true;
106  }
107  }
108 
109  if (have_gi)
110  break;
111  }
112 }
113 
114 // Resolve all ids to GIs, storing them in a GI list.
115 
117 {
118  CRef<CInputGiList> gi_list(new CInputGiList);
119 
120  ITERATE(vector<string>, id, ids) {
121  // There are three possibilities:
122  //
123  // 1. Numbers are added to the list as GIs.
124  // 2. Remote services may be called to determine the most
125  // recent version.
126  // 3. Non-numerical types are added to the list as Seq-ids.
127  //
128  // For #2, the remote service call is only made if:
129  //
130  // A. Remote services are enabled.
131  // B. The Seq-id can have a version (only CTextseq_id types.)
132  // C. The version is not present.
133 
134  TGi gi(ZERO_GI);
135  bool specific = false;
136  CRef<CSeq_id> seqid;
137 
138  bool worked = CheckAccession(*id, gi, seqid, specific);
139 
140  // If a source database is specified, try that as a backup
141  // resolution mechanism.
142 
143  if (! worked) {
144  if (m_SourceDb.NotEmpty()) {
145  worked = x_ResolveFromSource(*id, seqid);
146  }
147  }
148 
149  if (! worked) {
150  m_LogFile << "Did not recognize id: \"" << *id << "\"" << endl;
151  continue;
152  }
153 
154  // 1. Numeric GI
155 
156  if (gi != ZERO_GI) {
157  if (debug_mode > 5)
158  m_LogFile << "Found numerical GI:" << gi << endl;
159 
160  gi_list->AppendGi(gi);
161  continue;
162  }
163 
164  // 2. Possible remote resolution. We look for a GI and if
165  // that is not found, try to find a Seq-id of the same type
166  // (but with a version).
167 
168  if (m_UseRemote && (! specific)) {
169  x_ResolveRemoteId(seqid, gi);
170 
171  if (gi != ZERO_GI) {
172  gi_list->AppendGi(gi);
173  continue;
174  }
175  }
176 
177  // 3. Just add the Seq-id as a Seq-id.
178 
179  gi_list->AppendSi(*id);
180  }
181 
182  return gi_list;
183 }
184 
185 bool CBuildDatabase::x_ResolveFromSource(const string & acc,
187 {
188  if (m_SourceDb.Empty()) {
189  return false;
190  }
191 
192  vector<int> oids;
193  m_SourceDb->AccessionToOids(acc, oids);
194 
195  bool found(false), done(false);
196 
197  ITERATE(vector<int>, oid, oids) {
198  list< CRef<CSeq_id> > ids = m_SourceDb->GetSeqIDs(*oid);
199 
200  ITERATE(list< CRef<CSeq_id> >, seqid, ids) {
201  CRef<CSeq_id> s = *seqid;
202 
203  string S = s->AsFastaString();
204  size_t pos = S.find(acc);
205 
206  if (pos != string::npos) {
207  size_t endpos = pos + acc.size();
208 
209  bool start_okay = (pos == 0 || S[pos-1] == '|');
210  bool end_okay = ((endpos == S.size()) ||
211  (S[endpos] == '.' ||
212  S[endpos] == '|'));
213 
214  if (start_okay && end_okay) {
215  done = true;
216  }
217 
218  if (done || (! found)) {
219  found = true;
220  id = s;
221  }
222  }
223 
224  if (done)
225  break;
226  }
227 
228  if (done)
229  break;
230  }
231 
232  return found;
233 }
234 
236 {
237  TIdToBits bitset;
238 
239  // Get sequence, deflines, ambiguities, and sometimes pigs. The
240  // simplest route (for WriteDB) is raw data + asn deflines, so we
241  // use that when possible.
242 
244  int count = 0;
245 
246  for(int oid = 0; m_SourceDb->CheckOrFindOID(oid); oid++) {
247  // Raw data.
248 
249  const char * buffer (0);
250  int slength(0);
251  int alength(0);
252 
253  m_SourceDb->GetRawSeqAndAmbig(oid, & buffer, & slength, & alength);
254 
256 
257  CTempString sequence(buffer, slength);
258  CTempString ambig(buffer + slength, alength);
259 
260  // Deflines
261 
263  m_DeflineCount += headers->Get().size();
264  m_OIDCount ++;
265 
266  x_SetLinkAndMbit(headers);
267 
268  // Always include the taxid; although OPTIONAL, some programs
269  // expect it, since the C ASN.1 loaders always emit integers.
270 
271  m_Taxids->FixTaxId(headers);
272 
273  // Now, add the sequence to the WriteDB database.
274 
275  m_OutputDb->AddSequence(sequence, ambig);
276  m_OutputDb->SetDeflines(*headers);
277  count ++;
278  }
279 
280  if (count) {
281  double t = sw.Elapsed();
282 
283  m_LogFile << "Duplication from source DB; duplicated "
284  << count << " sequences in " << t << " seconds." << endl;
285  }
286 }
287 
288 // This could be moved to writedb once it is tested and working.
289 
291 {
292  if ((! bs->CanGetInst()) || bs->GetInst().CanGetSeq_data() ||
293  ! bs->GetInst().CanGetExt() || ! bs->GetInst().GetExt().IsDelta()) {
294  return bs;
295  }
296 
297  if (bs->GetInst().CanGetMol() &&
298  !CSeq_inst::IsNa(bs->GetInst().GetMol())) {
300  CNcbiOstrstream oss;
301  oss << id->AsFastaString() << ": Protein delta sequences are not supported.";
302  string msg = CNcbiOstrstreamToString(oss);
303  NCBI_THROW(CMultisourceException, eArg, msg);
304  }
305 
306  try {
307  const CDelta_ext & dext = bs->GetInst().GetExt().GetDelta();
308 
309  if(dext.Get().front()->Which() != CDelta_seq::e_Literal)
310  return bs;
311 
312  typedef list< CRef< CDelta_seq > > TItems;
313 
314  // Don't really want to use na4, because a half byte at the
315  // end of a string would require that string to be manually
316  // adjusted before appending.
317 
318  string seq8na;
319  if (bs->GetInst().CanGetLength()) {
320  seq8na.reserve(bs->GetInst().GetLength());
321  }
322 
323  string na8;
324 
325  ITERATE(TItems, item, dext.Get()) {
326  if(((**item).IsLoc()) && ((**item).GetLoc().IsNull())) {
327  seq8na.append(1, 0x0f);
328  continue;
329  }
330 
331  const CSeq_literal & L = (**item).GetLiteral();
332 
333  if (!L.CanGetSeq_data()) {
334  if (L.CanGetLength()){
335  seq8na.append(L.GetLength(), 0x0f);
336  continue;
337  } else {
339  "Part of the delta sequence, including its length, is un-available.");
340  }
341  }
342 
343  if (L.GetSeq_data().IsNcbi2na()) {
346  0,
347  L.GetLength(),
348  na8,
350  } else if (L.GetSeq_data().IsNcbi4na()) {
353  0,
354  L.GetLength(),
355  na8,
357  } else if (L.GetSeq_data().IsGap()) {
358  seq8na.append(L.GetLength(), 0x0f);
359  } else {
361  "Unhandled type of sequence data encountered.");
362  }
363 
364  seq8na += na8;
365  na8.resize(0);
366  }
367 
368  // Now convert back to 4na, since WriteDB does not yet handle
369  // 8na sequences.
370 
371  int length = seq8na.size();
372  vector<char> seq4na;
373  CSeqConvert::Convert(seq8na,
375  0,
376  length,
377  seq4na,
379 
380  // Copy the needed fields of the CBioseq (but remove the delta
381  // sequence) and add a Seq-data.
382 
383  CRef<CBioseq> bs2(new CBioseq);
384 
385  if (bs->IsSetId()) {
386  bs2->SetId() = bs->GetId();
387  }
388 
389  if (bs->IsSetDescr()) {
390  bs2->SetDescr(const_cast<CSeq_descr&>(bs->GetDescr()));
391  }
392 
393  CRef<CSeq_inst> inst(new CSeq_inst);
394 
395  inst->SetSeq_data().SetNcbi4na().Set().swap(seq4na);
396  inst->SetMol(CSeq_inst::eMol_na);
397  inst->SetLength(length);
399 
400  bs2->SetInst(*inst);
401 
402  if (bs->IsSetAnnot()) {
403  bs2->SetAnnot() = bs->GetAnnot();
404  }
405 
406  bs = bs2;
407  }
408  catch(CInvalidChoiceSelection &) {
410  "Bioseq must have Seq-data or "
411  "Delta containing only literals.");
412  }
413 
414  return bs;
415 }
416 
417 
419 {
420  int pig = 0;
421  const CBlast_def_line & defline = *(headers->Get().front());
422  if (defline.IsSetOther_info())
423  pig = defline.GetOther_info().front();
424 
425  m_OutputDb->SetPig(pig);
426 }
427 
429 {
430  // Always include the taxid; although OPTIONAL, some programs
431  // expect it, since the C ASN.1 loaders always emit integers.
432 
433  m_Taxids->FixTaxId(headers);
434 
435  /// Use case: for transition to GI-less BLASTDBs
436  if (m_SkipCopyingGis)
437  headers->RemoveGIs();
438 
439  // Edit the linkouts
440 
441  x_SetLinkAndMbit(headers);
442 }
443 
444 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
445  (!defined(NCBI_COMPILER_MIPSPRO)) )
446 void
448 {
449  if (m_MaskData.Empty()) {
450  return;
451  }
452 
453  const CMaskedRangesVector& rng = m_MaskData->GetRanges(ids);
454  if (rng.empty()) {
455  return;
456  }
457 
458  vector <TGi> gis;
459  ITERATE(list< CRef<CSeq_id> >, id, ids) {
460  if ((*id)->IsGi()) {
461  gis.push_back((*id)->GetGi());
462  }
463  }
464  m_OutputDb->SetMaskData(rng, gis);
465  m_FoundMatchingMasks = true;
466 }
467 #endif
468 
470  objects::CSeqVector * sv,
471  bool add_pig)
472 {
473  CRef<CBlast_def_line_set> headers =
476 
477  x_EditHeaders(headers);
478 
479  // Add the sequence
480  if (sv) {
481  m_OutputDb->AddSequence(*bs, *sv);
482  } else {
483  bs = s_FixBioseqDeltas(bs);
484  if(bs->GetInst().CanGetSeq_data())
485  m_OutputDb->AddSequence(*bs);
486  else
487  return false;
488  }
489 
490  m_DeflineCount += headers->Get().size();
491  m_OIDCount ++;
492 
493  if(add_pig) {
494  x_AddPig(headers);
495  }
496 
497  m_OutputDb->SetDeflines(*headers);
498 
499 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
500  (!defined(NCBI_COMPILER_MIPSPRO)) )
501  const list< CRef<CSeq_id> > & ids = bs->GetId();
502  x_AddMasksForSeqId(ids);
503 #endif
504  return true;
505 }
506 
507 void CBuildDatabase::x_AddOneRemoteSequence(const objects::CSeq_id & seqid,
508  bool & found_all,
509  bool & error)
510 {
511  // Get handle and bioseq
512 
514  CBioseq_Handle bsh;
515 
516  try {
517  bsh = x_GetScope().GetBioseqHandle(seqid);
518  bs = bsh.GetCompleteBioseq();
519 
520  if (debug_mode > 5) m_LogFile << MSerial_AsnText << *bs << endl;
521  }
522  catch (const CException & e) {
523  m_LogFile << "Caught exception for query: "
524  << seqid.AsFastaString() << endl
525  << e.what() << endl;
526  found_all = false;
527  error = true;
528  }
529 
531  error = true;
532  }
533 
534 
535 
536  CSeqVector sv(bsh);
537 
538  if(!x_EditAndAddBioseq(bs, & sv))
539  error = true;
540 
541  if (error) {
542  if (debug_mode > 5)
543  m_LogFile << "Could not find entry for: "
544  << seqid.AsFastaString() << endl;
545 
546  found_all = false;
547  return;
548  }
549 
550  if (debug_mode > 5)
551  m_LogFile << "-- REMOTE: Found sequence "
552  << seqid.AsFastaString() << endl;
553 }
554 
556 {
558  int count = 0;
559 
560  bool found_all = true;
561 
562  int num_gis = gi_list.GetNumGis();
563  int i = 0;
564 
565  for(i = 0; i < num_gis; i++) {
566  if (m_Verbose)
567  m_LogFile << "GI " << gi_list.GetKey<TGi>(i);
568 
569  // We only need to fetch here for those cases where the SeqDB
570  // attempt could not translate the GI.
571 
572  if (gi_list.GetGiOid(i).oid == -1) {
573  if (m_Verbose)
574  m_LogFile << " not found locally; adding remotely." << endl;
575 
576  CRef<CSeq_id> id(new CSeq_id);
577  id->SetGi(gi_list.GetKey<TGi>(i));
578 
579  bool error = false;
580 
581  x_AddOneRemoteSequence(*id, found_all, error);
582  count++;
583  } else {
584  if (m_Verbose)
585  m_LogFile << " found locally; not adding remotely." << endl;
586  }
587  }
588 
589  int num_seqids = gi_list.GetNumSis();
590 
591  for(i = 0; i < num_seqids; i++) {
592  if (m_Verbose)
593  m_LogFile << "Seq-id "
594  << gi_list.GetKey<string>(i);
595 
596  // We only need to fetch here for those cases where the SeqDB
597  // attempt could not translate the GI.
598 
599  if (gi_list.GetSiOid(i).oid == -1) {
600  if (m_Verbose)
601  m_LogFile << " not found locally; adding remotely." << endl;
602 
603  bool error = false;
604 
605  string acc = gi_list.GetKey<string>(i);
606  CRef<CSeq_id> id(new CSeq_id(acc));
607  x_AddOneRemoteSequence(*id, found_all, error);
608  count++;
609  } else {
610  if (m_Verbose)
611  m_LogFile << " found locally; not adding remotely." << endl;
612  }
613  }
614 
615  if (count) {
616  double t = sw.Elapsed();
617 
618  m_LogFile << "Adding sequences from remote source; added "
619  << count << " sequences in " << t << " seconds." << endl;
620  }
621 
622  return found_all;
623 }
624 
625 bool
627 {
628  bool success = true;
629 
630  int num_gis = gi_list.GetNumGis();
631 
632  int unresolved = 0;
633 
634  int i;
635  for(i = 0; i < num_gis; i++) {
636  // We only need to fetch here for those cases where the SeqDB
637  // attempt could not translate the GI.
638 
639  if (gi_list.GetGiOid(i).oid == -1) {
640  if (m_Verbose)
641  m_LogFile << "GI " << gi_list.GetKey<TGi>(i)
642  << " was not resolvable." << endl;
643 
644  success = false;
645  unresolved ++;
646  } else {
647  if (m_Verbose)
648  m_LogFile << "GI " << gi_list.GetKey<TGi>(i)
649  << " found locally." << endl;
650  }
651  }
652 
653  int num_seqids = gi_list.GetNumSis();
654 
655  for(i = 0; i < num_seqids; i++) {
656  // We only need to fetch here for those cases where the SeqDB
657  // attempt could not translate the GI.
658 
659  if (gi_list.GetSiOid(i).oid == -1) {
660  if (m_Verbose)
661  m_LogFile << "Seq-id "
662  << gi_list.GetKey<string>(i)
663  << " was not resolvable." << endl;
664 
665  unresolved ++;
666  success = false;
667  } else {
668  if (m_Verbose)
669  m_LogFile << "Seq-id "
670  << gi_list.GetKey<string>(i)
671  << " found locally." << endl;
672  }
673  }
674 
675  if (unresolved) {
676  m_LogFile << "Could not resolve " << unresolved << " IDs." << endl;
677  }
678 
679  success = false;
680  unresolved ++;
681 
682  return success;
683 }
684 
686 public:
687  CFastaBioseqSource(CNcbiIstream & fasta_file,
688  bool is_protein,
689  bool parse_ids,
690  bool long_ids);
691 
693 
694  virtual CConstRef<CBioseq> GetNext();
695 
696 private:
699 };
700 
702  bool is_protein,
703  bool parse_ids,
704  bool long_ids)
705  : m_FastaReader(NULL)
706 {
707  m_LineReader.Reset(new CBufferedLineReader(fasta_file));
708  typedef CFastaReader::EFlags TFlags;
709 
710  int iflags = CFastaReader::fForceType;
711 
712  if (is_protein) {
713  iflags |= CFastaReader::fAssumeProt;
714  } else {
715  iflags |= CFastaReader::fAssumeNuc;
716  iflags |= CFastaReader::fParseGaps;
717  }
718 
719  if (parse_ids) {
720  iflags |= CFastaReader::fRequireID;
721  // parse bare accessions
722  if (!long_ids) {
723  iflags |= CFastaReader::fParseRawID;
724  }
725  } else {
726  iflags |= CFastaReader::fNoParseID;
727  }
728 
729  iflags |= CFastaReader::fQuickIDCheck;
731 
732  TFlags flags = (TFlags) iflags;
733 
738 
740  if (app) {
741  const CNcbiRegistry& registry = app->GetConfig();
742  const string& value = registry.Get("BLAST", "MAX_SEQID_LENGTH");
743  if (!value.empty()) {
745  catch (const exception&) {} // Ignore errors
746  }
747  }
748 }
749 
751 {
752  delete m_FastaReader;
753 }
754 
756 {
758 
759  if (m_LineReader.NotEmpty() && ! m_LineReader->AtEOF()) {
760  CRef<CSeq_entry> entry;
761  try { entry = m_FastaReader->ReadOneSeq(); }
762  catch (const CObjReaderParseException& e) {
763  static const string kKeyword("m_Pos = ");
764  SIZE_TYPE start = NStr::Find(e.what(), kKeyword);
765  SIZE_TYPE end = NStr::Find(e.what(), ")", start);
766  string pos("unknown");
767  if (start != NPOS && end != NPOS) {
768  start += kKeyword.size();
769  pos = string(e.what()).substr(start, end-start);
770  }
771  string msg = e.GetMsg();
772  const string extra_string("CFastaReader: ");
773  if (NStr::StartsWith(msg, extra_string)) {
774  msg.erase(0, extra_string.size());
775  }
776  NCBI_THROW(CWriteDBException, eFileErr, msg);
777  }
778 
779  if (entry.NotEmpty()) {
780  _ASSERT(entry->IsSeq());
781  rv.Reset(& entry->GetSeq());
782  }
783  }
784 
785  // Any failure to read a Bioseq is considered an EOF.
786 
787  if (rv.Empty()) {
789  }
790 
791  return rv;
792 }
793 
795 {
796  bool found = false;
797 
799  int count = 0;
800 #ifdef NCBI_INT8_GI
801  CSeq_id::TGi max_gi32_val = CSeq_id::TGi(GI_CONST(0xFFFFFFFFU)) ;
802 #endif
803 
804  CConstRef<CBioseq> bs = src.GetNext();
805 
806  while(bs.NotEmpty()) {
807  string bioseq_id("Unknown");
808 
809  if (bs->CanGetId()) {
810  const list< CRef<CSeq_id> > & ids = bs->GetId();
811  CSeq_id::TGi check_gi ;
812  //BEGIN:SB-2994
813 #ifdef NCBI_INT8_GI
814  if ( m_SkipLargeGis && !ids.empty() && ids.front().NotEmpty()){
815  bool skip_this = false;
816  for(list< CRef<CSeq_id> >::const_iterator it = ids.begin(); it != ids.end(); it++ ){
817  if( it->NotEmpty() ){
818  CSeq_id::EAccessionInfo info = (*it)->IdentifyAccession();
819  if( info == CSeq_id::EAccessionInfo::eAcc_gi ){
820  check_gi = (*it)->GetGi();
821  if( check_gi > max_gi32_val )
822  {
823  skip_this = true;
824  }
825  }
826  }
827  }
828  if( skip_this ){
829  m_LogFile << "Ignoring gi '" << check_gi << "' as it has value larger then " << 0xFFFFFFFF<< endl;
830  bs = src.GetNext();
831  continue;
832  }
833  }
834 #endif
835  //END:SB-2994
836  if (! ids.empty() && ids.front().NotEmpty()) {
837  bioseq_id.assign(ids.front()->AsFastaString());
838  }
839 
840  if (!m_LongIDs) {
841 
842 // If the accession's molecule type is different than expected,
843 // change the sequence id to local. CFastaReader cannot distinguish
844 // between bare PIR protein ids and GenBank nucleotide ids.
845  CBioseq* bss = const_cast<CBioseq*>(bs.GetNonNullPointer());
846  for (auto& it: bss->SetId()) {
847  CSeq_id::EAccessionInfo info = it->IdentifyAccession();
848  if (!it->IsLocal() && !it->IsGi() &&
851 
852  string label = it->GetSeqIdString(true);
853  it.Reset(new CSeq_id(CSeq_id::e_Local, label));
854  }
855  }
856  }
857  }
858 
859  if(bs->IsAa() != m_IsProtein ){
860  bs = src.GetNext();
861  continue;
862  }
863 
864  if (bs->GetLength() > 0x7fffffff)
865  {
867  eArgErr,
868  "Sequences longer than 2,147,483,647 bases are not supported. Offending sequence is " + bioseq_id);
869  }
870 
871  if ((bs->GetLength() == 0) || (!x_EditAndAddBioseq(bs, NULL, add_pig))){
872  m_LogFile << "Ignoring sequence '" << bioseq_id
873  << "' as it has no sequence data" << endl;
874  bs = src.GetNext();
875  continue;
876  }
877 
878  if (m_Verbose) {
879  m_LogFile << "Adding bioseq from fasta; first id is: '" << bioseq_id
880  << "'" << endl;
881  }
882 
883  // No linkouts or memberships here (yet).
884 
885  found = true;
886 
887  count++;
888 
889  if (debug_mode > 5) m_LogFile << "-- FASTA: Found sequence." << endl;
890 
891  bs = src.GetNext();
892  }
893 
894  if (count) {
895  double t = sw.Elapsed();
896 
897  m_LogFile << "Adding sequences from FASTA; added "
898  << count << " sequences in " << t << " seconds." << endl;
899  }
900 
901  return found;
902 }
903 
905 {
907 
908  bool done = false;
909  bool rv = false;
910 
911 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
912  (!defined(NCBI_COMPILER_MIPSPRO)) )
913  // Get all column names.
914 
915  vector<string> all_names;
916  map<int, int> in2out;
917  int mask_id = -1;
918 
919  src.GetColumnNames(all_names);
920 
921  for(int i = 0; i < (int) all_names.size(); i++) {
922  string name = all_names[i];
923  int in_id = src.GetColumnId(name);
924 
925  // skip masking data column
926  if (name == "BlastDb/MaskData") {
927  mask_id = in_id;
928  continue;
929  }
930  int out_id = m_OutputDb->FindColumn(name);
931 
932  if (out_id < 0) {
933  out_id = m_OutputDb->CreateUserColumn(name);
934  }
935 
936  typedef map<string,string> StringPairMap;
937  const StringPairMap & meta = src.GetColumnMetaData(in_id);
938 
939  ITERATE(StringPairMap, iter, meta) {
940  m_OutputDb->AddColumnMetaData(out_id, iter->first, iter->second);
941  }
942 
943  in2out[in_id] = out_id;
944  }
945 #endif
946  // Copy all data.
947 
948  vector<CTempString> column_blobs;
949  vector<int> column_ids;
950 
951  int count = 0;
952 
953  while(! done) {
954  CTempString sequence, ambiguities;
955  CRef<CBlast_def_line_set> deflines;
956  CMaskedRangesVector mask_data;
957 
958  if (src.GetNext(sequence,
959  ambiguities,
960  deflines,
961  mask_data,
962  column_ids,
963  column_blobs)) {
964 
965  // Copy data
966 
967  _ASSERT(column_blobs.size() == column_ids.size());
968 
969  if (sequence.empty()) {
971  "Error in raw data: no sequence");
972  }
973 
974  if ((! ambiguities.empty()) && m_IsProtein) {
976  "Error in raw data: "
977  "protein db cannot with ambiguities");
978  }
979 
980  if (deflines.Empty()) {
982  "Error in raw data: no headers provided");
983  }
984 
985  x_EditHeaders(deflines);
986 
987  m_OutputDb->AddSequence(sequence, ambiguities);
988  x_AddPig(deflines);
989  m_OutputDb->SetDeflines(*deflines);
990 
991 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
992  (!defined(NCBI_COMPILER_MIPSPRO)) )
993  for(int i = 0; i < (int)column_ids.size(); i++) {
994  int in_id = column_ids[i];
995  if (in_id == mask_id) continue;
996 
997  if (column_blobs[i].size() == 0)
998  continue;
999 
1000  _ASSERT(in2out.find(in_id) != in2out.end());
1001 
1002  int out_id = in2out[in_id];
1003 
1004  CTempString blob_in = column_blobs[i];
1005  CBlastDbBlob & blob_out = m_OutputDb->SetBlobData(out_id);
1006 
1007  blob_out.Clear();
1008  blob_out.WriteRaw(& blob_in.data()[0], blob_in.size());
1009  }
1010  // Don't forget about the IMaskDataSource!
1011  vector <TGi> gis; // GIs associated with this sequence
1012  if (!mask_data.empty() || !m_MaskData.Empty()) {
1013  ITERATE(CBlast_def_line_set::Tdata, defline, deflines->Get()) {
1014  const list< CRef<CSeq_id> > & ids = (*defline)->GetSeqid();
1015  ITERATE(list< CRef<CSeq_id> >, id, ids) {
1016  if ((*id)->IsGi()) {
1017  gis.push_back((*id)->GetGi());
1018  }
1019  }
1020  if (!m_MaskData.Empty()) {
1021  const CMaskedRangesVector rng = m_MaskData->GetRanges(ids);
1022  if (!rng.empty()) {
1023  mask_data.insert(mask_data.end(), rng.begin(), rng.end());
1024  m_FoundMatchingMasks = true;
1025  }
1026  }
1027  }
1028  }
1029  if (!mask_data.empty()) {
1030  m_OutputDb->SetMaskData(mask_data, gis);
1031  }
1032 #endif
1033 
1034  rv = true;
1035  count ++;
1036  } else {
1037  done = true;
1038  }
1039  }
1040 
1041  if (count) {
1042  double t = sw.Elapsed();
1043 
1044  m_LogFile << "Adding sequences from raw db source; added "
1045  << count << " sequences in " << t << " seconds." << endl;
1046  }
1047 
1048  return rv;
1049 }
1050 
1052 {
1053  CDirEntry dir_entry(dbname);
1054  string dir_name = dir_entry.GetDir(CDirEntry::eIfEmptyPath_Empty);
1055  if (dir_name.empty()) {
1056  return;
1057  }
1058 
1059  CDir d(dir_name);
1060  if ( !d.Exists() ) {
1061  if ( !d.CreatePath() ) {
1062  string msg("Failed to create directory '" + d.GetName() + "'");
1063  NCBI_THROW(CMultisourceException, eOutputFileError, msg);
1064  }
1065  }
1066  if (!d.CheckAccess(CDirEntry::fWrite)) {
1067  string msg("You do not have write permissions on '" +
1068  d.GetName() + "'");
1069  NCBI_THROW(CMultisourceException, eOutputFileError, msg);
1070  }
1071 }
1072 
1074  const string & title,
1075  bool is_protein,
1076  CWriteDB::TIndexType indexing,
1077  bool use_gi_mask,
1078  ostream * logfile,
1079  bool long_seqids,
1080  EBlastDbVersion dbver,
1081  bool limit_defline,
1082  Uint8 oid_masks,
1083  bool scan_bioseq_4_cfastareader_usrobj)
1084  : m_IsProtein (is_protein),
1085  m_KeepLinks (false),
1086  m_KeepMbits (false),
1087  m_KeepLeafs (false),
1088  m_Taxids (new CTaxIdSet()),
1089  m_LogFile (*logfile),
1090  m_UseRemote (true),
1091  m_DeflineCount (0),
1092  m_OIDCount (0),
1093  m_Verbose (false),
1094  m_ParseIDs (((indexing & CWriteDB::eFullIndex) != 0 ? true : false)),
1095  m_LongIDs (long_seqids),
1096  m_FoundMatchingMasks(false),
1097  m_SkipCopyingGis(false),
1098  m_SkipLargeGis(true),
1099  m_OutputDbName(kEmptyStr),
1100  m_ScanBioseq4CFastaReaderUsrObjct(scan_bioseq_4_cfastareader_usrobj)
1101 {
1103  const string output_dbname = CDirEntry::CreateAbsolutePath(dbname);
1104  m_LogFile << "\n\nBuilding a new DB, current time: "
1105  << CTime(CTime::eCurrent).AsString() << endl;
1106 
1107  m_LogFile << "New DB name: " << output_dbname << endl;
1108  m_LogFile << "New DB title: " << title << endl;
1109  const string mol_type(is_protein ? "Protein" : "Nucleotide");
1110  m_LogFile << "Sequence type: " << mol_type << endl;
1111  if (DeleteBlastDb(output_dbname, ParseMoleculeTypeString(mol_type))) {
1112  m_LogFile << "Deleted existing " << mol_type
1113  << " BLAST database named " << output_dbname << endl;
1114  }
1115 
1116  CWriteDB::ESeqType seqtype =
1117  (is_protein ? CWriteDB::eProtein : CWriteDB::eNucleotide);
1118 
1119  m_OutputDb.Reset(new CWriteDB(output_dbname,
1120  seqtype,
1121  title,
1122  indexing,
1123  m_ParseIDs,
1124  m_LongIDs,
1125  use_gi_mask,
1126  dbver,
1127  limit_defline,
1128  oid_masks));
1129 
1130  // Standard 1 GB limit
1131 
1132  m_OutputDb->SetMaxFileSize(1000*1000*1000);
1133  m_OutputDbName = output_dbname;
1134 }
1135 
1137  const string & title,
1138  bool is_protein,
1139  bool sparse,
1140  bool parse_seqids,
1141  bool use_gi_mask,
1142  ostream * logfile,
1143  bool long_seqids,
1144  EBlastDbVersion dbver,
1145  bool limit_defline,
1146  Uint8 oid_masks,
1147  bool scan_bioseq_4_cfastareader_usrobj)
1148  : m_IsProtein (is_protein),
1149  m_KeepLinks (false),
1150  m_KeepMbits (false),
1151  m_KeepLeafs (false),
1152  m_Taxids (new CTaxIdSet()),
1153  m_LogFile (*logfile),
1154  m_UseRemote (true),
1155  m_DeflineCount (0),
1156  m_OIDCount (0),
1157  m_Verbose (false),
1158  m_ParseIDs (parse_seqids),
1159  m_LongIDs (long_seqids),
1160  m_FoundMatchingMasks(false),
1161  m_SkipCopyingGis(false),
1162  m_SkipLargeGis(true),
1163  m_ScanBioseq4CFastaReaderUsrObjct(scan_bioseq_4_cfastareader_usrobj)
1164 {
1166  const string output_dbname = CDirEntry::CreateAbsolutePath(dbname);
1167  m_LogFile << "\n\nBuilding a new DB, current time: "
1168  << CTime(CTime::eCurrent).AsString() << endl;
1169 
1170  m_LogFile << "New DB name: " << output_dbname << endl;
1171  m_LogFile << "New DB title: " << title << endl;
1172  const string mol_type(is_protein ? "Protein" : "Nucleotide");
1173  m_LogFile << "Sequence type: " << mol_type << endl;
1174  if (DeleteBlastDb(output_dbname, ParseMoleculeTypeString(mol_type))) {
1175  m_LogFile << "Deleted existing " << mol_type
1176  << " BLAST database named " << output_dbname << endl;
1177  }
1178 
1179  CWriteDB::ESeqType seqtype =
1180  (is_protein ? CWriteDB::eProtein : CWriteDB::eNucleotide);
1181 
1182  CWriteDB::EIndexType ix = (sparse
1184  : CWriteDB::eDefault);
1185 
1186  m_OutputDb.Reset(new CWriteDB(output_dbname,
1187  seqtype,
1188  title,
1189  ix,
1190  m_ParseIDs,
1191  long_seqids,
1192  use_gi_mask,
1193  dbver,
1194  limit_defline,
1195  oid_masks,
1196  scan_bioseq_4_cfastareader_usrobj));
1197 
1198  // Standard 1 GB limit
1199 
1200  m_OutputDb->SetMaxFileSize(1000*1000*1000);
1201  m_OutputDbName = output_dbname;
1202 }
1203 
1205 {
1207  ERR_POST(Error << "No sequences matched any of the masks provided.\n"
1208  << "Please ensure that the -parse_seqids option is used "
1209  << "in the\nfiltering program as well as makeblastdb.");
1210  }
1211  if (!m_Taxids->HasEverFixedId()) {
1212  ERR_POST(Error << "No sequences matched any of the taxids provided.");
1213  }
1214 }
1215 
1217 {
1218  m_Taxids.Reset(& taxids);
1219 }
1220 
1221 void CBuildDatabase::SetMaskLetters(const string & letters)
1222 {
1223  m_OutputDb->SetMaskedLetters(letters);
1224 }
1225 
1227 {
1228  if (m_Scope.Empty()) {
1229  if (m_ObjMgr.Empty()) {
1231  }
1232 
1233  m_Scope.Reset(new CScope(*m_ObjMgr));
1234 
1235  // Add default loaders (GB loader in this demo) to the scope.
1236  m_Scope->AddDefaults();
1237  }
1238 
1239  return *m_Scope;
1240 }
1241 
1243 {
1244  m_LogFile << "Configured source DB: " << seqdb->GetDBNameList() << endl;
1245  m_LogFile << "Source DB has title: " << seqdb->GetTitle() << endl;
1246  m_LogFile << "Source DB time stamp: " << seqdb->GetDate() << endl;
1247  m_SourceDb = seqdb;
1248 }
1249 
1250 void CBuildDatabase::SetSourceDb(const string & src_db_name)
1251 {
1252  _ASSERT(src_db_name.size());
1253  CRef<CSeqDBExpert> src_db(new CSeqDBExpert(src_db_name,
1254  m_IsProtein
1256  : CSeqDB::eNucleotide));
1257 
1258  SetSourceDb(src_db);
1259 }
1260 
1261 // NCBI_DEPRECATED
1263  bool keep_links)
1264 {
1265  m_LogFile << "Keep Linkouts: " << (keep_links ? "T" : "F") << endl;
1266  MapToLMBits(linkouts, m_Id2Links);
1267  m_KeepLinks = keep_links;
1268 }
1269 
1271  bool keep_mbits)
1272 {
1273  m_LogFile << "Keep MBits: " << (keep_mbits ? "T" : "F") << endl;
1274  MapToLMBits(membbits, m_Id2Mbits);
1275  m_KeepMbits = keep_mbits;
1276 }
1277 
1279  const TIdToLeafs& taxids,
1280  bool keep_taxids
1281 )
1282 {
1283  m_LogFile << "Keep Leaf Taxids: " << (keep_taxids ? "T" : "F") << endl;
1284  m_Id2Leafs = taxids;
1285  m_KeepLeafs = keep_taxids;
1286 }
1287 
1288 bool
1289 CBuildDatabase::Build(const vector<string> & ids,
1290  CNcbiIstream * fasta_file)
1291 {
1293 
1294  StartBuild();
1295 
1296  bool success = AddIds(ids);
1297 
1298  if (success) {
1299  success = AddFasta(*fasta_file);
1300  }
1301 
1302  bool success2 = EndBuild();
1303 
1304  success = success && success2;
1305 
1306  double t = sw.Elapsed();
1307 
1308  m_LogFile << "Total sequences stored: " << m_OIDCount << endl;
1309  m_LogFile << "Total deflines stored: " << m_DeflineCount << endl;
1310 
1311  m_LogFile << "Total time to build database: "
1312  << t << " seconds.\n" << endl;
1313 
1314  return success;
1315 }
1316 
// CBuildDatabase::StartBuild -- start building a new database.
// The body is empty: nothing is done here.
// NOTE(review): the signature line (1317) is not present in this listing.
1318 {
1319 }
1320 
// Add the specified sequences from the source database.
//
// Steps, in order:
//   1. If a source database is configured and `ids` is not empty, resolve
//      the IDs into a GI list with x_ResolveGis().
//   2. If that list holds any GIs or Seq-ids, replace m_SourceDb with a
//      database filtered by the list and copy its sequences to the output
//      with x_DupLocal(); in verbose mode, log GIs that share an OID.
//   3. If a list was produced, either fetch sequences remotely
//      (m_UseRemote) or report the IDs that stayed unresolved.
//
// Returns true unless step 3 reports failure; with no GI list the result
// is true.
1321 bool CBuildDatabase::AddIds(const vector<string> & ids)
1322 {
1323 
1324  bool success = true;
1325 
1326  // Resolve all ids to GIs, storing them in a GI list.
1327 
1328  CRef<CInputGiList> gi_list;
1329 
1330  if (m_SourceDb.NotEmpty() && ! ids.empty()) {
1331  gi_list = x_ResolveGis(ids);
1332  }
1333 
1334  // Translate the GI list.
1335 
1336  if (gi_list.NotEmpty() &&
1337  (gi_list->GetNumGis() || gi_list->GetNumSis())) {
1338 
1339  // The process of constructing a SeqDB object with a user GI
1340  // list causes translation of the User GI list, and is the
1341  // fastest way of performing such a translation in bulk. It
1342  // is possible to iterate the list afterwards to determine
1343  // what subset of it that has been translated; non-translated
1344  // GIs will need to be fetched using a data loader.
1345  //
1346  // It is not necessary, however, to iterate the GI list to
1347  // find OIDs that correspond to the filtered DB; these can be
1348  // found using OID iteration over SeqDB, which produces a
1349  // better ordering inasmuch as the reads from the source
1350  // sequence data will be sequential on disk.
1351 
// NOTE(review): listing lines 1352, 1355 and 1356 are not present here.
// Lines 1355-1356 presumably hold the `new CSeqDBExpert(...)` call whose
// last argument is the GI list on line 1357 -- confirm against the original.
1353 
1354  CRef<CSeqDBExpert> filtered
1357  &* gi_list));
1358 
// From here on the filtered database is the source database.
1359  m_SourceDb = filtered;
1360 
1361  // Add all local database sequences to the output DB.
1362 
1363  x_DupLocal();
1364 
1365  if (m_Verbose) {
1366  // Map oid to gi.
1367  map<int,TGi> seen_it;
1368 
1369  for(int i = 0; i < gi_list->GetNumGis(); i++) {
1370  int this_oid = gi_list->GetGiOid(i).oid;
1371  TGi this_gi = gi_list->GetGiOid(i).gi;
1372 
// An OID of -1 is skipped; presumably it marks a GI that was not found in
// the source database -- confirm.
1373  if (this_oid != -1) {
// The first GI seen for an OID is remembered; any later GI with the same OID
// is logged as a duplicate of it.
1374  if (seen_it.find(this_oid) == seen_it.end()) {
1375  seen_it[this_oid] = this_gi;
1376  } else {
1377  m_LogFile << "GI " << this_gi
1378  << " is duplicate of GI "
1379  << seen_it[this_oid]
1380  << endl;
1381  }
1382  }
1383  }
1384  }
1385  }
1386 
// The result of this step becomes the function's return value.
1387  if (gi_list.NotEmpty()) {
1388  if (m_UseRemote) {
1389  success = x_AddRemoteSequences(*gi_list);
1390  } else {
1391  success = x_ReportUnresolvedIds(*gi_list);
1392  }
1393  }
1394 
1395  return success;
1396 }
1397 
// CBuildDatabase::AddFasta -- add sequences from a file containing FASTA data.
// Wraps the stream in a CFastaBioseqSource and feeds it to AddSequences().
// If AddSequences() returns false, a CWriteDBException (eFileErr,
// "No sequences added") is thrown. On any exception the build is ended with
// EndBuild(true) and the exception is rethrown.
// Returns true when sequences were added, or when the stream test below
// fails and nothing is attempted.
// NOTE(review): the signature line (1398) is not present in this listing;
// per the cross-reference index it is `bool AddFasta(CNcbiIstream &fasta_file)`.
1399 {
1400  // Add any fasta sequences as well.
1401  bool success = true;
1402 
// fasta_file is a stream reference, so this tests the stream itself,
// presumably its good/fail state -- confirm.
1403  if (fasta_file) {
1404  CFastaBioseqSource fbs(fasta_file,
1405  m_IsProtein,
1406  m_ParseIDs,
1407  m_LongIDs);
1408 
1409  try {
1410  success = AddSequences(fbs);
1411  if (success == false)
1412  NCBI_THROW(CWriteDBException, eFileErr, "No sequences added");
1413 
1414  }
1415  catch (...) {
// EndBuild is asked to erase (erase = true) before the error is propagated.
1416  EndBuild(true);
1417  throw;
1418  }
1419  }
1420  return success;
1421 }
1422 
// CBuildDatabase::EndBuild -- finish building a new database.
// Closes the output database and delegates the rest to x_EndBuild().
// If Close() throws, x_EndBuild() is called with erase forced to true; the
// caught exception is passed along unless the caller had already requested
// erase, in which case NULL is passed instead.
// NOTE(review): the signature line (1423) and listing line 1431 are not
// present here. Per the cross-reference index the signature is
// `bool EndBuild(bool erase=false)`; line 1431 presumably declares the `ex`
// used on line 1432 -- confirm against the original file.
1424 {
1425  try {
1426  m_OutputDb->Close();
1427  return x_EndBuild(erase, NULL);
1428  } catch (const CException& e) {
1429  return x_EndBuild(true, erase ? NULL : &e);
1430  } catch (exception& e) {
1432  return x_EndBuild(true, erase ? NULL : &ex);
1433  } catch (...) {
// Anything that is not an exception object is wrapped in a CException.
1434  NCBI_EXCEPTION_VAR(ex, CException, eUnknown, "Non-standard exception");
1435  return x_EndBuild(true, erase ? NULL : &ex);
1436  }
1437 }
1438 
1439 bool CBuildDatabase::x_EndBuild(bool erase, const CException * close_exception)
1440 {
1441  bool success = true;
1442 
1443  vector<string> vols;
1444  vector<string> files;
1445 
1446  m_OutputDb->ListVolumes(vols);
1447  m_OutputDb->ListFiles(files);
1448 
1449  m_LogFile << endl;
1450 
1451  _ASSERT(vols.empty() == files.empty());
1452 
1453  if (vols.empty()) {
1454  m_LogFile << "No volumes were created."
1455  << endl;
1456 
1457  success = false;
1458  } else {
1459  ITERATE(vector<string>, iterf, files) {
1460  if (erase) {
1461  CFile(*iterf).Remove();
1462  _TRACE("Deleting " << *iterf);
1463  }
1464  }
1465  }
1466 
1467  m_LogFile << endl;
1468 
1469  if (close_exception) {
1470  NCBI_RETHROW(*close_exception, CWriteDBException, eArgErr,
1471  close_exception->GetMsg());
1472  }
1473 
1474  return success;
1475 }
1476 
1477 
// s_SetDeflineLeafs -- apply leaf taxids from `leafs` to one defline.
// Every non-empty key in `keys` is looked up in `leafs`; the taxids of all
// matching entries are collected. If at least one key matched, the defline's
// leaf taxids are set to the collected set, merged with the defline's
// existing leaf taxids when keep_old is true. If nothing matched and
// keep_old is false, the defline's leaf taxids are set from `tv` as it
// stands at that point.
// NOTE(review): the line carrying the function name (1479) and listing line
// 1499 are not present here. Line 1499 presumably declares `tv` as an empty
// CBlast_def_line::TTaxIds, which would make the no-match branch clear the
// leaf taxids -- confirm against the original file.
1478 static void
1480  objects::CBlast_def_line& defline,
1481  TIdToLeafs& leafs,
1482  bool keep_old,
1483  vector<string>& keys
1484 )
1485 {
1486  bool found = false;
1487  CBlast_def_line::TTaxIds taxids;
1488 
// Gather the taxids of every key that has an entry in the lookup table.
1489  ITERATE(vector<string>, key, keys) {
1490  if (!key->empty()) {
1491  TIdToLeafs::iterator item = leafs.find(*key);
1492  if (item != leafs.end()) {
1493  found = true;
1494  taxids.insert(item->second.begin(), item->second.end());
1495  }
1496  }
1497  }
1498 
1500  if (found) {
1501  if (keep_old) {
1502  const CBlast_def_line::TTaxIds& tx = defline.GetLeafTaxIds();
1503  tv.insert(tx.begin(), tx.end());
1504  }
1505  tv.insert(taxids.begin(), taxids.end());
1506  defline.SetLeafTaxIds(tv);
1507  } else {
// No key matched: the existing leaf taxids are left alone when keep_old is
// set, otherwise they are overwritten with `tv`.
1508  if (!keep_old) {
1509  defline.SetLeafTaxIds(tv);
1510  }
1511  }
1512 }
1513 
1514 
1515 static void
1516 s_SetDeflineBits(objects::CBlast_def_line & defline,
1517  TIdToBits & bitmap,
1518  bool keep_old,
1519  bool is_memb,
1520  vector<string> & keys)
1521 {
1522  bool found = false;
1523  int value = 0;
1524 
1525  ITERATE(vector<string>, key, keys) {
1526  if (! key->size())
1527  continue;
1528 
1529  TIdToBits::iterator item = bitmap.find(*key);
1530 
1531  if (item != bitmap.end()) {
1532  found = true;
1533  value |= item->second;
1534  }
1535  }
1536 
1537  if (found) {
1538  list<int> & linkv = (is_memb
1539  ? defline.SetMemberships()
1540  : defline.SetLinks());
1541 
1542  if (! keep_old) {
1543  linkv.clear();
1544  }
1545 
1546  if (linkv.empty()) {
1547  linkv.push_back(value);
1548  } else {
1549  linkv.front() |= value;
1550  }
1551  } else {
1552  if (! keep_old) {
1553  if (is_memb) {
1554  defline.ResetMemberships();
1555  } else {
1556  defline.ResetLinks();
1557  }
1558  }
1559  }
1560 }
1561 
// CBuildDatabase::x_SetLinkAndMbit -- store membership bits and leaf taxids
// in every defline of `headers`. For each defline its keys are obtained with
// GetDeflineKeys() and then used to look up m_Id2Mbits and m_Id2Leafs.
// Linkout bits are no longer applied (the call is commented out below).
// NOTE(review): the line carrying the function name (1563) is not present in
// this listing; per the cross-reference index the parameter is
// `CRef< objects::CBlast_def_line_set > headers`.
1562 void
1564 {
// One vector is shared by all iterations. Whether GetDeflineKeys() clears it
// first cannot be seen from here -- confirm.
1565  vector<string> keys;
1566 
1567  NON_CONST_ITERATE(CBlast_def_line_set::Tdata, iter, headers->Set()) {
1568  CBlast_def_line & defline = **iter;
1569  GetDeflineKeys(defline, keys);
1570 
1571  // m_Id2Links is DEPRECATED
1572 // s_SetDeflineBits(defline, m_Id2Links, m_KeepLinks, false, keys);
1573  s_SetDeflineBits(defline, m_Id2Mbits, m_KeepMbits, true, keys);
1574  s_SetDeflineLeafs(defline, m_Id2Leafs, m_KeepLeafs, keys);
1575  }
1576 }
1577 
// CBuildDatabase::SetMaxFileSize -- set the maximum size of database
// component files by forwarding max_file_size to the output database.
// NOTE(review): the signature line (1578) is not present in this listing;
// per the cross-reference index it is `void SetMaxFileSize(Uint8 max_file_size)`.
1579 {
1580  m_OutputDb->SetMaxFileSize(max_file_size);
1581 }
1582 
// CBuildDatabase::RegisterMaskingAlgorithm (options/name overload) -- define
// a masking algorithm by forwarding to the output database's
// RegisterMaskAlgorithm() and returning its result.
// When compiled with WorkShop at version 550 or lower, or with MIPSpro, the
// call is compiled out and 0 is returned instead.
// NOTE(review): the line carrying the function name and first parameter
// (1584) is not present in this listing; per the cross-reference index the
// first parameter is `EBlast_filter_program program`.
1583 int
1585  const string & options,
1586  const string & name)
1587 {
1588 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
1589  (!defined(NCBI_COMPILER_MIPSPRO)) )
1590  return m_OutputDb->RegisterMaskAlgorithm(program, options, name);
1591 #else
1592  return 0;
1593 #endif
1594 }
1595 
// CBuildDatabase::RegisterMaskingAlgorithm (description/options overload) --
// forwards program, description and options, in that order, to the output
// database's RegisterMaskAlgorithm() and returns its result.
// When compiled with WorkShop at version 550 or lower, or with MIPSpro, the
// call is compiled out and 0 is returned instead.
// NOTE(review): the line carrying the function name and first parameter
// (1597) is not present in this listing; the type of `program` cannot be
// seen from here -- confirm against the original file.
1596 int
1598  const string & description,
1599  const string & options)
1600 {
1601 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
1602  (!defined(NCBI_COMPILER_MIPSPRO)) )
1603  return m_OutputDb->RegisterMaskAlgorithm(program, description, options);
1604 #else
1605  return 0;
1606 #endif
1607 }
1608 
// CBuildDatabase::SetMaskDataSource -- specify an object mapping Seq-id to
// subject masking data. m_MaskData is reset to refer to `ranges` itself; no
// copy is made here.
// NOTE(review): the signature line (1609) is not present in this listing;
// per the cross-reference index it is
// `void SetMaskDataSource(IMaskDataSource &ranges)`.
1610 {
1611  m_MaskData.Reset(& ranges);
1612 }
1613 
USING_SCOPE(objects)
static CConstRef< CBioseq > s_FixBioseqDeltas(CConstRef< objects::CBioseq > bs)
Definition: build_db.cpp:290
static void s_SetDeflineLeafs(objects::CBlast_def_line &defline, TIdToLeafs &leafs, bool keep_old, vector< string > &keys)
Definition: build_db.cpp:1479
int debug_mode
Definition: build_db.cpp:63
static void s_SetDeflineBits(objects::CBlast_def_line &defline, TIdToBits &bitmap, bool keep_old, bool is_memb, vector< string > &keys)
Definition: build_db.cpp:1516
Code to build a database given various sources of sequence data.
CBioseq_Handle –.
TSeqPos GetLength(void) const
Definition: Bioseq.cpp:360
bool IsAa(void) const
Definition: Bioseq.cpp:350
`Blob' Class for SeqDB (and WriteDB).
Definition: seqdbblob.hpp:56
void WriteRaw(const char *begin, int size)
Write raw data to the blob (moving the write pointer).
Definition: seqdbblob.cpp:421
void Clear()
Clear all owned data and reference an empty string.
Definition: seqdbblob.cpp:58
Implementation of ILineReader for IReader.
bool AddSequences(IBioseqSource &src, bool add_pig=false)
Add sequences from an IBioseqSource object.
Definition: build_db.cpp:794
bool m_IsProtein
True for a protein database, false for nucleotide.
Definition: build_db.hpp:597
bool AddFasta(CNcbiIstream &fasta_file)
Add sequences from a file containing FASTA data.
Definition: build_db.cpp:1398
int m_DeflineCount
Defline count.
Definition: build_db.hpp:644
bool m_KeepLinks
True to keep linkout bits from source dbs, false to discard.
Definition: build_db.hpp:601
bool m_SkipLargeGis
If set to true, skip GIs with value > 0x7FFFFFFF.
Definition: build_db.hpp:667
TIdToBits m_Id2Links
Table of linkout bits to apply to sequences.
Definition: build_db.hpp:605
bool m_FoundMatchingMasks
If true, there were sequences whose IDs matched those in the provided masking locations (via SetMaskD...
Definition: build_db.hpp:661
static void CreateDirectories(const string &dbname)
Create Directory for blast db.
Definition: build_db.cpp:1051
bool x_ReportUnresolvedIds(const CInputGiList &gi_list) const
Write log messages for any unresolved IDs.
Definition: build_db.cpp:626
bool m_UseRemote
Whether to use remote resolution and sequence fetching.
Definition: build_db.hpp:641
void x_EditHeaders(CRef< objects::CBlast_def_line_set > headers)
Modify deflines with linkout and membership bits and taxids.
Definition: build_db.cpp:428
objects::CScope & x_GetScope()
Get a scope for remote loading of objects.
Definition: build_db.cpp:1226
void SetMaskDataSource(IMaskDataSource &ranges)
Specify an object mapping Seq-id to subject masking data.
Definition: build_db.cpp:1609
void SetMembBits(const TLinkoutMap &membbits, bool keep_mbits)
Specify a membership bit lookup object.
Definition: build_db.cpp:1270
bool m_ParseIDs
If true, string IDs found in FASTA input will be parsed as Seq-ids.
Definition: build_db.hpp:653
int RegisterMaskingAlgorithm(EBlast_filter_program program, const string &options, const string &name="")
Define a masking algorithm.
Definition: build_db.cpp:1584
int m_OIDCount
Number of OIDs stored in this database.
Definition: build_db.hpp:647
void SetSourceDb(const string &src_db_name)
Specify source database(s) via the database name(s).
Definition: build_db.cpp:1250
bool m_ScanBioseq4CFastaReaderUsrObjct
Definition: build_db.hpp:671
bool m_KeepMbits
True to keep membership bits from source dbs, false to discard.
Definition: build_db.hpp:608
CRef< CTaxIdSet > m_Taxids
Set of TaxIDs configured to apply to sequences.
Definition: build_db.hpp:626
CRef< objects::CScope > m_Scope
Sequence scope, used for remote fetching.
Definition: build_db.hpp:623
CRef< objects::CObjectManager > m_ObjMgr
Object manager, used for remote fetching.
Definition: build_db.hpp:620
ostream & m_LogFile
Logfile.
Definition: build_db.hpp:638
bool m_Verbose
If true, more detailed log messages will be produced.
Definition: build_db.hpp:650
bool m_KeepLeafs
True to keep leaf taxids from source dbs, false to discard.
Definition: build_db.hpp:614
bool x_EndBuild(bool erase, const CException *close_exception)
Definition: build_db.cpp:1439
bool m_SkipCopyingGis
If set to true, when copying BLASTDBs, skip the GIs.
Definition: build_db.hpp:664
CRef< CWriteDB > m_OutputDb
Database being produced here.
Definition: build_db.hpp:629
CRef< CSeqDBExpert > m_SourceDb
Database for duplicating sequences locally (-sourcedb option.)
Definition: build_db.hpp:632
void SetTaxids(CTaxIdSet &taxids)
Specify a mapping of sequence ids to taxonomic ids.
Definition: build_db.cpp:1216
CRef< CInputGiList > x_ResolveGis(const vector< string > &ids)
Resolve various input IDs (as strings) to GIs.
Definition: build_db.cpp:116
void x_SetLinkAndMbit(CRef< objects::CBlast_def_line_set > headers)
Store linkout (now deprecated) and membership bits in provided headers.
Definition: build_db.cpp:1563
CRef< IMaskDataSource > m_MaskData
Subject masking data.
Definition: build_db.hpp:635
bool x_EditAndAddBioseq(CConstRef< objects::CBioseq > bs, objects::CSeqVector *sv, bool add_pig=false)
Modify a Bioseq as needed and add it to the database.
Definition: build_db.cpp:469
bool m_LongIDs
If true, use long sequence ids (database|accession)
Definition: build_db.hpp:656
bool Build(const vector< string > &ids, CNcbiIstream *fasta_file)
Build the database.
Definition: build_db.cpp:1289
void x_AddOneRemoteSequence(const objects::CSeq_id &seqid, bool &found, bool &error)
Fetch a sequence from the remote service and add it to the db.
Definition: build_db.cpp:507
void x_AddMasksForSeqId(const list< CRef< CSeq_id > > &ids)
Add the masks for the Seq-id(s) (usually just one) to the database being created.
Definition: build_db.cpp:447
TIdToLeafs m_Id2Leafs
Table of leaf taxids to apply to sequences.
Definition: build_db.hpp:617
void x_ResolveRemoteId(CRef< objects::CSeq_id > &seqid, TGi &gi)
Resolve an ID remotely.
Definition: build_db.cpp:65
void x_DupLocal()
Duplicate IDs from local databases.
Definition: build_db.cpp:235
string m_OutputDbName
Definition: build_db.hpp:669
TIdToBits m_Id2Mbits
Table of membership bits to apply to sequences.
Definition: build_db.hpp:611
bool x_ResolveFromSource(const string &acc, CRef< objects::CSeq_id > &id)
Determine if this string ID can be found in the source database.
Definition: build_db.cpp:185
void StartBuild()
Start building a new database.
Definition: build_db.cpp:1317
bool EndBuild(bool erase=false)
Finish building a new database.
Definition: build_db.cpp:1423
void SetLinkouts(const TLinkoutMap &linkouts, bool keep_links)
Specify a linkout bit lookup object.
Definition: build_db.cpp:1262
void SetMaxFileSize(Uint8 max_file_size)
Set the maximum size of database component files.
Definition: build_db.cpp:1578
void x_AddPig(CRef< objects::CBlast_def_line_set > headers)
Add pig if id can be extracted from the deflines.
Definition: build_db.cpp:418
bool AddIds(const vector< string > &ids)
Add the specified sequences from the source database.
Definition: build_db.cpp:1321
void SetMaskLetters(const string &mask_letters)
Specify letters to mask out of protein sequence data.
Definition: build_db.cpp:1221
bool x_AddRemoteSequences(CInputGiList &gi_list)
Fetch the sequences for the listed IDs from the remote service and add them to the database.
Definition: build_db.cpp:555
CBuildDatabase(const string &dbname, const string &title, bool is_protein, CWriteDB::TIndexType indexing, bool use_gi_mask, ostream *logfile, bool long_seqids=false, EBlastDbVersion dbver=eBDB_Version4, bool limit_defline=false, Uint8 oid_masks=EOidMaskType::fNone, bool scan_bioseq_4_cfastareader_usrobj=true)
Constructor.
Definition: build_db.cpp:1073
void SetLeafTaxIds(const TIdToLeafs &taxids, bool keep_taxids)
Specify a leaf-taxids object.
Definition: build_db.cpp:1278
CDirEntry –.
Definition: ncbifile.hpp:262
CDir –.
Definition: ncbifile.hpp:1695
CFastaBioseqSource(CNcbiIstream &fasta_file, bool is_protein, bool parse_ids, bool long_ids)
Definition: build_db.cpp:701
virtual CConstRef< CBioseq > GetNext()
Get a Bioseq object if there are any more to get.
Definition: build_db.cpp:755
CFastaReader * m_FastaReader
Definition: build_db.cpp:698
CRef< ILineReader > m_LineReader
Definition: build_db.cpp:697
Base class for reading FASTA sequences.
Definition: fasta.hpp:80
CFile –.
Definition: ncbifile.hpp:1604
Gi List for database construction.
void AppendSi(const string &si, int oid=-1)
Append a Seq-id.
void AppendGi(TGi gi, int oid=-1)
Append a GI.
Thrown on an attempt to access wrong choice variant.
Definition: exception.hpp:102
This represents a set of masks for a given sequence.
Definition: writedb.hpp:65
bool empty() const
Redefine empty to mean no elements or none of its elements being empty.
Definition: writedb.hpp:71
CMultisourceException.
static CNcbiApplication * Instance(void)
Singleton method.
Definition: ncbiapp.cpp:264
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
CNcbiRegistry –.
Definition: ncbireg.hpp:913
CScope –.
Definition: scope.hpp:92
static SIZE_TYPE Convert(const CTempString &src, TCoding src_coding, TSeqPos pos, TSeqPos length, string &dst, TCoding dst_coding)
CSeqDBExpert.
Definition: seqdbexpert.hpp:55
void GetRawSeqAndAmbig(int oid, const char **buffer, int *seq_length, int *ambig_length) const
Raw Sequence and Ambiguity Data.
Definition: seqdbexpert.cpp:64
int GetNumGis() const
Get the number of GIs in the array.
const SGiOid & GetGiOid(int index) const
Access an element of the array.
int GetNumSis() const
Get the number of Seq-ids in the array.
const SSiOid & GetSiOid(int index) const
Access an element of the array.
T GetKey(int index) const
const string & GetDBNameList() const
Get list of database names.
Definition: seqdb.cpp:760
list< CRef< CSeq_id > > GetSeqIDs(int oid) const
Gets a list of sequence identifiers.
Definition: seqdb.cpp:765
ESeqType GetSequenceType() const
Returns the type of database opened - protein or nucleotide.
Definition: seqdb.cpp:427
@ eNucleotide
Definition: seqdb.hpp:175
@ eProtein
Definition: seqdb.hpp:174
string GetTitle() const
Returns the database title.
Definition: seqdb.cpp:630
void AccessionToOids(const string &acc, vector< int > &oids) const
Translate an Accession to a list of OIDs.
Definition: seqdb.cpp:870
bool CheckOrFindOID(int &next_oid) const
Find an included OID, incrementing next_oid if necessary.
Definition: seqdb.cpp:728
string GetDate() const
Returns the construction date of the database.
Definition: seqdb.cpp:635
CRef< CBlast_def_line_set > GetHdr(int oid) const
Get the ASN.1 header for the sequence.
Definition: seqdb.cpp:418
@ e_Ncbi8na
Definition: sequtil.hpp:52
@ e_Ncbi4na
Definition: sequtil.hpp:50
@ e_Ncbi2na
Definition: sequtil.hpp:48
CSeqVector –.
Definition: seq_vector.hpp:65
@Seq_descr.hpp User-defined methods of the data storage class.
Definition: Seq_descr.hpp:55
bool IsNa(void) const
Definition: Seq_inst.hpp:106
CStopWatch –.
Definition: ncbitime.hpp:1937
void FixTaxId(CRef< objects::CBlast_def_line_set > deflines)
Check that each defline has the specified taxid; if not, replace the defline and set the taxid.
Definition: taxid_set.cpp:131
bool HasEverFixedId() const
Definition: taxid_set.hpp:62
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
CTime –.
Definition: ncbitime.hpp:296
CWriteDBException.
CWriteDB.
Definition: writedb.hpp:92
int CreateUserColumn(const string &title)
Set up a user-defined CWriteDB column.
Definition: writedb.cpp:180
void SetMaxFileSize(Uint8 sz)
Set maximum size for output files.
Definition: writedb.cpp:118
ESeqType
Sequence types.
Definition: writedb.hpp:95
@ eProtein
Protein database.
Definition: writedb.hpp:97
@ eNucleotide
Nucleotide database.
Definition: writedb.hpp:100
int RegisterMaskAlgorithm(EBlast_filter_program program, const string &options=string(), const string &name=string())
Register a type of filtering data found in this database.
void AddColumnMetaData(int col_id, const string &key, const string &value)
Add meta data to a user-defined column.
Definition: writedb.cpp:185
int FindColumn(const string &title) const
Find an existing column.
Definition: writedb.cpp:175
void ListFiles(vector< string > &files)
List Filenames.
Definition: writedb.cpp:146
CBlastDbBlob & SetBlobData(int column_id)
Add blob data to a user-defined column.
Definition: writedb.cpp:190
void SetMaskData(const CMaskedRangesVector &ranges, const vector< TGi > &gis)
Set filtering data for a sequence.
Definition: writedb.cpp:169
void SetPig(int pig)
Set the PIG to be used for the sequence.
Definition: writedb.cpp:99
void AddSequence(const CBioseq &bs)
Add a sequence as a CBioseq.
Definition: writedb.cpp:79
EIndexType
Whether and what kind of indices to build.
Definition: writedb.hpp:104
@ eDefault
Like eFullIndex but also build a numeric Trace ID index.
Definition: writedb.hpp:121
@ eSparseIndex
Use only simple accessions in the string index.
Definition: writedb.hpp:109
void SetMaskedLetters(const string &masked)
Set letters that should not be used in sequences.
Definition: writedb.cpp:136
void ListVolumes(vector< string > &vols)
List Volumes.
Definition: writedb.cpp:141
int TIndexType
Bitwise OR of "EIndexType".
Definition: writedb.hpp:128
static CRef< CBlast_def_line_set > ExtractBioseqDeflines(const CBioseq &bs, bool parse_ids=true, bool long_ids=false, bool scan_bioseq_4_cfastareader_usrobj=false)
Extract Deflines From Bioseq.
Definition: writedb.cpp:129
void SetDeflines(const CBlast_def_line_set &deflines)
Set the deflines to be used for the sequence.
Definition: writedb.cpp:94
void Close()
Close the Database.
Definition: writedb.cpp:104
Interface to a source of Bioseq objects.
Definition: build_db.hpp:54
virtual CConstRef< objects::CBioseq > GetNext()=0
Get a Bioseq object if there are any more to get.
@ eProblem_TooLong
Definition: line_error.hpp:76
@ eProblem_ModifierFoundButNoneExpected
Definition: line_error.hpp:81
@ eProblem_TooManyAmbiguousResidues
Definition: line_error.hpp:79
An interface providing lookups of mask-data by Seq-id.
Definition: build_db.hpp:120
virtual CMaskedRangesVector & GetRanges(const list< CRef< CSeq_id > > &id)=0
Get ranges of masking data for the given Seq-ids.
Interface to a source of raw sequence data.
Definition: build_db.hpp:70
virtual const map< string, string > & GetColumnMetaData(int id)=0
Get metadata for the column with the specified Column ID.
virtual bool GetNext(CTempString &sequence, CTempString &ambiguities, CRef< objects::CBlast_def_line_set > &deflines, vector< SBlastDbMaskData > &mask_ranges, vector< int > &column_ids, vector< CTempString > &column_blobs)=0
Get a raw sequence.
virtual void GetColumnNames(vector< string > &names)=0
Get the names of all columns defined by this sequence source.
virtual int GetColumnId(const string &name)=0
Get the column ID for a column mentioned by name.
const_iterator end() const
Definition: map.hpp:152
const_iterator find(const key_type &key) const
Definition: map.hpp:153
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator begin() const
Definition: set.hpp:135
const_iterator end() const
Definition: set.hpp:136
static CMemoryRegistry registry
Definition: cn3d_tools.cpp:81
static uch flags
Operators to edit gaps in sequences.
#define S(s)
#define true
Definition: bool.h:35
#define false
Definition: bool.h:36
const CNcbiRegistry & GetConfig(void) const
Get the application's cached configuration parameters (read-only).
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define GI_CONST(gi)
Definition: ncbimisc.hpp:1087
#define ZERO_GI
Definition: ncbimisc.hpp:1088
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
#define _TRACE(message)
Definition: ncbidbg.hpp:122
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
#define NCBI_EXCEPTION_VAR(name, exception_class, err_code, message)
Create an instance of the exception to be thrown later.
Definition: ncbiexpt.hpp:684
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
Definition: ncbiexpt.cpp:342
#define NCBI_RETHROW(prev_exception, exception_class, err_code, message)
Generic macro to re-throw an exception.
Definition: ncbiexpt.hpp:737
static string CreateAbsolutePath(const string &path, ERelativeToWhat rtw=eRelativeToCwd)
Get an absolute path from some, possibly relative, path.
Definition: ncbifile.cpp:665
string GetDir(EIfEmptyPath mode=eIfEmptyPath_Current) const
Get the directory component for this directory entry.
Definition: ncbifile.cpp:475
bool CreatePath(TCreateFlags flags=fCreate_Default) const
Create the directory path recursively possibly more than one at a time.
Definition: ncbifile.cpp:4106
virtual bool Remove(TRemoveFlags flags=eRecursive) const
Remove a directory entry.
Definition: ncbifile.cpp:2595
bool CheckAccess(TMode access_mode) const
Check access rights.
Definition: ncbifile.cpp:1720
virtual bool Exists(void) const
Check if directory "dirname" exists.
Definition: ncbifile.hpp:4065
string GetName(void) const
Get the base entry name with extension (if any).
Definition: ncbifile.hpp:3916
@ eIfEmptyPath_Empty
Return empty string.
Definition: ncbifile.hpp:330
@ fWrite
Write permission.
Definition: ncbifile.hpp:1152
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
virtual CRef< CSeq_entry > ReadOneSeq(ILineErrorListener *pMessageListener=nullptr)
Read a single effective sequence, which may turn out to be a segmented set.
Definition: fasta.cpp:312
EFlags
Note on fAllSeqIds: some databases (notably nr) have merged identical sequences, joining their deflin...
Definition: fasta.hpp:86
virtual bool AtEOF(void) const =0
Indicates (negatively) whether there is any more input.
void SetMaxIDLength(Uint4 max_len)
If this is set, an exception will be thrown if a Sequence ID exceeds the given length.
Definition: fasta.cpp:485
void IgnoreProblem(ILineError::EProblem problem)
Definition: fasta.cpp:2221
@ fNoParseID
Generate an ID (whole defline -> title)
Definition: fasta.hpp:90
@ fQuickIDCheck
Just check local IDs' first characters.
Definition: fasta.hpp:110
@ fRequireID
Reject deflines that lack IDs.
Definition: fasta.hpp:95
@ fDisableNoResidues
If no residues found do not raise an error.
Definition: fasta.hpp:113
@ fForceType
Force specified type regardless of accession.
Definition: fasta.hpp:89
@ fParseRawID
Try to identify raw accessions.
Definition: fasta.hpp:97
@ fAssumeNuc
Assume nucs unless accns indicate otherwise.
Definition: fasta.hpp:87
@ fParseGaps
Make a delta sequence if gaps found.
Definition: fasta.hpp:91
@ fAssumeProt
Assume prots unless accns indicate otherwise.
Definition: fasta.hpp:88
const string AsFastaString(void) const
Definition: Seq_id.cpp:2266
EAccessionInfo
For IdentifyAccession (below)
Definition: Seq_id.hpp:220
static int BestRank(const CRef< CSeq_id > &id)
Definition: Seq_id.hpp:774
const CTextseq_id * GetTextseq_Id(void) const
Return embedded CTextseq_id, if any.
Definition: Seq_id.cpp:169
@ fAcc_prot
Definition: Seq_id.hpp:252
@ fAcc_nuc
Definition: Seq_id.hpp:251
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
vector< CSeq_id_Handle > TIds
Definition: scope.hpp:143
CConstRef< CBioseq > GetCompleteBioseq(void) const
Get the complete bioseq.
TBioseqStateFlags GetState(void) const
Get state of the bioseq.
bool Empty(void) const THROWS_NONE
Check if CConstRef is empty – not pointing to any object which means having a null value.
Definition: ncbiobj.hpp:1385
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:1439
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool NotEmpty(void) const THROWS_NONE
Check if CConstRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:1392
TObjectType * GetNonNullPointer(void) const
Get pointer value and throw a null pointer exception if pointer is null.
Definition: ncbiobj.hpp:1654
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
virtual const string & Get(const string &section, const string &name, TFlags flags=0) const
Get the parameter value.
Definition: ncbireg.cpp:262
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
#define kEmptyStr
Definition: ncbistr.hpp:123
#define NPOS
Definition: ncbistr.hpp:133
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2891
const char * data(void) const
Return a pointer to the array represented.
Definition: tempstr.hpp:313
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
Definition: tempstr.hpp:334
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
static unsigned int StringToUInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to unsigned int.
Definition: ncbistr.cpp:642
size_type size(void) const
Return the length of the represented array.
Definition: tempstr.hpp:327
double Elapsed(void) const
Return time elapsed since first Start() or last Restart() call (in seconds).
Definition: ncbitime.hpp:2775
string AsString(const CTimeFormat &format=kEmptyStr, TSeconds out_tz=eCurrentTimeZone) const
Transform time to string.
Definition: ncbitime.cpp:1512
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
@ eStart
Start timer immediately after creating.
Definition: ncbitime.hpp:1941
C::value_type FindBestChoice(const C &container, F score_func)
Find the best choice (lowest score) for values in a container.
Definition: ncbiutil.hpp:250
static const char label[]
EBlast_filter_program
This defines the possible sequence filtering algorithms to be used in a BLAST database.
bool IsSetOther_info(void) const
In proteins this stores the PIG, in nucleotides this stores the "origin GIs" (if one sequence is desc...
const Tdata & Get(void) const
Get the member data.
list< CRef< CBlast_def_line > > Tdata
const TOther_info & GetOther_info(void) const
Get the Other_info member data.
NCBI_NS_NCBI::TGi TGi
Definition: Seq_id_.hpp:180
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_id_.hpp:746
bool IsGi(void) const
Check if variant Gi is selected.
Definition: Seq_id_.hpp:883
bool IsSetVersion(void) const
Check if a value has been assigned to Version data member.
@ e_Local
local use
Definition: Seq_id_.hpp:95
const TSeq & GetSeq(void) const
Get the variant data.
Definition: Seq_entry_.cpp:102
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
Definition: Bioseq_.hpp:372
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
bool IsNcbi4na(void) const
Check if variant Ncbi4na is selected.
Definition: Seq_data_.hpp:564
TLength GetLength(void) const
Get the Length member data.
bool CanGetLength(void) const
Check if it is safe to call GetLength method.
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
bool CanGetId(void) const
Check if it is safe to call GetId method.
Definition: Bioseq_.hpp:284
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
const TNcbi4na & GetNcbi4na(void) const
Get the variant data.
Definition: Seq_data_.hpp:570
void SetRepr(TRepr value)
Assign a value to Repr data member.
Definition: Seq_inst_.hpp:574
bool CanGetSeq_data(void) const
Check if it is safe to call GetSeq_data method.
const TNcbi2na & GetNcbi2na(void) const
Get the variant data.
Definition: Seq_data_.hpp:550
const Tdata & Get(void) const
Get the member data.
Definition: Delta_ext_.hpp:164
void SetLength(TLength value)
Assign a value to Length data member.
Definition: Seq_inst_.hpp:668
bool IsGap(void) const
Check if variant Gap is selected.
Definition: Seq_data_.hpp:704
void SetSeq_data(TSeq_data &value)
Assign a value to Seq_data data member.
Definition: Seq_inst_.cpp:130
bool IsNcbi2na(void) const
Check if variant Ncbi2na is selected.
Definition: Seq_data_.hpp:544
const TSeq_data & GetSeq_data(void) const
Get the Seq_data member data.
void SetMol(TMol value)
Assign a value to Mol data member.
Definition: Seq_inst_.hpp:621
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
@ e_Literal
a piece of sequence
Definition: Delta_seq_.hpp:90
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:6929
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
static CStopWatch sw
int i
static MDB_envinfo info
Definition: mdb_load.c:37
bool CheckAccession(const string &acc, TGi &gi, CRef< objects::CSeq_id > &seqid, bool &specific)
void GetDeflineKeys(const objects::CBlast_def_line &defline, vector< string > &keys)
Get all keys for a defline.
void MapToLMBits(const TLinkoutMap &gilist, TIdToBits &gi2links)
Read a set of GI lists, each a vector of strings, and combine the bits into the resulting linkbits map.
vector< CItem * > TItems
Definition: pt_item.hpp:113
const struct ncbi::grid::netcache::search::fields::SIZE size
const struct ncbi::grid::netcache::search::fields::KEY key
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
EIPRangeType t
Definition: ncbi_localip.c:101
Defines the unified interface to the application.
The Object manager core.
static pcre_uint8 * buffer
Definition: pcretest.c:1051
CSeqDB::ESeqType ParseMoleculeTypeString(const string &str)
Convert a string to a CSeqDB ESeqType object.
Definition: seqdb.cpp:1527
bool DeleteBlastDb(const string &dbpath, CSeqDB::ESeqType seq_type)
Deletes all files associated with a BLAST database.
Definition: seqdb.cpp:1542
EBlastDbVersion
BLAST database version.
Definition: seqdbcommon.hpp:51
Defines the 'expert' version of the CSeqDB interfaces.
TGi gi
The GI or 0 if unknown.
int oid
The OID or -1 if unknown.
int oid
The OID or -1 if unknown.
#define _ASSERT
done
Definition: token1.c:1
static bool ambig(char c)
Defines BLAST database construction classes.
Modified on Tue May 28 05:48:54 2024 by modify_doxy.py rev. 669887