NCBI C++ ToolKit
seqdbvol.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seqdbvol.cpp 100101 2023-06-15 14:10:29Z merezhuk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Kevin Bealer
27  *
28  */
29 
30 /// @file seqdbvol.cpp
31 /// Implementation for the CSeqDBVol class, which provides an
32 /// interface for all functionality of one database volume.
33 #include <ncbi_pch.hpp>
35 #include "seqdboidlist.hpp"
36 
39 
40 #include <serial/objistr.hpp>
41 #include <serial/objostr.hpp>
42 #include <serial/objistrasnb.hpp>
43 #include <serial/objostrasnb.hpp>
44 #include <serial/serial.hpp>
45 #include <corelib/ncbimtx.hpp>
46 
47 #include <sstream>
48 
50 
52  CSeqDBLockHold & locked)
53 {
54  const char* data(0);
55 
56  if (m_NumOIDs == 0) {
57 
59 
60  // TODO we may want to check the version number and file type
61  m_Size = (Int4) SeqDB_GetStdOrd((Int4 *) (data+8));
62  m_NumOIDs = (Int4) SeqDB_GetStdOrd((Int4 *) (data+12));
63  }
64 
65  if (oid >= m_NumOIDs || oid < 0) return INVALID_GI;
66 
67  TIndx offset = oid * m_Size + 32;
69  return GI_FROM(Uint4, SeqDB_GetStdOrd((Uint4 *) data));
70 }
71 
73  const string & name,
74  char prot_nucl,
75  CSeqDBGiList * user_list,
76  CSeqDBNegativeList * neg_list,
77  int vol_start,
78  CSeqDBLockHold & locked)
79  : m_Atlas (atlas),
80  m_IsAA (prot_nucl == 'p'),
81  m_VolName (name),
82  m_TaxCache (256),
83  m_MemBit (0),
84  m_OidMaskType (0),
85  m_VolStart (vol_start),
86  m_VolEnd (0),
87  m_DeflineCache (256),
88  m_HaveColumns (false),
89  m_SeqFileOpened(false),
90  m_HdrFileOpened(false),
91  m_HashFileOpened(false),
92  m_OidFileOpened(false)
93 {
94  if (user_list) {
95  m_UserGiList.Reset(user_list);
96  m_OidMaskType = static_cast<int>(m_UserGiList->GetMaskOpts());
97  }
98  if (neg_list) {
99  m_NegativeList.Reset(neg_list);
100  }
101 
102  m_Idx.Reset(new CSeqDBIdxFile(atlas, name, prot_nucl));
103 
105 
106  // To allow for empty volumes, we must tolerate the absence of all
107  // files other than the index file.
108 }
109 
110 void
113 }
114 
115 void
117  CFastMutexGuard mtx_gurad(m_MtxSeq);
118  if (!m_SeqFileOpened && m_Idx->GetNumOIDs() != 0) {
119  m_Seq.Reset(new CSeqDBSeqFile(m_Atlas, m_VolName, (m_IsAA?'p':'n')));
120  }
121  m_SeqFileOpened = true;
122 }
123 
124 void
126  CFastMutexGuard mtx_gurad(m_MtxHdr);
127  if (!m_HdrFileOpened && m_Idx->GetNumOIDs() != 0) {
128  m_Hdr.Reset(new CSeqDBHdrFile(m_Atlas, m_VolName, (m_IsAA?'p':'n')));
129  }
130  m_HdrFileOpened = true;
131 }
132 
133 void
135  CFastMutexGuard mtx_gurad(m_MtxPig);
136  if (m_IsamPig.NotEmpty()) {
138  }
139  else if (CSeqDBIsam::IndexExists(m_VolName, (m_IsAA?'p':'n'), 'p') &&
140  m_Idx->GetNumOIDs() != 0) {
141  m_IsamPig = new CSeqDBIsam(m_Atlas, m_VolName, (m_IsAA?'p':'n'), 'p', ePigId);
142  }
143 }
144 
145 void
147  CFastMutexGuard mtx_gurad(m_MtxPig);
148  if (m_IsamPig.NotEmpty()) {
149  if (m_IsamPig->ReferencedOnlyOnce()) {
150  m_IsamPig.Reset();
151  }
152  else {
154  }
155  }
156 }
157 
158 void
160  CFastMutexGuard mtx_gurad(m_MtxGi);
161  if (m_IsamGi.NotEmpty()) {
163  }
164  else if (CSeqDBIsam::IndexExists(m_VolName, (m_IsAA?'p':'n'), 'n') &&
165  m_Idx->GetNumOIDs() != 0) {
166  m_IsamGi = new CSeqDBIsam(m_Atlas, m_VolName, (m_IsAA?'p':'n'), 'n', eGiId);
167  }
168 }
169 
170 void
172  CFastMutexGuard mtx_gurad(m_MtxGi);
173  if (m_IsamGi.NotEmpty()) {
174  if (m_IsamGi->ReferencedOnlyOnce()) {
175  m_IsamGi.Reset();
176  }
177  else {
179  }
180  }
181 }
182 
183 void
185  CFastMutexGuard mtx_gurad(m_MtxStr);
186  if (m_IsamStr.NotEmpty()) {
188  }
189  else if(CSeqDBIsam::IndexExists(m_VolName, (m_IsAA?'p':'n'), 's') &&
190  m_Idx->GetNumOIDs() != 0) {
191  m_IsamStr = new CSeqDBIsam(m_Atlas, m_VolName, (m_IsAA?'p':'n'), 's', eStringId);
192  }
193 }
194 
195 void
197  CFastMutexGuard mtx_gurad(m_MtxStr);
198  if (m_IsamStr.NotEmpty()) {
199  if (m_IsamStr->ReferencedOnlyOnce()) {
200  m_IsamStr.Reset();
201  }
202  else {
204  }
205  }
206 }
207 
208 void
210  CFastMutexGuard mtx_gurad(m_MtxTi);
211  if (m_IsamTi.NotEmpty()) {
213  }
214  else if (CSeqDBIsam::IndexExists(m_VolName, (m_IsAA?'p':'n'), 't') &&
215  m_Idx->GetNumOIDs() != 0) {
216  m_IsamTi = new CSeqDBIsam(m_Atlas, m_VolName, (m_IsAA?'p':'n'), 't', eTiId);
217  }
218 }
219 
220 void
222  CFastMutexGuard mtx_gurad(m_MtxTi);
223  if (m_IsamTi.NotEmpty()) {
224  if (m_IsamTi->ReferencedOnlyOnce()) {
225  m_IsamTi.Reset();
226  }
227  else {
229  }
230  }
231 }
232 
233 void
235  static CFastMutex mtx;
236  CFastMutexGuard mtx_gurad(mtx);
237  if (!m_HashFileOpened &&
238  CSeqDBIsam::IndexExists(m_VolName, (m_IsAA?'p':'n'), 'h') &&
239  m_Idx->GetNumOIDs() != 0) {
240  m_IsamHash =
241  new CSeqDBIsam(m_Atlas,
242  m_VolName,
243  (m_IsAA?'p':'n'),
244  'h',
245  eHashId);
246  }
247  m_HashFileOpened = true;
248 }
249 
250 void
252  static CFastMutex mtx;
253  CFastMutexGuard mtx_gurad(mtx);
254  if (!m_OidFileOpened &&
256  m_Idx->GetNumOIDs() != 0) {
257  m_GiIndex =
259  m_VolName,
260  (m_IsAA?'p':'n'));
261  }
262  m_OidFileOpened = true;
263 }
264 
266 {
267  return x_GetSeqType();
268 }
269 
271 {
272  return m_Idx->GetSeqType();
273 }
274 
276 {
277  return m_Idx->GetLMDBFileName();
278 }
279 
280 int CSeqDBVol::GetSeqLengthProt(int oid) const
281 {
282  TIndx start_offset = 0;
283  TIndx end_offset = 0;
284 
285  //m_Atlas.Lock(locked);
286  m_Idx->GetSeqStartEnd(oid, start_offset, end_offset);
287 
288  _ASSERT('p' == m_Idx->GetSeqType());
289 
290  // Subtract one, for the inter-sequence null.
291  return int(end_offset - start_offset - 1);
292 }
293 
294 // Assumes locked.
295 
297 {
298  TIndx start_offset = 0;
299  TIndx end_offset = 0;
300 
302  m_Idx->GetSeqStartEnd(oid, start_offset, end_offset);
303 
304  _ASSERT(m_Idx->GetSeqType() == 'n');
305 
306  int whole_bytes = int(end_offset - start_offset - 1);
307 
308  // The last byte is partially full; the last two bits of
309  // the last byte store the number of nucleotides in the
310  // last byte (0 to 3).
311 
312  char amb_char = 0;
313  m_Seq->ReadBytes(& amb_char, end_offset - 1, end_offset);
314 
315  int remainder = amb_char & 3;
316  return (whole_bytes * 4) + remainder;
317 }
318 
320 {
321  TIndx start_offset = 0;
322  TIndx end_offset = 0;
323 
324  //m_Atlas.Lock(locked);
325  m_Idx->GetSeqStartEnd(oid, start_offset, end_offset);
326 
327  _ASSERT(m_Idx->GetSeqType() == 'n');
328 
329  int whole_bytes = int(end_offset - start_offset - 1);
330 
331 // Same principle as above - but use lower bits of oid
332  // instead of fetching the actual last byte. this should
333  // correct for bias, unless sequence length modulo 4 has a
334  // significant statistical bias, which seems unlikely to
335  // me.
336 
337  return (whole_bytes * 4) + (oid & 0x03);
338 }
339 
340 /// Build NA2 to NcbiNA4 translation table
341 ///
342 /// This builds a translation table for nucleotide data. The table
343 /// will be used by s_SeqDBMapNA2ToNA4(). The table is indexed by the
344 /// packed nucleotide representation, or "NA2" format, which encodes
345 /// four bases per byte. The elements of the table are the unpacked
346 /// "Ncbi-NA4" representation, which encodes two bases per byte.
347 ///
348 /// @return
349 /// The NA2 to NA4 translation table
350 static vector<Uint1>
352 {
353  vector<Uint1> translated;
354  translated.resize(512);
355 
356  Uint1 convert[16] = { 0x11, 0x12, 0x14, 0x18,
357  0x21, 0x22, 0x24, 0x28,
358  0x41, 0x42, 0x44, 0x48,
359  0x81, 0x82, 0x84, 0x88 };
360 
361  Int2 pair1 = 0;
362  Int2 pair2 = 0;
363 
364  for(pair1 = 0; pair1 < 16; pair1++) {
365  for(pair2 = 0; pair2 < 16; pair2++) {
366  Int2 index = (pair1 * 16 + pair2) * 2;
367 
368  translated[index] = convert[pair1];
369  translated[index+1] = convert[pair2];
370  }
371  }
372 
373  return translated;
374 }
375 
376 /// Convert sequence data from NA2 to NA4 format
377 ///
378 /// This uses a translation table to convert nucleotide data. The
379 /// input data is in NA2 format, the output data will be in NcbiNA4
380 /// format.
381 ///
382 /// @param buf2bit
383 /// The NA2 input data. [in]
384 /// @param buf4bit
385 /// The NcbiNA4 output data. [out]
386 /// @param base_length
387 /// The length (in bases) of the input data. [in]
388 static void
389 s_SeqDBMapNA2ToNA4(const char * buf2bit,
390  vector<char> & buf4bit,
391  int base_length)
392 {
393  static vector<Uint1> expanded = s_SeqDBMapNA2ToNA4Setup();
394 
395  int estimated_length = (base_length + 1)/2;
396  int bytes = 0;
397 
398  buf4bit.resize(estimated_length);
399 
400  int inp_chars = base_length/4;
401 
402  for(int i=0; i<inp_chars; i++) {
403  Uint4 inp_char = (buf2bit[i] & 0xFF);
404 
405  buf4bit[bytes] = expanded[ (inp_char*2) ];
406  buf4bit[bytes+1] = expanded[ (inp_char*2) + 1 ];
407  bytes += 2;
408  }
409 
410  int bases_remain = base_length - (inp_chars*4);
411 
412  if (bases_remain) {
413  Uint1 remainder_bits = 2 * bases_remain;
414  Uint1 remainder_mask = (0xFF << (8 - remainder_bits)) & 0xFF;
415  Uint4 last_masked = buf2bit[inp_chars] & remainder_mask;
416 
417  buf4bit[bytes++] = expanded[ (last_masked*2) ];
418 
419  if (bases_remain > 2) {
420  buf4bit[bytes ++] = expanded[ (last_masked*2)+1 ];
421  }
422  }
423 
424  buf4bit.resize(bytes);
425 
426  _ASSERT(estimated_length == (int)buf4bit.size());
427 }
428 
429 /// Build NA2 to Ncbi-NA8 translation table
430 ///
431 /// This builds a translation table for nucleotide data. The table
432 /// will be used by s_SeqDBMapNA2ToNA8(). The table is indexed by the
433 /// packed nucleotide representation, or "NA2" format, which encodes
434 /// four bases per byte. The elements of the table are the unpacked
435 /// "Ncbi-NA8" representation, which encodes one base per byte.
436 ///
437 /// @return
438 /// The NA2 to NA8 translation table
439 static vector<Uint1>
441 {
442  // Builds a table; each two bit slice holds 0,1,2 or 3. These are
443  // converted to whole bytes containing 1,2,4, or 8, respectively.
444 
445  vector<Uint1> translated;
446  translated.reserve(1024);
447 
448  for(int i = 0; i<256; i++) {
449  int p1 = (i >> 6) & 0x3;
450  int p2 = (i >> 4) & 0x3;
451  int p3 = (i >> 2) & 0x3;
452  int p4 = i & 0x3;
453 
454  translated.push_back(1 << p1);
455  translated.push_back(1 << p2);
456  translated.push_back(1 << p3);
457  translated.push_back(1 << p4);
458  }
459 
460  return translated;
461 }
462 
463 /// Convert sequence data from NA2 to NA8 format
464 ///
465 /// This uses a translation table to convert nucleotide data. The
466 /// input data is in NA2 format, the output data will be in Ncbi-NA8
467 /// format. This function also optionally adds sentinel bytes to the
468 /// start and end of the data (needed by some applications).
469 ///
470 /// @param buf2bit
471 /// The NA2 input data. [in]
472 /// @param buf8bit
473 /// The start of the Ncbi-NA8 output data. [out]
474 /// @param buf8bit_end
475 /// The end of the Ncbi-NA8 output data. [out]
476 /// @param sentinel_bytes
477 /// Specify true if sentinel bytes should be included. [in]
478 /// @param range
479 /// The subregion of the sequence to work on. [in]
480 static void
481 s_SeqDBMapNA2ToNA8(const char * buf2bit,
482  char * buf8bit,
483  const SSeqDBSlice & range)
484 {
485  // Design note: The variable "p" makes this algorithm much easier
486  // to write correctly. It represents a pointer into the input
487  // data and is maintained to point at the next unused byte of
488  // input data.
489 
490  static vector<Uint1> expanded = s_SeqDBMapNA2ToNA8Setup();
491 
492  int pos = range.begin;
493 
494  int input_chars_begin = range.begin / 4;
495  int input_chars_end = (range.end + 3) / 4;
496 
497  int whole_chars_begin = (range.begin + 3) / 4;
498  int whole_chars_end = range.end / 4;
499 
500  int p = input_chars_begin;
501 
502  if (p < whole_chars_begin) {
503  Int4 table_offset = (buf2bit[input_chars_begin] & 0xFF) * 4;
504 
505  int endpt = (input_chars_begin + 1) * 4;
506 
507  if (endpt > range.end) {
508  endpt = range.end;
509  }
510 
511  for(int k = range.begin; k < endpt; k++) {
512  switch(k & 0x3) {
513  case 0:
514  _ASSERT(0);
515  break;
516 
517  case 1:
518  buf8bit[pos++] = expanded[ table_offset + 1 ];
519  break;
520 
521  case 2:
522  buf8bit[pos++] = expanded[ table_offset + 2 ];
523  break;
524 
525  case 3:
526  buf8bit[pos++] = expanded[ table_offset + 3 ];
527  break;
528  }
529  }
530 
531  p ++;
532  }
533 
534  // In a nucleotide search, this loop is probably a noticeable time
535  // consumer, at least relative to the CSeqDB universe. Each input
536  // byte is used to look up a 4 byte output translation. That four
537  // byte section is copied to the output vector. By pre-processing
538  // the arithmetic in the ~Setup() function, we can just pull bytes
539  // from a vector.
540 
541  p = whole_chars_begin;
542 
543  while(p < whole_chars_end) {
544  Int4 table_offset = (buf2bit[p] & 0xFF) * 4;
545 
546  buf8bit[pos++] = expanded[ table_offset ];
547  buf8bit[pos++] = expanded[ table_offset + 1 ];
548  buf8bit[pos++] = expanded[ table_offset + 2 ];
549  buf8bit[pos++] = expanded[ table_offset + 3 ];
550  p++;
551  }
552 
553  if (p < input_chars_end) {
554  Int4 table_offset = (buf2bit[p] & 0xFF) * 4;
555 
556  int remains = (range.end & 0x3);
557  _ASSERT(remains);
558 
559  buf8bit[pos++] = expanded[ table_offset ];
560 
561  if (remains > 1) {
562  buf8bit[pos++] = expanded[ table_offset + 1 ];
563 
564  if (remains > 2) {
565  buf8bit[pos++] = expanded[ table_offset + 2 ];
566  }
567  }
568  }
569 }
570 
572  15, /* Gap, 0 */
573  0, /* A, 1 */
574  1, /* C, 2 */
575  6, /* M, 3 */
576  2, /* G, 4 */
577  4, /* R, 5 */
578  9, /* S, 6 */
579  13, /* V, 7 */
580  3, /* T, 8 */
581  8, /* W, 9 */
582  5, /* Y, 10 */
583  12, /* H, 11 */
584  7, /* K, 12 */
585  11, /* D, 13 */
586  10, /* B, 14 */
587  14 /* N, 15 */
588 };
589 
590 /// Convert sequence data from Ncbi-NA8 to Blast-NA8 format
591 ///
592 /// This uses a translation table to convert nucleotide data. The
593 /// input data is in Ncbi-NA8 format, the output data will be in
594 /// Blast-NA8 format. The data is converted in-place.
595 ///
596 /// @param buf
597 /// The array of nucleotides to convert. [in|out]
598 /// @param range
599 /// The range of operation. [in]
600 static void
602  const SSeqDBSlice & range)
603 {
604  for(int i = range.begin; i < range.end; i++)
605  buf[i] = SeqDB_ncbina8_to_blastna8[ buf[i] & 0xF ];
606 }
607 
608 //--------------------
609 // NEW (long) version
610 //--------------------
611 
612 /// Get length of ambiguous region (new version)
613 ///
614 /// Given an ambiguity element in the new format, this returns the
615 /// length of the ambiguous region.
616 ///
617 /// @param ambchars
618 /// The packed ambiguity data. [in]
619 /// @param i
620 /// The index into the ambiguity data. [in]
621 /// @return
622 /// The region length.
623 inline Uint4 s_ResLenNew(const vector<Int4> & ambchars, Uint4 i)
624 {
625  return (ambchars[i] >> 16) & 0xFFF;
626 }
627 
628 /// Get position of ambiguous region (new version)
629 ///
630 /// Given an ambiguity element in the new format, this returns the
631 /// position of the ambiguous region.
632 ///
633 /// @param ambchars
634 /// The packed ambiguity data. [in]
635 /// @param i
636 /// The index into the ambiguity data. [in]
637 /// @return
638 /// The region length.
639 inline Uint4 s_ResPosNew(const vector<Int4> & ambchars, Uint4 i)
640 {
641  return ambchars[i+1];
642 }
643 
644 //-----------------------
645 // OLD (compact) version
646 //-----------------------
647 
648 /// Get ambiguous residue value (old version)
649 ///
650 /// Given an ambiguity element in the old format, this returns the
651 /// residue value to use for all bases in the ambiguous region.
652 ///
653 /// @param ambchars
654 /// The packed ambiguity data. [in]
655 /// @param i
656 /// The index into the ambiguity data. [in]
657 /// @return
658 /// The residue value.
659 inline Uint4 s_ResVal(const vector<Int4> & ambchars, Uint4 i)
660 {
661  return (ambchars[i] >> 28) & 0xF;
662 }
663 
664 /// Get ambiguous region length (old version)
665 ///
666 /// Given an ambiguity element in the old format, this returns the
667 /// length of the ambiguous region.
668 ///
669 /// @param ambchars
670 /// The packed ambiguity data. [in]
671 /// @param i
672 /// The index into the ambiguity data. [in]
673 /// @return
674 /// The residue value.
675 inline Uint4 s_ResLenOld(const vector<Int4> & ambchars, Uint4 i)
676 {
677  return (ambchars[i] >> 24) & 0xF;
678 }
679 
680 /// Get ambiguous residue value (old version)
681 ///
682 /// Given an ambiguity element in the old format, this returns the
683 /// position of the ambiguous region.
684 ///
685 /// @param ambchars
686 /// The packed ambiguity data. [in]
687 /// @param i
688 /// The index into the ambiguity data. [in]
689 /// @return
690 /// The residue value.
691 inline Uint4 s_ResPosOld(const vector<Int4> & ambchars, Uint4 i)
692 {
693  return ambchars[i] & 0xFFFFFF; // RES_OFFSET
694 }
695 
696 /// Rebuild an ambiguous region from sequence and ambiguity data
697 ///
698 /// When sequence data for a blast database is built, ambiguous
699 /// regions are replaced with random strings of the four standard
700 /// nucleotides. The ambiguity data is seperately encoded as a
701 /// sequence of integer values. This function unpacks the ambiguity
702 /// data and replaces the randomized bases with correct (ambiguous)
703 /// encodings. This version works with 4 bit representations.
704 ///
705 /// @param buf4bit
706 /// Sequence data for a sequence. [in|out]
707 /// @param amb_chars
708 /// Corresponding ambiguous data. [in]
709 static void
710 s_SeqDBRebuildDNA_NA4(vector<char> & buf4bit,
711  const vector<Int4> & amb_chars)
712 {
713  if (amb_chars.empty())
714  return;
715 
716  // Number of ambiguities.
717  Uint4 amb_num = amb_chars[0];
718 
719  // The new format is indicated by setting the highest order bit in
720  // the LENGTH field. Either all ambiguities for this sequence
721  // will use the new format, or all will use the old format.
722 
723  bool new_format = (amb_num & 0x80000000) != 0;
724 
725  if (new_format) {
726  amb_num &= 0x7FFFFFFF;
727  }
728 
729  for(Uint4 i=1; i < amb_num+1; i++) {
730  Int4 row_len = 0;
731  Int4 position = 0;
732  Uint1 char_r = 0;
733 
734  if (new_format) {
735  char_r = s_ResVal (amb_chars, i);
736  row_len = s_ResLenNew(amb_chars, i);
737  position = s_ResPosNew(amb_chars, i);
738  } else {
739  char_r = s_ResVal (amb_chars, i);
740  row_len = s_ResLenOld(amb_chars, i);
741  position = s_ResPosOld(amb_chars, i);
742  }
743 
744  Int4 pos = position / 2;
745  Int4 rem = position & 1; /* 0 or 1 */
746  Uint1 char_l = char_r << 4;
747 
748  Int4 j;
749  Int4 index = pos;
750 
751  // This could be made slightly faster for long runs.
752 
753  for(j = 0; j <= row_len; j++) {
754  if (!rem) {
755  buf4bit[index] = (buf4bit[index] & 0x0F) + char_l;
756  rem = 1;
757  } else {
758  buf4bit[index] = (buf4bit[index] & 0xF0) + char_r;
759  rem = 0;
760  index++;
761  }
762  }
763 
764  if (new_format) // for new format we have 8 bytes for each element.
765  i++;
766  }
767 }
768 
769 /// Rebuild an ambiguous region from sequence and ambiguity data
770 ///
771 /// When sequence data for a blast database is built, ambiguous
772 /// regions are replaced with random strings of the four standard
773 /// nucleotides. The ambiguity data is separately encoded as a
774 /// sequence of integer values. This function unpacks the ambiguity
775 /// data and replaces the randomized bases with correct (ambiguous)
776 /// encodings. This version works with 8 bit representations.
777 ///
778 /// @param seq
779 /// Sequence data for a sequence. [in|out]
780 /// @param amb_chars
781 /// Corresponding ambiguous data. [in]
782 /// @param region
783 /// If non-null, the part of the sequence to get. [in]
784 static void
786  const vector<Int4> & amb_chars,
787  const SSeqDBSlice & region)
788 {
789  if (amb_chars.empty() || !seq ) return;
790 
791  Uint4 amb_num = amb_chars[0];
792 
793  /* Check if highest order bit set. */
794  bool new_format = (amb_num & 0x80000000) != 0;
795 
796  if (new_format) amb_num &= 0x7FFFFFFF;
797 
798  for(Uint4 i = 1; i < amb_num+1; i++) {
799  Int4 row_len = 0;
800  Int4 position = 0;
801  Uint1 trans_ch = 0;
802 
803  if (new_format) {
804  trans_ch = s_ResVal (amb_chars, i);
805  row_len = s_ResLenNew(amb_chars, i) + 1;
806  position = s_ResPosNew(amb_chars, i);
807  } else {
808  trans_ch = s_ResVal (amb_chars, i);
809  row_len = s_ResLenOld(amb_chars, i) + 1;
810  position = s_ResPosOld(amb_chars, i);
811  }
812 
813  if (new_format) ++i;
814 
815  if (position + row_len <= region.begin)
816  continue;
817 
818  if(position >= region.end)
819  break;
820 
821  for (int j = 0; j < row_len; ++j, ++position)
822  if ( position >= region.begin && position < region.end)
823  seq[position] = trans_ch;
824  }
825 }
826 
827 /// Store protein sequence data in a Seq-inst
828 ///
829 /// This function reads length elements from seq_buffer and stores
830 /// them in a Seq-inst object. It also sets appropriate encoding
831 /// information in that object.
832 ///
833 /// @param seqinst
834 /// The Seq-inst to return the data in. [out]
835 /// @param seq_buffer
836 /// The input sequence data. [in]
837 /// @param length
838 /// The length (in bases) of the input data. [in]
839 static void
841  const char * seq_buffer,
842  int length)
843 {
844  // stuff - ncbistdaa
845  // mol = aa
846 
847  // This possibly/probably copies several times.
848  // 1. One copy into stdaa_data.
849  // 2. Second copy into NCBIstdaa.
850  // 3. Third copy into seqdata.
851 
852  vector<char> aa_data;
853  aa_data.resize(length);
854 
855  for(int i = 0; i < length; i++) {
856  aa_data[i] = seq_buffer[i];
857  }
858 
859  seqinst.SetSeq_data().SetNcbistdaa().Set().swap(aa_data);
860  seqinst.SetMol(CSeq_inst::eMol_aa);
861 }
862 
863 /// Store non-ambiguous nucleotide sequence data in a Seq-inst
864 ///
865 /// This function reads length elements from seq_buffer and stores
866 /// them in a Seq-inst object. It also sets appropriate encoding
867 /// information in that object. No ambiguity information is used.
868 /// The input array is assumed to be in 2 bit representation.
869 ///
870 /// @param seqinst
871 /// The Seq-inst to return the data in. [out]
872 /// @param seq_buffer
873 /// The input sequence data. [in]
874 /// @param length
875 /// The length (in bases) of the input data. [in]
876 static void
878  const char * seq_buffer,
879  int length)
880 {
881  int whole_bytes = length / 4;
882  int partial_byte = ((length & 0x3) != 0) ? 1 : 0;
883 
884  vector<char> na_data;
885  na_data.resize(whole_bytes + partial_byte);
886 
887  for(int i = 0; i<whole_bytes; i++) {
888  na_data[i] = seq_buffer[i];
889  }
890 
891  if (partial_byte) {
892  na_data[whole_bytes] = seq_buffer[whole_bytes] & (0xFF - 0x03);
893  }
894 
895  seqinst.SetSeq_data().SetNcbi2na().Set().swap(na_data);
896  seqinst.SetMol(CSeq_inst::eMol_na);
897 }
898 
899 /// Store non-ambiguous nucleotide sequence data in a Seq-inst
900 ///
901 /// This function reads length elements from seq_buffer and stores
902 /// them in a Seq-inst object. It also sets appropriate encoding
903 /// information in that object. No ambiguity information is used.
904 /// The input array is assumed to be in Ncbi-NA4 representation.
905 ///
906 /// @param seqinst
907 /// The Seq-inst to return the data in. [out]
908 /// @param seq_buffer
909 /// The input sequence data in Ncbi-NA4 format. [in]
910 /// @param length
911 /// The length (in bases) of the input data. [in]
912 /// @param amb_chars
913 /// The ambiguity data for this sequence. [in]
914 static void
916  const char * seq_buffer,
917  int length,
918  vector<Int4> & amb_chars)
919 {
920  vector<char> buffer_4na;
921  s_SeqDBMapNA2ToNA4(seq_buffer, buffer_4na, length); // length is not /4 here
922  s_SeqDBRebuildDNA_NA4(buffer_4na, amb_chars);
923 
924  seqinst.SetSeq_data().SetNcbi4na().Set().swap(buffer_4na);
925  seqinst.SetMol(CSeq_inst::eMol_na);
926 }
927 
928 /// Get the title string for a CBioseq
929 ///
930 /// GetBioseq will use this function to get a title field when
931 /// constructing the CBioseq object.
932 ///
933 /// @param deflines
934 /// The set of deflines for this sequence. [in]
935 /// @param title
936 /// The returned title string. [out]
937 static void
939 {
940  title.erase();
941 
942  string seqid_str;
943 
944  typedef list< CRef<CBlast_def_line> >::const_iterator TDefIt;
945  typedef list< CRef<CSeq_id > >::const_iterator TSeqIt;
946 
947  const list< CRef<CBlast_def_line> > & dl = deflines->Get();
948 
949  bool first_defline(true);
950 
951  for(TDefIt iter = dl.begin(); iter != dl.end(); iter++) {
952  ostringstream oss;
953 
954  const CBlast_def_line & defline = **iter;
955 
956  if (! title.empty()) {
957  //oss << "\1";
958  oss << " ";
959  }
960 
961  bool wrote_seqids(false);
962 
963  if ((!first_defline) && defline.CanGetSeqid()) {
964  const list< CRef<CSeq_id > > & sl = defline.GetSeqid();
965 
966  bool first_seqid(true);
967 
968  // First should look like: "<title>"
969  // Others should look like: " ><seqid>|<seqid>|<seqid><title>"
970 
971  // Should this be two sections not one loop?
972 
973  for (TSeqIt seqit = sl.begin(); seqit != sl.end(); seqit++) {
974  if (first_seqid) {
975  oss << ">";
976  } else {
977  oss << "|";
978  }
979 
980  (*seqit)->WriteAsFasta(oss);
981 
982  first_seqid = false;
983  wrote_seqids = true;
984  }
985  }
986 
987  // Omit seqids from first defline
988  first_defline = false;
989 
990  if (defline.CanGetTitle()) {
991  if (wrote_seqids) {
992  oss << " ";
993  }
994  oss << defline.GetTitle();
995  }
996 
997  title += oss.str();
998  }
999 }
1000 
1001 /// Search for a Seq-id in a list of Seq-ids
1002 ///
1003 /// This iterates over a list of Seq-ids, and returns true if a
1004 /// specific Seq-id is equivalent to one found in the list.
1005 ///
1006 /// @param seqids
1007 /// A list of Seq-ids to search. [in]
1008 /// @param target
1009 /// The Seq-id to search for. [in]
1010 /// @return
1011 /// True if the Seq-id was found.
1012 static bool
1013 s_SeqDB_SeqIdIn(const list< CRef< CSeq_id > > & seqids, const CSeq_id & target)
1014 {
1015  typedef list< CRef<CSeq_id> > TSeqidList;
1016 
1017  ITERATE(TSeqidList, iter, seqids) {
1018  CSeq_id::E_SIC rv = (**iter).Compare(target);
1019 
1020  switch(rv) {
1021  case CSeq_id::e_YES:
1022  return true;
1023 
1024  case CSeq_id::e_NO:
1025  return false;
1026 
1027  default:
1028  break;
1029  }
1030  }
1031 
1032  return false;
1033 }
1034 
1037  TGi preferred_gi,
1038  const CSeq_id * preferred_seqid)
1039 
1040 {
1041  // Commented out, not used at this time.
1042 // typedef list< CRef<CBlast_def_line> > TBDLL;
1043 // typedef TBDLL::iterator TBDLLIter;
1044 // typedef TBDLL::const_iterator TBDLLConstIter;
1045 
1046  // 1. read a defline set w/ gethdr, filtering by membership bit.
1047 
1049  x_GetFilteredHeader(oid, NULL);
1050 
1051  // 2. if there is a preferred gi, bump it to the top.
1052 
1053  if (preferred_gi != ZERO_GI || preferred_seqid) {
1055 
1056  CRef<const CSeq_id> seqid;
1057  if (preferred_gi != ZERO_GI) {
1058  seqid.Reset(new CSeq_id(CSeq_id::e_Gi, preferred_gi));
1059  } else {
1060  seqid.Reset(preferred_seqid);
1061  }
1062 
1063  bool found = false;
1064 
1065  ITERATE(list< CRef<CBlast_def_line> >, iter, BDLS->Get()) {
1066  if ((! found) && s_SeqDB_SeqIdIn((**iter).GetSeqid(), *seqid)) {
1067  found = true;
1068  new_bdls->Set().push_front(*iter);
1069  } else {
1070  new_bdls->Set().push_back(*iter);
1071  }
1072  }
1073 
1074  return new_bdls;
1075  }
1076 
1077  return BDLS;
1078 }
1079 
1080 list< CRef<CSeqdesc> >
1082  TGi preferred_gi,
1083  const CSeq_id * preferred_seqid)
1084 
1085 
1086 {
1087  const bool provide_new_taxonomy_info = true;
1088  const bool use_taxinfo_cache = (CThread::GetSelf() == 0 ? true : false);
1089 
1090  const char * TAX_ORGREF_DB_NAME = "taxon";
1091 
1092  list< CRef<CSeqdesc> > taxonomy;
1093 
1095  x_GetTaxDefline(oid, preferred_gi, preferred_seqid);
1096 
1097  if (bdls.Empty()) {
1098  return taxonomy;
1099  }
1100 
1101  typedef list< CRef<CBlast_def_line> > TBDLL;
1102 // typedef TBDLL::iterator TBDLLIter; // not used at this time
1103  typedef TBDLL::const_iterator TBDLLConstIter;
1104 
1105  const TBDLL & dl = bdls->Get();
1106 
1107  // Lock for sake of tax cache
1108 
1109  //m_Atlas.Lock(locked);
1110 
1111  for(TBDLLConstIter iter = dl.begin(); iter != dl.end(); iter ++) {
1112  TTaxId taxid = ZERO_TAX_ID;
1113 
1114  if ((*iter)->CanGetTaxid()) {
1115  taxid = (*iter)->GetTaxid();
1116  }
1117  if (taxid <= ZERO_TAX_ID) {
1118  continue;
1119  }
1120 
1121  bool have_org_desc = false;
1122 
1123  if (use_taxinfo_cache && m_TaxCache.Lookup(TAX_ID_TO(int, taxid)).NotEmpty()) {
1124  have_org_desc = true;
1125  }
1126 
1127  SSeqDBTaxInfo tnames(taxid);
1128  bool found_taxid_in_taxonomy_blastdb = true;
1129 
1130  if ((! have_org_desc) && provide_new_taxonomy_info) {
1131  try {
1132  found_taxid_in_taxonomy_blastdb = CSeqDBTaxInfo::GetTaxNames(taxid, tnames);
1133  } catch (CSeqDBException &) {
1134  found_taxid_in_taxonomy_blastdb = false;
1135  }
1136  }
1137 
1138  if (provide_new_taxonomy_info) {
1139  if (have_org_desc) {
1140  taxonomy.push_back(m_TaxCache.Lookup(TAX_ID_TO(int, taxid)));
1141  } else {
1142  CRef<CDbtag> org_tag(new CDbtag);
1143  org_tag->SetDb(TAX_ORGREF_DB_NAME);
1144  org_tag->SetTag().SetId(TAX_ID_TO(int, taxid));
1145 
1146  CRef<COrg_ref> org(new COrg_ref);
1147  if (found_taxid_in_taxonomy_blastdb) {
1148  org->SetTaxname().swap(tnames.scientific_name);
1149  org->SetCommon().swap(tnames.common_name);
1150  }
1151  org->SetDb().push_back(org_tag);
1152 
1154  source.Reset(new CBioSource);
1155  source->SetOrg(*org);
1156 
1157  CRef<CSeqdesc> desc(new CSeqdesc);
1158  desc->SetSource(*source);
1159 
1160  taxonomy.push_back(desc);
1161 
1162  if (use_taxinfo_cache) {
1163  m_TaxCache.Lookup(TAX_ID_TO(int, taxid)) = desc;
1164  }
1165  }
1166  }
1167  }
1168 
1169  return taxonomy;
1170 }
1171 
1172 /// Efficiently decode a Blast-def-line-set from binary ASN.1.
1173 /// @param oss Octet string sequence of binary ASN.1 data.
1174 /// @param bdls Blast def line set decoded from oss.
1177 {
 // Strategy: obtain one contiguous byte range (`data`, `size`) covering all
 // octet-string chunks of `oss`, then deserialize from it via `inpstr`.
 // NOTE(review): the signature lines are not visible in this listing; the
 // call in s_ExtractBlastDefline suggests this is s_OssToDefline and that
 // the decoded set is the return value (`retval`) -- TODO confirm.
1178  typedef const CUser_field::TData::TOss TOss;
1179 
1180  const char * data = NULL;
1181  size_t size = 0;
1182  string temp;
1183 
1184  if (oss.size() == 1) {
1185  // In the single-element case, no copies are needed.
1186 
1187  const vector<char> & v = *oss.front();
1188  data = & v[0];
1189  size = v.size();
1190  } else {
1191  // Determine the octet string length and do one allocation.
1192 
1193  ITERATE (TOss, iter1, oss) {
1194  size += (**iter1).size();
1195  }
1196 
1197  temp.reserve(size);
1198 
1199  ITERATE (TOss, iter3, oss) {
1200  // 23.2.4[1] "The elements of a vector are stored contiguously".
1201  temp.append(& (**iter3)[0], (*iter3)->size());
1202  }
1203 
 // NOTE(review): an empty `oss` also reaches this branch, leaving `size`
 // at 0 and `temp` empty.
1204  data = & temp[0];
1205  }
1206 
 // NOTE(review): the declarations of `inpstr` and `retval` fall on lines
 // that are not visible in this listing.
1209  inpstr >> *retval;
1210  return retval;
1211 }
1212 
// Searches the descriptors of `bioseq` for a user object whose type string
// equals kAsnDeflineObjLabel and, if its first field holds octet-string data,
// decodes that data with s_OssToDefline(). In every other case `failure` is
// returned.
// NOTE(review): the return type and the declaration of `failure` are on lines
// not visible in this listing.
1213 template<class T>
1215 s_ExtractBlastDefline(const T& bioseq)
1216 {
1218  if ( !bioseq.IsSetDescr() ) {
1219  return failure;
1220  }
1221 
1222  const CSeq_descr::Tdata& descList = bioseq.GetDescr().Get();
1223  ITERATE(CSeq_descr::Tdata, iter, descList) {
 // Only user-object descriptors can carry the defline payload.
1224  if ( !(*iter)->IsUser() ) {
1225  continue;
1226  }
1227 
1228  const CUser_object& uobj = (*iter)->GetUser();
1229  const CObject_id& uobjid = uobj.GetType();
1230  if (uobjid.IsStr() && uobjid.GetStr() == kAsnDeflineObjLabel) {
1231  const vector< CRef< CUser_field > >& usf = uobj.GetData();
 // NOTE(review): the non-empty precondition is only checked through
 // _ASSERT; usf.front() is used unconditionally below.
1232  _ASSERT( !usf.empty() );
1233  _ASSERT(usf.front()->CanGetData());
1234  if (usf.front()->GetData().IsOss()) { //only one user field
1235  return s_OssToDefline(usf.front()->GetData().GetOss());
1236  }
1237  }
1238  }
1239  return failure;
1240 }
1241 
1244 { return s_ExtractBlastDefline(handle); } // forwards `handle` to the shared template helper (signature not visible in this listing)
1245 
1248 { return s_ExtractBlastDefline(bioseq); } // forwards `bioseq` to the shared template helper (signature not visible in this listing)
1249 
1252 {
 // Wraps the filtered binary header of `oid` in a user-object descriptor:
 // a CUser_object typed kAsnDeflineObjLabel with one field (same label)
 // whose octet-string data is the header bytes. If the header is empty the
 // returned CRef stays empty.
 // NOTE(review): the signature is not visible in this listing; the call in
 // the Bioseq builder suggests this is x_GetAsnDefline(oid) -- TODO confirm.
1253  CRef<CSeqdesc> asndef;
1254 
1255  vector<char> hdr_data;
1256  x_GetFilteredBinaryHeader(oid, hdr_data);
1257 
1258  if (! hdr_data.empty()) {
1259  CRef<CUser_object> uobj(new CUser_object);
1260 
1261  CRef<CObject_id> uo_oi(new CObject_id);
1262  uo_oi->SetStr(kAsnDeflineObjLabel);
1263  uobj->SetType(*uo_oi);
1264 
 // NOTE(review): the declaration of `uf` is on a line not visible here.
1266 
1267  CRef<CObject_id> uf_oi(new CObject_id);
1268  uf_oi->SetStr(kAsnDeflineObjLabel);
1269  uf->SetLabel(*uf_oi);
1270 
1271  vector< vector<char>* > & strs = uf->SetData().SetOss();
1272  uf->SetNum(1);
1273 
 // The header bytes are moved into the new chunk with swap() rather than
 // copied. NOTE(review): presumably the Oss container takes ownership of
 // the raw `new vector<char>` -- TODO confirm.
1274  strs.push_back(new vector<char>);
1275  strs[0]->swap(hdr_data);
1276 
1277  uobj->SetData().push_back(uf);
1278 
1279  asndef = new CSeqdesc;
1280  asndef->SetUser(*uobj);
1281  }
1282 
1283  return asndef;
1284 }
1285 
1288  TGi target_gi,
1289  const CSeq_id * target_seq_id,
1290  bool seqdata,
1291  CSeqDBLockHold & locked)
1292 {
 // Builds a CBioseq for `oid`: picks the deflines (optionally narrowed to the
 // one matching `target_gi` / `target_seq_id`), optionally attaches sequence
 // data, then adds a title descriptor, the binary ASN.1 defline descriptor
 // and taxonomy descriptors. An empty CRef (`null_result`) is returned when
 // no usable defline or Seq-id is available or the sequence length is < 1.
 // NOTE(review): the first line of the signature is not visible here.
1293  typedef list< CRef<CBlast_def_line> > TDeflines;
1294  CRef<CBioseq> null_result;
1295 
1296  CRef<CBlast_def_line> defline;
1297  list< CRef< CSeq_id > > seqids;
1298 
1300 
1301  // Get the defline set; but do not modify the object returned by
1302  // GetFilteredHeader, since that object lives in the cache.
1303 
1304  CRef<CBlast_def_line_set> orig_deflines =
1305  x_GetFilteredHeader(oid, NULL);
1306 
1307  CRef<CBlast_def_line_set> defline_set;
1308 
1309  if ((target_gi != ZERO_GI) || target_seq_id) {
 // A target was requested: build a new one-element set holding the first
 // defline that matches either the target id or its local-id equivalent.
1310  defline_set.Reset(new CBlast_def_line_set);
1311 
1312  CRef<const CSeq_id > seqid;
1313  CRef<const CSeq_id > seqid_lcl;
1314  if (target_gi != ZERO_GI) {
1315  seqid.Reset(new CSeq_id(CSeq_id::e_Gi, target_gi));
1316  seqid_lcl.Reset(new CSeq_id(CSeq_id::e_Local, NStr::NumericToString(target_gi)));
1317  }
1318  else {
1319  seqid.Reset(target_seq_id);
1320  seqid_lcl.Reset(new CSeq_id(CSeq_id::e_Local, seqid->GetSeqIdString(true)));
1321  }
1322 
1323  CRef<CBlast_def_line> filt_dl;
1324 
1325  ITERATE(TDeflines, iter, orig_deflines->Get()) {
1326  if ((seqid.NotEmpty() && s_SeqDB_SeqIdIn((**iter).GetSeqid(), *seqid)) ||
1327  (seqid_lcl.NotEmpty() && s_SeqDB_SeqIdIn((**iter).GetSeqid(), *seqid_lcl))) {
1328  filt_dl = *iter;
1329  break;
1330  }
1331  }
1332 
 // No defline of this OID carries the requested id: report as an
 // argument error rather than returning an empty result.
1333  if (filt_dl.Empty()) {
1334  NCBI_THROW(CSeqDBException, eArgErr,
1335  "Error: oid headers do not contain target gi/seq_id.");
1336  } else {
1337  defline_set->Set().push_back(filt_dl);
1338  }
1339  } else {
1340  defline_set = orig_deflines;
1341  }
1342 
1343  if (defline_set.Empty() ||
1344  (! defline_set->CanGet()) ||
1345  (0 == defline_set->Get().size())) {
1346  return null_result;
1347  }
1348 
 // The Seq-ids of the first defline become the ids of the Bioseq
 // (`seqids` is a copy, later swapped into the Bioseq).
1349  defline = defline_set->Get().front();
1350  if (! defline->CanGetSeqid()) {
1351  return null_result;
1352  }
1353  seqids = defline->GetSeqid();
1354 
1355  // Get length & sequence.
1356 
1357  CRef<CBioseq> bioseq(new CBioseq);
1358 
1359  bool is_prot = (x_GetSeqType() == 'p');
1360 
1361  if (seqdata) {
1362  const char * seq_buffer = 0;
1363  int length = x_GetSequence(oid, & seq_buffer);
1364 
1365  if (length < 1) {
1366  return null_result;
1367  }
1368 
1369  // If protein, we set bsp->mol = Seq_mol_aa, seq_data_type =
1370  // Seq_code_ncbistdaa; then we write the buffer into the byte
1371  // store (or equivalent).
1372  //
1373  // Nucleotide sequences require more work:
1374  // a. Try to get ambchars
1375  // b. If there are any, convert sequence to 4 byte rep.
1376  // c. Otherwise write to a byte store.
1377  // d. Set mol = Seq_mol_na;
1378 
1379  CSeq_inst & seqinst = bioseq->SetInst();
1380 
1381  if (is_prot) {
1382  s_SeqDBWriteSeqDataProt(seqinst, seq_buffer, length);
1383  } else {
1384  // nucl
1385  vector<Int4> ambchars;
1386 
1387  x_GetAmbChar(oid, ambchars);
1388 
1389  if (ambchars.empty()) {
1390  // keep as 2 bit
1391  s_SeqDBWriteSeqDataNucl(seqinst, seq_buffer, length);
1392  } else {
1393  // translate to 4 bit
1394  s_SeqDBWriteSeqDataNucl(seqinst, seq_buffer, length, ambchars);
1395  }
1396 
1397  // mol = na
1398  seqinst.SetMol(CSeq_inst::eMol_na);
1399  }
1400 
 // NOTE(review): this only clears the local pointer, which is not read
 // again in this scope; it releases nothing.
1401  if (seq_buffer) {
1402  seq_buffer = 0;
1403  }
1404 
1405  // Set the length and repr (== raw).
1406 
1407  seqinst.SetLength(length);
1408  seqinst.SetRepr(CSeq_inst::eRepr_raw);
1409  } else {
 // No sequence data requested: only instance metadata is filled in.
 // NOTE(review): parts of this branch are on lines not visible here.
1410  CSeq_inst & seqinst = bioseq->SetInst();
1412 
1413  bioseq->SetInst().SetMol(is_prot
1415  : CSeq_inst::eMol_na);
1416  }
1417 
1418  // Set the id (Seq_id)
1419 
1420  bioseq->SetId().swap(seqids);
1421 
1422  // If the format is binary, we get the defline and chain it onto
1423  // the bsp->desc list; then we read and append taxonomy names to
1424  // the list (x_GetTaxonomy()).
1425 
1426  if (defline_set.NotEmpty()) {
1427  // Convert defline to string.
1428 
1429  string description;
1430 
1431  s_GetBioseqTitle(defline_set, description);
1432 
1433  CRef<CSeqdesc> desc1(new CSeqdesc);
1434  desc1->SetTitle().swap(description);
1435 
1436  CRef<CSeqdesc> desc2( x_GetAsnDefline(oid) );
1437 
1438  CSeq_descr & seq_desc_set = bioseq->SetDescr();
1439  seq_desc_set.Set().push_back(desc1);
1440 
1441  if (! desc2.Empty()) {
1442  seq_desc_set.Set().push_back(desc2);
1443  }
1444  }
1445 
1446  list< CRef<CSeqdesc> > tax =
1447  x_GetTaxonomy(oid, target_gi, target_seq_id);
1448 
1449  ITERATE(list< CRef<CSeqdesc> >, iter, tax) {
1450  bioseq->SetDescr().Set().push_back(*iter);
1451  }
1452 
1453  return bioseq;
1454 }
1455 
1456 char * CSeqDBVol::x_AllocType(size_t length,
1457  ESeqDBAllocType alloc_type) const
1458 {
1459  // Allocation using the atlas is not intended for the end user.
1460  // 16 bytes are added as insurance against potential off-by-one or
1461  // off-by-a-few errors.
1462 
1463  length += 16;
1464 
1465  char * retval = 0;
1466 
1467  switch(alloc_type) {
1468  case eMalloc:
1469  retval = (char*) malloc(length);
1470  break;
1471 
1472  case eNew:
1473  retval = new char[length];
1474  break;
1475 
1476  case eAtlas:
1477  default:
1478  retval = CSeqDBAtlas::Alloc(length + 16, false);
1479  }
1480 
1481  return retval;
1482 }
1483 
1485  char ** buffer,
1486  int nucl_code,
1487  ESeqDBAllocType alloc_type,
1488  SSeqDBSlice * region,
1489  CSeqDB::TSequenceRanges * masks) const
1490 {
 // Thin wrapper: forwards all arguments to x_GetAmbigSeq(), stores the
 // buffer it produced in `*buffer` and returns the length it reported.
 // NOTE(review): the first signature line is not visible in this listing.
1491  char * buf1 = 0;
1492  int baselen =
1493  x_GetAmbigSeq(oid, & buf1, nucl_code, alloc_type, region, masks);
1494 
1495  *buffer = buf1;
1496  return baselen;
1497 }
1498 
1499 static void s_SeqDBMaskSequence(char * seq,
1500  CSeqDB::TSequenceRanges * masks,
1501  char mask_letter,
1502  const SSeqDBSlice & range)
1503 {
1504  if (!masks || masks->empty()) return;
1505 
1506  // TODO This could be optimized with binary search
1507  unsigned int i(0);
1508  unsigned int begin(range.begin);
1509  unsigned int end(range.end);
1510 
1511  while (i < masks->size() && (*masks)[i].second <= begin) ++i;
1512 
1513  while (i < masks->size() && (*masks)[i].first < end) {
1514  for (size_t j = max((*masks)[i].first, begin);
1515  j < min((*masks)[i].second, end); ++j) {
1516  seq[j] = mask_letter;
1517  }
1518  ++i;
1519  }
1520 }
1521 
1523  char ** buffer,
1524  int nucl_code,
1525  ESeqDBAllocType alloc_type,
1526  CSeqDB::TSequenceRanges * partial_ranges,
1527  CSeqDB::TSequenceRanges * masks) const
1528 {
 // Allocates a buffer sized for the whole sequence of `oid` but converts
 // only the requested `partial_ranges` to the 8-bit nucleotide form; each
 // range is bracketed with FENCE_SENTRY bytes. Returns the full sequence
 // length. Throws CSeqDBException (eFileErr) for empty ranges, an
 // unreadable sequence, or a range beyond the sequence end.
 // NOTE(review): the first signature line is not visible in this listing.
1529 
1530  if ((partial_ranges == NULL) || (partial_ranges->size() == 0)) {
1531  NCBI_THROW(CSeqDBException, eFileErr, "Error: Empty partial fetching ranges.");
1532  }
1533 
1534  const char * tmp(0);
1535  int base_length = x_GetSequence(oid, &tmp);
1536  if (base_length < 1) {
1537  NCBI_THROW(CSeqDBException, eFileErr, "Error: could not get sequence or range.");
1538  }
1539 
1540 
 // NOTE(review): only the last range's end is validated; this presumably
 // relies on the ranges being sorted -- TODO confirm with callers.
1541  int num_ranges = static_cast<int>(partial_ranges->size());
1542  if ((*partial_ranges)[num_ranges - 1].second > base_length) {
1543  NCBI_THROW(CSeqDBException, eFileErr, "Error: region beyond sequence range.");
1544  }
1545 
 // With kSeqDBNuclBlastNA8 the buffer gets one extra byte at each end for
 // the sentinel values written at the bottom; `seq` skips the leading one.
1546  bool sentinel = (nucl_code == kSeqDBNuclBlastNA8);
1547  *buffer = x_AllocType(base_length + (sentinel ? 2 : 0), alloc_type);
1548  char *seq = *buffer + (sentinel ? 1 : 0);
1549 
1550  vector<Int4> ambchars;
1551  x_GetAmbChar(oid, ambchars);
 // First pass: write fence bytes just outside every range, before any
 // sequence data is mapped in the second pass.
1552  ITERATE(CSeqDB::TSequenceRanges, riter, *(partial_ranges)) {
1553  int begin(riter->first);
1554  int end(riter->second);
1555 
1556  if (begin) seq[begin - 1] = (char) FENCE_SENTRY;
1557  if (end < base_length) seq[end] = (char) FENCE_SENTRY;
1558  }
1559 
1560  //cerr << "Oid: " << oid << endl;
1561 
 // Second pass: expand, restore ambiguities, mask and (optionally) remap
 // each range, clamped to [0, base_length).
1562  ITERATE(CSeqDB::TSequenceRanges, riter, *(partial_ranges)) {
1563  SSeqDBSlice slice(max(0, (int)riter->first), min(base_length, (int)riter->second));
1564  //cerr << "Use range set: " << riter->first << "\t" << riter->second << "\t" << "length: " << base_length << endl;
1565  s_SeqDBMapNA2ToNA8(tmp, seq, slice);
1566  s_SeqDBRebuildDNA_NA8(seq, ambchars, slice);
1567  s_SeqDBMaskSequence(seq, masks, (char)14, slice);
1568  if (sentinel){
1569  s_SeqDBMapNcbiNA8ToBlastNA8(seq, slice);
1570  }
1571  }
1572 
1573  if (sentinel) {
1574  (*buffer)[0] = (char)15;
1575  (*buffer)[base_length+1] = (char)15;
1576  }
 // The caller's mask list is emptied once it has been applied.
1577  if (masks) {
1578  masks->clear();
1579  }
1580  return base_length;
1581 }
1582 
1583 
1584 
1585 
1586 /// List of offset ranges as begin/end pairs.
1588 
1590  char ** buffer,
1591  int nucl_code,
1592  ESeqDBAllocType alloc_type,
1593  SSeqDBSlice * region,
1594  CSeqDB::TSequenceRanges *masks) const
1595 {
 // Produces an unpacked copy of the sequence of `oid` (or of `region`, if
 // given) in a freshly allocated `*buffer` and returns the number of
 // residues/bases written. Protein data is copied as is; nucleotide data is
 // expanded to 8-bit form with ambiguities restored. `masks`, if given, is
 // applied and then cleared. Throws CSeqDBException (eFileErr) when the
 // region exceeds the sequence or the resulting length is < 1.
 // NOTE(review): the first signature line is not visible in this listing.
1596  // Note the use of the third argument of x_GetSequence() to manage
1597  // the lifetime of the acquired region. Specifying false for that
1598  // argument ties the lifetime to the CSeqDBSeqFile's memory lease.
 // NOTE(review): the x_GetSequence() call below passes only two arguments;
 // the comment above appears to describe an older signature.
1599 
1600  const char * tmp(0);
1601  int base_length = x_GetSequence(oid, &tmp);
1602 
1603  if (region && region->end > base_length )
1604  NCBI_THROW(CSeqDBException, eFileErr, "Error: region beyond sequence range.");
1605 
 // From here on base_length is the length of the selected range, not of
 // the whole sequence.
1606  SSeqDBSlice range = region ? (*region) : SSeqDBSlice(0, base_length);
1607  base_length = range.end - range.begin;
1608 
1609  if (base_length < 1)
1610  NCBI_THROW(CSeqDBException, eFileErr, "Error: could not get sequence or range.");
1611 
1612  if (m_Idx->GetSeqType() == 'p') {
1613 
1614  tmp += range.begin;
1615  *buffer = x_AllocType(base_length, alloc_type);
1616  memcpy(*buffer, tmp, base_length);
 // The pointer is biased by -range.begin so the masking helper can index
 // it with absolute sequence offsets.
1617  s_SeqDBMaskSequence(*buffer - range.begin, masks, (char)21, range);
1618 
1619  } else {
1620 
 // With kSeqDBNuclBlastNA8 one extra byte is reserved at each end for
 // sentinels; `seq` is biased so absolute offsets land inside the buffer.
1621  bool sentinel = (nucl_code == kSeqDBNuclBlastNA8);
1622  *buffer = x_AllocType(base_length + (sentinel ? 2 : 0), alloc_type);
1623  char *seq = *buffer - range.begin + (sentinel ? 1 : 0);
1624 
1625  // Get ambiguity characters.
1626 
1627  vector<Int4> ambchars;
1628  x_GetAmbChar(oid, ambchars);
1629 
1630  // Determine if we want to filter by offset ranges. This
1631  // is only done if:
1632  //
1633  // 1. No range is specified by the user.
1634  // 2. We have cached ranges.
1635  TRangeList range_set;
1636  bool use_range_set = true;
1637  {
 // The range cache is read under m_MtxCachedRange; the ranges are
 // copied into range_set so the guard can be released before use.
 // NOTE(review): one line of the condition below is not visible here.
1638  CFastMutexGuard mtx_guard(m_MtxCachedRange);
1639  TRangeCache::iterator rciter = m_RangeCache.find(oid);
1640  if (region
1641  || rciter == m_RangeCache.end()
1642  || rciter->second->GetRanges().empty()
1644  use_range_set = false;
1645  }
1646  else {
1647  range_set = rciter->second->GetRanges();
1648  }
1649  }
1650 
1651  if (!use_range_set) {
1652  s_SeqDBMapNA2ToNA8(tmp, seq, range);
1653  s_SeqDBRebuildDNA_NA8(seq, ambchars, range);
1654  s_SeqDBMaskSequence(seq, masks, (char)14, range);
1655  if (sentinel) s_SeqDBMapNcbiNA8ToBlastNA8(seq, range);
1656 
1657  } else {
1658 
1659  _ASSERT (!region);
1660  // Place 'fence' sentinel bytes around each range; this is done
1661  // before any of the range data is mapped so that the range data
1662  // is free to replace the sentinel bytes if needed; that would
1663  // only happen if range_set are adjacent or overlapping.
1664 
1665  ITERATE(TRangeVector, riter, range_set) {
1666  int begin(riter->first);
1667  int end(riter->second);
1668 
1669  if (begin) seq[begin - 1] = (char) FENCE_SENTRY;
1670  if (end < base_length) seq[end] = (char) FENCE_SENTRY;
1671  }
1672 
1673  ITERATE(TRangeVector, riter, range_set) {
1674 
 // Each cached range is clamped to the sequence before conversion.
1675  SSeqDBSlice slice(max(0, riter->first),
1676  min(range.end, riter->second));
1677 
1678  s_SeqDBMapNA2ToNA8(tmp, seq, slice);
1679  s_SeqDBRebuildDNA_NA8(seq, ambchars, slice);
1680  s_SeqDBMaskSequence(seq, masks, (char)14, slice);
1681  if (sentinel) s_SeqDBMapNcbiNA8ToBlastNA8(seq, slice);
1682  }
1683  }
1684 
1685  // Put back the sentinel at last
1686  if (sentinel) {
1687  (*buffer)[0] = (char)15;
1688  (*buffer)[base_length+1] = (char)15;
1689  }
1690  }
1691 
1692  // Clear the masks after consumption
1693  if (masks) masks->clear();
1694 
1695  return base_length;
1696 }
1697 
// Expands packed nucleotide data (`sequence`) together with its packed
// ambiguity words (`ambiguities`) into one byte per base, stored in `result`.
// `result` is emptied first and stays empty for a zero-length sequence.
// Throws CSeqDBException (eFileErr) when `sequence` is empty.
1698 void SeqDB_UnpackAmbiguities(const CTempString & sequence,
1699  const CTempString & ambiguities,
1700  string & result)
1701 {
1702  result.resize(0);
1703 
1704  // The code in this block is derived from GetBioseq() and
1705  // s_SeqDBWriteSeqDataNucl().
1706 
1707  // Get the length and the (probably mmapped) data.
1708 
1709  if (sequence.length() == 0) {
1710  NCBI_THROW(CSeqDBException, eFileErr,
1711  "Error: packed sequence data is not valid.");
1712  }
1713 
1714  const char * seq_buffer = sequence.data();
1715 
 // Four bases per full byte; the low two bits of the final byte give the
 // number of bases stored in that last byte.
1716  int whole_bytes = static_cast<int>(sequence.length()) - 1;
1717  int remainder = sequence[whole_bytes] & 3;
1718  int base_length = (whole_bytes * 4) + remainder;
1719 
1720  if (base_length == 0) {
1721  return;
1722  }
1723 
1724  // Get ambiguity characters.
1725 
1726  vector<Int4> ambchars;
1727  ambchars.reserve(ambiguities.length()/4);
1728 
 // NOTE(review): each step reads 4 bytes; this assumes
 // ambiguities.length() is a multiple of 4 -- TODO confirm.
1729  for(size_t i = 0; i < ambiguities.length(); i+=4) {
1730  Int4 A = SeqDB_GetStdOrd((int*) (ambiguities.data() + i));
1731  ambchars.push_back(A);
1732  }
1733 
1734  // Combine and translate to 4 bits-per-character encoding.
1735 
 // NOTE(review): the malloc() result is used without a NULL check.
1736  char * buffer_na8 = (char*) malloc(base_length);
1737 
 // The try/catch exists only to free the scratch buffer if a helper throws.
 // NOTE(review): the declaration of `range` is on a line not visible here.
1738  try {
1740 
1741  s_SeqDBMapNA2ToNA8(seq_buffer, buffer_na8, range);
1742 
1743  s_SeqDBRebuildDNA_NA8(buffer_na8, ambchars, range);
1744  }
1745  catch(...) {
1746  free(buffer_na8);
1747  throw;
1748  }
1749 
1750  result.assign(buffer_na8, base_length);
1751 
1752  free(buffer_na8);
1753 }
1754 
1755 
1757  const char ** buffer) const
1758 {
 // Points `*buffer` at the stored sequence data of `oid` and returns its
 // length in residues/bases. Returns -1 when `oid` is out of range, when no
 // data pointer could be obtained, or when the sequence type is neither
 // 'p' nor 'n' (`length` keeps its initial value).
 // NOTE(review): the first signature line is not visible in this listing.
1759  TIndx start_offset = 0;
1760  TIndx end_offset = 0;
1761 
1762  int length = -1;
1763 
1764  //m_Atlas.Lock(locked);
1766 
1767  if (oid >= m_Idx->GetNumOIDs()) return -1;
1768 
1769  m_Idx->GetSeqStartEnd(oid, start_offset, end_offset);
1770 
1771  char seqtype = m_Idx->GetSeqType();
1772 
1773  if ('p' == seqtype) {
1774  // Subtract one, for the inter-sequence null.
1775 
1776  end_offset --;
1777 
1778  length = int(end_offset - start_offset);
1779 
1780  // Although we subtracted one above to get the correct length,
1781  // we expand the range here by one byte in both directions.
1782  // The normal consumer of this data relies on them, and can
1783  // walk off memory if a sequence ends on a slice boundary.
 // NOTE(review): the null test below is done on the pointer after the
 // +1/-1 adjustments, i.e. on the value GetFileDataPtr() returned.
1784  *buffer = m_Seq->GetFileDataPtr(start_offset-1) + 1;
1785  if (! (*buffer - 1)) return -1;
1786 
1787  } else if ('n' == seqtype) {
1788  // The last byte is partially full; the last two bits of the
1789  // last byte store the number of nucleotides in the last byte
1790  // (0 to 3).
1791 
1792  // 'Hold' is used if we are going to fetch another kind of
1793  // data after this data, but before we are done actually using
1794  // this data. If can_release is true, we will return after
1795  // this. If keep is true, we don't need hold because keep
1796  // will already have preserved the region.
 // NOTE(review): `keep` and `can_release` are not parameters of the
 // visible signature; these remarks look historical.
1797 
1798 
1799  *buffer = m_Seq->GetFileDataPtr(start_offset);
1800 
1801  if (! (*buffer)) return -1;
1802 
1803  // If we are returning a hold on the sequence (keep), and the
1804  // caller does not need the lock after this (can_release) we
1805  // can let go of the lock (because the hold will prevent GC of
1806  // the underlying data). This will allow the following data
1807  // access to occur outside of the locked duration - lowering
1808  // contention in the nucleotide case.
1809 
1810  /* do not release the lock since we may be getting more...
1811  if (keep && can_release) {
1812  m_Atlas.Unlock(locked);
1813  }*/
1814 
1815  int whole_bytes = int(end_offset - start_offset - 1);
1816 
1817  char last_char = (*buffer)[whole_bytes];
1818 
1819  int remainder = last_char & 3;
1820  length = (whole_bytes * 4) + remainder;
1821  }
1822 
1823  return length;
1824 }
1825 
1826 list< CRef<CSeq_id> > CSeqDBVol::GetSeqIDs(int oid) const
1827 {
1828  list< CRef< CSeq_id > > seqids;
1829 
1830  CRef<CBlast_def_line_set> defline_set =
1831  x_GetFilteredHeader(oid, NULL);
1832 
1833  if ((! defline_set.Empty()) && defline_set->CanGet()) {
1834  ITERATE(list< CRef<CBlast_def_line> >, defline, defline_set->Get()) {
1835  if (! (*defline)->CanGetSeqid()) {
1836  continue;
1837  }
1838 
1839  ITERATE(list< CRef<CSeq_id> >, seqid, (*defline)->GetSeqid()) {
1840  seqids.push_back(*seqid);
1841  }
1842  }
1843  }
1844 
1845  return seqids;
1846 }
1847 
1848 list< CRef<CSeq_id> > CSeqDBVol::GetSeqIDs(int oid, CObjectIStreamAsnBinary *inpstr ) const
1849 {
1850  list< CRef< CSeq_id > > seqids;
1851 
1852  CRef<CBlast_def_line_set> defline_set =
1853  x_GetFilteredHeader(oid, NULL, inpstr);
1854 
1855  if ((! defline_set.Empty()) && defline_set->CanGet()) {
1856  ITERATE(list< CRef<CBlast_def_line> >, defline, defline_set->Get()) {
1857  if (! (*defline)->CanGetSeqid()) {
1858  continue;
1859  }
1860 
1861  ITERATE(list< CRef<CSeq_id> >, seqid, (*defline)->GetSeqid()) {
1862  seqids.push_back(*seqid);
1863  }
1864  }
1865  }
1866 
1867  return seqids;
1868 }
1869 
1871  CSeqDBLockHold & locked) const
1872 {
 // Looks the GI up through m_GiIndex when that index is available;
 // otherwise INVALID_GI is returned.
 // NOTE(review): the first signature line and one statement of this function
 // are not visible in this listing.
1874  if (!m_GiIndex.Empty()) {
1875  return m_GiIndex->GetSeqGI(oid, locked);
1876  }
1877  return INVALID_GI;
1878 }
1879 
1881 {
 // Delegates to m_Idx->GetVolumeLength() (signature not visible in this listing).
1882  return m_Idx->GetVolumeLength();
1883 }
1884 
1887  CSeqDBLockHold & locked) const
1888 {
 // Public entry point: returns x_GetFilteredHeader(oid, NULL); `locked` is
 // not used here. (First signature lines not visible in this listing.)
1889  return x_GetFilteredHeader(oid, NULL);
1890 }
1891 
1892 bool s_IncludeDefline_Taxid(const CBlast_def_line & def, const set<TTaxId> & user_tax_ids)
1893 {
1894  CBlast_def_line::TTaxIds tax_ids;
1895  if (def.IsSetTaxid()) {
1896  tax_ids.insert(def.GetTaxid());
1897  }
1898  if(def.IsSetLinks()) {
1899  CBlast_def_line::TLinks leaf_ids = def.GetLinks();
1900 #ifdef NCBI_STRICT_TAX_ID
1901  ITERATE(CBlast_def_line::TLinks, it, leaf_ids) tax_ids.insert(TAX_ID_FROM(int, *it));
1902 #else
1903  tax_ids.insert(leaf_ids.begin(), leaf_ids.end());
1904 #endif
1905  }
1906 
1907  if(user_tax_ids.size() > tax_ids.size()) {
1908  ITERATE(CBlast_def_line::TTaxIds, itr, tax_ids) {
1909  if(user_tax_ids.find(*itr) != user_tax_ids.end()) {
1910  return true;
1911  }
1912  }
1913 
1914  }
1915  else {
1916  ITERATE(set<TTaxId>, itr, user_tax_ids) {
1917  if(tax_ids.find(*itr) != tax_ids.end()) {
1918  return true;
1919  }
1920  }
1921  }
1922  return false;
1923 }
1924 
1926 {
 // Returns false as soon as one Seq-id of `def` is classified as a predicted
 // accession (CSeq_id::fAcc_predicted); otherwise returns true.
 // NOTE(review): the signature and the condition opening the inner block are
 // on lines not visible in this listing.
1927  ITERATE(list< CRef<CSeq_id> >, seqid, def.GetSeqid()) {
1929  if((*seqid)->IdentifyAccession() & CSeq_id::fAcc_predicted) {
1930  return false;
1931  }
1932  }
1933  }
1934  return true;
1935 }
1936 
1937 
1938 bool s_IncludeDefline_NegativeTaxid(const CBlast_def_line & def, const set<TTaxId> & user_tax_ids)
1939 {
1940  CBlast_def_line::TTaxIds taxid_set = def.GetTaxIds();
1941  if(taxid_set.size() > user_tax_ids.size()) {
1942  return true;
1943  }
1944  ITERATE(CBlast_def_line::TTaxIds, itr, taxid_set) {
1945  if(user_tax_ids.find(*itr) == user_tax_ids.end()) {
1946  return true;
1947  }
1948  }
1949  return false;
1950 }
1951 
1954  bool * changed) const
1955 {
 // Returns the defline set of `oid` after removing deflines rejected by the
 // active filters (membership bit, id lists, taxid lists, OID mask type).
 // `*changed`, when given, is set on a cache hit to the cached "modified"
 // flag. The cache is only consulted/updated when CThread::GetSelf() == 0.
 // NOTE(review): the first signature lines and the declaration of `BDLS` are
 // not visible in this listing.
1956  typedef list< CRef<CBlast_def_line> > TBDLL;
1957  typedef TBDLL::iterator TBDLLIter;
1958 
1959  //m_Atlas.Lock(locked);
1960 
 // NOTE(review): m_DeflineCache.Lookup() is called even when useCache is
 // false -- confirm this is safe from non-main threads.
1961  const bool useCache = (CThread::GetSelf() == 0 ? true : false);
1962  TDeflineCacheItem & cached = m_DeflineCache.Lookup(oid);
1963 
1964  if (useCache && cached.first.NotEmpty()) {
1965  if (changed) {
1966  *changed = cached.second;
1967  }
1968 
1969  return cached.first;
1970  }
1971 
1972  bool asn_changed = false;
1973 
1975  x_GetHdrAsn1(oid, true, & asn_changed);
1976 
1977  bool id_filter = x_HaveIdFilter();
1978 
1979  if (id_filter || m_MemBit || m_OidMaskType) {
1980  // Create the memberships mask (should this be fixed to allow
1981  // membership bits greater than 32?)
1982 
1983  TBDLL & dl = BDLS->Set();
1984 
 // The iterator is advanced inside the loop body because rejected
 // deflines are erased from the list while iterating.
1985  for(TBDLLIter iter = dl.begin(); iter != dl.end(); ) {
1986  const CBlast_def_line & defline = **iter;
1987 
1988  bool have_memb = true;
1989 
1990  if (m_MemBit) {
1991  have_memb =
1992  defline.CanGetMemberships() &&
1993  defline.IsSetMemberships() &&
1994  (! defline.GetMemberships().empty());
1995 
1996  if (have_memb) {
 // Only the first memberships word is tested against the bit.
1997  int bits = defline.GetMemberships().front();
1998  int memb_mask = 0x1 << (m_MemBit-1);
1999 
2000  if ((bits & memb_mask) == 0) {
2001  have_memb = false;
2002  }
2003  }
2004  }
2005 
2006  // Here we must pass both the user-gi and volume-gi test,
2007  // for each defline, but not necessarily for each Seq-id.
2008  if (have_memb) {
2009  if (id_filter && defline.CanGetSeqid()) {
2010  have_memb = false;
2011  bool have_user = false, have_volume = false;
2012  ITERATE(list< CRef<CSeq_id> >, seqid, defline.GetSeqid()) {
2013  x_FilterHasId(**seqid, have_user, have_volume);
2014  if (have_user && have_volume) break;
2015  }
2016  have_memb = have_user && have_volume;
2017  }
2018 
2019  if(have_memb && (!m_UserGiList.Empty()) && (m_UserGiList->GetNumTaxIds() > 0)) {
2020  have_memb = s_IncludeDefline_Taxid(defline, m_UserGiList->GetTaxIdsList());
2021  }
2022 
 // A defline rejected so far may still be accepted through the taxid
 // list of one of the volume GI lists.
 // NOTE(review): the loop header declaring `vtaxid` is not visible.
2023  if (!have_memb && !m_VolumeGiLists.empty()) {
2025  if( (*vtaxid)->GetNumTaxIds() > 0) {
2026  have_memb = s_IncludeDefline_Taxid(defline, (*vtaxid)->GetTaxIdsList());
2027  if(have_memb){
2028  break;
2029  }
2030  }
2031  }
2032  }
2033 
 // NOTE(review): the body of this negative-list check is not visible.
2034  if(have_memb && (!m_NegativeList.Empty()) && (m_NegativeList->GetNumTaxIds() > 0)) {
2036  }
2037 
2038  if(have_memb && (m_OidMaskType != 0)) {
2039  have_memb = s_IncludeDefline_MaskFilter(defline, m_OidMaskType);
2040  }
2041  }
2042 
2043  if (! have_memb) {
2044  TBDLLIter eraseme = iter++;
2045  dl.erase(eraseme);
2046  asn_changed = true;
2047  } else {
2048  iter++;
2049  }
2050  }
2051  }
2052 
 // NOTE(review): both branches below perform the same assignments.
 // NOTE(review): `*changed` is only written on the cache-hit path above.
2053  if (useCache)
2054  {
2055  if (asn_changed) {
2056  cached.first = BDLS;
2057  cached.second = asn_changed;
2058  } else {
2059  cached.first = BDLS;
2060  cached.second = asn_changed;
2061  }
2062  }
2063 
2064  return BDLS;
2065 }
2066 
2069  bool * changed,
2070  CObjectIStreamAsnBinary *inpstr ) const
2071 {
 // Same filtering as the overload without `inpstr`; the only visible
 // difference is that `inpstr` is passed on to x_GetHdrAsn1().
 // NOTE(review): the first signature lines and the declaration of `BDLS` are
 // not visible in this listing.
2072  typedef list< CRef<CBlast_def_line> > TBDLL;
2073  typedef TBDLL::iterator TBDLLIter;
2074 
2075  //m_Atlas.Lock(locked);
2076 
 // NOTE(review): m_DeflineCache.Lookup() is called even when useCache is
 // false -- confirm this is safe from non-main threads.
2077  const bool useCache = (CThread::GetSelf() == 0 ? true : false);
2078  TDeflineCacheItem & cached = m_DeflineCache.Lookup(oid);
2079 
2080  if (useCache && cached.first.NotEmpty()) {
2081  if (changed) {
2082  *changed = cached.second;
2083  }
2084 
2085  return cached.first;
2086  }
2087 
2088  bool asn_changed = false;
2089 
2091  x_GetHdrAsn1(oid, true, & asn_changed, inpstr);
2092 
2093  bool id_filter = x_HaveIdFilter();
2094 
2095  if (id_filter || m_MemBit || m_OidMaskType) {
2096  // Create the memberships mask (should this be fixed to allow
2097  // membership bits greater than 32?)
2098 
2099  TBDLL & dl = BDLS->Set();
2100 
 // The iterator is advanced inside the loop body because rejected
 // deflines are erased from the list while iterating.
2101  for(TBDLLIter iter = dl.begin(); iter != dl.end(); ) {
2102  const CBlast_def_line & defline = **iter;
2103 
2104  bool have_memb = true;
2105 
2106  if (m_MemBit) {
2107  have_memb =
2108  defline.CanGetMemberships() &&
2109  defline.IsSetMemberships() &&
2110  (! defline.GetMemberships().empty());
2111 
2112  if (have_memb) {
 // Only the first memberships word is tested against the bit.
2113  int bits = defline.GetMemberships().front();
2114  int memb_mask = 0x1 << (m_MemBit-1);
2115 
2116  if ((bits & memb_mask) == 0) {
2117  have_memb = false;
2118  }
2119  }
2120  }
2121 
2122  // Here we must pass both the user-gi and volume-gi test,
2123  // for each defline, but not necessarily for each Seq-id.
2124  if (have_memb) {
2125  if (id_filter && defline.CanGetSeqid()) {
2126  have_memb = false;
2127  bool have_user = false, have_volume = false;
2128  ITERATE(list< CRef<CSeq_id> >, seqid, defline.GetSeqid()) {
2129  x_FilterHasId(**seqid, have_user, have_volume);
2130  if (have_user && have_volume) break;
2131  }
2132  have_memb = have_user && have_volume;
2133  }
2134 
2135  if(have_memb && (!m_UserGiList.Empty()) && (m_UserGiList->GetNumTaxIds() > 0)) {
2136  have_memb = s_IncludeDefline_Taxid(defline, m_UserGiList->GetTaxIdsList());
2137  }
2138 
 // A defline rejected so far may still be accepted through the taxid
 // list of one of the volume GI lists.
 // NOTE(review): the loop header declaring `vtaxid` is not visible.
2139  if (!have_memb && !m_VolumeGiLists.empty()) {
2141  if( (*vtaxid)->GetNumTaxIds() > 0) {
2142  have_memb = s_IncludeDefline_Taxid(defline, (*vtaxid)->GetTaxIdsList());
2143  if(have_memb){
2144  break;
2145  }
2146  }
2147  }
2148  }
2149 
 // NOTE(review): the body of this negative-list check is not visible.
2150  if(have_memb && (!m_NegativeList.Empty()) && (m_NegativeList->GetNumTaxIds() > 0)) {
2152  }
2153 
2154  if(have_memb && (m_OidMaskType != 0)) {
2155  have_memb = s_IncludeDefline_MaskFilter(defline, m_OidMaskType);
2156  }
2157 
2158  }
2159 
2160  if (! have_memb) {
2161  TBDLLIter eraseme = iter++;
2162  dl.erase(eraseme);
2163  asn_changed = true;
2164  } else {
2165  iter++;
2166  }
2167  }
2168  }
2169 
 // NOTE(review): both branches below perform the same assignments.
 // NOTE(review): `*changed` is only written on the cache-hit path above.
2170  if (useCache)
2171  {
2172  if (asn_changed) {
2173  cached.first = BDLS;
2174  cached.second = asn_changed;
2175  } else {
2176  cached.first = BDLS;
2177  cached.second = asn_changed;
2178  }
2179  }
2180 
2181  return BDLS;
2182 }
2185  bool adjust_oids,
2186  bool * changed) const
2187 {
 // Deserializes the binary ASN.1 header of `oid` into a new defline set.
 // If the raw header is empty, `bdls` is returned as declared. With
 // `adjust_oids` and a non-zero m_VolStart, every "BL_ORD_ID" general id is
 // shifted by m_VolStart and `*changed` (if given) is set to true.
 // NOTE(review): the first signature line and the declaration of `bdls` are
 // not visible in this listing.
2189 
2190  CTempString raw_data = x_GetHdrAsn1Binary(oid);
2191 
2192  if (! raw_data.size()) {
2193  return bdls;
2194  }
2195 
2196  // Now create an ASN.1 object from the memory chunk provided here.
2197 
2198  CObjectIStreamAsnBinary inpstr(raw_data.data(), raw_data.size());
2199 
2200  bdls.Reset(new objects::CBlast_def_line_set);
2201 
2202  inpstr >> *bdls;
2203 
2204  if (adjust_oids && bdls.NotEmpty() && m_VolStart) {
2205  NON_CONST_ITERATE(list< CRef<CBlast_def_line> >, dl, bdls->Set()) {
2206  if (! (**dl).CanGetSeqid()) {
2207  continue;
2208  }
2209 
2210  NON_CONST_ITERATE(list< CRef<CSeq_id> >, id, (*dl)->SetSeqid()) {
2211  CSeq_id & seqid = **id;
2212 
2213  if (seqid.Which() == CSeq_id::e_General) {
2214  CDbtag & dbt = seqid.SetGeneral();
2215 
 // The stored tag is an OID relative to this volume; adding
 // m_VolStart rewrites it in place.
2216  if (dbt.GetDb() == "BL_ORD_ID") {
2217  int vol_oid = dbt.GetTag().GetId();
2218  dbt.SetTag().SetId(m_VolStart + vol_oid);
2219  if (changed) {
2220  *changed = true;
2221  }
2222  }
2223  }
2224  }
2225  }
2226  }
2227 
2228  return bdls;
2229 }
2230 
2233  bool adjust_oids,
2234  bool * changed,
2235  CObjectIStreamAsnBinary *inpstr ) const
2236 {
 // Same as the overload without `inpstr`, except that the caller-supplied
 // stream object is closed and reopened on the raw header bytes instead of
 // constructing a new stream for every call.
 // NOTE(review): `inpstr` is dereferenced without a NULL check.
 // NOTE(review): the first signature line and the declaration of `bdls` are
 // not visible in this listing.
2238 
2239  CTempString raw_data = x_GetHdrAsn1Binary(oid);
2240 
2241  if (! raw_data.size()) {
2242  return bdls;
2243  }
2244 
2245  /*
2246  * FixNonPrint(how);
2247  * ResetThisState();
2248  * OpenFromBuffer(buffer, size);
2249  */
2250  inpstr->Close(); // instead of ResetThisState
2251  inpstr->OpenFromBuffer( raw_data.data(), raw_data.size());
2252 
2253  bdls.Reset(new objects::CBlast_def_line_set);
2254 
2255  (*inpstr) >> *bdls; // NEW METHOD
2256 
2257  if (adjust_oids && bdls.NotEmpty() && m_VolStart) {
2258  NON_CONST_ITERATE(list< CRef<CBlast_def_line> >, dl, bdls->Set()) {
2259  if (! (**dl).CanGetSeqid()) {
2260  continue;
2261  }
2262 
2263  NON_CONST_ITERATE(list< CRef<CSeq_id> >, id, (*dl)->SetSeqid()) {
2264  CSeq_id & seqid = **id;
2265 
2266  if (seqid.Which() == CSeq_id::e_General) {
2267  CDbtag & dbt = seqid.SetGeneral();
2268 
 // The stored tag is an OID relative to this volume; adding
 // m_VolStart rewrites it in place.
2269  if (dbt.GetDb() == "BL_ORD_ID") {
2270  int vol_oid = dbt.GetTag().GetId();
2271  dbt.SetTag().SetId(m_VolStart + vol_oid);
2272  if (changed) {
2273  *changed = true;
2274  }
2275  }
2276  }
2277  }
2278  }
2279  }
2280 
2281  return bdls;
2282 }
2283 
2286 {
 // Returns a non-owning CTempString over the header bytes of `oid`: the
 // start/end offsets come from m_Idx and the data pointer from m_Hdr.
 // NOTE(review): the signature and one statement are on lines not visible in
 // this listing; the lifetime of the returned view presumably follows the
 // header file mapping -- TODO confirm.
2287  TIndx hdr_start = 0;
2288  TIndx hdr_end = 0;
2289 
2290  //m_Atlas.Lock(locked);
2291 
2293 
2294  m_Idx->GetHdrStartEnd(oid, hdr_start, hdr_end);
2295  const char * asn_region = m_Hdr->GetFileDataPtr(hdr_start);
2296 
2297  return CTempString(asn_region, hdr_end - hdr_start);
2298 }
2299 
// Fills `hdr_data` with the binary ASN.1 header of `oid` after filtering.
// NOTE(review): the line carrying the function name and the declaration of
// `dls` are not visible in this listing.
2300 void
2302  vector<char> & hdr_data ) const
2303 {
2304  // This method's client is GetBioseq() and related methods. That
2305  // code needs filtered ASN.1 headers; eliminating the fetching of
2306  // filtered data here is not necessary (the cache will hit.)
2307 
2308  // If the data changed after deserialization, we need to serialize
2309  // the modified version. If not, we can just copy the binary data
2310  // from disk.
2311 
2312  bool changed = false;
2313 
2315  x_GetFilteredHeader(oid, & changed);
2316 
2317  if (changed) {
2318  CNcbiOstrstream asndata;
2319 
 // The inner scope destroys the object stream before the string is
 // extracted from `asndata`.
2320  {{
2321  CObjectOStreamAsnBinary outpstr(asndata);
2322  outpstr << *dls;
2323  }}
2324  string s = CNcbiOstrstreamToString(asndata);
2325  hdr_data.assign(s.data(), s.data() + s.size());
2326  } else {
2327  CTempString raw = x_GetHdrAsn1Binary(oid);
2328  hdr_data.assign(raw.data(), raw.data() + raw.size());
2329  }
2330 }
2331 
2333  vector<Int4> & ambchars) const
2334 {
 // Loads the ambiguity words of `oid` into `ambchars`, converting each
 // 4-byte word with SeqDB_GetStdOrd(). `ambchars` is cleared when the OID
 // has no ambiguity data. Throws CSeqDBException (eFileErr) when the
 // offsets cannot be obtained from the index.
 // NOTE(review): the first signature line is not visible in this listing.
2335  TIndx start_offset = 0;
2336  TIndx end_offset = 0;
2337 
2338  //m_Atlas.Lock(locked);
2339 
2340  bool ok = m_Idx->GetAmbStartEnd(oid, start_offset, end_offset);
2341 
2342  if (! ok) {
2343  NCBI_THROW(CSeqDBException, eFileErr,
2344  "File error: could not get ambiguity data.");
2345  }
2346 
2347  int length = int(end_offset - start_offset);
2348 
2349  if (length) {
2350  int total = length / 4;
2351 
2352  // 'hold' should be false here because we only need the data
2353  // for the duration of this function.
 // NOTE(review): no `hold` argument exists in the call below; the remark
 // above looks historical. The returned pointer is not NULL-checked.
2354  Int4 * buffer =
2355  (Int4*) m_Seq->GetFileDataPtr(start_offset);
2356 
2357  // This is probably unnecessary
2358  total &= 0x7FFFFFFF;
2359 
2360  ambchars.resize(total);
2361 
2362  for(int i = 0; i<total; i++) {
2363  ambchars[i] = SeqDB_GetStdOrd((const int *)(& buffer[i]));
2364  }
2365  } else {
2366  ambchars.clear();
2367  }
2368 }
2369 
2371 {
 // Delegates to m_Idx->GetNumOIDs() (signature not visible in this listing).
2372  return m_Idx->GetNumOIDs();
2373 }
2374 
2375 string CSeqDBVol::GetTitle() const
2376 {
2377  return m_Idx->GetTitle();
2378 }
2379 
2380 string CSeqDBVol::GetDate() const
2381 {
2382  return m_Idx->GetDate();
2383 }
2384 
2386 {
 // Delegates to m_Idx->GetMaxLength() (signature not visible in this listing).
2387  return m_Idx->GetMaxLength();
2388 }
2389 
2391 {
 // Delegates to m_Idx->GetMinLength() (signature not visible in this listing).
2392  return m_Idx->GetMinLength();
2393 }
2394 
2395 bool CSeqDBVol::PigToOid(int pig, int & oid) const
2396 {
2397  bool rv = false;
2398  x_OpenPigFile();
2399  if (m_IsamPig.NotEmpty()) {
2400  rv = m_IsamPig->PigToOid(pig, oid);
2401  x_UnleasePigFile();
2402  }
2403  return rv;
2404 }
2405 
2406 bool CSeqDBVol::GetPig(int oid, int & pig, CSeqDBLockHold & locked) const
2407 {
2408  pig = -1;
2409 
2410  CRef<CBlast_def_line_set> BDLS = x_GetHdrAsn1(oid, false, NULL);
2411 
2412  if (BDLS.Empty() || (! BDLS->IsSet())) {
2413  return false;
2414  }
2415 
2416  typedef list< CRef< CBlast_def_line > >::const_iterator TI1;
2417  typedef list< int >::const_iterator TI2;
2418 
2419  TI1 it1 = BDLS->Get().begin();
2420 
2421  for(; it1 != BDLS->Get().end(); it1++) {
2422  if ((*it1)->IsSetOther_info()) {
2423  TI2 it2 = (*it1)->GetOther_info().begin();
2424  TI2 it2end = (*it1)->GetOther_info().end();
2425 
2426  for(; it2 != it2end; it2++) {
2427  if (*it2 != -1) {
2428  pig = *it2;
2429  return true;
2430  }
2431  }
2432  }
2433  }
2434 
2435  return false;
2436 }
2437 
2439  int & oid,
2440  CSeqDBLockHold & locked) const
2441 {
2442  // Note: this is the (Int8 to int) interface layer; code below
2443  // this point (in the call stack) uses int; code above this level,
2444  // up to the user level, uses Int8.
2445 
2446  x_OpenTiFile();
2447  if (m_IsamTi.Empty()) {
2448  // If the "nti/ntd" files become ubiquitous, this could be
2449  // removed. For now, I will look up trace IDs in the string
2450  // DB if the database in question does not have the Trace ID
2451  // ISAM files. (The following could be made more efficient.)
2452 
2453  CSeq_id seqid(string("gnl|ti|") + NStr::Int8ToString(ti));
2454  vector<int> oids;
2455 
2456  SeqidToOids(seqid, oids, locked);
2457 
2458  if (oids.size()) {
2459  oid = oids[0];
2460  }
2461  return ! oids.empty();
2462  }
2463  else {
2464  bool rv = false;
2465  rv = m_IsamTi->IdToOid(ti, oid);
2466  x_UnleaseTiFile();
2467  return rv;
2468  }
2469 }
2470 
2471 bool CSeqDBVol::GiToOid(TGi gi, int & oid, CSeqDBLockHold & locked) const
2472 {
2473  bool rv = false;
2474  x_OpenGiFile();
2475  if (m_IsamGi.Empty()) {
2476  return false;
2477  }
2478  rv = m_IsamGi->IdToOid(GI_TO(Int8, gi), oid);
2479  x_UnleaseGiFile();
2480  return rv;
2481 }
2482 
2484  CSeqDBLockHold & locked) const
2485 {
2486  if (ids.GetNumGis()) {
2487  x_OpenGiFile();
2488  if (m_IsamGi.NotEmpty()) {
2490  x_UnleaseGiFile();
2491  } else {
2493  eArgErr,
2494  "GI list specified but no ISAM file found for GI in " + m_VolName);
2495  }
2496  }
2497 
2498  if (ids.GetNumTis()) {
2499  x_OpenTiFile();
2500  if (m_IsamTi.NotEmpty()) {
2502  x_UnleaseTiFile();
2503  } else {
2505  eArgErr,
2506  "TI list specified but no ISAM file found for TI in " + m_VolName);
2507  }
2508  }
2509 
2510  if (ids.GetNumPigs()) {
2511  x_OpenPigFile();
2512  if (m_IsamPig.NotEmpty()) {
2514  x_UnleasePigFile();
2515  } else {
2517  eArgErr,
2518  "IPG list specified but no ISAM file found for IPG in " + m_VolName);
2519  }
2520  }
2521 
2522  if (ids.GetNumSis() && (GetLMDBFileName() == kEmptyStr)) {
2523  x_OpenStrFile();
2524  if (m_IsamStr.NotEmpty()) {
2526  x_UnleaseStrFile();
2527  } else {
2529  eArgErr,
2530  "SI list specified but no ISAM file found for SI in " + m_VolName);
2531  }
2532  }
2533 }
2534 
2536  CSeqDBLockHold & locked) const
2537 {
2538  // Numeric translation is done in batch mode.
2539 
2540  if (ids.GetNumGis()) {
2541  x_OpenGiFile();
2542  if (m_IsamGi.NotEmpty()) {
2544  x_UnleaseGiFile();
2545  } else {
2547  eArgErr,
2548  "GI list specified but no ISAM file found for GI in " + m_VolName);
2549  }
2550  }
2551 
2552  if (ids.GetNumTis()) {
2553  x_OpenTiFile();
2554  if (m_IsamTi.NotEmpty()) {
2556  x_UnleaseTiFile();
2557  } else {
2559  eArgErr,
2560  "TI list specified but no ISAM file found for TI in " + m_VolName);
2561  }
2562  }
2563 
2564  if (ids.GetNumSis()) {
2565  x_OpenStrFile();
2566  if (m_IsamStr.NotEmpty()) {
2568  x_UnleaseStrFile();
2569  } else {
2571  eArgErr,
2572  "SI list specified but no ISAM file found for SI in " + m_VolName);
2573  }
2574  }
2575 }
2576 
2577 bool CSeqDBVol::GetGi(int oid,
2578  TGi & gi,
2579  CSeqDBLockHold & locked) const
2580 {
2581  gi = INVALID_GI;
2582 
2583  if (!CSeqDBIsam::IndexExists(m_VolName, (m_IsAA?'p':'n'), 'n')) {
2584  return false;
2585  }
2586 
2588  x_GetFilteredHeader(oid, NULL);
2589 
2590  if (BDLS.Empty() || (! BDLS->IsSet())) {
2591  return false;
2592  }
2593 
2594  typedef list< CRef< CBlast_def_line > >::const_iterator TI1;
2595  typedef list< CRef< CSeq_id > >::const_iterator TI2;
2596 
2597  TI1 it1 = BDLS->Get().begin();
2598 
2599  // Iterate over all blast def lines in the set
2600 
2601  for(; it1 != BDLS->Get().end(); it1++) {
2602  if ((*it1)->CanGetSeqid()) {
2603  TI2 it2 = (*it1)->GetSeqid().begin();
2604  TI2 it2end = (*it1)->GetSeqid().end();
2605 
2606  // Iterate within each defline
2607 
2608  for(; it2 != it2end; it2++) {
2609  if ((*it2)->IsGi()) {
2610  gi = (*it2)->GetGi();
2611  return true;
2612  }
2613  }
2614  }
2615  }
2616 
2617  return false;
2618 }
2619 
2620 void CSeqDBVol::x_StringToOids(const string & acc,
2621  ESeqDBIdType id_type,
2622  Int8 ident,
2623  const string & str_id,
2624  bool simpler,
2625  vector<int> & oids) const
2626 {
2627  bool vcheck (false);
2628  bool fits_in_four = (ident == -1) || ! (ident >> 32);
2629  bool needs_four = true;
2630 
2631  switch(id_type) {
2632  case eStringId:
2633  x_OpenStrFile();
2634  if (! m_IsamStr.Empty()) {
2635  // Not simplified
2636  vcheck = true;
2637  m_IsamStr->StringToOids(str_id, oids, simpler, vcheck);
2638  x_UnleaseStrFile();
2639  }
2640  break;
2641 
2642  case ePigId:
2643  // Converted to PIG type.
2644  x_OpenPigFile();
2645  if (! m_IsamPig.Empty()) {
2646  int oid(-1);
2647 
2648  if (m_IsamPig->PigToOid((int) ident, oid)) {
2649  oids.push_back(oid);
2650  }
2651  x_UnleasePigFile();
2652  }
2653  break;
2654 
2655  case eGiId:
2656  // Converted to GI type.
2657  x_OpenGiFile();
2658  if (! m_IsamGi.Empty()) {
2659  int oid(-1);
2660 
2661  if (m_IsamGi->IdToOid(ident, oid)) {
2662  oids.push_back(oid);
2663  }
2664  x_UnleaseGiFile();
2665  }
2666  break;
2667 
2668  case eTiId:
2669  // Converted to TI type.
2670  x_OpenTiFile();
2671  if (! m_IsamTi.Empty()) {
2672  int oid(-1);
2673 
2674  if (m_IsamTi->IdToOid(ident, oid)) {
2675  oids.push_back(oid);
2676  }
2677  x_UnleaseTiFile();
2678  }
2679  else {
2680  x_OpenStrFile();
2681  if (m_IsamStr.NotEmpty()) {
2682  // Not every database with TIs has a TI index, so fall
2683  // back to a string comparison if the first attempt fails.
2684  //
2685  // 1. TI's don't have versions.
2686  // 2. Specify "adjusted" as true, because lookup of
2687  // "gb|.." and similar tricks are not needed for TIs.
2688  m_IsamStr->StringToOids(acc, oids, true, vcheck);
2689  x_UnleaseStrFile();
2690  }
2691  }
2692  break;
2693 
2694  case eOID:
2695  // Converted to OID directly.
2696  oids.push_back((int) ident);
2697  break;
2698 
2699  case eHashId:
2700  _ASSERT(0);
2702  eArgErr,
2703  "Internal error: hashes are not Seq-ids.");
2704  }
2705 
2706  if ((! fits_in_four) && needs_four) {
2708  eArgErr,
2709  "ID overflows range of specified type.");
2710  }
2711 
2712  if (vcheck) {
2713  x_CheckVersions(acc, oids);
2714  }
2715 }
2716 
2717 void CSeqDBVol::x_CheckVersions(const string & acc,
2718  vector<int> & oids) const
2719 {
2720  // If we resolved a string id by stripping the version off of the
2721  // string, we need to check (for each OID) if the real ID had a
2722  // matching version.
2723 
2724  // This condition happens in two cases: where the database has a
2725  // different version of the same ID, and in the case of sparse
2726  // databases (which do not store the version). In the latter
2727  // case, we may get a list of OIDs where some of the OIDs pass
2728  // this test, and others fail.
2729 
2730  size_t pos = acc.find(".");
2731  _ASSERT(pos != acc.npos);
2732 
2733  string ver_str(acc, pos+1, acc.size()-(pos+1));
2734  int vernum = NStr::StringToInt(ver_str,
2737 
2738  string nover(acc, 0, pos);
2739 
2740  size_t pos2(0);
2741  while((pos2 = nover.find("|")) != nover.npos) {
2742  nover.erase(0, pos2+1);
2743  }
2744 
2745  NON_CONST_ITERATE(vector<int>, iter, oids) {
2746  list< CRef<CSeq_id> > ids =
2747  GetSeqIDs(*iter);
2748 
2749  bool found = false;
2750 
2751  ITERATE(list< CRef<CSeq_id> >, seqid, ids) {
2752  const CTextseq_id * id = (*seqid)->GetTextseq_Id();
2753 
2754  if (id &&
2755  id->CanGetAccession() &&
2756  id->GetAccession() == nover &&
2757  id->CanGetVersion() &&
2758  id->GetVersion() == vernum) {
2759 
2760  found = true;
2761  break;
2762  }
2763  }
2764 
2765  if (! found) {
2766  *iter = -1;
2767  }
2768  }
2769 
2770  oids.erase(remove(oids.begin(), oids.end(), -1), oids.end());
2771 }
2772 
2773 void CSeqDBVol::AccessionToOids(const string & acc,
2774  vector<int> & oids,
2775  CSeqDBLockHold & locked) const
2776 {
2777  Int8 ident (-1);
2778  string str_id;
2779  bool simpler (false);
2780 
2781  ESeqDBIdType id_type = SeqDB_SimplifyAccession(acc, ident, str_id, simpler);
2782 
2783  x_StringToOids(acc, id_type, ident, str_id, simpler, oids);
2784 
2785 }
2786 
2788  vector<int> & oids,
2789  CSeqDBLockHold & locked) const
2790 {
2791  Int8 ident (-1);
2792  string str_id;
2793  bool simpler (false);
2794 
2795  ESeqDBIdType id_type = SeqDB_SimplifySeqid(seqid, 0, ident, str_id, simpler);
2796 
2797  x_StringToOids(seqid.AsFastaString(), id_type, ident, str_id, simpler, oids);
2798 
2799 }
2800 
2802 {
2803  m_Idx->UnLease();
2804 
2805  if (m_Seq.NotEmpty()) {
2806  m_Seq->UnLease();
2807  }
2808  if (m_Hdr.NotEmpty()) {
2809  m_Hdr->UnLease();
2810  }
2811  if (m_IsamPig.NotEmpty()) {
2812  m_IsamPig->UnLease();
2813  }
2814  if (m_IsamGi.NotEmpty()) {
2815  m_IsamGi->UnLease();
2816  }
2817  if (m_IsamStr.NotEmpty()) {
2818  m_IsamStr->UnLease();
2819  }
2820 }
2821 
2822 int CSeqDBVol::GetOidAtOffset(int first_seq,
2823  Uint8 residue,
2824  CSeqDBLockHold & locked) const
2825 {
2826  // This method compensates for representation in two ways.
2827  //
2828  // 1. For protein, we subtract the oid to compensate for
2829  // inter-sequence nulls.
2830  //
2831  // 2. For nucleotide, the input value is 0..(num residues). We
2832  // scale this value to the length of the byte data.
2833 
2834  int vol_cnt = GetNumOIDs();
2835  Uint8 vol_len = GetVolumeLength();
2836 
2837  if (first_seq >= vol_cnt) {
2839  eArgErr,
2840  "OID not in valid range.");
2841  }
2842 
2843  if (residue >= vol_len) {
2845  eArgErr,
2846  "Residue offset not in valid range.");
2847  }
2848 
2849  if ('n' == m_Idx->GetSeqType()) {
2850  // Input range is from 0 .. total_length
2851  // Require range from 0 .. byte_length
2852 
2853  Uint8 end_of_bytes = x_GetSeqResidueOffset(vol_cnt);
2854 
2855  double dresidue = (double(residue) * end_of_bytes) / vol_len;
2856 
2857  if (dresidue < 0) {
2858  residue = 0;
2859  } else {
2860  residue = Uint8(dresidue);
2861 
2862  if (residue > (end_of_bytes-1)) {
2863  residue = end_of_bytes - 1;
2864  }
2865  }
2866  }
2867 
2868  // First seq limit handled right here.
2869  // oid_end refers to first disincluded oid.
2870 
2871  int oid_beg = first_seq;
2872  int oid_end = vol_cnt-1;
2873 
2874  // Residue limit we need to search for.
2875 
2876  int oid_mid = (oid_beg + oid_end)/2;
2877 
2878  while(oid_beg < oid_end) {
2879  Uint8 offset = x_GetSeqResidueOffset(oid_mid);
2880 
2881  if ('p' == m_Idx->GetSeqType()) {
2882  offset -= oid_mid;
2883  }
2884 
2885  if (offset >= residue) {
2886  oid_end = oid_mid;
2887  } else {
2888  oid_beg = oid_mid + 1;
2889  }
2890 
2891  oid_mid = (oid_beg + oid_end)/2;
2892  }
2893 
2894  return oid_mid;
2895 }
2896 
2898 {
2899  //m_Atlas.Lock(locked);
2900 
2901  TIndx start_offset = 0;
2902  m_Idx->GetSeqStart(oid, start_offset);
2903  return start_offset;
2904 }
2905 
2908  TSeqPos begin,
2909  TSeqPos end,
2910  CSeqDBLockHold & locked) const
2911 {
2912  // This design was part of the BlastDbDataLoader code.
2913 
2914  //m_Atlas.Lock(locked);
2915 
2917 
2918  CRef<CSeq_data> seq_data(new CSeq_data);
2919 
2920  if (m_IsAA) {
2921  const char * buffer(0);
2922  TSeqPos length(0);
2923 
2924  length = x_GetSequence(oid, & buffer);
2925 
2926  if ((begin >= end) || (end > length)) {
2928  eArgErr,
2929  "Begin and end offsets are not valid.");
2930  }
2931 
2932  seq_data->SetNcbistdaa().Set().assign(buffer + begin, buffer + end);
2933  } else {
2934  // This code builds an array and packs the output in 4 bit
2935  // format for NA. No attempt is made to find an optimal
2936  // packing for the data.
2937 
2938  int nucl_code(kSeqDBNuclNcbiNA8);
2939 
2940  SSeqDBSlice slice(begin, end);
2941 
2942  char * buffer(0);
2943  TSeqPos length(0);
2944 
2945  length = x_GetAmbigSeq(oid,
2946  & buffer,
2947  nucl_code,
2948  eNew,
2949  & slice,
2950  NULL);
2951 
2952  // validity of begin, end, and length has already been checked by
2953  // overloaded x_GetSequence()
2954  // note: length has been redefined to be end - begin
2955 
2956  vector<char> v4;
2957  v4.reserve((length+1)/2);
2958 
2959  // (this is an attempt to stop a warning message.)
2960  TSeqPos length_whole = TSeqPos(length & (TSeqPos(0)-TSeqPos(2)));
2961 
2962  for(TSeqPos i = 0; i < length_whole; i += 2) {
2963  v4.push_back((buffer[i] << 4) | buffer[i+1]);
2964  }
2965 
2966  if (length_whole != length) {
2967  _ASSERT((length_whole) == (length-1));
2968  v4.push_back(buffer[length_whole] << 4);
2969  }
2970 
2971  seq_data->SetNcbi4na().Set().swap(v4);
2972  delete [] buffer;
2973  }
2974 
2975  return seq_data;
2976 }
2977 
2978 void
2980  const char ** buffer,
2981  int * seq_length,
2982  int * amb_length ) const
2983 {
2984  if (seq_length)
2985  *seq_length = 0;
2986 
2987  if (amb_length)
2988  *amb_length = 0;
2989 
2990  if (buffer)
2991  *buffer = 0;
2992 
2993  TIndx start_S = 0;
2994  TIndx end_S = 0;
2995  TIndx start_A = 0;
2996  TIndx end_A = 0;
2997  TIndx map_begin = 0;
2998  TIndx map_end = 0;
2999 
3000  //m_Atlas.Lock(locked);
3002 
3003  m_Idx->GetSeqStartEnd(oid, start_S, end_S);
3004  bool amb_ok = true;
3005 
3006  if (m_IsAA) {
3007  // No ambiguities in protein dbs, but there is a NUL between
3008  // sequences, so we subtract one to remove that.
3009 
3010  end_A = start_A = --end_S;
3011 
3012  _ASSERT(start_S > 0);
3013 
3014  map_begin = start_S - 1;
3015  map_end = end_A + 1;
3016  } else {
3017  amb_ok = m_Idx->GetAmbStartEnd(oid, start_A, end_A);
3018 
3019  map_begin = start_S;
3020  map_end = end_A;
3021  }
3022 
3023  int s_len = int(end_S - start_S);
3024  int a_len = int(end_A - start_A);
3025 
3026  if (! (s_len && amb_ok)) {
3027  NCBI_THROW(CSeqDBException, eFileErr,
3028  "File error: could not get sequence data.");
3029  }
3030 
3031  if (amb_length) {
3032  *amb_length = a_len;
3033  }
3034 
3035  if (seq_length) {
3036  *seq_length = s_len;
3037  }
3038 
3039  if (buffer) {
3040  *buffer = m_Seq->GetFileDataPtr(map_begin);
3041  *buffer += (start_S - map_begin);
3042  }
3043 
3044  if (buffer && *buffer) {
3045  if (! *seq_length) {
3047  eArgErr,
3048  "Could not get sequence data.");
3049  }
3050  } else {
3051  if (((buffer && *buffer) || a_len) && (! *seq_length)) {
3053  }
3054  }
3055 }
3056 
3057 template<class T>
3058 static void
3060 {
3061  if (id >= (static_cast<T>(1) << 32)) {
3063  eArgErr,
3064  "ID overflows range of specified type.");
3065  }
3066 }
3067 
3069  TGi & high_id,
3070  int & count,
3071  CSeqDBLockHold & locked) const
3072 {
3073  //m_Atlas.Lock(locked);
3074  x_OpenGiFile();
3075  low_id = ZERO_GI;
3076  high_id = ZERO_GI;
3077  count = 0;
3078 
3079  if (m_IsamGi.NotEmpty()) {
3080  Int8 L(0), H(0);
3081 
3082  m_IsamGi->GetIdBounds(L, H, count);
3083 
3084  low_id = GI_FROM(Int8, L);
3085  high_id = GI_FROM(Int8, H);
3086 
3087  s_SeqDBFitsInFour(L);
3089  x_UnleaseGiFile();
3090  }
3091 }
3092 
3093 void CSeqDBVol::GetPigBounds(int & low_id,
3094  int & high_id,
3095  int & count,
3096  CSeqDBLockHold & locked) const
3097 {
3098  //m_Atlas.Lock(locked);
3099  x_OpenPigFile();
3100  low_id = high_id = count = 0;
3101 
3102  if (m_IsamPig.NotEmpty()) {
3103  Int8 L(0), H(0);
3104 
3105  m_IsamPig->GetIdBounds(L, H, count);
3106 
3107  low_id = (int) L;
3108  high_id = (int) H;
3109 
3110  s_SeqDBFitsInFour(L);
3112  x_UnleasePigFile();
3113  }
3114 }
3115 
3116 void CSeqDBVol::GetStringBounds(string & low_id,
3117  string & high_id,
3118  int & count) const
3119 {
3120  x_OpenStrFile();
3121  count = 0;
3122  low_id.erase();
3123  high_id.erase();
3124 
3125  if (m_IsamStr.NotEmpty()) {
3126  m_IsamStr->GetIdBounds(low_id, high_id, count);
3127  x_UnleaseStrFile();
3128  }
3129 }
3130 
3132  const TRangeList & offset_ranges,
3133  bool append_ranges,
3134  bool cache_data) const
3135 {
3136 
3137  CFastMutexGuard mtx_gurad(m_MtxCachedRange);
3138  if (offset_ranges.empty() && (! cache_data) && (! append_ranges)) {
3139  // Specifying no-cache plus an empty offset range list, means
3140  // that we are clearing out this sequence. In this case, just
3141  // free the relevant element and be done.
3142 
3143  m_RangeCache.erase(oid);
3144  return;
3145  }
3146 
3147  // This adds the range cache object to the map.
3148 
3150 
3151  if (R.Empty() || R->GetRanges().empty()) {
3152  // In this case, we are disabling caching, and no ranges
3153  // exist. There is nothing to do, and no need to keep the
3154  // element around, so once again we erase + exit.
3155 
3156  if (offset_ranges.empty() && (! cache_data)) {
3157  m_RangeCache.erase(oid);
3158  return;
3159  }
3160 
3161  if (R.Empty()) {
3162  R.Reset(new CSeqDBRangeList());
3163  }
3164  }
3165 
3166  // We should flush the sequence if:
3167  //
3168  // 1. We are not keeping the old ranges (1).
3169  // 2. There are new ranges to add (2).
3170  // 3. We are clearing the 'cache data' flag.
3171 
3172  bool flush_sequence = ((! append_ranges) || // (1)
3173  (! offset_ranges.empty()) || // (2)
3174  (! cache_data)); // (3)
3175 
3176  if (flush_sequence) {
3177  R->FlushSequence();
3178  }
3179 
3180  R->SetRanges(offset_ranges, append_ranges, cache_data);
3181 }
3182 
3184 {
3185  CFastMutexGuard mtx_gurad(m_MtxCachedRange);
3186  m_RangeCache.clear();
3187 }
3188 
3189 void CSeqDBRangeList::SetRanges(const TRangeList & offset_ranges,
3190  bool append_ranges,
3191  bool cache_data)
3192 {
3193  if (append_ranges) {
3194  m_Ranges.insert(offset_ranges.begin(), offset_ranges.end());
3195  } else {
3196  m_Ranges = offset_ranges;
3197  }
3198 
3199  // Note that actual caching is not currently done.
3200  m_CacheData = cache_data;
3201 }
3202 
3204 {
3205  if (m_UserGiList.Empty() ||
3206  m_VolumeGiLists.empty() ||
3207  m_UserGiList->GetNumSis() ||
3208  m_UserGiList->GetNumTis()) {
3209 
3210  return;
3211  }
3212 
3214  if ((**gilist).GetNumSis() != 0)
3215  return;
3216 
3217  if ((**gilist).GetNumTis() != 0)
3218  return;
3219  }
3220 
3221  // If we have volume GI lists, and a user gi list, and neither of
3222  // these uses Seq-ids, then we can detach the user gi list (from
3223  // the volume) because it is redundant with the volume GI lists.
3224  // The opposite is not true -- we could not simply remove the
3225  // volume GI lists and rely on the user gi list. This is because
3226  // each volume GI list is translated in terms of the user GI list,
3227  // which means that only the intersection of the two lists is left
3228  // in the volume GI list.
3229 
3230  m_UserGiList.Reset();
3231 }
3232 
3234  vector<int> & oids,
3235  CSeqDBLockHold & locked) const
3236 {
3237  // It has not been decided whether sequence hash lookups are of
3238  // long term interest or whether standard databases will be built
3239  // with these indices, but it should not cause any harm to support
3240  // them for databases where the files do exist.
3241 
3242  // Since it is normal for a hash lookup to fail (the user of this
3243  // feature generally does not know if the sequence will be found),
3244  // the lack of hash indexing is reported by throwing an exception.
3245 
3247  if (m_IsamHash.Empty()) {
3249  eArgErr,
3250  "Hash lookup requested but no hash ISAM file found.");
3251  }
3252 
3253  m_IsamHash->HashToOids(hash, oids);
3254 }
3255 
3256 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
3257  (!defined(NCBI_COMPILER_MIPSPRO)) )
3259  int oid,
3260  CBlastDbBlob & blob,
3261  bool keep,
3262  CSeqDBLockHold & locked)
3263 {
3264  if (! m_HaveColumns) {
3265  x_OpenAllColumns(locked);
3266  }
3267 
3268  if ((int)m_Columns.size() == 0 || m_Columns[col_id].Empty())
3269  return;
3270 
3271  _ASSERT(col_id >= 0);
3272 
3273  m_Columns[col_id]->GetBlob(oid, blob, keep, & locked);
3274 }
3275 
3276 const map<string,string> &
3278  CSeqDBLockHold & locked)
3279 {
3280  //m_Atlas.Lock(locked);
3281 
3282  if (! m_HaveColumns) {
3283  x_OpenAllColumns(locked);
3284  }
3285 
3286  _ASSERT(col_id >= 0);
3287  _ASSERT(col_id < (int)m_Columns.size());
3288  _ASSERT(m_Columns[col_id].NotEmpty());
3289 
3290  return m_Columns[col_id]->GetMetaData();
3291 }
3292 
3294  CSeqDBLockHold & locked)
3295 {
3296  //m_Atlas.Lock(locked);
3297 
3298  if (! m_HaveColumns) {
3299  x_OpenAllColumns(locked);
3300  }
3301 
3302  ITERATE(vector< CRef<CSeqDBColumn> >, iter, m_Columns) {
3303  titles.insert((**iter).GetTitle());
3304  }
3305 }
3306 
3308 {
3309  //m_Atlas.Lock(locked);
3310 
3311  if (m_HaveColumns) {
3312  return;
3313  }
3314 
3315  string alpha("abcdefghijklmnopqrstuvwxyz");
3316  string ei("??a"), ed("??b"), ed2("??c");
3317 
3318  ei[0] = ed[0] = ed2[0] = (m_IsAA ? 'p' : 'n');
3319 
3320  map<string,int> unique_titles;
3321 
3322  for(size_t i = 0; i < alpha.size(); i++) {
3323  ei[1] = ed[1] = ed2[1] = alpha[i];
3324 
3326 
3327  bool big = CSeqDBColumn::ColumnExists(m_VolName, ed, m_Atlas);
3328  bool small = CSeqDBColumn::ColumnExists(m_VolName, ed2, m_Atlas);
3329 
3330  if ( ! (big || small)) continue;
3331 
3332  CRef<CSeqDBColumn> col;
3333 
3334  const Int2 bytetest = 0x0011;
3335  const char * ptr = (const char *) &bytetest;
3336  if (ptr[0] == 0x11 && small) {
3337  col.Reset(new CSeqDBColumn(m_VolName, ei, ed2, & locked));
3338  } else {
3339  col.Reset(new CSeqDBColumn(m_VolName, ei, ed, & locked));
3340  }
3341 
3342  string errmsg, errarg;
3343 
3344  string title = col->GetTitle();
3345 
3346  if (unique_titles[title]) {
3347  errmsg = "duplicate column title";
3348  errarg = title;
3349  } else {
3350  unique_titles[title] = 1;
3351  }
3352 
3353  int noidc(col->GetNumOIDs()), noidv(m_Idx->GetNumOIDs());
3354 
3355  if (noidc != noidv) {
3356  errmsg = "column has wrong #oids";
3357  errarg = NStr::IntToString(noidc) + " vs "
3358  + NStr::IntToString(noidv);
3359  }
3360 
3361  if (errmsg.size()) {
3362  if (errarg.size()) {
3363  errmsg += string(" [") + errarg + "].";
3364  }
3365  NCBI_THROW(CSeqDBException, eFileErr,
3366  string("Error: ") + errmsg);
3367  }
3368 
3369  m_Columns.push_back(col);
3370  }
3371  }
3372 
3373  m_HaveColumns = true;
3374 }
3375 
3376 int CSeqDBVol::GetColumnId(const string & title,
3377  CSeqDBLockHold & locked)
3378 {
3379  //m_Atlas.Lock(locked);
3380 
3381  if (! m_HaveColumns) {
3382  x_OpenAllColumns(locked);
3383  }
3384 
3385  for(size_t i = 0; i < m_Columns.size(); i++) {
3386  if (m_Columns[i]->GetTitle() == title) {
3387  return static_cast<int>(i);
3388  }
3389  }
3390 
3391  return -1;
3392 }
3393 #endif
3394 
3395 
3397 
#define FENCE_SENTRY
This sentry value is used as a 'fence' around the valid portions of partially decoded sequences.
Definition: blast_util.h:364
vector< TSeqRange > TRangeVector
ncbi::TMaskedQueryRegions mask
CBioseq_Handle –.
`Blob' Class for SeqDB (and WriteDB).
Definition: seqdbblob.hpp:56
TTaxIds GetTaxIds() const
Definition: Dbtag.hpp:53
CFastMutex –.
Definition: ncbimtx.hpp:667
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
CObjectIStreamAsnBinary –.
Definition: objistrasnb.hpp:59
CObjectOStreamAsnBinary –.
Definition: objostrasnb.hpp:58
CSeqDBAtlas class.
Definition: seqdbatlas.hpp:297
static char * Alloc(size_t length, bool clear=true)
Allocate memory that atlas will keep track of.
Definition: seqdbatlas.cpp:232
CSeqDBColumn class.
Definition: seqdbcol.hpp:59
static bool ColumnExists(const string &basename, const string &extn, CSeqDBAtlas &atlas)
Determine if the column exists.
Definition: seqdbcol.cpp:155
CSeqDBException.
Definition: seqdbcommon.hpp:73
void UnLease()
Release memory held in the atlas layer by this object.
Definition: seqdbfile.hpp:293
const char * GetFileDataPtr(const string &fname, TIndx offset)
Get a pointer to the specified offset.
Definition: seqdbatlas.hpp:755
CSeqDBGiIndex.
Definition: seqdbvol.hpp:55
CSeqDBFileMemMap m_Lease
Definition: seqdbvol.hpp:85
CSeqDBAtlas::TIndx TIndx
Definition: seqdbvol.hpp:57
static bool IndexExists(const string &name, const char prot_nucl)
Definition: seqdbvol.hpp:75
Int4 m_NumOIDs
Definition: seqdbvol.hpp:88
TGi GetSeqGI(TOid oid, CSeqDBLockHold &locked)
Definition: seqdbvol.cpp:51
CSeqDBGiList.
int GetNumGis() const
Get the number of GIs in the array.
int GetNumTaxIds() const
int GetNumSis() const
Get the number of Seq-ids in the array.
int GetNumTis() const
Get the number of TIs in the array.
int GetNumPigs() const
Uint8 GetMaskOpts() const
set< TTaxId > & GetTaxIdsList()
Header file.
Definition: seqdbfile.hpp:838
const char * GetFileDataPtr(TIndx start) const
Read part of the file into a buffer.
Definition: seqdbfile.hpp:907
Index file.
Definition: seqdbfile.hpp:410
string GetDate() const
Get the construction date of the volume.
Definition: seqdbfile.hpp:539
void GetHdrStartEnd(int oid, TIndx &start, TIndx &end) const
Get the location of a sequence's header data.
Definition: seqdbfile.hpp:708
void UnLease()
Release any memory leases temporarily held here.
Definition: seqdbfile.hpp:569
string GetTitle() const
Get the volume title.
Definition: seqdbfile.hpp:533
int GetNumOIDs() const
Get the number of oids in this volume.
Definition: seqdbfile.hpp:545
string GetLMDBFileName() const
Definition: seqdbfile.hpp:577
bool GetAmbStartEnd(int oid, TIndx &start, TIndx &end) const
Get the location of a sequence's ambiguity data.
Definition: seqdbfile.hpp:694
Uint8 GetVolumeLength() const
Get the length of the volume (in bases).
Definition: seqdbfile.hpp:551
int GetMinLength() const
Get the length of the shortest sequence in this volume.
Definition: seqdbfile.hpp:563
void GetSeqStart(int oid, TIndx &start) const
Get the location of a sequence's packed sequence data.
Definition: seqdbfile.hpp:729
char GetSeqType() const
Get the sequence data type.
Definition: seqdbfile.hpp:527
int GetMaxLength() const
Get the length of the longest sequence in this volume.
Definition: seqdbfile.hpp:557
void GetSeqStartEnd(int oid, TIndx &start, TIndx &end) const
Get the location of a sequence's packed sequence data.
Definition: seqdbfile.hpp:716
TValue & Lookup(int key)
Find a value in the cache.
CSeqDBIsam.
Definition: seqdbisam.hpp:127
bool IdToOid(Int8 id, TOid &oid)
GI or TI translation.
Definition: seqdbisam.hpp:225
void HashToOids(unsigned hash, vector< TOid > &oids)
Sequence hash lookup.
Definition: seqdbisam.cpp:1666
bool PigToOid(TPig pig, TOid &oid)
PIG translation.
Definition: seqdbisam.hpp:203
void IdsToOids(int vol_start, int vol_end, CSeqDBGiList &ids)
Translate Gis and Tis to Oids for the given ID list.
Definition: seqdbisam.cpp:1387
void GetIdBounds(Int8 &low_id, Int8 &high_id, int &count)
Get Numeric Bounds.
Definition: seqdbisam.cpp:1624
void UnLease()
Return any memory held by this object to the atlas.
Definition: seqdbisam.cpp:1215
void StringToOids(const string &acc, vector< TOid > &oids, bool adjusted, bool &version_check)
String translation.
Definition: seqdbisam.cpp:1235
static bool IndexExists(const string &dbname, char prot_nucl, char file_ext_char)
Check if a given ISAM index exists.
Definition: seqdbisam.cpp:1200
CSeqDBLockHold.
Definition: seqdbatlas.hpp:166
CSeqDBNegativeList.
int GetNumTaxIds() const
int GetNumTis() const
Get the number of TIs in the array.
int GetNumGis() const
Get the number of GIs in the array.
int GetNumSis() const
Get the number of SeqIds in the array.
set< TTaxId > & GetTaxIdsList()
CSeqDBRangeList.
Definition: seqdbvol.hpp:100
TRangeList m_Ranges
Range of offsets needed for this sequence.
Definition: seqdbvol.hpp:154
void SetRanges(const TRangeList &ranges, bool append_ranges, bool cache_data)
Set ranges of the sequence that will be used.
Definition: seqdbvol.cpp:3189
static int ImmediateLength()
Sequences shorter than this will not use ranges in any case.
Definition: seqdbvol.hpp:147
bool m_CacheData
True if caching of sequence data is required for this sequence.
Definition: seqdbvol.hpp:157
Sequence data file.
Definition: seqdbfile.hpp:748
void ReadBytes(char *buf, TIndx start, TIndx end) const
Read part of the file into a buffer.
Definition: seqdbfile.hpp:795
const char * GetFileDataPtr(TIndx start) const
Get a pointer into the file contents.
Definition: seqdbfile.hpp:822
static bool GetTaxNames(TTaxId tax_id, SSeqDBTaxInfo &info)
Get the taxonomy names for a given tax id.
Definition: seqdbtax.cpp:229
void OptimizeGiLists() const
Simplify the GI list configuration.
Definition: seqdbvol.cpp:3203
bool m_HaveColumns
True if we have opened the columns for this volume.
Definition: seqdbvol.hpp:1431
list< CRef< CSeq_id > > GetSeqIDs(int oid) const
Get the Seq-ids associated with a sequence.
Definition: seqdbvol.cpp:1826
CFastMutex m_MtxCachedRange
Definition: seqdbvol.hpp:1445
CRef< CSeqDBIsam > m_IsamGi
Handles translation of GIs to OIDs.
Definition: seqdbvol.hpp:1381
void SeqidToOids(CSeq_id &seqid, vector< int > &oids, CSeqDBLockHold &locked) const
Find OIDs for the specified Seq-id.
Definition: seqdbvol.cpp:2787
Uint8 x_GetSeqResidueOffset(int oid) const
Returns the base-offset of the specified oid.
Definition: seqdbvol.cpp:2897
void x_OpenHashFile(void) const
Definition: seqdbvol.cpp:234
void x_UnleasePigFile(void) const
Definition: seqdbvol.cpp:146
int GetAmbigPartialSeq(int oid, char **buffer, int nucl_code, ESeqDBAllocType alloc_type, CSeqDB::TSequenceRanges *partial_ranges, CSeqDB::TSequenceRanges *masks) const
Definition: seqdbvol.cpp:1522
void x_UnleaseTiFile(void) const
Definition: seqdbvol.cpp:221
CSeqDBAtlas & m_Atlas
The memory management layer.
Definition: seqdbvol.hpp:1358
void AccessionToOids(const string &acc, vector< int > &oids, CSeqDBLockHold &locked) const
Find OIDs for the specified accession or formatted Seq-id.
Definition: seqdbvol.cpp:2773
void GetColumnBlob(int col_id, int oid, CBlastDbBlob &blob, bool keep, CSeqDBLockHold &locked)
Fetch the data blob for the given column and oid.
Definition: seqdbvol.cpp:3258
CRef< CSeqDBHdrFile > m_Hdr
Contains header (defline) information for this volume.
Definition: seqdbvol.hpp:1373
CFastMutex m_MtxPig
Definition: seqdbvol.hpp:1440
void x_OpenSeqFile(void) const
Definition: seqdbvol.cpp:116
CSeqDBIntCache< CRef< CSeqdesc > > m_TaxCache
This cache allows CBioseqs to share taxonomic objects.
Definition: seqdbvol.hpp:1396
void x_OpenTiFile(void) const
Definition: seqdbvol.cpp:209
int GetAmbigSeq(int oid, char **buffer, int nucl_code, ESeqDBAllocType alloc_type, SSeqDBSlice *region, CSeqDB::TSequenceRanges *masks) const
Get a sequence with ambiguous regions.
Definition: seqdbvol.cpp:1484
vector< CRef< CSeqDBColumn > > m_Columns
Set of columns defined for this volume.
Definition: seqdbvol.hpp:1450
CRef< CSeqDBIdxFile > m_Idx
Metadata plus offsets into the sequence, header, and ambiguity data.
Definition: seqdbvol.hpp:1367
void x_OpenStrFile(void) const
Definition: seqdbvol.cpp:184
CFastMutex m_MtxTi
Definition: seqdbvol.hpp:1442
int GetSeqLengthExact(int oid) const
Exact sequence length for nucleotide databases.
Definition: seqdbvol.cpp:296
void x_StringToOids(const string &acc, ESeqDBIdType id_type, Int8 ident, const string &str_id, bool simplified, vector< int > &oids) const
Definition: seqdbvol.cpp:2620
void OpenSeqFile(CSeqDBLockHold &locked) const
Open sequence file.
Definition: seqdbvol.cpp:111
bool m_HashFileOpened
Definition: seqdbvol.hpp:1436
int GetColumnId(const string &title, CSeqDBLockHold &locked)
Get an ID number for a given column title.
Definition: seqdbvol.cpp:3376
CRef< CSeqDBIsam > m_IsamStr
Handles translation of strings (accessions) to OIDs.
Definition: seqdbvol.hpp:1384
vector< CRef< CSeqDBGiList > > TGiLists
A set of GI lists.
Definition: seqdbvol.hpp:904
CSeqDBIntCache< TDeflineCacheItem > m_DeflineCache
Cache of filtered deflines.
Definition: seqdbvol.hpp:1428
int x_GetAmbigSeq(int oid, char **buffer, int nucl_code, ESeqDBAllocType alloc_type, SSeqDBSlice *region, CSeqDB::TSequenceRanges *masks) const
Get a sequence with ambiguous regions.
Definition: seqdbvol.cpp:1589
int m_VolStart
Starting OID of this volume.
Definition: seqdbvol.hpp:1419
int GetNumOIDs() const
Get the number of OIDs for this volume.
Definition: seqdbvol.cpp:2370
bool GetGi(int oid, TGi &gi, CSeqDBLockHold &locked) const
Find the GI given an OID.
Definition: seqdbvol.cpp:2577
CRef< CBlast_def_line_set > x_GetHdrAsn1(int oid, bool adjust_oids, bool *changed) const
Get sequence header object.
Definition: seqdbvol.cpp:2184
void GetPigBounds(int &low_id, int &high_id, int &count, CSeqDBLockHold &locked) const
Get PIG Bounds.
Definition: seqdbvol.cpp:3093
void x_FilterHasId(const CSeq_id &id, bool &have_user, bool &have_vol) const
Determine if a user ID list affects this ID, and how.
Definition: seqdbvol.hpp:943
string m_VolName
The name of this volume.
Definition: seqdbvol.hpp:1364
CTempString x_GetHdrAsn1Binary(int oid) const
Get sequence header binary data.
Definition: seqdbvol.cpp:2285
CFastMutex m_MtxHdr
Definition: seqdbvol.hpp:1444
void FlushOffsetRangeCache()
Flush all offset ranges cached.
Definition: seqdbvol.cpp:3183
CSeqDBVol(CSeqDBAtlas &atlas, const string &name, char prot_nucl, CSeqDBGiList *user_list, CSeqDBNegativeList *neg_list, int vol_start, CSeqDBLockHold &locked)
Constructor.
Definition: seqdbvol.cpp:72
void IdsToOids(CSeqDBGiList &gis, CSeqDBLockHold &locked) const
Translate Gis to Oids for the given vector of Gi/Oid pairs.
Definition: seqdbvol.cpp:2483
void GetRawSeqAndAmbig(int oid, const char **buffer, int *seq_length, int *ambig_length) const
Get Raw Sequence and Ambiguity Data.
Definition: seqdbvol.cpp:2979
void x_OpenHdrFile(void) const
Definition: seqdbvol.cpp:125
string GetLMDBFileName() const
Get the LMDB file name associated with this volume. Empty string if version 4.
Definition: seqdbvol.cpp:275
void ListColumns(set< string > &titles, CSeqDBLockHold &locked)
List the titles of all columns for this volume.
Definition: seqdbvol.cpp:3293
int GetSeqLengthApprox(int oid) const
Approximate sequence length for nucleotide databases.
Definition: seqdbvol.cpp:319
CFastMutex m_MtxStr
Definition: seqdbvol.hpp:1441
TRangeCache m_RangeCache
Cached/ranged sequence info.
Definition: seqdbvol.hpp:1416
int m_VolEnd
First OID past end of this volume.
Definition: seqdbvol.hpp:1422
bool m_SeqFileOpened
True if the volume file has been opened, or at least an attempt was made to open it.
Definition: seqdbvol.hpp:1434
bool GetPig(int oid, int &pig, CSeqDBLockHold &locked) const
Find the PIG given an OID.
Definition: seqdbvol.cpp:2406
int GetSeqLengthProt(int oid) const
Sequence length for protein databases.
Definition: seqdbvol.cpp:280
CFastMutex m_MtxSeq
Definition: seqdbvol.hpp:1443
CRef< CSeqDBSeqFile > m_Seq
Contains sequence data for this volume.
Definition: seqdbvol.hpp:1370
CRef< CSeqdesc > x_GetAsnDefline(int oid) const
Get sequence header information structures.
Definition: seqdbvol.cpp:1251
string GetTitle() const
Get the volume title.
Definition: seqdbvol.cpp:2375
CRef< CSeqDBGiIndex > m_GiIndex
The GI index file (for fast oid->gi conversion)
Definition: seqdbvol.hpp:1393
CRef< CSeqDBNegativeList > m_NegativeList
The negative ID list, if one exists.
Definition: seqdbvol.hpp:1402
TGiLists m_VolumeGiLists
The volume GI lists, if any exist.
Definition: seqdbvol.hpp:1405
CRef< CSeqDBIsam > m_IsamTi
Handles translation of TI (trace ids) to OIDs.
Definition: seqdbvol.hpp:1387
int x_GetSequence(int oid, const char **buffer) const
Get sequence data.
Definition: seqdbvol.cpp:1756
CRef< CSeqDBIsam > m_IsamHash
Handles translation of sequence hash value to OIDs.
Definition: seqdbvol.hpp:1390
void UnLease()
Return expendable resources held by this volume.
Definition: seqdbvol.cpp:2801
list< CRef< CSeqdesc > > x_GetTaxonomy(int oid, TGi preferred_gi, const CSeq_id *preferred_seq_id)
Get taxonomic descriptions of a sequence.
Definition: seqdbvol.cpp:1081
int m_OidMaskType
Definition: seqdbvol.hpp:1410
CRef< CBlast_def_line_set > x_GetFilteredHeader(int oid, bool *changed) const
Get sequence header information.
Definition: seqdbvol.cpp:1953
char * x_AllocType(size_t length, ESeqDBAllocType alloc_type) const
Allocate memory in one of several ways.
Definition: seqdbvol.cpp:1456
void x_CheckVersions(const string &acc, vector< int > &oids) const
Check Seq-id versions for special sparse-id support case.
Definition: seqdbvol.cpp:2717
bool GiToOid(TGi gi, int &oid, CSeqDBLockHold &locked) const
Find the OID given a GI.
Definition: seqdbvol.cpp:2471
bool m_HdrFileOpened
Definition: seqdbvol.hpp:1435
TGi GetSeqGI(int oid, CSeqDBLockHold &locked) const
Get the GI of a sequence. This method returns the GI of the sequence.
Definition: seqdbvol.cpp:1870
CRef< CSeq_data > GetSeqData(int oid, TSeqPos begin, TSeqPos end, CSeqDBLockHold &locked) const
Fetch data as a CSeq_data object.
Definition: seqdbvol.cpp:2907
void GetGiBounds(TGi &low_id, TGi &high_id, int &count, CSeqDBLockHold &locked) const
Get GI Bounds.
Definition: seqdbvol.cpp:3068
int GetOidAtOffset(int first_seq, Uint8 residue, CSeqDBLockHold &locked) const
Find the OID at a given index into the database.
Definition: seqdbvol.cpp:2822
char GetSeqType() const
Get the sequence type stored in this database.
Definition: seqdbvol.cpp:265
bool x_HaveIdFilter(void) const
Returns true if this volume has an ID list.
Definition: seqdbvol.hpp:919
void x_OpenAllColumns(CSeqDBLockHold &locked)
Find all columns for this volume.
Definition: seqdbvol.cpp:3307
CRef< CBioseq > GetBioseq(int oid, TGi pref_gi, const CSeq_id *pref_seq_id, bool seqdata, CSeqDBLockHold &locked)
Get a CBioseq object for this sequence.
Definition: seqdbvol.cpp:1287
int GetMinLength() const
Get the length of the smallest sequence in this volume.
Definition: seqdbvol.cpp:2390
void x_UnleaseStrFile(void) const
Definition: seqdbvol.cpp:196
CRef< CSeqDBGiList > m_UserGiList
The user ID list, if one exists.
Definition: seqdbvol.hpp:1399
CSeqDBAtlas::TIndx TIndx
Import TIndx definition from the CSeqDBAtlas class.
Definition: seqdbvol.hpp:172
void x_OpenGiFile(void) const
Definition: seqdbvol.cpp:159
pair< CRef< CBlast_def_line_set >, bool > TDeflineCacheItem
Filtered defline plus whether binary data needed changes.
Definition: seqdbvol.hpp:1425
CRef< CSeqDBIsam > m_IsamPig
Handles translation of PIGs to OIDs.
Definition: seqdbvol.hpp:1378
bool m_IsAA
True if the volume is protein, false for nucleotide.
Definition: seqdbvol.hpp:1361
const map< string, string > & GetColumnMetaData(int col_id, CSeqDBLockHold &locked)
Get all metadata for the specified column.
Definition: seqdbvol.cpp:3277
CRef< CBlast_def_line_set > x_GetTaxDefline(int oid, TGi preferred_gi, const CSeq_id *preferred_seq_id)
Get defline filtered by several criteria.
Definition: seqdbvol.cpp:1036
void x_OpenOidFile(void) const
Definition: seqdbvol.cpp:251
int GetMaxLength() const
Get the length of the largest sequence in this volume.
Definition: seqdbvol.cpp:2385
bool PigToOid(int pig, int &oid) const
Find the OID given a PIG.
Definition: seqdbvol.cpp:2395
bool TiToOid(Int8 ti, int &oid, CSeqDBLockHold &locked) const
Find the OID given a TI.
Definition: seqdbvol.cpp:2438
void x_OpenPigFile(void) const
Definition: seqdbvol.cpp:134
void x_GetFilteredBinaryHeader(int oid, vector< char > &hdr_data) const
Get binary sequence header information.
Definition: seqdbvol.cpp:2301
CFastMutex m_MtxGi
Definition: seqdbvol.hpp:1439
Uint8 GetVolumeLength() const
Get the total length of this volume (in bases).
Definition: seqdbvol.cpp:1880
CRef< CBlast_def_line_set > GetFilteredHeader(int oid, CSeqDBLockHold &locked) const
Get filtered sequence header information.
Definition: seqdbvol.cpp:1886
bool m_OidFileOpened
Definition: seqdbvol.hpp:1437
string GetDate() const
Get the formatting date of the volume.
Definition: seqdbvol.cpp:2380
int m_MemBit
The filtering MEMB_BIT.
Definition: seqdbvol.hpp:1408
void x_UnleaseGiFile(void) const
Definition: seqdbvol.cpp:171
void SetOffsetRanges(int oid, const TRangeList &offset_ranges, bool append_ranges, bool cache_data) const
Apply a range of offsets to a database sequence.
Definition: seqdbvol.cpp:3131
void GetStringBounds(string &low_id, string &high_id, int &count) const
Get String Bounds.
Definition: seqdbvol.cpp:3116
void HashToOids(unsigned hash, vector< int > &oids, CSeqDBLockHold &locked) const
Get the OIDs for a given sequence hash.
Definition: seqdbvol.cpp:3233
void x_GetAmbChar(int oid, vector< Int4 > &ambchars) const
Get ambiguity information.
Definition: seqdbvol.cpp:2332
char x_GetSeqType() const
Returns 'p' for protein databases, or 'n' for nucleotide.
Definition: seqdbvol.cpp:270
static const string kOidNotFound
String containing the error message in exceptions thrown when a given OID cannot be found.
Definition: seqdb.hpp:316
static CRef< CBlast_def_line_set > ExtractBlastDefline(const CBioseq &bioseq)
Extract a Blast-def-line-set object from a Bioseq retrieved by CSeqDB.
Definition: seqdbvol.cpp:1247
@Seq_descr.hpp User-defined methods of the data storage class.
Definition: Seq_descr.hpp:55
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
void erase(iterator pos)
Definition: map.hpp:167
const_iterator end() const
Definition: map.hpp:152
void clear()
Definition: map.hpp:169
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: set.hpp:45
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator begin() const
Definition: set.hpp:135
size_type size() const
Definition: set.hpp:132
bool empty() const
Definition: set.hpp:133
const_iterator find(const key_type &key) const
Definition: set.hpp:137
const_iterator end() const
Definition: set.hpp:136
static int base_length[29]
#define T(s)
Definition: common.h:230
#define kAsnDeflineObjLabel
static int failure
Definition: t0019.c:11
#define true
Definition: bool.h:35
#define false
Definition: bool.h:36
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static void DLIST_NAME() remove(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:90
static char tmp[3200]
Definition: utf8.c:42
static tds_mutex mtx
Definition: condition.c:43
int offset
Definition: replacements.h:160
#define H(x, y, z)
Definition: md4.c:180
static TDSRET convert(TDSSOCKET *tds, TDSICONV *conv, TDS_ICONV_DIRECTION direction, const char *from, size_t from_len, char *dest, size_t *dest_len)
Definition: charconv.c:57
char data[12]
Definition: iconv.c:80
#define INVALID_GI
Definition: ncbimisc.hpp:1089
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
#define GI_FROM(T, value)
Definition: ncbimisc.hpp:1086
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define TAX_ID_TO(T, tax_id)
Definition: ncbimisc.hpp:1110
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
#define TAX_ID_FROM(T, value)
Definition: ncbimisc.hpp:1111
#define ZERO_GI
Definition: ncbimisc.hpp:1088
#define GI_TO(T, gi)
Definition: ncbimisc.hpp:1085
string
Definition: cgiapp.hpp:690
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
TPrim & Set(void)
Definition: serialbase.hpp:351
const string AsFastaString(void) const
Definition: Seq_id.cpp:2266
E_SIC
Compare return values.
Definition: Seq_id.hpp:579
@ fAcc_predicted
Definition: Seq_id.hpp:254
@ e_NO
different SeqId types-can't compare
Definition: Seq_id.hpp:582
@ e_YES
SeqIds compared and are equivalent.
Definition: Seq_id.hpp:583
void Close(void)
Detach reader from a data source.
Definition: objistr.cpp:539
void OpenFromBuffer(const char *buffer, size_t size)
Attach reader to a data source.
Definition: objistr.cpp:501
void AddReference(void) const
Add reference to object.
Definition: ncbiobj.hpp:489
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
void RemoveReference(void) const
Remove reference to object.
Definition: ncbiobj.hpp:500
bool ReferencedOnlyOnce(void) const THROWS_NONE
Check if object is referenced only once.
Definition: ncbiobj.hpp:475
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int16_t Int2
2-byte (16-bit) signed integer
Definition: ncbitype.h:100
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
void Reset(void)
Reset random number generator to initial startup condition (LFG only)
Definition: random_gen.cpp:234
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static string Int8ToString(Int8 value, TNumToStringFlags flags=0, int base=10)
Convert Int8 to string.
Definition: ncbistr.hpp:5153
#define kEmptyStr
Definition: ncbistr.hpp:123
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5078
const char * data(void) const
Return a pointer to the array represented.
Definition: tempstr.hpp:313
size_type length(void) const
Return the length of the represented array.
Definition: tempstr.hpp:320
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
size_type size(void) const
Return the length of the represented array.
Definition: tempstr.hpp:327
@ fConvErr_NoThrow
Do not throw an exception on error.
Definition: ncbistr.hpp:285
@ fAllowTrailingSymbols
Ignore trailing non-numerics characters.
Definition: ncbistr.hpp:298
static TID GetSelf(void)
Definition: ncbithr.cpp:515
const TSeqid & GetSeqid(void) const
Get the Seqid member data.
bool CanGetTitle(void) const
Check if it is safe to call GetTitle method.
bool IsSetLinks(void) const
Check if a value has been assigned to Links data member.
TTaxid GetTaxid(void) const
Get the Taxid member data.
bool IsSet(void) const
Check if a value has been assigned to data member.
bool IsSetMemberships(void) const
bit arrays Repurposed to store the (multiple) taxIDs associated with WP proteins.
bool IsSetTaxid(void) const
Check if a value has been assigned to Taxid data member.
const TLinks & GetLinks(void) const
Get the Links member data.
const TMemberships & GetMemberships(void) const
Get the Memberships member data.
Tdata & Set(void)
Assign a value to data member.
bool CanGet(void) const
Check if it is safe to call Get method.
const Tdata & Get(void) const
Get the member data.
bool CanGetSeqid(void) const
Check if it is safe to call GetSeqid method.
bool CanGetMemberships(void) const
Check if it is safe to call GetMemberships method.
const TTitle & GetTitle(void) const
Get the Title member data.
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
const TTag & GetTag(void) const
Get the Tag member data.
Definition: Dbtag_.hpp:267
void SetTag(TTag &value)
Assign a value to Tag data member.
Definition: Dbtag_.cpp:66
const TDb & GetDb(void) const
Get the Db member data.
Definition: Dbtag_.hpp:220
vector< vector< char > * > TOss
TData & SetData(void)
Assign a value to Data data member.
void SetNum(TNum value)
Assign a value to Num data member.
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
void SetLabel(TLabel &value)
Assign a value to Label data member.
TStr & SetStr(void)
Select the variant.
Definition: Object_id_.hpp:304
const TData & GetData(void) const
Get the Data member data.
void SetType(TType &value)
Assign a value to Type data member.
void SetData(TData &value)
Assign a value to Data data member.
const TType & GetType(void) const
Get the Type member data.
void SetDb(const TDb &value)
Assign a value to Db data member.
Definition: Dbtag_.hpp:229
TId GetId(void) const
Get the variant data.
Definition: Object_id_.hpp:270
void SetCommon(const TCommon &value)
Assign a value to Common data member.
Definition: Org_ref_.hpp:428
TDb & SetDb(void)
Assign a value to Db data member.
Definition: Org_ref_.hpp:497
void SetTaxname(const TTaxname &value)
Assign a value to Taxname data member.
Definition: Org_ref_.hpp:381
TGeneral & SetGeneral(void)
Select the variant.
Definition: Seq_id_.cpp:375
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_id_.hpp:746
TVersion GetVersion(void) const
Get the Version member data.
bool CanGetVersion(void) const
Check if it is safe to call GetVersion method.
bool CanGetAccession(void) const
Check if it is safe to call GetAccession method.
const TAccession & GetAccession(void) const
Get the Accession member data.
@ e_General
for other databases
Definition: Seq_id_.hpp:105
@ e_Gi
GenInfo Integrated Database.
Definition: Seq_id_.hpp:106
@ e_Local
local use
Definition: Seq_id_.hpp:95
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
TTitle & SetTitle(void)
Select the variant.
Definition: Seqdesc_.hpp:1039
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
TSource & SetSource(void)
Select the variant.
Definition: Seqdesc_.cpp:572
TNcbistdaa & SetNcbistdaa(void)
Select the variant.
Definition: Seq_data_.hpp:697
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
TUser & SetUser(void)
Select the variant.
Definition: Seqdesc_.cpp:390
void SetRepr(TRepr value)
Assign a value to Repr data member.
Definition: Seq_inst_.hpp:574
Tdata & Set(void)
Assign a value to data member.
Definition: Seq_descr_.hpp:172
void SetLength(TLength value)
Assign a value to Length data member.
Definition: Seq_inst_.hpp:668
void SetSeq_data(TSeq_data &value)
Assign a value to Seq_data data member.
Definition: Seq_inst_.cpp:130
TNcbi4na & SetNcbi4na(void)
Select the variant.
Definition: Seq_data_.hpp:577
void SetMol(TMol value)
Assign a value to Mol data member.
Definition: Seq_inst_.hpp:621
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ eRepr_not_set
empty
Definition: Seq_inst_.hpp:92
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
char * buf
int i
range(_Ty, _Ty) -> range< _Ty >
const struct ncbi::grid::netcache::search::fields::SIZE size
const CharType(& source)[N]
Definition: pointer.h:1149
Multi-threading – mutexes; rw-locks; semaphore.
T max(T x_, T y_)
T min(T x_, T y_)
#define A
#define count
static uint8_t * buffer
Definition: pcre2test.c:1016
ESeqDBAllocType
Certain methods have an "Alloc" version.
@ eAtlas
@ eMalloc
ESeqDBIdType SeqDB_SimplifySeqid(CSeq_id &bestid, const string *acc, Int8 &num_id, string &str_id, bool &simpler)
Seq-id simplification.
const int kSeqDBNuclNcbiNA8
Used to request ambiguities in Ncbi/NA8 format.
ESeqDBIdType SeqDB_SimplifyAccession(const string &acc, Int8 &num_id, string &str_id, bool &simpler)
String id simplification.
const int kSeqDBNuclBlastNA8
Used to request ambiguities in BLAST/NA8 format.
ESeqDBIdType
Various identifier formats used in Id lookup.
@ eStringId
Some sequence sources use string identifiers.
@ eTiId
Trace ID is a numeric identifier for Trace sequences.
@ ePigId
Each PIG identifier refers to exactly one protein sequence.
@ eGiId
Genomic ID is a relatively stable numeric identifier for sequences.
@ eHashId
Lookup from sequence hash values to OIDs.
@ eOID
@ fExcludeModel
T SeqDB_GetStdOrd(const T *stdord_obj)
Read a network order integer value.
The SeqDB oid filtering layer.
static bool s_SeqDB_SeqIdIn(const list< CRef< CSeq_id > > &seqids, const CSeq_id &target)
Search for a Seq-id in a list of Seq-ids.
Definition: seqdbvol.cpp:1013
bool s_IncludeDefline_Taxid(const CBlast_def_line &def, const set< TTaxId > &user_tax_ids)
Definition: seqdbvol.cpp:1892
static void s_SeqDBMapNA2ToNA4(const char *buf2bit, vector< char > &buf4bit, int base_length)
Convert sequence data from NA2 to NA4 format.
Definition: seqdbvol.cpp:389
Uint4 s_ResVal(const vector< Int4 > &ambchars, Uint4 i)
Get ambiguous residue value (old version)
Definition: seqdbvol.cpp:659
void SeqDB_UnpackAmbiguities(const CTempString &sequence, const CTempString &ambiguities, string &result)
Unpack an ambiguous nucleotide sequence.
Definition: seqdbvol.cpp:1698
static void s_SeqDBRebuildDNA_NA8(char *seq, const vector< Int4 > &amb_chars, const SSeqDBSlice &region)
Rebuild an ambiguous region from sequence and ambiguity data.
Definition: seqdbvol.cpp:785
Uint4 s_ResLenOld(const vector< Int4 > &ambchars, Uint4 i)
Get ambiguous region length (old version)
Definition: seqdbvol.cpp:675
Uint4 s_ResLenNew(const vector< Int4 > &ambchars, Uint4 i)
Get length of ambiguous region (new version)
Definition: seqdbvol.cpp:623
set< pair< int, int > > TRangeVector
List of offset ranges as begin/end pairs.
Definition: seqdbvol.cpp:1587
bool s_IncludeDefline_NegativeTaxid(const CBlast_def_line &def, const set< TTaxId > &user_tax_ids)
Definition: seqdbvol.cpp:1938
static vector< Uint1 > s_SeqDBMapNA2ToNA4Setup()
Build NA2 to NcbiNA4 translation table.
Definition: seqdbvol.cpp:351
Uint4 s_ResPosNew(const vector< Int4 > &ambchars, Uint4 i)
Get position of ambiguous region (new version)
Definition: seqdbvol.cpp:639
static void s_SeqDBRebuildDNA_NA4(vector< char > &buf4bit, const vector< Int4 > &amb_chars)
Rebuild an ambiguous region from sequence and ambiguity data.
Definition: seqdbvol.cpp:710
static void s_SeqDBWriteSeqDataProt(CSeq_inst &seqinst, const char *seq_buffer, int length)
Store protein sequence data in a Seq-inst.
Definition: seqdbvol.cpp:840
static void s_GetBioseqTitle(CRef< CBlast_def_line_set > deflines, string &title)
Get the title string for a CBioseq.
Definition: seqdbvol.cpp:938
static void s_SeqDBFitsInFour(T id)
Definition: seqdbvol.cpp:3059
unsigned SeqDB_ncbina8_to_blastna8[]
Definition: seqdbvol.cpp:571
static CRef< CBlast_def_line_set > s_OssToDefline(const CUser_field::TData::TOss &oss)
Efficiently decode a Blast-def-line-set from binary ASN.1.
Definition: seqdbvol.cpp:1176
Uint4 s_ResPosOld(const vector< Int4 > &ambchars, Uint4 i)
Get position of ambiguous region (old version)
Definition: seqdbvol.cpp:691
static void s_SeqDBMapNcbiNA8ToBlastNA8(char *buf, const SSeqDBSlice &range)
Convert sequence data from Ncbi-NA8 to Blast-NA8 format.
Definition: seqdbvol.cpp:601
static vector< Uint1 > s_SeqDBMapNA2ToNA8Setup()
Build NA2 to Ncbi-NA8 translation table.
Definition: seqdbvol.cpp:440
static void s_SeqDBMaskSequence(char *seq, CSeqDB::TSequenceRanges *masks, char mask_letter, const SSeqDBSlice &range)
Definition: seqdbvol.cpp:1499
static void s_SeqDBWriteSeqDataNucl(CSeq_inst &seqinst, const char *seq_buffer, int length)
Store non-ambiguous nucleotide sequence data in a Seq-inst.
Definition: seqdbvol.cpp:877
CRef< CBlast_def_line_set > s_ExtractBlastDefline(const T &bioseq)
Definition: seqdbvol.cpp:1215
static void s_SeqDBMapNA2ToNA8(const char *buf2bit, char *buf8bit, const SSeqDBSlice &range)
Convert sequence data from NA2 to NA8 format.
Definition: seqdbvol.cpp:481
bool s_IncludeDefline_MaskFilter(const CBlast_def_line &def, Uint8 mask)
Definition: seqdbvol.cpp:1925
Defines database volume access classes.
List of sequence offset ranges.
Definition: seqdb.hpp:236
bool empty() const
Definition: seqdb.hpp:272
size_type size() const
Definition: seqdb.hpp:274
OID-Range type to simplify interfaces.
int begin
First oid in range.
int end
OID after last included oid.
SSeqDBTaxInfo.
string common_name
Common name, such as "noisy night monkey".
string scientific_name
Scientific name, such as "Aotus vociferans".
Definition: _hash_fun.h:40
#define _ASSERT
else result
Definition: token2.c:20
void free(voidpf ptr)
voidp malloc(uInt size)
Modified on Fri Sep 20 14:58:26 2024 by modify_doxy.py rev. 669887