NCBI C++ ToolKit
seqdbisam.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef OBJTOOLS_READERS_SEQDB__SEQDBISAM_HPP
2 #define OBJTOOLS_READERS_SEQDB__SEQDBISAM_HPP
3 
4 /* $Id: seqdbisam.hpp 100101 2023-06-15 14:10:29Z merezhuk $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author: Kevin Bealer
30  *
31  */
32 
33 /// @file seqdbisam.hpp
34 /// ISAM index database access object.
35 ///
36 /// Defines classes:
37 /// CSeqDBIsam
38 ///
39 /// Implemented for: UNIX, MS-Windows
40 
41 
44 
46 
47 /// Bring the object directory definitions into this scope
49 
50 // Advance gi_index to the first entry of the ID list whose key is
51 // greater-than-or-equal-to the key, then step past translated entries.
// The scan moves one entry at a time and, after each single step, probes
// ahead with a stride that doubles (2, 4, 8, ...) while the probed key is
// still below `key`.  Presumably the list is ordered by key - TODO confirm.
// NOTE(review): the listing line that carries the function name and the
// `gis` parameter is absent from this extract; the name s_AdvanceGiList is
// taken from the call sites further down.
52 template <class T> static inline void
54  int & gi_index,
55  int gis_size,
56  const T & key)
57 {
58  while( (gi_index < gis_size)
59  && (gis.GetKey<T>(gi_index) < key)) {
60 
61  ++gi_index;
62  int jump = 2;
63 
 // Hop forward while the entry `jump` positions ahead is still below key.
64  while( (gi_index + jump < gis_size)
65  && (gis.GetKey<T>(gi_index + jump) < key)) {
66 
67  gi_index += jump;
68  jump *= 2;
69  }
70  }
71 
72  // Skip entries for which IsValueSet() already reports a value.
73  while( (gi_index < gis_size)
74  && (gis.IsValueSet<T>(gi_index) )) ++gi_index;
75 }
76 
/// Find the last key that is less-than-or-equal-to a target value.
///
/// Starting at `index`, the search moves forward one element and then
/// probes ahead with strides of 2, 4, 8, ... for as long as the probed
/// key is still <= target; when a probe fails it falls back to a single
/// step and starts doubling again.  On return `index` is one less than
/// the position at which the scan stopped (the first key found to be
/// > target, or num_keys if there is none).
///
/// @param keys
///   Keys to search; presumably sorted ascending (TODO confirm). [in]
/// @param index
///   Position to start from; receives the result.  If keys[index] is
///   already > target on entry the result is index - 1, which is -1
///   when starting from 0.  [in|out]
/// @param num_keys
///   Number of leading elements of keys to consider. [in]
/// @param target
///   Value to search for. [in]
template <class T> static inline void
s_AdvanceKeyList(const std::vector<T> & keys,
                 int                  & index,
                 int                    num_keys,
                 const T              & target)
{
    while ((index < num_keys) && (keys[index] <= target)) {
        ++index;

        // index <= num_keys here, so `num_keys - index` cannot overflow
        // the way the sum `index + jump` could for very large key counts.
        int jump = 2;
        while ((jump < num_keys - index) && (keys[index + jump] <= target)) {
            index += jump;

            // Once 2 * jump can no longer land inside the remaining
            // range the next probe would fail anyway; stopping here
            // also keeps the doubling from overflowing.
            if (jump > (num_keys - index) / 2) {
                break;
            }
            jump *= 2;
        }
    }
    --index;
}
100 
101 // Store `value` into every consecutive entry, starting at gi_index, whose
// key equals `key`; gi_index is left on the first entry that does not match
// (or at gis_size).
// NOTE(review): the listing line that carries the function name and the
// `gis` parameter is absent from this extract; the name s_SetTranslation is
// taken from the call sites further down.
102 template <class T> static inline void
104  int & gi_index,
105  int gis_size,
106  const T & key,
107  int value)
108 {
109  while( (gi_index < gis_size)
110  && (gis.GetKey<T>(gi_index) == key)) {
111 
112  gis.SetValue<T>(gi_index, value);
113  ++gi_index;
114  }
115 }
116 
117 /// CSeqDBIsam
118 ///
119 /// Manages one ISAM file, which will translate either PIGs, GIs, or
120 /// Accessions to OIDs. Translation in the other direction is done in
121 /// the CSeqDBVol code. Files managed by this class include those
122 /// with the extensions pni, pnd, ppi, ppd, psi, psd, nsi, nsd, nni,
123 /// and nnd. Each instance of this object will manage one pair of
124 /// these files, including one whose name ends in 'i' and one whose
125 /// name ends in 'd'.
126 
128 public:
129  /// Import the type representing one GI, OID association.
131 
132  /// Types of database this class can access.
133  enum EIsamDbType {
134  eNumeric = 0, ///< Numeric database with Key/Value pairs in the index file.
135  eNumericNoData = 1, ///< This type is not supported.
136  eString = 2, ///< String database type used here.
137  eStringDatabase = 3, ///< This type is not supported.
138  eStringBin = 4, ///< This type is not supported.
139  eNumericLongId = 5 ///< Like eNumeric but with 8 bytes of Key data per record.
140  };
141 
142  /// Type which is large enough to span the bytes of an ISAM file.
144 
145  /// This class works with OIDs relative to a specific volume.
146  typedef int TOid;
147 
148  /// PIG identifiers for numeric indices over protein volumes.
149  //typedef int TPig;
150 
151  /// Genomic IDs, the most common numerical identifier.
152 // typedef int TGi;
153 
154  /// Identifier type for trace databases.
155  typedef Int8 TTi;
156 
157  /// Type large enough to hold any numerical ID.
158  typedef Int8 TId;
159 
160  /// Constructor
161  ///
162  /// An ISAM file object corresponds to an index file and a data
163  /// file, and converts identifiers (string, GI, or PIG) into OIDs
164  /// relative to a particular database volume.
165  ///
166  /// @param atlas
167  /// The memory management object. [in]
168  /// @param dbname
169  /// The name of the volume's files (minus the extension). [in]
170  /// @param prot_nucl
171  /// Whether the sequences are protein or nucleotide. [in]
172  /// @param file_ext_char
173  /// This is 's', 'n', or 'p', for string, GI, or PIG, respectively. [in]
174  /// @param ident_type
175  /// The type of identifiers this database translates. [in]
176  CSeqDBIsam(CSeqDBAtlas & atlas,
177  const string & dbname,
178  char prot_nucl,
179  char file_ext_char,
180  ESeqDBIdType ident_type);
181 
182  /// Destructor
183  ///
184  /// Releases all resources associated with this object.
185  ~CSeqDBIsam();
186 
187  /// PIG translation
188  ///
189  /// A PIG identifier is translated to an OID. PIG identifiers are
190  /// used exclusively for protein sequences. One PIG corresponds
191  /// to exactly one sequence of amino acids, and vice versa. They
192  /// are also stable; the sequence a PIG points to will never be
193  /// changed.
194  ///
195  /// @param pig
196  /// The PIG to look up. [in]
197  /// @param oid
198  /// The returned oid. [out]
199  /// @return
200  /// true if the PIG was found
201  ///
202  /// @note The lookup itself is delegated to x_IdentToOid().
203  bool PigToOid(TPig pig, TOid & oid)
204  {
 // This entry point expects the object to hold a PIG index.
205  _ASSERT(m_IdentType == ePigId);
206  return x_IdentToOid(pig, oid);
207  }
208 
209  /// GI or TI translation
210  ///
211  /// A GI or TI identifier is translated to an OID. GI identifiers
212  /// are used for all types of sequences. TI identifiers are used
213  /// primarily for nucleotide data in the Trace DBs. Multiple GIs
214  /// may indicate the same sequence of bases and the same OID, but
215  /// TIs are usually unique.
216  ///
217  /// @param id
218  /// The GI or TI to look up. [in]
219  /// @param oid
220  /// The returned oid. [out]
221  /// @return
222  /// true if the GI or TI was found
223  ///
224  /// @note The lookup itself is delegated to x_IdentToOid().
225  bool IdToOid(Int8 id, TOid & oid)
226  {
 // This entry point expects the object to hold a GI or TI index.
227  _ASSERT(m_IdentType == eGiId || m_IdentType == eTiId);
228  return x_IdentToOid(id, oid);
229  }
230 
231  /// Translate Gis and Tis to Oids for the given ID list.
232  ///
233  /// This method iterates over a vector of Gi/OID and/or Ti/OID
234  /// pairs. For each pair where the OID is -1, the GI or TI will
235  /// be looked up in the ISAM file, and (if found) the correct OID
236  /// will be stored (otherwise the -1 will remain). This method
237  /// will normally be called once for each volume.
238  ///
239  /// @param vol_start
240  /// The starting OID of this volume. [in]
241  /// @param vol_end
242  /// The first OID past the end of this volume. [in]
243  /// @param ids
244  /// The set of GI-OID or TI-OID pairs. [in|out]
247  void IdsToOids(int vol_start,
248  int vol_end,
249  CSeqDBGiList & ids);
250 
251 
252  /// Compute list of included OIDs based on a negative ID list.
253  ///
254  /// This method iterates over a vector of Gis or Tis, along with
255  /// the corresponding ISAM file for this volume. Each OID found
256  /// in the ISAM file is marked in the negative ID list. For those
257  /// for which the GI or TI is not mentioned in the negative ID
258  /// list, the OID will be marked as an 'included' OID in the ID
259  /// list (that OID will be searched). The OIDs for IDs that are
260  /// not found in the ID list will be marked as 'visible' OIDs.
261  /// When this process is done for all volumes, the SeqDB object
262  /// will use all OIDs that are either marked as 'included' or NOT
263  /// marked as 'visible'. The 'visible' list is needed because
264  /// otherwise iteration would skip IDs that do not have GIs or
265  /// TIs (whichever is being iterated). To use this method, this
266  /// volume must have an ISAM file matching the negative ID list's
267  /// identifier type or an exception will be thrown.
268  ///
269  /// @param vol_start
270  /// The starting OID of this volume. [in]
271  /// @param vol_end
272  /// The first OID past the end of this volume. [in]
273  /// @param ids
274  /// The set of GI-OID pairs. [in|out]
277  void IdsToOids(int vol_start,
278  int vol_end,
279  CSeqDBNegativeList & ids);
280 
281 
282 
283  /// String translation
284  ///
285  /// A string id is translated to one or more OIDs. String ids are
286  /// used by some groups which produce sequence data. In some
287  /// cases, the string may correspond to more than one OID. For
288  /// this reason, the OIDs are returned in a vector. The string
289  /// provided is looked up in several ways. If it contains a pipe
290  /// character ("|") the data will be interpreted as a SeqID. This
291  /// routine can use faster lookup mechanisms if the simplification
292  /// routines were able to recognize the sequence as one of several
293  /// types that have numerical indices. The version_check flag is
294  /// needed to support sparse indexing. If version_check is true,
295  /// and the string has a version, and the lookup fails, this
296  /// method will try to remove the version and search again. On
297  /// return from this method version_check will be set to true if
298  /// and only if the first search failed and the versionless search
299  /// succeeded. CSeqDBVol::x_CheckVersions() can then be called to
300  /// verify the OIDs; see that method for more information about
301  /// this scenario.
302  ///
303  /// @param acc
304  /// The string to look up. [in]
305  /// @param oids
306  /// The returned oids. [out]
307  /// @param adjusted
308  /// Whether the simplification adjusted the string. [in]
309  /// @param version_check
310  /// If the version can be stripped [in] and if it was [out].
313  void StringToOids(const string & acc,
314  vector<TOid> & oids,
315  bool adjusted,
316  bool & version_check);
317 
318 
319 
320  /// Seq-id translation
321  ///
322  /// A Seq-id identifier (serialized to a string) is translated
323  /// into an OID. This routine will attempt to simplify the seqid
324  /// so as to use the faster numeric lookup techniques whenever
325  /// possible.
326  ///
327  /// @param acc
328  /// A string containing the Seq-id. [in]
329  /// @param oid
330  /// The returned oid. [out]
331  /// @param locked
332  /// The lock hold object for this thread. [in|out]
333  bool SeqidToOid(const string & acc, TOid & oid);//does not exist
334 
335  /// Sequence hash lookup
336  ///
337  /// This method tries to find sequences associated with a given
338  /// sequence hash value. The provided value is numeric but the
339  /// ISAM file uses a string format, because string searches can
340  /// return multiple results per key, and there may be multiple
341  /// OIDs for a given hash value due to identical sequences and
342  /// collisions.
343  ///
344  /// @param hash
345  /// The sequence hash value to look up. [in]
346  /// @param oids
347  /// The returned oids. [out]
348  /// @param locked
349  /// The lock hold object for this thread. [in|out]
350  void HashToOids(unsigned hash,
351  vector<TOid> & oids);
352 
353 
354 
355  /// Return any memory held by this object to the atlas.
356  void UnLease();
357 
358  /// Get Numeric Bounds.
359  ///
360  /// Fetch the lowest, highest, and total number of numeric keys in
361  /// the database index. If the operation fails, zero will be
362  /// returned for count.
363  ///
364  /// @param low_id Lowest numeric id value in database. [out]
365  /// @param high_id Highest numeric id value in database. [out]
366  /// @param count Number of numeric id values in database. [out]
367  /// @param locked Lock holder object for this thread. [in]
368  void GetIdBounds(Int8 & low_id,
369  Int8 & high_id,
370  int & count);
371 
372 
373 
374  /// Get String Bounds.
375  ///
376  /// Fetch the lowest, highest, and total number of string keys in
377  /// the database index. If the operation fails, zero will be
378  /// returned for count.
379  ///
380  /// @param low_id Lowest string id value in database. [out]
381  /// @param high_id Highest string id value in database. [out]
382  /// @param count Number of string id values in database. [out]
383  /// @param locked Lock holder object for this thread. [in]
384  void GetIdBounds(string & low_id,
385  string & high_id,
386  int & count);
387 
388 
389 
390  /// Check if a given ISAM index exists.
391  ///
392  /// @param dbname Base name of the database volume.
393  /// @param prot_nucl 'n' or 'p' for protein or nucleotide.
394  /// @param file_ext_char Identifier symbol; 's' for string, etc.
395  static bool IndexExists(const string & dbname,
396  char prot_nucl,
397  char file_ext_char);
398 
399 private:
400  /// Stores a key for an ISAM file.
401  ///
402  /// This class stores a key of either of the types used by ISAM
403  /// files (a number or a string), together with a flag recording
404  /// whether a key has been assigned, and offers ordering comparisons.
405 
406  class SIsamKey {
407  public:
408  // If case insensitive string comparisons are desired, the
409  // keys should be upcased before calling these methods.
410 
411  /// Constructor. The object starts unset, with the numeric key at -1.
413  : m_IsSet(false), m_NKey(-1)
414  {
415  }
416 
417  /// Returns true if this object has an assigned value.
418  bool IsSet()
419  {
420  return m_IsSet;
421  }
422 
423  /// Assign a numeric value to this object and mark it as set.
424  void SetNumeric(Int8 ident)
425  {
426  m_IsSet = true;
427  m_NKey = ident;
428  }
429 
430  /// Fetch the numeric value of this object.
431  Int8 GetNumeric() const
432  {
433  return m_NKey;
434  }
435 
436  /// Assign a string value to this object and mark it as set.
437  void SetString(const string & ident)
438  {
439  m_IsSet = true;
440  m_SKey = ident;
441  }
442 
443  /// Fetch the string value of this object (returned by value).
444  string GetString() const
445  {
446  return m_SKey;
447  }
448 
449  /// Returns true if a key is set and the provided integer compares
450  /// as lower than it (the stored key acting as the lower boundary).
452  {
453  return (m_IsSet && (ident < m_NKey));
454  }
455 
456  /// Returns true if a key is set and the provided string compares
457  /// as lower than it (the stored key acting as the lower boundary).
458  bool OutsideFirstBound(const string & ident)
459  {
460  return (m_IsSet && (ident < m_SKey));
461  }
462 
463  /// Returns true if a key is set and the provided integer compares
464  /// as higher than it (the stored key acting as the upper boundary).
465  bool OutsideLastBound(Int8 ident)
466  {
467  return (m_IsSet && (ident > m_NKey));
468  }
469 
470  /// Returns true if a key is set and the provided string compares
471  /// as higher than it (the stored key acting as the upper boundary).
472  bool OutsideLastBound(const string & ident)
473  {
474  return (m_IsSet && (ident > m_SKey));
475  }
476 
477  private:
478  /// True if this object has an assigned value.
479  bool m_IsSet;
480 
481  /// The key, if it is a number.
483 
484  /// The key, if it is a string.
485  string m_SKey;
486  };
487 
488  /// Exit conditions occurring in this code.
489  enum EErrorCode {
490  eNotFound = 1, ///< The key was not found
491  eNoError = 0, ///< Lookup was successful
492  eBadVersion = -10, ///< The format version of the ISAM file is unsupported.
493  eBadType = -11, ///< The requested ISAM type did not match the file.
494  eWrongFile = -12, ///< The file was not found, or was the wrong length.
495  eInitFailed = -13 ///< NOTE(review): text previously duplicated eWrongFile's; presumably initialization failed - confirm.
496  };
497 
498  /// Load and extract all index samples into arrays at once.
 ///
 /// Appends one key per sample (m_NumSamples in total) to keys and the
 /// matching page start offset to offs, followed by one final offset of
 /// m_NumTerms * m_TermSize, so offs ends up one element longer.
 /// NOTE(review): the listing line carrying the function name and the
 /// `lease` parameter is absent from this extract; the name x_LoadIndex
 /// is taken from the call in x_TranslateGiList.
499  template <class T>
501  vector<T> & keys,
502  vector<TIndx> & offs)
503  {
504  const char * keydatap = lease.GetFileDataPtr(m_KeySampleOffset);
505 
506 
507  for (int index=0; index < m_NumSamples; ++index) {
508  keys.push_back(static_cast<T>(x_GetNumericKey(keydatap)));
509  // vals.push_back(x_GetNumericData(keydatap));
 // NOTE(review): the product is evaluated in the operands' own type
 // before it is stored as TIndx - confirm it cannot overflow.
510  offs.push_back(index * m_PageSize * m_TermSize);
 // Samples are read m_TermSize bytes apart.
511  keydatap += m_TermSize;
512  }
513 
514  offs.push_back(m_NumTerms * m_TermSize);
515  }
516 
517  /// Load and extract a data page into arrays at once.
 ///
 /// Reads num_keys records starting at file offset begin, appending each
 /// record's key to keys and its data value to vals.  Nothing is cleared
 /// here, so callers wanting only this page must empty both vectors first.
 /// NOTE(review): the listing line carrying the function name and the
 /// `lease` parameter is absent from this extract; the name x_LoadData is
 /// taken from the call in x_TranslateGiList.
518  template <class T>
520  vector<T> & keys,
521  vector<int> & vals,
522  int num_keys,
523  TIndx begin)
524  {
525 
526  const char * keydatap = lease.GetFileDataPtr(begin);
527 
528 
529  for (int index=0; index < num_keys; ++index) {
530  keys.push_back(static_cast<T>(x_GetNumericKey(keydatap)));
531  vals.push_back(x_GetNumericData(keydatap));
 // Records are read m_TermSize bytes apart.
532  keydatap += m_TermSize;
533  }
534  }
535 
536 
537  /// GiList Translation
538  ///
539  /// Given a GI list, this routine finds the OID for each ID in the
540  /// list not already having a translation. Each value stored is the
541  /// one read from the data page plus vol_start.
542  ///
543  /// @param vol_start
544  /// The starting OID for this ISAM file's database volume. [in]
545  /// @param gis
546  /// The GI list to translate; an empty list is a no-op. [in|out]
547  ///
548  template <class T>
549  void x_TranslateGiList(int vol_start,
550  CSeqDBGiList & gis)
551 
552  {
553  int gilist_size = gis.GetSize<T>();
554  if (! gilist_size) return;
555 
557 
 // NOTE(review): two listing lines are absent from this extract (one
 // before this check and the first line of the statement inside it).
558  if(m_Initialized == false) {
560  eArgErr,
561  "Error: Unable to use ISAM index in batch mode.");
562  }
563 
564  vector<T> sample_keys;
565  vector<TIndx> page_offs;
566  vector<T> keys;
567  vector<int> vals;
568 
 // Sized for all samples (plus the trailing offset) and for one page.
569  sample_keys.reserve(m_NumSamples);
570  page_offs.reserve(m_NumSamples + 1);
571  keys.reserve(m_PageSize);
572  vals.reserve(m_PageSize);
573 
574  x_LoadIndex(m_IndexLease, sample_keys, page_offs);
575 
576  int gilist_index = 0;
577  int sample_index = 0;
578 
579  while((gilist_index < gilist_size) && (sample_index < m_NumSamples)) {
580 
 // Move to the first untranslated list entry at or after this sample.
581  s_AdvanceGiList<T>(gis, gilist_index, gilist_size,
582  sample_keys[sample_index]);
583 
584  if (gilist_index >= gilist_size) break;
585 
 // Move to the last sample that is <= that list entry's key.
586  s_AdvanceKeyList<T>(sample_keys, sample_index, m_NumSamples,
587  gis.GetKey<T>(gilist_index));
588 
589  // Now we should be ready to search a data block.
590  keys.clear();
591  vals.clear();
592 
 // Every page holds m_PageSize terms except the last, which holds
 // whatever remains of m_NumTerms.
593  int num_keys = m_PageSize;
594  if (sample_index + 1 == m_NumSamples) {
595  num_keys = m_NumTerms - sample_index * m_PageSize;
596  }
597  x_LoadData(m_DataLease, keys, vals, num_keys, page_offs[sample_index]);
598 
599  int index = 0;
600 
 // Walk the page and the list in step, alternating between advancing
 // the page position and advancing the list position.
601  while ((gilist_index < gilist_size) && (index < num_keys)) {
602 
603  s_AdvanceKeyList<T>(keys, index, num_keys,
604  gis.GetKey<T>(gilist_index));
605 
 // Only list entries whose key equals keys[index] receive a value.
606  s_SetTranslation<T>(gis, gilist_index, gilist_size,
607  keys[index], vals[index] + vol_start);
608 
609  ++index;
610  if (index >= num_keys) break;
611 
612  s_AdvanceGiList<T>(gis, gilist_index, gilist_size, keys[index]);
613 
614  s_SetTranslation<T>(gis, gilist_index, gilist_size,
615  keys[index], vals[index] + vol_start);
616 
617  }
618 
619  // Either the GI list is exhausted (the outer condition ends the loop)
620  // or this page is finished, so move on to the next sample.
621  ++sample_index;
622  }
623  }
624 
625  /// Numeric identifier lookup
626  ///
627  /// Given a numeric identifier, this routine finds the OID.
628  ///
629  /// @param id
630  /// The GI or PIG identifier to look up.
631  /// @param oid
632  /// The returned oid.
633  /// @param locked
634  /// The lock holder object for this thread.
635  /// @return
636  /// true if the identifier was found.
637  bool x_IdentToOid(Int8 id,
638  TOid & oid);
639 
640 
641  /// Index file search
642  ///
643  /// Given a numeric identifier, this routine finds the OID or the
644  /// page in the data file where the OID can be found.
645  ///
646  /// @param Number
647  /// The GI or PIG identifier to look up.
648  /// @param Data
649  /// The returned OID.
650  /// @param Index
651  /// The returned location in the ISAM table, or NULL.
652  /// @param SampleNum
653  /// The returned location in the data file if not done.
654  /// @param done
655  /// true if the OID was found.
658  /// @return
659  /// A non-zero error on failure, or eNoError on success.
660  EErrorCode
661  x_SearchIndexNumeric(Int8 Number,
662  int * Data,
663  Uint4 * Index,
664  Int4 & SampleNum,
665  bool & done);
666 
667 
668  /// Negative ID List Translation
669  ///
670  /// Given a Negative ID list, this routine turns on the bits for
671  /// the OIDs found in the volume but not in the negated ID list.
672  ///
673  /// @param vol_start
674  /// The starting OID for this ISAM file's database volume.
675  /// @param vol_end
676  /// The ending OID for this ISAM file's database volume.
677  /// @param gis
678  /// The Negative ID list to translate.
679  /// @param use_tis
680  /// Iterate over TIs if true (GIs otherwise).
681  /// @param locked
682  /// The lock holder object for this thread.
683  void
684  x_SearchNegativeMulti(int vol_start,
685  int vol_end,
686  CSeqDBNegativeList & gis,
687  bool use_tis);
688 
689 
690  void
691  x_SearchNegativeMultiSeq(int vol_start,
692  int vol_end,
693  CSeqDBNegativeList & gis);
694 
695 
696 
697  /// Data file search
698  ///
699  /// Given a numeric identifier, this routine finds the OID in the
700  /// data file.
701  ///
702  /// @param Number
703  /// The GI or PIG identifier to look up.
704  /// @param Data
705  /// The returned OID.
706  /// @param Index
707  /// The returned location in the ISAM table, or NULL.
708  /// @param SampleNum
709  /// The location of the page in the data file to search.
710  /// @param locked
711  /// The lock holder object for this thread.
712  /// @return
713  /// A non-zero error on failure, or eNoError on success.
714  EErrorCode
715  x_SearchDataNumeric(Int8 Number,
716  int * Data,
717  Uint4 * Index,
718  Int4 SampleNum);
719 
720 
721  /// Numeric identifier lookup
722  ///
723  /// Given a numeric identifier, this routine finds the OID.
724  ///
725  /// @param Number
726  /// The GI or PIG identifier to look up.
727  /// @param Data
728  /// The returned OID.
729  /// @param Index
730  /// The returned location in the ISAM table, or NULL.
731  /// @param locked
732  /// The lock holder object for this thread.
733  /// @return
734  /// A non-zero error on failure, or eNoError on success.
735  EErrorCode
736  x_NumericSearch(Int8 Number,
737  int * Data,
738  Uint4 * Index);
739 
740 
741  /// String identifier lookup
742  ///
743  /// Given a string identifier, this routine finds the OID(s).
744  ///
745  /// @param term_in
746  /// The string identifier to look up.
747  /// @param term_out
748  /// The returned keys (as strings).
749  /// @param value_out
750  /// The returned oids (as strings).
751  /// @param index_out
752  /// The locations where the matches were found.
753  /// @param locked
754  /// The lock holder object for this thread.
755  /// @return
756  /// A non-zero error on failure, or eNoError on success.
757  EErrorCode
758  x_StringSearch(const string & term_in,
759  vector<string> & term_out,
760  vector<string> & value_out,
761  vector<TIndx> & index_out);
762 
763 
764  /// Initialize the search object
765  ///
766  /// The first identifier search sets up the object by calling this
767  /// function, which reads the metadata from the index file and
768  /// sets all the fields needed for ISAM lookups.
769  ///
770  /// @param locked
771  /// The lock holder object for this thread.
772  /// @return
773  /// A non-zero error on failure, or eNoError on success.
774  EErrorCode
775  x_InitSearch(void);
776 
777  /// Determine the number of elements in the data page.
778  ///
779  /// The number of elements is determined based on whether this is
780  /// the last page and the configured page size.
781  ///
782  /// @param SampleNum
783  /// Which data page will be searched.
784  /// @param Start
785  /// The returned index of the start of the page.
786  /// @return
787  /// The number of elements in this data page.
788  int x_GetPageNumElements(Int4 SampleNum,
789  Int4 * Start);
790 
791  /// Lookup a string in a sparse table
792  ///
793  /// This does string lookup in a sparse string table. There is no
794  /// support (code) for this since there are currently no examples
795  /// of this kind of table to test against.
796  ///
797  /// @param acc
798  /// The string to look up.
799  /// @param oids
800  /// The returned oids found by the search.
801  /// @param adjusted
802  /// Whether the key was changed by the identifier simplification logic.
803  /// @param locked
804  /// The lock holder object for this thread.
805  /// @return
806  /// true if results were found
807  bool x_SparseStringToOids(const string & acc,
808  vector<int> & oids,
809  bool adjusted);
810 
811 
812  /// Find the first character to differ in two strings
813  ///
814  /// This finds the index of the first character to differ in
815  /// meaningful way between two strings. One of the strings is a
816  /// term that is passed in; the other is assumed to be located in
817  /// the ISAM table, a lease to which is passed to this function.
818  ///
819  /// @param term_in
820  /// The key string to compare against.
821  /// @param lease
822  /// A lease to hold the data in the ISAM table file.
823  /// @param file_name
824  /// The name of the ISAM file to work with.
825  /// @param file_length
826  /// The length of the file named by file_name.
827  /// @param at_least
828  /// Try to get at least this many bytes.
829  /// @param KeyOffset
830  /// The location of the key in the leased file.
831  /// @param ignore_case
832  /// Whether character case should be ignored in the comparison.
835  /// @return
836  /// The position of the first difference.
837  int
838  x_DiffCharLease(const string & term_in,
839  CSeqDBFileMemMap & lease,
840  const string & file_name,
841  TIndx file_length,
842  Uint4 at_least,
843  TIndx KeyOffset,
844  bool ignore_case);
845 
846 
847  /// Find the first character to differ in two strings
848  ///
849  /// This finds the index of the first character to differ in
850  /// meaningful way between two strings. One of the strings is a
851  /// term that is passed in; the other is a range of memory
852  /// represented by two pointers.
853  ///
854  /// @param term_in
855  /// The key string to compare against.
856  /// @param begin
857  /// A pointer to the start of the second string.
858  /// @param end
859  /// A pointer to the end of the second string.
860  /// @param ignore_case
861  /// Whether character case should be ignored in the comparison.
862  /// @return
863  /// The position of the first difference.
864  int
865  x_DiffChar(const string & term_in,
866  const char * begin,
867  const char * end,
868  bool ignore_case);
869 
870  /// Extract the data from a key-value pair in memory.
871  ///
872  /// Given pointers to a location in mapped memory, and the end of
873  /// the mapped data, this finds the key and data values for the
874  /// object at that location.
875  ///
876  /// @param key_start
877  /// A pointer to the beginning of the key-value pair in memory.
878  /// @param entry_end
879  /// A pointer to the end of the mapped area of memory.
880  /// @param key_out
881  /// A string holding the ISAM entry's key
882  /// @param data_out
883  /// A string holding the ISAM entry's value
884  void x_ExtractData(const char * key_start,
885  const char * entry_end,
886  vector<string> & key_out,
887  vector<string> & data_out);
888 
889  /// Get the offset of the specified sample.
890  ///
891  /// For string ISAM indices, the index file contains a table of
892  /// offsets of the index file samples. This function gets the
893  /// offset of the specified sample in the index file's table.
894  ///
895  /// @param sample_offset
896  /// The offset into the file of the set of samples.
897  /// @param sample_num
898  /// The index of the sample to get.
899  /// @param locked
900  /// This thread's lock holder object.
901  /// @return
902  /// The offset of the sample in the index file.
903  TIndx x_GetIndexKeyOffset(TIndx sample_offset,
904  Uint4 sample_num);
905 
906  /// Read a string from the index file.
907  ///
908  /// Given an offset into the index file, and a maximum length,
909  /// this function returns the bytes in a string object.
910  ///
911  /// @param key_offset
912  /// The offset into the file of the first byte.
913  /// @param length
914  /// The maximum number of bytes to get.
915  /// @param prefix
916  /// The string in which to return the data.
917  /// @param trim_to_null
918  /// Whether to search for a null and return only that much data.
919  /// @param locked
920  /// This thread's lock holder object.
921  void x_GetIndexString(TIndx key_offset,
922  int length,
923  string & prefix,
924  bool trim_to_null);
925 
926 
927  /// Find the first character to differ in two strings
928  ///
929  /// This finds the index of the first character to differ between
930  /// two strings. The first string is provided, the second is one
931  /// of the sample strings, indicated by the index of that sample
932  /// value.
933  ///
934  /// @param term_in
935  /// The key string to compare against.
936  /// @param SampleNum
937  /// Selects which sample to compare with.
938  /// @param KeyOffset
939  /// The returned offset of the key that was used.
940  /// @param locked
941  /// This thread's lock holder object.
942  int x_DiffSample(const string & term_in,
943  Uint4 SampleNum,
944  TIndx & KeyOffset);
945 
946 
947  /// Find matches in the given page of a string ISAM file.
948  ///
949  /// This searches the area around a specific page of the data file
950  /// to find all matches to term_in. The results are returned in
951  /// vectors. This method may search multiple pages.
952  ///
953  /// @param term_in
954  /// The key string to compare against.
955  /// @param sample_index
956  /// Selects which page to search.
957  /// @param indices_out
958  /// The index of each match.
959  /// @param keys_out
960  /// The key of each match.
961  /// @param data_out
962  /// The value of each match.
963  /// @param locked
964  /// This thread's lock holder object.
965  void x_ExtractAllData(const string & term_in,
966  TIndx sample_index,
967  vector<TIndx> & indices_out,
968  vector<string> & keys_out,
969  vector<string> & data_out);
970 
971 
972  /// Find matches in the given memory area of a string ISAM file.
973  ///
974  /// This searches the specified section of memory to find all
975  /// matches to term_in. The results are returned in vectors.
976  ///
977  /// @param term_in
978  /// The key string to compare against.
979  /// @param page_index
980  /// Selects which page to search.
981  /// @param beginp
982  /// Pointer to the start of the memory area
983  /// @param endp
984  /// Pointer to the end of the memory area
985  /// @param indices_out
986  /// The index of each match.
987  /// @param keys_out
988  /// The key of each match.
989  /// @param data_out
990  /// The value of each match.
991  void x_ExtractPageData(const string & term_in,
992  TIndx page_index,
993  const char * beginp,
994  const char * endp,
995  vector<TIndx> & indices_out,
996  vector<string> & keys_out,
997  vector<string> & data_out);
998 
999  /// Map a page into memory
1000  ///
1001  /// Given two indices, this method maps into memory the area
1002  /// starting at the beginning of the first index and extending to
1003  /// the end of the other. (If the indices are equal, only one
1004  /// page would be mapped.)
1005  ///
1006  /// @param SampleNum1
1007  /// The first page index.
1008  /// @param SampleNum2
1009  /// The second page index.
1010  /// @param beginp
1011  /// The returned starting offset of the mapped area.
1012  /// @param endp
1013  /// The returned ending offset of the mapped area.
1016  void x_LoadPage(TIndx SampleNum1,
1017  TIndx SampleNum2,
1018  const char ** beginp,
1019  const char ** endp);
1020 
1021 
1022  /// Test a sample key value from a numeric index.
1023  ///
1024  /// This method reads the key value of an index file sample
1025  /// element from a numeric index file. The calling code should
1026  /// ensure that the data is mapped in, and that the file type is
1027  /// correct. The key value found will be compared to the search
1028  /// key. This method will return 0 for an exact match, -1 if the
1029  /// key is less than the sample, or 1 if the key is greater. If
1030  /// the match is exact, it will also return the data in data_out.
1031  ///
1032  /// @param index_lease
1033  /// The memory lease to use with the index file.
1034  /// @param index
1035  /// The index of the sample to get.
1036  /// @param key_in
1037  /// The key for which the user is searching.
1038  /// @param key_out
1039  /// The key found will be returned here.
1040  /// @param data_out
1041  /// If an exact match, the data found will be returned here.
1042  /// @return
1043  /// -1, 0 or 1 when key_in is less than, equal to, or greater than key_out.
1044  int x_TestNumericSample(CSeqDBFileMemMap & index_lease,
1045  int index,
1046  Int8 key_in,
1047  Int8 & key_out,
1048  int & data_out);
1049 
1050  /// Get a sample key value from a numeric index.
1051  ///
1052  /// Given the index of a sample value, this code will get the key.
1053  /// If data values are stored in the index file, the corresponding
1054  /// data value will also be returned. The offset of the data
1055  /// block is computed and returned as well.
1056  ///
1057  /// @param index_lease
1058  /// The memory lease to use with the index file.
1059  /// @param index
1060  /// The index of the sample to get.
1061  /// @param key_out
1062  /// The key found will be returned here.
1063  /// @param data_out
1064  /// The data value read from the sample will be returned here.
1065  void x_GetNumericSample(CSeqDBFileMemMap & index_lease,
1066  int index,
1067  Int8 & key_out,
1068  int & data_out);
1069 
1070  /// Find ID in the negative GI list using PBS.
1071  ///
1072  /// Use parabolic binary search to find the specified ID in the
1073  /// negative ID list. The 'index' value is the index to start the
1074  /// search at (this must refer to an index at or before the target
1075  /// data if the search is to succeed). Whether the search was
1076  /// successful or not, the index will be moved forward past any
1077  /// elements with values less than 'key'.
1078  ///
1079  /// @param ids Negative ID list. [in|out]
1080  /// @param index Index into negative ID list. [in|out]
1081  /// @param key Key for which to search. [in]
1082  /// @param use_tis If true, search for a TI, else for a GI. [in]
1083  /// @return True if the search found the ID.
1084  inline bool
1085  x_FindInNegativeList(CSeqDBNegativeList & ids,
1086  int & index,
1087  Int8 key,
1088  bool use_tis);
1089 
1090  inline bool
1091  x_FindInNegativeList(CSeqDBNegativeList & ids,
1092  int & index,
1093  string key);
1094 
1095 
1096  /// Map a data page.
1097  ///
1098  /// The caller provides an index into the sample file. The page
1099  /// of data is mapped, and a pointer is returned. In addition,
1100  /// the starting index (start) of the data is returned, along with
1101  /// the number of elements in that page.
1102  ///
1103  /// @param sample_index Index into the index (i.e. pni) file. [in]
1104  /// @param start Index of first element of the page. [out]
1105  /// @param num_elements Number of elements in the page. [out]
1106  /// @param data_page_begin Pointer to the returned data. [out]
1108  void x_MapDataPage(int sample_index,
1109  int & start,
1110  int & num_elements,
1111  const void ** data_page_begin);
1112 
1113 
1114  /// Get a particular data element from a data page.
1115  /// @param dpage A pointer to that page in memory. [in]
1116  /// @param index The index of the element to fetch. [in]
1117  /// @param key The returned key. [out]
1118  /// @param data The returned value. [out]
1119  void x_GetDataElement(const void * dpage,
1120  int index,
1121  Int8 & key,
1122  int & data);
1123 
1124  void x_GetDataElement(const void * dpage,
1125  int index,
1126  string & key,
1127  int & data);
1128 
1129  /// Find the least and greatest keys in this ISAM file.
1130  void x_FindIndexBounds();
1131 
1132  /// Check whether a numeric key is within this volume's bounds.
1133  /// @param key The key for which to do the check.
1135  bool x_OutOfBounds(Int8 key);
1136 
1137  /// Check whether a string key is within this volume's bounds.
1138  /// @param key The key for which to do the check.
1140  bool x_OutOfBounds(string key);
1141 
/// Converts a string to lower case, in place.
///
/// Each character is converted to unsigned char before it is handed to
/// tolower(), so a byte with the high bit set is never passed as a
/// negative value (which is undefined for the C library's tolower(int)).
///
/// @param s The string to convert. [in|out]
static void x_Lower(string & s)
{
    for(size_t i = 0; i < s.size(); i++) {
        const unsigned char ch = static_cast<unsigned char>(s[i]);
        s[i] = static_cast<char>(tolower(ch));
    }
}
1149 
1150  /// Fetch a GI or TI from a GI list.
1151  static Int8 x_GetId(CSeqDBNegativeList & ids, int index, bool use_tis)
1152  {
1153  return (use_tis
1154  ? ids.GetTi(index)
1155  : GI_TO(Int8, ids.GetGi(index)));
1156  }
1157 
1158  static string x_GetId(CSeqDBNegativeList & ids, int index)
1159  {
1160  return (ids.GetSi(index));
1161  }
1162 
1163 
1164  /// Make filenames for ISAM file.
1165  ///
1166  /// @param dbname Base name of the database volume. [in]
1167  /// @param prot_nucl 'n' or 'p' for protein or nucleotide. [in]
1168  /// @param file_ext_char Identifier symbol; 's' for string, etc. [in]
1169  /// @param index_name Filename of ISAM index file. [out]
1170  /// @param data_name Filename of ISAM data file. [out]
1171  static void x_MakeFilenames(const string & dbname,
1172  char prot_nucl,
1173  char file_ext_char,
1174  string & index_name,
1175  string & data_name);
1176 
1177  // Data
1178 
1179  /// The memory management layer
1181 
1182  /// The type of identifier this class uses
1184 
1185  /// A persistent lease on the ISAM index file.
1187 
1188 
1189  /// A persistent lease on the ISAM data file.
1191 
1192 
1193  /// The format type of database files found (eNumeric or eString).
1194  int m_Type;
1195 
1196  /// The filename of the ISAM data file.
1197  string m_DataFname;
1198 
1199  /// The filename of the ISAM index file.
1201 
1202  /// The length of the ISAM data file.
1204 
1205  /// The length of the ISAM index file.
1207 
1208  /// Number of terms in database
1210 
1211  /// Number of terms in ISAM index
1213 
1214  /// Page size of ISAM index
1216 
1217  /// Maximum string length in the database
1219 
1220  /// Options set by upper layer
1222 
1223  /// Flag indicating whether initialization has been done.
1225 
1226  /// Offset of samples in index file.
1228 
1229  /// Check if data for String ISAM sorted
1231 
1232  /// Pointer to index file if no memmap.
1233  char * m_FileStart;
1234 
1235  /// First and last offsets of last page.
1237 
1238  /// First and last offsets of last page.
1240 
1241  /// First volume key
1243 
1244  /// Last volume key
1246 
1247  /// Use Uint8 for the key
1248  bool m_LongId;
1249 
1250  /// size of the numeric key-data pair
1252 
1253  Uint8 x_GetNumericKey(const void *p) {
1254  if (m_LongId)
1255  return((Uint8) SeqDB_GetStdOrd((Uint8 *)p));
1256  else
1257  return((Uint4) SeqDB_GetStdOrd((Uint4 *)p));
1258  }
1259 
1260  int x_GetNumericData(const void *p) {
1261  if (m_LongId)
1262  return((int) SeqDB_GetStdOrd(((Uint4 *)p)+2));
1263  else
1264  return((int) SeqDB_GetStdOrd(((Uint4 *)p)+1));
1265  }
1266 
1267  void x_LoadStringData(const char *begin,
1268  string & key,
1269  int & data)
1270  {
1271  const char * keydatap = begin;
1272  const char * key_begin = keydatap;
1273  while (*keydatap != 0x02) ++keydatap;
1274  key = string(key_begin, keydatap);
1275 
1276  key_begin = ++keydatap;
1277  while (*keydatap != 0x0a) ++keydatap;
1278  data = NStr::StringToUInt(string(key_begin, keydatap));
1279  }
1280 
1281 };
1282 
1283 inline int
1285  int index,
1286  Int8 key_in,
1287  Int8 & key_out,
1288  int & data_out)
1289 {
1290 
1291  const void * keydatap = 0;
1292 
1293  TIndx offset_begin = m_KeySampleOffset + (m_TermSize * index);
1294 
1295 
1296  keydatap = index_lease.GetFileDataPtr(offset_begin);
1297 
1298  key_out = x_GetNumericKey(keydatap);
1299 
1300  int rv = 0;
1301 
1302  if (key_in < key_out) {
1303  rv = -1;
1304  } else if (key_in > key_out) {
1305  rv = 1;
1306  } else {
1307  rv = 0;
1308  data_out = x_GetNumericData(keydatap);
1309  }
1310 
1311  return rv;
1312 }
1313 
1314 inline void
1316  int index,
1317  Int8 & key_out,
1318  int & data_out)
1319 {
1320  const void * keydatap = 0;
1321 
1322  TIndx offset_begin = m_KeySampleOffset + (m_TermSize * index);
1323 
1324 
1325  keydatap = index_lease.GetFileDataPtr(offset_begin);
1326 
1327  key_out = x_GetNumericKey(keydatap);
1328  data_out = x_GetNumericData(keydatap);
1329 }
1330 
1331 /// Load and extract all index samples into array at once
1332 template <>
1333 inline void CSeqDBIsam::x_LoadIndex<TGi>(
1334  CSeqDBFileMemMap & lease,
1335  vector<TGi> & keys,
1336  vector<TIndx> & offs
1337 )
1338 {
1339 
1340  const char * keydatap = lease.GetFileDataPtr(m_KeySampleOffset);
1341 
1342  for (int index=0; index < m_NumSamples; ++index) {
1343  keys.push_back(GI_FROM(Uint8, x_GetNumericKey(keydatap)));
1344  offs.push_back(index * m_PageSize * m_TermSize);
1345  keydatap += m_TermSize;
1346  }
1347 
1348  offs.push_back(m_NumTerms * m_TermSize);
1349 }
1350 
1351 /// Load and extract a data page into array at once
1352 template <>
1353 inline void CSeqDBIsam::x_LoadData<TGi>(
1354  CSeqDBFileMemMap & lease,
1355  vector<TGi> & keys,
1356  vector<int> & vals,
1357  int num_keys,
1358  TIndx begin
1359 )
1360 {
1361 
1362  const char * keydatap = lease.GetFileDataPtr(begin);
1363 
1364  for (int index=0; index < num_keys; ++index) {
1365  keys.push_back(GI_FROM(Uint8, x_GetNumericKey(keydatap)));
1366  vals.push_back(x_GetNumericData(keydatap));
1367  keydatap += m_TermSize;
1368  }
1369 }
1370 
1371 template <> inline void
1372 CSeqDBIsam::x_LoadIndex<string>(CSeqDBFileMemMap & lease,
1373  vector<string> & keys,
1374  vector<TIndx> & offs)
1375 {
1376  TIndx offset_begin = m_KeySampleOffset;
1377  TIndx sample_begin = offset_begin + sizeof(Uint4) * (m_NumSamples + 1);
1378 
1379  // load offset array
1380  const Uint4 * offset = (const Uint4 *) lease.GetFileDataPtr(offset_begin);
1381  for (int index=0; index <= m_NumSamples; ++index, ++offset) {
1382  // Get the data_offsets
1383  offs.push_back(SeqDB_GetStdOrd((Uint4*) offset));
1384  }
1385 
1386  // load sample array
1387  offset = (const Uint4 *) lease.GetFileDataPtr(sample_begin);
1388  for (int index=0; index < m_NumSamples; ++index, ++offset) {
1389  // Get the index_offsets
1390  offset_begin = SeqDB_GetStdOrd((Uint4*) offset);
1391 
1392  // Lookup the samples
1393  const char * keydatap = (const char *) lease.GetFileDataPtr(offset_begin) - 1;
1394 
1395 
1396  const char * key_begin = ++ keydatap;
1397  while (*keydatap != 0x02) ++keydatap;
1398  keys.push_back(string(key_begin, keydatap));
1399 
1400  /* key_begin = ++keydatap;
1401  while (*keydatap != 0x00) ++keydatap;
1402  vals.push_back(NStr::StringToUInt(string(key_begin, keydatap))); */
1403  }
1404 }
1405 
1406 template <> inline void
1407 CSeqDBIsam::x_LoadData<string>(CSeqDBFileMemMap & lease,
1408  vector<string> & keys,
1409  vector<int> & vals,
1410  int num_keys,
1411  TIndx begin)
1412 {
1413  const char * keydatap = (const char *) lease.GetFileDataPtr(begin) - 1;
1414 
1415  for (int index=0; index < num_keys; ++index) {
1416 
1417  const char * key_begin = ++keydatap;
1418  while (*keydatap != 0x02) ++keydatap;
1419  keys.push_back(string(key_begin, keydatap));
1420 
1421  key_begin = ++keydatap;
1422  while (*keydatap != 0x0a) ++keydatap;
1423  vals.push_back(NStr::StringToUInt(string(key_begin, keydatap)));
1424  }
1425 }
1426 
1427 inline bool
1429  int & index,
1430  Int8 key,
1431  bool use_tis)
1432 {
1433  bool found = false;
1434 
1435  // Skip any that are less than key.
1436  int ids_size = ids.ListSize();
1437 
1438  while((index < ids_size) && (x_GetId(ids, index, use_tis) < key)) {
1439  index++;
1440 
1441  int jump = 2;
1442 
1443  while((index + jump) < ids_size &&
1444  x_GetId(ids, index + jump, use_tis) < key) {
1445  index += jump;
1446  jump += jump;
1447  }
1448  }
1449 
1450  // Check whether the GI or TI was found.
1451 
1452  if ((index < ids_size) && (x_GetId(ids,index,use_tis) == key)) {
1453  found = true;
1454  }
1455 
1456  return found;
1457 }
1458 
1459 
1460 inline bool
1462  int & index,
1463  string key)
1464 {
1465  bool found = false;
1466 
1467  // Skip any that are less than key.
1468  int ids_size = ids.ListSize();
1469 
1470  while((index < ids_size) && (x_GetId(ids, index) < key)) {
1471  index++;
1472 
1473  int jump = 2;
1474 
1475  while((index + jump) < ids_size &&
1476  x_GetId(ids, index + jump) < key) {
1477  index += jump;
1478  jump += jump;
1479  }
1480  }
1481 
1482  // Check whether the GI or TI was found.
1483 
1484  if ((index < ids_size) && (x_GetId(ids,index) == key)) {
1485  found = true;
1486  }
1487 
1488  return found;
1489 }
1490 
1491 
1492 inline void
1493 CSeqDBIsam::x_MapDataPage(int sample_index,
1494  int & start,
1495  int & num_elements,
1496  const void ** data_page_begin)
1497 
1498 {
1499  num_elements =
1500  x_GetPageNumElements(sample_index, & start);
1501 
1502  TIndx offset_begin = start * m_TermSize;
1503  //TIndx offset_end = offset_begin + m_TermSize * num_elements;
1504 
1505  *data_page_begin = m_DataLease.GetFileDataPtr(m_DataFname,offset_begin);
1506 }
1507 
1508 inline void
1509 CSeqDBIsam::x_GetDataElement(const void * dpage,
1510  int index,
1511  Int8 & key,
1512  int & data)
1513 {
1514  key = x_GetNumericKey ((char *)dpage + index * m_TermSize);
1515  data = x_GetNumericData((char *)dpage + index * m_TermSize);
1516 }
1517 
1518 inline void
1519 CSeqDBIsam::x_GetDataElement(const void * dpage,
1520  int index,
1521  string & key,
1522  int & data)
1523 {
1524  x_LoadStringData((char *)dpage + index * m_TermSize,
1525  key,
1526  data);
1527 }
1528 
1530 
1531 #endif // OBJTOOLS_READERS_SEQDB__SEQDBISAM_HPP
CObject –.
Definition: ncbiobj.hpp:180
CSeqDBAtlas class.
Definition: seqdbatlas.hpp:297
CNcbiStreamoff TIndx
The type used for file offsets.
Definition: seqdbatlas.hpp:301
CSeqDBException.
Definition: seqdbcommon.hpp:73
const char * GetFileDataPtr(const string &fname, TIndx offset)
Get a pointer to the specified offset.
Definition: seqdbatlas.hpp:755
CSeqDBGiList.
bool IsValueSet(int index) const
T GetKey(int index) const
void SetValue(int index, int oid)
int GetSize() const
@ eGi
The array is sorted by GI.
void InsureOrder(ESortOrder order)
Sort if necessary to insure order of elements.
Stores a key for an ISAM file.
Definition: seqdbisam.hpp:406
bool OutsideLastBound(Int8 ident)
Returns true if the provided integer compares as higher than the assigned upper boundary for this ISA...
Definition: seqdbisam.hpp:465
string m_SKey
The key, if it is a string.
Definition: seqdbisam.hpp:485
bool IsSet()
Returns true if this object has an assigned value.
Definition: seqdbisam.hpp:418
string GetString() const
Fetch the string value of this object.
Definition: seqdbisam.hpp:444
void SetString(const string &ident)
Assign a string value to this object.
Definition: seqdbisam.hpp:437
Int8 m_NKey
The key, if it is a number.
Definition: seqdbisam.hpp:482
void SetNumeric(Int8 ident)
Assign a numeric value to this object.
Definition: seqdbisam.hpp:424
Int8 GetNumeric() const
Fetch the numeric value of this object.
Definition: seqdbisam.hpp:431
bool m_IsSet
True if this object has an assigned value.
Definition: seqdbisam.hpp:479
bool OutsideLastBound(const string &ident)
Returns true if the provided string compares as higher than the assigned upper boundary for this ISAM ...
Definition: seqdbisam.hpp:472
bool OutsideFirstBound(Int8 ident)
Returns true if the provided integer compares as lower than the assigned lower boundary for this ISAM...
Definition: seqdbisam.hpp:451
SIsamKey()
Constructor.
Definition: seqdbisam.hpp:412
bool OutsideFirstBound(const string &ident)
Returns true if the provided string compares as lower than the assigned lower boundary for this ISAM ...
Definition: seqdbisam.hpp:458
CSeqDBIsam.
Definition: seqdbisam.hpp:127
Int4 m_LastOffset
First and last offset's of last page.
Definition: seqdbisam.hpp:1239
bool IdToOid(Int8 id, TOid &oid)
GI or TI translation.
Definition: seqdbisam.hpp:225
EIsamDbType
Types of database this class can access.
Definition: seqdbisam.hpp:133
CSeqDBGiList::SGiOid TGiOid
Import the type representing one GI, OID association.
Definition: seqdbisam.hpp:130
CSeqDBFileMemMap m_DataLease
A persistent lease on the ISAM data file.
Definition: seqdbisam.hpp:1190
TIndx m_IndexFileLength
The length of the ISAM index file.
Definition: seqdbisam.hpp:1206
bool m_LongId
Use Uint8 for the key.
Definition: seqdbisam.hpp:1248
int x_GetPageNumElements(Int4 SampleNum, Int4 *Start)
Determine the number of elements in the data page.
Definition: seqdbisam.cpp:123
Int8 TId
Type large enough to hold any numerical ID.
Definition: seqdbisam.hpp:158
ESeqDBIdType m_IdentType
The type of identifier this class uses.
Definition: seqdbisam.hpp:1183
SIsamKey m_LastKey
Last volume key.
Definition: seqdbisam.hpp:1245
static string x_GetId(CSeqDBNegativeList &ids, int index)
Definition: seqdbisam.hpp:1158
void x_TranslateGiList(int vol_start, CSeqDBGiList &gis)
GiList Translation.
Definition: seqdbisam.hpp:549
Int4 m_IdxOption
Options set by upper layer.
Definition: seqdbisam.hpp:1221
void x_LoadData(CSeqDBFileMemMap &lease, vector< T > &keys, vector< int > &vals, int num_keys, TIndx begin)
Load and extract a data page into array at once.
Definition: seqdbisam.hpp:519
void x_LoadStringData(const char *begin, string &key, int &data)
Definition: seqdbisam.hpp:1267
Int8 TTi
PIG identifiers for numeric indices over protein volumes.
Definition: seqdbisam.hpp:155
void x_GetNumericSample(CSeqDBFileMemMap &index_lease, int index, Int8 &key_out, int &data_out)
Get a sample key value from a numeric index.
Definition: seqdbisam.hpp:1315
void x_GetDataElement(const void *dpage, int index, Int8 &key, int &data)
Get a particular data element from a data page.
Definition: seqdbisam.hpp:1509
Int4 m_NumSamples
Number of terms in ISAM index.
Definition: seqdbisam.hpp:1212
int x_TestNumericSample(CSeqDBFileMemMap &index_lease, int index, Int8 key_in, Int8 &key_out, int &data_out)
Test a sample key value from a numeric index.
Definition: seqdbisam.hpp:1284
EErrorCode
Exit conditions occurring in this code.
Definition: seqdbisam.hpp:489
bool PigToOid(TPig pig, TOid &oid)
PIG translation.
Definition: seqdbisam.hpp:203
Uint8 x_GetNumericKey(const void *p)
Definition: seqdbisam.hpp:1253
void x_LoadIndex(CSeqDBFileMemMap &lease, vector< T > &keys, vector< TIndx > &offs)
Load and extract all index samples into array at once.
Definition: seqdbisam.hpp:500
Int4 m_NumTerms
Number of terms in database.
Definition: seqdbisam.hpp:1209
int TOid
This class works with OIDs relative to a specific volume.
Definition: seqdbisam.hpp:146
bool m_Initialized
Flag indicating whether initialization has been done.
Definition: seqdbisam.hpp:1224
bool SeqidToOid(const string &acc, TOid &oid)
Seq-id translation.
static void x_Lower(string &s)
Converts a string to lower case.
Definition: seqdbisam.hpp:1143
Int4 m_PageSize
Page size of ISAM index.
Definition: seqdbisam.hpp:1215
TIndx m_DataFileLength
The length of the ISAM data file.
Definition: seqdbisam.hpp:1203
int m_Type
The format type of database files found (eNumeric or eString).
Definition: seqdbisam.hpp:1194
bool m_TestNonUnique
Check if data for String ISAM sorted.
Definition: seqdbisam.hpp:1230
TIndx m_KeySampleOffset
Offset of samples in index file.
Definition: seqdbisam.hpp:1227
Int4 m_FirstOffset
First and last offset's of last page.
Definition: seqdbisam.hpp:1236
char * m_FileStart
Pointer to index file if no memmap.
Definition: seqdbisam.hpp:1233
SIsamKey m_FirstKey
First volume key.
Definition: seqdbisam.hpp:1242
static Int8 x_GetId(CSeqDBNegativeList &ids, int index, bool use_tis)
Fetch a GI or TI from a GI list.
Definition: seqdbisam.hpp:1151
void x_MapDataPage(int sample_index, int &start, int &num_elements, const void **data_page_begin)
Map a data page.
Definition: seqdbisam.hpp:1493
CSeqDBAtlas::TIndx TIndx
Type which is large enough to span the bytes of an ISAM file.
Definition: seqdbisam.hpp:143
int x_GetNumericData(const void *p)
Definition: seqdbisam.hpp:1260
bool x_FindInNegativeList(CSeqDBNegativeList &ids, int &index, Int8 key, bool use_tis)
Find ID in the negative GI list using PBS.
Definition: seqdbisam.hpp:1428
Int4 m_MaxLineSize
Maximum string length in the database.
Definition: seqdbisam.hpp:1218
string m_DataFname
The filename of the ISAM data file.
Definition: seqdbisam.hpp:1197
string m_IndexFname
The filename of the ISAM index file.
Definition: seqdbisam.hpp:1200
int m_TermSize
size of the numeric key-data pair
Definition: seqdbisam.hpp:1251
CSeqDBFileMemMap m_IndexLease
A persistent lease on the ISAM index file.
Definition: seqdbisam.hpp:1186
CSeqDBAtlas & m_Atlas
The memory management layer.
Definition: seqdbisam.hpp:1180
CSeqDBNegativeList.
TGi GetGi(int index) const
Access an element of the GI array.
TTi GetTi(int index) const
Access an element of the TI array.
const string GetSi(int index) const
Access an element of the SeqId array.
#define T(s)
Definition: common.h:230
const char * file_name[]
#define false
Definition: bool.h:36
int offset
Definition: replacements.h:160
char data[12]
Definition: iconv.c:80
#define GI_FROM(T, value)
Definition: ncbimisc.hpp:1086
#define GI_TO(T, gi)
Definition: ncbimisc.hpp:1085
string
Definition: cgiapp.hpp:690
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static unsigned int StringToUInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to unsigned int.
Definition: ncbistr.cpp:642
#define NCBI_XOBJREAD_EXPORT
Definition: ncbi_export.h:1315
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:6929
int i
const struct ncbi::grid::netcache::search::fields::KEY key
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
int tolower(Uchar c)
Definition: ncbictype.hpp:72
#define count
@ eNotFound
Not found.
Int4 TOid
Ordinal ID in BLAST databases.
Definition: seqdbcommon.hpp:58
Uint4 TPig
ESeqDBIdType
Various identifier formats used in Id lookup.
@ eTiId
Genomic ID is a relatively stable numeric identifier for sequences.
@ ePigId
Trace ID is a numeric identifier for Trace sequences.
@ eGiId
CSeqDBAtlas::TIndx TIndx
Index file.
Definition: seqdbfile.cpp:69
File access objects for CSeqDB.
T SeqDB_GetStdOrd(const T *stdord_obj)
Read a network order integer value.
USING_SCOPE(objects)
Bring the object directory definitions into this scope.
static void s_AdvanceKeyList(const vector< T > &keys, int &index, int num_keys, const T &target)
Definition: seqdbisam.hpp:80
static void s_AdvanceGiList(CSeqDBGiList &gis, int &gi_index, int gis_size, const T &key)
Definition: seqdbisam.hpp:53
static void s_SetTranslation(CSeqDBGiList &gis, int &gi_index, int gis_size, const T &key, int value)
Definition: seqdbisam.hpp:103
Structure that holds GI,OID pairs.
Definition: _hash_fun.h:40
#define _ASSERT
@ eNumeric
Definition: text_util.cpp:53
done
Definition: token1.c:1
Modified on Fri Sep 20 14:58:07 2024 by modify_doxy.py rev. 669887