NCBI C++ ToolKit
seqdbfile.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef OBJTOOLS_READERS_SEQDB__SEQDBFILE_HPP
2 #define OBJTOOLS_READERS_SEQDB__SEQDBFILE_HPP
3 
4 /* $Id: seqdbfile.hpp 102994 2024-08-19 12:33:48Z fongah2 $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author: Kevin Bealer
30  *
31  */
32 
33 /// @file seqdbfile.hpp
34 /// File access objects for CSeqDB.
35 ///
36 /// Defines classes:
37 /// CSeqDBRawFile
38 /// CSeqDBExtFile
39 /// CSeqDBIdxFile
40 /// CSeqDBSeqFile
41 /// CSeqDBHdrFile
42 ///
43 /// Implemented for: UNIX, MS-Windows
44 
47 
48 #include <corelib/ncbistr.hpp>
49 #include <corelib/ncbifile.hpp>
50 #include <corelib/ncbi_bswap.hpp>
51 #include <corelib/ncbiobj.hpp>
53 #include <set>
54 
56 
57 /// Raw file.
58 ///
59 /// This is the lowest level of SeqDB file object. It controls basic
60 /// (byte data) access to the file, isolating higher levels from
61 /// differences in handling mmapped vs opened files. This has mostly
62 /// become a thin wrapper around the Atlas functionality.
63 
65 public:
66  /// Type which spans possible file offsets.
68 
69  /// Constructor
70  ///
71  /// Builds a "raw" file object, which is the lowest level of the
72  /// SeqDB file objects. It provides byte swapping and reading
73  /// methods, which are implemented via the atlas layer.
74  ///
75  /// @param atlas
76  /// The memory management layer object.
78  : m_Atlas(atlas)
79  {
80  }
81 
82  /// MMap or Open a file.
83  ///
84  /// This serves to verify the existence of, open, and cache the
85  /// length of a file.
86  ///
87  /// @param name
88  /// The filename to open.
91  /// @return
92  /// true if the file was opened successfully.
93  bool Open(const CSeqDB_Path & name)
94  {
95  _ASSERT(name.Valid());
96 
97  // FIXME: should use path even in atlas code
98  bool success = m_Atlas.GetFileSizeL(name.GetPathS(), m_Length);
99 
100  if (success) {
101  m_FileName = name.GetPathS();
102  }
103 
104  return success;
105  }
106 
107  /// Get a pointer to a section of the file.
108  ///
109  /// This method ensures that the memory lease has a hold that
110  /// includes the requested section of the file, and returns a
111  /// pointer to the start offset.
112  ///
113  /// @param lease
114  /// The memory lease object for this file.
115  /// @param start
116  /// The starting offset for the first byte of the region.
117  /// @param end
118  /// The offset for the first byte after the region.
121  /// @return
122  /// A pointer to the file data at the start offset.
123  const char * GetFileDataPtr(CSeqDBFileMemMap & lease, // commented
124  TIndx start,
125  TIndx end) const
126  {
127  _ASSERT(! m_FileName.empty());
128  SEQDB_FILE_ASSERT(start < end);
129  SEQDB_FILE_ASSERT(m_Length >= end);
130 
131  const char *p = (const char *)lease.GetFileDataPtr(m_FileName,start);
132 
133  return p;
134  }
135 
136  /// Get the length of the file.
137  ///
138  /// The file length is returned as a four byte integer, which is
139  /// the current maximum size for the blastdb component files.
140  ///
141  /// @return
142  /// The length of the file.
144  {
145  return m_Length;
146  }
147 
148  /// Read a four byte numerical object from the file
149  ///
150  /// Given a pointer to an object in memory, this reads a numerical
151  /// value for it from the file. The data in the file is assumed
152  /// to be in network byte order, and the user version in the local
153  /// default byte order (host order). The size of the object is
154  /// taken as sizeof(Uint4).
155  ///
156  /// @param lease
157  /// A memory lease object to use for the read.
158  /// @param offset
159  /// The starting offset of the value in the file.
160  /// @param value
161  /// A pointer to the object.
164  /// @return
165  /// The offset of the first byte after the object.
167  TIndx offset,
168  Uint4 * value) const;
169 
170 
171  /// Read an eight byte numerical object from the file
172  ///
173  /// Given a pointer to an object in memory, this reads a numerical
174  /// value for it from the file. The data in the file is assumed
175  /// to be in network byte order, and the user version in the local
176  /// default byte order (host order). The size of the object is
177  /// taken as sizeof(Uint8).
178  ///
179  /// @param lease
180  /// A memory lease object to use for the read.
181  /// @param offset
182  /// The starting offset of the value in the file.
183  /// @param value
184  /// A pointer to the object.
187  /// @return
188  /// The offset of the first byte after the object.
190  TIndx offset,
191  Uint8 * value) const;
192 
193 
194  /// Read a string object from the file
195  ///
196  /// Given a pointer to a string object, this reads a string value
197  /// for it from the file. The data in the file is assumed to be a
198  /// four byte length in network byte order, followed by the bytes
199  /// of the string. The amount of data is this length + 4.
200  ///
201  /// @param lease
202  /// A memory lease object to use for the read.
203  /// @param offset
204  /// The starting offset of the string length in the file.
205  /// @param value
206  /// A pointer to the returned string.
209  /// @return
210  /// The offset of the first byte after the string.
212  TIndx offset,
213  string * value) const;
214 
215 
216  /// Read part of the file into a buffer
217  ///
218  /// Copy the file data from offsets start to end into the array at
219  /// buf, which is assumed to already have been allocated. This
220  /// method assumes the atlas lock is held.
221  ///
222  /// @param lease
223  /// A memory lease object to use for the read.
224  /// @param buf
225  /// The destination for the data to be read.
226  /// @param start
227  /// The starting offset for the first byte to read.
228  /// @param end
229  /// The offset for the first byte after the area to read.
230  inline void ReadBytes(CSeqDBFileMemMap & lease,
231  char * buf,
232  TIndx start,
233  TIndx end) const;
234 
235 private:
236  /// The memory management layer object.
238 
239  /// The name of this file.
240  string m_FileName;
241 
242  /// The length of this file.
244 };
245 
246 
247 
248 /// Database component file
249 ///
250 /// This represents any database component file with an extension like
251 /// "pxx" or "nxx". This finds the correct type (protein or
252 /// nucleotide) if that is unknown, and computes the filename based on
253 /// a filename template like "path/to/file/basename.-in".
254 ///
255 /// This also provides a 'protected' interface to the specific db
256 /// files, and defines a few useful methods.
257 
258 class CSeqDBExtFile : public CObject {
259 public:
260  /// Type which spans possible file offsets.
262 
263  /// Constructor
264  ///
265  /// This builds an object which has a few properties required by
266  /// most or all database volume component files. This object
267  /// keeps a lease on the file from the first access until
268  /// instructed not to, moving and expanding that lease to cover
269  /// incoming requests. By keeping a lease, lookups, file opens,
270  /// and other expensive operations are usually avoided on
271  /// subsequent calls. This object also provides some methods to
272  /// read data in a byte swapped or direct way.
273  /// @param atlas
274  /// The memory management layer object.
275  /// @param dbfilename
276  /// The name of the managed file.
277  /// @param prot_nucl
278  /// The sequence data type.
281  CSeqDBExtFile(CSeqDBAtlas & atlas,
282  const string & dbfilename,
283  char prot_nucl);
284 
285 
286  /// Destructor
287  virtual ~CSeqDBExtFile()
288  {
289  }
290 
291 
292  /// Release memory held in the atlas layer by this object.
293  void UnLease()
294  {
295  m_Lease.Clear();
296  }
297 
298 protected:
299 
300  /// Read part of the file into a buffer
301  ///
302  /// Copy the file data from offsets start to end into the array at
303  /// buf, which is assumed to already have been allocated. This
304  /// method assumes the atlas lock is held.
305  ///
306  /// @param buf
307  /// The destination for the data to be read.
308  /// @param start
309  /// The starting offset for the first byte to read.
310  /// @param end
311  /// The offset for the first byte after the area to read.
312  void x_ReadBytes(char * buf,
313  TIndx start,
314  TIndx end) const
315  {
316  m_File.ReadBytes(m_Lease, buf, start, end);
317  }
318 
319  /// Read a numerical object from the file
320  ///
321  /// Given a pointer to an object in memory, this reads a numerical
322  /// value for it from the file. The data in the file is assumed
323  /// to be in network byte order, and the user version in the local
324  /// default byte order (host order). The offset of the data is
325  /// provided, and the size of the object is taken as sizeof(T).
326  ///
327  /// @param lease
328  /// A memory lease object to use for the read.
329  /// @param offset
330  /// The starting offset of the object in the file.
331  /// @param value
332  /// A pointer to the object.
335  /// @return
336  /// The offset of the first byte after the object.
337  template<class T>
339  TIndx offset,
340  T * value)
341 
342  {
343  return m_File.ReadSwapped(lease, offset, value);
344  }
345 
346  /// Get the volume's sequence data type.
347  ///
348  /// This object knows which type of sequence data it deals with -
349  /// this method returns that information.
350  ///
351  /// @return
352  /// The type of sequence data in use.
353  char x_GetSeqType() const
354  {
355  return m_ProtNucl;
356  }
357 
358  /// Sets the sequence data type.
359  ///
360  /// The sequence data will be set as protein or nucleotide. An
361  /// exception is thrown if an invalid type is provided. The first
362  /// character of the file extension will be modified to reflect
363  /// the sequence data type.
364  ///
365  /// @param prot_nucl
366  /// Either 'p' or 'n' for protein or nucleotide.
367  void x_SetFileType(char prot_nucl);
368 
369  // Data
370 
371  /// The memory layer management object.
373 
374  /// The name of this file.
375  string m_FileName;
376 
377  /// Either 'p' for protein or 'n' for nucleotide.
379 
380  /// A memory lease used by this file.
382 
383  /// The raw file object.
385 };
386 
387 void inline CSeqDBExtFile::x_SetFileType(char prot_nucl)
388 {
389  m_ProtNucl = prot_nucl;
390 
391  if ((m_ProtNucl != 'p') &&
392  (m_ProtNucl != 'n')) {
393 
394  NCBI_THROW(CSeqDBException, eArgErr,
395  "Invalid argument: seq type must be 'p' or 'n'.");
396  }
397 
398  _ASSERT(m_FileName.size() >= 5);
399 
400  m_FileName[m_FileName.size() - 3] = m_ProtNucl;
401 }
402 
403 
404 /// Index file
405 ///
406 /// This is the .pin or .nin file; it provides indices into the other
407 /// files. The version, title, date, and other summary information is
408 /// also stored here.
409 
410 class CSeqDBIdxFile : public CSeqDBExtFile {
411 public:
412  /// Constructor
413  ///
414  /// This builds an object which provides access to the index file
415  /// for a volume. The index file contains metadata about the
416  /// volume, such as the title and construction date. The index
417  /// file also contains indices into the header and sequence data
418  /// files. Because these offsets are four byte integers, all
419  /// volumes have a size of no more than 2^32 bytes, but in
420  /// practice, they are usually kept under 2^30 bytes.
421  ///
422  /// @param atlas
423  /// The memory management layer object.
424  /// @param dbname
425  /// The name of the database volume.
426  /// @param prot_nucl
427  /// The sequence data type.
430  CSeqDBIdxFile(CSeqDBAtlas & atlas,
431  const string & dbname,
432  char prot_nucl);
433 
434 
435  /// Destructor
436  virtual ~CSeqDBIdxFile()
437  {
438  // Synchronization removed from this path - it was causing a
439  // deadlock in an error path, and destruction and construction
440  // are necessarily single threaded in any case.
441 
442  //Verify();
443  UnLease();
444  }
445 
446  /// Get the location of a sequence's ambiguity data
447  ///
448  /// This method returns the offsets of the start and end of the
449  /// ambiguity data for a specific nucleotide sequence. If this
450  /// range is non-empty, then this sequence has ambiguous regions,
451  /// which are encoded as a series of instructions for modifying
452  /// the compressed 4 base/byte nucleotide data. The ambiguity
453  /// data is encoded as randomized noise, with the intention of
454  /// minimizing accidental matches.
455  ///
456  /// @param oid
457  /// The sequence to get data for.
458  /// @param start
459  /// The returned start offset of the sequence.
460  /// @param end
461  /// The returned end offset of the sequence.
462  /// @return
463  /// true if the sequence has ambiguity data.
464  inline bool
465  GetAmbStartEnd(int oid,
466  TIndx & start,
467  TIndx & end) const;
468 
469  /// Get the location of a sequence's header data
470  ///
471  /// This method returns the offsets of the start and end of the
472  /// header data for a specific database sequence. The header data
473  /// is a Blast-def-line-set in binary ASN.1. This data includes
474  /// associated taxonomy data, Seq-ids, and membership bits.
475  ///
476  /// @param oid
477  /// The sequence to get data for.
478  /// @param start
479  /// The returned start offset of the sequence.
480  /// @param end
481  /// The returned end offset of the sequence.
482  inline void
483  GetHdrStartEnd(int oid,
484  TIndx & start,
485  TIndx & end) const;
486 
487  /// Get the location of a sequence's packed sequence data
488  ///
489  /// This method returns the offsets of the start and end of the
490  /// packed sequence data for a specific database sequence. For
491  /// protein data, the packed version is the only supported
492  /// encoding, and is stored at one base per byte. The header data
493  /// is encoded as a Blast-def-line-set in binary ASN.1. This data
494  /// includes taxonomy information, Seq-ids for this sequence, and
495  /// membership bits.
496  ///
497  /// @param oid
498  /// The sequence to get data for.
499  /// @param start
500  /// The returned start offset of the sequence.
501  /// @param end
502  /// The returned end offset of the sequence.
503  inline void
504  GetSeqStartEnd(int oid,
505  TIndx & start,
506  TIndx & end) const;
507 
508  /// Get the start offset of a sequence's packed sequence data
509  ///
510  /// This method returns the offset of the start of the
511  /// packed sequence data for a specific database sequence. For
512  /// protein data, the packed version is the only supported
513  /// encoding, and is stored at one base per byte. The header data
514  /// is encoded as a Blast-def-line-set in binary ASN.1. This data
515  /// includes taxonomy information, Seq-ids for this sequence, and
516  /// membership bits.
517  ///
518  /// @param oid
519  /// The sequence to get data for.
520  /// @param start
521  /// The returned start offset of the sequence.
522  inline void
523  GetSeqStart(int oid,
524  TIndx & start) const;
525 
526  /// Get the sequence data type.
527  char GetSeqType() const
528  {
529  return x_GetSeqType();
530  }
531 
532  /// Get the volume title.
533  string GetTitle() const
534  {
535  return m_Title;
536  }
537 
538  /// Get the construction date of the volume.
539  string GetDate() const
540  {
541  return m_Date;
542  }
543 
544  /// Get the number of oids in this volume.
545  int GetNumOIDs() const
546  {
547  return m_NumOIDs;
548  }
549 
550  /// Get the length of the volume (in bases).
552  {
553  return m_VolLen;
554  }
555 
556  /// Get the length of the longest sequence in this volume.
557  int GetMaxLength() const
558  {
559  return m_MaxLen;
560  }
561 
562  /// Get the length of the shortest sequence in this volume.
563  int GetMinLength() const
564  {
565  return m_MinLen;
566  }
567 
568  /// Release any memory leases temporarily held here.
569  void UnLease()
570  {
571  //Verify();
572  x_ClrHdr();
573  x_ClrSeq();
574  x_ClrAmb();
575  }
576 
577  string GetLMDBFileName()const {return m_LMDBFile;}
578 
579  /// Verify the integrity of this object and subobjects.
580  /*
581  void Verify()
582  {
583  m_HdrLease.Verify();
584  m_SeqLease.Verify();
585  m_AmbLease.Verify();
586  }
587  */
588 private:
589 
590  /// A memory lease used by the header section of this file.
592  //mutable CMemoryFile *m_MmappedHdrIndex;
593 
594  /// A memory lease used by the sequence section of this file.
596  //mutable CMemoryFile* m_MmappedSeqIndex;
597 
598  /// A memory lease used by the ambiguity section of this file.
600  //mutable CMemoryFile *m_MmappedAmbIndex;
601 
602  // Swapped data from .[pn]in file
603 
604  /// The volume title.
605  string m_Title;
606 
607  /// The construction date of the volume.
608  string m_Date;
609 
610  /// The number of oids in this volume.
612 
613  /// The length of the volume (in bases).
615 
616  /// The length of the longest sequence in this volume.
618 
619  /// The length of the shortest sequence in this volume.
621 
622  // Other pointers and indices
623 
624  // These can be mutable because they:
625  // 1. Do not constitute true object state.
626  // 2. Are modified only under lock (CSeqDBRawFile::m_Atlas.m_Lock).
627 
628  /// Return header data (assumes locked).
629  void x_ClrHdr() const
630  {
631  m_HdrLease.Clear();
632  }
633 
634  /// Return sequence data (assumes locked).
635  void x_ClrSeq() const
636  {
637  m_SeqLease.Clear();
638  }
639 
640  /// Return ambiguity data (assumes locked).
641  void x_ClrAmb() const
642  {
643  m_AmbLease.Clear();
644  }
645 
646  /// Get header data (assumes locked).
647  Uint4 * x_GetHdr() const
648  {
649 
651  }
652 
653  /// Get sequence data (assumes locked).
654  Uint4 * x_GetSeq() const
655  {
656 
658  }
659 
660  /// Get ambiguity data (assumes locked).
661  Uint4 * x_GetAmb() const
662  {
663  _ASSERT(x_GetSeqType() == 'n');
664 
666  }
667 
668 
669  /// Offset of the start of the header section.
671 
672  /// Offset of the end of the header section.
674 
675  /// Offset of the start of the sequence section.
677 
678  /// Offset of the end of the sequence section.
680 
681  /// Offset of the start of the ambiguity section.
683 
684  /// Offset of the end of the ambiguity section.
686 
687  /// Name of matching LMDB file (empty if version 4 DB)
688  string m_LMDBFile;
689  /// Volume number (only set in version 5 DBs)
691 };
692 
693 bool
694 CSeqDBIdxFile::GetAmbStartEnd(int oid, TIndx & start, TIndx & end) const
695 {
697  if ('n' == x_GetSeqType()) {
698  start = SeqDB_GetStdOrd(& x_GetAmb()[oid]);
699  end = SeqDB_GetStdOrd(& x_GetSeq()[oid+1]);
700 
701  return (start <= end);
702  }
703 
704  return false;
705 }
706 
707 void
708 CSeqDBIdxFile::GetHdrStartEnd(int oid, TIndx & start, TIndx & end) const
709 {
711  start = SeqDB_GetStdOrd(& x_GetHdr()[oid]);
712  end = SeqDB_GetStdOrd(& x_GetHdr()[oid+1]);
713 }
714 
715 void
716 CSeqDBIdxFile::GetSeqStartEnd(int oid, TIndx & start, TIndx & end) const
717 {
719  start = SeqDB_GetStdOrd(& x_GetSeq()[oid]);
720 
721  if ('p' == x_GetSeqType()) {
722  end = SeqDB_GetStdOrd(& x_GetSeq()[oid+1]);
723  } else {
724  end = SeqDB_GetStdOrd(& x_GetAmb()[oid]);
725  }
726 }
727 
728 void
729 CSeqDBIdxFile::GetSeqStart(int oid, TIndx & start) const
730 {
732  start = SeqDB_GetStdOrd(& x_GetSeq()[oid]);
733 }
734 
735 
736 /// Sequence data file
737 ///
738 /// This is the .psq or .nsq file; it provides the raw sequence data,
739 /// and for nucleotide sequences, ambiguity data. For nucleotide
740 /// sequences, the last byte will contain a two bit marker with a
741 /// number from 0-3, which indicates how much of the rest of that byte
742 /// is filled with base information (0-3 bases, which is 0-6 bits).
743 /// For ambiguous regions, the sequence data is normally randomized in
744 /// this file, to reduce the number of accidental false positives
745 /// during the search. The ambiguity data encodes the location of,
746 /// and actual data for, those regions.
747 
748 class CSeqDBSeqFile : public CSeqDBExtFile {
749 public:
750  /// Type which spans possible file offsets.
752 
753  /// Constructor
754  ///
755  /// This builds an object which provides access to the sequence
756  /// data file for a volume. This file is simply a concatenation
757  /// of all the sequence data for the database sequences. In a
758  /// protein file, these are just the database sequences separated
759  /// by NUL bytes. In a nucleotide volume, the packed data for
760  /// each sequence is followed by ambiguity data for that sequence
761  /// (if any such data exists).
762  ///
763  /// @param atlas
764  /// The memory management layer object.
765  /// @param dbname
766  /// The name of the database volume.
767  /// @param prot_nucl
768  /// The sequence data type.
772  const string & dbname,
773  char prot_nucl)
774  : CSeqDBExtFile(atlas, dbname + ".-sq", prot_nucl)
775  {
776  }
777 
778  /// Destructor
779  virtual ~CSeqDBSeqFile()
780  {
781  }
782 
783  /// Read part of the file into a buffer
784  ///
785  /// Copy the sequence data from offsets start to end into the
786  /// array at buf, which is assumed to already have been allocated.
787  /// This method assumes the atlas lock is held.
788  ///
789  /// @param buf
790  /// The destination for the data to be read.
791  /// @param start
792  /// The starting offset for the first byte to read.
793  /// @param end
794  /// The offset for the first byte after the area to read.
795  void ReadBytes(char * buf,
796  TIndx start,
797  TIndx end) const
798  {
799  x_ReadBytes(buf, start, end);
800  }
801 
802  /// Get a pointer into the file contents.
803  ///
804  /// Returns a pointer to the file data at offset start, obtained
805  /// from this object's memory lease; no data is copied.
806  /// This method assumes the atlas lock is held.
809  ///
810  /// @param start
811  /// The starting offset for the first byte to read.
820  /// @return
821  /// A pointer into the file data.
822  const char * GetFileDataPtr(TIndx start) const // commented
823  {
824  const char *p = (const char *)m_Lease.GetFileDataPtr(start);
825 
826  return p;
827  }
828 };
829 
830 
831 /// Header file
832 ///
833 /// This is the .phr or .nhr file. It contains descriptive data for
834 /// each sequence, including taxonomic information and identifiers for
835 /// sequence files. The version, title, date, and other summary
836 /// information is also stored here.
837 
838 class CSeqDBHdrFile : public CSeqDBExtFile {
839 public:
840  /// Type which spans possible file offsets.
842 
843  /// Constructor
844  ///
845  /// This builds an object which provides access to the header data
846  /// file for a volume. This file is simply a concatenation of the
847  /// header data for each object, stored as a Blast-def-line-set
848  /// objects in binary ASN.1.
849  ///
850  /// @param atlas
851  /// The memory management layer object.
852  /// @param dbname
853  /// The name of the database volume.
854  /// @param prot_nucl
855  /// The sequence data type.
859  const string & dbname,
860  char prot_nucl)
861  : CSeqDBExtFile(atlas, dbname + ".-hr", prot_nucl)
862  {
863  }
864 
865  /// Destructor
866  virtual ~CSeqDBHdrFile()
867  {
868  }
869 
870  /// Read part of the file into a buffer
871  ///
872  /// Copy the header data from offsets start to end into the
873  /// array at buf, which is assumed to already have been allocated.
874  /// This method assumes the atlas lock is held.
877  ///
878  /// @param buf
879  /// The buffer to receive the data.
880  /// @param start
881  /// The starting offset for the first byte to read.
882  /// @param end
883  /// The offset for the first byte after the area to read.
884  void ReadBytes(char * buf,
885  TIndx start,
886  TIndx end) const
887  {
888  x_ReadBytes(buf, start, end);
889  }
890 
891  /// Get a pointer into the file contents.
892  ///
893  /// Returns a pointer to the header data at offset start, obtained
894  /// from this object's memory lease; no data is copied.
895  /// This method assumes the atlas lock is held.
898  ///
899  /// @param start
900  /// The starting offset for the first byte to read.
905  /// @return
906  /// A pointer into the file data.
907  const char * GetFileDataPtr(TIndx start) const // commented
908  {
909  // Header data never requires the 'hold' option because asn.1
910  // processing is done immediately.
911 
912  const char *p = (const char *)m_Lease.GetFileDataPtr(start);
913  return p;
914  }
915 };
916 
917 
918 // Does not modify (or use) internal file offset
919 
920 // Assumes locked.
921 
923  char * buf,
924  TIndx start,
925  TIndx end) const
926 {
927  memcpy(buf, lease.GetFileDataPtr(m_FileName,start), end-start);
928 
929 }
930 
932 
933 #endif // OBJTOOLS_READERS_SEQDB__SEQDBFILE_HPP
934 
935 
CObject –.
Definition: ncbiobj.hpp:180
CSeqDBAtlas class.
Definition: seqdbatlas.hpp:297
bool GetFileSizeL(const string &fname, TIndx &length)
Get size of a file.
Definition: seqdbatlas.cpp:160
CNcbiStreamoff TIndx
The type used for file offsets.
Definition: seqdbatlas.hpp:301
CSeqDBException.
Definition: seqdbcommon.hpp:73
Database component file.
Definition: seqdbfile.hpp:258
CSeqDBRawFile m_File
The raw file object.
Definition: seqdbfile.hpp:384
char x_GetSeqType() const
Get the volume's sequence data type.
Definition: seqdbfile.hpp:353
void x_SetFileType(char prot_nucl)
Sets the sequence data type.
Definition: seqdbfile.hpp:387
CSeqDBAtlas::TIndx TIndx
Type which spans possible file offsets.
Definition: seqdbfile.hpp:261
void x_ReadBytes(char *buf, TIndx start, TIndx end) const
Read part of the file into a buffer.
Definition: seqdbfile.hpp:312
CSeqDBAtlas & m_Atlas
The memory layer management object.
Definition: seqdbfile.hpp:372
char m_ProtNucl
Either 'p' for protein or 'n' for nucleotide.
Definition: seqdbfile.hpp:378
CSeqDBFileMemMap m_Lease
A memory lease used by this file.
Definition: seqdbfile.hpp:381
TIndx x_ReadSwapped(CSeqDBFileMemMap &lease, TIndx offset, T *value)
Read a numerical object from the file.
Definition: seqdbfile.hpp:338
virtual ~CSeqDBExtFile()
Destructor.
Definition: seqdbfile.hpp:287
void UnLease()
Release memory held in the atlas layer by this object.
Definition: seqdbfile.hpp:293
string m_FileName
The name of this file.
Definition: seqdbfile.hpp:375
CSeqDBExtFile(CSeqDBAtlas &atlas, const string &dbfilename, char prot_nucl)
Constructor.
Definition: seqdbfile.cpp:106
const char * GetFileDataPtr(const string &fname, TIndx offset)
Get a pointer to the specified offset.
Definition: seqdbatlas.hpp:755
void Init(const string &filename)
Initializes a memory map object.
Definition: seqdbatlas.hpp:702
void Clear()
Clears the memory map object.
Definition: seqdbatlas.hpp:735
Header file.
Definition: seqdbfile.hpp:838
const char * GetFileDataPtr(TIndx start) const
Read part of the file into a buffer.
Definition: seqdbfile.hpp:907
CSeqDBHdrFile(CSeqDBAtlas &atlas, const string &dbname, char prot_nucl)
Constructor.
Definition: seqdbfile.hpp:858
void ReadBytes(char *buf, TIndx start, TIndx end) const
Read part of the file into a buffer.
Definition: seqdbfile.hpp:884
CSeqDBAtlas::TIndx TIndx
Type which spans possible file offsets.
Definition: seqdbfile.hpp:841
virtual ~CSeqDBHdrFile()
Destructor.
Definition: seqdbfile.hpp:866
Index file.
Definition: seqdbfile.hpp:410
void x_ClrSeq() const
Return sequence data (assumes locked).
Definition: seqdbfile.hpp:635
Uint4 * x_GetAmb() const
Get ambiguity data (assumes locked).
Definition: seqdbfile.hpp:661
string m_LMDBFile
Name of matching LMDB file (empty if version 4 DB)
Definition: seqdbfile.hpp:688
TIndx m_EndSeq
Offset of the end of the sequence section.
Definition: seqdbfile.hpp:679
string GetDate() const
Get the construction date of the volume.
Definition: seqdbfile.hpp:539
void x_ClrAmb() const
Return ambiguity data (assumes locked).
Definition: seqdbfile.hpp:641
Uint4 m_NumOIDs
The number of oids in this volume.
Definition: seqdbfile.hpp:611
TIndx m_OffHdr
offset of the start of the header section.
Definition: seqdbfile.hpp:670
void GetHdrStartEnd(int oid, TIndx &start, TIndx &end) const
Get the location of a sequence's header data.
Definition: seqdbfile.hpp:708
void x_ClrHdr() const
Return header data (assumes locked).
Definition: seqdbfile.hpp:629
void UnLease()
Release any memory leases temporarily held here.
Definition: seqdbfile.hpp:569
string GetTitle() const
Get the volume title.
Definition: seqdbfile.hpp:533
int GetNumOIDs() const
Get the number of oids in this volume.
Definition: seqdbfile.hpp:545
CSeqDBFileMemMap m_SeqLease
A memory lease used by the sequence section of this file.
Definition: seqdbfile.hpp:595
string GetLMDBFileName() const
Definition: seqdbfile.hpp:577
bool GetAmbStartEnd(int oid, TIndx &start, TIndx &end) const
Get the location of a sequence's ambiguity data.
Definition: seqdbfile.hpp:694
Uint4 m_MaxLen
The length of the longest sequence in this volume.
Definition: seqdbfile.hpp:617
TIndx m_EndAmb
Offset of the end of the ambiguity section.
Definition: seqdbfile.hpp:685
string m_Title
The volume title.
Definition: seqdbfile.hpp:605
TIndx m_EndHdr
Offset of the end of the header section.
Definition: seqdbfile.hpp:673
Uint8 GetVolumeLength() const
Get the length of the volume (in bases).
Definition: seqdbfile.hpp:551
Uint4 * x_GetHdr() const
Get header data (assumes locked).
Definition: seqdbfile.hpp:647
Uint4 m_Volume
Volume number (only set in version 5 DBs)
Definition: seqdbfile.hpp:690
int GetMinLength() const
Get the length of the shortest sequence in this volume.
Definition: seqdbfile.hpp:563
Uint8 m_VolLen
The length of the volume (in bases).
Definition: seqdbfile.hpp:614
TIndx m_OffSeq
Offset of the start of the sequence section.
Definition: seqdbfile.hpp:676
virtual ~CSeqDBIdxFile()
Destructor.
Definition: seqdbfile.hpp:436
void GetSeqStart(int oid, TIndx &start) const
Get the start offset of a sequence's packed sequence data.
Definition: seqdbfile.hpp:729
char GetSeqType() const
Get the sequence data type.
Definition: seqdbfile.hpp:527
TIndx m_OffAmb
Offset of the start of the ambiguity section.
Definition: seqdbfile.hpp:682
CSeqDBFileMemMap m_AmbLease
A memory lease used by the ambiguity section of this file.
Definition: seqdbfile.hpp:599
CSeqDBIdxFile(CSeqDBAtlas &atlas, const string &dbname, char prot_nucl)
Constructor.
Definition: seqdbfile.cpp:134
int GetMaxLength() const
Get the length of the longest sequence in this volume.
Definition: seqdbfile.hpp:557
string m_Date
The construction date of the volume.
Definition: seqdbfile.hpp:608
Uint4 m_MinLen
The length of the shortest sequence in this volume.
Definition: seqdbfile.hpp:620
void GetSeqStartEnd(int oid, TIndx &start, TIndx &end) const
Get the location of a sequence's packed sequence data.
Definition: seqdbfile.hpp:716
CSeqDBFileMemMap m_HdrLease
A memory lease used by the header section of this file.
Definition: seqdbfile.hpp:591
Uint4 * x_GetSeq() const
Get sequence data (assumes locked).
Definition: seqdbfile.hpp:654
Raw file.
Definition: seqdbfile.hpp:64
TIndx GetFileLength() const
Get the length of the file.
Definition: seqdbfile.hpp:143
TIndx ReadSwapped(CSeqDBFileMemMap &lease, TIndx offset, Uint4 *value) const
Read a four-byte numerical object from the file.
Definition: seqdbfile.cpp:71
void ReadBytes(CSeqDBFileMemMap &lease, char *buf, TIndx start, TIndx end) const
Read part of the file into a buffer.
Definition: seqdbfile.hpp:922
TIndx m_Length
The length of this file.
Definition: seqdbfile.hpp:243
string m_FileName
The name of this file.
Definition: seqdbfile.hpp:240
CSeqDBRawFile(CSeqDBAtlas &atlas)
Constructor.
Definition: seqdbfile.hpp:77
CSeqDBAtlas::TIndx TIndx
Type which spans possible file offsets.
Definition: seqdbfile.hpp:67
bool Open(const CSeqDB_Path &name)
MMap or Open a file.
Definition: seqdbfile.hpp:93
const char * GetFileDataPtr(CSeqDBFileMemMap &lease, TIndx start, TIndx end) const
Get a pointer to a section of the file.
Definition: seqdbfile.hpp:123
CSeqDBAtlas & m_Atlas
The memory management layer object.
Definition: seqdbfile.hpp:237
Sequence data file.
Definition: seqdbfile.hpp:748
CSeqDBSeqFile(CSeqDBAtlas &atlas, const string &dbname, char prot_nucl)
Constructor.
Definition: seqdbfile.hpp:771
CSeqDBAtlas::TIndx TIndx
Type which spans possible file offsets.
Definition: seqdbfile.hpp:751
void ReadBytes(char *buf, TIndx start, TIndx end) const
Read part of the file into a buffer.
Definition: seqdbfile.hpp:795
virtual ~CSeqDBSeqFile()
Destructor.
Definition: seqdbfile.hpp:779
const char * GetFileDataPtr(TIndx start) const
Get a pointer into the file contents.
Definition: seqdbfile.hpp:822
CSeqDB_Path.
const string & GetPathS() const
Get the path as a string.
bool Valid() const
Returns true if this object has a value.
The NCBI C++ standard methods for dealing with std::string.
#define T(s)
Definition: common.h:230
int offset
Definition: replacements.h:160
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:6929
char * buf
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
Portable reference counted smart and weak pointers using CWeakRef, CRef, CObject and CObjectEx.
The SeqDB memory management layer.
Defines exception class and several constants for SeqDB.
CSeqDBAtlas::TIndx TIndx
Type which spans possible file offsets.
Definition: seqdbfile.cpp:69
This file defines several SeqDB utility functions related to byte order and file system portability.
#define SEQDB_FILE_ASSERT(YESNO)
T SeqDB_GetStdOrd(const T *stdord_obj)
Read a network order integer value.
#define _ASSERT
Modified on Fri Sep 20 14:57:50 2024 by modify_doxy.py rev. 669887