NCBI C++ ToolKit
gene_info_reader.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: gene_info_reader.hpp 102617 2024-06-12 13:07:39Z zaretska $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Vahram Avagyan
27  *
28  */
29 
30 /// @file gene_info_reader.hpp
31 /// Defines a class for reading Gene information from files.
32 ///
33 /// Defines the CGeneInfoFileReader class which implements the
34 /// IGeneInfoInput interface. The class reads and memory-maps several
35 /// pre-computed and sorted binary files and uses them for fast
36 /// access to Gene information and Gi to/from Gene ID conversions.
37 
38 #ifndef OBJTOOLS_BLAST_GENE_INFO_READER___GENE_INFO_READER__HPP
39 #define OBJTOOLS_BLAST_GENE_INFO_READER___GENE_INFO_READER__HPP
40 
41 //==========================================================================//
42 
45 
46 #include <corelib/ncbifile.hpp>
47 
49 
50 
51 //==========================================================================//
52 
53 /// Name of the environment variable holding the path to Gene info files.
54 #define GENE_INFO_PATH_ENV_VARIABLE "GENE_INFO_PATH"
55 
56 /// Name of the processed "Gi to GeneID" file.
57 #define GENE_GI2GENE_FILE_NAME "geneinfo.g2i"
58 /// Name of the processed "GeneID to Offset" file.
59 #define GENE_GENE2OFFSET_FILE_NAME "geneinfo.i2o"
60 /// Name of the processed "Gi to Offset" file.
61 #define GENE_GI2OFFSET_FILE_NAME "geneinfo.g2o"
62 /// Name of the processed "Gene ID to Gi" file.
63 #define GENE_GENE2GI_FILE_NAME "geneinfo.i2g"
64 /// Name of the combined "Gene Data" file.
65 #define GENE_ALL_GENE_DATA_FILE_NAME "geneinfo.dat"
66 /// Name of the general information/statistics file.
67 #define GENE_GENERAL_INFO_FILE_NAME "geneinfo.log"
68 
69 /// CGeneInfoFileReader
70 ///
71 /// Class implementing the IGeneInfoInput interface using binary files.
72 ///
73 /// CGeneInfoFileReader reads and memory-maps sorted binary files for fast
74 /// Gi to Gene ID, Gene ID to Gene Info, Gi to Gene Info, and Gene ID to Gi
75 /// conversions.
76 /// The Gene Info lookup is represented by two files,
77 /// one contains (Gi, Offset) or (Gene ID, Offset) pairs, the other one
78 /// contains all the Gene data. The lookup is performed in two steps: first,
79 /// the offset to the Gene data is obtained, then the Gene data line is
80 /// read, parsed, and the corresponding CGeneInfo object is constructed.
81 /// The paths to the pre-computed and sorted files are either provided
82 /// directly to the constructor, or the class attempts to read them from
83 /// a path stored in an environment variable (the preferred approach).
84 
86  public CGeneFileUtils
87 {
88 private:
89  /// Path to the Gi to Gene ID file.
91 
92  /// Path to the Gene ID to Offset file.
94 
95  /// Path to the Gi to Offset file.
97 
98  /// Path to the Gene ID to Gi file.
100 
101  /// Path to the file containing all the Gene data.
103 
104  /// Perform Gi to Offset lookups directly.
106 
107  /// Memory-mapped Gi to Gene ID file.
108  unique_ptr<CMemoryFile> m_memGi2GeneFile;
109 
110  /// Memory-mapped Gene ID to Offset file.
111  unique_ptr<CMemoryFile> m_memGene2OffsetFile;
112 
113  /// Memory-mapped Gi to Offset file.
114  unique_ptr<CMemoryFile> m_memGi2OffsetFile;
115 
116  /// Memory-mapped Gene ID to Gi file.
117  unique_ptr<CMemoryFile> m_memGene2GiFile;
118 
119  /// Input stream for the Gene data file.
121 
122  /// Cached map of looked up Gene Info objects.
124 
125 private:
126  /// Memory-map all the files.
128 
129  /// Unmap all the memory-mapped files.
131 
132  /// Fill the Gene ID list given a Gi.
133  bool x_GiToGeneId(TGi gi, list<int>& listGeneIds);
134 
135  /// Set the offset value given a Gene ID.
136  bool x_GeneIdToOffset(int geneId, int& nOffset);
137 
138  /// Fill the offset list given a Gi.
139  bool x_GiToOffset(TGi gi, list<int>& listOffsets);
140 
141  /// Fill the Gi list given a Gene ID, and the Gi field index,
142  /// which represents the Gi type to be read from the file.
143  bool x_GeneIdToGi(int geneId, int iGiField, list<TGi>& listGis);
144 
145  /// Read Gene data at the given offset and create the info object.
146  bool x_OffsetToInfo(int nOffset, CRef<CGeneInfo>& info);
147 
148 public:
149  /// Construct using direct paths.
150  ///
151  /// This version of the constructor takes the paths to
152  /// the pre-computed binary files and attempts
153  /// to open and map the files.
154  ///
155  /// @param strGi2GeneFile
156  /// Path to the Gi to Gene ID file.
157  /// @param strGene2OffsetFile
158  /// Path to the Gene ID to Offset file.
159  /// @param strGi2OffsetFile
160  /// Path to the Gi to Offset file.
161  /// @param strAllGeneDataFile
162  /// Path to the Gene data file.
163  /// @param strGene2GiFile
164  /// Path to the Gene ID to Gi file.
165  /// @param bGiToOffsetLookup
166  /// Perform Gi to Offset lookups directly.
167  CGeneInfoFileReader(const string& strGi2GeneFile,
168  const string& strGene2OffsetFile,
169  const string& strGi2OffsetFile,
170  const string& strAllGeneDataFile,
171  const string& strGene2GiFile,
172  bool bGiToOffsetLookup = true);
173 
174  /// Construct using paths read from an environment variable.
175  ///
176  /// This version of the constructor reads the paths to
177  /// the pre-computed binary files from an environment variable
178  /// and attempts to open and map the files.
179  ///
180  /// @param bGiToOffsetLookup
181  /// Perform Gi to Offset lookups directly.
182  CGeneInfoFileReader(bool bGiToOffsetLookup = true);
183 
184  /// Destructor.
186 
187  /// GetGeneIdsForGi implementation, see IGeneInfoInput.
188  virtual bool
189  GetGeneIdsForGi(TGi gi, TGeneIdList& geneIdList);
190 
191  /// GetRNAGisForGeneId implementation, see IGeneInfoInput.
192  virtual bool
193  GetRNAGisForGeneId(int geneId, TGiList& giList);
194 
195  /// GetProteinGisForGeneId implementation, see IGeneInfoInput.
196  virtual bool
197  GetProteinGisForGeneId(int geneId, TGiList& giList);
198 
199  /// GetGenomicGisForGeneId implementation, see IGeneInfoInput.
200  virtual bool
201  GetGenomicGisForGeneId(int geneId, TGiList& giList);
202 
203  /// GetGeneInfoForGi implementation, see IGeneInfoInput.
204  virtual bool
206 
207  /// GetGeneInfoForId implementation, see IGeneInfoInput.
208  virtual bool
209  GetGeneInfoForId(int geneId, TGeneInfoList& infoList);
210 };
211 
212 //==========================================================================//
213 
214 
216 
217 #endif
218 
CGeneFileUtils.
Definition: file_utils.hpp:63
CGeneInfoFileReader.
bool x_GeneIdToOffset(int geneId, int &nOffset)
Set the offset value given a Gene ID.
string m_strGi2OffsetFile
Path to the Gi to Offset file.
bool x_GeneIdToGi(int geneId, int iGiField, list< TGi > &listGis)
Fill the Gi list given a Gene ID, and the Gi field index, which represents the Gi type to be read from the file.
CNcbiIfstream m_inAllData
Input stream for the Gene data file.
unique_ptr< CMemoryFile > m_memGi2GeneFile
Memory-mapped Gi to Gene ID file.
bool m_bGiToOffsetLookup
Perform Gi to Offset lookups directly.
CGeneInfoFileReader(const string &strGi2GeneFile, const string &strGene2OffsetFile, const string &strGi2OffsetFile, const string &strAllGeneDataFile, const string &strGene2GiFile, bool bGiToOffsetLookup=true)
Construct using direct paths.
virtual ~CGeneInfoFileReader()
Destructor.
void x_MapMemFiles()
Memory-map all the files.
unique_ptr< CMemoryFile > m_memGi2OffsetFile
Memory-mapped Gi to Offset file.
virtual bool GetGeneInfoForGi(TGi gi, TGeneInfoList &infoList)
GetGeneInfoForGi implementation, see IGeneInfoInput.
TGeneIdToGeneInfoMap m_mapIdToInfo
Cached map of looked up Gene Info objects.
unique_ptr< CMemoryFile > m_memGene2GiFile
Memory-mapped Gene ID to Gi file.
string m_strGene2GiFile
Path to the Gene ID to Gi file.
virtual bool GetGeneIdsForGi(TGi gi, TGeneIdList &geneIdList)
GetGeneIdsForGi implementation, see IGeneInfoInput.
virtual bool GetRNAGisForGeneId(int geneId, TGiList &giList)
GetRNAGisForGeneId implementation, see IGeneInfoInput.
void x_UnmapMemFiles()
Unmap all the memory-mapped files.
string m_strGene2OffsetFile
Path to the Gene ID to Offset file.
string m_strGi2GeneFile
Path to the Gi to Gene ID file.
bool x_GiToGeneId(TGi gi, list< int > &listGeneIds)
Fill the Gene ID list given a Gi.
unique_ptr< CMemoryFile > m_memGene2OffsetFile
Memory-mapped Gene ID to Offset file.
bool x_GiToOffset(TGi gi, list< int > &listOffsets)
Fill the offset list given a Gi.
virtual bool GetGenomicGisForGeneId(int geneId, TGiList &giList)
GetGenomicGisForGeneId implementation, see IGeneInfoInput.
virtual bool GetProteinGisForGeneId(int geneId, TGiList &giList)
GetProteinGisForGeneId implementation, see IGeneInfoInput.
virtual bool GetGeneInfoForId(int geneId, TGeneInfoList &infoList)
GetGeneInfoForId implementation, see IGeneInfoInput.
bool x_OffsetToInfo(int nOffset, CRef< CGeneInfo > &info)
Read Gene data at the given offset and create the info object.
string m_strAllGeneDataFile
Path to the file containing all the Gene data.
CGeneInfoFileReader(bool bGiToOffsetLookup=true)
Construct using paths read from an environment variable.
CRef –.
Definition: ncbiobj.hpp:618
IGeneInfoInput.
Definition: gene_info.hpp:240
vector< CRef< CGeneInfo > > TGeneInfoList
List of Gene Information objects.
Definition: gene_info.hpp:252
list< int > TGeneIdList
List of Gene IDs.
Definition: gene_info.hpp:246
list< TGi > TGiList
List of Gis.
Definition: gene_info.hpp:243
Definition: map.hpp:338
General file processing routines and structures.
Gene information class and related interfaces.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
#define NCBI_XOBJREAD_EXPORT
Definition: ncbi_export.h:1315
static MDB_envinfo info
Definition: mdb_load.c:37
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
Modified on Fri Sep 20 14:57:07 2024 by modify_doxy.py rev. 669887