NCBI C++ ToolKit
seqdbtax.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seqdbtax.cpp 91827 2020-12-14 18:07:01Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Kevin Bealer
27  *
28  */
29 
30 /// @file seqdbtax.cpp
31 /// Implementation of SeqDB taxonomy lookup: validates and memory-maps
32 /// the taxonomy index and data files and resolves a tax id to its names.
33 #include <ncbi_pch.hpp>
34 #include <objtools/error_codes.hpp>
36 
37 /// Tell the error reporting framework what part of the code we're in.
38 #define NCBI_USE_ERRCODE_X Objtools_SeqDBTax
39 
41 
42 
43 /// CSeqDBTaxId class
44 ///
45 /// This is a memory overlay class. Do not change the size or layout
46 /// of this class unless corresponding changes happen to the taxonomy
47 /// database file format. This class's constructor and destructor are
48 /// not called; instead, a pointer to mapped memory is cast to a
49 /// pointer to this type, and the access methods are used to examine
50 /// the fields.
51 
52 class CSeqDBTaxId {
53 public:
54  /// Constructor
55  ///
56  /// This class is a read-only memory overlay and is not expected
57  /// to ever be constructed.
59  {
60  _ASSERT(0); // Unconditional assertion: any construction is a programming error.
61  }
62 
63  /// Return the taxonomic identifier field (in host order)
65  {
67  }
68 
69  /// Return the offset field (in host order); the field is stored as Uint4 but returned as Int4.
70  Int4 GetOffset() const
71  {
72  return SeqDB_GetStdOrd(& m_Offset); // Reads the stored network-order value.
73  }
74 
75 private:
76  /// This structure should not be copy constructed
78 
79  /// The taxonomic identifier
81 
82  /// The offset of the start of the taxonomy data.
84 };
85 
86 
87 
89 {
90 public:
93  // Accessors for the state established by the constructor.
94  const char* GetDataPtr() {return m_DataPtr;} ///< Start of the mapped data file (NULL when nothing is mapped)
95  const CSeqDBTaxId* GetIndexPtr() {return m_IndexPtr;} ///< First taxid record, just past the index header (NULL when nothing is mapped)
96  bool IsMissingTaxInfo() {return m_MissingDB;} ///< True when the files are absent or failed validation
97  const Int4 GetTaxidCount() { return m_AllTaxidCount;} ///< Number of taxid records read from the index header
98  size_t GetDataFileSize() { return m_DataFileSize;} ///< Size reported by the data file mapping
99 
100 private:
101  /// The filename of the taxonomic db index file
102  string m_IndexFN;
103 
104  /// The filename of the taxonomic db data file
105  string m_DataFN;
106 
107  /// Total number of taxids in the database
109 
110  /// Memory map of the index file
111  unique_ptr<CMemoryFile> m_IndexFileMap;
112  unique_ptr<CMemoryFile> m_DataFileMap; ///< Memory map of the data file
114  char * m_DataPtr; ///< Start of the mapped data file contents
115 
117 
118 
119  /// Indicator if tax db files are missing or failed validation
121 };
122 
123 /// Resolve, validate and memory-map the taxonomy index and data files.
124 /// On any failure m_MissingDB is set and m_IndexPtr / m_DataPtr stay NULL.
126  : m_AllTaxidCount(0),
127  m_IndexPtr(NULL),
128  m_DataPtr(NULL),
129  m_DataFileSize(0),
130  m_MissingDB(false)
131 {
132 
133  // It is reasonable for this database to not exist.
134  m_IndexFN = SeqDB_ResolveDbPath("taxdb.bti");
135 
136  if (m_IndexFN.size()) {
138  m_DataFN[m_DataFN.size()-1] = 'd'; // Last character becomes 'd'; presumably m_DataFN starts as a copy of m_IndexFN (assignment not shown here) - TODO confirm.
139  }
140 
141  if (! (m_IndexFN.size() &&
142  m_DataFN.size() &&
143  CFile(m_IndexFN).Exists() &&
144  CFile(m_DataFN).Exists())) {
145  m_MissingDB = true;
146  return;
147  }
148 
149  // Size for header data plus one taxid object.
150 
151  Uint4 data_start = (4 + // magic
152  4 + // taxid count
153  16); // 4 reserved fields
154 
155  Uint4 idx_file_len = (Uint4) CFile(m_IndexFN).GetLength(); // NOTE(review): GetLength() returns Int8; this cast narrows it to 32 bits.
156  // An index too short for the header plus one record is treated as missing.
157  if (idx_file_len < (data_start + sizeof(CSeqDBTaxId))) {
158  m_MissingDB = true;
159  return;
160  }
161 
163 
164  m_IndexFileMap->Map();
165 
166  // Last check-up of the database validity
167 
168 
169  Uint4 * magic_num_ptr = (Uint4 *)m_IndexFileMap->GetPtr(); // Cursor over the 32-bit header fields.
170 
171  const unsigned TAX_DB_MAGIC_NUMBER = 0x8739;
172  // The comparison below also advances the cursor to the taxid-count field.
173  if (TAX_DB_MAGIC_NUMBER != SeqDB_GetStdOrd(magic_num_ptr ++)) {
174  m_MissingDB = true;
175  m_IndexFileMap.reset();
176  ERR_POST("Error: Tax database file has wrong magic number.");
177  return;
178  }
179 
180  m_AllTaxidCount = SeqDB_GetStdOrd(magic_num_ptr ++); // Cursor now sits on the reserved fields.
181 
182  // Skip the four reserved fields
183  magic_num_ptr += 4;
184 
185  int taxid_array_size = int((idx_file_len - data_start)/sizeof(CSeqDBTaxId)); // Records that fit after the header.
186 
187  if (taxid_array_size != m_AllTaxidCount) {
188  m_MissingDB = true;
189  m_IndexFileMap.reset();
190  ERR_POST("SeqDB: Taxid metadata indicates (" << m_AllTaxidCount
191  << ") entries but file has room for (" << taxid_array_size
192  << ").");
193  // The object is already flagged missing; the count is clamped to what the file can hold.
194  if (taxid_array_size < m_AllTaxidCount) {
195  m_AllTaxidCount = taxid_array_size;
196  }
197  return;
198  }
199  // Index validated: map the data file and publish the raw pointers.
200  m_DataFileMap.reset(new CMemoryFile(m_DataFN));
201 
202  m_DataPtr = (char *) (m_DataFileMap->GetPtr());
203  m_DataFileSize = m_DataFileMap->GetSize();
204  m_IndexPtr = (CSeqDBTaxId*) magic_num_ptr; // First record, just past the 24-byte header.
205 
206 }
207 /// Release both mappings; m_MissingDB is false only when the constructor ran to the end and set up both maps.
209 {
210  if (!m_MissingDB) {
211  m_IndexFileMap->Unmap();
212  m_IndexFileMap.reset();
213  m_DataFileMap->Unmap();
214  m_DataFileMap.reset();
215  }
216 }
217 /// Binary-search the taxid index (assumed sorted ascending) for tax_id and, on a hit, fill info with the
218 /// scientific, common, blast and kingdom names. Returns false when the db is missing, the id is absent, or the record is malformed.
220  SSeqDBTaxInfo & info )
221 {
222  static CTaxDBFileInfo t; // Function-local static: constructed on first call, reused afterwards.
223  if (t.IsMissingTaxInfo()) return false;
224  // Inclusive search bounds over the index records.
225  Int4 low_index = 0;
226  Int4 high_index = t.GetTaxidCount() - 1;
227 
228  const char * Data = t.GetDataPtr();
229  const CSeqDBTaxId* Index = t.GetIndexPtr();
230  TTaxId low_taxid = Index[low_index ].GetTaxId();
231  TTaxId high_taxid = Index[high_index].GetTaxId();
232  // Reject ids outside the range covered by the index.
233  if((tax_id < low_taxid) || (tax_id > high_taxid))
234  return false;
235  // Bisect; old_index detects when the midpoint stops moving.
236  Int4 new_index = (low_index+high_index)/2;
237  Int4 old_index = new_index;
238 
239  while(1) {
240  TTaxId curr_taxid = Index[new_index].GetTaxId();
241 
242  if (tax_id < curr_taxid) {
243  high_index = new_index;
244  } else if (tax_id > curr_taxid){
245  low_index = new_index;
246  } else { /* Got it ! */
247  break;
248  }
249  // Bounds are set to new_index itself, never past it, so the search ends once the midpoint repeats.
250  new_index = (low_index+high_index)/2;
251  if (new_index == old_index) {
252  if (tax_id > curr_taxid) {
253  new_index++; // Midpoint rounded down onto the lower bound; the entry above is the only one left to test.
254  }
255  break;
256  }
257  old_index = new_index;
258  }
259  // The loop can also stop on a miss, so the candidate is re-checked here.
260  if (tax_id == Index[new_index].GetTaxId()) {
261  info.taxid = tax_id; // Written even if the record later fails to parse.
262  // The record text runs from this entry's offset up to the next entry's offset.
263  Uint4 begin_data(Index[new_index].GetOffset());
264  Uint4 end_data(0);
265 
266  if (new_index == high_index) {
267  // Last index is special... (high_index is only lowered onto entries greater than tax_id, so a match here is the final record)
268  end_data = Uint4(t.GetDataFileSize()); // NOTE(review): size_t narrowed to Uint4.
269 
270  if (end_data < begin_data) {
271  // Should not happen.
272  ERR_POST( "Error: Offset error at end of taxdb file.");
273  return false;
274  }
275  } else {
276  end_data = (Index[new_index+1].GetOffset()); // NOTE(review): not checked against begin_data or the data file size.
277  }
278 
279  const char * start_ptr = &Data[begin_data];
280 
281  CSeqDB_Substring buffer(start_ptr, start_ptr + (end_data - begin_data));
282  CSeqDB_Substring sci, com, blast, king;
283  bool rc1, rc2, rc3;
284  // Tab-separated fields: scientific name, common name, blast name; whatever remains is the kingdom.
285  rc1 = SeqDB_SplitString(buffer, sci, '\t');
286  rc2 = SeqDB_SplitString(buffer, com, '\t');
287  rc3 = SeqDB_SplitString(buffer, blast, '\t');
288  king = buffer;
289  // All three splits must succeed and the remaining (kingdom) text must be non-empty.
290  if (rc1 && rc2 && rc3 && buffer.Size()) {
291  sci .GetString(info.scientific_name);
292  com .GetString(info.common_name);
293  blast .GetString(info.blast_name);
294  king .GetString(info.s_kingdom);
295 
296  return true;
297  }
298  }
299 
300  return false;
301 }
302 
304 
CFile –.
Definition: ncbifile.hpp:1604
CMemoryFile –.
Definition: ncbifile.hpp:2860
CSeqDBTaxId class.
Definition: seqdbtax.cpp:52
Uint4 m_Offset
The offset of the start of the taxonomy data.
Definition: seqdbtax.cpp:83
Int4 GetOffset() const
Return the offset field (in host order)
Definition: seqdbtax.cpp:70
TTaxId GetTaxId() const
Return the taxonomic identifier field (in host order)
Definition: seqdbtax.cpp:64
Uint4 m_Taxid
The taxonomic identifier.
Definition: seqdbtax.cpp:80
CSeqDBTaxId(const CSeqDBTaxId &)
This structure should not be copy constructed.
CSeqDBTaxId()
Constructor.
Definition: seqdbtax.cpp:58
static bool GetTaxNames(TTaxId tax_id, SSeqDBTaxInfo &info)
Get the taxonomy names for a given tax id.
Definition: seqdbtax.cpp:219
String slicing.
void GetString(string &s) const
Return the data by assigning it to a string.
const char * GetDataPtr()
Definition: seqdbtax.cpp:94
bool m_MissingDB
Indicator if tax db files are missing.
Definition: seqdbtax.cpp:120
char * m_DataPtr
Definition: seqdbtax.cpp:114
unique_ptr< CMemoryFile > m_IndexFileMap
Memory map of the index file.
Definition: seqdbtax.cpp:111
const Int4 GetTaxidCount()
Definition: seqdbtax.cpp:97
const CSeqDBTaxId * GetIndexPtr()
Definition: seqdbtax.cpp:95
unique_ptr< CMemoryFile > m_DataFileMap
Definition: seqdbtax.cpp:112
size_t m_DataFileSize
Definition: seqdbtax.cpp:116
string m_IndexFN
The filename of the taxonomic db index file.
Definition: seqdbtax.cpp:102
bool IsMissingTaxInfo()
Definition: seqdbtax.cpp:96
CSeqDBTaxId * m_IndexPtr
Definition: seqdbtax.cpp:113
string m_DataFN
The filename of the taxonomic db data file.
Definition: seqdbtax.cpp:105
size_t GetDataFileSize()
Definition: seqdbtax.cpp:98
Int4 m_AllTaxidCount
Total number of taxids in the database.
Definition: seqdbtax.cpp:108
#define false
Definition: bool.h:36
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
#define TAX_ID_FROM(T, value)
Definition: ncbimisc.hpp:1111
#define NULL
Definition: ncbistd.hpp:225
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
Int8 GetLength(void) const
Get size of file.
Definition: ncbifile.cpp:3204
TTaxId GetTaxId(const CBioseq_Handle &handle)
return the tax-id associated with a given sequence.
Definition: sequence.cpp:274
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
Definition of all error codes used in objtools libraries.
static MDB_envinfo info
Definition: mdb_load.c:37
EIPRangeType t
Definition: ncbi_localip.c:101
static pcre_uint8 * buffer
Definition: pcretest.c:1051
string SeqDB_ResolveDbPath(const string &filename)
Resolve a file path using SeqDB's path algorithms.
bool SeqDB_SplitString(CSeqDB_Substring &buffer, CSeqDB_Substring &front, char delim)
Parse a prefix from a substring.
T SeqDB_GetStdOrd(const T *stdord_obj)
Read a network order integer value.
Defines database volume access classes.
SSeqDBTaxInfo.
#define _ASSERT
Modified on Sun Apr 14 05:27:29 2024 by modify_doxy.py rev. 669887