1 #ifndef OBJTOOLS_READERS_SEQDB__SEQDBISAM_HPP
2 #define OBJTOOLS_READERS_SEQDB__SEQDBISAM_HPP
52 template <
class T>
static inline void
58 while( (gi_index < gis_size)
64 while( (gi_index + jump < gis_size)
73 while( (gi_index < gis_size)
79 template <
class T>
static inline void
85 while( (index < num_keys)
86 && (keys[index] <= target)) {
91 while( (index + jump < num_keys)
92 && (keys[index + jump] <= target)) {
102 template <
class T>
static inline void
109 while( (gi_index < gis_size)
206 return x_IdentToOid(pig, oid);
228 return x_IdentToOid(
id, oid);
247 void IdsToOids(
int vol_start,
277 void IdsToOids(
int vol_start,
313 void StringToOids(
const string & acc,
316 bool & version_check);
350 void HashToOids(
unsigned hash,
351 vector<TOid> & oids);
368 void GetIdBounds(
Int8 & low_id,
384 void GetIdBounds(
string & low_id,
395 static bool IndexExists(
const string &
dbname,
413 : m_IsSet(
false), m_NKey(-1)
453 return (m_IsSet && (ident < m_NKey));
460 return (m_IsSet && (ident < m_SKey));
467 return (m_IsSet && (ident > m_NKey));
474 return (m_IsSet && (ident > m_SKey));
502 vector<TIndx> & offs)
507 for (
int index=0; index < m_NumSamples; ++index) {
508 keys.push_back(
static_cast<T>(x_GetNumericKey(keydatap)));
510 offs.push_back(index * m_PageSize * m_TermSize);
511 keydatap += m_TermSize;
514 offs.push_back(m_NumTerms * m_TermSize);
529 for (
int index=0; index < num_keys; ++index) {
530 keys.push_back(
static_cast<T>(x_GetNumericKey(keydatap)));
531 vals.push_back(x_GetNumericData(keydatap));
532 keydatap += m_TermSize;
554 if (! gilist_size)
return;
558 if(m_Initialized ==
false) {
561 "Error: Unable to use ISAM index in batch mode.");
564 vector<T> sample_keys;
565 vector<TIndx> page_offs;
569 sample_keys.reserve(m_NumSamples);
570 page_offs.reserve(m_NumSamples + 1);
571 keys.reserve(m_PageSize);
572 vals.reserve(m_PageSize);
574 x_LoadIndex(m_IndexLease, sample_keys, page_offs);
576 int gilist_index = 0;
577 int sample_index = 0;
579 while((gilist_index < gilist_size) && (sample_index < m_NumSamples)) {
581 s_AdvanceGiList<T>(gis, gilist_index, gilist_size,
582 sample_keys[sample_index]);
584 if (gilist_index >= gilist_size)
break;
586 s_AdvanceKeyList<T>(sample_keys, sample_index, m_NumSamples,
593 int num_keys = m_PageSize;
594 if (sample_index + 1 == m_NumSamples) {
595 num_keys = m_NumTerms - sample_index * m_PageSize;
597 x_LoadData(m_DataLease, keys, vals, num_keys, page_offs[sample_index]);
601 while ((gilist_index < gilist_size) && (index < num_keys)) {
603 s_AdvanceKeyList<T>(keys, index, num_keys,
606 s_SetTranslation<T>(gis, gilist_index, gilist_size,
607 keys[index], vals[index] + vol_start);
610 if (index >= num_keys)
break;
612 s_AdvanceGiList<T>(gis, gilist_index, gilist_size, keys[index]);
614 s_SetTranslation<T>(gis, gilist_index, gilist_size,
615 keys[index], vals[index] + vol_start);
637 bool x_IdentToOid(
Int8 id,
661 x_SearchIndexNumeric(
Int8 Number,
684 x_SearchNegativeMulti(
int vol_start,
691 x_SearchNegativeMultiSeq(
int vol_start,
715 x_SearchDataNumeric(
Int8 Number,
736 x_NumericSearch(
Int8 Number,
758 x_StringSearch(
const string & term_in,
759 vector<string> & term_out,
760 vector<string> & value_out,
761 vector<TIndx> & index_out);
788 int x_GetPageNumElements(
Int4 SampleNum,
807 bool x_SparseStringToOids(
const string & acc,
838 x_DiffCharLease(
const string & term_in,
865 x_DiffChar(
const string & term_in,
884 void x_ExtractData(
const char * key_start,
885 const char * entry_end,
886 vector<string> & key_out,
887 vector<string> & data_out);
903 TIndx x_GetIndexKeyOffset(
TIndx sample_offset,
921 void x_GetIndexString(
TIndx key_offset,
942 int x_DiffSample(
const string & term_in,
965 void x_ExtractAllData(
const string & term_in,
967 vector<TIndx> & indices_out,
968 vector<string> & keys_out,
969 vector<string> & data_out);
991 void x_ExtractPageData(
const string & term_in,
995 vector<TIndx> & indices_out,
996 vector<string> & keys_out,
997 vector<string> & data_out);
1016 void x_LoadPage(
TIndx SampleNum1,
1018 const char ** beginp,
1019 const char ** endp);
1108 void x_MapDataPage(
int sample_index,
1111 const void ** data_page_begin);
1119 void x_GetDataElement(
const void * dpage,
1124 void x_GetDataElement(
const void * dpage,
1130 void x_FindIndexBounds();
1140 bool x_OutOfBounds(
string key);
1145 for(
size_t i = 0;
i < s.size();
i++) {
1160 return (ids.
GetSi(index));
1171 static void x_MakeFilenames(
const string &
dbname,
1174 string & index_name,
1175 string & data_name);
1271 const char * keydatap = begin;
1272 const char * key_begin = keydatap;
1273 while (*keydatap != 0x02) ++keydatap;
1276 key_begin = ++keydatap;
1277 while (*keydatap != 0x0a) ++keydatap;
1291 const void * keydatap = 0;
1302 if (key_in < key_out) {
1304 }
else if (key_in > key_out) {
1320 const void * keydatap = 0;
1333 inline void CSeqDBIsam::x_LoadIndex<TGi>(
1336 vector<TIndx> & offs
1353 inline void CSeqDBIsam::x_LoadData<TGi>(
1364 for (
int index=0; index < num_keys; ++index) {
1371 template <>
inline void
1373 vector<string> & keys,
1374 vector<TIndx> & offs)
1393 const char * keydatap = (
const char *) lease.
GetFileDataPtr(offset_begin) - 1;
1396 const char * key_begin = ++ keydatap;
1397 while (*keydatap != 0x02) ++keydatap;
1398 keys.push_back(
string(key_begin, keydatap));
1406 template <>
inline void
1408 vector<string> & keys,
1413 const char * keydatap = (
const char *) lease.
GetFileDataPtr(begin) - 1;
1415 for (
int index=0; index < num_keys; ++index) {
1417 const char * key_begin = ++keydatap;
1418 while (*keydatap != 0x02) ++keydatap;
1419 keys.push_back(
string(key_begin, keydatap));
1421 key_begin = ++keydatap;
1422 while (*keydatap != 0x0a) ++keydatap;
1438 while((index < ids_size) && (
x_GetId(ids, index, use_tis) <
key)) {
1443 while((index + jump) < ids_size &&
1452 if ((index < ids_size) && (
x_GetId(ids,index,use_tis) ==
key)) {
1470 while((index < ids_size) && (
x_GetId(ids, index) <
key)) {
1475 while((index + jump) < ids_size &&
1484 if ((index < ids_size) && (
x_GetId(ids,index) ==
key)) {
1496 const void ** data_page_begin)
CNcbiStreamoff TIndx
The type used for file offsets.
const char * GetFileDataPtr(const string &fname, TIndx offset)
Get a pointer to the specified offset.
bool IsValueSet(int index) const
T GetKey(int index) const
void SetValue(int index, int oid)
@ eGi
The array is sorted by GI.
void InsureOrder(ESortOrder order)
Sort if necessary to ensure order of elements.
Stores a key for an ISAM file.
bool OutsideLastBound(Int8 ident)
Returns true if the provided integer compares as higher than the assigned upper boundary for this ISA...
string m_SKey
The key, if it is a string.
bool IsSet()
Returns true if this object has an assigned value.
string GetString() const
Fetch the string value of this object.
void SetString(const string &ident)
Assign a string value to this object.
Int8 m_NKey
The key, if it is a number.
void SetNumeric(Int8 ident)
Assign a numeric value to this object.
Int8 GetNumeric() const
Fetch the numeric value of this object.
bool m_IsSet
True if this object has an assigned value.
bool OutsideLastBound(const string &ident)
Returns true if the provided string compares as higher than the assigned upper boundary for this ISAM ...
bool OutsideFirstBound(Int8 ident)
Returns true if the provided integer compares as lower than the assigned lower boundary for this ISAM...
bool OutsideFirstBound(const string &ident)
Returns true if the provided string compares as lower than the assigned lower boundary for this ISAM ...
Int4 m_LastOffset
First and last offsets of last page.
bool IdToOid(Int8 id, TOid &oid)
GI or TI translation.
EIsamDbType
Types of database this class can access.
CSeqDBGiList::SGiOid TGiOid
Import the type representing one GI, OID association.
CSeqDBFileMemMap m_DataLease
A persistent lease on the ISAM data file.
TIndx m_IndexFileLength
The length of the ISAM index file.
bool m_LongId
Use Uint8 for the key.
int x_GetPageNumElements(Int4 SampleNum, Int4 *Start)
Determine the number of elements in the data page.
Int8 TId
Type large enough to hold any numerical ID.
ESeqDBIdType m_IdentType
The type of identifier this class uses.
SIsamKey m_LastKey
Last volume key.
static string x_GetId(CSeqDBNegativeList &ids, int index)
void x_TranslateGiList(int vol_start, CSeqDBGiList &gis)
GiList Translation.
Int4 m_IdxOption
Options set by upper layer.
void x_LoadData(CSeqDBFileMemMap &lease, vector< T > &keys, vector< int > &vals, int num_keys, TIndx begin)
Load and extract a data page into array at once.
void x_LoadStringData(const char *begin, string &key, int &data)
Int8 TTi
PIG identifiers for numeric indices over protein volumes.
void x_GetNumericSample(CSeqDBFileMemMap &index_lease, int index, Int8 &key_out, int &data_out)
Get a sample key value from a numeric index.
void x_GetDataElement(const void *dpage, int index, Int8 &key, int &data)
Get a particular data element from a data page.
Int4 m_NumSamples
Number of terms in ISAM index.
int x_TestNumericSample(CSeqDBFileMemMap &index_lease, int index, Int8 key_in, Int8 &key_out, int &data_out)
Test a sample key value from a numeric index.
EErrorCode
Exit conditions occurring in this code.
bool PigToOid(TPig pig, TOid &oid)
PIG translation.
Uint8 x_GetNumericKey(const void *p)
void x_LoadIndex(CSeqDBFileMemMap &lease, vector< T > &keys, vector< TIndx > &offs)
Load and extract all index samples into array at once.
Int4 m_NumTerms
Number of terms in database.
int TOid
This class works with OIDs relative to a specific volume.
bool m_Initialized
Flag indicating whether initialization has been done.
bool SeqidToOid(const string &acc, TOid &oid)
Seq-id translation.
static void x_Lower(string &s)
Converts a string to lower case.
Int4 m_PageSize
Page size of ISAM index.
TIndx m_DataFileLength
The length of the ISAM data file.
int m_Type
The format type of database files found (eNumeric or eString).
bool m_TestNonUnique
Check if data for String ISAM is sorted.
TIndx m_KeySampleOffset
Offset of samples in index file.
Int4 m_FirstOffset
First and last offsets of last page.
char * m_FileStart
Pointer to index file if no memmap.
SIsamKey m_FirstKey
First volume key.
static Int8 x_GetId(CSeqDBNegativeList &ids, int index, bool use_tis)
Fetch a GI or TI from a GI list.
void x_MapDataPage(int sample_index, int &start, int &num_elements, const void **data_page_begin)
Map a data page.
CSeqDBAtlas::TIndx TIndx
Type which is large enough to span the bytes of an ISAM file.
int x_GetNumericData(const void *p)
bool x_FindInNegativeList(CSeqDBNegativeList &ids, int &index, Int8 key, bool use_tis)
Find ID in the negative GI list using PBS.
Int4 m_MaxLineSize
Maximum string length in the database.
string m_DataFname
The filename of the ISAM data file.
string m_IndexFname
The filename of the ISAM index file.
int m_TermSize
Size of the numeric key-data pair.
CSeqDBFileMemMap m_IndexLease
A persistent lease on the ISAM index file.
CSeqDBAtlas & m_Atlas
The memory management layer.
TGi GetGi(int index) const
Access an element of the GI array.
TTi GetTi(int index) const
Access an element of the TI array.
const string GetSi(int index) const
Access an element of the SeqId array.
#define GI_FROM(T, value)
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
int64_t Int8
8-byte (64-bit) signed integer
uint64_t Uint8
8-byte (64-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
static unsigned int StringToUInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to unsigned int.
#define NCBI_XOBJREAD_EXPORT
char * dbname(DBPROCESS *dbproc)
Get name of current database.
const struct ncbi::grid::netcache::search::fields::KEY key
const GenericPointer< typename T::ValueType > T2 value
Int4 TOid
Ordinal ID in BLAST databases.
ESeqDBIdType
Various identifier formats used in Id lookup.
@ eTiId
Genomic ID is a relatively stable numeric identifier for sequences.
@ ePigId
Trace ID is a numeric identifier for Trace sequences.
CSeqDBAtlas::TIndx TIndx
Index file.
File access objects for CSeqDB.
T SeqDB_GetStdOrd(const T *stdord_obj)
Read a network order integer value.
USING_SCOPE(objects)
Bring the object directory definitions into this scope.
static void s_AdvanceKeyList(const vector< T > &keys, int &index, int num_keys, const T &target)
static void s_AdvanceGiList(CSeqDBGiList &gis, int &gi_index, int gis_size, const T &key)
static void s_SetTranslation(CSeqDBGiList &gis, int &gi_index, int gis_size, const T &key, int value)
Structure that holds GI,OID pairs.