46 #define ISAM_VERSION 1
49 #define DEFAULT_NISAM_SIZE 256
52 #define DEFAULT_SISAM_SIZE 64
55 #define MEMORY_ONLY_PAGE_SIZE 1
66 bool found_index_file =
103 TIndx disk_file_length(0);
104 bool found_data_file =
126 Int4 num_elements(0);
165 while(Stop >= Start) {
166 SampleNum = ((
Uint4)(Stop + Start)) >> 1;
171 const void* keydatap(0);
197 Start = SampleNum +1;
228 "Error: Unable to use ISAM index in batch mode.");
248 int gilist_index = 0;
251 const void * data_page (0);
254 int start = 0, num_elements = 0;
261 for(
int i = 0;
i < num_elements;
i++) {
276 if (gilist_index < gilist_size) {
285 if (isam_data < vol_end) {
308 bool sameAccession =
false;
313 if(acc2 == accession) {
314 sameAccession =
true;
318 return sameAccession;
325 bool sameAccession =
false;
326 if(currIndex < num_keys - 1) {
329 return sameAccession;
339 if (! gilist_size)
return;
348 "Error: Unable to use ISAM index in batch mode.");
352 vector<string> sample_keys;
353 vector<TIndx> page_offs;
365 int gilist_index = 0;
366 int sample_index = 0;
381 for(
int i = 0;
i < num_keys;
i++) {
385 if (gilist_index < gilist_size) {
391 if (vals[
i] < vol_end) {
434 Int4 last = Start + NumElements - 1;
436 const void * KeyDataPage =
NULL;
437 const void * KeyDataPageStart =
NULL;
445 KeyDataPage = (
char *)KeyDataPageStart - Start *
m_TermSize;
458 }
else if (Key < Number) {
466 if (found ==
false) {
481 *Index = Start + current;
532 TIndx offset_begin = KeyOffset;
533 TIndx term_end = KeyOffset + term_in.size() + 1;
534 TIndx map_end = term_end + at_least;
536 if (map_end > file_length) {
537 map_end = file_length;
539 if (term_end > map_end) {
541 result =
int(file_length - offset_begin);
550 file_data + term_in.size() + 1,
553 if (dc_result != -1) {
597 const char * file_data = begin;
598 int bytes =
int(end - begin);
600 for(
i = 0; (
i < bytes) &&
i < (
int) term_in.size();
i++) {
601 char ch1 = term_in[
i];
602 char ch2 = file_data[
i];
619 const char * p = file_data +
i;
621 while((p < end) && ((*p) ==
' ')) {
625 if (((p == end) ||
ENDS_ISAM_KEY(*p)) && (
i == (
int) term_in.size())) {
638 vector<TIndx> & indices_out,
639 vector<string> & keys_out,
640 vector<string> & data_out)
644 bool ignore_case =
true;
648 const char * indexp(beginp);
649 bool found_match(
false);
651 while (indexp < endp) {
665 indices_out.push_back(page_index + TermNum);
690 vector<TIndx> & indices_out,
691 vector<string> & keys_out,
692 vector<string> & data_out)
699 bool ignore_case =
true;
704 bool done_b(
false), done_e(
false);
706 const char * beginp(0);
707 const char * endp(0);
712 while(! (done_b && done_e)) {
713 if (sample_index < pre_amt) {
717 beg_off = sample_index - pre_amt;
724 end_off = sample_index + post_amt;
727 x_LoadPage(beg_off, end_off, & beginp, & endp);
735 if (diff_begin != -1) {
743 const char * last_term(0);
744 const char * p(endp-1);
748 enum { eEndNulls, eLastTerm } search_stage = eEndNulls;
753 if (search_stage == eEndNulls) {
755 search_stage = eLastTerm;
776 if (diff_end != -1) {
794 const char * map_end,
795 vector<string> & keys_out,
796 vector<string> & data_out)
798 const char * data_ptr(0);
799 const char * p(key_start);
805 keys_out.push_back(
string(key_start, data_ptr));
806 data_out.push_back(
string(data_ptr+1, p));
808 keys_out.push_back(
string(key_start, p));
809 data_out.push_back(
"");
827 TIndx offset_begin = sample_offset + (sample_num *
sizeof(
Uint4));
844 const char * key_offset_addr =
849 for(
int i = 0;
i<length;
i++) {
850 if (! key_offset_addr[
i]) {
857 str.assign(key_offset_addr, length);
874 bool ignore_case(
true);
882 TIndx offset_begin = SampleOffset + (SampleNum *
sizeof(
Uint4));
901 const char ** beginp,
907 _ASSERT(SampleNum2 > SampleNum1);
935 vector<string> & terms_out,
936 vector<string> & values_out,
937 vector<TIndx> & indices_out)
943 bool short_match(
false);
944 bool follow_match(
false);
946 size_t preexisting_data_count = values_out.size();
957 bool ignore_case =
true;
964 int Length = (
int) term_in.size();
977 while(Stop >= Start) {
978 SampleNum = ((
Uint4)(Stop + Start)) >> 1;
991 if (BytesToEnd > (
TIndx) max_lines_2) {
992 BytesToEnd = max_lines_2;
1009 if (short_match && (diff >= Length)) {
1013 while(SampleNum > 0) {
1027 if (prefix != term_in) {
1035 found_short = SampleNum + 1;
1050 found_short = SampleNum;
1059 ?
tolower((
unsigned char) term_in[diff]) <
tolower((
unsigned char) KeyData[diff])
1060 : term_in[diff] < KeyData[diff]) {
1063 Start = SampleNum + 1;
1076 const char * beginp(0);
1077 const char * endp(0);
1079 x_LoadPage(SampleNum, SampleNum + 1, & beginp, & endp);
1095 if (preexisting_data_count == values_out.size()) {
1108 m_IdentType (ident_type),
1109 m_IndexLease (atlas),
1110 m_DataLease (atlas),
1117 m_Initialized (
false),
1118 m_KeySampleOffset(0),
1119 m_TestNonUnique (
true),
1128 switch(ident_type) {
1143 "Error: ident type argument not valid");
1155 string msg(
"Error: Could not open input file (");
1175 string & index_name,
1179 (!
isalpha((
unsigned char) prot_nucl)) ||
1180 (!
isalpha((
unsigned char) file_ext_char))) {
1184 "Error: argument not valid");
1187 index_name.reserve(
dbname.size() + 4);
1188 data_name.reserve(
dbname.size() + 4);
1192 index_name += prot_nucl;
1193 index_name += file_ext_char;
1195 data_name = index_name;
1204 string iname, dname;
1236 vector<TOid> & oids,
1238 bool & version_check)
1241 bool strip_version = version_check;
1242 version_check =
false;
1252 string accession(
string(
"gb|") + acc +
"|");
1253 string locus_str(
string(
"gb||") + acc);
1257 vector<string> keys_out;
1258 vector<string> data_out;
1259 vector<TIndx> indices_out;
1265 indices_out)) < 0) {
1277 indices_out)) < 0) {
1291 indices_out)) < 0) {
1301 if ((! found) && strip_version) {
1302 size_t pos = acc.find(
".");
1304 bool is_version =
false;
1306 if (pos != string::npos) {
1307 int ver_len =
static_cast<int>(acc.size() - pos) - 1;
1309 is_version = (ver_len <= 3 && ver_len >= 1);
1311 for(
size_t vp = pos+1; vp < acc.size(); vp++) {
1320 string nover(acc, 0, pos);
1328 if (data_out.size()) {
1329 version_check =
true;
1360 indices_out)) < 0)) {
1371 ITERATE(vector<string>, iter, data_out) {
1372 oids.push_back(atoi((*iter).c_str()));
1382 cerr <<
" this should be derived from readdb_acc2fastaEx().." << endl;
1398 x_TranslateGiList<TGi>(vol_start, ids);
1402 x_TranslateGiList<TTi>(vol_start, ids);
1406 x_TranslateGiList<string>(vol_start, ids);
1410 x_TranslateGiList<TPig>(vol_start, ids);
1416 "Error: Wrong type of idlist specified.");
1472 int num_elements(0);
1474 const void * data_page(0);
1508 elem_index = num_elements - 1;
1520 const char * beginp(0);
1521 const char * endp(0);
1526 x_LoadPage(Start, Start + 1, & beginp, & endp);
1530 vector<string> keys_out;
1531 vector<string> data_out;
1545 x_LoadPage(Stop, Stop + 1, & beginp, & endp);
1549 const char * lastp(0);
1550 const char * indexp(beginp);
1552 while (indexp < endp) {
1667 vector<TOid> & oids)
1682 vector<string> keys_out;
1683 vector<string> data_out;
1684 vector<TIndx> indices_out;
1689 indices_out)) < 0) {
1699 ITERATE(vector<string>, iter, data_out) {
1700 oids.push_back(atoi(iter->c_str()));
bool GetFileSizeL(const string &fname, TIndx &length)
Get size of a file.
const char * GetFileDataPtr(const string &fname, TIndx offset)
Get a pointer to the specified offset.
void Init(const string &filename)
Initializes a memory map object.
void Clear()
Clears the memory map object.
bool OutsideLastBound(Int8 ident)
Returns true if the provided integer compares as higher than the assigned upper boundary for this ISA...
bool IsSet()
Returns true if this object has an assigned value.
string GetString() const
Fetch the string value of this object.
void SetString(const string &ident)
Assign a string value to this object.
void SetNumeric(Int8 ident)
Assign a numeric value to this object.
Int8 GetNumeric() const
Fetch the numeric value of this object.
bool OutsideFirstBound(Int8 ident)
Returns true if the provided integer compares as lower than the assigned lower boundary for this ISAM...
EErrorCode x_StringSearch(const string &term_in, vector< string > &term_out, vector< string > &value_out, vector< TIndx > &index_out)
String identifier lookup.
EErrorCode x_SearchIndexNumeric(Int8 Number, int *Data, Uint4 *Index, Int4 &SampleNum, bool &done)
Index file search.
CSeqDBIsam(CSeqDBAtlas &atlas, const string &dbname, char prot_nucl, char file_ext_char, ESeqDBIdType ident_type)
Constructor.
@ eNumericLongId
This type is not supported.
@ eString
This type is not supported.
@ eNumericNoData
Numeric database with Key/Value pairs in the index file.
void x_SearchNegativeMulti(int vol_start, int vol_end, CSeqDBNegativeList &gis, bool use_tis)
Negative ID List Translation.
CSeqDBFileMemMap m_DataLease
A persistent lease on the ISAM data file.
TIndx m_IndexFileLength
The length of the ISAM index file.
bool m_LongId
Use Uint8 for the key.
int x_DiffCharLease(const string &term_in, CSeqDBFileMemMap &lease, const string &file_name, TIndx file_length, Uint4 at_least, TIndx KeyOffset, bool ignore_case)
Find the first character to differ in two strings.
int x_DiffChar(const string &term_in, const char *begin, const char *end, bool ignore_case)
Find the first character to differ in two strings.
int x_GetPageNumElements(Int4 SampleNum, Int4 *Start)
Determine the number of elements in the data page.
ESeqDBIdType m_IdentType
The type of identifier this class uses.
SIsamKey m_LastKey
Last volume key.
Int4 m_IdxOption
Options set by upper layer.
void x_LoadData(CSeqDBFileMemMap &lease, vector< T > &keys, vector< int > &vals, int num_keys, TIndx begin)
Load and extract a data page into array at once.
void x_GetDataElement(const void *dpage, int index, Int8 &key, int &data)
Get a particular data element from a data page.
Int4 m_NumSamples
Number of terms in ISAM index.
void HashToOids(unsigned hash, vector< TOid > &oids)
Sequence hash lookup.
EErrorCode
Exit conditions occurring in this code.
@ eBadVersion
The format version of the ISAM file is unsupported.
@ eBadType
The requested ISAM type did not match the file.
@ eWrongFile
The file was not found, or was the wrong length.
@ eNoError
Lookup was successful.
@ eInitFailed
Initialization failed.
int x_DiffSample(const string &term_in, Uint4 SampleNum, TIndx &KeyOffset)
Find the first character to differ in two strings.
Uint8 x_GetNumericKey(const void *p)
void x_LoadIndex(CSeqDBFileMemMap &lease, vector< T > &keys, vector< TIndx > &offs)
Load and extract all index samples into array at once.
bool x_SparseStringToOids(const string &acc, vector< int > &oids, bool adjusted)
Lookup a string in a sparse table.
void x_FindIndexBounds()
Find the least and greatest keys in this ISAM file.
Int4 m_NumTerms
Number of terms in database.
void IdsToOids(int vol_start, int vol_end, CSeqDBGiList &ids)
Translate Gis and Tis to Oids for the given ID list.
EErrorCode x_SearchDataNumeric(Int8 Number, int *Data, Uint4 *Index, Int4 SampleNum)
Data file search.
int TOid
This class works with OIDs relative to a specific volume.
bool m_Initialized
Flag indicating whether initialization has been done.
TIndx x_GetIndexKeyOffset(TIndx sample_offset, Uint4 sample_num)
Get the offset of the specified sample.
static void x_MakeFilenames(const string &dbname, char prot_nucl, char file_ext_char, string &index_name, string &data_name)
Make filenames for ISAM file.
static void x_Lower(string &s)
Converts a string to lower case.
bool x_OutOfBounds(Int8 key)
Check whether a numeric key is within this volume's bounds.
void x_SearchNegativeMultiSeq(int vol_start, int vol_end, CSeqDBNegativeList &gis)
EErrorCode x_InitSearch(void)
Initialize the search object.
void x_GetIndexString(TIndx key_offset, int length, string &prefix, bool trim_to_null)
Read a string from the index file.
void x_ExtractPageData(const string &term_in, TIndx page_index, const char *beginp, const char *endp, vector< TIndx > &indices_out, vector< string > &keys_out, vector< string > &data_out)
Find matches in the given memory area of a string ISAM file.
void GetIdBounds(Int8 &low_id, Int8 &high_id, int &count)
Get Numeric Bounds.
Int4 m_PageSize
Page size of ISAM index.
TIndx m_DataFileLength
The length of the ISAM data file.
void UnLease()
Return any memory held by this object to the atlas.
int m_Type
The format type of database files found (eNumeric or eString).
TIndx m_KeySampleOffset
Offset of samples in index file.
SIsamKey m_FirstKey
First volume key.
void x_LoadPage(TIndx SampleNum1, TIndx SampleNum2, const char **beginp, const char **endp)
Map a page into memory.
void x_ExtractAllData(const string &term_in, TIndx sample_index, vector< TIndx > &indices_out, vector< string > &keys_out, vector< string > &data_out)
Find matches in the given page of a string ISAM file.
void x_MapDataPage(int sample_index, int &start, int &num_elements, const void **data_page_begin)
Map a data page.
CSeqDBAtlas::TIndx TIndx
Type which is large enough to span the bytes of an ISAM file.
int x_GetNumericData(const void *p)
void StringToOids(const string &acc, vector< TOid > &oids, bool adjusted, bool &version_check)
String translation.
bool x_FindInNegativeList(CSeqDBNegativeList &ids, int &index, Int8 key, bool use_tis)
Find ID in the negative GI list using PBS.
Int4 m_MaxLineSize
Maximum string length in the database.
void x_ExtractData(const char *key_start, const char *entry_end, vector< string > &key_out, vector< string > &data_out)
Extract the data from a key-value pair in memory.
EErrorCode x_NumericSearch(Int8 Number, int *Data, Uint4 *Index)
Numeric identifier lookup.
string m_DataFname
The filename of the ISAM data file.
static bool IndexExists(const string &dbname, char prot_nucl, char file_ext_char)
Check if a given ISAM index exists.
string m_IndexFname
The filename of the ISAM index file.
int m_TermSize
Size of the numeric key-data pair.
bool x_IdentToOid(Int8 id, TOid &oid)
Numeric identifier lookup.
CSeqDBFileMemMap m_IndexLease
A persistent lease on the ISAM index file.
CSeqDBAtlas & m_Atlas
The memory management layer.
int GetNumTis() const
Get the number of TIs in the array.
void AddIncludedOid(int oid)
Include an OID in the iteration.
void AddVisibleOid(int oid)
Indicate a visible OID.
int GetNumGis() const
Get the number of GIs in the array.
int GetNumSis() const
Get the number of SeqIds in the array.
void InsureOrder()
Sort list if not already sorted.
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
static const char * str(char *buf, int n)
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
virtual bool Exists(void) const
Check existence of file.
const string AsFastaString(void) const
@ fParse_RawText
Try to ID raw non-numeric accessions.
@ fParse_AnyLocal
Treat otherwise unidentified strings as local accessions as long as they don't resemble FASTA-style I...
int32_t Int4
4-byte (32-bit) signed integer
uint32_t Uint4
4-byte (32-bit) unsigned integer
int64_t Int8
8-byte (64-bit) signed integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
static string UIntToString(unsigned int value, TNumToStringFlags flags=0, int base=10)
Convert UInt to string.
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
char * dbname(DBPROCESS *dbproc)
Get name of current database.
unsigned int
A callback function used to compare two keys in a database.
const string version
version string
const struct ncbi::grid::netcache::search::fields::SIZE size
const struct ncbi::grid::netcache::search::fields::KEY key
static const BitmapCharRec ch1
static const BitmapCharRec ch2
Useful/utility classes and methods.
ESeqDBIdType
Various identifier formats used in Id lookup.
@ eStringId
Some sequence sources use string identifiers.
@ eTiId
Trace ID is a numeric identifier for Trace sequences.
@ ePigId
Each PIG identifier refers to exactly one protein sequence.
@ eHashId
Sequence hash values can be used as identifiers.
#define SEQDB_ISEOL(x)
Macro for EOL chars.
T SeqDB_GetStdOrd(const T *stdord_obj)
Read a network order integer value.
USING_SCOPE(objects)
Place these definitions in the ncbi namespace.
#define DEFAULT_SISAM_SIZE
Default page size for string indices.
static bool ENDS_ISAM_KEY(char P)
Returns true if the character is a terminator for an ISAM key.
#define DEFAULT_NISAM_SIZE
Default page size for numeric indices.
#define ISAM_VERSION
Format version of the ISAM files.
#define MEMORY_ONLY_PAGE_SIZE
Special page size value which indicates a memory-only string index.
static bool s_IsSameAccession(string acc1, string acc2)
static char s_SeqDBIsam_NullifyEOLs(char c)
Return NUL for nulls or EOL characters.
const char ISAM_DATA_CHAR
The terminating character for string ISAM keys when data is present.
ISAM index database access object.
static SLJIT_INLINE sljit_ins msg(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)