NCBI C++ ToolKit
seqdbisam.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seqdbisam.cpp 100101 2023-06-15 14:10:29Z merezhuk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Kevin Bealer
27  *
28  */
29 //IRENA: ALL locking removed from this file; CSeqDBLockHold & stays in some functions but is not used
30 /// @file seqdbisam.cpp
31 /// Implementation for the CSeqDBIsam class, which manages an ISAM
32 /// index of some particular kind of identifiers.
33 #include <ncbi_pch.hpp>
37 #include <corelib/ncbiutil.hpp>
38 
39 /// Place these definitions in the ncbi namespace
41 
42 /// Import this namespace
44 
45 /// Format version of the ISAM files
46 #define ISAM_VERSION 1
47 
48 /// Default page size for numeric indices
49 #define DEFAULT_NISAM_SIZE 256
50 
51 /// Default page size for string indices
52 #define DEFAULT_SISAM_SIZE 64
53 
54 /// Special page size value which indicates a memory-only string index
55 #define MEMORY_ONLY_PAGE_SIZE 1
56 
57 
60 {
61  if(m_Initialized == true)
62  return eNoError;
63 
64  TIndx info_needed = 10 * sizeof(Int4);
65 
66  bool found_index_file =
68 
69  if ((! found_index_file) || (m_IndexFileLength < info_needed)) {
70  return eWrongFile;
71  }
72 
73 
75 
76  // Check for consistence of files and parameters
77 
78  Int4 Version = SeqDB_GetStdOrd(& FileInfo[0]);
79 
80  if (Version != ISAM_VERSION)
81  return eBadVersion;
82 
83  Int4 IsamType = SeqDB_GetStdOrd(& FileInfo[1]);
84 
85  if (IsamType == eNumericLongId && m_Type == eNumeric) {
86  m_LongId = true;
87  m_TermSize = 12;
88  IsamType = eNumeric;
89  }
90 
91  if (IsamType != m_Type)
92  return eBadType;
93 
98 
100  // Special case of memory-only index
102 
103  TIndx disk_file_length(0);
104  bool found_data_file =
105  m_Atlas.GetFileSizeL(m_DataFname, disk_file_length);
106 
107  if ((! found_data_file) || (m_DataFileLength != disk_file_length)) {
108  return eWrongFile;
109  }
110  }
111 
112  // This space reserved for future use
113 
115 
116  m_KeySampleOffset = (9 * sizeof(Int4));
117 
118  m_Initialized = true;
119 
120  return eNoError;
121 }
122 
124  Int4 * start)
125 {
126  Int4 num_elements(0);
127 
128  *start = sample_num * m_PageSize;
129 
130  if (sample_num + 1 == m_NumSamples) {
131  num_elements = m_NumTerms - *start;
132  } else {
133  num_elements = m_PageSize;
134  }
135 
136  return num_elements;
137 }
138 
141  int * Data,
142  Uint4 * Index,
143  Int4 & SampleNum,
144  bool & done)
145 
146 {
147  if(m_Initialized == false) {
148  done = true;
149  // Return just any error
150  return eInitFailed;
151  }
152 
153  if (x_OutOfBounds(Number)) {
154  done = true;
155  return eNotFound;
156  }
157 
159 
160  // Search the sample file.
161 
162  Int4 Start (0);
163  Int4 Stop (m_NumSamples - 1);
164 
165  while(Stop >= Start) {
166  SampleNum = ((Uint4)(Stop + Start)) >> 1;
167 
168  TIndx offset_begin = m_KeySampleOffset + (m_TermSize * SampleNum);
169  //TIndx offset_end = offset_begin + m_TermSize;
170 
171  const void* keydatap(0);
172 
173  Int8 Key(0);
174 
175  keydatap = m_IndexLease.GetFileDataPtr(m_IndexFname,offset_begin);
176  Key = x_GetNumericKey (keydatap);
177 
178  // If this is an exact match, return the master term number.
179 
180  if (Key == Number) {
181  if (Data != NULL) {
182  *Data = x_GetNumericData(keydatap);
183  }
184 
185  if (Index != NULL)
186  *Index = SampleNum * m_PageSize;
187 
188  done = true;
189  return eNoError;
190  }
191 
192  // Otherwise, search for the next sample.
193 
194  if ( Number < Key )
195  Stop = --SampleNum;
196  else
197  Start = SampleNum +1;
198  }
199 
200  // If the term is out of range altogether, report not finding it.
201 
202  if ( (SampleNum < 0) || (SampleNum >= m_NumSamples)) {
203 
204  if (Data != NULL)
205  *Data = eNotFound;
206 
207  if(Index != NULL)
208  *Index = eNotFound;
209 
210  done = true;
211  return eNotFound;
212  }
213 
214  done = false;
215  return eNoError;
216 }
217 
218 void
220  int vol_end,
221  CSeqDBNegativeList & ids,
222  bool use_tis)
223 
224 {
225  if(m_Initialized == false) {
227  eArgErr,
228  "Error: Unable to use ISAM index in batch mode.");
229  }
230 
231  //m_Atlas.Lock(locked);
232 
233 
234  // We can use Parabolic Binary Search for the negative GI list but
235  // not for the ISAM file data, because in the negative ID list
236  // case, every line of the ISAM data must be looked at.
237 
239 
240  //......................................................................
241  //
242  // Translate the entire Gi List.
243  //
244  //......................................................................
245 
246  int gilist_size = use_tis ? ids.GetNumTis() : ids.GetNumGis();
247 
248  int gilist_index = 0;
249 
250  int sample_index(0);
251  const void * data_page (0);
252 
253  while(sample_index < m_NumSamples) {
254  int start = 0, num_elements = 0;
255 
256  x_MapDataPage(sample_index,
257  start,
258  num_elements,
259  & data_page);
260 
261  for(int i = 0; i < num_elements; i++) {
262  Int8 isam_key(0);
263  int isam_data(0);
264 
265  // 1. Get the ID+OID from the data page.
266 
267  x_GetDataElement(data_page,
268  i,
269  isam_key,
270  isam_data);
271 
272  // 2. Look for it in the negative id list.
273 
274  bool found = false;
275 
276  if (gilist_index < gilist_size) {
277  found = x_FindInNegativeList(ids,
278  gilist_index,
279  isam_key,
280  use_tis);
281  }
282 
283  // 3. If not found, add the OID to the negative ID list.
284 
285  if (isam_data < vol_end) {
286  if (found) {
287  // OID is found, but may not be included yet.
288  ids.AddVisibleOid(isam_data + vol_start);
289  } else {
290  // OID is included for iteration.
291  ids.AddIncludedOid(isam_data + vol_start);
292  }
293  }
294  }
295 
296  // Move to next data page. Note that for a negative ID list
297  // processing, we don't actually fetch any samples, because
298  // every ID->OID line needs to be examined anyway.
299 
300  sample_index ++;
301  }
302 }
303 
304 //In case if acc2 does not have version and acc1 has version
305 //check if it is the same accession
306 static bool s_IsSameAccession(string acc1, string acc2)
307 {
308  bool sameAccession = false;
309  if(NStr::Find(acc2,".") == NPOS) { // no version in acc2
310  if(NStr::Find(acc1,".") != NPOS && NStr::Find(acc1, acc2) != NPOS) {
311  string accession, version;
312  NStr::SplitInTwo(acc1,".", accession, version);
313  if(acc2 == accession) {
314  sameAccession = true;
315  }
316  }
317  }
318  return sameAccession;
319 }
320 
321 //In case if keys[currIndex] does not have version and keys[currIndex + 1] has version
322 //check if it is the same accession
323 static bool s_IsSameAccession(vector <string> keys, int num_keys, int currIndex)
324 {
325  bool sameAccession = false;
326  if(currIndex < num_keys - 1) {
327  sameAccession = s_IsSameAccession(keys[currIndex + 1], keys[currIndex]);
328  }
329  return sameAccession;
330 }
331 
332 void
334  int vol_end,
335  CSeqDBNegativeList & ids)
336 
337 {
338  int gilist_size = ids.ListSize();
339  if (! gilist_size) return;
340 
341  if(m_Initialized == false) {
342  // Most ordinary errors (missing GIs for example) are
343  // ignored for "multi" mode searches. But if a GI list is
344  // specified, and cannot be interpreted, it is an error.
345 
347  eArgErr,
348  "Error: Unable to use ISAM index in batch mode.");
349  }
350 
351 
352  vector<string> sample_keys;
353  vector<TIndx> page_offs;
354  vector<string> keys;
355  vector<int> vals;
356 
357  sample_keys.reserve(m_NumSamples);
358  page_offs.reserve(m_NumSamples + 1);
359  keys.reserve(m_PageSize);
360  vals.reserve(m_PageSize);
361 
362 
363  x_LoadIndex(m_IndexLease, sample_keys, page_offs);
364 
365  int gilist_index = 0;
366  int sample_index = 0;
367 
368  while(sample_index < m_NumSamples) {
369 
370  // Now we should be ready to search a data block.
371  keys.clear();
372  vals.clear();
373 
374  int num_keys = m_PageSize;
375  if (sample_index + 1 == m_NumSamples) {
376  num_keys = m_NumTerms - sample_index * m_PageSize;
377  }
378 
379  x_LoadData(m_DataLease, keys, vals, num_keys, page_offs[sample_index]);
380 
381  for(int i = 0; i < num_keys; i++) {
382  // 2. Look for it in the negative id list.
383 
384  bool found = false;
385  if (gilist_index < gilist_size) {
386  found = x_FindInNegativeList(ids,
387  gilist_index,
388  keys[i]);
389 
390  }
391  if (vals[i] < vol_end) {
392  if (found) {
393  // OID is found, but may not be included yet.
394  ids.AddVisibleOid(vals[i] + vol_start);
395  //If next accession is the same as current, but with version
396  if(s_IsSameAccession(keys, num_keys, i)) {
397  i++; //skip next check - sequence already excluded
398  }
399  } else {
400  // OID is included for iteration.
401  //Only include next accession if it is not the same as current (but with version)
402  //because it may be in exclude list. The check will be done in the next step
403  if(!s_IsSameAccession(keys, num_keys, i)) {
404  ids.AddIncludedOid(vals[i] + vol_start);
405  }
406  }
407  }
408 
409  }
410  // Move to next data page. Note that for a negative ID list
411  // processing, we don't actually fetch any samples, because
412  // every ID->OID line needs to be examined anyway.
413 
414  sample_index ++;
415 
416  }
417 }
418 
419 
422  int * Data,
423  Uint4 * Index,
424  Int4 SampleNum)
425 
426 {
427  // Load the appropriate page of numbers into memory.
429 
430  Int4 Start(0);
431  Int4 NumElements = x_GetPageNumElements(SampleNum, & Start);
432 
433  Int4 first = Start;
434  Int4 last = Start + NumElements - 1;
435 
436  const void * KeyDataPage = NULL;
437  const void * KeyDataPageStart = NULL;
438 
439  TIndx offset_begin = Start * m_TermSize;
440  //TIndx offset_end = offset_begin + m_TermSize * NumElements;
441 
442  KeyDataPageStart = m_DataLease.GetFileDataPtr(m_DataFname,offset_begin);
443 
444 
445  KeyDataPage = (char *)KeyDataPageStart - Start * m_TermSize;
446 
447  bool found (false);
448  Int4 current (0);
449 
450  // Search the page for the number.
451  while (first <= last) {
452  current = (first+last)/2;
453 
454  Int8 Key = x_GetNumericKey((char *)KeyDataPage + current * m_TermSize);
455 
456  if (Key > Number) {
457  last = --current;
458  } else if (Key < Number) {
459  first = ++current;
460  } else {
461  found = true;
462  break;
463  }
464  }
465 
466  if (found == false) {
467  if (Data != NULL)
468  *Data = eNotFound;
469 
470  if(Index != NULL)
471  *Index = eNotFound;
472 
473  return eNotFound;
474  }
475 
476  if (Data != NULL) {
477  *Data = x_GetNumericData((char *)KeyDataPage + current * m_TermSize);
478  }
479 
480  if(Index != NULL)
481  *Index = Start + current;
482 
483  return eNoError;
484 }
485 
486 
487 // ------------------------NumericSearch--------------------------
488 // Purpose: Main search function of Numeric ISAM
489 //
490 // Parameters: Key - integer to search
491 // Data - returned value (for NISAM with data)
492 // Index - internal index in database
493 // Returns: ISAM Error Code
494 // NOTE: None
495 // ----------------------------------------------------------------
496 
499  int * Data,
500  Uint4 * Index)
501 
502 {
503  bool done (false);
504  Int4 SampleNum (0);
505 
506  EErrorCode error =
507  x_SearchIndexNumeric(Number, Data, Index, SampleNum, done);
508 
509  if (! done) {
510  error = x_SearchDataNumeric(Number, Data, Index, SampleNum);
511  }
512 
513  return error;
514 }
515 
516 int CSeqDBIsam::x_DiffCharLease(const string & term_in,
517  CSeqDBFileMemMap & lease,
518  const string & file_name,
519  TIndx file_length,
520  Uint4 at_least,
521  TIndx KeyOffset,
522  bool ignore_case)
523 
524 {
525  int result(-1);
526 
527  //m_Atlas.Lock(locked);
528 
529  // Add one to term_end to insure we don't consider "AA" and "AAB"
530  // as equal.
531 
532  TIndx offset_begin = KeyOffset;
533  TIndx term_end = KeyOffset + term_in.size() + 1;
534  TIndx map_end = term_end + at_least;
535 
536  if (map_end > file_length) {
537  map_end = file_length;
538 
539  if (term_end > map_end) {
540  term_end = map_end;
541  result = int(file_length - offset_begin);
542  }
543  }
544 
545  const char * file_data = (const char *)lease.GetFileDataPtr(file_name,offset_begin);
546 
547  Int4 dc_result =
548  x_DiffChar(term_in,
549  file_data,
550  file_data + term_in.size() + 1,
551  ignore_case);
552 
553  if (dc_result != -1) {
554  return dc_result;
555  }
556 
557  return result;
558 }
559 
560 /// Return NUL for nulls or EOL characters
561 ///
562 /// This function returns a NUL byte for any of NUL, CR, or NL. This
563 /// is done because these characters are used to terminate the
564 /// variable length records in a string-based ISAM file.
565 ///
566 /// @param c
567 /// A character
568 /// @return
569 /// NUL or the same character
570 static inline char
572 {
573  if (SEQDB_ISEOL(c)) {
574  return 0;
575  } else {
576  return c;
577  }
578 }
579 
580 /// The terminating character for string ISAM keys when data is present.
581 const char ISAM_DATA_CHAR = (char) 2;
582 
583 /// Returns true if the character is a terminator for an ISAM key.
584 static inline bool ENDS_ISAM_KEY(char P)
585 {
586  return (P == ISAM_DATA_CHAR) || (s_SeqDBIsam_NullifyEOLs(P) == 0);
587 }
588 
589 Int4 CSeqDBIsam::x_DiffChar(const string & term_in,
590  const char * begin,
591  const char * end,
592  bool ignore_case)
593 {
594  int result(-1);
595  int i(0);
596 
597  const char * file_data = begin;
598  int bytes = int(end - begin);
599 
600  for(i = 0; (i < bytes) && i < (int) term_in.size(); i++) {
601  char ch1 = term_in[i];
602  char ch2 = file_data[i];
603 
604  if (ch1 != ch2) {
607 
608  if (ignore_case) {
609  ch1 = toupper((unsigned char) ch1);
610  ch2 = toupper((unsigned char) ch2);
611  }
612 
613  if (ch1 != ch2) {
614  break;
615  }
616  }
617  }
618 
619  const char * p = file_data + i;
620 
621  while((p < end) && ((*p) == ' ')) {
622  p++;
623  }
624 
625  if (((p == end) || ENDS_ISAM_KEY(*p)) && (i == (int) term_in.size())) {
626  result = -1;
627  } else {
628  result = i;
629  }
630 
631  return result;
632 }
633 
634 void CSeqDBIsam::x_ExtractPageData(const string & term_in,
635  TIndx page_index,
636  const char * beginp,
637  const char * endp,
638  vector<TIndx> & indices_out,
639  vector<string> & keys_out,
640  vector<string> & data_out)
641 {
642  // Collect all 'good' data from the page.
643 
644  bool ignore_case = true;
645 
646  Uint4 TermNum(0);
647 
648  const char * indexp(beginp);
649  bool found_match(false);
650 
651  while (indexp < endp) {
652  Int4 Diff = x_DiffChar(term_in,
653  indexp,
654  endp,
655  ignore_case);
656 
657  if (Diff == -1) { // Complete match
658  found_match = true;
659 
660  x_ExtractData(indexp,
661  endp,
662  keys_out,
663  data_out);
664 
665  indices_out.push_back(page_index + TermNum);
666  } else {
667  // If we found a match, but the current term doesn't
668  // match, then we are past the set of matching entries.
669 
670  if (found_match) {
671  break;
672  }
673  }
674 
675  // Skip remainder of term, and any nulls after it.
676 
677  while((indexp < endp) && s_SeqDBIsam_NullifyEOLs(*indexp)) {
678  indexp++;
679  }
680  while((indexp < endp) && (! s_SeqDBIsam_NullifyEOLs(*indexp))) {
681  indexp++;
682  }
683 
684  TermNum++;
685  }
686 }
687 
688 void CSeqDBIsam::x_ExtractAllData(const string & term_in,
689  TIndx sample_index,
690  vector<TIndx> & indices_out,
691  vector<string> & keys_out,
692  vector<string> & data_out)
693 
694 {
695  // The object at sample_index is known to match; we will iterate
696  // over the surrounding values to see if they match as well. No
697  // assumptions about how many keys can match are made here.
698 
699  bool ignore_case = true;
700 
701  int pre_amt = 1;
702  int post_amt = 1;
703 
704  bool done_b(false), done_e(false);
705 
706  const char * beginp(0);
707  const char * endp(0);
708 
709  TIndx beg_off(0);
710  TIndx end_off(0);
711 
712  while(! (done_b && done_e)) {
713  if (sample_index < pre_amt) {
714  beg_off = 0;
715  done_b = true;
716  } else {
717  beg_off = sample_index - pre_amt;
718  }
719 
720  if ((m_NumSamples - sample_index) < post_amt) {
721  end_off = m_NumSamples;
722  done_e = true;
723  } else {
724  end_off = sample_index + post_amt;
725  }
726 
727  x_LoadPage(beg_off, end_off, & beginp, & endp);
728 
729  if (! done_b) {
730  Int4 diff_begin = x_DiffChar(term_in,
731  beginp,
732  endp,
733  ignore_case);
734 
735  if (diff_begin != -1) {
736  done_b = true;
737  } else {
738  pre_amt ++;
739  }
740  }
741 
742  if (! done_e) {
743  const char * last_term(0);
744  const char * p(endp-1);
745 
746  // Skip over any non-terminating junk at the end
747 
748  enum { eEndNulls, eLastTerm } search_stage = eEndNulls;
749 
750  while(p > beginp) {
751  bool terminal = (0 == s_SeqDBIsam_NullifyEOLs(*p));
752 
753  if (search_stage == eEndNulls) {
754  if (! terminal) {
755  search_stage = eLastTerm;
756  }
757  } else {
758  if (terminal) {
759  last_term = p + 1;
760  break;
761  }
762  }
763 
764  p--;
765  }
766 
767  if (! last_term) {
768  last_term = beginp;
769  }
770 
771  Int4 diff_end = x_DiffChar(term_in,
772  last_term,
773  endp,
774  ignore_case);
775 
776  if (diff_end != -1) {
777  done_e = true;
778  } else {
779  post_amt ++;
780  }
781  }
782  }
783 
784  x_ExtractPageData(term_in,
785  m_PageSize * beg_off,
786  beginp,
787  endp,
788  indices_out,
789  keys_out,
790  data_out);
791 }
792 
793 void CSeqDBIsam::x_ExtractData(const char * key_start,
794  const char * map_end,
795  vector<string> & keys_out,
796  vector<string> & data_out)
797 {
798  const char * data_ptr(0);
799  const char * p(key_start);
800 
801  while(p < map_end) {
802  switch(s_SeqDBIsam_NullifyEOLs(*p)) {
803  case 0:
804  if (data_ptr) {
805  keys_out.push_back(string(key_start, data_ptr));
806  data_out.push_back(string(data_ptr+1, p));
807  } else {
808  keys_out.push_back(string(key_start, p));
809  data_out.push_back("");
810  }
811  return;
812 
813  case ISAM_DATA_CHAR:
814  data_ptr = p;
815 
816  default:
817  p++;
818  }
819  }
820 }
821 
824  Uint4 sample_num)
825 
826 {
827  TIndx offset_begin = sample_offset + (sample_num * sizeof(Uint4));
828  //TIndx offset_end = offset_begin + sizeof(Uint4);
829 
830 
831  Int4 * key_offset_addr = (Int4 *)m_IndexLease.GetFileDataPtr(offset_begin);
832  return SeqDB_GetStdOrd(key_offset_addr);
833 }
834 
835 void
837  int length,
838  string & str,
839  bool trim_to_null)
840 
841 {
842  //TIndx offset_end = key_offset + length;
843 
844  const char * key_offset_addr =
845  (const char *)m_IndexLease.GetFileDataPtr(key_offset);
846 
847 
848  if (trim_to_null) {
849  for(int i = 0; i<length; i++) {
850  if (! key_offset_addr[i]) {
851  length = i;
852  break;
853  }
854  }
855  }
856 
857  str.assign(key_offset_addr, length);
858 }
859 
860 // Given an index, this computes the diff from the input term. It
861 // also returns the offset for that sample's key in KeyOffset.
862 
863 int CSeqDBIsam::x_DiffSample(const string & term_in,
864  Uint4 SampleNum,
865  TIndx & KeyOffset)
866 
867 {
868  // Meaning:
869  // a. Compute SampleNum*4
870  // b. Address this number into SamplePos (indexlease)
871  // c. Swap this number to compute Key offset.
872  // d. Add to beginning of file to get key data pointer.
873 
874  bool ignore_case(true);
875 
876  TIndx SampleOffset(m_KeySampleOffset);
877 
879  SampleOffset += (m_NumSamples + 1) * sizeof(Uint4);
880  }
881 
882  TIndx offset_begin = SampleOffset + (SampleNum * sizeof(Uint4));
883  //TIndx offset_end = offset_begin + sizeof(Uint4);
884 
885  KeyOffset = SeqDB_GetStdOrd((Int4*)m_IndexLease.GetFileDataPtr(offset_begin));
886 
887  Uint4 max_lines_2 = m_MaxLineSize * 2;
888 
889  return x_DiffCharLease(term_in,
890  m_IndexLease,
891  m_IndexFname,
893  max_lines_2,
894  KeyOffset,
895  ignore_case);
896 
897 }
898 
900  TIndx SampleNum2,
901  const char ** beginp,
902  const char ** endp)
903 
904 {
905  // Load the appropriate page of terms into memory.
906 
907  _ASSERT(SampleNum2 > SampleNum1);
908 
909  TIndx begin_offset = m_KeySampleOffset + SampleNum1 * sizeof(Uint4);
910  //TIndx end_offset = m_KeySampleOffset + (SampleNum2 + 1) * sizeof(Uint4);
911 
912  Uint4 * key_offsets((Uint4*)m_IndexLease.GetFileDataPtr(begin_offset));
913 
914 
915  Uint4 key_off1 = SeqDB_GetStdOrd(& key_offsets[0]);
916  Uint4 key_off2 = SeqDB_GetStdOrd(& key_offsets[SampleNum2 - SampleNum1]);
917 
918  *beginp = (const char *) m_DataLease.GetFileDataPtr(m_DataFname,key_off1);
919  *endp = (const char *) m_DataLease.GetFileDataPtr(key_off2);
920 }
921 
922 
923 // ------------------------StringSearch--------------------------
924 // Purpose: Main search function of string search.
925 //
926 // Parameters: Key - string to search
927 // Data - returned value
928 // Index - internal index in database
929 // Returns: ISAM Error Code
930 // NOTE: None
931 // --------------------------------------------------------------
932 
934 CSeqDBIsam::x_StringSearch(const string & term_in,
935  vector<string> & terms_out,
936  vector<string> & values_out,
937  vector<TIndx> & indices_out)
938 
939 {
940  // These are always false; They may relate to the prior find_one /
941  // expand_to_many method of getting multiple OIDs.
942 
943  bool short_match(false);
944  bool follow_match(false);
945 
946  size_t preexisting_data_count = values_out.size();
947 
948  if (m_Initialized == false) {
949  return eInitFailed;
950  }
951 
952  if (x_OutOfBounds(term_in)) {
953  return eNotFound;
954  }
955 
956  // We will set this option to avoid more complications
957  bool ignore_case = true;
958 
959  // search the sample file first
960 
961  TIndx Start(0);
962  TIndx Stop(m_NumSamples - 1);
963 
964  int Length = (int) term_in.size();
965 
966  TIndx SampleOffset(m_KeySampleOffset);
967 
969  SampleOffset += (m_NumSamples + 1) * sizeof(Uint4);
970  }
971 
972  int found_short(-1);
973 
974  string short_term;
975  int SampleNum(-1);
976 
977  while(Stop >= Start) {
978  SampleNum = ((Uint4)(Stop + Start)) >> 1;
979 
980  TIndx KeyOffset(0);
981 
982  int diff = x_DiffSample(term_in, SampleNum, KeyOffset);
983 
984  // If this is an exact match, return the master term number.
985 
986  const char * KeyData = (const char *)m_IndexLease.GetFileDataPtr(KeyOffset);
987  TIndx BytesToEnd = m_IndexFileLength - KeyOffset;
988 
989  Uint4 max_lines_2 = m_MaxLineSize * 2;
990 
991  if (BytesToEnd > (TIndx) max_lines_2) {
992  BytesToEnd = max_lines_2;
993  }
994 
995  if (diff == -1) {
996  x_ExtractAllData(term_in,
997  SampleNum,
998  indices_out,
999  terms_out,
1000  values_out);
1001 
1002 
1003  return eNoError;
1004  }
1005 
1006  // If the key is a superset of the sample term, backup until
1007  // just before the term.
1008 
1009  if (short_match && (diff >= Length)) {
1010  if (SampleNum > 0)
1011  SampleNum--;
1012 
1013  while(SampleNum > 0) {
1014  TIndx key_offset =
1015  x_GetIndexKeyOffset(SampleOffset,
1016  SampleNum);
1017 
1018 
1019  string prefix;
1020  x_GetIndexString(key_offset, Length, prefix, false);
1021 
1022  if (ignore_case) {
1023  if (NStr::CompareNocase(prefix, term_in) != 0) {
1024  break;
1025  }
1026  } else {
1027  if (prefix != term_in) {
1028  break;
1029  }
1030  }
1031 
1032  SampleNum--;
1033  }
1034 
1035  found_short = SampleNum + 1;
1036 
1037  TIndx key_offset =
1038  x_GetIndexKeyOffset(SampleOffset,
1039  SampleNum + 1);
1040 
1041 
1042  string prefix;
1043  x_GetIndexString(key_offset, max_lines_2, short_term, true);
1044 
1045  break;
1046  } else {
1047  // If preceding is desired, note the key.
1048 
1049  if (follow_match) {
1050  found_short = SampleNum;
1051 
1052  x_GetIndexString(KeyOffset, max_lines_2, short_term, true);
1053  }
1054  }
1055 
1056  // Otherwise, search for the next sample.
1057 
1058  if (ignore_case
1059  ? tolower((unsigned char) term_in[diff]) < tolower((unsigned char) KeyData[diff])
1060  : term_in[diff] < KeyData[diff]) {
1061  Stop = --SampleNum;
1062  } else {
1063  Start = SampleNum + 1;
1064  }
1065  }
1066 
1067 
1068  // If the term is out of range altogether, report not finding it.
1069 
1070  if ( (SampleNum < 0) || (SampleNum >= m_NumSamples)) {
1071  return eNotFound;
1072  }
1073 
1074  // Load the appropriate page of terms into memory.
1075 
1076  const char * beginp(0);
1077  const char * endp(0);
1078 
1079  x_LoadPage(SampleNum, SampleNum + 1, & beginp, & endp);
1080 
1081  // Search the page for the term.
1082 
1083  x_ExtractPageData(term_in,
1084  m_PageSize * SampleNum,
1085  beginp,
1086  endp,
1087  indices_out,
1088  terms_out,
1089  values_out);
1090 
1091  // For now the short and follow logic is not implemented.
1092 
1093  EErrorCode rv(eNoError);
1094 
1095  if (preexisting_data_count == values_out.size()) {
1096  rv = eNotFound;
1097  }
1098 
1099  return rv;
1100 }
1101 
1103  const string & dbname,
1104  char prot_nucl,
1105  char file_ext_char,
1106  ESeqDBIdType ident_type)
1107  : m_Atlas (atlas),
1108  m_IdentType (ident_type),
1109  m_IndexLease (atlas),
1110  m_DataLease (atlas),
1111  m_Type (eNumeric),
1112  m_NumTerms (0),
1113  m_NumSamples (0),
1114  m_PageSize (0),
1115  m_MaxLineSize (0),
1116  m_IdxOption (0),
1117  m_Initialized (false),
1118  m_KeySampleOffset(0),
1119  m_TestNonUnique (true),
1120  m_FileStart (0),
1121  m_FirstOffset (0),
1122  m_LastOffset (0),
1123  m_LongId (false),
1124  m_TermSize (8)
1125 {
1126  // These are the types that readdb.c seems to use.
1127 
1128  switch(ident_type) {
1129  case eGiId:
1130  case ePigId:
1131  case eTiId:
1132  m_Type = eNumeric;
1133  break;
1134 
1135  case eStringId:
1136  case eHashId:
1137  m_Type = eString;
1138  break;
1139 
1140  default:
1142  eArgErr,
1143  "Error: ident type argument not valid");
1144  }
1145 
1147  prot_nucl,
1148  file_ext_char,
1149  m_IndexFname,
1150  m_DataFname);
1151 
1152  if (! (CFile(m_IndexFname).Exists() &&
1153  CFile(m_DataFname).Exists()) ) {
1154 
1155  string msg("Error: Could not open input file (");
1156  msg += m_IndexFname + "/" + m_DataFname + ")";
1157  NCBI_THROW(CSeqDBException, eFileErr, msg);
1158  }
1161  if(m_Type == eNumeric) {
1163  } else {
1165  }
1166  if (eNoError !=x_InitSearch()) {
1167  m_Initialized = false;
1168  }
1170 }
1171 
1173  char prot_nucl,
1174  char file_ext_char,
1175  string & index_name,
1176  string & data_name)
1177 {
1178  if (dbname.empty() ||
1179  (! isalpha((unsigned char) prot_nucl)) ||
1180  (! isalpha((unsigned char) file_ext_char))) {
1181 
1183  eArgErr,
1184  "Error: argument not valid");
1185  }
1186 
1187  index_name.reserve(dbname.size() + 4);
1188  data_name.reserve(dbname.size() + 4);
1189 
1190  index_name = dbname;
1191  index_name += '.';
1192  index_name += prot_nucl;
1193  index_name += file_ext_char;
1194 
1195  data_name = index_name;
1196  index_name += 'i';
1197  data_name += 'd';
1198 }
1199 
1200 bool CSeqDBIsam::IndexExists(const string & dbname,
1201  char prot_nucl,
1202  char file_ext_char)
1203 {
1204  string iname, dname;
1205  x_MakeFilenames(dbname, prot_nucl, file_ext_char, iname, dname);
1206 
1207  return CFile(iname).Exists() && CFile(dname).Exists();
1208 }
1209 
1211 {
1212  UnLease();
1213 }
1214 //Remove this
1216 {
1217  m_IndexLease.Clear();
1218  m_DataLease.Clear();
1219 }
1220 
1222 {
1223  EErrorCode err =
1224  x_NumericSearch(ident, & oid, 0);
1225 
1226  if (err == eNoError) {
1227  return true;
1228  }
1229 
1230  oid = -1u; /* NCBI_FAKE_WARNING */
1231 
1232  return false;
1233 }
1234 
1235 void CSeqDBIsam::StringToOids(const string & acc,
1236  vector<TOid> & oids,
1237  bool adjusted,
1238  bool & version_check)
1239 
1240 {
1241  bool strip_version = version_check;
1242  version_check = false;
1243 
1245 
1246  if(m_Initialized == false) {
1247  return;
1248  }
1249 
1250  bool found = false;
1251 
1252  string accession(string("gb|") + acc + "|");
1253  string locus_str(string("gb||") + acc);
1254 
1255  EErrorCode err = eNoError;
1256 
1257  vector<string> keys_out;
1258  vector<string> data_out;
1259  vector<TIndx> indices_out;
1260 
1261  if (! adjusted) {
1262  if ((err = x_StringSearch(accession,
1263  keys_out,
1264  data_out,
1265  indices_out)) < 0) {
1266  return;
1267  }
1268 
1269  if (err == eNoError) {
1270  found = true;
1271  }
1272 
1273  if ((! found) &&
1274  (err = x_StringSearch(locus_str,
1275  keys_out,
1276  data_out,
1277  indices_out)) < 0) {
1278 
1279  return;
1280  }
1281 
1282  if (err != eNotFound) {
1283  found = true;
1284  }
1285  }
1286 
1287  if ((! found) &&
1288  (err = x_StringSearch(acc,
1289  keys_out,
1290  data_out,
1291  indices_out)) < 0) {
1292 
1293 
1294  return;
1295  }
1296 
1297  if (err != eNotFound) {
1298  found = true;
1299  }
1300 
1301  if ((! found) && strip_version) {
1302  size_t pos = acc.find(".");
1303 
1304  bool is_version = false;
1305 
1306  if (pos != string::npos) {
1307  int ver_len = static_cast<int>(acc.size() - pos) - 1;
1308 
1309  is_version = (ver_len <= 3 && ver_len >= 1);
1310 
1311  for(size_t vp = pos+1; vp < acc.size(); vp++) {
1312  if (! isdigit(acc[vp])) {
1313  is_version = false;
1314  break;
1315  }
1316  }
1317  }
1318 
1319  if (is_version) {
1320  string nover(acc, 0, pos);
1321 
1322  err = x_StringSearch(nover,
1323  keys_out,
1324  data_out,
1325  indices_out);
1326 
1327 
1328  if (data_out.size()) {
1329  version_check = true;
1330  }
1331 
1332  if (err < 0) {
1333  return;
1334  }
1335  }
1336  }
1337 
1338  if (err != eNotFound) {
1339  found = true;
1340  }
1341 
1342  if (! found) {
1343  // Use CSeq_id to parse the id string and build a replacement,
1344  // FASTA type string. This allows some IDs, such as PDBs with
1345  // chains, such as '1qcfA' to be parsed.
1346 
1347  string id;
1348 
1349  try {
1351  id = seqid.AsFastaString();
1352  }
1353  catch(CSeqIdException &) {
1354  }
1355 
1356  if (id.size() &&
1357  ((err = x_StringSearch(id,
1358  keys_out,
1359  data_out,
1360  indices_out)) < 0)) {
1361 
1362  return;
1363  }
1364  }
1365 
1366  if (err != eNotFound) {
1367  found = true;
1368  }
1369 
1370  if (found) {
1371  ITERATE(vector<string>, iter, data_out) {
1372  oids.push_back(atoi((*iter).c_str()));
1373  }
1374  }
1375 }
1376 
1378  vector<int> &,
1379  bool)
1380 
1381 {
1382  cerr << " this should be derived from readdb_acc2fastaEx().." << endl;
1383  _TROUBLE;
1384  return false;
1385 }
1386 
1387 void CSeqDBIsam::IdsToOids(int vol_start,
1388  int vol_end,
1389  CSeqDBGiList & ids)
1390 
1391 
1392 {
1393  // The vol_start parameter is needed because translations in the
1394  // GI list should refer to global OIDs, not per-volume OIDs.
1395 
1396  switch (m_IdentType) {
1397  case eGiId:
1398  x_TranslateGiList<TGi>(vol_start, ids);
1399  break;
1400 
1401  case eTiId:
1402  x_TranslateGiList<TTi>(vol_start, ids);
1403  break;
1404 
1405  case eStringId:
1406  x_TranslateGiList<string>(vol_start, ids);
1407  break;
1408 
1409  case ePigId:
1410  x_TranslateGiList<TPig>(vol_start, ids);
1411  break;
1412 
1413  default:
1415  eArgErr,
1416  "Error: Wrong type of idlist specified.");
1417  }
1418 }
1419 
1420 void CSeqDBIsam::IdsToOids(int vol_start,
1421  int vol_end,
1422  CSeqDBNegativeList & ids)
1423 
1424 
1425 {
1426  // The vol_start parameter is needed because translations in the
1427  // GI list should refer to global OIDs, not per-volume OIDs.
1428 
1430 
1431  //m_Atlas.Lock(locked);
1432 
1433  ids.InsureOrder();
1434 
1435  if ((m_IdentType == eGiId) && ids.GetNumGis()) {
1436  x_SearchNegativeMulti(vol_start,
1437  vol_end,
1438  ids,
1439  false);
1440 
1441  }
1442 
1443  if ((m_IdentType == eTiId) && ids.GetNumTis()) {
1444  x_SearchNegativeMulti(vol_start,
1445  vol_end,
1446  ids,
1447  true);
1448 
1449  }
1450 
1451  if(m_IdentType == eStringId && ids.GetNumSis()) {
1452  x_SearchNegativeMultiSeq(vol_start,
1453  vol_end,
1454  ids);
1455  //true,
1456 
1457  }
1458 }
1459 
1461 {
1462  Int4 Start (0);
1463  Int4 Stop (m_NumSamples - 1);
1464 
1465  //m_Atlas.Lock(locked);
1466 
1467 
1468  if (m_Type == eNumeric) {
1469  //
1470  // Get first key from data file
1471 
1472  int num_elements(0);
1473  int start(0);
1474  const void * data_page(0);
1475 
1476  x_MapDataPage(Start,
1477  start,
1478  num_elements,
1479  & data_page);
1480 
1481 
1482  _ASSERT(num_elements);
1483 
1484  int elem_index = 0;
1485 
1486  Int8 data_gi(0);
1487  int data_oid(-1);
1488 
1489  x_GetDataElement(data_page,
1490  elem_index,
1491  data_gi,
1492  data_oid);
1493 
1494  m_FirstKey.SetNumeric(data_gi);
1495 
1496 
1497  //
1498  // Get last key from data file
1499 
1500  x_MapDataPage(Stop,
1501  start,
1502  num_elements,
1503  & data_page);
1504 
1505 
1506  _ASSERT(num_elements);
1507 
1508  elem_index = num_elements - 1;
1509 
1510  x_GetDataElement(data_page,
1511  elem_index,
1512  data_gi,
1513  data_oid);
1514 
1515  m_LastKey.SetNumeric(data_gi);
1516  } else {
1517  //
1518  // Load the appropriate page of terms into memory.
1519 
1520  const char * beginp(0);
1521  const char * endp(0);
1522 
1523  //
1524  // Load the first page
1525 
1526  x_LoadPage(Start, Start + 1, & beginp, & endp);
1527 
1528  // Get first term
1529 
1530  vector<string> keys_out;
1531  vector<string> data_out; // not used
1532 
1533  x_ExtractData(beginp,
1534  endp,
1535  keys_out,
1536  data_out);
1537 
1538  x_Lower(keys_out.front());
1539  m_FirstKey.SetString(keys_out.front());
1540 
1541 
1542  //
1543  // Load the last page
1544 
1545  x_LoadPage(Stop, Stop + 1, & beginp, & endp);
1546 
1547  // Advance to last item
1548 
1549  const char * lastp(0);
1550  const char * indexp(beginp);
1551 
1552  while (indexp < endp) {
1553  // Remember our new "last term" value.
1554 
1555  lastp = indexp;
1556 
1557  // Skip remainder of term, and any nulls after it.
1558 
1559  while((indexp < endp) && s_SeqDBIsam_NullifyEOLs(*indexp)) {
1560  indexp++;
1561  }
1562  while((indexp < endp) && (! s_SeqDBIsam_NullifyEOLs(*indexp))) {
1563  indexp++;
1564  }
1565  }
1566 
1567  // Get the last key
1568 
1569  _ASSERT(lastp);
1570 
1571  keys_out.clear();
1572  data_out.clear();
1573 
1574  x_ExtractData(lastp,
1575  endp,
1576  keys_out,
1577  data_out);
1578 
1579  x_Lower(keys_out.front());
1580  m_LastKey.SetString(keys_out.front());
1581  }
1582 }
1583 
1585 {
1586  if (! (m_FirstKey.IsSet() && m_LastKey.IsSet())) {
1587  return false;
1588  }
1589 
1590  _ASSERT(m_Type == eNumeric);
1591 
1593  return true;
1594  }
1595 
1597  return true;
1598  }
1599 
1600  return false;
1601 }
1602 
1604 {
1605  if (! (m_FirstKey.IsSet() && m_LastKey.IsSet())) {
1606  return false;
1607  }
1608 
1609  _ASSERT(m_Type == eString);
1610 
1611  x_Lower(key);
1612 
1614  return true;
1615  }
1616 
1618  return true;
1619  }
1620 
1621  return false;
1622 }
1623 
1625  Int8 & high_id,
1626  int & count)
1627 
1628 
1629 {
1630  if(m_Initialized == false) {
1631  count = 0;
1632  return;
1633  }
1634 
1635  if (! (m_FirstKey.IsSet() && m_LastKey.IsSet())) {
1636  count = 0;
1637  return;
1638  }
1639 
1640  low_id = m_FirstKey.GetNumeric();
1641  high_id = m_LastKey.GetNumeric();
1642  count = m_NumTerms;
1643 }
1644 
1645 void CSeqDBIsam::GetIdBounds(string & low_id,
1646  string & high_id,
1647  int & count)
1648 
1649 
1650 {
1651  if(m_Initialized == false) {
1652  count = 0;
1653  return;
1654  }
1655 
1656  if (! (m_FirstKey.IsSet() && m_LastKey.IsSet())) {
1657  count = 0;
1658  return;
1659  }
1660 
1661  low_id = m_FirstKey.GetString();
1662  high_id = m_LastKey.GetString();
1663  count = m_NumTerms;
1664 }
1665 
1667  vector<TOid> & oids)
1668 
1669 
1670 {
1672  if(m_Initialized == false) {
1673  return;
1674  }
1675 
1676  bool found = false;
1677 
1678  string key(NStr::UIntToString(hash));
1679 
1680  EErrorCode err = eNoError;
1681 
1682  vector<string> keys_out;
1683  vector<string> data_out;
1684  vector<TIndx> indices_out;
1685 
1686  if ((err = x_StringSearch(key,
1687  keys_out,
1688  data_out,
1689  indices_out)) < 0) {
1690 
1691  return;
1692  }
1693 
1694  if (err != eNotFound) {
1695  found = true;
1696  }
1697 
1698  if (found) {
1699  ITERATE(vector<string>, iter, data_out) {
1700  oids.push_back(atoi(iter->c_str()));
1701  }
1702  }
1703 }
1704 
1706 
CFile –.
Definition: ncbifile.hpp:1605
CSeqDBAtlas class.
Definition: seqdbatlas.hpp:297
bool GetFileSizeL(const string &fname, TIndx &length)
Get size of a file.
Definition: seqdbatlas.cpp:160
CSeqDBException.
Definition: seqdbcommon.hpp:73
const char * GetFileDataPtr(const string &fname, TIndx offset)
Get a pointer to the specified offset.
Definition: seqdbatlas.hpp:757
void Clear()
Clears the memory map object.
Definition: seqdbatlas.hpp:737
void Init(const string filename)
Initializes a memory map object.
Definition: seqdbatlas.hpp:702
CSeqDBGiList.
bool OutsideLastBound(Int8 ident)
Returns true if the provided integer compares as higher than the assigned upper boundary for this ISA...
Definition: seqdbisam.hpp:465
bool IsSet()
Returns true if this object has an assigned value.
Definition: seqdbisam.hpp:418
string GetString() const
Fetch the string value of this object.
Definition: seqdbisam.hpp:444
void SetString(const string &ident)
Assign a string value to this object.
Definition: seqdbisam.hpp:437
void SetNumeric(Int8 ident)
Assign a numeric value to this object.
Definition: seqdbisam.hpp:424
Int8 GetNumeric() const
Fetch the numeric value of this object.
Definition: seqdbisam.hpp:431
bool OutsideFirstBound(Int8 ident)
Returns true if the provided integer compares as lower than the assigned lower boundary for this ISAM...
Definition: seqdbisam.hpp:451
EErrorCode x_StringSearch(const string &term_in, vector< string > &term_out, vector< string > &value_out, vector< TIndx > &index_out)
String identifier lookup.
Definition: seqdbisam.cpp:934
EErrorCode x_SearchIndexNumeric(Int8 Number, int *Data, Uint4 *Index, Int4 &SampleNum, bool &done)
Index file search.
Definition: seqdbisam.cpp:140
CSeqDBIsam(CSeqDBAtlas &atlas, const string &dbname, char prot_nucl, char file_ext_char, ESeqDBIdType ident_type)
Constructor.
Definition: seqdbisam.cpp:1102
@ eNumericLongId
This type is not supported.
Definition: seqdbisam.hpp:139
@ eString
This type is not supported.
Definition: seqdbisam.hpp:136
@ eNumericNoData
Numeric database with Key/Value pairs in the index file.
Definition: seqdbisam.hpp:135
void x_SearchNegativeMulti(int vol_start, int vol_end, CSeqDBNegativeList &gis, bool use_tis)
Negative ID List Translation.
Definition: seqdbisam.cpp:219
CSeqDBFileMemMap m_DataLease
A persistent lease on the ISAM data file.
Definition: seqdbisam.hpp:1190
TIndx m_IndexFileLength
The length of the ISAM index file.
Definition: seqdbisam.hpp:1206
bool m_LongId
Use Uint8 for the key.
Definition: seqdbisam.hpp:1248
int x_DiffCharLease(const string &term_in, CSeqDBFileMemMap &lease, const string &file_name, TIndx file_length, Uint4 at_least, TIndx KeyOffset, bool ignore_case)
Find the first character to differ in two strings.
Definition: seqdbisam.cpp:516
int x_DiffChar(const string &term_in, const char *begin, const char *end, bool ignore_case)
Find the first character to differ in two strings.
Definition: seqdbisam.cpp:589
int x_GetPageNumElements(Int4 SampleNum, Int4 *Start)
Determine the number of elements in the data page.
Definition: seqdbisam.cpp:123
ESeqDBIdType m_IdentType
The type of identifier this class uses.
Definition: seqdbisam.hpp:1183
SIsamKey m_LastKey
Last volume key.
Definition: seqdbisam.hpp:1245
Int4 m_IdxOption
Options set by upper layer.
Definition: seqdbisam.hpp:1221
void x_LoadData(CSeqDBFileMemMap &lease, vector< T > &keys, vector< int > &vals, int num_keys, TIndx begin)
Load and extract a data page into array at once.
Definition: seqdbisam.hpp:519
void x_GetDataElement(const void *dpage, int index, Int8 &key, int &data)
Get a particular data element from a data page.
Definition: seqdbisam.hpp:1509
Int4 m_NumSamples
Number of terms in ISAM index.
Definition: seqdbisam.hpp:1212
void HashToOids(unsigned hash, vector< TOid > &oids)
Sequence hash lookup.
Definition: seqdbisam.cpp:1666
EErrorCode
Exit conditions occurring in this code.
Definition: seqdbisam.hpp:489
@ eBadVersion
Lookup was successful.
Definition: seqdbisam.hpp:492
@ eBadType
The format version of the ISAM file is unsupported.
Definition: seqdbisam.hpp:493
@ eWrongFile
The requested ISAM type did not match the file.
Definition: seqdbisam.hpp:494
@ eNoError
The key was not found.
Definition: seqdbisam.hpp:491
@ eInitFailed
The file was not found, or was the wrong length.
Definition: seqdbisam.hpp:495
int x_DiffSample(const string &term_in, Uint4 SampleNum, TIndx &KeyOffset)
Find the first character to differ in two strings.
Definition: seqdbisam.cpp:863
~CSeqDBIsam()
Destructor.
Definition: seqdbisam.cpp:1210
Uint8 x_GetNumericKey(const void *p)
Definition: seqdbisam.hpp:1253
void x_LoadIndex(CSeqDBFileMemMap &lease, vector< T > &keys, vector< TIndx > &offs)
Load and extract all index samples into array at once.
Definition: seqdbisam.hpp:500
bool x_SparseStringToOids(const string &acc, vector< int > &oids, bool adjusted)
Lookup a string in a sparse table.
Definition: seqdbisam.cpp:1377
void x_FindIndexBounds()
Find the least and greatest keys in this ISAM file.
Definition: seqdbisam.cpp:1460
Int4 m_NumTerms
Number of terms in database.
Definition: seqdbisam.hpp:1209
void IdsToOids(int vol_start, int vol_end, CSeqDBGiList &ids)
Translate Gis and Tis to Oids for the given ID list.
Definition: seqdbisam.cpp:1387
EErrorCode x_SearchDataNumeric(Int8 Number, int *Data, Uint4 *Index, Int4 SampleNum)
Data file search.
Definition: seqdbisam.cpp:421
int TOid
This class works with OIDs relative to a specific volume.
Definition: seqdbisam.hpp:146
bool m_Initialized
Flag indicating whether initialization has been done.
Definition: seqdbisam.hpp:1224
TIndx x_GetIndexKeyOffset(TIndx sample_offset, Uint4 sample_num)
Get the offset of the specified sample.
Definition: seqdbisam.cpp:823
static void x_MakeFilenames(const string &dbname, char prot_nucl, char file_ext_char, string &index_name, string &data_name)
Make filenames for ISAM file.
Definition: seqdbisam.cpp:1172
static void x_Lower(string &s)
Converts a string to lower case.
Definition: seqdbisam.hpp:1143
bool x_OutOfBounds(Int8 key)
Check whether a numeric key is within this volume's bounds.
Definition: seqdbisam.cpp:1584
void x_SearchNegativeMultiSeq(int vol_start, int vol_end, CSeqDBNegativeList &gis)
Definition: seqdbisam.cpp:333
EErrorCode x_InitSearch(void)
Initialize the search object.
Definition: seqdbisam.cpp:59
void x_GetIndexString(TIndx key_offset, int length, string &prefix, bool trim_to_null)
Read a string from the index file.
Definition: seqdbisam.cpp:836
void x_ExtractPageData(const string &term_in, TIndx page_index, const char *beginp, const char *endp, vector< TIndx > &indices_out, vector< string > &keys_out, vector< string > &data_out)
Find matches in the given memory area of a string ISAM file.
Definition: seqdbisam.cpp:634
void GetIdBounds(Int8 &low_id, Int8 &high_id, int &count)
Get Numeric Bounds.
Definition: seqdbisam.cpp:1624
Int4 m_PageSize
Page size of ISAM index.
Definition: seqdbisam.hpp:1215
TIndx m_DataFileLength
The length of the ISAM data file.
Definition: seqdbisam.hpp:1203
void UnLease()
Return any memory held by this object to the atlas.
Definition: seqdbisam.cpp:1215
int m_Type
The format type of database files found (eNumeric or eString).
Definition: seqdbisam.hpp:1194
TIndx m_KeySampleOffset
Offset of samples in index file.
Definition: seqdbisam.hpp:1227
SIsamKey m_FirstKey
First volume key.
Definition: seqdbisam.hpp:1242
void x_LoadPage(TIndx SampleNum1, TIndx SampleNum2, const char **beginp, const char **endp)
Map a page into memory.
Definition: seqdbisam.cpp:899
void x_ExtractAllData(const string &term_in, TIndx sample_index, vector< TIndx > &indices_out, vector< string > &keys_out, vector< string > &data_out)
Find matches in the given page of a string ISAM file.
Definition: seqdbisam.cpp:688
void x_MapDataPage(int sample_index, int &start, int &num_elements, const void **data_page_begin)
Map a data page.
Definition: seqdbisam.hpp:1493
CSeqDBAtlas::TIndx TIndx
Type which is large enough to span the bytes of an ISAM file.
Definition: seqdbisam.hpp:143
int x_GetNumericData(const void *p)
Definition: seqdbisam.hpp:1260
void StringToOids(const string &acc, vector< TOid > &oids, bool adjusted, bool &version_check)
String translation.
Definition: seqdbisam.cpp:1235
bool x_FindInNegativeList(CSeqDBNegativeList &ids, int &index, Int8 key, bool use_tis)
Find ID in the negative GI list using PBS.
Definition: seqdbisam.hpp:1428
Int4 m_MaxLineSize
Maximum string length in the database.
Definition: seqdbisam.hpp:1218
void x_ExtractData(const char *key_start, const char *entry_end, vector< string > &key_out, vector< string > &data_out)
Extract the data from a key-value pair in memory.
Definition: seqdbisam.cpp:793
EErrorCode x_NumericSearch(Int8 Number, int *Data, Uint4 *Index)
Numeric identifier lookup.
Definition: seqdbisam.cpp:498
string m_DataFname
The filename of the ISAM data file.
Definition: seqdbisam.hpp:1197
static bool IndexExists(const string &dbname, char prot_nucl, char file_ext_char)
Check if a given ISAM index exists.
Definition: seqdbisam.cpp:1200
string m_IndexFname
The filename of the ISAM index file.
Definition: seqdbisam.hpp:1200
int m_TermSize
size of the numeric key-data pair
Definition: seqdbisam.hpp:1251
bool x_IdentToOid(Int8 id, TOid &oid)
Numeric identifier lookup.
Definition: seqdbisam.cpp:1221
CSeqDBFileMemMap m_IndexLease
A persistent lease on the ISAM index file.
Definition: seqdbisam.hpp:1186
CSeqDBAtlas & m_Atlas
The memory management layer.
Definition: seqdbisam.hpp:1180
CSeqDBNegativeList.
int GetNumTis() const
Get the number of TIs in the array.
void AddIncludedOid(int oid)
Include an OID in the iteration.
void AddVisibleOid(int oid)
Indicate a visible OID.
int GetNumGis() const
Get the number of GIs in the array.
int GetNumSis() const
Get the number of SeqIds in the array.
void InsureOrder()
Sort list if not already sorted.
CSeqIdException –.
Definition: Seq_id.hpp:1001
const char * file_name[]
#define true
Definition: bool.h:35
#define false
Definition: bool.h:36
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
#define P(a, b)
Definition: sqlwparams.h:19
static const char * str(char *buf, int n)
Definition: stats.c:84
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
virtual bool Exists(void) const
Check existence of file.
Definition: ncbifile.hpp:4039
const string AsFastaString(void) const
Definition: Seq_id.cpp:2266
@ fParse_RawText
Try to ID raw non-numeric accessions.
Definition: Seq_id.hpp:81
@ fParse_AnyLocal
Treat otherwise unidentified strings as local accessions as long as they don't resemble FASTA-style I...
Definition: Seq_id.hpp:90
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
#define NPOS
Definition: ncbistr.hpp:133
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2882
static string UIntToString(unsigned int value, TNumToStringFlags flags=0, int base=10)
Convert UInt to string.
Definition: ncbistr.hpp:5111
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
Definition: ncbistr.cpp:3545
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:6929
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
static int version
Definition: mdb_load.c:29
const struct ncbi::grid::netcache::search::fields::SIZE size
const struct ncbi::grid::netcache::search::fields::KEY key
static const BitmapCharRec ch1
Definition: ncbi_10x20.c:1827
static const BitmapCharRec ch2
Definition: ncbi_10x20.c:1819
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int tolower(Uchar c)
Definition: ncbictype.hpp:72
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
int toupper(Uchar c)
Definition: ncbictype.hpp:73
Useful/utility classes and methods.
static const char * prefix[]
Definition: pcregrep.c:405
ESeqDBIdType
Various identifier formats used in Id lookup.
@ eStringId
Some sequence sources use string identifiers.
@ eTiId
Trace ID is a numeric identifier for Trace sequences.
@ ePigId
Each PIG identifier refers to exactly one protein sequence.
@ eGiId
Genomic ID is a relatively stable numeric identifier for sequences.
@ eHashId
#define SEQDB_ISEOL(x)
Macro for EOL chars.
T SeqDB_GetStdOrd(const T *stdord_obj)
Read a network order integer value.
USING_SCOPE(objects)
Import this namespace.
#define DEFAULT_SISAM_SIZE
Default page size for string indices.
Definition: seqdbisam.cpp:52
static bool ENDS_ISAM_KEY(char P)
Returns true if the character is a terminator for an ISAM key.
Definition: seqdbisam.cpp:584
#define DEFAULT_NISAM_SIZE
Default page size for numeric indices.
Definition: seqdbisam.cpp:49
#define ISAM_VERSION
Format version of the ISAM files.
Definition: seqdbisam.cpp:46
#define MEMORY_ONLY_PAGE_SIZE
Special page size value which indicates a memory-only string index.
Definition: seqdbisam.cpp:55
static bool s_IsSameAccession(string acc1, string acc2)
Definition: seqdbisam.cpp:306
static char s_SeqDBIsam_NullifyEOLs(char c)
Return NUL for nulls or EOL characters.
Definition: seqdbisam.cpp:571
const char ISAM_DATA_CHAR
The terminating character for string ISAM keys when data is present.
Definition: seqdbisam.cpp:581
ISAM index database access object.
Definition: _hash_fun.h:40
#define _TROUBLE
#define _ASSERT
@ eNumeric
Definition: text_util.cpp:53
done
Definition: token1.c:1
else result
Definition: token2.c:20
Modified on Mon Jul 15 05:31:37 2024 by modify_doxy.py rev. 669887