NCBI C++ ToolKit
seqdbcommon.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seqdbcommon.cpp 100751 2023-09-07 12:41:08Z boratyng $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Kevin Bealer
27  *
28  */
29 
30 /// @file seqdbcommon.cpp
31 /// Definitions of various helper functions for SeqDB.
32 #include <ncbi_pch.hpp>
33 #include <corelib/metareg.hpp>
34 #include <corelib/ncbienv.hpp>
35 #include <corelib/ncbifile.hpp>
37 #include <util/sequtil/sequtil.hpp>
39 #include <objects/seq/seq__.hpp>
44 #include <algorithm>
45 
47 
/// File name ("index.alx") of the group alias file.
/// NOTE(review): how the file is located and consumed is not visible in
/// this part of the source -- confirm against the alias-set code.
const string kSeqDBGroupAliasFileName = "index.alx";
49 
51 {
52  int off = s.FindLastOf(CFile::GetPathSeparator());
53 
54  if (off != -1) {
55  s.EraseFront(off + 1);
56  }
57 
58  return s;
59 }
60 
61 
63 {
64  int off = s.FindLastOf(CFile::GetPathSeparator());
65 
66  if (off != -1) {
67  s.Resize(off);
68  } else {
69  s.Clear();
70  }
71 
72  return s;
73 }
74 
75 
77 {
78  // This used to remove anything after the last "." it could find.
79  // Then it was changed to only remove the part after the ".", if
80  // it did not contain a "/" character.
81 
82  // Now it has been made even stricter, it looks for something like
83  // "(.*)([.][a-zA-Z]{3})" and removes the second sub-expression if
84  // there is a match. This is because of mismatches like "1234.00"
85  // that are not real "file extensions" in the way that SeqDB wants
86  // to process them.
87 
88  int slen = s.Size();
89 
90  if (slen > 4) {
91  string extn(s.GetEnd()-4, s.GetEnd());
92  string extn2(extn, 2, 4);
93  // Of course, nal and pal are not the only valid
94  // extensions, but this code is only used with these two,
95  // as far as I know, at this moment in time.
96 
97  if (extn[0] == '.' &&
98  (extn[1] == 'n' || extn[1] == 'p') &&
99  (extn2 == "al" || extn2 == "in" || extn2 == "db")) {
100  /*
101  isalpha( s[slen-3] ) &&
102  isalpha( s[slen-2] ) &&
103  isalpha( s[slen-1] )) {*/
104 
105  s.Resize(slen - 4);
106  }
107  }
108 
109  return s;
110 }
111 
112 
115  char delim)
116 {
117  for(int i = 0; i < buffer.Size(); i++) {
118  if (buffer[i] == delim) {
119  front = buffer;
120 
121  buffer.EraseFront(i + 1);
122  front.Resize(i);
123 
124  return true;
125  }
126  }
127  return false;
128 }
129 
130 
132  const CSeqDB_Substring & two,
133  const CSeqDB_Substring * extn,
134  string & outp)
135 {
136  char delim = CFile::GetPathSeparator();
137 
138  int extn_amt = extn ? (extn->Size()+1) : 0;
139 
140  if (two.Empty()) {
141  // We only use the extension if there is a filename.
142  one.GetString(outp);
143  return;
144  }
145 
146  bool only_two = false;
147 
148  if (one.Empty() || two[0] == delim) {
149  only_two = true;
150  }
151 
152  // Drive letter test for CP/M derived systems
153  if (delim == '\\' &&
154  two.Size() > 3 &&
155  isalpha(two[0]) &&
156  two[1] == ':' &&
157  two[2] == '\\') {
158 
159  only_two = true;
160  }
161 
162  if (only_two) {
163  outp.reserve(two.Size() + extn_amt);
164  two.GetString(outp);
165 
166  if (extn) {
167  outp.append(".");
168  outp.append(extn->GetBegin(), extn->GetEnd());
169  }
170  return;
171  }
172 
173  outp.reserve(one.Size() + two.Size() + 1 + extn_amt);
174 
175  one.GetString(outp);
176 
177  if (outp[outp.size() - 1] != delim) {
178  outp += delim;
179  }
180 
181  outp.append(two.GetBegin(), two.GetEnd());
182 
183  if (extn) {
184  outp.append(".");
185  outp.append(extn->GetBegin(), extn->GetEnd());
186  }
187 }
188 
189 
190 bool SeqDB_CompareVolume(const string & s1, const string & s2)
191 {
192  string x1, x2;
195  if (x1 != x2) return (x1 < x2);
196  else return (s1 < s2);
197 }
198 
199 /// File existence test interface.
201 public:
202  /// Destructor
204  {
205  }
206 
207  /// Check if file exists at fully qualified path.
208  /// @param fname Filename.
209  /// @return True if the file was found.
210  virtual bool DoesFileExist(const string & fname) = 0;
211 };
212 
213 
214 /// Test whether an index or alias file exists
215 ///
216 /// The provide filename is combined with both of the extensions
217 /// appropriate to the database sequence type, and the resulting
218 /// strings are checked for existence in the file system. The
219 /// 'access' object defines how to check file existence.
220 ///
221 /// @param dbname
222 /// Input path and filename
223 /// @param dbtype
224 /// Database type, either protein or nucleotide
225 /// @param access
226 /// The file access object.
227 /// @param linkoutdb_search
228 /// Determines whether linkoutdb files should be searched for
229 /// @return
230 /// true if either of the index or alias files is found
231 
232 static bool s_SeqDB_DBExists(const string & dbname,
233  char dbtype,
234  CSeqDB_FileExistence & access,
235  bool linkoutdb_search)
236 {
237  string path;
238  path.reserve(dbname.size() + 4);
239  path.assign(dbname.data(), dbname.data() + dbname.size());
240 
241  if (linkoutdb_search) {
242  _ASSERT(dbtype == 'p');
243  path.append(".sqlite3");
244  if (access.DoesFileExist(path)) {
245  return true;
246  }
247  } else {
248  path.append(".-al");
249 
250  path[path.size()-3] = dbtype;
251 
252  if (access.DoesFileExist(path)) {
253  return true;
254  }
255 
256  path[path.size()-2] = 'i';
257  path[path.size()-1] = 'n';
258 
259  if (access.DoesFileExist(path)) {
260  return true;
261  }
262  }
263 
264  return false;
265 }
266 
267 
/// Returns the string used to separate the entries of a search path
/// list on the current platform: ":" when NCBI_OS_UNIX is defined,
/// ";" otherwise.
static string s_GetPathSplitter()
{
#if defined(NCBI_OS_UNIX)
    return string(":");
#else
    return string(";");
#endif
}
282 
283 
284 void SeqDB_ConvertOSPath(string & dbs)
285 {
286  // See also CDirEntry::ConvertToOSPath()
287 
288  char delim = CDirEntry::GetPathSeparator();
289 
290  for(size_t i = 0; i<dbs.size(); i++) {
291  if (dbs[i] == '/' || dbs[i] == '\\') {
292  dbs[i] = delim;
293  }
294  }
295 }
296 
297 
298 string SeqDB_MakeOSPath(const string & dbs)
299 {
300  string cvt(dbs);
301  SeqDB_ConvertOSPath(cvt);
302  return cvt;
303 }
304 
305 
306 /// Search for a file in a provided set of paths
307 ///
308 /// This function takes a search path as a ":" delimited set of path
309 /// names, and searches in those paths for the given database
310 /// component. The component name may include path components. If
311 /// the exact flag is set, the path is assumed to contain any required
312 /// extension; otherwise extensions for index and alias files will be
313 /// tried. Each element of the search path is tried in sequential
314 /// order for both index or alias files (if exact is not set), before
315 /// moving to the next element of the search path. The path returned
316 /// from this function will not contain a file extension unless the
317 /// provided filename did (in which case, exact is normally set).
318 ///
319 /// @param blast_paths
320 /// List of filesystem paths separated by ":".
321 /// @param dbname
322 /// Base name of the database index or alias file to search for.
323 /// @param dbtype
324 /// Type of database, either protein or nucleotide.
325 /// @param exact
326 /// Set to true if dbname already contains any needed extension.
327 /// @param linkoutdb_search
328 /// Determines whether linkoutdb files should be searched for
329 /// @return
330 /// Full pathname, minus extension, or empty string if none found.
331 
332 static string s_SeqDB_TryPaths(const string & blast_paths,
333  const string & dbname,
334  char dbtype,
335  bool exact,
336  CSeqDB_FileExistence & access,
337  bool linkoutdb_search = false)
338 {
339  // 1. If this was a vector<CSeqDB_Substring>, the tokenize would
340  // not need to do any allocations (but would need rewriting).
341  //
342  // 2. If this was split into several functions, and/or a stateful
343  // class was used, this would perform better here, and would
344  // allow improvement of the search routine for combined group
345  // indices (see comments in CSeqDBAliasSets::FindAliasPath).
346 
347  vector<string> roads;
348  NStr::Split(blast_paths, s_GetPathSplitter(), roads, NStr::fSplit_Tokenize);
349 
350  string result;
351  string attempt;
352 
353  ITERATE(vector<string>, road, roads) {
354  attempt.erase();
355 
358  0,
359  attempt);
360 
361  if (exact) {
362  if (access.DoesFileExist(attempt)) {
363  result = attempt;
364  break;
365  }
366  } else {
367  if (s_SeqDB_DBExists(attempt, dbtype, access, linkoutdb_search)) {
368  result = attempt;
369  break;
370  }
371  }
372  }
373 
374  return result;
375 }
376 
377 static string
379  char dbtype,
380  string * sp,
381  bool exact,
382  CSeqDB_FileExistence & access,
383  const string path="")
384 {
385  const string pathology = (path=="") ? CSeqDBAtlas::GenerateSearchPath() : path;
386 
387  if (sp) {
388  *sp = pathology;
389  }
390 
391  return s_SeqDB_TryPaths(pathology, dbname, dbtype, exact, access);
392 }
393 
394 /// Check file existence using CSeqDBAtlas.
396 public:
397  /// Constructor.
399  : m_Atlas (atlas)
400  {
401  }
402 
403  /// Test file existence.
404  /// @param fname Fully qualified name of file for which to look.
405  /// @return True iff file exists.
406  virtual bool DoesFileExist(const string & fname)
407  {
408  return m_Atlas.DoesFileExist(fname);
409  }
410 
411 private:
413 };
414 
415 
416 string SeqDB_FindBlastDBPath(const string & dbname,
417  char dbtype,
418  string * sp,
419  bool exact,
420  CSeqDBAtlas & atlas)
421 {
422  CSeqDB_AtlasAccessor access(atlas);
423 
425  dbtype,
426  sp,
427  exact,
428  access,
429  atlas.GetSearchPath());
430 }
431 
432 
433 /// Check file existence using CFile.
435 public:
436  /// Constructor.
438  {
439  }
440 
441  /// Test file existence.
442  /// @param fname Fully qualified name of file for which to look.
443  /// @return True iff file exists.
444  virtual bool DoesFileExist(const string & fname)
445  {
446  // Use the same criteria as the Atlas code would.
447  CFile whole(SeqDB_MakeOSPath(fname));
448  return whole.GetLength() != (Int8) -1;
449  }
450 };
451 
452 
453 string SeqDB_ResolveDbPath(const string & filename)
454 {
455  CSeqDB_SimpleAccessor access;
456 
457  return s_SeqDB_FindBlastDBPath(filename,
458  '-',
459  0,
460  true,
461  access);
462 }
463 
464 string SeqDB_ResolveDbPathNoExtension(const string & filename,
465  char dbtype /* = '-' */)
466 {
467  CSeqDB_SimpleAccessor access;
468 
469  return s_SeqDB_FindBlastDBPath(filename, dbtype, 0, false, access);
470 }
471 
472 string SeqDB_ResolveDbPathForLinkoutDB(const string & filename)
473 {
474  const char dbtype('p'); // this is determined by blastdb_links application
475  CSeqDB_SimpleAccessor access;
476  const string pathology = CSeqDBAtlas::GenerateSearchPath();
477  return s_SeqDB_TryPaths(pathology, filename, dbtype, false, access, true);
478 }
479 
480 void SeqDB_JoinDelim(string & a, const string & b, const string & delim)
481 {
482  if (b.empty()) {
483  return;
484  }
485 
486  if (a.empty()) {
487  // a has no size - but might have capacity
489  return;
490  }
491 
492  size_t newlen = a.length() + b.length() + delim.length();
493 
494  if (a.capacity() < newlen) {
495  size_t newcap = 16;
496 
497  while(newcap < newlen) {
498  newcap <<= 1;
499  }
500 
501  a.reserve(newcap);
502  }
503 
504  a += delim;
505  a += b;
506 }
507 
508 
510  : m_CurrentOrder(eNone), m_MaskOpts(0)
511 {
512 }
513 
514 
515 /// Compare SGiOid structs by OID.
517 public:
518  /// Test whether lhs is less than (occurs before) rhs.
519  /// @param lhs Left hand side of less-than operator. [in]
520  /// @param rhs Right hand side of less-than operator. [in]
521  /// @return True if lhs has a lower OID than rhs.
523  const CSeqDBGiList::SGiOid & rhs)
524  {
525  return lhs.oid < rhs.oid;
526  }
527 };
528 
529 
530 /// Compare SGiOid structs by GI.
532 public:
533  /// Test whether lhs is less than (occurs before) rhs.
534  /// @param lhs Left hand side of less-than operator. [in]
535  /// @param rhs Right hand side of less-than operator. [in]
536  /// @return True if lhs has a lower GI than rhs.
538  const CSeqDBGiList::SGiOid & rhs)
539  {
540  return lhs.gi < rhs.gi;
541  }
542 };
543 
545 public:
546  /// Test whether lhs is less than (occurs before) rhs.
547  /// @param lhs Left hand side of less-than operator. [in]
548  /// @param rhs Right hand side of less-than operator. [in]
549 /// @return True if lhs has a lower PIG than rhs.
551  const CSeqDBGiList::SPigOid & rhs)
552  {
553  return lhs.pig < rhs.pig;
554  }
555 };
556 
557 
558 
559 /// Compare STiOid structs by TI.
561 public:
562  /// Test whether lhs is less than (occurs before) rhs.
563  /// @param lhs Left hand side of less-than operator. [in]
564  /// @param rhs Right hand side of less-than operator. [in]
565 /// @return True if lhs has a lower TI than rhs.
567  const CSeqDBGiList::STiOid & rhs)
568  {
569  return lhs.ti < rhs.ti;
570  }
571 };
572 
573 
574 /// Compare SSeqIdOid structs by SeqId.
576 public:
577  /// Test whether lhs is less than (occurs before) rhs.
578  /// @param lhs Left hand side of less-than operator. [in]
579  /// @param rhs Right hand side of less-than operator. [in]
580  /// @return True if lhs sorts before rhs by Seq-id.
582  const CSeqDBGiList::SSiOid & rhs)
583  {
584  return lhs.si < rhs.si;
585  }
586 };
587 
588 
589 template<class TCompare, class TVector>
591 {
592  bool already = true;
593 
594  TCompare compare_less;
595 
596  for(int i = 1; i < (int) v.size(); i++) {
597  if (compare_less(v[i], v[i-1])) {
598  already = false;
599  break;
600  }
601  }
602 
603  if (! already) {
604  sort(v.begin(), v.end(), compare_less);
605  }
606 }
607 
609 {
610  NON_CONST_ITERATE(vector<CSeqDBGiList::SSiOid>, itr, m_SisOids) {
611  string str_id = SeqDB_SimplifyAccession(itr->si);
612  itr->si = NStr::ToLower(str_id);
613  }
614 }
615 
617 {
618  // Code depends on OID order after translation, because various
619  // methods of SeqDB use this class for filtering purposes.
620  static CFastMutex mtx;
621  CFastMutexGuard mtx_gurad(mtx);
622  if ((order < m_CurrentOrder) || (order == eNone)) {
624  eFileErr,
625  "Out of sequence sort order requested.");
626  }
627 
628  // Input is usually sorted by GI, so we first test for sortedness.
629  // If it will fail it will probably do so almost immediately.
630 
631  if (order != m_CurrentOrder) {
632  switch(order) {
633  case eNone:
634  break;
635 
636  case eGi:
637  s_InsureOrder<CSeqDB_SortGiLessThan>(m_GisOids);
638  s_InsureOrder<CSeqDB_SortTiLessThan>(m_TisOids);
639  s_InsureOrder<CSeqDB_SortSiLessThan>(m_SisOids);
640  s_InsureOrder<CSeqDB_SortPigLessThan>(m_PigsOids);
641  break;
642 
643  default:
645  eFileErr,
646  "Unrecognized sort order requested.");
647  }
648 
649  m_CurrentOrder = order;
650  }
651 }
652 
653 
654 bool CSeqDBGiList::FindGi(TGi gi) const
655 {
656  int oid(0), index(0);
657  return (const_cast<CSeqDBGiList *>(this))->GiToOid(gi, oid, index);
658 }
659 
660 
661 bool CSeqDBGiList::GiToOid(TGi gi, int & oid)
662 {
663  int index(0);
664  return GiToOid(gi, oid, index);
665 }
666 
667 
668 bool CSeqDBGiList::GiToOid(TGi gi, int & oid, int & index)
669 {
670  InsureOrder(eGi); // would assert be better?
671 
672  int b(0), e((int)m_GisOids.size());
673 
674  while(b < e) {
675  int m = (b + e)/2;
676  TGi m_gi = m_GisOids[m].gi;
677 
678  if (m_gi < gi) {
679  b = m + 1;
680  } else if (m_gi > gi) {
681  e = m;
682  } else {
683  oid = m_GisOids[m].oid;
684  index = m;
685  return true;
686  }
687  }
688 
689  oid = index = -1;
690  return false;
691 }
692 
693 
694 bool CSeqDBGiList::FindTi(TTi ti) const
695 {
696  int oid(0), index(0);
697  return (const_cast<CSeqDBGiList *>(this))->TiToOid(ti, oid, index);
698 }
699 
700 
701 bool CSeqDBGiList::TiToOid(TTi ti, int & oid)
702 {
703  int index(0);
704  return TiToOid(ti, oid, index);
705 }
706 
707 
708 bool CSeqDBGiList::TiToOid(TTi ti, int & oid, int & index)
709 {
710  InsureOrder(eGi); // would assert be better?
711 
712  int b(0), e((int)m_TisOids.size());
713 
714  while(b < e) {
715  int m = (b + e)/2;
716  TTi m_ti = m_TisOids[m].ti;
717 
718  if (m_ti < ti) {
719  b = m + 1;
720  } else if (m_ti > ti) {
721  e = m;
722  } else {
723  oid = m_TisOids[m].oid;
724  index = m;
725  return true;
726  }
727  }
728 
729  oid = index = -1;
730  return false;
731 }
732 
733 bool CSeqDBGiList::FindSi(const string &si) const
734 {
735  int oid(0), index(0);
736  return (const_cast<CSeqDBGiList *>(this))->SiToOid(si, oid, index);
737 }
738 
739 bool CSeqDBGiList::SiToOid(const string &si, int & oid)
740 {
741  int index(0);
742  return SiToOid(si, oid, index);
743 }
744 
745 bool CSeqDBGiList::SiToOid(const string &si, int & oid, int & index)
746 {
747  InsureOrder(eGi);
748 
749  int b(0), e((int)m_SisOids.size());
750 
751  while(b < e) {
752  int m = (b + e)/2;
753  const string & m_si = m_SisOids[m].si;
754 
755  if (m_si < si) {
756  b = m + 1;
757  } else if (si < m_si) {
758  e = m;
759  } else {
760  oid = m_SisOids[m].oid;
761  index = m;
762  return true;
763  }
764  }
765 
766  oid = index = -1;
767  return false;
768 }
769 
770 void
771 CSeqDBGiList::GetGiList(vector<TGi>& gis) const
772 {
773  gis.clear();
774  gis.reserve(GetNumGis());
775 
776  ITERATE(vector<SGiOid>, itr, m_GisOids) {
777  gis.push_back(itr->gi);
778  }
779 }
780 
781 void
782 CSeqDBGiList::GetPigList(vector<TPig>& pigs) const
783 {
784  pigs.clear();
785  pigs.reserve(GetNumPigs());
786 
787  ITERATE(vector<SPigOid>, itr, m_PigsOids) {
788  pigs.push_back(itr->pig);
789  }
790 }
791 
792 void
793 CSeqDBGiList::GetTiList(vector<TTi>& tis) const
794 {
795  tis.clear();
796  tis.reserve(GetNumTis());
797 
798  ITERATE(vector<STiOid>, itr, m_TisOids) {
799  tis.push_back(itr->ti);
800  }
801 }
802 
803 
804 void
805 CSeqDBGiList::GetSiList(vector<string>& sis) const
806 {
807  sis.clear();
808  sis.reserve(GetNumSis());
809 
810  ITERATE(vector<SSiOid>, itr, m_SisOids) {
811  sis.push_back(itr->si);
812  }
813 }
814 
815 
816 
817 
818 
819 void SeqDB_ReadBinaryGiList(const string & fname, vector<TGi> & gis)
820 {
821  CMemoryFile mfile(SeqDB_MakeOSPath(fname));
822 
823  Uint4 * beginp = (Uint4*) mfile.GetPtr();
824  Uint4 * endp = (Uint4*) (((char*)mfile.GetPtr()) + mfile.GetSize());
825 
826  Int4 num_gis = (Int4) (endp - beginp) - 2;
827 
828  gis.clear();
829 
830  if (((endp - beginp) < 2U)
831  || (beginp[0] != 0xFFFFFFFFU)
832  || (SeqDB_GetStdOrd(beginp + 1) != (Uint4) num_gis)) {
834  eFileErr,
835  "Specified file is not a valid binary GI file.");
836  }
837 
838  gis.reserve(num_gis);
839 
840  for(Uint4 * elem = (beginp + 2); elem < endp; ++elem) {
841  gis.push_back(GI_FROM(Uint4, SeqDB_GetStdOrd(elem)));
842  }
843 }
844 
845 /// This function determines whether a file is a valid binary GI/TI file.
846 /// @param fbeginp pointer to start of file [in]
847 /// @param fendp pointer to end of file [in]
848 /// @param has_long_ids will be set to true if the gi file contains long IDs [out]
849 /// @param has_tis will be set to true if the input file contains Trace IDs,
850 /// otherwise the file contains GIs [out]
851 /// @returns true if file is binary
852 /// @throws CSeqDBException if file is empty or invalid gi file
853 static
854 bool s_SeqDB_IsBinaryNumericList(const char* fbeginp, const char* fendp,
855  bool& has_long_ids, bool* has_tis = NULL)
856 {
857  bool retval = false;
858  has_long_ids = false;
859  if (has_tis)
860  *has_tis = false;
861  Uint8 file_size = fendp - fbeginp;
862 
863  if (file_size == 0) {
865  eFileErr,
866  "Specified file is empty.");
867  } else if (isdigit((unsigned char)(*((char*) fbeginp))) ||
868  ((unsigned char)(*((char*) fbeginp)) == '#')) {
869  retval = false;
870  } else if ((file_size >= 8) && ((*fbeginp & 0xFF) == 0xFF)) {
871  retval = true;
872 
873  int marker = fbeginp[3] & 0xFF;
874 
875  if (marker == 0xFE || marker == 0xFC) {
876  has_long_ids = true;
877  }
878  if (has_tis && (marker == 0xFD || marker == 0xFC)) {
879  *has_tis = true;
880  }
881  } else {
883  eFileErr,
884  "Specified file is not a valid GI/TI list.");
885  }
886  return retval;
887 }
888 
889 int s_ReadDigit(const char d, const string & list_type)
890 {
891  switch(d) {
892  case '0':
893  return 0;
894  case '1':
895  return 1;
896  case '2':
897  return 2;
898  case '3':
899  return 3;
900  case '4':
901  return 4;
902  case '5':
903  return 5;
904  case '6':
905  return 6;
906  case '7':
907  return 7;
908  case '8':
909  return 8;
910  case '9':
911  return 9;
912  case ' ':
913  case '\n':
914  case '\r':
915  return -1;
916  default:
917  {
918  string msg = string("Invalid byte in text" + list_type + " list [") +
919  NStr::UIntToString((unsigned char) d) + "].";
920  NCBI_THROW(CSeqDBException, eFileErr, msg);
921  }
922  }
923 }
924 
925 void SeqDB_ReadMemoryGiList(const char * fbeginp,
926  const char * fendp,
927  vector<CSeqDBGiList::SGiOid> & gis,
928  bool * in_order)
929 {
930  bool long_ids = false;
931  Uint8 file_size = fendp - fbeginp;
932 
933  if (s_SeqDB_IsBinaryNumericList(fbeginp, fendp, long_ids)) {
934  _ASSERT(long_ids == false);
935  Uint4* bbeginp = (Uint4*) fbeginp;
936  Uint4* bendp = (Uint4*) fendp;
937 
938  Uint8 num_gis = bendp - bbeginp - 2;
939 
940  gis.clear();
941 
942  if ((bbeginp[0] != 0xFFFFFFFFU)
943  || (SeqDB_GetStdOrd(bbeginp + 1) != (Uint4) num_gis)) {
945  eFileErr,
946  "Specified file is not a valid binary GI file.");
947  }
948 
949  gis.reserve(num_gis);
950 
951  if (in_order) {
952  TGi prev_gi = ZERO_GI;
953  bool in_gi_order = true;
954 
955  Uint4* elem = bbeginp + 2;
956  while(elem < bendp) {
957  TGi this_gi = GI_FROM(Uint4, SeqDB_GetStdOrd(elem));
958  gis.push_back(this_gi);
959 
960  if (prev_gi > this_gi) {
961  in_gi_order = false;
962  break;
963  }
964  prev_gi = this_gi;
965  elem++;
966  }
967 
968  while(elem < bendp) {
969  gis.push_back(GI_FROM(Uint4, SeqDB_GetStdOrd(elem++)));
970  }
971 
972  *in_order = in_gi_order;
973  } else {
974  for(Uint4 * elem = (bbeginp + 2); elem < bendp; ++elem) {
975  gis.push_back(GI_FROM(Uint4, SeqDB_GetStdOrd(elem)));
976  }
977  }
978  } else {
979  _ASSERT(long_ids == false);
980  // We would prefer to do only one allocation, so assume
981  // average gi is 6 digits plus newline. A few extra will be
982  // allocated, but this is preferable to letting the vector
983  // double itself (which it still will do if needed).
984 
985  gis.reserve((int) (file_size / 7));
986 
987  Uint4 elem(0);
988  const string list_type("GI");
989 
990  for(const char * p = fbeginp; p < fendp; p ++) {
991  int dig = s_ReadDigit(*p, list_type);
992  if (dig == -1) {
993  if (elem != 0) {
994  gis.push_back(GI_FROM(Uint4, elem));
995  }
996  elem = 0;
997  continue;
998  }
999  elem *= 10;
1000  elem += dig;
1001  }
1002  }
1003 }
1004 
1005 void SeqDB_ReadMemoryPigList(const char * fbeginp,
1006  const char * fendp,
1007  vector<CSeqDBGiList::SPigOid> & pigs,
1008  bool * in_order)
1009 {
1010  bool long_ids = false;
1011  Int8 file_size = fendp - fbeginp;
1012 
1013  if (s_SeqDB_IsBinaryNumericList(fbeginp, fendp, long_ids)) {
1014  Uint4* bbeginp = (Uint4*) fbeginp;
1015  Uint4* bendp = (Uint4*) fendp;
1016 
1017  Int4 num_pigs = (Int4) (bendp - bbeginp) - 2;
1018 
1019  pigs.clear();
1020 
1021  if (((bendp - bbeginp) < 2U)
1022  || (bbeginp[0] != 0xFFFFFFFFU)
1023  || (SeqDB_GetStdOrd(bbeginp + 1) != (Uint4) num_pigs)) {
1025  eFileErr,
1026  "Specified file is not a valid binary IPG file.");
1027  }
1028 
1029  pigs.reserve(num_pigs);
1030 
1031  if (in_order) {
1032  TPig prev_pig = 0;
1033  bool sorted = true;
1034 
1035  Uint4* elem = bbeginp + 2;
1036  while(elem < bendp) {
1037  TPig this_pig = SeqDB_GetStdOrd(elem);
1038  pigs.push_back(this_pig);
1039 
1040  if (prev_pig > this_pig) {
1041  sorted = false;
1042  break;
1043  }
1044  prev_pig = this_pig;
1045  elem++;
1046  }
1047 
1048  while(elem < bendp) {
1049  pigs.push_back(SeqDB_GetStdOrd(elem++));
1050  }
1051 
1052  *in_order = sorted;
1053  } else {
1054  for(Uint4 * elem = (bbeginp + 2); elem < bendp; ++elem) {
1055  pigs.push_back(SeqDB_GetStdOrd(elem));
1056  }
1057  }
1058  } else {
1059  pigs.reserve((int) (file_size / 7));
1060 
1061  Uint4 elem(0);
1062  const string list_type("IPG");
1063 
1064  for(const char * p = fbeginp; p < fendp; p ++) {
1065  int dig = s_ReadDigit(*p, list_type);
1066  if (dig == -1) {
1067  // Skip blank lines or comments by ignoring zero.
1068  if (elem != 0) {
1069  pigs.push_back(elem);
1070  }
1071  elem = 0;
1072  continue;
1073  }
1074  elem *= 10;
1075  elem += dig;
1076  }
1077  }
1078 }
1079 
1080 void SeqDB_ReadMemoryTaxIdList(const char * fbeginp,
1081  const char * fendp,
1082  CSeqDBGiList::STaxIdsOids & taxids)
1083 {
1084  bool long_ids = false;
1085  if (s_SeqDB_IsBinaryNumericList(fbeginp, fendp, long_ids)) {
1086  Int4* bbeginp = (Int4*) fbeginp;
1087  Int4* bendp = (Int4*) fendp;
1088 
1089  Uint8 num_taxids = (bendp - bbeginp) - 2;
1090 
1091  taxids.tax_ids.clear();
1092  taxids.oids.clear();
1093 
1094  if (((bendp - bbeginp) < 2) || (bbeginp[0] != 0xFFFFFFFF)
1095  || (SeqDB_GetStdOrd(bbeginp + 1) != (Int4) num_taxids)) {
1096  NCBI_THROW(CSeqDBException, eFileErr,
1097  "Specified file is not a valid binary Tax Id List file.");
1098  }
1099 
1100  for(Int4 * elem = (bbeginp + 2); elem < bendp; ++elem) {
1101  taxids.tax_ids.insert(TAX_ID_FROM(Int4, SeqDB_GetStdOrd(elem)));
1102  }
1103  } else {
1104  Int4 elem(0);
1105  const string list_type("TAXID");
1106 
1107  for(const char * p = fbeginp; p < fendp; p ++) {
1108  int dig = s_ReadDigit(*p, list_type);
1109  if (dig == -1) {
1110  // Skip blank lines or comments by ignoring zero.
1111  if (elem != 0) {
1112  taxids.tax_ids.insert(TAX_ID_FROM(Int4, elem));
1113  }
1114  elem = 0;
1115  continue;
1116  }
1117  elem *= 10;
1118  elem += dig;
1119  }
1120  }
1121 }
1122 
1123 // [ NOTE: The 8 byte versions described here are not yet
1124 // implemented. ]
1125 //
1126 // FF..FF = -1 -> GI list <32 bit>
1127 // FF..FE = -2 -> GI list <64 bit>
1128 // FF..FD = -3 -> TI list <32 bit>
1129 // FF..FC = -4 -> TI list <64 bit>
1130 //
1131 // Format of the 8 byte TI list; note that we are still limited to
1132 // 2^32-1 TIs, which would involve a 32 GB identifier list file; this
1133 // code (in its current form) will not work at all on a 32 bit system
1134 // for GI files with more than about 500 megasequences, or TI files
1135 // with more than about 256 megasequences, assuming the current 16
1136 // bytes per vector element. This is larger than the current total
1137 // number of GI sequences, but not larger than the number of TIs, so a
1138 // TI query for all TIs everywhere will most likely choke on a 32 bit
1139 // system because the data will simply not fit into memory (there are
1140 // nearly that many active TIs and the program will have other memory
1141 // expenditures.)
1142 //
1143 // 4 bytes: FF FF FF F?
1144 // 4 bytes: <number of TIs>
1145 // 8 bytes: TI#0
1146 // 8 bytes: TI#1
1147 // ...
1148 
1149 void SeqDB_ReadMemoryTiList(const char * fbeginp,
1150  const char * fendp,
1151  vector<CSeqDBGiList::STiOid> & tis,
1152  bool * in_order)
1153 {
1154  bool long_ids = false;
1155  Int8 file_size = fendp - fbeginp;
1156 
1157  if (s_SeqDB_IsBinaryNumericList(fbeginp, fendp, long_ids)) {
1158  Int4 * bbeginp = (Int4*) fbeginp;
1159  Int4 * bendp = (Int4*) fendp;
1160  Int4 * bdatap = bbeginp + 2;
1161 
1162  Uint4 num_tis = (int)(bendp-bdatap);
1163 
1164  int remainder = num_tis % 2;
1165 
1166  if (long_ids) {
1167  num_tis /= 2;
1168  }
1169 
1170  tis.clear();
1171 
1172  bool bad_fmt = false;
1173 
1174  if (bendp < bdatap) {
1175  bad_fmt = true;
1176  } else {
1177  int marker = SeqDB_GetStdOrd(bbeginp);
1178  unsigned num_ids = SeqDB_GetStdOrd(bbeginp+1);
1179 
1180  if ((marker != -3 && marker != -4) ||
1181  (num_ids != num_tis) ||
1182  (remainder && long_ids)) {
1183 
1184  bad_fmt = true;
1185  }
1186  }
1187 
1188  if (bad_fmt) {
1190  eFileErr,
1191  "Specified file is not a valid binary GI or TI file.");
1192  }
1193 
1194  tis.reserve(num_tis);
1195 
1196  if (long_ids) {
1197  Int8 * bdatap8 = (Int8*) bdatap;
1198  Int8 * bendp8 = (Int8*) bendp;
1199 
1200  if (in_order) {
1201  Int8 prev_ti =0;
1202  bool in_ti_order = true;
1203 
1204  Int8 * elem = bdatap8;
1205 
1206  while(elem < bendp8) {
1207  Int8 this_ti = (Int8) SeqDB_GetStdOrd(elem);
1208  tis.push_back(this_ti);
1209 
1210  if (prev_ti > this_ti) {
1211  in_ti_order = false;
1212  break;
1213  }
1214  prev_ti = this_ti;
1215  elem ++;
1216  }
1217 
1218  while(elem < bendp8) {
1219  tis.push_back((Int8) SeqDB_GetStdOrd(elem++));
1220  }
1221 
1222  *in_order = in_ti_order;
1223  } else {
1224  for(Int8 * elem = bdatap8; elem < bendp8; elem ++) {
1225  tis.push_back((Int8) SeqDB_GetStdOrd(elem));
1226  }
1227  }
1228  } else {
1229  if (in_order) {
1230  int prev_ti =0;
1231  bool in_ti_order = true;
1232 
1233  Int4 * elem = bdatap;
1234 
1235  while(elem < bendp) {
1236  int this_ti = (int) SeqDB_GetStdOrd(elem);
1237  tis.push_back(this_ti);
1238 
1239  if (prev_ti > this_ti) {
1240  in_ti_order = false;
1241  break;
1242  }
1243  prev_ti = this_ti;
1244  elem ++;
1245  }
1246 
1247  while(elem < bendp) {
1248  tis.push_back((int) SeqDB_GetStdOrd(elem++));
1249  }
1250 
1251  *in_order = in_ti_order;
1252  } else {
1253  for(Int4 * elem = bdatap; elem < bendp; elem ++) {
1254  tis.push_back((int) SeqDB_GetStdOrd(elem));
1255  }
1256  }
1257  }
1258  } else {
1259  // We would prefer to do only one allocation, so assume
1260  // average gi is 6 digits plus newline. A few extra will be
1261  // allocated, but this is preferable to letting the vector
1262  // double itself (which it still will do if needed).
1263 
1264  tis.reserve(int(file_size / 7));
1265 
1266  Int8 elem(0);
1267  const string list_type("TI");
1268 
1269  for(const char * p = fbeginp; p < fendp; p ++) {
1270  int dig = s_ReadDigit(*p, list_type);
1271  if (dig == -1) {
1272  if (elem != 0) {
1273  tis.push_back(elem);
1274  }
1275  elem = 0;
1276  continue;
1277  }
1278  elem *= 10;
1279  elem += dig;
1280  }
1281  }
1282 }
1283 
1284 void SeqDB_ReadMemorySiList(const char * fbeginp,
1285  const char * fendp,
1286  vector<CSeqDBGiList::SSiOid> & sis,
1287  bool * in_order)
1288 {
1289  Int8 file_size = fendp - fbeginp;
1290 
1291  // We would prefer to do only one allocation, so assume
1292  // average seqid is 6 digits plus newline. A few extra will be
1293  // allocated, but this is preferable to letting the vector
1294  // double itself (which it still will do if needed).
1295 
1296  sis.reserve(sis.size() + int(file_size / 7));
1297 
1298  const char * p = fbeginp;
1299  const char * head;
1300  while ( p < fendp) {
1301  // find the head of the seqid
1302  while (p< fendp && (*p=='>' || *p==' ' || *p=='\t' || *p=='\n' || *p=='\r')) ++p;
1303  if (p< fendp && *p == '#') {
1304  // anything beyond this point in the line is a comment
1305  while (p< fendp && *p!='\n') ++p;
1306  continue;
1307  }
1308  head = p;
1309  while (p< fendp && *p!=' ' && *p!='\t' && *p!='\n' && *p!='\r') ++p;
1310  if (p > head) {
1311  string acc(head, p);
1312  string str_id = NStr::TruncateSpaces(acc, NStr::eTrunc_Both);
1313  if (str_id != "") {
1314  sis.push_back(str_id);
1315  } else {
1316  cerr << "WARNING: " << acc
1317  << " is not a valid seqid string." << endl;
1318  }
1319  }
1320  }
1321  if (in_order) *in_order = false;
1322 }
1323 
1324 void SeqDB_ReadMemoryMixList(const char * fbeginp,
1325  const char * fendp,
1326  vector<CSeqDBGiList::SGiOid> & gis,
1327  vector<CSeqDBGiList::STiOid> & tis,
1328  vector<CSeqDBGiList::SSiOid> & sis,
1329  bool * in_order)
1330 {
1331  Int8 file_size = fendp - fbeginp;
1332 
1333  // We would prefer to do only one allocation, so assume
1334  // average seqid is 6 digits plus newline. A few extra will be
1335  // allocated, but this is preferable to letting the vector
1336  // double itself (which it still will do if needed).
1337 
1338  sis.reserve(sis.size() + int(file_size / 7));
1339 
1340  const char * p = fbeginp;
1341  const char * head;
1342  while ( p < fendp) {
1343  // find the head of the seqid
1344  while (p< fendp && (*p=='>' || *p==' ' || *p=='\t' || *p=='\n' || *p=='\r')) ++p;
1345  if (p< fendp && *p == '#') {
1346  // anything beyond this point in the line is a comment
1347  while (p< fendp && *p!='\n') ++p;
1348  continue;
1349  }
1350  head = p;
1351  while (p< fendp && *p!=' ' && *p!='\t' && *p!='\n' && *p!='\r') ++p;
1352  if (p > head) {
1353  string acc(head, p);
1354  string str_id;
1355  Int8 num_id;
1356  bool simpler;
1357  ESeqDBIdType id_type = SeqDB_SimplifyAccession(acc, num_id, str_id, simpler);
1358  if (eStringId == id_type) {
1359  sis.push_back(NStr::ToLower(str_id));
1360  }
1361  else if (eTiId == id_type) {
1362  tis.push_back((TTi) num_id);
1363  }
1364  else if (eGiId == id_type) {
1365  gis.push_back(GI_FROM(Int8, num_id));
1366  }
1367  else {
1368  cerr << "WARNING: " << acc
1369  << " is not a valid seqid string." << endl;
1370  }
1371  }
1372  }
1373  if (in_order) *in_order = false;
1374 }
1375 
1377 {
1378  CMemoryFile mfile(SeqDB_MakeOSPath(fname));
1379 
1380  Int8 file_size = mfile.GetSize();
1381  const char * fbeginp = (char*) mfile.GetPtr();
1382  const char * fendp = fbeginp + (int)file_size;
1383 
1384  bool ignore = false;
1385  bool has_tis = false;
1386  bool retval = s_SeqDB_IsBinaryNumericList(fbeginp, fendp, ignore, &has_tis);
1388  return retval;
1389  } else {
1390  retval = has_tis && retval;
1391  }
1392  return retval;
1393 }
1394 
1395 bool SeqDB_IsBinaryTiList(const string & fname)
1396 {
1398 }
1399 
1400 bool SeqDB_IsBinaryGiList(const string & fname)
1401 {
1403 }
1404 
1405 void SeqDB_ReadGiList(const string & fname, vector<CSeqDBGiList::SGiOid> & gis, bool * in_order)
1406 {
1407  CMemoryFile mfile(SeqDB_MakeOSPath(fname));
1408 
1409  Int8 file_size = mfile.GetSize();
1410  const char * fbeginp = (char*) mfile.GetPtr();
1411  const char * fendp = fbeginp + file_size;
1412 
1413  SeqDB_ReadMemoryGiList(fbeginp, fendp, gis, in_order);
1414 }
1415 
1416 
1417 void SeqDB_ReadTiList(const string & fname, vector<CSeqDBGiList::STiOid> & tis, bool * in_order)
1418 {
1419  CMemoryFile mfile(SeqDB_MakeOSPath(fname));
1420 
1421  Int8 file_size = mfile.GetSize();
1422  const char * fbeginp = (char*) mfile.GetPtr();
1423  const char * fendp = fbeginp + file_size;
1424 
1425  SeqDB_ReadMemoryTiList(fbeginp, fendp, tis, in_order);
1426 }
1427 
1428 void SeqDB_ReadMixList(const string & fname, vector<CSeqDBGiList::SGiOid> & gis,
1429  vector<CSeqDBGiList::STiOid> & tis, vector<CSeqDBGiList::SSiOid> & sis, bool * in_order)
1430 {
1431  CMemoryFile mfile(SeqDB_MakeOSPath(fname));
1432 
1433  Int8 file_size = mfile.GetSize();
1434  const char *fbeginp = (char*) mfile.GetPtr();
1435  const char *fendp = fbeginp + file_size;
1436 
1437  SeqDB_ReadMemoryMixList(fbeginp, fendp, gis, tis, sis, in_order);
1438 }
1439 
1440 void SeqDB_ReadPigList(const string & fname, vector<CSeqDBGiList::SPigOid> & pigs, bool * in_order)
1441 {
1442  CMemoryFile mfile(SeqDB_MakeOSPath(fname));
1443 
1444  Int8 file_size = mfile.GetSize();
1445  const char * fbeginp = (char*) mfile.GetPtr();
1446  const char * fendp = fbeginp + file_size;
1447 
1448  SeqDB_ReadMemoryPigList(fbeginp, fendp, pigs, in_order);
1449 }
1450 
1451 void SeqDB_ReadTaxIdList(const string & fname, CSeqDBGiList::STaxIdsOids & taxids)
1452 {
1453  CMemoryFile mfile(SeqDB_MakeOSPath(fname));
1454 
1455  Int8 file_size = mfile.GetSize();
1456  const char * fbeginp = (char*) mfile.GetPtr();
1457  const char * fendp = fbeginp + file_size;
1458 
1459  SeqDB_ReadMemoryTaxIdList(fbeginp, fendp, taxids);
1460 }
1461 
1462 void SeqDB_ReadGiList(const string & fname, vector<TGi> & gis, bool * in_order)
1463 {
1464  typedef vector<CSeqDBGiList::SGiOid> TPairList;
1465 
1466  TPairList pairs;
1467  SeqDB_ReadGiList(fname, pairs, in_order);
1468 
1469  gis.reserve(pairs.size());
1470 
1471  ITERATE(TPairList, iter, pairs) {
1472  gis.push_back(iter->gi);
1473  }
1474 }
1475 
1476 void SeqDB_ReadSiList(const string & fname, vector<CSeqDBGiList::SSiOid> & sis, bool * in_order, SBlastSeqIdListInfo & db_info)
1477 {
1478  CMemoryFile mfile(SeqDB_MakeOSPath(fname));
1479  if (CBlastSeqidlistFile::GetSeqidlist(mfile, sis, db_info)) {
1480  *in_order = true;
1481  return;
1482  }
1483  else {
1484  Int8 file_size = mfile.GetSize();
1485  const char *fbeginp = (char*) mfile.GetPtr();
1486  const char *fendp = fbeginp + file_size;
1487  SeqDB_ReadMemorySiList(fbeginp, fendp, sis, in_order);
1488  }
1489 }
1490 
1492 {
1493  InsureOrder();
1494  int b(0), e((int)m_Gis.size());
1495 
1496  while(b < e) {
1497  int m = (b + e)/2;
1498  TGi m_gi = m_Gis[m];
1499 
1500  if (m_gi < gi) {
1501  b = m + 1;
1502  } else if (m_gi > gi) {
1503  e = m;
1504  } else {
1505  return true;
1506  }
1507  }
1508 
1509  return false;
1510 }
1511 
1512 
1514 {
1515  InsureOrder();
1516 
1517  int b(0), e((int)m_Tis.size());
1518 
1519  while(b < e) {
1520  int m = (b + e)/2;
1521  TTi m_ti = m_Tis[m];
1522 
1523  if (m_ti < ti) {
1524  b = m + 1;
1525  } else if (m_ti > ti) {
1526  e = m;
1527  } else {
1528  return true;
1529  }
1530  }
1531 
1532  return false;
1533 }
1534 
1536 {
1537  bool match_type = false;
1538  return FindId(id, match_type);
1539 }
1540 
1541 
1543 {
1544  InsureOrder();
1545  int b(0), e((int)m_Sis.size());
1546 
1547  while(b < e) {
1548  int m = (b + e)/2;
1549  string m_si = m_Sis[m];
1550 
1551  if (m_si < si) {
1552  b = m + 1;
1553  } else if (m_si > si) {
1554  e = m;
1555  } else {
1556  return true;
1557  }
1558  }
1559 
1560  return false;
1561 }
1562 
1563 
1564 bool CSeqDBNegativeList::FindId(const CSeq_id & id, bool & match_type)
1565 {
1566  if (id.IsGi()) {
1567  match_type = (GetNumGis() > 0) ? true : false;
1568  if(match_type) {
1569  return(FindGi(id.GetGi()));
1570  }
1571  } else if (id.IsGeneral() && id.GetGeneral().GetDb() == "ti") {
1572  match_type = (GetNumTis() > 0) ? true : false;
1573 
1574  if(match_type) {
1575  const CObject_id & obj = id.GetGeneral().GetTag();
1576 
1577  Int8 ti = (obj.IsId()
1578  ? obj.GetId()
1579  : NStr::StringToInt8(obj.GetStr()));
1580 
1581  return FindTi(ti);
1582  }
1583  } else {
1584  match_type = (GetNumSis() > 0) ? true : false;
1585 
1586  if(match_type) {
1587  if(FindSi(GetBlastSeqIdString(id, true))) return true;
1588  if(FindSi(GetBlastSeqIdString(id, false))) return true;
1589 
1590  // For isam lookup
1591  Int8 num_id;
1592  string str_id;
1593  bool simpler;
1594 
1595  SeqDB_SimplifySeqid(*(const_cast<CSeq_id *>(&id)), 0, num_id, str_id, simpler);
1596 
1597  if (FindSi(str_id)) {
1598  return true;
1599  }
1600 
1601  // We may have to strip the version to find it...
1602  size_t pos = str_id.find(".");
1603  if (pos != str_id.npos) {
1604  string nover(str_id, 0, pos);
1605  return FindSi(nover);
1606  }
1607  }
1608  }
1609  return false;
1610 }
1611 
1613 {
1614  NON_CONST_ITERATE(vector<string>, itr, m_Sis) {
1615  string str_id = SeqDB_SimplifyAccession(*itr);
1616  *itr = NStr::ToLower(str_id);
1617  }
1618 }
1619 
1621 {
1622  static CFastMutex mtx;
1623  CFastMutexGuard mtx_gurad(mtx);
1624  if (m_LastSortSize != (m_Gis.size() + m_Tis.size() +m_Sis.size())) {
1625  std::sort(m_Gis.begin(), m_Gis.end());
1626  std::sort(m_Tis.begin(), m_Tis.end());
1627  std::sort(m_Sis.begin(), m_Sis.end());
1628 
1629  m_LastSortSize = m_Gis.size() + m_Tis.size() + m_Sis.size();
1630  }
1631 }
1632 
1634 {
1635  if (id.IsGi()) {
1636  return FindGi(id.GetGi());
1637  } else if (id.IsGeneral() && id.GetGeneral().GetDb() == "ti") {
1638  const CObject_id & obj = id.GetGeneral().GetTag();
1639 
1640  TTi ti = (obj.IsId()
1641  ? (TTi) obj.GetId()
1642  : (TTi) NStr::StringToInt8(obj.GetStr()));
1643 
1644  return FindTi(ti);
1645  } else {
1646  if(FindSi(GetBlastSeqIdString(id, true))) return true;
1647  if(FindSi(GetBlastSeqIdString(id, false))) return true;
1648 
1649  /// For isam lookup
1650  Int8 num_id;
1651  string str_id;
1652  bool simpler;
1653  SeqDB_SimplifySeqid(*(const_cast<CSeq_id *>(&id)), 0, num_id, str_id, simpler);
1654  if (FindSi(str_id)) return true;
1655 
1656  // We may have to strip the version to find it...
1657  size_t pos = str_id.find(".");
1658  if (pos != str_id.npos) {
1659  string nover(str_id, 0, pos);
1660  return FindSi(nover);
1661  }
1662  }
1663  return false;
1664 }
1665 
1666 
1667 CSeqDBFileGiList::CSeqDBFileGiList(const string & fname, EIdType idtype)
1668 {
1669  bool in_order = false;
1670  switch(idtype) {
1671  case eGiList:
1672  SeqDB_ReadGiList(fname, m_GisOids, & in_order);
1673  break;
1674  case eTiList:
1675  SeqDB_ReadTiList(fname, m_TisOids, & in_order);
1676  break;
1677  case eSiList:
1678  SeqDB_ReadSiList(fname, m_SisOids, & in_order, m_ListInfo);
1679  break;
1680  case eMixList:
1681  SeqDB_ReadMixList(fname, m_GisOids, m_TisOids, m_SisOids, & in_order);
1682  break;
1683  case ePigList:
1684  SeqDB_ReadPigList(fname, m_PigsOids, & in_order);
1685  break;
1686  case eTaxIdList:
1688  in_order = true;
1689  break;
1690  }
1691  m_CurrentOrder = in_order ? eGi : eNone;
1692 }
1693 /*
1694 CSeqDBFileGiList::CSeqDBFileGiList(vector<string> fnames, EIdType idtype)
1695 {
1696  bool in_order = false;
1697  switch(idtype) {
1698  case eGiList:
1699  case eTiList:
1700  NCBI_THROW(CSeqDBException,
1701  eArgErr,
1702  "Only multiple seqid list is supported.");
1703  case eSiList:
1704  ITERATE(vector<string>, iter, fnames) {
1705  SeqDB_ReadSiList(*iter, m_SisOids, & in_order);
1706  }
1707  break;
1708  case eMixList:
1709  ITERATE(vector<string>, iter, fnames) {
1710  SeqDB_ReadMixList(*iter, m_GisOids, m_TisOids, m_SisOids, & in_order);
1711  }
1712  break;
1713  }
1714  m_CurrentOrder = in_order ? eGi : eNone;
1715 }
1716 */
/// Join database names into one space-delimited string.
///
/// Each name is appended to `dbname`, separated from whatever is already
/// there by a single space.  Names that contain a space are wrapped in
/// double quotes.
///
/// @param dbs
///   The names to combine.
/// @param dbname
///   The string to append to; existing contents are preserved.
void SeqDB_CombineAndQuote(const vector<string> & dbs,
                           string & dbname)
{
    // Upper bound for the appended text: every name plus a separator
    // and a pair of quotes.
    int needed = 0;
    for (vector<string>::const_iterator name = dbs.begin(); name != dbs.end(); ++name) {
        needed += int(3 + name->size());
    }
    dbname.reserve(needed);

    for (vector<string>::const_iterator name = dbs.begin(); name != dbs.end(); ++name) {
        if (! dbname.empty()) {
            dbname.append(" ");
        }

        const bool needs_quotes = (name->find(" ") != string::npos);

        if (needs_quotes) {
            dbname.append("\"");
        }
        dbname.append(*name);
        if (needs_quotes) {
            dbname.append("\"");
        }
    }
}
1742 
1743 
1744 void SeqDB_SplitQuoted(const string & dbname,
1745  vector<CTempString> & dbs,
1746  bool keep_quote)
1747 {
1748  vector<CSeqDB_Substring> subs;
1749 
1750  SeqDB_SplitQuoted(dbname, subs, keep_quote);
1751 
1752  dbs.resize(0);
1753  dbs.reserve(subs.size());
1754 
1755  ITERATE(vector<CSeqDB_Substring>, iter, subs) {
1756  CTempString tmp(iter->GetBegin(), iter->Size());
1757  dbs.push_back(tmp);
1758  }
1759 }
1760 
1761 
1762 void SeqDB_SplitQuoted(const string & dbname,
1763  vector<CSeqDB_Substring> & dbs,
1764  bool keep_quote)
1765 {
1766  // split names
1767 
1768  const char * sp = dbname.data();
1769 
1770  bool quoted = false;
1771  unsigned begin = 0;
1772 
1773  for(unsigned i = 0; i < dbname.size(); i++) {
1774  char ch = dbname[i];
1775 
1776  if (quoted) {
1777  // Quoted mode sees '"' as the only actionable token.
1778  if (ch == '"') {
1779  if (begin < i) {
1780  if(keep_quote) i++;
1781  dbs.push_back(CSeqDB_Substring(sp + begin, sp + i));
1782  }
1783  begin = i + 1;
1784  quoted = false;
1785  }
1786  } else {
1787  // Non-quote mode: Space or quote starts the next string.
1788 
1789  if (ch == ' ') {
1790  if (begin < i) {
1791  dbs.push_back(CSeqDB_Substring(sp + begin, sp + i));
1792  }
1793  begin = i + 1;
1794  } else if (ch == '"') {
1795  if (begin < i) {
1796  dbs.push_back(CSeqDB_Substring(sp + begin, sp + i));
1797  }
1798  begin = keep_quote ? i : i + 1;
1799  quoted = true;
1800  }
1801  }
1802  }
1803 
1804  if (begin < dbname.size()) {
1805  dbs.push_back(CSeqDB_Substring(sp + begin, sp + dbname.size()));
1806  }
1807 }
1808 
1809 
1811 {
1812  _ASSERT(this != & gilist);
1813 
1815  sort(gis.begin(), gis.end());
1816 
1817  int list_i = 0;
1818  int list_n = gilist.GetNumGis();
1819  int gis_i = 0;
1820  int gis_n = (int) gis.size();
1821 
1822  while(list_i < list_n && gis_i < gis_n) {
1823  TGi L = gilist.GetGiOid(list_i).gi;
1824  TGi G = gis[gis_i];
1825 
1826  if (L < G) {
1827  list_i ++;
1828  continue;
1829  }
1830 
1831  if (L > G) {
1832  gis_i ++;
1833  continue;
1834  }
1835 
1836  m_GisOids.push_back(gilist.GetGiOid(list_i));
1837 
1838  list_i++;
1839  gis_i++;
1840  }
1841 
1842  m_CurrentOrder = m_GisOids.size() ? eGi : eNone;
1843 }
1844 
1845 
1847 {
1848  neg_gilist.InsureOrder();
1849  sort(gis.begin(), gis.end());
1850 
1851  int list_i = 0;
1852  int list_n = neg_gilist.GetNumGis();
1853  int gis_i = 0;
1854  int gis_n = (int) gis.size();
1855 
1856  while(list_i < list_n && gis_i < gis_n) {
1857  TGi L = neg_gilist.GetGi(list_i);
1858  TGi G = gis[gis_i];
1859 
1860  if (L < G) {
1861  list_i ++;
1862  continue;
1863  }
1864 
1865  if (L > G) {
1866  m_GisOids.push_back(gis[gis_i]);
1867  gis_i ++;
1868  continue;
1869  }
1870 
1871  list_i++;
1872 
1873  TGi last_gi = gis[gis_i];
1874  do { gis_i++; } while (gis_i < gis_n && gis[gis_i] == last_gi);
1875  }
1876 
1877  // push all the remaining vector gi's if any left
1878  while (gis_i < gis_n) {
1879  m_GisOids.push_back(gis[gis_i++]);
1880  }
1881 
1882  m_CurrentOrder = m_GisOids.size() ? eGi : eNone;
1883 }
1884 
1885 
1886 CSeqDBIdSet::CSeqDBIdSet(const vector<Int4> & ids, EIdType t, bool positive)
1887  : m_Positive(positive), m_IdType(t), m_Ids(new CSeqDBIdSet_Vector(ids))
1888 {
1890 }
1891 
1892 CSeqDBIdSet::CSeqDBIdSet(const vector<Int8> & ids, EIdType t, bool positive)
1893  : m_Positive(positive), m_IdType(t), m_Ids(new CSeqDBIdSet_Vector(ids))
1894 {
1896 }
1897 
1898 CSeqDBIdSet::CSeqDBIdSet(const vector<Uint8> & ids, EIdType t, bool positive)
1899  : m_Positive(positive), m_IdType(t), m_Ids(new CSeqDBIdSet_Vector(ids))
1900 {
1902 }
1903 
1904 #ifdef NCBI_STRICT_GI
1905 CSeqDBIdSet::CSeqDBIdSet(const vector<TGi> & ids, EIdType t, bool positive)
1906  : m_Positive(positive), m_IdType(t), m_Ids(new CSeqDBIdSet_Vector(ids))
1907 {
1909 }
1910 #endif
1911 
1912 CSeqDBIdSet::CSeqDBIdSet(const vector<string> & ids, EIdType t, bool positive)
1913  : m_Positive(positive), m_IdType(t), m_Ids(new CSeqDBIdSet_Vector(ids))
1914 {
1916 }
1917 
1918 void CSeqDBIdSet::x_SortAndUnique(vector<Int8> & ids)
1919 {
1920  sort(ids.begin(), ids.end());
1921  ids.erase(unique(ids.begin(), ids.end()), ids.end());
1922 }
1923 
1924 
1925 void CSeqDBIdSet::x_SortAndUnique(vector<string> & ids)
1926 {
1927  sort(ids.begin(), ids.end());
1928  ids.erase(unique(ids.begin(), ids.end()), ids.end());
1929 }
1930 
1932 {
1933  m_Positive = ! m_Positive;
1934 }
1935 
1938  bool A_pos,
1939  bool B_pos,
1940  bool & result_pos,
1941  bool & incl_A,
1942  bool & incl_B,
1943  bool & incl_AB)
1944 {
1945  typedef CSeqDBIdSet TIdList;
1946 
1947  incl_A = incl_B = incl_AB = false;
1948 
1949  // Each binary boolean function can be represented as a 4 bit
1950  // descriptor. The four bits indicate whether the result is true
1951  // when it appears, respectively, in neither list, only the second
1952  // list, only the first list, or in both lists. For example, the
1953  // operation (A AND B) can be represented as (0001), and (A OR !B)
1954  // can be written as (1011). In a positive ID list, 1 means that
1955  // an ID should be included in database iteration.
1956 
1957  // But 4-bit descriptors starting with a '1' correspond to logical
1958  // operations that include all IDs not appearing in either input
1959  // set. But of course we do not have access to the IDs that do
1960  // not appear, so we cannot (feasibly) compute such operations.
1961 
1962  // To solve this problem, De Morgan's Laws are used to transform
1963  // the operation into its inverse, the results of which can be
1964  // applied to SeqDB as a negative ID list.
1965 
1966  // For our purposes, these three transforms are needed:
1967  //
1968  // 1. (!X and !Y) becomes !(X or Y)
1969  // 2. (!X or Y) becomes !(X and !Y)
1970  // 3. (X or !Y) becomes !(!X and Y)
1971 
1972  result_pos = true;
1973 
1974  switch(op) {
1975  case eAnd:
1976  if ((! A_pos) && (! B_pos)) {
1977  op = TIdList::eOr;
1978  result_pos = false;
1979  A_pos = B_pos = true;
1980  }
1981  break;
1982 
1983  case eOr:
1984  if ((! A_pos) || (! B_pos)) {
1985  op = TIdList::eAnd;
1986  result_pos = false;
1987  A_pos = ! A_pos;
1988  B_pos = ! B_pos;
1989  }
1990  break;
1991 
1992  case eXor:
1993  result_pos = A_pos == B_pos;
1994  break;
1995 
1996  default:
1997  break;
1998  }
1999 
2000  // Once we have a legal operation, we construct these flags to
2001  // summarize the boolean operation. (Each of these corresponds to
2002  // one of the bits in the 4-bit descriptor.)
2003 
2004  switch(op) {
2005  case eAnd:
2006  _ASSERT(A_pos || B_pos);
2007  incl_A = !B_pos;
2008  incl_B = !A_pos;
2009  incl_AB = A_pos && B_pos;
2010  break;
2011 
2012  case eOr:
2013  _ASSERT(A_pos || B_pos);
2014  incl_A = incl_B = incl_AB = true;
2015  break;
2016 
2017  case eXor:
2018  incl_AB = (A_pos != B_pos);
2019  incl_A = incl_B = ! incl_AB;
2020  break;
2021 
2022  default:
2023  break;
2024  }
2025 }
2026 
2029  const vector<Int8> & A,
2030  bool A_pos,
2031  const vector<Int8> & B,
2032  bool B_pos,
2033  vector<Int8> & result,
2034  bool & result_pos)
2035 {
2036  bool incl_A(false),
2037  incl_B(false),
2038  incl_AB(false);
2039 
2041  A_pos,
2042  B_pos,
2043  result_pos,
2044  incl_A,
2045  incl_B,
2046  incl_AB);
2047 
2048  size_t A_i(0), B_i(0);
2049 
2050  while((A_i < A.size()) && (B_i < B.size())) {
2051  Int8 Ax(A[A_i]), Bx(B[B_i]), target(-1);
2052  bool included(false);
2053 
2054  if (Ax < Bx) {
2055  ++ A_i;
2056  target = Ax;
2057  included = incl_A;
2058  } else if (Ax > Bx) {
2059  ++ B_i;
2060  target = Bx;
2061  included = incl_B;
2062  } else {
2063  ++ A_i;
2064  ++ B_i;
2065  target = Ax;
2066  included = incl_AB;
2067  }
2068 
2069  if (included) {
2070  result.push_back(target);
2071  }
2072  }
2073 
2074  if (incl_A) {
2075  while(A_i < A.size()) {
2076  result.push_back(A[A_i++]);
2077  }
2078  }
2079 
2080  if (incl_B) {
2081  while(B_i < B.size()) {
2082  result.push_back(B[B_i++]);
2083  }
2084  }
2085 }
2086 
2087 void CSeqDBIdSet::Compute(EOperation op,
2088  const vector<Int4> & ids,
2089  bool positive)
2090 {
2092 
2094 
2095  x_SortAndUnique(B->Set());
2096 
2097  bool result_pos(true);
2098 
2100  m_Ids->Set(),
2101  m_Positive,
2102  B->Set(),
2103  positive,
2104  result->Set(),
2105  result_pos);
2106 
2107  m_Positive = result_pos;
2108  m_Ids = result;
2109 }
2110 
2112  const vector<Int8> & ids,
2113  bool positive)
2114 {
2116 
2118  x_SortAndUnique(B->Set());
2119 
2120  bool result_pos(true);
2121 
2123  m_Ids->Set(),
2124  m_Positive,
2125  B->Set(),
2126  positive,
2127  result->Set(),
2128  result_pos);
2129 
2130  m_Positive = result_pos;
2131  m_Ids = result;
2132 }
2133 
2135  const vector<Uint8> & ids,
2136  bool positive)
2137 {
2139 
2141  x_SortAndUnique(B->Set());
2142 
2143  bool result_pos(true);
2144 
2146  m_Ids->Set(),
2147  m_Positive,
2148  B->Set(),
2149  positive,
2150  result->Set(),
2151  result_pos);
2152 
2153  m_Positive = result_pos;
2154  m_Ids = result;
2155 }
2156 
2158 {
2159  if (m_IdType != ids.m_IdType ) {
2161  eArgErr,
2162  "Set operation requested but ID types don't match.");
2163  }
2164 
2166  bool result_pos(true);
2167 
2169  m_Ids->Set(),
2170  m_Positive,
2171  ids.m_Ids->Get(),
2172  ids.m_Positive,
2173  result->Set(),
2174  result_pos);
2175 
2176  m_Positive = result_pos;
2177  m_Ids = result;
2178 }
2179 
2181 {
2183 
2184  if (! m_Positive) {
2186  eFileErr,
2187  "Positive ID list requested but only negative exists.");
2188  }
2189 
2190  if (m_IdType == eTi) {
2191  ids->ReserveTis(m_Ids->Size());
2192 
2193  ITERATE(vector<Int8>, iter, m_Ids->Set()) {
2194  ids->AddTi(*iter);
2195  }
2196  } else {
2197  ids->ReserveGis(m_Ids->Size());
2198 
2199  ITERATE(vector<Int8>, iter, m_Ids->Set()) {
2200  _ASSERT(((*iter) >> 32) == 0);
2201  ids->AddGi(GI_FROM(Int8, *iter));
2202  }
2203  }
2204 
2205  return ids;
2206 }
2207 
2209 {
2210  if (m_Positive) {
2212  eFileErr,
2213  "Negative ID list requested but only positive exists.");
2214  }
2215 
2217 
2218  if (m_IdType == eTi) {
2219  ids->ReserveTis(m_Ids->Size());
2220 
2221  ITERATE(vector<Int8>, iter, m_Ids->Set()) {
2222  ids->AddTi(*iter);
2223  }
2224  } else if (m_IdType == eGi) {
2225  ids->ReserveGis(m_Ids->Size());
2226 
2227  ITERATE(vector<Int8>, iter, m_Ids->Set()) {
2228  _ASSERT(((*iter) >> 32) == 0);
2229  ids->AddGi(GI_FROM(Int8, *iter));
2230  }
2231  }
2232  else {
2233  ids->ReserveSis(m_Ids->Size());
2234 
2235  ITERATE(vector<string>, iter, m_Ids->SetSeqIDs()) {
2236  ids->AddSi(*iter);
2237  }
2238  }
2239 
2240  return ids;
2241 }
2242 
2244  : m_Positive (false),
2245  m_IdType (eGi),
2246  m_Ids (new CSeqDBIdSet_Vector)
2247 {
2248 }
2249 
2251 {
2252  return (! m_Positive) && (0 == m_Ids->Size());
2253 }
2254 
2255 void SeqDB_FileIntegrityAssert(const string & file,
2256  int line,
2257  const string & text)
2258 {
2259  string msg = "Validation failed: [" + text + "] at ";
2260  msg += file + ":" + NStr::IntToString(line);
2262 }
2263 
2265  const string * acc,
2266  Int8 & num_id,
2267  string & str_id,
2268  bool & simpler)
2269 {
2271 
2272  const CTextseq_id * tsip = 0;
2273 
2274  bool matched = true;
2275 
2276  switch(bestid.Which()) {
2277  case CSeq_id::e_Gi:
2278  simpler = true;
2279  num_id = GI_TO(Int8, bestid.GetGi());
2280  result = eGiId;
2281  break;
2282 
2283  case CSeq_id::e_Gibbsq: /* gibbseq */
2284  simpler = true;
2285  result = eStringId;
2286  str_id = NStr::UIntToString(bestid.GetGibbsq());
2287  break;
2288 
2289  case CSeq_id::e_General:
2290  {
2291  const CDbtag & dbt = bestid.GetGeneral();
2292 
2293  if (dbt.CanGetDb()) {
2294  if (dbt.GetDb() == "BL_ORD_ID") {
2295  simpler = true;
2296  num_id = dbt.GetTag().GetId();
2297  result = eOID;
2298  break;
2299  }
2300 
2301  if (dbt.GetDb() == "PIG") {
2302  simpler = true;
2303  num_id = dbt.GetTag().GetId();
2304  result = ePigId;
2305  break;
2306  }
2307 
2308  if (dbt.GetDb() == "ti") {
2309  simpler = true;
2310  num_id = (dbt.GetTag().IsStr()
2311  ? NStr::StringToInt8(dbt.GetTag().GetStr())
2312  : dbt.GetTag().GetId());
2313 
2314  result = eTiId;
2315  break;
2316  }
2317 
2318 
2319  if (NStr::CompareNocase(dbt.GetDb(), "GNOMON") == 0) {
2320  str_id = bestid.AsFastaString();
2321  str_id = NStr::ToLower(str_id);
2322  result = eStringId;
2323  break;
2324  }
2325  }
2326 
2327  if (dbt.CanGetTag() && dbt.GetTag().IsStr()) {
2328  result = eStringId;
2329  str_id = dbt.GetTag().GetStr();
2330  str_id = NStr::ToLower(str_id);
2331  } else {
2332  // Use the default logic.
2333  matched = false;
2334  }
2335  }
2336  break;
2337 
2338  case CSeq_id::e_Local: /* local */
2339  simpler = true;
2340  result = eStringId;
2341  {
2342  const CObject_id & objid = bestid.GetLocal();
2343 
2344  if (objid.IsStr()) {
2345  // sparse version will leave "lcl|" off.
2346  str_id = objid.GetStr();
2347  str_id = NStr::ToLower(str_id);
2348  } else {
2349  // Local numeric ids are stored as strings.
2350  str_id = "lcl|" + NStr::IntToString(objid.GetId());
2351  }
2352  }
2353  break;
2354 
2355  // tsip types
2356 
2357  case CSeq_id::e_Embl: /* embl */
2358  case CSeq_id::e_Ddbj: /* ddbj */
2359  case CSeq_id::e_Genbank: /* genbank */
2360  case CSeq_id::e_Tpg: /* Third Party Annot/Seq Genbank */
2361  case CSeq_id::e_Tpe: /* Third Party Annot/Seq EMBL */
2362  case CSeq_id::e_Tpd: /* Third Party Annot/Seq DDBJ */
2363  case CSeq_id::e_Other: /* other */
2364  case CSeq_id::e_Swissprot: /* swissprot (now with versions) */
2365  case CSeq_id::e_Gpipe: /* internal NCBI genome pipeline */
2366  tsip = bestid.GetTextseq_Id();
2367  break;
2368 
2369  case CSeq_id::e_Pir: /* pir */
2370  case CSeq_id::e_Prf: /* prf */
2371  tsip = bestid.GetTextseq_Id();
2372  break;
2373 
2374  default:
2375  matched = false;
2376  }
2377 
2378  // Default: if we have a string, use it; if we only have seqid,
2379  // create a string. This should not happen if the seqid matches
2380  // one of the cases above, which currently correspond to all the
2381  // supported seqid types.
2382 
2385 
2386  if (! matched) {
2387  // (should not happen normally)
2388 
2389  simpler = false;
2390  result = eStringId;
2391 
2392  if (acc) {
2393  str_id = *acc;
2394  str_id = NStr::ToLower(str_id);
2395  } else {
2396  bestid.GetLabel(& str_id, CSeq_id::eFasta, label_flags);
2397  str_id = NStr::ToLower(str_id);
2398  }
2399  }
2400 
2401  if (tsip) {
2402  bool found = false;
2403 
2404  if (tsip->CanGetAccession()) {
2405  str_id = tsip->GetAccession();
2406  str_id = NStr::ToLower(str_id);
2407  found = true;
2408 
2409  if (tsip->CanGetVersion()) {
2410  str_id += ".";
2411  str_id += NStr::UIntToString(tsip->GetVersion());
2412  }
2413  } else if (tsip->CanGetName()) {
2414  str_id = tsip->GetName();
2415  str_id = NStr::ToLower(str_id);
2416  found = true;
2417  }
2418 
2419  if (found) {
2420  simpler = true;
2421  result = eStringId;
2422  }
2423  }
2424 
2425  return result;
2426 }
2427 
2428 /// Find the end of a single element in a Seq-id set
2429 ///
2430 /// Seq-id strings sometimes contain several Seq-ids. This function
2431 /// looks for the end of the first Seq-id, and will return its length.
2432 /// Static methods of CSeq_id are used to evaluate tokens.
2433 ///
2434 /// @param str
2435 /// Seq-id string to search.
2436 /// @param pos
2437 /// Position at which to start search.
2438 /// @return
2439 /// End position of first fasta id, or string::npos in case of error.
2440 
2441 static size_t
2442 s_SeqDB_EndOfFastaID(const string & str, size_t pos)
2443 {
2444  // (Derived from s_EndOfFastaID()).
2445 
2446  size_t vbar = str.find('|', pos);
2447 
2448  if (vbar == string::npos) {
2449  return string::npos; // bad
2450  }
2451 
2452  string portion(str, pos, vbar - pos);
2453 
2454  CSeq_id::E_Choice choice =
2455  CSeq_id::WhichInverseSeqId(portion.c_str());
2456 
2457  if (choice != CSeq_id::e_not_set) {
2458  size_t vbar_prev = vbar;
2459  int count;
2460  for (count=0; ; ++count, vbar_prev = vbar) {
2461  vbar = str.find('|', vbar_prev + 1);
2462 
2463  if (vbar == string::npos) {
2464  break;
2465  }
2466 
2467  int start_pt = int(vbar_prev + 1);
2468  string element(str, start_pt, vbar - start_pt);
2469 
2470  choice = CSeq_id::WhichInverseSeqId(element.c_str());
2471 
2472  if (choice != CSeq_id::e_not_set) {
2473  vbar = vbar_prev;
2474  break;
2475  }
2476  }
2477  } else {
2478  return string::npos; // bad
2479  }
2480 
2481  return (vbar == string::npos) ? str.size() : vbar;
2482 }
2483 
2484 /// Parse string into a sequence of Seq-id objects.
2485 ///
2486 /// A string is broken down into Seq-ids and the set of Seq-ids is
2487 /// returned.
2488 ///
2489 /// @param line
2490 /// The string to interpret.
2491 /// @param seqids
2492 /// The returned set of Seq-id objects.
2493 /// @return
2494 /// true if any Seq-id objects were found.
2495 
2496 static bool
2497 s_SeqDB_ParseSeqIDs(const string & line,
2498  vector< CRef< CSeq_id > > & seqids)
2499 {
2500  // (Derived from s_ParseFastaDefline()).
2501 
2502  seqids.clear();
2503  size_t pos = 0;
2504 
2505  while (pos < line.size()) {
2506  size_t end = s_SeqDB_EndOfFastaID(line, pos);
2507 
2508  if (end == string::npos) {
2509  // We didn't get a clean parse -- ignore the data after
2510  // this point, and return what we have.
2511  break;
2512  }
2513 
2514  string element(line, pos, end - pos);
2515 
2516  CRef<CSeq_id> id;
2517 
2518  try {
2519  id = new CSeq_id(element);
2520  }
2521  catch(invalid_argument &) {
2522  // Maybe this should be done: "seqids.clear();"
2523  break;
2524  }
2525 
2526  seqids.push_back(id);
2527  pos = end + 1;
2528  }
2529 
2530  return ! seqids.empty();
2531 }
2532 
2533 
2534 
2536  Int8 & num_id,
2537  string & str_id,
2538  bool & simpler)
2539 {
2541  num_id = (Uint4)-1;
2542 
2543  vector< CRef< CSeq_id > > seqid_set;
2544 
2545  if (s_SeqDB_ParseSeqIDs(acc, seqid_set)) {
2546  // Something like SeqIdFindBest()
2547  CRef<CSeq_id> bestid =
2548  FindBestChoice(seqid_set, CSeq_id::BestRank);
2549 
2550  result = SeqDB_SimplifySeqid(*bestid, & acc, num_id, str_id, simpler);
2551  } else {
2552 
2553  // Check for bare pdb accession with underscore (such as 12AS_A). These
2554  // are not in the isam index and need to be translated to the
2555  // standard form (pdb|12AS|A).
2556  list< CRef<CSeq_id> > seqids;
2557  try {
2558  CSeq_id::ParseFastaIds(seqids, acc, false);
2559  }
2560  catch (...) {
2561  seqids.clear();
2562  }
2563 
2564  if (!seqids.empty() && seqids.front()->IsPdb() &&
2565  acc.find("_") != string::npos) {
2566 
2567  str_id = seqids.front()->AsFastaString();
2568  str_id = NStr::ToLower(str_id);
2569  }
2570  else if (!seqids.empty() && seqids.front()->IsLocal()) {
2571  // Chec for gnl dbs
2572  if( acc.find(":") != string::npos) {
2573  static const char* GNL_DBs[] = {"CDD", "SRA", "TSA", "GNOMON", NULL};
2574  string db_tag, gnl_id;
2575  NStr::SplitInTwo(acc, ":", db_tag, gnl_id);
2576  const char** p = GNL_DBs;
2577  for (; p && *p; ++p) {
2578  if(NStr::EqualNocase(*p, db_tag.c_str())) {
2579  str_id = "gnl|" + db_tag + "|" + gnl_id;
2580  seqids.front().Reset();
2581  CRef<CSeq_id> new_id(new CSeq_id(str_id));
2582  seqids.front() = new_id;
2583  break;
2584  }
2585  }
2586  if(*p == NULL) {
2587  str_id = acc;
2588  }
2589  }
2590  else {
2591  Int8 n_id = 0;
2592  if (NStr::StringToNumeric<Int8>(acc,&n_id, NStr::fConvErr_NoThrow)) {
2593  str_id = "lcl|" + acc;
2594  }
2595  else {
2596  str_id = acc;
2597  }
2598  }
2599  }
2600  else {
2601  str_id = acc;
2602  }
2603  result = eStringId;
2604  simpler = false;
2605  }
2606 
2607  return result;
2608 }
2609 
2610 const string SeqDB_SimplifyAccession(const string &acc)
2611 {
2612  Int8 num_id;
2613  string str_id;
2614  bool simpler(false);
2615  ESeqDBIdType result = SeqDB_SimplifyAccession(acc, num_id, str_id, simpler);
2616  if (result == eStringId) return str_id;
2617  else return "";
2618 }
2619 
2620 void SeqDB_GetFileExtensions(bool db_is_protein, vector<string>& extn, EBlastDbVersion dbver)
2621 {
2622  // NOTE: If more extensions are added, please keep in sync with
2623  // updatedb.pl's DistributeBlastDbsToBackends
2624  // and Blast.pm's @blastdb_extensions
2625  extn.clear();
2626 
2627  const string kExtnMol(1, db_is_protein ? 'p' : 'n');
2628 
2629  extn.push_back(kExtnMol + "al"); // alias file
2630  extn.push_back(kExtnMol + "in"); // index file
2631  extn.push_back(kExtnMol + "hr"); // header file
2632  extn.push_back(kExtnMol + "sq"); // sequence file
2633  extn.push_back(kExtnMol + "ni"); // ISAM numeric index file
2634  extn.push_back(kExtnMol + "nd"); // ISAM numeric data file
2635  if (dbver == eBDB_Version4) {
2636  extn.push_back(kExtnMol + "si"); // ISAM string index file
2637  extn.push_back(kExtnMol + "sd"); // ISAM string data file
2638  }
2639  extn.push_back(kExtnMol + "pi"); // ISAM PIG index file
2640  extn.push_back(kExtnMol + "pd"); // ISAM PIG data file
2641  if (dbver == eBDB_Version5) {
2642  vector<string> lmdbs;
2643  SeqDB_GetLMDBFileExtensions(db_is_protein, lmdbs);
2644  extn.insert(extn.end(), lmdbs.begin(), lmdbs.end());
2645  }
2646  // Contain masking information
2647  extn.push_back(kExtnMol + "aa"); // ISAM mask index file
2648  extn.push_back(kExtnMol + "ab"); // ISAM mask data file (big-endian)
2649  extn.push_back(kExtnMol + "ac"); // ISAM mask data file (little-endian)
2650  extn.push_back(kExtnMol + "og"); // OID to GI file
2651  extn.push_back(kExtnMol + "hi"); // ISAM sequence hash index file
2652  extn.push_back(kExtnMol + "hd"); // ISAM sequence hash data file
2653  extn.push_back(kExtnMol + "ti"); // ISAM trace id index file
2654  extn.push_back(kExtnMol + "td"); // ISAM trace id data file
2655 }
2656 
2657 
/// List the LMDB-related file extensions of a BLAST database.
///
/// Every extension is the molecule letter ('p' for protein, 'n'
/// otherwise) followed by one of a fixed set of suffixes.
///
/// @param db_is_protein
///   Selects the molecule letter.
/// @param extn
///   Receives the extensions; previous contents are discarded.
void SeqDB_GetLMDBFileExtensions(bool db_is_protein, vector<string>& extn)
{
    static const char * const kSuffixes[] = {
        "db", "os", "ot", "tf", "to", "db-lock", "tf-lock"
    };
    static const size_t kNumSuffixes = sizeof(kSuffixes) / sizeof(kSuffixes[0]);

    extn.clear();

    const string kExtnMol(1, db_is_protein ? 'p' : 'n');

    for (size_t k = 0; k < kNumSuffixes; ++k) {
        extn.push_back(kExtnMol + kSuffixes[k]);
    }
}
2668 
2669 
/// Get the extension of a BLAST database metadata file.
///
/// @param db_is_protein
///   Selects the molecule letter ('p' for protein, 'n' otherwise).
/// @param extn
///   Receives the molecule letter followed by "js".
void SeqDB_GetMetadataFileExtension(bool db_is_protein, string & extn)
{
    extn.assign(1, db_is_protein ? 'p' : 'n');
    extn += "js";
}
2676 
2677 bool IsStringId(const CSeq_id & id)
2678 {
2679  switch(id.Which()) {
2680  case CSeq_id::e_Gi:
2681  return false;
2682  break;
2683  case CSeq_id::e_General:
2684  {
2685  const CDbtag & dbt = id.GetGeneral();
2686  if (dbt.CanGetDb() && (dbt.GetDb() == "PIG")) {
2687  return false;
2688  }
2689  }
2690  default:
2691  return true;
2692  break;
2693  };
2694 }
2695 
2696 string GetBlastSeqIdString(const CSeq_id & seqid, bool version)
2697 {
2698  if(seqid.IsPir() || seqid.IsPrf()) {
2699  return seqid.AsFastaString();
2700  }
2701 
2702  return seqid.GetSeqIdString(version);
2703 }
2704 
2705 const string SeqDB_GetOidMaskFileExt(bool db_is_protein, EOidMaskType t)
2706 {
2707  switch (t) {
2709  return (db_is_protein ? "pxm":"nxm");
2710  break;
2711  default:
2712  NCBI_THROW(CSeqDBException, eArgErr, "Invalid oid mask type.");
2713  break;
2714  }
2715 }
2716 
2717 
2719 
@ eNone
None specified.
Definition: blast_def.h:326
#define false
Definition: bool.h:36
static int GetSeqidlist(CMemoryFile &file, vector< CSeqDBGiList::SSiOid > &idlist, SBlastSeqIdListInfo &list_info)
Get seqidlist from dbv5 seqidlist file.
Definition: Dbtag.hpp:53
CFastMutex –.
Definition: ncbimtx.hpp:667
CFile –.
Definition: ncbifile.hpp:1604
CIntersectionGiList(CSeqDBGiList &gilist, vector< TGi > &gis)
Construct an intersection of two lists of GIs.
CMemoryFile –.
Definition: ncbifile.hpp:2860
CSeqDBAtlas class.
Definition: seqdbatlas.hpp:298
bool DoesFileExist(const string &fname)
Check if file exists.
Definition: seqdbatlas.cpp:148
const string GetSearchPath() const
Get BlastDB search path.
Definition: seqdbatlas.hpp:505
static const string GenerateSearchPath()
Generate search path.
Definition: seqdbatlas.hpp:511
CSeqDBException.
Definition: seqdbcommon.hpp:73
@ eFileErr
Files were missing or contents were incorrect.
Definition: seqdbcommon.hpp:81
CSeqDBFileGiList(const string &fname, EIdType idtype=eGiList)
Build a GI list from a file.
CSeqDBGiList.
void AddTi(TTi ti)
Add a new TI to the list.
vector< SGiOid > m_GisOids
Pairs of GIs and OIDs.
int GetNumGis() const
Get the number of GIs in the array.
bool GiToOid(TGi gi, int &oid)
Try to find a GI and return the associated OID.
const SGiOid & GetGiOid(int index) const
Access an element of the array.
vector< SPigOid > m_PigsOids
bool FindSi(const string &si) const
int GetNumSis() const
Get the number of Seq-ids in the array.
bool SiToOid(const string &si, int &oid)
CSeqDBGiList()
Constructor.
void GetPigList(vector< TPig > &pigs) const
void GetGiList(vector< TGi > &gis) const
Get the gi list.
bool TiToOid(TTi ti, int &oid)
Try to find a TI and return the associated OID.
bool FindTi(TTi ti) const
Test for existence of a TI.
void GetTiList(vector< TTi > &tis) const
Get the ti list.
int GetNumTis() const
Get the number of TIs in the array.
vector< STiOid > m_TisOids
Pairs of TIs and OIDs.
int GetNumPigs() const
SBlastSeqIdListInfo m_ListInfo
void GetSiList(vector< string > &sis) const
TODO Get the seqid list?
ESortOrder
Possible sorting states.
@ eNone
The array is unsorted or the sortedness is unknown.
@ eGi
The array is sorted by GI.
void AddGi(TGi gi)
Add a new GI to the list.
void ReserveGis(size_t n)
Reserve space for GIs.
void ReserveTis(size_t n)
Reserve space for TIs.
STaxIdsOids m_TaxIdsOids
void PreprocessIdsForISAMSiLookup()
Preprocess ids for ISAM string id lookup.
bool FindGi(TGi gi) const
Test for existence of a GI.
void InsureOrder(ESortOrder order)
Sort if necessary to insure order of elements.
vector< SSiOid > m_SisOids
Pairs of Seq-ids and OIDs.
bool FindId(const CSeq_id &id)
Test for existence of a Seq-id by type.
ESortOrder m_CurrentOrder
Indicates the current sort order, if any, of this container.
Helper class to allow copy-on-write semantics for CSeqDBIdSet.
const vector< Int8 > & Get() const
Access the Int8 set.
size_t Size() const
Get the number of elements stored here.
vector< Int8 > & Set()
Access the Int8 set.
vector< string > & SetSeqIDs()
Access the string set.
SeqDB ID list for performing boolean set operations.
CRef< CSeqDBIdSet_Vector > m_Ids
Ids stored here.
static void x_SortAndUnique(vector< Int8 > &ids)
Sort and unique the internal set.
void x_BooleanSetOperation(EOperation op, const vector< Int8 > &A, bool A_pos, const vector< Int8 > &B, bool B_pos, vector< Int8 > &result, bool &result_pos)
Compute boolean operation on two vectors.
static void x_SummarizeBooleanOp(EOperation op, bool A_pos, bool B_pos, bool &result_pos, bool &incl_A, bool &incl_B, bool &incl_AB)
Compute inclusion flags for a boolean operation.
CSeqDBIdSet()
Construct a 'blank' CSeqDBIdSet object.
bool m_Positive
True if the current list is positive.
void Negate()
Invert the current list.
bool Blank() const
Check if an ID list is blank.
EIdType
Type of IDs stored here.
EOperation
Types of operations that may be performed on GI lists.
void Compute(EOperation op, const vector< int > &ids, bool positive=true)
Perform a logical operation on a list.
EIdType m_IdType
Id type.
CRef< CSeqDBNegativeList > GetNegativeList()
Retrieve a negative GI list.
CRef< CSeqDBGiList > GetPositiveList()
Retrieve a positive GI list.
CSeqDBNegativeList.
void AddSi(const string &si)
Add a new SeqId to the list.
void AddGi(TGi gi)
Add a new GI to the list.
void ReserveGis(size_t n)
Reserve space for GIs.
void AddTi(TTi ti)
Add a new TI to the list.
int GetNumTis() const
Get the number of TIs in the array.
bool FindId(const CSeq_id &id, bool &match_type)
Test for existence of a TI or GI here and report whether the ID was one of those types.
void ReserveTis(size_t n)
Reserve space for TIs.
vector< TTi > m_Tis
TIs to exclude from the SeqDB instance.
TGi GetGi(int index) const
Access an element of the GI array.
bool FindSi(string si)
vector< string > m_Sis
SeqIds to exclude from the SeqDB instance.
bool FindTi(TTi ti)
Test for existence of a TI.
void PreprocessIdsForISAMSiLookup()
int GetNumGis() const
Get the number of GIs in the array.
bool FindGi(TGi gi)
Test for existence of a GI.
size_t m_LastSortSize
Zero if unsorted, or the size it had after the last sort.
vector< TGi > m_Gis
GIs to exclude from the SeqDB instance.
int GetNumSis() const
Get the number of SeqIds in the array.
void ReserveSis(size_t n)
void InsureOrder()
Sort list if not already sorted.
Check file existence using CSeqDBAtlas.
CSeqDBAtlas & m_Atlas
CSeqDB_AtlasAccessor(CSeqDBAtlas &atlas)
Constructor.
virtual bool DoesFileExist(const string &fname)
Test file existence.
File existence test interface.
virtual ~CSeqDB_FileExistence()
Destructor.
virtual bool DoesFileExist(const string &fname)=0
Check if file exists at fully qualified path.
CSeqDB_Path.
CSeqDB_Substring FindBaseName() const
Returns the portion of this path containing the base name.
Check file existence using CFile.
virtual bool DoesFileExist(const string &fname)
Test file existence.
CSeqDB_SimpleAccessor()
Constructor.
Compare SGiOid structs by GI.
int operator()(const CSeqDBGiList::SGiOid &lhs, const CSeqDBGiList::SGiOid &rhs)
Test whether lhs is less than (occurs before) rhs.
Compare SGiOid structs by OID.
int operator()(const CSeqDBGiList::SGiOid &lhs, const CSeqDBGiList::SGiOid &rhs)
Test whether lhs is less than (occurs before) rhs.
int operator()(const CSeqDBGiList::SPigOid &lhs, const CSeqDBGiList::SPigOid &rhs)
Test whether lhs is less than (occurs before) rhs.
Compare SSeqIdOid structs by SeqId.
int operator()(const CSeqDBGiList::SSiOid &lhs, const CSeqDBGiList::SSiOid &rhs)
Test whether lhs is less than (occurs before) rhs.
Compare STiOid structs by TI.
int operator()(const CSeqDBGiList::STiOid &lhs, const CSeqDBGiList::STiOid &rhs)
Test whether lhs is less than (occurs before) rhs.
String slicing.
void GetString(string &s) const
Return the data by assigning it to a string.
int Size() const
Return the length of the string in bytes.
void Clear()
Reset the string to an empty state.
const char * GetEnd() const
Returns a pointer to the end of the string, which is always a pointer to the character past the last ...
void EraseFront(int n)
Disinclude data from the beginning of the string.
void Resize(int n)
Change the length of the string.
const char * GetBegin() const
Returns a pointer to the start of the string.
bool Empty() const
Returns true iff the string is empty.
int FindLastOf(char ch) const
Find last instance of a character in the substring.
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
void clear()
Definition: set.hpp:153
static tds_mutex mtx
Definition: condition.c:43
#define head
Definition: ct_nlmzip_i.h:138
static const char si[8][64]
Definition: des.c:146
#define G(x, y, z)
Definition: md4.c:179
#define A(i)
Definition: ecp_curves.c:948
#define GI_FROM(T, value)
Definition: ncbimisc.hpp:1086
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define TAX_ID_FROM(T, value)
Definition: ncbimisc.hpp:1111
#define ZERO_GI
Definition: ncbimisc.hpp:1088
#define GI_TO(T, gi)
Definition: ncbimisc.hpp:1085
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
size_t GetSize(void) const
Get length of the mapped region.
Definition: ncbifile.hpp:4287
void * GetPtr(void) const
Get pointer to beginning of data.
Definition: ncbifile.hpp:4281
static char GetPathSeparator(void)
Get path separator symbol specific for the current platform.
Definition: ncbifile.cpp:433
ELabelFlags
Definition: Seq_id.hpp:582
const string AsFastaString(void) const
Definition: Seq_id.cpp:2265
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
Definition: Seq_id.cpp:2144
void GetLabel(string *label, ELabelType type=eDefault, TLabelFlags flags=fLabel_Default) const
Append a label for this Seq-id to the supplied string.
Definition: Seq_id.cpp:2039
static SIZE_TYPE ParseFastaIds(CBioseq::TId &ids, const CTempString &s, bool allow_partial_failure=false)
Parse an entire set of |-delimited FASTA-style IDs, appending the results to IDS.
Definition: Seq_id.cpp:2602
static E_Choice WhichInverseSeqId(const CTempString &SeqIdCode)
Converts a string to a choice, no need to require a member.
Definition: Seq_id.cpp:599
static int BestRank(const CRef< CSeq_id > &id)
Definition: Seq_id.hpp:742
const CTextseq_id * GetTextseq_Id(void) const
Return embedded CTextseq_id, if any.
Definition: Seq_id.cpp:169
@ fLabel_Version
Show the version.
Definition: Seq_id.hpp:583
@ fLabel_GeneralDbIsContent
For type general, use the database name as the tag and the (text or numeric) key as the content.
Definition: Seq_id.hpp:586
@ eFasta
Tagged ID in NCBI's traditional FASTA style.
Definition: Seq_id.hpp:575
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static Int8 StringToInt8(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to Int8.
Definition: ncbistr.cpp:793
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3457
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5083
static string UIntToString(unsigned int value, TNumToStringFlags flags=0, int base=10)
Convert UInt to string.
Definition: ncbistr.hpp:5108
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
Definition: ncbistr.cpp:3550
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5352
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
Definition: ncbistr.cpp:3182
static string & ToLower(string &str)
Convert string to lower case – string& version.
Definition: ncbistr.cpp:405
@ fConvErr_NoThrow
Do not throw an exception on error.
Definition: ncbistr.hpp:285
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2508
@ eTrunc_Both
Truncate spaces at both begin and end of string.
Definition: ncbistr.hpp:2242
C::value_type FindBestChoice(const C &container, F score_func)
Find the best choice (lowest score) for values in a container.
Definition: ncbiutil.hpp:250
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
const TTag & GetTag(void) const
Get the Tag member data.
Definition: Dbtag_.hpp:267
bool IsId(void) const
Check if variant Id is selected.
Definition: Object_id_.hpp:264
bool CanGetDb(void) const
Check if it is safe to call GetDb method.
Definition: Dbtag_.hpp:214
bool CanGetTag(void) const
Check if it is safe to call GetTag method.
Definition: Dbtag_.hpp:261
const TDb & GetDb(void) const
Get the Db member data.
Definition: Dbtag_.hpp:220
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
TId GetId(void) const
Get the variant data.
Definition: Object_id_.hpp:270
TGibbsq GetGibbsq(void) const
Get the variant data.
Definition: Seq_id_.hpp:787
const TName & GetName(void) const
Get the Name member data.
bool CanGetName(void) const
Check if it is safe to call GetName method.
bool IsPrf(void) const
Check if variant Prf is selected.
Definition: Seq_id_.hpp:916
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_id_.hpp:746
TGi GetGi(void) const
Get the variant data.
Definition: Seq_id_.hpp:889
TVersion GetVersion(void) const
Get the Version member data.
bool CanGetVersion(void) const
Check if it is safe to call GetVersion method.
E_Choice
Choice variants.
Definition: Seq_id_.hpp:93
const TLocal & GetLocal(void) const
Get the variant data.
Definition: Seq_id_.cpp:193
bool CanGetAccession(void) const
Check if it is safe to call GetAccession method.
const TGeneral & GetGeneral(void) const
Get the variant data.
Definition: Seq_id_.cpp:369
bool IsPir(void) const
Check if variant Pir is selected.
Definition: Seq_id_.hpp:853
const TAccession & GetAccession(void) const
Get the Accession member data.
@ e_Other
for historical reasons, 'other' = 'refseq'
Definition: Seq_id_.hpp:104
@ e_Gpipe
Internal NCBI genome pipeline processing ID.
Definition: Seq_id_.hpp:113
@ e_Tpe
Third Party Annot/Seq EMBL.
Definition: Seq_id_.hpp:111
@ e_Tpd
Third Party Annot/Seq DDBJ.
Definition: Seq_id_.hpp:112
@ e_Gibbsq
Geninfo backbone seqid.
Definition: Seq_id_.hpp:96
@ e_General
for other databases
Definition: Seq_id_.hpp:105
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
@ e_Gi
GenInfo Integrated Database.
Definition: Seq_id_.hpp:106
@ e_Prf
PRF SEQDB.
Definition: Seq_id_.hpp:108
@ e_not_set
No variant selected.
Definition: Seq_id_.hpp:94
@ e_Tpg
Third Party Annot/Seq Genbank.
Definition: Seq_id_.hpp:110
@ e_Local
local use
Definition: Seq_id_.hpp:95
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:6929
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
where both are integers</td > n< td ></td > n</tr > n< tr > n< td > tse</td > n< td > optional</td > n< td > String</td > n< td class=\"description\"> TSE option controls what blob is whole
FILE * file
int i
static void text(MDB_val *v)
Definition: mdb_dump.c:62
static int version
Definition: mdb_load.c:29
CMetaRegistry: Singleton class for loading CRegistry data from files; keeps track of what it loaded f...
constexpr auto sort(_Init &&init)
constexpr auto front(list< Head, As... >, T=T()) noexcept -> Head
const struct ncbi::grid::netcache::search::fields::SIZE size
unsigned int a
Definition: ncbi_localip.c:102
EIPRangeType t
Definition: ncbi_localip.c:101
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
Defines unified interface to application:
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
T positive(T x_)
static char tmp[2048]
Definition: utf8.c:42
static pcre_uint8 * buffer
Definition: pcretest.c:1051
The SeqDB memory management layer.
bool SeqDB_IsBinaryTiList(const string &fname)
Returns true if the file name passed contains a binary TI list.
void SeqDB_ReadPigList(const string &fname, vector< CSeqDBGiList::SPigOid > &pigs, bool *in_order)
void SeqDB_ReadMemoryPigList(const char *fbeginp, const char *fendp, vector< CSeqDBGiList::SPigOid > &pigs, bool *in_order)
static bool s_SeqDB_DBExists(const string &dbname, char dbtype, CSeqDB_FileExistence &access, bool linkoutdb_search)
Test whether an index or alias file exists.
void SeqDB_ReadGiList(const string &fname, vector< CSeqDBGiList::SGiOid > &gis, bool *in_order)
Read a text or binary GI list from a file.
void SeqDB_GetLMDBFileExtensions(bool db_is_protein, vector< string > &extn)
Retrieves file extensions for BLAST LMDB files.
void SeqDB_ReadMemorySiList(const char *fbeginp, const char *fendp, vector< CSeqDBGiList::SSiOid > &sis, bool *in_order)
Read a text SeqID list from an area of memory.
void SeqDB_ReadBinaryGiList(const string &fname, vector< TGi > &gis)
Read a binary-format GI list from a file.
void SeqDB_ReadMemoryGiList(const char *fbeginp, const char *fendp, vector< CSeqDBGiList::SGiOid > &gis, bool *in_order)
Read a text or binary GI list from an area of memory.
CSeqDB_Substring SeqDB_RemoveExtn(CSeqDB_Substring s)
Returns a filename minus greedy path.
Definition: seqdbcommon.cpp:76
bool SeqDB_CompareVolume(const string &s1, const string &s2)
Compares two volume file names and determine the volume order.
ESeqDBIdType SeqDB_SimplifySeqid(CSeq_id &bestid, const string *acc, Int8 &num_id, string &str_id, bool &simpler)
Seq-id simplification.
void SeqDB_ReadMemoryMixList(const char *fbeginp, const char *fendp, vector< CSeqDBGiList::SGiOid > &gis, vector< CSeqDBGiList::STiOid > &tis, vector< CSeqDBGiList::SSiOid > &sis, bool *in_order)
Read an ID list (mixed type) from an area of memory.
string SeqDB_FindBlastDBPath(const string &dbname, char dbtype, string *sp, bool exact, CSeqDBAtlas &atlas)
Finds a file in the search path.
string GetBlastSeqIdString(const CSeq_id &seqid, bool version)
Return ID string as stored in lmdb.
void SeqDB_ReadMixList(const string &fname, vector< CSeqDBGiList::SGiOid > &gis, vector< CSeqDBGiList::STiOid > &tis, vector< CSeqDBGiList::SSiOid > &sis, bool *in_order)
Read a text SeqId list from a file.
static string s_SeqDB_FindBlastDBPath(const string &dbname, char dbtype, string *sp, bool exact, CSeqDB_FileExistence &access, const string path="")
void SeqDB_SplitQuoted(const string &dbname, vector< CTempString > &dbs, bool keep_quote)
Split a (possibly) quoted list of database names into pieces.
int s_ReadDigit(const char d, const string &list_type)
static string s_SeqDB_TryPaths(const string &blast_paths, const string &dbname, char dbtype, bool exact, CSeqDB_FileExistence &access, bool linkoutdb_search=false)
Search for a file in a provided set of paths.
CSeqDB_Substring SeqDB_RemoveFileName(CSeqDB_Substring s)
Returns a path minus filename.
Definition: seqdbcommon.cpp:62
static string s_GetPathSplitter()
Returns the character used to separate path components in the current operating system or platform.
void SeqDB_GetFileExtensions(bool db_is_protein, vector< string > &extn, EBlastDbVersion dbver)
Retrieves a list of all supported file extensions for BLAST databases.
bool IsStringId(const CSeq_id &id)
Determine whether the id is a string id.
void SeqDB_JoinDelim(string &a, const string &b, const string &delim)
Join two strings with a delimiter.
ESeqDBIdType SeqDB_SimplifyAccession(const string &acc, Int8 &num_id, string &str_id, bool &simpler)
String id simplification.
static bool s_SeqDB_ParseSeqIDs(const string &line, vector< CRef< CSeq_id > > &seqids)
Parse string into a sequence of Seq-id objects.
bool SeqDB_IsBinaryGiList(const string &fname)
Read a text or binary SeqId list from a file.
void s_InsureOrder(TVector &v)
void SeqDB_FileIntegrityAssert(const string &file, int line, const string &text)
Report file corruption by throwing an eFile CSeqDBException.
CSeqDB_Substring SeqDB_RemoveDirName(CSeqDB_Substring s)
Returns a filename minus greedy path.
Definition: seqdbcommon.cpp:50
void SeqDB_ReadTaxIdList(const string &fname, CSeqDBGiList::STaxIdsOids &taxids)
void SeqDB_ReadSiList(const string &fname, vector< CSeqDBGiList::SSiOid > &sis, bool *in_order, SBlastSeqIdListInfo &db_info)
Read a text SeqId list from a file.
const string SeqDB_GetOidMaskFileExt(bool db_is_protein, EOidMaskType t)
string SeqDB_ResolveDbPathForLinkoutDB(const string &filename)
Resolve a file path using SeqDB's path algorithms.
void SeqDB_ReadMemoryTaxIdList(const char *fbeginp, const char *fendp, CSeqDBGiList::STaxIdsOids &taxids)
void SeqDB_ReadMemoryTiList(const char *fbeginp, const char *fendp, vector< CSeqDBGiList::STiOid > &tis, bool *in_order)
Read a text or binary TI list from an area of memory.
void SeqDB_CombineAndQuote(const vector< string > &dbs, string &dbname)
Combine and quote list of database names.
string SeqDB_MakeOSPath(const string &dbs)
Return path with delimiters changed to platform preferred kind.
void SeqDB_ReadTiList(const string &fname, vector< CSeqDBGiList::STiOid > &tis, bool *in_order)
Read a text or binary TI list from a file.
static bool s_SeqDB_IsBinaryNumericList(const char *fbeginp, const char *fendp, bool &has_long_ids, bool *has_tis=NULL)
This function determines whether a file is a valid binary GI/TI file.
string SeqDB_ResolveDbPath(const string &filename)
Resolve a file path using SeqDB's path algorithms.
string SeqDB_ResolveDbPathNoExtension(const string &filename, char dbtype)
Resolve a file path using SeqDB's path algorithms.
void SeqDB_GetMetadataFileExtension(bool db_is_protein, string &extn)
bool SeqDB_SplitString(CSeqDB_Substring &buffer, CSeqDB_Substring &front, char delim)
Parse a prefix from a substring.
static bool s_ContainsBinaryNumericIdList(const string &fname, CSeqDBFileGiList::EIdType type)
static size_t s_SeqDB_EndOfFastaID(const string &str, size_t pos)
Find the end of a single element in a Seq-id set.
void SeqDB_CombinePath(const CSeqDB_Substring &one, const CSeqDB_Substring &two, const CSeqDB_Substring *extn, string &outp)
Combine a filesystem path and file name.
const string kSeqDBGroupAliasFileName("index.alx")
void SeqDB_ConvertOSPath(string &dbs)
Change path delimiters to platform preferred kind in-place.
Defines exception class and several constants for SeqDB.
Uint4 TPig
Uint8 TTi
EBlastDbVersion
BLAST database version.
Definition: seqdbcommon.hpp:51
@ eBDB_Version4
Definition: seqdbcommon.hpp:52
@ eBDB_Version5
Definition: seqdbcommon.hpp:53
ESeqDBIdType
Various identifier formats used in Id lookup.
@ eStringId
Each PIG identifier refers to exactly one protein sequence.
@ eTiId
Genomic ID is a relatively stable numeric identifier for sequences.
@ ePigId
Trace ID is a numeric identifier for Trace sequences.
@ eGiId
@ eOID
Lookup from sequence hash values to OIDs.
EOidMaskType
@ fExcludeModel
This file defines several SeqDB utility functions related to byte order and file system portability.
void SeqDB_ThrowException(CSeqDBException::EErrCode code, const string &msg)
Throw a SeqDB exception; this is separated into a function primarily to allow a breakpoint to be set.
Definition: seqdbatlas.cpp:70
T SeqDB_GetStdOrd(const T *stdord_obj)
Read a network order integer value.
void s_SeqDB_QuickAssign(string &dst, const char *bp, const char *ep)
Higher Performance String Assignment.
static const char * str(char *buf, int n)
Definition: stats.c:84
Structure that holds GI,OID pairs.
TGi gi
The GI or 0 if unknown.
int oid
The OID or -1 if unknown.
TPig pig
The PIG or 0 if unknown.
Structure that holds Seq-id,OID pairs.
string si
The String-id or "" if unknown.
vector< blastdb::TOid > oids
Structure that holds TI,OID pairs.
TTi ti
The TI or 0 if unknown.
Blast DB v5 seqid list info.
Definition: type.c:6
#define _ASSERT
else result
Definition: token2.c:20
@ eGi
GI Index.
vector< CRef< CSeq_id > > TIdList
Modified on Sat Dec 02 09:19:52 2023 by modify_doxy.py rev. 669887