NCBI C++ ToolKit
seqdbalias.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seqdbalias.cpp 97226 2022-06-28 12:33:29Z fongah2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Kevin Bealer
27  *
28  */
29 
30 /// @file seqdbalias.cpp
31 /// Code which manages a hierarchical tree of alias file data.
32 ///
33 /// Defines classes:
34 /// CSeqDB_TitleWalker
35 /// CSeqDB_MaxLengthWalker
36 /// CSeqDB_MinLengthWalker
37 /// CSeqDB_NSeqsWalker
38 /// CSeqDB_NOIDsWalker
39 /// CSeqDB_TotalLengthWalker
40 /// CSeqDB_VolumeLengthWalker
41 /// CSeqDB_MembBitWalker
42 ///
43 /// Implemented for: UNIX, MS-Windows
44 #include <ncbi_pch.hpp>
45 #include <corelib/ncbistr.hpp>
46 #include <corelib/ncbifile.hpp>
47 #include <algorithm>
48 #include <sstream>
49 
50 #include "seqdbalias.hpp"
52 
54 
56  const string & name_list,
57  char prot_nucl,
58  bool expand_links)
59  : m_AliasSets (atlas),
60  m_IsProtein (prot_nucl == 'p'),
61  m_MinLength (-1),
62  m_NumSeqs (-1),
63  m_NumSeqsStats (-1),
64  m_NumOIDs (-1),
65  m_TotalLength (-1),
66  m_TotalLengthStats (-1),
67  m_VolumeLength (-1),
68  m_MembBit (-1),
69  m_HasTitle (false),
70  m_NeedTotalsScan (-1),
71  m_HasFilters (0),
72  m_OidMaskType (0)
73 {
74  if (name_list.size() && prot_nucl != '-') {
75  m_Node.Reset(new CSeqDBAliasNode(atlas,
76  name_list,
77  prot_nucl,
79  expand_links));
80 
82  }
83 }
84 
85 void CSeqDBAliasNode::x_Tokenize(const string & dbnames)
86 {
87  vector<CSeqDB_Substring> dbs;
88  SeqDB_SplitQuoted(dbnames, dbs);
89 
90  m_DBList.resize(dbs.size());
91  m_SkipLocal.resize(dbs.size(),false);
92 
93  for(size_t i = 0; i<dbs.size(); i++) {
94  m_DBList[i].Assign(dbs[i]);
95  m_DBList[i].FixDelimiters();
96  }
97 }
98 
100  const string & dbname_list,
101  char prot_nucl,
102  CSeqDBAliasSets & alias_sets,
103  bool expand_links)
104  : m_Atlas (atlas),
105  m_DBPath ("."),
106  m_ThisName ("-"),
107  m_HasGiMask(true),
108  m_AliasSets(alias_sets),
109  m_ExpandLinks(expand_links)
110 {
111  CSeqDBLockHold locked(atlas);
112 
113 
114  m_Values["DBLIST"] = dbname_list;
115 
116  x_Tokenize(dbname_list);
117 
118  // Skip gi mask if more than one DBs are specified.
119  if (m_DBList.size() != 1) {
120  m_HasGiMask = false;
121  }
122 
123  x_ResolveNames(prot_nucl, locked);
124 
125  CSeqDBAliasStack recurse;
126 
127  x_ExpandAliases(CSeqDB_BasePath("-"), prot_nucl, recurse, locked);
128 
129 
130 
131  _ASSERT(recurse.Size() == 0);
132 
133  // When we get here, the subnodes tree has been built
134  if (m_HasGiMask) {
135  if (m_SubNodes.size() != 1 ||
136  m_SubNodes[0]->m_Values.find("MASKLIST")
137  == m_SubNodes[0]->m_Values.end()) {
138  m_HasGiMask = false;
139  }
140  }
141 }
142 
143 
144 // Private Constructor
145 //
146 // This is the constructor for nodes other than the top-level node.
147 // As such it is private and only called from this class.
148 //
149 // This constructor constructs subnodes by calling x_ExpandAliases,
150 // which calls this constructor again with the subnode's arguments.
151 // But no node should be its own ancestor. To prevent this kind of
152 // recursive loop, each file adds its full path to a stack of strings
153 // and will not create a subnode for any path already in that set.
154 
156  const CSeqDB_DirName & dbpath,
157  const CSeqDB_BaseName & dbname,
158  char prot_nucl,
159  CSeqDBAliasStack & recurse,
160  CSeqDBLockHold & locked,
161  CSeqDBAliasSets & alias_sets,
162  bool expand_links)
163  : m_Atlas (atlas),
164  m_DBPath (dbpath),
165  m_ThisName (m_DBPath, dbname, prot_nucl, 'a', 'l'),
166  m_AliasSets (alias_sets),
167  m_ExpandLinks (expand_links)
168 {
169  recurse.Push(m_ThisName);
170 
171  x_ReadValues(m_ThisName, locked);
172  x_Tokenize(m_Values["DBLIST"]);
173 
175 
176  x_ExpandAliases(basepath, prot_nucl, recurse, locked);
177 
178  recurse.Pop();
179 }
180 
181 bool
183  char dbtype,
184  bool exact,
185  string & resolved)
186 {
188 
189  if (i == m_PathLookup.end()) {
190  resolved = SeqDB_FindBlastDBPath(dbname,
191  dbtype,
192  0,
193  exact,
194  m_Atlas);
195 
196  m_PathLookup[dbname] = resolved;
197  } else {
198  resolved = (*i).second;
199  }
200 
201  return ! resolved.empty();
202 }
203 
204 
205 bool
207  CSeqDB_Path * resolved,
208  CSeqDBLockHold & locked)
209 {
210  CSeqDB_Path aset_path;
211  CSeqDB_FileName alias_fname;
212 
213  x_DbToIndexName(dbpath, aset_path, alias_fname);
214 
215  CSeqDB_Path resolved_aset;
216 
217  if (! FindBlastDBPath(aset_path, resolved_aset)) {
218  return false;
219  }
220 
221  CSeqDB_Path afpath(resolved_aset.FindDirName(),
222  alias_fname.GetFileNameSub());
223 
224  // This is not ideal. If the alias file is found, but does not
225  // contain the alias in question, we punt, allowing normal alias
226  // file reading to take over. The correct technique would be to
227  // try the next location in the database search path.
228  //
229  // Solving this correctly means cracking FindBlastDBPath() into
230  // three pieces, one that builds a list of paths, one that tries a
231  // specified path, and a third that calls the first, then iterates
232  // over the list, calling the second.
233  //
234  // This can be done later - punting could be inefficient in some
235  // cases but should work correctly.
236 
237  if (! ReadAliasFile(afpath, 0, 0, locked)) {
238  return false;
239  }
240 
241  if (resolved) {
242  *resolved = afpath;
243  }
244 
245  return true;
246 }
247 
248 
/// Resolve every name in m_DBList and factor out a shared directory.
///
/// Each entry of m_DBList is looked up first through
/// m_AliasSets.FindAliasPath() and, failing that, through
/// m_AliasSets.FindBlastDBPath().  An entry that cannot be resolved
/// leads to the error-reporting code below; a resolved entry replaces
/// the original name in m_DBList.  Afterwards the longest leading
/// run of whole path components shared by all entries is moved out
/// of the entries and into m_DBPath.
///
/// NOTE(review): this listing has lost some source lines inside the
/// function (original lines 292, 318 and 328-329); see the notes at
/// those places.
///
/// @param prot_nucl Sequence type; 'p' and 'n' are the values the
///        error-reporting switch below recognizes. [in]
/// @param locked Lock holder passed through to FindAliasPath(). [in]
249 void CSeqDBAliasNode::x_ResolveNames(char prot_nucl, CSeqDBLockHold & locked)
250 {
251  m_DBPath = CSeqDB_DirName(".");
252
253  size_t i = 0;
254
255  for(i = 0; i < m_DBList.size(); i++) {
256  // skip local DB search only if absolute path is given
// NOTE(review): the test below fires for any name containing a
// path separator, not only for absolute paths - confirm intent.
257  if(m_DBList[i].GetBasePathS().find(CDirEntry::GetPathSeparator()) != string::npos) {
258  m_SkipLocal[i] = true;
259  }
260
261  const CSeqDB_Path db_path( (CSeqDB_BasePath(m_DBList[i])), prot_nucl, 'a', 'l' );
262
263  CSeqDB_Path resolved_path;
264
265  // search for X/kSeqDBGroupAliasFileName
266  if (! m_AliasSets.FindAliasPath(db_path, & resolved_path, locked)) {
267  CSeqDB_BasePath base(db_path.FindBasePath());
268  CSeqDB_BasePath resolved_bp;
269
270  // search for X/base.nal/nin
271  if (m_AliasSets.FindBlastDBPath(base, prot_nucl, resolved_bp)) {
272  resolved_path = CSeqDB_Path(resolved_bp, prot_nucl, 'a', 'l');
273  }
274  }
275
// Neither lookup produced a path: build a descriptive error.
276  if (! resolved_path.Valid()) {
277  string p_or_n;
278
279  switch(prot_nucl) {
280  case 'p':
281  p_or_n = "protein";
282  break;
283
284  case 'n':
285  p_or_n = "nucleotide";
286  break;
287
288  default:
289  string msg("SeqDB: Internal error: bad sequence type for database [");
290  msg += m_DBList[i].GetBasePathS() + "]";
291
// NOTE(review): original line 292, which opens this call, is absent
// from the listing; presumably NCBI_THROW(CSeqDBException, ... as
// used elsewhere in this file - confirm against the repository.
293  eFileErr,
294  msg);
295  }
296
297  // Do over (to get the search path). This doesn't use the
298  // resolution map, since speed is not of the essence.
299
300  string search_path;
301  string input_path;
302
303  db_path.FindBasePath().GetString(input_path);
304
// The return value is ignored here; the call is made only to fill
// search_path for the message.
305  SeqDB_FindBlastDBPath(input_path,
306  prot_nucl,
307  & search_path,
308  false,
309  m_Atlas);
310
311  ostringstream oss;
312  oss << "No alias or index file found for " << p_or_n
313  << " database [" << m_DBList[i].GetBasePathS()
314  << "] in search path [" << search_path << "]";
315
316  string msg(oss.str());
317
// NOTE(review): original line 318, which opens this call, is absent
// from the listing; presumably another NCBI_THROW - confirm.
319  eFileErr,
320  msg);
321  } else {
322  // full dereferenced name but without suffix /X/base
323  if (m_ExpandLinks) {
324  string dir_name, base_name;
325  resolved_path.FindDirName().GetString(dir_name);
326  resolved_path.FindBaseName().GetString(base_name);
// NOTE(review): original lines 328-329 (the leading part of this
// argument expression, which presumably uses dir_name) are absent
// from the listing.
327  m_DBList[i].Assign(CSeqDB_Substring(
330  base_name ));
331  } else {
332  m_DBList[i].Assign(resolved_path.FindBasePath());
333  }
334  }
335  }
336
337  // Everything from here depends on m_DBList[0] existing.
338  if (m_DBList.empty())
339  return;
340
341  size_t common = m_DBList[0].GetBasePathS().size();
342
343  // Reduce common length to length of min db path.
344  for(i = 1; common && (i < m_DBList.size()); i++) {
345  if (m_DBList[i].GetBasePathS().size() < common) {
346  common = m_DBList[i].GetBasePathS().size();
347  }
348  }
349
// The candidate prefix is kept strictly shorter than the shortest
// path.
350  if (common) {
351  --common;
352  }
353
354  // Reduce common length to largest universal prefix.
355  const string & first = m_DBList[0].GetBasePathS();
356
357  for(i = 1; common && (i < m_DBList.size()); i++) {
358  // Reduce common prefix length until match is found.
// memcmp() over zero bytes compares equal, so this loop ends at
// the latest when common reaches 0.
359  while(memcmp(first.c_str(),
360  m_DBList[i].GetBasePathS().c_str(),
361  common)) {
362  --common;
363  }
364  }
365
366  // Adjust back to whole path component.
367  while(common && (first[common-1] != CFile::GetPathSeparator())) {
368  --common;
369  }
370
// A prefix of length 0 or 1 is not factored out; m_DBPath then
// stays at "." as set on entry.
371  if (common > 1) {
372  // Factor out common path components.
373  m_DBPath.Assign( CSeqDB_Substring(first.data(), first.data() + common) );
374
375  for(i = 0; i < m_DBList.size(); i++) {
376  CSeqDB_Substring sub(m_DBList[i].GetBasePathS());
377  sub.EraseFront((int) common);
378
379  m_DBList[i].Assign(sub);
380  }
381  }
382 }
383 
384 /// Parse a name-value pair.
385 ///
386 /// The specified section of memory, corresponding to a line from an
387 /// alias file or group alias file, is read, and the name and value
388 /// are returned in the provided strings, whose capacity is managed
389 /// via the quick assignment function.
390 ///
391 /// @param bp The memory region starts here. [in]
392 /// @param ep The end of the memory region. [in]
393 /// @param name The field name is returned here. [out]
394 /// @param value The field value is returned here. [out]
395 
396 static void s_SeqDB_ReadLine(const char * bp,
397  const char * ep,
398  string & name,
399  string & value)
400 {
401  name.erase();
402  value.erase();
403 
404  const char * p = bp;
405 
406  // If first nonspace char is '#', line is a comment, so skip.
407  if (*p == '#') {
408  return;
409  }
410 
411  // Find name
412  const char * spacep = p;
413 
414  while((spacep < ep) && ((*spacep != ' ') && (*spacep != '\t')))
415  spacep ++;
416 
417  s_SeqDB_QuickAssign(name, p, spacep);
418 
419  // Skip spaces, tabs, to find value
420  while((spacep < ep) && ((*spacep == ' ') || (*spacep == '\t')))
421  spacep ++;
422 
423  // Strip spaces, tabs from end
424  while((spacep < ep) && ((ep[-1] == ' ') || (ep[-1] == '\t')))
425  ep --;
426 
427  s_SeqDB_QuickAssign(value, spacep, ep);
428 
429  for(size_t i = 0; i<value.size(); i++) {
430  if (value[i] == '\t') {
431  value[i] = ' ';
432  }
433  }
434 }
435 
436 
437 void CSeqDBAliasNode::x_ReadLine(const char * bp,
438  const char * ep,
439  string & name,
440  string & value)
441 {
442  s_SeqDB_ReadLine(bp, ep, name, value);
443 
444  if (name.size()) {
445  // Store in this nodes' dictionary.
446  m_Values[name].swap(value);
447  }
448 }
449 
450 
452  CSeqDB_Path & index_path,
453  CSeqDB_FileName & alias_fname)
454 {
455  index_path.ReplaceFilename(dbpath,
457  alias_fname.Assign(dbpath.FindFileName());
458 }
459 
460 
461 
462 /// Find starting points of included data in the group alias file.
463 ///
464 /// This function scans the memory region containing the group alias
465 /// file's data, looking for the string provided as the key. The key
466 /// marks the start of each alias file included in the group alias
467 /// file. This code compiles a list of pointers representing the
468 /// starts and ends of the interesting data within the group alias
469 /// file memory region.
470 ///
471 /// The first pointer returned here is the start of a line containing
472 /// the alias file string, then a pointer to the end of that line,
473 /// then a pointer to the start of the next line containing the key,
474 /// and so on, repeating. Finally, the pointer to the end of the data
475 /// is returned. Therefore, to find the names of all the alias files,
476 /// you would examine the range from p0 to p1, p2 to p3, and so on.
477 /// To find the contents of the alias files, you would examine p1 to
478 /// p2, p3 to p4, and so on. The last pointer is appended because it
479 /// makes it easier to write the loop in the recieving code.
480 ///
481 /// @param bp The memory region starts here. [in]
482 /// @param ep The end of the memory region. [in]
483 /// @param key The seperating string. [out]
484 /// @param offsets [out]
485 static void
486 s_SeqDB_FindOffsets(const char * bp,
487  const char * ep,
488  const string & key,
489  vector<const char *> & offsets)
490 {
491  size_t keylen = key.size();
492 
493  const char * last_keyp = ep - keylen;
494 
495  for(const char * p = bp; p < last_keyp; p++) {
496  bool found = true;
497 
498  for(size_t i = 0; i < keylen; i++) {
499  if (p[i] != key[i]) {
500  found = false;
501  break;
502  }
503  }
504 
505  if (found) {
506  // This snippet of code verifies that the key found by the
507  // above loop is either at the start of the memory region,
508  // or is the first non-whitespace on the line it inhabits.
509  // If a database title includes the phrase ALIAS_FILE, we
510  // don't treat it as the start of a new alias file.
511 
512  const char * p2 = p - 1;
513 
514  while((p2 >= bp) && !SEQDB_ISEOL(*p2)) {
515  if ((*p2) != ' ' && (*p2) != '\t') {
516  found = false;
517  break;
518  }
519 
520  p2 --;
521  }
522 
523  if (found) {
524  // Push back start of "ALIAS_FILE" string.
525 
526  offsets.push_back(p);
527 
528  for(p2 = p + keylen; p2 < ep && !SEQDB_ISEOL(*p2); p2++)
529  ;
530 
531  // And end of that line (or of the file).
532  offsets.push_back(p2);
533 
534  p = p2;
535  }
536  }
537  }
538 
539  // As with ISAM files, we append an additional pointer, to
540  // indicate the end of the last entry's contents.
541 
542  offsets.push_back(ep);
543 }
544 
545 
547  CSeqDBLockHold & locked)
548 {
549  string key("ALIAS_FILE");
550 
551  CSeqDBFileMemMap lease(m_Atlas,aset_path.GetPathS());
552 
553  CSeqDBAtlas::TIndx length(0);
554  m_Atlas.GetFileSizeL(aset_path.GetPathS(), length);
555 
556  const char * bp = lease.GetFileDataPtr(0);
557 
558  const char * ep = bp + (size_t) length;
559 
560  vector<const char *> offsets;
561 
562  s_SeqDB_FindOffsets(bp, ep, key, offsets);
563 
564  // Now, for each offset, read the "ALIAS_FILE" line and store the
565  // contents of that (virtual) file in the alias set.
566 
567  if (offsets.size() > 2) {
568  size_t last_start = offsets.size() - 2;
569 
570  string name, value;
571 
572  TAliasGroup & group = m_Groups[aset_path.GetPathS()];
573 
574  for(size_t i = 0; i < last_start; i += 2) {
575  // The line being read here is "ALIAS_FILE <filename>"
576 
577  s_SeqDB_ReadLine(offsets[i],
578  offsets[i+1],
579  name,
580  value);
581 
582  if (name != key || value.empty()) {
583  string msg("Alias set file: syntax error near offset "
584  + NStr::NumericToString(offsets[i] - bp) + ".");
585 
586  NCBI_THROW(CSeqDBException, eFileErr, msg);
587  }
588 
589  group[value].assign(offsets[i+1], offsets[i+2]);
590  }
591  }
592 }
593 
594 
596  const char ** bp,
597  const char ** ep,
598  CSeqDBLockHold & locked)
599 {
600  // Compute name of combined alias file.
601 
602  CSeqDB_Path aset_path;
603  CSeqDB_FileName alias_fname;
604 
605  x_DbToIndexName(dbpath, aset_path, alias_fname);
606 
607  // Check whether we already have this combined alias file.
608 
609  if (m_Groups.find(aset_path.GetPathS()) == m_Groups.end()) {
610  if (! m_Atlas.DoesFileExist(aset_path)) {
611  return false;
612  }
613 
614  x_ReadAliasSetFile(aset_path, locked);
615  }
616 
617  // Find and read the specific, included, alias file.
618 
619  TAliasGroup & group = m_Groups[aset_path.GetPathS()];
620 
621  if (group.find(alias_fname.GetFileNameS()) == group.end()) {
622  return false;
623  }
624 
625  // It would be simpler to move the if (bp||ep) test out to here,
626  // and instead just not add any empty files to the map. In fact,
627  // it may already avoid adding empty files...
628 
629  // Also, it would probably be a good idea to trim whitespace from
630  // the top and bottom of alias file contents, since it is nearly
631  // free to do so before the strings are actually constructed.
632 
633  const string & file_data = group[alias_fname.GetFileNameS()];
634 
635  if (file_data.empty()) {
636  return false;
637  }
638 
639  if (bp || ep) {
640  _ASSERT(bp && ep);
641 
642  *bp = file_data.data();
643  *ep = file_data.data() + file_data.size();
644  }
645 
646  return true;
647 }
648 
649 
651  const CSeqDB_Path & path,
652  const char ** bp,
653  const char ** ep,
654  CSeqDBLockHold & locked)
655 {
656  bool has_group_file = false;
657 
658  has_group_file = m_AliasSets.ReadAliasFile(path, bp, ep, locked);
659 
660  if (! has_group_file) {
661  CSeqDBAtlas::TIndx length(0);
662 
663  m_Atlas.GetFileSizeL(path.GetPathS(), length);
664  *bp = lease.GetFileDataPtr(0);
665  *ep = (*bp) + length;
666  }
667 }
668 
669 
671  CSeqDBLockHold & locked)
672 {
673 
674  CSeqDBFileMemMap lease(m_Atlas,path.GetPathS());
675 
676  const char * bp(0);
677  const char * ep(0);
678 
679  x_ReadAliasFile(lease, path, & bp, & ep, locked);
680 
681  const char * p = bp;
682 
683  // Existence should already be verified.
684  _ASSERT(bp);
685 
686  // These are kept here to reduce allocations.
687  string name_s, value_s;
688 
689  while(p < ep) {
690  // Skip spaces
691  while((p < ep) && (*p == ' ')) {
692  p++;
693  }
694 
695  const char * eolp = p;
696 
697  while((eolp < ep) && !SEQDB_ISEOL(*eolp)) {
698  eolp++;
699  }
700 
701  // Non-empty line, so read it.
702  if (eolp != p) {
703  x_ReadLine(p, eolp, name_s, value_s);
704  }
705 
706  p = eolp + 1;
707  }
708 }
709 
710 
712  char prot_nucl,
713  CSeqDBAliasStack & recurse,
714  CSeqDBLockHold & locked)
715 {
716  CSeqDB_DirName dirname (node_path.FindDirName());
718 
720  subnode( new CSeqDBAliasNode(m_Atlas,
721  dirname,
722  basename,
723  prot_nucl,
724  recurse,
725  locked,
726  m_AliasSets,
727  m_ExpandLinks) );
728 
729  m_SubNodes.push_back(subnode);
730 }
731 
732 
734  char prot_nucl,
735  CSeqDBAliasStack & recurse,
736  CSeqDBLockHold & locked)
737 {
738  if (m_DBList.empty()) {
739  string situation;
740 
741  if (this_name.GetBasePathS() == "-") {
742  situation = "passed to CSeqDB::CSeqDB().";
743  } else {
744  situation = string("found in alias file [")
745  + this_name.GetBasePathS() + "].";
746  }
747 
749  eArgErr,
750  string("No database names were ") + situation);
751  }
752 
753  for(size_t i = 0; i < m_DBList.size(); i++) {
754  // Inquiry: Is the following comparison correct for all
755  // combinations of alias file and database name and path?
756  // Which is to say, does it correctly deal with names of alias
757  // files and names of volumes that collide?
758 
759  // If there is a directory on the mentioned name, we assume
760  // this is NOT an overriding alias file, and skip the test
761  // that treats it as a volume name.
762 
763  // If an alias file refers to a volume of the same name
764  // using "../<cwd>", it will detect and fail with an alias
765  // file cyclicality message at this point.
766 
767  // If the base name of the alias file is also listed in
768  // "dblist", it is assumed to refer to a volume instead of
769  // to itself. In this case, we do NOT search local copy
770 
771  if (m_DBList[i].FindDirName().Empty()) {
772  if (m_DBList[i].FindBaseName() == this_name.FindBaseName()) {
773 
774  string normal_name;
775  if (m_ExpandLinks) {
776  // normalize this_name
777  string dir_name, base_name;
778  this_name.FindDirName().GetString(dir_name);
779  this_name.FindBaseName().GetString(base_name);
780  normal_name = CDirEntry::NormalizePath(dir_name, eFollowLinks) +
781  CDirEntry::GetPathSeparator() + base_name;
782  } else {
783  normal_name = this_name.GetBasePathS();
784  }
785 
786  bool found = false;
787  for (int i = 0; i < (int) m_VolNames.size(); i++) {
788  if (m_VolNames[i].GetBasePathS() == normal_name) {
789  found = true;
790  break;
791  }
792  }
793 
794  if (!found) {
795  m_VolNames.push_back(CSeqDB_BasePath(normal_name));
796  }
797  continue;
798  }
799  }
800 
801  // Join the "current" directory (location of this alias node)
802  // to the path specified in the alias file.
803 
805  CSeqDB_Path new_db_path( base, prot_nucl, 'a', 'l' );
806 
807  if ( recurse.Exists(new_db_path) ) {
809  eFileErr,
810  "Illegal configuration: DB alias files are mutually recursive.");
811  }
812 
813  // If we find the new name in the combined alias file or one
814  // of the individual ones, build a subnode.
815 
816  if ( m_AliasSets.FindAliasPath(new_db_path, 0, locked) ||
817  m_Atlas.DoesFileExist(new_db_path) ) {
818 
819  x_AppendSubNode(base, prot_nucl, recurse, locked);
820  continue;
821  }
822 
823  // The name was not found as an alias file, so check for the
824  // existence of a volume file at the same location.
825 
826  bool found = false;
827  CSeqDB_BasePath bp;
828 
829  // Always check local copy first unless full path is given
830  if (!m_SkipLocal[i]) {
831  // GCC 3.0.4 requires the extra parentheses below.
833  CSeqDB_BaseName(m_DBList[i].FindBaseName()));
834  CSeqDB_Path new_local_vol_path(local_base, prot_nucl, 'i', 'n' );
835  if (m_Atlas.DoesFileExist(new_local_vol_path)) {
836  bp = CSeqDB_BasePath(new_local_vol_path.FindBasePath());
837  found = true;
838  }
839  }
840 
841  if (!found) {
842  CSeqDB_Path new_vol_path( base, prot_nucl, 'i', 'n' );
843  if (m_Atlas.DoesFileExist(new_vol_path)) {
844  bp = CSeqDB_BasePath(new_db_path.FindBasePath() );
845  found = true;
846  }
847  }
848 
849  if (found) {
850  string normal_name;
851  if (m_ExpandLinks) {
852  // normalize this_name
853  string dir_name, base_name;
854  bp.FindDirName().GetString(dir_name);
855  bp.FindBaseName().GetString(base_name);
856  normal_name = CDirEntry::NormalizePath(dir_name, eFollowLinks) +
857  CDirEntry::GetPathSeparator() + base_name;
858  } else {
859  normal_name = bp.GetBasePathS();
860  }
861  found = false;
862  for (int i = 0; i < (int) m_VolNames.size(); i++) {
863  if (m_VolNames[i].GetBasePathS() == normal_name) {
864  found = true;
865  break;
866  }
867  }
868 
869  if (!found) {
870  m_VolNames.push_back(CSeqDB_BasePath(normal_name));
871  }
872 
873  continue;
874  }
875 
876  // If all that failed, "restart" the search using the blast DB
877  // path configuration. This ensures always finding the local
878  // copy of a database even if the alias file resides on remote site.
879 
881  CSeqDB_BasePath restart(m_DBList[i]);
882 
883  if (m_AliasSets.FindBlastDBPath(restart, prot_nucl, result)) {
884  // either alias or volume file exists for this entry
885  CSeqDB_Path new_alias( result, prot_nucl, 'a', 'l' );
886  CSeqDB_Path new_volume( result, prot_nucl, 'i', 'n' );
887 
888  if (m_Atlas.DoesFileExist(new_alias)) {
889  x_AppendSubNode( result, prot_nucl, recurse, locked );
890  } else if (m_Atlas.DoesFileExist(new_volume)) {
891  string normal_name;
892  if (m_ExpandLinks) {
893  // normalize this_name
894  string dir_name, base_name;
895  result.FindDirName().GetString(dir_name);
896  result.FindBaseName().GetString(base_name);
897  normal_name = CDirEntry::NormalizePath(dir_name, eFollowLinks) +
898  CDirEntry::GetPathSeparator() + base_name;
899  } else {
900  normal_name = result.GetBasePathS();
901  }
902  bool found = false;
903  for (int i = 0; i < (int) m_VolNames.size(); i++) {
904  if (m_VolNames[i].GetBasePathS() == normal_name) {
905  found = true;
906  break;
907  }
908  }
909 
910  if (!found) {
911  m_VolNames.push_back(CSeqDB_BasePath(normal_name));
912  }
913  }
914  continue;
915  }
916 
917 
918  ostringstream oss;
919  oss << "Could not find volume or alias file ("
920  << m_DBList[i].GetBasePathS() << ") referenced in alias file ("
921  << this_name.GetBasePathS() << ").";
922 
923  NCBI_THROW(CSeqDBException, eFileErr, oss.str());
924  }
925 }
926 
/// Collect the volume paths (and optionally alias paths) of this node.
///
/// Paths are gathered into sets, so duplicates are dropped, and are
/// then copied out and sorted with SeqDB_CompareVolume.
///
/// @param vols Cleared, then filled with the sorted volume paths. [out]
/// @param alias If non-null, cleared and filled with the sorted alias
///        file paths; these are only gathered in recursive mode, so
///        the list comes back empty otherwise. [out]
/// @param recursive If true, the paths come from x_FindVolumePaths();
///        if false, only the names gathered by the loops below are
///        reported. [in]
927 void CSeqDBAliasNode::FindVolumePaths(vector<string> & vols, vector<string> * alias, bool recursive) const
928 {
929
930  set<string> volset;
931  set<string> aliset;
932
933  if (recursive) {
934  x_FindVolumePaths(volset, aliset);
935  } else {
936  // No alias file list is populated in the non-recursive mode
937  ITERATE(vector<CSeqDB_BasePath>, path, m_VolNames) {
938  volset.insert(path->GetBasePathS());
939  }
// NOTE(review): original line 940, the header of the loop that
// declares `iter`, is absent from this listing; presumably it
// iterates over m_SubNodes - confirm against the repository.
941  ITERATE(vector<CSeqDB_BasePath>, path, (*iter)->m_VolNames) {
942  volset.insert(path->GetBasePathS());
943  }
// The alias file paths of the nested subnodes are reported among
// the volumes here.
944  ITERATE(TSubNodeList, sub, (*iter)->m_SubNodes) {
945  volset.insert(((*sub)->m_ThisName).GetPathS());
946  }
947  }
948  }
949
950  vols.clear();
951  ITERATE(set<string>, iter, volset) {
952  vols.push_back(*iter);
953  }
954
955  // Sort to ensure deterministic order.
956  sort(vols.begin(), vols.end(), SeqDB_CompareVolume);
957
958  if (alias) {
959  alias->clear();
960  ITERATE(set<string>, iter, aliset) {
961  alias->push_back(*iter);
962  }
963  sort(alias->begin(), alias->end(), SeqDB_CompareVolume);
964  }
965 }
966 
967 
969 {
970  ITERATE(TVolNames, iter, m_VolNames) {
971  vols.insert(iter->GetBasePathS());
972  }
973 
974  string alias_base = m_ThisName.GetPathS();
975  if (alias_base != "-") {
976  alias.insert(m_ThisName.GetPathS());
977  }
978 
980  (*iter)->x_FindVolumePaths(vols, alias);
981  }
982 }
983 
984 
985 /// Walker for TITLE field of alias file
986 ///
987 /// The TITLE field of the alias file is a string describing the set
988 /// of sequences collected by that file. The title is reported via
989 /// the "CSeqDB::GetTitle()" method.
990 
992 public:
993  /// This provides the alias file key used for this field.
994  virtual const char * GetFileKey() const
995  {
996  return "TITLE";
997  }
998 
999  /// Collect data from a volume
1000  ///
1001  /// If the TITLE field is not specified in an alias file, we can
1002  /// use the title(s) in the database volume(s). Values from alias
1003  /// node tree siblings are concatenated with "; " used as a
1004  /// delimiter.
1005  ///
1006  /// @param vol
1007  /// A database volume
1008  virtual void Accumulate(const CSeqDBVol & vol)
1009  {
1010  AddString( vol.GetTitle() );
1011  }
1012 
1013  /// Collect data from an alias file
1014  ///
1015  /// If the TITLE field is specified in an alias file, it will be
1016  /// used unmodified. Values from alias node tree siblings are
1017  /// concatenated with "; " used as a delimiter.
1018  ///
1019  /// @param value
1020  /// A title string to append to the accumulated title
1021  virtual void AddString(const string & value)
1022  {
1023  SeqDB_JoinDelim(m_Value, value, "; ");
1024  }
1025 
1026  /// Returns the database title string.
1027  string GetTitle()
1028  {
1029  return m_Value;
1030  }
1031 
1032 private:
1033  /// The title string we are accumulating.
1034  string m_Value;
1035 };
1036 
1037 
1038 /// Walker for MAX_SEQ_LENGTH field of alias file
1039 ///
1040 /// This functor encapsulates the specifics of the MAX_SEQ_LENGTH
1041 /// field of the alias file. The MAX_SEQ_LENGTH field specifies the
1042 /// length of the longest sequence; values taken from alias files
1043 /// and from volumes are combined by keeping the largest value
1044 /// seen.
1045 /// (Note: this seems to be unused.)
1046 
1048 public:
1049  /// Constructor
1051  {
1052  m_Value = 0;
1053  }
1054 
1055  /// This provides the alias file key used for this field.
1056  virtual const char * GetFileKey() const
1057  {
1058  return "MAX_SEQ_LENGTH";
1059  }
1060 
1061  /// Collect data from the volume
1062  ///
1063  /// If the MAX_SEQ_LENGTH field is not specified in an alias file,
1064  /// the maximum values of all contributing volumes is used.
1065  ///
1066  /// @param vol
1067  /// A database volume
1068  virtual void Accumulate(const CSeqDBVol & vol)
1069  {
1070  int new_max = vol.GetMaxLength();
1071 
1072  if (new_max > m_Value)
1073  m_Value = new_max;
1074  }
1075 
1076  /// Collect data from an alias file
1077  ///
1078  /// Values from alias node tree siblings are compared, and the
1079  /// maximum value is used as the result.
1080  ///
1081  /// @param value
1082  /// The field value, as a decimal string
1083  virtual void AddString(const string & value)
1084  {
1085  int new_max = NStr::StringToUInt(value);
1086 
1087  if (new_max > m_Value)
1088  m_Value = new_max;
1089  }
1090 
1091  /// Returns the maximum sequence length.
1093  {
1094  return m_Value;
1095  }
1096 
1097 private:
1098  /// The maximum sequence length.
1099  int m_Value;
1100 };
1101 
1103 public:
1104  /// Constructor
1106  {
1107  m_Value = INT4_MAX;
1108  }
1109 
1110  /// This provides the alias file key used for this field.
1111  virtual const char * GetFileKey() const
1112  {
1113  return "MIN_SEQ_LENGTH";
1114  }
1115 
1116  /// Collect data from the volume
1117  ///
1118  /// If the MIN_SEQ_LENGTH field is not specified in an alias file,
1119  /// the minimum value over all contributing volumes is used.
1120  ///
1121  /// @param vol
1122  /// A database volume
1123  virtual void Accumulate(const CSeqDBVol & vol)
1124  {
1125  int new_min = vol.GetMinLength();
1126 
1127  if (new_min < m_Value) {
1128  m_Value = new_min;
1129  }
1130  }
1131 
1132  /// Collect data from an alias file
1133  ///
1134  /// Values from alias node tree siblings are compared, and the
1135  /// minimum value is used as the result.
1136  ///
1137  /// @param value
1138  /// The field value, as a decimal string
1139  virtual void AddString(const string & value)
1140  {
1141  int new_min = NStr::StringToUInt(value);
1142 
1143  if (new_min < m_Value) {
1144  m_Value = new_min;
1145  }
1146  }
1147 
1148  /// Returns the minimum sequence length.
1150  {
1151  return m_Value;
1152  }
1153 
1154 private:
1155  /// The minimum sequence length.
1156  int m_Value;
1157 };
1158 
1159 /// Walker for NSEQ field of alias file
1160 ///
1161 /// The NSEQ field of the alias file specifies the number of sequences
1162 /// to use when reporting information via the "CSeqDB::GetNumSeqs()"
1163 /// method. It is not the same as the number of OIDs unless there are
1164 /// no filtering mechanisms in use.
1165 
1167 public:
1168  /// Constructor
1170  {
1171  m_Value = 0;
1172  }
1173 
1174  /// This provides the alias file key used for this field.
1175  virtual const char * GetFileKey() const
1176  {
1177  return "NSEQ";
1178  }
1179 
1180  /// Collect data from the volume
1181  ///
1182  /// If the NSEQ field is not specified in an alias file, the
1183  /// number of OIDs in the volume is used instead.
1184  ///
1185  /// @param vol
1186  /// A database volume
1187  virtual void Accumulate(const CSeqDBVol & vol)
1188  {
1189  m_Value += vol.GetNumOIDs();
1190  }
1191 
1192  /// Collect data from an alias file
1193  ///
1194  /// If the NSEQ field is specified in an alias file, it will be
1195  /// used. Values from alias node tree siblings are summed.
1196  ///
1197  /// @param value
1198  /// The NSEQ value read from the alias file.
1199  virtual void AddString(const string & value)
1200  {
1202  }
1203 
1204  /// Returns the accumulated number of OIDs.
1205  Int8 GetNum() const
1206  {
1207  return m_Value;
1208  }
1209 
1210 private:
1211  /// The accumulated number of OIDs.
1213 };
1214 
1215 
1216 /// Walker for OID count accumulation.
1217 ///
1218 /// The number of OIDs should be like the number of sequences, but
1219 /// without the value adjustments made by alias files. To preserve
1220 /// this relationship, this class inherits from CSeqDB_NSeqsWalker.
1221 
1223 public:
1224  /// This disables the key; the spaces would not be preserved, so
1225  /// this is a non-matchable string in this context.
1226  virtual const char * GetFileKey() const
1227  {
1228  return " no key ";
1229  }
1230 };
1231 
1232 /// Walker for STATS_NSEQ field of alias file
1233 ///
1234 /// The STATS_NSEQ field of the alias file specifies the number of
1235 /// sequences to use when reporting information via the
1236 /// "CSeqDB::GetNumSeqsStats()" method. It is not the same as the
1237 /// number of OIDs unless there are no filtering mechanisms in use.
1238 
1240 public:
1241  /// This does the same calculation as above but uses another key.
1242  virtual const char * GetFileKey() const
1243  {
1244  return "STATS_NSEQ";
1245  }
1246 
1247  /// STATS_NSEQ field
1248  ///
1249  /// The STATS_* versions of these walkers do not return volume
1250  /// lengths, instead the value zero will be returned if the field
1251  /// is not specified.
1252  ///
1253  /// @param vol
1254  /// A database volume
1255  virtual void Accumulate(const CSeqDBVol & vol)
1256  {
1257  // only alias file data is included.
1258  }
1259 };
1260 
1261 
1262 /// Walker for total length accumulation.
1263 ///
1264 /// The total length of the database is the sum of the lengths of all
1265 /// volumes of the database (measured in bases).
1266 
1268 public:
1269  /// Constructor
1271  {
1272  m_Value = 0;
1273  }
1274 
1275  /// This provides the alias file key used for this field.
1276  virtual const char * GetFileKey() const
1277  {
1278  return "LENGTH";
1279  }
1280 
1281  /// Collect data from the volume
1282  ///
1283  /// If the LENGTH field is not specified in an alias file, the
1284  /// sum of the volume lengths will be used.
1285  ///
1286  /// @param vol
1287  /// A database volume
1288  virtual void Accumulate(const CSeqDBVol & vol)
1289  {
1290  m_Value += vol.GetVolumeLength();
1291  }
1292 
1293  /// Collect data from an alias file
1294  ///
1295  /// If the LENGTH field is specified in an alias file, it will be
1296  /// used. Values from alias node tree siblings are summed.
1297  ///
1298  /// @param value
1299  /// The LENGTH value read from the alias file.
1300  virtual void AddString(const string & value)
1301  {
1303  }
1304 
1305  /// Returns the accumulated volume length.
1307  {
1308  return m_Value;
1309  }
1310 
1311 private:
1312  /// The accumulated volume length.
1314 };
1315 
1316 
1317 /// Walker for volume length accumulation.
1318 ///
1319 /// The volume length should be like total length, but without the
1320 /// value adjustments made by alias files. To preserve this
1321 /// relationship, this class inherits from CSeqDB_TotalLengthWalker.
1322 /// (Note: this seems to be unused.)
1323 
1325 public:
1326  /// This disables the key; the spaces would not be preserved, so
1327  /// this is a non-matchable string in this context.
1328  virtual const char * GetFileKey() const
1329  {
1330  return " no key ";
1331  }
1332 };
1333 
1334 
1335 /// Walker for total length stats accumulation.
1336 ///
1337 /// The total length of the database is the sum of the lengths of all
1338 /// volumes of the database (measured in bases).
1339 
1341 public:
1342  /// This does the same calculation as above but uses another key.
1343  virtual const char * GetFileKey() const
1344  {
1345  return "STATS_TOTLEN";
1346  }
1347 
1348  /// STATS_TOTLEN field
1349  ///
1350  /// The STATS_* versions of these walkers do not return volume
1351  /// lengths, instead the value zero will be returned if the field
1352  /// is not specified.
1353  ///
1354  /// @param vol
1355  /// A database volume
1356  virtual void Accumulate(const CSeqDBVol & vol)
1357  {
1358  // only alias file data is included.
1359  }
1360 };
1361 
1362 
1363 /// Walker for membership bit
1364 ///
1365 /// This just searches alias files for the membership bit if one is
1366 /// specified.
1367 
1369 public:
1370  /// Constructor
1372  {
1373  m_Value = 0;
1374  }
1375 
1376  /// This provides the alias file key used for this field.
1377  virtual const char * GetFileKey() const
1378  {
1379  return "MEMB_BIT";
1380  }
1381 
1382  /// Collect data from the volume
1383  ///
1384  /// If the MEMB_BIT field is not specified in an alias file, then
1385  /// it is not needed. This field is intended to allow filtration
1386  /// of deflines by taxonomic category, which is only needed if an
1387  /// alias file reduces the taxonomic scope.
1388  virtual void Accumulate(const CSeqDBVol &)
1389  {
1390  // Volumes don't have this data, only alias files.
1391  }
1392 
1393  /// Collect data from an alias file
1394  ///
1395  /// If the MEMB_BIT field is specified in an alias file, it will
1396  /// be used unmodified. No attempt is made to combine or collect
1397  /// bit values - currently, only one can be used at a time.
1398  ///
1399  /// @param value
1400  /// The MEMB_BIT value read from the alias file.
1401  virtual void AddString(const string & value)
1402  {
1404  }
1405 
1406  /// Returns the membership bit.
1407  int GetMembBit() const
1408  {
1409  return m_Value;
1410  }
1411 
1412 private:
1413  /// The membership bit.
1414  int m_Value;
1415 };
1416 
1417 /// Walker for oid mask type
1418 ///
1419 /// This just searches alias files for the oid mask type if one is
1420 /// specified.
1421 
1423 public:
1424  /// Constructor
1426  {
1427  m_Value = 0;
1428  }
1429 
1430  /// This provides the alias file key used for this field.
1431  virtual const char * GetFileKey() const
1432  {
1433  return "OID_MASK_TYPE";
1434  }
1435 
1436  /// Collect data from the volume
1437  ///
1438  /// Volumes do not carry an OID mask type. If the OID_MASK_TYPE
1439  /// field is not specified in an alias file, then it is not
1440  /// needed, so nothing is collected from the volume here.
1441  /// Only alias files can supply this value.
1442  virtual void Accumulate(const CSeqDBVol &)
1443  {
1444  // Volumes don't have this data, only alias files.
1445  }
1446 
1447  /// Collect data from an alias file
1448  ///
1449  /// If the OID_MASK_TYPE field is specified in an alias file, it will
1450  /// be used unmodified. No attempt is made to combine or collect
1451  /// mask type values - currently, only one can be used at a time.
1452  ///
1453  /// @param value
1454  /// The OID_MASK_TYPE value read from the alias file.
1455  virtual void AddString(const string & value)
1456  {
1458  }
1459 
1460  /// Returns the oid mask type.
1461  int GetOidMaskType() const
1462  {
1463  return m_Value;
1464  }
1465 
1466 private:
1467  /// The oid mask type.
1468  int m_Value;
1469 };
1470 
1471 
1472 /// Test for completeness of GI list alias file values.
1473 ///
1474 /// This searches alias files to determine whether NSEQ and LENGTH
1475 /// are specified in all of the cases where they should be. If any
1476 /// node has an ID list (GI, TI, Seq-id, tax ID or OID list) but the
1477 /// number of included sequences or the length is not specified, then
1478 /// SeqDB must scan the database to compute these totals.
1479 
1481 public:
1482  /// Constructor
1484  {
1485  m_NeedScan = false;
1486  }
1487 
1488  /// Collect data from the volume
1489  ///
1490  /// Volume data is not used by this class.
1491  virtual void Accumulate(const CSeqDBVol &)
1492  {
1493  // Volumes don't have this data, only alias files.
1494  }
1495 
1496  /// Explore the values in this alias file
1497  ///
1498  /// If the NSEQ and LENGTH fields are specified, this method can
1499  /// close this branch of the traversal tree. Otherwise, if the
1500  /// GILIST or TILIST is specified, then this branch of the
1501  /// traversal will fail to produce accurate totals information,
1502  /// therefore an oid scan is required, and we are done.
1503  ///
1504  /// @param vars
1505  /// The name/value mapping for this node.
1506  /// @return
1507  /// True if the traversal should cease descent.
1508  virtual bool Explore(const TVarList & vars)
1509  {
1510  // If we already know that a scan is needed, we can skip all
1511  // further analysis (by returning true at all points).
1512 
1513  if (m_NeedScan)
1514  return true;
1515 
1516  // If we find both NSEQ and LENGTH, then this branch of the
1517  // alias file is covered.
1518 
1519  if (vars.find("NSEQ") != vars.end() &&
1520  vars.find("LENGTH") != vars.end()) {
1521 
1522  return true;
1523  }
1524 
1525  // If we we have an attached GILIST (but don't have both NSEQ
1526  // and LENGTH), then we need to scan the entire database.
1527 
1528  if (vars.find("GILIST") != vars.end()) {
1529  m_NeedScan = true;
1530  return true;
1531  }
1532 
1533  // Ditto for an attached TILIST.
1534 
1535  if (vars.find("TILIST") != vars.end()) {
1536  m_NeedScan = true;
1537  return true;
1538  }
1539 
1540  // Ditto for an attached SILIST.
1541 
1542  if (vars.find("SEQIDLIST") != vars.end()) {
1543  m_NeedScan = true;
1544  return true;
1545  }
1546  if (vars.find("TAXIDLIST") != vars.end()) {
1547  m_NeedScan = true;
1548  return true;
1549  }
1550 
1551  if (vars.find("OIDLIST") != vars.end()) {
1552  m_NeedScan = true;
1553  return true;
1554  }
1555 
1556  // If none of those conditions is met, traversal proceeds.
1557  return false;
1558  }
1559 
1560  /// Returns true if a scan is required.
1561  bool NeedScan() const
1562  {
1563  return m_NeedScan;
1564  }
1565 
1566 private:
1567  /// False until a node with an ID list but incomplete totals is found.
1569 };
1570 
1571 
1572 void
1574  const CSeqDBVolSet & volset) const
1575 {
1577  m_Values.find(walker->GetFileKey());
1578 
1579  if (value != m_Values.end()) {
1580  walker->AddString( (*value).second );
1581  return;
1582  }
1583 
1584  ITERATE(TSubNodeList, node, m_SubNodes) {
1585  (*node)->WalkNodes( walker, volset );
1586  }
1587 
1588  ITERATE(TVolNames, volname, m_VolNames) {
1589  if (const CSeqDBVol * vptr = volset.GetVol(volname->GetBasePathS())) {
1590  walker->Accumulate( *vptr );
1591  }
1592  }
1593 }
1594 
1595 
1596 void
1598  const CSeqDBVolSet & volset) const
1599 {
1600  if (explorer->Explore(m_Values)) {
1601  return;
1602  }
1603 
1604  ITERATE(TSubNodeList, node, m_SubNodes) {
1605  (*node)->WalkNodes( explorer, volset );
1606  }
1607 
1608  ITERATE(TVolNames, volname, m_VolNames) {
1609  if (const CSeqDBVol * vptr = volset.GetVol(volname->GetBasePathS())) {
1610  explorer->Accumulate( *vptr );
1611  }
1612  }
1613 }
1614 
1615 // This could be changed to use ComputeMasks, then apply the masks for
1616 // each node in a second traversal. However, it probably makes more
1617 // sense to ignore this because eventually this functionality can be
1618 // scrapped once the filter tree based OID wrangling is ready.
1619 
1620 string CSeqDBAliasNode::GetTitle(const CSeqDBVolSet & volset) const
1621 {
1622  CSeqDB_TitleWalker walk;
1623  WalkNodes(& walk, volset);
1624 
1625  return walk.GetTitle();
1626 }
1627 
1629 {
1631  WalkNodes(& walk, vols);
1632 
1633  return walk.GetMinLength();
1634 }
1635 
1637 {
1638  CSeqDB_NSeqsWalker walk;
1639  WalkNodes(& walk, vols);
1640 
1641  return walk.GetNum();
1642 }
1643 
1645 {
1647  WalkNodes(& walk, vols);
1648 
1649  return walk.GetNum();
1650 }
1651 
1653 {
1654  CSeqDB_NOIDsWalker walk;
1655  WalkNodes(& walk, vols);
1656 
1657  return walk.GetNum();
1658 }
1659 
1661 {
1663  WalkNodes(& walk, volset);
1664 
1665  return walk.GetLength();
1666 }
1667 
1669 {
1671  WalkNodes(& walk, volset);
1672 
1673  return walk.GetLength();
1674 }
1675 
1676 // (Note: this seems to be unused.)
1678 {
1680  WalkNodes(& walk, volset);
1681 
1682  return walk.GetLength();
1683 }
1684 
1686 {
1687  CSeqDB_MembBitWalker walk;
1688  WalkNodes(& walk, volset);
1689 
1690  return walk.GetMembBit();
1691 }
1692 
1693 
1695 {
1696  CSeqDB_IdListValuesTest explore;
1697  WalkNodes(& explore, volset);
1698 
1699  return explore.NeedScan();
1700 }
1701 
1703 {
1705  WalkNodes(& walk, volset);
1706 
1707  return walk.GetOidMaskType();
1708 }
1709 
1710 
1711 
1713 CompleteAliasFileValues(const CSeqDBVolSet & volset)
1714 {
1715  // First, complete the values stored in the child nodes.
1716 
1718  (**node).CompleteAliasFileValues(volset);
1719  }
1720 
1721  // Then, get the various values for this node.
1722 
1723  if (m_Values.find("TITLE") == m_Values.end()) {
1724  m_Values["TITLE"] = GetTitle(volset);
1725  }
1726 }
1727 
1728 
1731 {
1733 
1734  afv[m_ThisName.GetPathS()].push_back(m_Values);
1735 
1736  ITERATE(TSubNodeList, node, m_SubNodes) {
1737  (**node).GetAliasFileValues(afv);
1738  }
1739 }
1740 
1742 GetMaskList(vector <string> & mask_list)
1743 {
1744  if (!m_HasGiMask) {
1745  return;
1746  }
1747 
1748  mask_list.clear();
1749 
1750  // parse the white spaces...
1751  vector <CTempString> masks;
1752  SeqDB_SplitQuoted(m_SubNodes[0]->m_Values["MASKLIST"], masks);
1753  ITERATE(vector <CTempString>, mask, masks) {
1754  mask_list.push_back(string(*mask));
1755  }
1756 }
1757 
1759  const CSeqDBVolSet & volset)
1760 {
1762 
1763  // Now complete the 'volume' values.
1764  for(int i = 0; i < volset.GetNumVols(); i++) {
1765  const CSeqDBVol * v = volset.GetVol(i);
1766 
1767  string key = v->GetVolName();
1768 
1769  if (afv.find(key) != afv.end()) {
1770  // If this name already corresponds to an alias file,
1771  // don't replace it with a volume.
1772  continue;
1773  }
1774 
1775  // Add the title of the volume.
1776  map<string,string> values;
1777  values["TITLE"] = v->GetTitle();
1778 
1779  string extn = (m_IsProtein ? ".pin" : ".nin");
1780 
1781  afv[v->GetVolName() + extn].push_back(values);
1782  }
1783 
1784  m_Node->GetAliasFileValues(afv);
1785 }
1786 
1787 
1789 {
1790  // Default is zero; -1 means not-computed-yet.
1791  if (m_MembBit == -1) {
1792  m_MembBit = m_Node->GetMembBit(volset);
1793  }
1794 
1795  return m_MembBit;
1796 }
1797 
1798 
1799 string CSeqDBAliasFile::GetTitle(const CSeqDBVolSet & volset) const
1800 {
1801  if (! m_HasTitle)
1802  m_Title = m_Node->GetTitle(volset);
1803 
1804  return m_Title;
1805 }
1806 
1808 {
1809  if (m_MinLength == -1)
1810  m_MinLength = m_Node->GetMinLength(volset);
1811 
1812  return m_MinLength;
1813 }
1814 
1816 {
1817  if (m_NumSeqs == -1)
1818  m_NumSeqs = m_Node->GetNumSeqs(volset);
1819 
1820  return m_NumSeqs;
1821 }
1822 
1823 
1825 {
1826  if (m_NumSeqsStats == -1)
1828 
1829  return m_NumSeqsStats;
1830 }
1831 
1832 
1834 {
1835  if (m_NumOIDs == -1)
1836  m_NumOIDs = m_Node->GetNumOIDs(volset);
1837 
1838  return m_NumOIDs;
1839 }
1840 
1841 
1843 {
1844  if (m_TotalLength == -1)
1845  m_TotalLength = m_Node->GetTotalLength(volset);
1846 
1847  return m_TotalLength;
1848 }
1849 
1850 
1852 {
1853  if (m_TotalLengthStats == -1)
1855 
1856  return m_TotalLengthStats;
1857 }
1858 
1859 
1861 {
1862  if (m_VolumeLength == -1)
1864 
1865  return m_VolumeLength;
1866 }
1867 
1868 
1870 {
1871  if (m_NeedTotalsScan == -1) {
1872  bool need = m_Node->NeedTotalsScan(volset);
1873  m_NeedTotalsScan = need ? 1 : 0;
1874  }
1875  return m_NeedTotalsScan == 1;
1876 }
1877 
1879 {
1880  ftree.SetName(m_ThisName.GetPathS());
1881  ftree.AddFilters(m_NodeMasks);
1882 
1883  ITERATE(TSubNodeList, node, m_SubNodes) {
1885 
1886  (*node)->BuildFilterTree( *subtree );
1887  ftree.AddNode(subtree);
1888  }
1889 
1890  ITERATE(TVolNames, volname, m_VolNames) {
1891  ftree.AddVolume(*volname);
1892  }
1893 }
1894 
1896 {
1897  // Default is zero.
1898  m_OidMaskType = m_Node->GetOidMaskType(volset);
1899 
1900  return m_OidMaskType;
1901 }
1902 
1904 {
1905  if (m_TopTree.Empty()) {
1906  x_ComputeMasks();
1907 
1910  }
1911 
1912  return m_TopTree;
1913 }
1914 
1915 void
1917 {
1918  ddc.SetFrame("CSeqDBAliasFile");
1919  CObject::DebugDump(ddc, depth);
1920  for (SIZE_TYPE i = 0; i < m_VolumeNames.size(); i++) {
1921  ddc.Log("m_VolumeNames[" + NStr::SizetToString(i) + "]",
1922  m_VolumeNames[i]);
1923  }
1924  for (SIZE_TYPE i = 0; i < m_AliasNames.size(); i++) {
1925  ddc.Log("m_AliasNames[" + NStr::SizetToString(i) + "]",
1926  m_AliasNames[i]);
1927  }
1928  ddc.Log("m_IsProtein", m_IsProtein);
1929  ddc.Log("m_MinLength", m_MinLength);
1930  ddc.Log("m_NumSeqs", m_NumSeqs);
1931  ddc.Log("m_NumSeqsStats", m_NumSeqsStats);
1932  ddc.Log("m_NumOIDs", m_NumOIDs);
1933  ddc.Log("m_TotalLength", m_TotalLength);
1934  ddc.Log("m_TotalLengthStats", m_TotalLengthStats);
1935  ddc.Log("m_VolumeLength", m_VolumeLength);
1936  ddc.Log("m_MembBit", m_MembBit);
1937  ddc.Log("m_HasTitle", m_HasTitle);
1938  ddc.Log("m_Title", m_Title);
1939  ddc.Log("m_NeedTotalsScan", m_NeedTotalsScan);
1940  ddc.Log("m_HasFilters", m_HasFilters);
1941 }
1942 
1943 void CSeqDBAliasNode::ComputeMasks(bool & has_filters)
1944 {
1945  if (! m_NodeMasks.empty()) {
1946  return;
1947  }
1948 
1949  typedef CSeqDB_AliasMask TMask;
1950 
1951  TVarList::iterator gil_iter = m_Values.find(string("GILIST"));
1952  TVarList::iterator til_iter = m_Values.find(string("TILIST"));
1953  TVarList::iterator sil_iter = m_Values.find(string("SEQIDLIST"));
1954  TVarList::iterator oid_iter = m_Values.find(string("OIDLIST"));
1955  TVarList::iterator f_oid_iter = m_Values.find(string("FIRST_OID"));
1956  TVarList::iterator l_oid_iter = m_Values.find(string("LAST_OID"));
1957  TVarList::iterator mbit_iter = m_Values.find(string("MEMB_BIT"));
1958  TVarList::iterator taxid_iter = m_Values.find(string("TAXIDLIST"));
1959  TVarList::iterator oid_mask_type_iter = m_Values.find(string("OID_MASK_TYPE"));
1960 
1961  if (! m_DBList.empty()) {
1962  if (oid_iter != m_Values.end() ||
1963  gil_iter != m_Values.end() ||
1964  til_iter != m_Values.end() ||
1965  sil_iter != m_Values.end() ||
1966  f_oid_iter != m_Values.end() ||
1967  l_oid_iter != m_Values.end() ||
1968  mbit_iter != m_Values.end() ||
1969  taxid_iter != m_Values.end()) {
1970 
1971  has_filters = true;
1972 
1973  int first_oid = 0;
1974  int last_oid = INT_MAX;
1975  bool has_range = false;
1976 
1977  if (f_oid_iter != m_Values.end()) {
1978  first_oid = NStr::StringToUInt(f_oid_iter->second);
1979 
1980  // Starts at one, adjust to zero-indexed.
1981  if (first_oid)
1982  first_oid--;
1983 
1984  has_range = true;
1985  }
1986 
1987  if (l_oid_iter != m_Values.end()) {
1988  // Zero indexing and post notation adjustments cancel.
1989  last_oid = NStr::StringToUInt(l_oid_iter->second);
1990  has_range = true;
1991  }
1992 
1993  if (has_range) {
1994  CRef<TMask> mask(new TMask(first_oid, last_oid));
1995  m_NodeMasks.push_back(mask);
1996  }
1997 
1998  if (oid_iter != m_Values.end()) {
1999  CSeqDB_FileName lst(oid_iter->second);
2000  CSeqDB_Path lst_path(m_DBPath, lst);
2001  CFile oid_f(lst_path.GetPathS());
2002  if (!oid_f.Exists() && m_VolNames.size() != 0) {
2003  CSeqDB_Path tmp(m_VolNames[0].FindDirName(), lst.GetFileNameSub());
2004  lst_path = tmp;
2005  }
2006 
2007  int oid_mask_type = 0;
2008  if(oid_mask_type_iter != m_Values.end()) {
2009  oid_mask_type = NStr::StringToUInt(oid_mask_type_iter->second);
2010  }
2011  CRef<TMask> mask(new TMask(TMask::eOidList, lst_path, oid_mask_type));
2012  m_NodeMasks.push_back(mask);
2013  }
2014 
2015  if (gil_iter != m_Values.end()) {
2016  const string & gilname = gil_iter->second;
2017 
2018  if (gilname.find(" ") != gilname.npos) {
2019  string msg =
2020  string("Alias file (") + m_DBPath.GetDirNameS() +
2021  ") has multiple GI lists (" + gilname + ").";
2022 
2023  NCBI_THROW(CSeqDBException, eFileErr, msg);
2024  }
2025 
2026  CSeqDB_FileName lst(gilname);
2027  CSeqDB_Path lst_path(m_DBPath, lst);
2028 
2029  CRef<TMask> mask(new TMask(TMask::eGiList, lst_path));
2030  m_NodeMasks.push_back(mask);
2031  }
2032 
2033  if (til_iter != m_Values.end()) {
2034  const string & tilname = til_iter->second;
2035 
2036  if (tilname.find(" ") != tilname.npos) {
2037  string msg =
2038  string("Alias file (") + m_DBPath.GetDirNameS() +
2039  ") has multiple TI lists (" + tilname + ").";
2040 
2041  NCBI_THROW(CSeqDBException, eFileErr, msg);
2042  }
2043 
2044  CSeqDB_FileName lst(tilname);
2045  CSeqDB_Path lst_path(m_DBPath, lst);
2046 
2047  CRef<TMask> mask(new TMask(TMask::eTiList, lst_path));
2048  m_NodeMasks.push_back(mask);
2049  }
2050 
2051  if (sil_iter != m_Values.end()) {
2052  const string & silname = sil_iter->second;
2053 
2054  if (silname.find(" ") != silname.npos) {
2055  string msg =
2056  string("Alias file (") + m_DBPath.GetDirNameS() +
2057  ") has multiple SEQID lists (" + silname + ").";
2058 
2059  NCBI_THROW(CSeqDBException, eFileErr, msg);
2060  }
2061 
2062  CSeqDB_FileName lst(silname);
2063  CSeqDB_Path lst_path(m_DBPath, lst);
2064 
2065  CRef<TMask> mask(new TMask(TMask::eSiList, lst_path));
2066  m_NodeMasks.push_back(mask);
2067  }
2068 
2069  if (mbit_iter != m_Values.end()) {
2070  int mbit = NStr::StringToUInt(mbit_iter->second);
2071  CRef<TMask> mask(new TMask(mbit));
2072  m_NodeMasks.push_back(mask);
2073  }
2074 
2075  if (taxid_iter != m_Values.end()) {
2076  const string & taxid_name = taxid_iter->second;
2077  if (taxid_name.find(" ") != taxid_name.npos) {
2078  string msg = string("Alias file (") + m_DBPath.GetDirNameS() +
2079  ") has multiple Tax ids lists (" + taxid_name + ").";
2080  NCBI_THROW(CSeqDBException, eFileErr, msg);
2081  }
2082 
2083  CSeqDB_FileName lst(taxid_name);
2084  CSeqDB_Path lst_path(m_DBPath, lst);
2085 
2086  CRef<TMask> mask(new TMask(TMask::eTaxIdList, lst_path));
2087  m_NodeMasks.push_back(mask);
2088  }
2089  }
2090  }
2091 
2093  (**sn).ComputeMasks(has_filters);
2094  }
2095 }
2096 
2098 
@ eOidList
Data is a list of discontiguous ordinal ids (indices)
ncbi::TMaskedQueryRegions mask
void SetFrame(const string &frame)
Definition: ddumpable.cpp:137
void Log(const string &name, const char *value, CDebugDumpFormatter::EValueType type=CDebugDumpFormatter::eValue, const string &comment=kEmptyStr)
Definition: ddumpable.cpp:151
CFile –.
Definition: ncbifile.hpp:1604
CSeqDBAliasSets m_AliasSets
Combined alias files.
int m_MembBit
Membership bit.
void GetAliasFileValues(TAliasFileValues &afv, const CSeqDBVolSet &volset)
Get Name/Value Data From Alias Files.
Int8 GetNumSeqsStats(const CSeqDBVolSet &volset) const
Get the number of sequences available.
bool m_HasTitle
True if we have the database title.
bool m_HasFilters
Are there filters for this database?
Uint8 GetTotalLength(const CSeqDBVolSet &volset) const
Get the total length of the set of databases.
vector< string > m_AliasNames
The cached output of the topmost node's FindVolumePaths(recursive).
Int4 m_MinLength
Shortest sequence length.
void x_ComputeMasks()
Compute filtering options for all volumes.
Uint8 GetTotalLengthStats(const CSeqDBVolSet &volset) const
Get the total length of the set of databases.
bool m_IsProtein
True if this is a protein database.
bool NeedTotalsScan(const CSeqDBVolSet &volset) const
Check whether a db scan is need to compute correct totals.
Int4 GetMinLength(const CSeqDBVolSet &volset) const
Get the number of sequences available.
Int8 GetNumOIDs(const CSeqDBVolSet &volset) const
Get the size of the OID range.
int GetOidMaskType(const CSeqDBVolSet &volset) const
Get the Oid Mask Type.
Int8 m_TotalLength
Total length.
Int8 m_NumOIDs
Number of OIDs.
Int8 m_NumSeqs
Number of sequences.
int m_NumSeqsStats
Number of sequences for statistics purposes.
CRef< CSeqDB_FilterTree > m_TopTree
Filter tree representing all alias file filtering.
int m_NeedTotalsScan
1 if we need a totals scan, 0 if not, -1 if not known.
CSeqDBAliasFile(CSeqDBAtlas &atlas, const string &name_list, char prot_nucl, bool expand_links=true)
Constructor.
Definition: seqdbalias.cpp:55
int m_OidMaskType
Oid Mask Type.
string GetTitle(const CSeqDBVolSet &volset) const
Get the title.
void DebugDump(CDebugDumpContext ddc, unsigned int depth) const
Dump debug information for this object.
Uint8 GetVolumeLength(const CSeqDBVolSet &volset) const
Get the sum of the volume lengths.
Int8 m_VolumeLength
Total length ignoring filtering.
string m_Title
Database title.
CRef< CSeqDBAliasNode > m_Node
This is the alias node tree's "artificial" topmost node, which aggregates the user provided database ...
Int8 GetNumSeqs(const CSeqDBVolSet &volset) const
Get the number of sequences available.
vector< string > m_VolumeNames
The cached output of the topmost node's FindVolumePaths(recursive).
Int8 m_TotalLengthStats
Total length for statistics purposes.
CRef< CSeqDB_FilterTree > GetFilterTree()
Get filtering tree for all volumes.
int GetMembBit(const CSeqDBVolSet &volset) const
Get the membership bit.
CSeqDBAliasNode class.
Definition: seqdbalias.hpp:428
void x_ResolveNames(char prot_nucl, CSeqDBLockHold &locked)
Name resolution.
Definition: seqdbalias.cpp:249
void x_ExpandAliases(const CSeqDB_BasePath &this_name, char prot_nucl, CSeqDBAliasStack &recurse, CSeqDBLockHold &locked)
Expand a node of the alias node tree recursively.
Definition: seqdbalias.cpp:733
TVolNames m_VolNames
Set of volume names associated with this node.
Definition: seqdbalias.hpp:941
void ComputeMasks(bool &has_filters)
Computes the masking information for each alias node.
vector< bool > m_SkipLocal
Should we skip local DB search for this DBLIST?
Definition: seqdbalias.hpp:957
Uint8 GetTotalLength(const CSeqDBVolSet &volset) const
Get the total length of the set of databases.
void x_ReadValues(const CSeqDB_Path &fn, CSeqDBLockHold &locked)
Read the alias file.
Definition: seqdbalias.cpp:670
void x_ReadLine(const char *bp, const char *ep, string &name_s, string &value_s)
Read one line of the alias file.
Definition: seqdbalias.cpp:437
void WalkNodes(CSeqDB_AliasWalker *walker, const CSeqDBVolSet &volset) const
Apply a simple visitor to each node of the alias node tree.
int GetMembBit(const CSeqDBVolSet &volset) const
Get the membership bit.
Uint8 GetVolumeLength(const CSeqDBVolSet &volset) const
Get the sum of the volume lengths.
bool NeedTotalsScan(const CSeqDBVolSet &volset) const
Check whether a db scan is need to compute correct totals.
Int4 GetMinLength(const CSeqDBVolSet &volset) const
Get the number of sequences available.
void x_Tokenize(const string &dbnames)
Tokenize (split) the list of database names.
Definition: seqdbalias.cpp:85
vector< CRef< CSeqDB_AliasMask > > m_NodeMasks
Mask objects for this node.
Definition: seqdbalias.hpp:963
vector< CSeqDB_BasePath > TVolNames
Type used to store a set of volume names for each node.
Definition: seqdbalias.hpp:925
bool m_HasGiMask
Do we have Gi masks for the top node? (only applicable to the top node)
Definition: seqdbalias.hpp:954
void FindVolumePaths(vector< string > &vols, vector< string > *alias, bool recursive) const
Get the list of volume names.
Definition: seqdbalias.cpp:927
void GetAliasFileValues(TAliasFileValues &afv) const
Get Name/Value Data From Alias Files.
CSeqDB_DirName m_DBPath
The common prefix for the DB paths.
Definition: seqdbalias.hpp:935
void CompleteAliasFileValues(const CSeqDBVolSet &volset)
Add computed values to alias node lacking them.
void x_AppendSubNode(CSeqDB_BasePath &node_path, char prot_nucl, CSeqDBAliasStack &recurse, CSeqDBLockHold &locked)
Append a subnode to this alias node.
Definition: seqdbalias.cpp:711
void GetMaskList(vector< string > &mask_list)
Get Gi-based Mask Names From Alias Files.
vector< CSeqDB_BasePath > m_DBList
Tokenized version of DBLIST.
Definition: seqdbalias.hpp:950
CSeqDBAtlas & m_Atlas
The memory management layer for this SeqDB instance.
Definition: seqdbalias.hpp:932
Int8 GetNumSeqsStats(const CSeqDBVolSet &volset) const
Get the number of sequences available.
TVarList m_Values
List of KEY/VALUE pairs from this alias file.
Definition: seqdbalias.hpp:938
string GetTitle(const CSeqDBVolSet &volset) const
Get the title.
int GetOidMaskType(const CSeqDBVolSet &volset) const
Get the Oid Mask Type.
void x_ReadAliasFile(CSeqDBFileMemMap &lease, const CSeqDB_Path &fname, const char **bp, const char **ep, CSeqDBLockHold &locked)
Get the contents of an alias file.
Definition: seqdbalias.cpp:650
TSubNodeList m_SubNodes
List of subnodes contained by this node.
Definition: seqdbalias.hpp:944
vector< CRef< CSeqDBAliasNode > > TSubNodeList
Type used to store the set of subnodes for this node.
Definition: seqdbalias.hpp:928
Int8 GetNumSeqs(const CSeqDBVolSet &volset) const
Get the number of sequences available.
void x_FindVolumePaths(set< string > &vols, set< string > &alias) const
Build a list of volume names used by the alias node tree.
Definition: seqdbalias.cpp:968
void BuildFilterTree(class CSeqDB_FilterTree &ftree) const
Build the filter tree for this node and its children.
Int8 GetNumOIDs(const CSeqDBVolSet &volset) const
Get the size of the OID range.
bool m_ExpandLinks
Do not expand link when resolving paths.
Definition: seqdbalias.hpp:966
CSeqDBAliasSets & m_AliasSets
Combined alias files.
Definition: seqdbalias.hpp:960
CSeqDBAliasNode(CSeqDBAtlas &atlas, const string &name_list, char prot_nucl, CSeqDBAliasSets &alias_sets, bool expand_links)
Public Constructor.
Definition: seqdbalias.cpp:99
CSeqDB_Path m_ThisName
Filename of this alias file.
Definition: seqdbalias.hpp:947
Uint8 GetTotalLengthStats(const CSeqDBVolSet &volset) const
Get the total length of the set of databases.
CSeqDBAliasSets class.
Definition: seqdbalias.hpp:229
bool x_FindBlastDBPath(const string &dbname, char dbtype, bool exact, string &resolved)
Find a file given a partial path and name.
Definition: seqdbalias.cpp:182
bool ReadAliasFile(const CSeqDB_Path &dbpath, const char **bp, const char **ep, CSeqDBLockHold &locked)
Read an alias file given the path.
Definition: seqdbalias.cpp:595
bool FindBlastDBPath(const CSeqDB_Path &dbname, CSeqDB_Path &resolved)
Find a file given a partial path and name.
Definition: seqdbalias.hpp:285
void x_ReadAliasSetFile(const CSeqDB_Path &group_fname, CSeqDBLockHold &locked)
Read the contents of the group alias file.
Definition: seqdbalias.cpp:546
void x_DbToIndexName(const CSeqDB_Path &fname, CSeqDB_Path &index_name, CSeqDB_FileName &alias_name)
Find the path of a group index from an alias file name.
Definition: seqdbalias.cpp:451
CSeqDBAtlas & m_Atlas
Reference to the memory management layer.
Definition: seqdbalias.hpp:382
map< string, string > m_PathLookup
Caches results of FindBlastDBPath.
Definition: seqdbalias.hpp:394
bool FindAliasPath(const CSeqDB_Path &dbpath, CSeqDB_Path *resolved, CSeqDBLockHold &locked)
Resolve the alias file path.
Definition: seqdbalias.cpp:206
TAliasGroupMap m_Groups
Alias groups.
Definition: seqdbalias.hpp:391
CSeqDBAliasStack.
Definition: seqdbalias.hpp:125
bool Exists(const CSeqDB_Path &name)
Check whether the stack contains the specified string.
Definition: seqdbalias.hpp:143
void Push(const CSeqDB_Path &name)
Push a new string onto to the stack.
Definition: seqdbalias.hpp:159
void Pop()
Remove the top element of the stack.
Definition: seqdbalias.hpp:195
unsigned Size()
Return the number of in-use elements.
Definition: seqdbalias.hpp:202
CSeqDBAtlas class.
Definition: seqdbatlas.hpp:298
bool GetFileSizeL(const string &fname, TIndx &length)
Get size of a file.
Definition: seqdbatlas.cpp:160
bool DoesFileExist(const string &fname)
Check if file exists.
Definition: seqdbatlas.cpp:148
CNcbiStreamoff TIndx
The type used for file offsets.
Definition: seqdbatlas.hpp:302
CSeqDBException.
Definition: seqdbcommon.hpp:73
const char * GetFileDataPtr(const string &fname, TIndx offset)
Get a pointer to the specified offset.
Definition: seqdbatlas.hpp:754
CSeqDBLockHold.
Definition: seqdbatlas.hpp:167
CSeqDBVolSet.
const CSeqDBVol * GetVol(int i) const
Find a volume by index.
int GetNumVols() const
Get the number of volumes.
CSeqDBVol class.
Definition: seqdbvol.hpp:169
const string & GetVolName() const
Get the volume name.
Definition: seqdbvol.hpp:452
int GetNumOIDs() const
Get the number of OIDs for this volume.
Definition: seqdbvol.cpp:2370
string GetTitle() const
Get the volume title.
Definition: seqdbvol.cpp:2375
int GetMinLength() const
Get the length of the smallest sequence in this volume.
Definition: seqdbvol.cpp:2390
int GetMaxLength() const
Get the length of the largest sequence in this volume.
Definition: seqdbvol.cpp:2385
Uint8 GetVolumeLength() const
Get the total length of this volume (in bases).
Definition: seqdbvol.cpp:1880
CSeqDBAliasExplorer class.
Definition: seqdbalias.hpp:90
virtual bool Explore(const TVarList &values)=0
This will be called with the map of key/value pairs associated with this alias file.
virtual void Accumulate(const CSeqDBVol &volumes)=0
This will be called with each CVolume that is in the alias file tree structure (in order of traversal...
Something else yet again etc.
Definition: seqdbfilter.hpp:51
CSeqDBAliasWalker class.
Definition: seqdbalias.hpp:64
virtual void AddString(const string &)=0
This will be called with the value associated with this key in the alias file.
virtual void Accumulate(const CSeqDBVol &)=0
This will be called with each CVolume that is in the alias file tree structure (in order of traversal...
virtual const char * GetFileKey() const =0
Override to provide the alias file KEY name for the type of summary data you want to gather,...
CSeqDB_BaseName.
CSeqDB_BasePath.
CSeqDB_Substring FindBaseName() const
Return the portion of this path representing the base name.
const string & GetBasePathS() const
Return this path as a string.
CSeqDB_Substring FindDirName() const
Return the portion of this path representing the directory.
CSeqDB_DirName.
void Assign(const CSeqDB_Substring &sub)
Assign a new directory name from a substring.
const string & GetDirNameS() const
Get the directory name as a string.
CSeqDB_FileName.
void Assign(const CSeqDB_Substring &sub)
Assign a new filename to this object.
const string & GetFileNameS() const
Get the filename as a string.
CSeqDB_Substring GetFileNameSub() const
Get the filename as a substring.
Tree of nodes describing filtering of database sequences.
void AddVolume(const CSeqDB_BasePath &vol)
Attach a volume to this node.
void SetName(string name)
Set the node name.
void AddFilters(const TFilters &filters)
Add filters to this node.
void AddNode(CRef< CSeqDB_FilterTree > node)
Add a child node to this node.
Test for completeness of GI list alias file values.
virtual void Accumulate(const CSeqDBVol &)
Collect data from the volume.
bool m_NeedScan
True unless/until a node with incomplete totals was found.
virtual bool Explore(const TVarList &vars)
Explore the values in this alias file.
bool NeedScan() const
Returns true if a scan is required.
CSeqDB_IdListValuesTest()
Constructor.
Walker for MAX_SEQ_LENGTH field of alias file.
virtual void AddString(const string &value)
Collect data from an alias file.
virtual const char * GetFileKey() const
This provides the alias file key used for this field.
int m_Value
The maximum sequence length.
CSeqDB_MaxLengthWalker()
Constructor.
int GetMaxLength()
Returns the maximum sequence length.
virtual void Accumulate(const CSeqDBVol &vol)
Collect data from the volume.
Walker for membership bit.
CSeqDB_MembBitWalker()
Constructor.
virtual void AddString(const string &value)
Collect data from an alias file.
int m_Value
The membership bit.
virtual const char * GetFileKey() const
This provides the alias file key used for this field.
virtual void Accumulate(const CSeqDBVol &)
Collect data from the volume.
int GetMembBit() const
Returns the membership bit.
int GetMinLength()
Returns the minimum sequence length.
int m_Value
The minimum sequence length.
virtual void AddString(const string &value)
Collect data from an alias file.
virtual const char * GetFileKey() const
This provides the alias file key used for this field.
virtual void Accumulate(const CSeqDBVol &vol)
Collect data from the volume.
CSeqDB_MinLengthWalker()
Constructor.
Walker for OID count accumulation.
virtual const char * GetFileKey() const
This disables the key; the spaces would not be preserved, so this is a non-matchable string in this c...
Walker for STATS_NSEQ field of alias file.
virtual const char * GetFileKey() const
This does the same calculation as above but uses another key.
virtual void Accumulate(const CSeqDBVol &vol)
STATS_NSEQ field.
Walker for NSEQ field of alias file.
Int8 GetNum() const
Returns the accumulated number of OIDs.
Int8 m_Value
The accumulated number of OIDs.
virtual void Accumulate(const CSeqDBVol &vol)
Collect data from the volume.
virtual void AddString(const string &value)
Collect data from an alias file.
virtual const char * GetFileKey() const
This provides the alias file key used for this field.
CSeqDB_NSeqsWalker()
Constructor.
Walker for oid mask type.
virtual void Accumulate(const CSeqDBVol &)
Collect data from the volume.
virtual const char * GetFileKey() const
This provides the alias file key used for this field.
int m_Value
The oid mask type.
CSeqDB_OidMaskTypeWalker()
Constructor.
virtual void AddString(const string &value)
Collect data from an alias file.
int GetOidMaskType() const
Returns the oid mask type.
CSeqDB_Path.
CSeqDB_Substring FindDirName() const
Returns the portion of this path containing the directory.
const string & GetPathS() const
Get the path as a string.
CSeqDB_Substring FindBasePath() const
Returns the portion of this path containing the base path.
void ReplaceFilename(const CSeqDB_Path &dir_src, const CSeqDB_Substring &fname)
Combines the directory from a path with a filename.
CSeqDB_Substring FindFileName() const
Returns the portion of this path containing the file name.
CSeqDB_Substring FindBaseName() const
Returns the portion of this path containing the base name.
bool Valid() const
Returns true if this object has a value.
String slicing.
void GetString(string &s) const
Return the data by assigning it to a string.
void EraseFront(int n)
Exclude data from the beginning of the string.
Walker for TITLE field of alias file.
Definition: seqdbalias.cpp:991
string GetTitle()
Returns the database title string.
virtual void Accumulate(const CSeqDBVol &vol)
Collect data from a volume.
string m_Value
The title string we are accumulating.
virtual const char * GetFileKey() const
This provides the alias file key used for this field.
Definition: seqdbalias.cpp:994
virtual void AddString(const string &value)
Collect data from an alias file.
Walker for total length stats accumulation.
virtual void Accumulate(const CSeqDBVol &vol)
STATS_TOTLEN field.
virtual const char * GetFileKey() const
This does the same calculation as above but uses another key.
Walker for total length accumulation.
virtual void AddString(const string &value)
Collect data from an alias file.
virtual const char * GetFileKey() const
This provides the alias file key used for this field.
virtual void Accumulate(const CSeqDBVol &vol)
Collect data from the volume.
Uint8 m_Value
The accumulated volume length.
CSeqDB_TotalLengthWalker()
Constructor.
Uint8 GetLength() const
Returns the accumulated volume length.
Walker for volume length accumulation.
virtual const char * GetFileKey() const
This disables the key; the spaces would not be preserved, so this is a non-matchable string in this c...
container_type::const_iterator const_iterator
Definition: map.hpp:53
const_iterator end() const
Definition: map.hpp:152
bool empty() const
Definition: map.hpp:149
const_iterator find(const key_type &key) const
Definition: map.hpp:153
void swap(this_type &m)
Definition: map.hpp:118
Definition: map.hpp:338
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
The NCBI C++ standard methods for dealing with std::string.
static unsigned char depth[2 *(256+1+29)+1]
bool Empty(const CNcbiOstrstream &src)
Definition: fileutil.cpp:523
#define true
Definition: bool.h:35
#define false
Definition: bool.h:36
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
static char tmp[3200]
Definition: utf8.c:42
#define basename(path)
Definition: replacements.h:116
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
@ eFollowLinks
Follow symbolic links.
Definition: ncbimisc.hpp:145
string
Definition: cgiapp.hpp:687
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
static string NormalizePath(const string &path, EFollowLinks follow_links=eIgnoreLinks)
Normalize a path.
Definition: ncbifile.cpp:820
static char GetPathSeparator(void)
Get path separator symbol specific for the current platform.
Definition: ncbifile.cpp:433
static string GetCwd(void)
Get the current working directory.
Definition: ncbifile.cpp:3708
virtual bool Exists(void) const
Check existence of file.
Definition: ncbifile.hpp:4038
virtual void DebugDump(CDebugDumpContext ddc, unsigned int depth) const
Define method for dumping debug information.
Definition: ncbiobj.cpp:988
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
static string SizetToString(size_t value, TNumToStringFlags flags=0, int base=10)
Convert size_t to string.
Definition: ncbistr.cpp:2751
static Int8 StringToInt8(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to Int8.
Definition: ncbistr.cpp:793
static Uint8 StringToUInt8(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to Uint8.
Definition: ncbistr.cpp:873
static unsigned int StringToUInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to unsigned int.
Definition: ncbistr.cpp:642
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:6929
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
constexpr auto sort(_Init &&init)
const struct ncbi::grid::netcache::search::fields::KEY key
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
#define INT4_MAX
largest number represented by signed int
Definition: ncbi_std.h:141
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
static void s_SeqDB_FindOffsets(const char *bp, const char *ep, const string &key, vector< const char * > &offsets)
Find starting points of included data in the group alias file.
Definition: seqdbalias.cpp:486
static void s_SeqDB_ReadLine(const char *bp, const char *ep, string &name, string &value)
Parse a name-value pair.
Definition: seqdbalias.cpp:396
Defines database alias file access classes.
const string kSeqDBGroupAliasFileName
The name of the group alias file expected in each directory. For more documentation,...
bool SeqDB_CompareVolume(const string &volpath1, const string &volpath2)
Compares two volume file names and determines the volume order.
File access objects for CSeqDB.
string SeqDB_FindBlastDBPath(const string &file_name, char dbtype, string *sp, bool exact, CSeqDBAtlas &atlas)
Finds a file in the search path.
void SeqDB_SplitQuoted(const string &dbname, vector< CSeqDB_Substring > &dbs, bool keep_quote=false)
Split a (possibly quoted) list of database names into substrings.
void SeqDB_JoinDelim(string &a, const string &b, const string &delim)
Join two strings with a delimiter.
#define SEQDB_ISEOL(x)
Macro for EOL chars.
void s_SeqDB_QuickAssign(string &dst, const char *bp, const char *ep)
Higher Performance String Assignment.
#define _ASSERT
else result
Definition: token2.c:20
@ eTiList
Filter a BLAST database via TIs (Trace IDs)
Definition: writedb.hpp:612
@ eTaxIdList
Filter a BLAST database via Taxonomy Id list.
Definition: writedb.hpp:614
@ eGiList
Filter a BLAST database via GIs.
Definition: writedb.hpp:611
Modified on Wed Apr 24 14:09:42 2024 by modify_doxy.py rev. 669887