NCBI C++ ToolKit
seqdbalias.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef OBJTOOLS_READERS_SEQDB__SEQDBALIAS_HPP
2 #define OBJTOOLS_READERS_SEQDB__SEQDBALIAS_HPP
3 
4 /* $Id: seqdbalias.hpp 97226 2022-06-28 12:33:29Z fongah2 $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author: Kevin Bealer
30  *
31  */
32 
33 /// @file seqdbalias.hpp
34 /// Defines database alias file access classes.
35 ///
36 /// Defines classes:
37 /// CSeqDB_AliasWalker
38 /// CSeqDBAliasNode
39 /// CSeqDBAliasFile
40 ///
41 /// Implemented for: UNIX, MS-Windows
42 
43 #include <iostream>
44 
47 #include "seqdboidlist.hpp"
50 #include "seqdbvolset.hpp"
52 
54 
55 using namespace ncbi::objects;
56 
57 
58 /// CSeqDB_AliasWalker class
59 ///
60 /// Derivatives of this abstract class can be used to gather summary
61 /// data from the entire include tree of alias files. For details of
62 /// the traversal order, see the WalkNodes documentation.
63 
65 public:
66  /// Destructor
67  virtual ~CSeqDB_AliasWalker() {}
68 
69  /// Override to provide the alias file KEY name for the type of
70  /// summary data you want to gather, for example "NSEQ".
71  virtual const char * GetFileKey() const = 0;
72 
73  /// This will be called with each CVolume that is in the alias
74  /// file tree structure (in order of traversal).
75  virtual void Accumulate(const CSeqDBVol &) = 0;
76 
77  /// This will be called with the value associated with this key in
78  /// the alias file.
79  virtual void AddString (const string &) = 0;
80 };
81 
82 
83 /// CSeqDB_AliasExplorer class
84 ///
85 /// This is similar to the AliasWalker class. Where the AliasWalker
86 /// provides a search key, the AliasExplorer is provided access to the
87 /// name->value map. This allows it to examine relationships between
88 /// values and to do more complex analyses.
89 
91 public:
92  /// Type of set used for KEY/VALUE pairs within each node
94 
95  /// Destructor
96  virtual ~CSeqDB_AliasExplorer() {}
97 
98  /// This will be called with each CVolume that is in the alias
99  /// file tree structure (in order of traversal).
100  ///
101  /// @param volumes
102  /// A volume found during alias file traversal.
103  virtual void Accumulate(const CSeqDBVol & volumes) = 0;
104 
105  /// This will be called with the map of key/value pairs associated
106  /// with this alias file. It should return true if this branch of
107  /// the traversal tree has been satisfied, or false if traversal
108  /// below this point is desirable.
109  ///
110  /// @param values
111  /// The name/value pair map for this node.
112  /// @return
113  /// True if this branch of traversal is done.
114  virtual bool Explore(const TVarList & values) = 0;
115 };
116 
117 
118 /// CSeqDBAliasStack
119 ///
120 /// When expanding a CSeqDBAliasNode, a test must be done to determine
121 /// whether each child node has already been expanded in this branch
122 /// of the traversal. This class provides a set mechanism which
123 /// tracks node ancestry.
124 
126 public:
127  /// Constructor
129  : m_Count(0)
130  {
131  m_NodeNames.resize(4);
132  }
133 
134  /// Check whether the stack contains the specified string.
135  ///
136  /// This iterates over the vector of strings and returns true if
137  /// the specified string is found.
138  ///
139  /// @param name
140  /// The alias file base name to add.
141  /// @return
142  /// True if the string was found in the stack.
143  bool Exists(const CSeqDB_Path & name)
144  {
145  for(unsigned i=0; i<m_Count; i++) {
146  if (m_NodeNames[i] == name) {
147  return true;
148  }
149  }
150  return false;
151  }
152 
153  /// Push a new string onto to the stack.
154  ///
155  /// The specified string is added to the stack.
156  ///
157  /// @param name
158  /// The alias file base name to add.
159  void Push(const CSeqDB_Path & name)
160  {
161  // This design aims at efficiency (cycles, not memory).
162  // Specifically, it tries to accomplish the following:
163  //
164  // 1. The m_NodeNames vector will be resized at most ln2(N)
165  // times where N is the maximal DEPTH of traversal.
166  //
167  // 2. Strings are not deallocated on return from lower depths,
168  // instead they are left in place as buffers for future
169  // assignments.
170  //
171  // 3. A particular element of the string array should be
172  // reallocated at most ln2(M/16) times, where M is the
173  // maximal length of the string, regardless of the number
174  // of traversals through that node-depth.
175  //
176  // The vector size is increased with resize(), in a doubling
177  // pattern, and string data is reserve()d. This code will
178  // maintain vector.size == vector.capacity at all times. If
179  // vector.size fluctuated with each adding and removing of an
180  // element, the strings between old-size and new-size would be
181  // destructed, losing existing allocations. With strings, the
182  // resize method might cause blanking of memory, but the
183  // reserve method should not. In either case, the string size
184  // will be set by the assign() method, and the true vector
185  // usage is tracked via the m_Count field.
186 
187  if (m_NodeNames.size() == m_Count) {
188  m_NodeNames.resize(m_NodeNames.size() * 2);
189  }
190 
191  m_NodeNames[m_Count++].Assign(name.GetPathS());
192  }
193 
194  /// Remove the top element of the stack
195  void Pop()
196  {
197  _ASSERT(m_Count);
198  m_Count--;
199  }
200 
201  /// Return the number of in-use elements.
202  unsigned Size()
203  {
204  return m_Count;
205  }
206 
207 private:
208  /// List of node names.
209  vector<CSeqDB_Path> m_NodeNames;
210 
211  /// Number of in-use node names.
212  unsigned m_Count;
213 
214  /// Disable copy operator.
215  CSeqDBAliasStack & operator =(const CSeqDBAliasStack &);
216 
217  /// Disable copy constructor.
219 };
220 
221 
222 /// CSeqDBAliasSets class
223 ///
224 /// This acts as a layer between the alias processing code and the
225 /// atlas code in the case where a combined alias is used. It
226 /// intercepts calls to find and use individual alias files and uses
227 /// combined alias files instead.
228 
230 public:
231  /// Constructor
233  : m_Atlas(atlas)
234  {
235  }
236 
237  /// Read an alias file given the path.
238  ///
239  /// This finds an alias file, or an equivalent section of a group
240  /// alias file, given a filename. The contents of the file (or of
241  /// the corresponding part of the group file) are returned as a
242  /// pair of pointers to the start and end of the buffer stored in
243  /// the string that contains this data. This code triggers the
244  /// parsing of the entire group alias file if it exists and has
245  /// not hitherto been read. Group alias files could replace
246  /// individual alias files, but at the moment, both will always be
247  /// present. If the group alias file does exist, it is assumed to
248  /// be authoritative and complete.
249  ///
250  /// @param dbpath The name of the alias file (if it exists).
251  /// @param bp The start of the alias file contents. [out]
252  /// @param ep The end of the alias file contents. [out]
253  /// @param locked The lock holder object for this thread. [in]
254  /// @return True if an alias file (or equivalent data) was found.
255  bool ReadAliasFile(const CSeqDB_Path & dbpath,
256  const char ** bp,
257  const char ** ep,
258  CSeqDBLockHold & locked);
259 
260  /// Resolve the alias file path.
261  ///
262  /// Given a partial path and name designating a particular db
263  /// alias file, this method finds the absolute path of the group
264  /// index file for that alias file, or if that is not found, the
265  /// individual alias file.
266  ///
267  /// @param dbpath The path to the file. [in]
268  /// @param resolved The resolved path is returned here. [out]
269  /// @param locked The lock holder object for this thread. [in]
270  /// @return True if the path was found.
271  bool FindAliasPath(const CSeqDB_Path & dbpath,
272  CSeqDB_Path * resolved,
273  CSeqDBLockHold & locked);
274 
275  /// Find a file given a partial path and name.
276  ///
277  /// Given a path designating a particular disk file, this method
278  /// finds the absolute path of that file. The filename is assumed
279  /// to contain the correct extension.
280  ///
281  /// @param dbname The partial path to the file, with extension. [in]
282  /// @param resolved The resolved path is returned here. [out]
283  /// @param locked The lock holder object for this thread. [in]
284  /// @return True if the path was found.
286  CSeqDB_Path & resolved)
287  {
288  string resolved_str;
289 
290  if (x_FindBlastDBPath(dbname.GetPathS(),
291  '-',
292  true,
293  resolved_str)) {
294 
295  resolved.Assign(resolved_str);
296  return true;
297  }
298 
299  return false;
300  }
301 
302  /// Find a file given a partial path and name.
303  ///
304  /// Given a path designating a particular disk file, this method
305  /// finds the absolute path of that file. The filename is assumed
306  /// to not contain an extension. Instead, the user indicates the
307  /// type of database (p or n) and the function will search for
308  /// that kind of database volume or alias file ('pin' or 'pal' for
309  /// protein, 'nin' or 'nal' for nucleotide.)
310  ///
311  /// @param dbname The partial path to the file. [in]
312  /// @param dbtype The type of sequences used. [in]
313  /// @param resolved The resolved path is returned here. [out]
314  /// @param locked The lock holder object for this thread. [in]
315  /// @return True if the path was found.
317  char dbtype,
318  CSeqDB_BasePath & resolved)
319  {
320  string resolved_str;
321 
322  if (x_FindBlastDBPath(dbname.GetBasePathS(),
323  dbtype,
324  false,
325  resolved_str)) {
326 
327  resolved.Assign(resolved_str);
328  return true;
329  }
330 
331  return false;
332  }
333 
334 private:
335  /// Find a file given a partial path and name.
336  ///
337  /// Given a path designating a particular disk file, this method
338  /// finds the absolute path of that file. The user indicates the
339  /// type of database (p or n) to find appropriate extensions for
340  /// index or alias files, or specifies exact=true if the filename
341  /// already has the correct extension. Before the filesystem is
342  /// consulted, however, the m_PathLookup map is checked to see if
343  /// an answer to this query already exists.
344  ///
345  /// @param dbname The partial path to the file. [in]
346  /// @param dbtype The type of sequences in the DB. [in]
347  /// @param exact Specify true if dbname contains the extension. [in]
348  /// @param resolved The resolved path is returned here. [out]
349  /// @param locked The lock holder object for this thread. [in]
350  /// @return True if the path was found.
351  bool x_FindBlastDBPath(const string & dbname,
352  char dbtype,
353  bool exact,
354  string & resolved);
355 
356  /// Find the path of a group index from an alias file name.
357  ///
358  /// This method takes the path of an alias file as input. The
359  /// filename is extracted and returned in alias_name. The name
360  /// of the associated group index file is computed and returned
361  /// in index_name. This consists of the directory of the alias
362  /// file combined with the standard group index filename.
363  ///
364  /// @param fname Location of the individual alias file. [in]
365  /// @param index_name Location of the group index file. [out]
366  /// @param alias_name Filename portion of the alias file. [out]
367  void x_DbToIndexName(const CSeqDB_Path & fname,
368  CSeqDB_Path & index_name,
369  CSeqDB_FileName & alias_name);
370 
371  /// Read the contents of the group alias file.
372  ///
373  /// This reads a group alias file. The individual alias file
374  /// contents are stored in m_Groups, but are not parsed yet.
375  ///
376  /// @param group_fname The filename for the group file. [in]
377  /// @param locked The lock holder object for this thread. [in]
378  void x_ReadAliasSetFile(const CSeqDB_Path & group_fname,
379  CSeqDBLockHold & locked);
380 
381  /// Reference to the memory management layer.
383 
384  /// Aggregated alias file - maps filename to file contents.
386 
387  /// Full index filename to aggregated alias file.
389 
390  /// Alias groups.
392 
393  /// Caches results of FindBlastDBPath
395 
396  /// Disable copy operator.
397  CSeqDBAliasSets & operator =(const CSeqDBAliasSets &);
398 
399  /// Disable copy constructor.
401 };
402 
403 /// CSeqDBAliasNode class
404 ///
405 /// This is one node of the alias node tree, an n-ary tree which
406 /// represents the relationships of the alias files and volumes used
407 /// by a CSeqDB instance. The children of this node are the other
408 /// alias files mentioned in this node's DBLIST key. Each node may
409 /// also have volumes, which are not strictly children (not the same
410 /// species), but are treated that way for the purpose of some
411 /// computations. The volumes are the non-alias objects mentioned in
412 /// the DBLIST, and are the containers for actual sequence, header,
413 /// and summary data.
414 ///
415 /// As a special case, an alias node which mentions its own name in
416 /// the DBLIST is interpreted as referring to an index file with the
417 /// same base name and path. Alias node trees can be quite complex
418 /// and nodes can share database volumes; sometimes there are hundreds
419 /// of nodes which refer to only a few underlying database volumes.
420 ///
421 /// Nodes have two primary purposes: to override summary data (such as
422 /// the "title" field) which would otherwise be taken from the volume,
423 /// and to aggregate other alias files or volumes. The top level
424 /// alias node is virtual - it does not refer to a real file on disk.
425 /// Its purpose is to aggregate the space-separated list of databases
426 /// which are provided to the CSeqDB constructor.
427 
428 class CSeqDBAliasNode : public CObject {
429  /// Type of set used for KEY/VALUE pairs within each node
431 
432  /// Import type to allow shorter name.
434 
435 public:
436  /// Public Constructor
437  ///
438  /// This is the user-visible constructor, which builds the top level
439  /// node in the dbalias node tree. This design effectively treats the
440  /// user-input database list as if it were an alias file containing
441  /// only the DBLIST specification.
442  ///
443  /// @param atlas
444  /// The memory management layer.
445  /// @param name_list
446  /// The space delimited list of database names.
447  /// @param prot_nucl
448  /// The type of sequences stored here.
449  /// @param alias_sets
450  /// An alias file caching and combining layer.
451  /// @param expand_links
452  /// Indicate if soft links should be expanded
454  const string & name_list,
455  char prot_nucl,
456  CSeqDBAliasSets & alias_sets,
457  bool expand_links);
458 
459  /// Get the list of volume names
460  ///
461  /// The alias node tree is iterated to produce a list of all
462  /// volume names. This list will be sorted and unique.
463  ///
464  /// @param vols
465  /// The returned set of volume names
466  /// @param alias
467  /// The returned set of alias names
468  /// @param recursive
469  /// If true will descend the alias tree to the volume nodes
470  void FindVolumePaths(vector<string> & vols, vector<string> * alias, bool recursive) const;
471 
472  /// Get the title
473  ///
474  /// This iterates this node and possibly subnodes of it to build
475  /// and return a title string. Alias files may override this
476  /// value (stopping traversal at that depth).
477  ///
478  /// @param volset
479  /// The set of database volumes
480  /// @return
481  /// A string describing the database
482  string GetTitle(const CSeqDBVolSet & volset) const;
483 
484  /// Get the shortest sequence length
485  ///
486  /// This iterates this node and possibly subnodes of it to compute
487  /// the shortest sequence length.
488  ///
489  /// @param volset
490  /// The set of database volumes
491  /// @return
492  /// The shortest sequence length
493  Int4 GetMinLength(const CSeqDBVolSet & volset) const;
494 
495  /// Get the number of sequences available
496  ///
497  /// This iterates this node and possibly subnodes of it to compute
498  /// the number of sequences available here. Alias files may
499  /// override this value (stopping traversal at that depth). It is
500  /// normally used to provide information on how many OIDs exist
501  /// after filtering has been applied.
502  ///
503  /// @param volset
504  /// The set of database volumes
505  /// @return
506  /// The number of included sequences
507  Int8 GetNumSeqs(const CSeqDBVolSet & volset) const;
508 
509  /// Get the number of sequences available
510  ///
511  /// This iterates this node and possibly subnodes of it to compute
512  /// the number of sequences available here. Alias files may
513  /// override this value (stopping traversal at that depth). It is
514  /// normally used to provide information on how many OIDs exist
515  /// after filtering has been applied.
516  ///
517  /// @param volset
518  /// The set of database volumes
519  /// @return
520  /// The number of included sequences
521  Int8 GetNumSeqsStats(const CSeqDBVolSet & volset) const;
522 
523  /// Get the size of the OID range
524  ///
525  /// This iterates this node and possibly subnodes of it to compute
526  /// the number of sequences in all volumes as encountered in
527  /// traversal. Alias files cannot override this value. Filtering
528  /// does not affect this value.
529  ///
530  /// @param volset
531  /// The set of database volumes
532  /// @return
533  /// The number of OIDs found during traversal
534  Int8 GetNumOIDs(const CSeqDBVolSet & volset) const;
535 
536  /// Get the total length of the set of databases
537  ///
538  /// This iterates this node and possibly subnodes of it to compute
539  /// the total length of all sequences in all volumes included in
540  /// the database. This may count volumes several times (depending
541  /// on alias tree structure). Alias files can override this value
542  /// (stopping traversal at that depth). It is normally used to
543  /// describe the amount of sequence data remaining after filtering
544  /// has been applied.
545  ///
546  /// @param volset
547  /// The set of database volumes
548  /// @return
549  /// The total length of all included sequences
550  Uint8 GetTotalLength(const CSeqDBVolSet & volset) const;
551 
552  /// Get the total length of the set of databases
553  ///
554  /// This iterates this node and possibly subnodes of it to compute
555  /// the total length of all sequences in all volumes included in
556  /// the database. This may count volumes several times (depending
557  /// on alias tree structure). Alias files can override this value
558  /// (stopping traversal at that depth). It is normally used to
559  /// describe the amount of sequence data remaining after filtering
560  /// has been applied.
561  ///
562  /// @param volset
563  /// The set of database volumes
564  /// @return
565  /// The total length of all included sequences
566  Uint8 GetTotalLengthStats(const CSeqDBVolSet & volset) const;
567 
568  /// Get the sum of the volume lengths
569  ///
570  /// This iterates this node and possibly subnodes of it to compute
571  /// the total length of all sequences in all volumes as
572  /// encountered in traversal. This may count volumes several
573  /// times (depending on alias tree structure). Alias files cannot
574  /// override this value.
575  ///
576  /// @param volset
577  /// The set of database volumes
578  /// @return
579  /// The sum of all volumes lengths as traversed
580  Uint8 GetVolumeLength(const CSeqDBVolSet & volset) const;
581 
582  /// Get the membership bit
583  ///
584  /// This iterates this node and possibly subnodes of it to find
585  /// the membership bit, if there is one. If more than one alias
586  /// node provides a membership bit, only one will be used. This
587  /// value can only be found in alias files (volumes do not have a
588  /// single internal membership bit).
589  ///
590  /// @param volset
591  /// The set of database volumes
592  /// @return
593  /// The membership bit, or zero if none was found.
594  int GetMembBit(const CSeqDBVolSet & volset) const;
595 
596  /// Check whether a db scan is needed to compute correct totals.
597  ///
598  /// This traverses this node and its subnodes to determine whether
599  /// accurate estimation of the total number of sequences and bases
600  /// requires a linear time scan of the index files.
601  ///
602  /// @param volset
603  /// The set of database volumes.
604  /// @return
605  /// True if the database scan is required.
606  bool NeedTotalsScan(const CSeqDBVolSet & volset) const;
607 
608  /// Apply a simple visitor to each node of the alias node tree
609  ///
610  /// This iterates this node and possibly subnodes of it. If the
611  /// alias file contains an entry with the key returned by
612  /// walker.GetFileKey(), the string will be sent to walker via the
613  /// AddString() method. If the alias file does not provide the
614  /// value, the walker object will be applied to each subnode (by
615  /// calling WalkNodes), and then to each volume of the tree by
616  /// calling the Accumulate() method on the walker object. Each
617  /// type of summary data has its own properties, so there is a
618  /// CSeqDB_AliasWalker derived class for each type of summary data
619  /// that needs this kind of traversal. This technique is referred
620  /// to as the "visitor" design pattern.
621  ///
622  /// @param walker
623  /// The visitor object to recursively apply to the tree.
624  /// @param volset
625  /// The set of database volumes
626  void WalkNodes(CSeqDB_AliasWalker * walker,
627  const CSeqDBVolSet & volset) const;
628 
629  /// Apply a complex visitor to each node of the alias node tree
630  ///
631  /// This iterates this node and possibly subnodes of it. At each
632  /// node, the map of keys to values is provided to the explorer
633  /// object via the Explore() method. If the explorer object
634  /// returns true, this branch of the tree has been pruned and
635  /// traversal will not continue downward. If it returns false,
636  /// traversal continues down through the tree. If traversal was
637  /// not pruned, and volumes exist for this node, the Accumulate
638  /// method is called for each volume after traversal through
639  /// subnodes has been done. Compared to the version that takes a
640  /// CSeqDB_AliasWalker, this version of this method allows more
641  /// flexibility because the explorer object has access to the
642  /// entire map of name/value pairs.
643  ///
644  /// @param explorer
645  /// The visitor object to recursively apply to the tree.
646  /// @param volset
647  /// The set of database volumes
648  void WalkNodes(CSeqDB_AliasExplorer * explorer,
649  const CSeqDBVolSet & volset) const;
650 
651  /// Set filtering options for all volumes
652  ///
653  /// This method applies all of this alias node's filtering options
654  /// to all of its associated volumes (and subnodes, for GI lists).
655  /// It then iterates over subnodes, recursively calling SetMasks()
656  /// to apply filtering options throughout the alias node tree.
657  /// The virtual OID lists are not built as a result of this
658  /// process, but the data necessary for virtual OID construction
659  /// is copied to the volume objects.
660  ///
661  /// @param volset
662  /// The database volume set
663  void SetMasks(CSeqDBVolSet & volset);
664 
665  /// Get Name/Value Data From Alias Files
666  ///
667  /// SeqDB treats each alias file as a map from a variable name to
668  /// a value. This method will return a map from the basename of
669  /// the filename of each alias file, to a mapping from variable
670  /// name to value for each entry in that file. For example, the
671  /// value of the "DBLIST" entry in the "wgs.nal" file would be
672  /// values["wgs"]["DBLIST"]. The lines returned have been
673  /// processed somewhat by SeqDB, including normalizing tabs to
674  /// whitespace, trimming leading and trailing whitespace, and
675  /// removal of comments and other non-value lines. Care should be
676  /// taken when using the values returned by this method. SeqDB
677  /// uses an internal "virtual" alias file entry to aggregate the
678  /// values passed into SeqDB by the user. This mapping uses a
679  /// filename of "-" and contains a single entry mapping "DBLIST"
680  /// to SeqDB's database name input. This entry is the root of the
681  /// alias file inclusion tree. Also note that alias files that
682  /// appear in several places in the alias file inclusion tree only
683  /// have one entry in the returned map (and are only parsed once
684  /// by SeqDB).
685  ///
686  /// @param afv
687  /// The alias file values will be returned here.
688  void GetAliasFileValues(TAliasFileValues & afv) const;
689 
690  /// Add computed values to alias node lacking them.
691  ///
692  /// Some of the standard alias file key/value pairs are, in fact,
693  /// designed to override values found in the corresponding
694  /// volumes. The callers of the GetAliasFileValues() method may
695  /// want to use these values on a per-alias-file basis. But
696  /// these values are only present in the alias file if the author
697  /// of that file wanted to replace the value found in the volume.
698  ///
699  /// This method iterates over the alias file nodes, filling in
700  /// values found in the volumes, in those cases where the alias
701  /// file did not override the value. Only those values that have
702  /// been useful to a user of CSeqDB are added via this method,
703  /// which so far only includes the TITLE.
704  ///
705  /// @param volset The set of volumes for this database.
706  void CompleteAliasFileValues(const CSeqDBVolSet & volset);
707 
708  /// Build the filter tree for this node and its children.
709  /// @param ftree The result is returned here.
710  void BuildFilterTree(class CSeqDB_FilterTree & ftree) const;
711 
712  /// Computes the masking information for each alias node.
713  ///
714  /// This method processes each alias file node to construct a
715  /// summary of the kind of OID filtering applied there. The
716  /// has_filters parameter will be set to true if any filtering
717  /// was done.
718  ///
719  /// @param has_filters Will be set true if any filtering is done.
720  void ComputeMasks(bool & has_filters);
721 
722  /// Get Gi-based Mask Names From Alias Files
723  ///
724  /// This will return the MASKLIST field of the alias node.
725  ///
726  /// @param mask_list
727  /// The mask names will be returned here.
728  void GetMaskList(vector <string> & mask_list);
729 
730  /// Is the top node alias file associated with Gi based masks?
731  ///
732  /// This will return true if the MASKLIST field of the top alias
733  /// node is set.
734  ///
735  /// @return TRUE if MASKLIST field is present
736  bool HasGiMask() const
737  {
738  return m_HasGiMask;
739  };
740 
741  /// Get the Oid Mask Type
742  ///
743  /// This iterates the alias node tree to find the oid mask type,
744  /// if there is one.
745  ///
746  /// @param volset
747  /// The set of database volumes
748  /// @return
749  /// Oid mask type, or zero if none was found.
750  int GetOidMaskType(const CSeqDBVolSet & volset) const;
751 
752 private:
753  /// Private Constructor
754  ///
755  /// This constructor is used to build the alias nodes other than
756  /// the topmost node. It is private, because such nodes are only
757  /// constructed via internal mechanisms of this class. One
758  /// potential issue for alias node hierarchies is that it is easy
759  /// to (accidentally) construct mutually recursive alias
760  /// configurations. To prevent an infinite recursion in this
761  /// case, this constructor takes a set of strings, which indicate
762  /// all the nodes that have already been constructed. It is
763  /// passed by value (copied) because the same node can be used,
764  /// legally and safely, in more than one branch of the same alias
765  /// node tree. If the node to build is already in this set, the
766  /// constructor will throw an exception. As a special case, if a
767  /// name in a DBLIST line is the same as the node it is in, it is
768  /// assumed to be a volume name (even though an alias file exists
769  /// with that name), so this will not trigger the cycle detection
770  /// exception.
771  ///
772  /// @param atlas
773  /// The memory management layer
774  /// @param dbpath
775  /// The working directory for relative paths in this node
776  /// @param dbname
777  /// The name of this node
778  /// @param prot_nucl
779  /// Indicates whether protein or nucleotide sequences will be used
780  /// @param recurse
781  /// Node history for cycle detection
782  /// @param locked
783  /// The lock holder object for this thread. [in]
784  /// @param alias_sets
785  /// An alias file caching and combining layer.
786  /// @param expand_links
787  /// Indicate if soft links should be expanded
789  const CSeqDB_DirName & dbpath,
790  const CSeqDB_BaseName & dbname,
791  char prot_nucl,
792  CSeqDBAliasStack & recurse,
793  CSeqDBLockHold & locked,
794  CSeqDBAliasSets & alias_sets,
795  bool expand_links);
796 
797  /// Read the alias file
798  ///
799  /// This function reads the alias file from the atlas, parsing the
800  /// lines and storing the KEY/VALUE pairs in this node. It
801  /// ignores KEY values that are not supported in SeqDB, although
802  /// currently SeqDB should support all of the defined KEYs.
803  ///
804  /// @param fn
805  /// The name of the alias file
806  /// @param locked
807  /// The lock holder object for this thread. [in]
808  void x_ReadValues(const CSeqDB_Path & fn, CSeqDBLockHold & locked);
809 
810  /// Read one line of the alias file
811  ///
812  /// This method parses the specified character string, storing the
813  /// results in the KEY/VALUE map in this node. The input string
814  /// is specified as a begin/end pair of pointers. If the string
815  /// starts with "#", the function has no effect. Otherwise, if
816  /// there are tabs in the input string, they are silently
817  /// converted to spaces, and the part of the string before the
818  /// first space after the first nonspace is considered to be the
819  /// key. The rest of the line (with initial and trailing spaces
820  /// removed) is taken as the value.
821  ///
822  /// @param bp
823  /// A pointer to the first character of the line
824  /// @param ep
825  /// A pointer to (one past) the last character of the line
826  /// @param name_s
827  /// The variable name from the file
828  /// @param value_s
829  /// The value from the file
830  void x_ReadLine(const char * bp,
831  const char * ep,
832  string & name_s,
833  string & value_s);
834 
835  /// Expand a node of the alias node tree recursively
836  ///
837  /// This method expands a node of the alias node tree, recursively
838  /// building the tree from the specified node downward. (This
839  /// method and the private version of the constructor are mutually
840  /// recursive.) The cyclic tree check is done, and paths of these
841  /// components are resolved. The alias file is parsed, and for
842  /// each member of the DBLIST set, a subnode is constructed or a
843  /// volume name is stored (if the element is the same as this
844  /// node's name).
845  ///
846  /// @param this_name
847  /// The name of this node
848  /// @param prot_nucl
849  /// Indicates whether this is a protein or nucleotide database.
850  /// @param recurse
851  /// Set of all ancestor nodes for this node.
852  /// @param locked
853  /// The lock holder object for this thread. [in]
854  void x_ExpandAliases(const CSeqDB_BasePath & this_name,
855  char prot_nucl,
856  CSeqDBAliasStack & recurse,
857  CSeqDBLockHold & locked);
858 
859  /// Build a list of volume names used by the alias node tree
860  ///
861  /// This adds the volume names used here to the specified set.
862  /// The same method is called on all subnodes, so all volumes from
863  /// this point of the tree down will be added by this call.
864  ///
865  /// @param vols
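866  /// The set of strings to receive the volume names (the alias parameter is not documented here; presumably it receives alias file names -- TODO confirm)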
867  void x_FindVolumePaths(set<string> & vols, set<string> & alias) const;
868 
869  /// Name resolution
870  ///
871  /// This finds the path for each name in m_DBList, and resolves
872  /// the path for each. This is only done during construction of
873  /// the topmost node. Names supplied by the end user get this
874  /// treatment, lower level nodes will have absolute or relative
875  /// paths to specify the database locations.
876  ///
877  /// After all names are resolved, the longest common prefix (of
878  /// all names) is found and moved to the dbname_path variable (and
879  /// removed from each individual name).
880  ///
881  /// @param prot_nucl
882  /// Indicates whether this is a protein or nucleotide database
883  /// @param locked
884  /// The lock holder object for this thread. [in]
885  void x_ResolveNames(char prot_nucl, CSeqDBLockHold & locked);
886 
887  /// Get the contents of an alias file.
888  ///
889  /// Fetches the lines belonging to an alias file, either directly
890  /// or via a combined alias file.
891  void x_ReadAliasFile(CSeqDBFileMemMap & lease,
892  const CSeqDB_Path & fname,
893  const char ** bp,
894  const char ** ep,
895  CSeqDBLockHold & locked);
896 
897  /// Tokenize (split) the list of database names.
898  ///
899  /// The provided string is split using the space character as a
900  /// delimiter. The resulting names are added to the m_DBList
901  /// vector and will become sub-nodes or opened as volumes.
902  ///
903  /// @param dbnames Space-separated list of database names.
904  void x_Tokenize(const string & dbnames);
905 
906  /// Append a subnode to this alias node.
907  ///
908  /// This method appends a new subnode to this node of the alias
909  /// node tree. It is called by the x_ExpandAliases method.
910  ///
911  /// @param node_path
912  /// The base path of the new node's volume.
913  /// @param prot_nucl
914  /// Indicates whether this is a protein or nucleotide database.
915  /// @param recurse
916  /// Set of all ancestor nodes for this node.
917  /// @param locked
918  /// The lock holder object for this thread. [in]
919  void x_AppendSubNode(CSeqDB_BasePath & node_path,
920  char prot_nucl,
921  CSeqDBAliasStack & recurse,
922  CSeqDBLockHold & locked);
923 
924  /// Type used to store a set of volume names for each node
925  typedef vector<CSeqDB_BasePath> TVolNames;
926 
927  /// Type used to store the set of subnodes for this node
928  typedef vector< CRef<CSeqDBAliasNode> > TSubNodeList;
929 
930 
931  /// The memory management layer for this SeqDB instance
933 
934  /// The common prefix for the DB paths.
936 
937  /// List of KEY/VALUE pairs from this alias file
939 
940  /// Set of volume names associated with this node
942 
943  /// List of subnodes contained by this node
945 
946  /// Filename of this alias file
948 
949  /// Tokenized version of DBLIST
950  vector<CSeqDB_BasePath> m_DBList;
951 
952  /// Do we have Gi masks for the top node?
953  /// (only applicable to the top node)
955 
956  /// Should we skip local DB search for this DBLIST?
957  vector<bool> m_SkipLocal;
958 
959  /// Combined alias files.
961 
962  /// Mask objects for this node.
963  vector< CRef<CSeqDB_AliasMask> > m_NodeMasks;
964 
965  /// Whether soft links are expanded when resolving paths
967 
968  /// Disable copy operator.
969  CSeqDBAliasNode & operator =(const CSeqDBAliasNode &);
970 
971  /// Disable copy constructor.
973 };
974 
975 
976 /// CSeqDBAliasFile class
977 ///
978 /// This class is an interface to the alias node tree. It provides
979 /// functionality to classes like CSeqDBImpl (and others) that do not
980 /// need to understand alias walkers, nodes, and tree traversal.
981 
982 class CSeqDBAliasFile : public CObject {
983  /// Import type to allow shorter name.
985 
986 public:
987  /// Constructor
988  ///
989  /// This builds a tree of CSeqDBAliasNode objects from a
990  /// space-seperated list of database names. Every database
991  /// instance has at least one node, because the top most node is
992  /// an "artificial" node, which serves only to aggregate the list
993  /// of databases specified to the constructor. The tree is
994  /// constructed in a depth first manner, and will be complete upon
995  /// return from this constructor.
996  ///
997  /// @param atlas
998  /// The SeqDB memory management layer.
999  /// @param name_list
1000  /// A space-separated list of database names.
1001  /// @param prot_nucl
1002  /// Indicates whether the database is protein or nucleotide.
1003  /// @param expand_links
1004  /// Indicates whether the soft links should be expanded
1005  CSeqDBAliasFile(CSeqDBAtlas & atlas,
1006  const string & name_list,
1007  char prot_nucl,
1008  bool expand_links = true);
1009 
1010  /// Get the list of volume names.
1011  ///
1012  /// This method returns a reference to the vector of volume names.
1013  /// The vector will contain all volume names mentioned in any of
1014  /// the DBLIST lines in the hierarchy of the alias node tree. The
1015  /// volume names do not include an extension (such as .pin or .nin).
1016  ///
1017  /// @return
1018  /// Reference to the internal vector of volume names.
1019  const vector<string> & GetVolumeNames() const
1020  {
1021  return m_VolumeNames;
1022  }
1023 
1024  /// Find the base names of volumes.
1025  ///
1026  /// This method populates the vector with volume names.
1027  ///
1028  /// @param vols The vector to be populated with volume names
1029  /// @param alias Optional output; if non-null in the recursive case it receives the cached alias names, otherwise it is passed through to the top node
1030  /// @param recursive If true, vols will include all volume names within the alias node tree (taken from the cached results).
1031  /// Otherwise, only the top-node volume names are included
1032  void FindVolumePaths(vector<string> & vols, vector<string> * alias, bool recursive) const
1033  {
1034  if (recursive) {
1035  // use the cached results (m_VolumeNames / m_AliasNames)
1036  vols = m_VolumeNames;
1037  if (alias) {
1038  *alias = m_AliasNames;
1039  }
1040  }
1041  else {
1042  m_Node->FindVolumePaths(vols, alias, recursive); // not cached: delegate to the top node
1043  }
1044  };
1045 
1046  /// Get the title
1047  ///
1048  /// This iterates the alias node tree to build and return a title
1049  /// string. Alias files may override this value (stopping
1050  /// traversal at that depth).
1051  ///
1052  /// @param volset
1053  /// The set of database volumes
1054  /// @return
1055  /// A string describing the database
1056  string GetTitle(const CSeqDBVolSet & volset) const;
1057 
1058  /// Get the shortest sequence length
1059  ///
1060  /// This iterates this node and possibly subnodes of it to compute
1061  /// the shortest sequence length.
1062  ///
1063  /// @param volset
1064  /// The set of database volumes
1065  /// @return
1066  /// The shortest sequence length
1067  Int4 GetMinLength(const CSeqDBVolSet & volset) const;
1068 
1069  /// Get the number of sequences available
1070  ///
1071  /// This iterates the alias node tree to compute the number of
1072  /// sequences available here. Alias files may override this value
1073  /// (stopping traversal at that depth). It is normally used to
1074  /// provide information on how many OIDs exist after filtering has
1075  /// been applied.
1076  ///
1077  /// @param volset
1078  /// The set of database volumes
1079  /// @return
1080  /// The number of included sequences
1081  Int8 GetNumSeqs(const CSeqDBVolSet & volset) const;
1082 
1083  /// Get the number of sequences available
1084  ///
1085  /// This iterates the alias node tree to compute the number of
1086  /// sequences available here. Alias files may override this value
1087  /// (stopping traversal at that depth). It is normally used to
1088  /// provide information on how many OIDs exist after filtering has
1089  /// been applied. This is like GetNumSeqs, but uses STATS_NSEQ.
1090  ///
1091  /// @param volset
1092  /// The set of database volumes
1093  /// @return
1094  /// The number of included sequences
1095  Int8 GetNumSeqsStats(const CSeqDBVolSet & volset) const;
1096 
1097  /// Get the size of the OID range
1098  ///
1099  /// This iterates the alias node tree to compute the number of
1100  /// sequences in all volumes as encountered in traversal. Alias
1101  /// files cannot override this value. Filtering does not affect
1102  /// this value.
1103  ///
1104  /// @param volset
1105  /// The set of database volumes
1106  /// @return
1107  /// The number of OIDs found during traversal
1108  Int8 GetNumOIDs(const CSeqDBVolSet & volset) const;
1109 
1110  /// Get the total length of the set of databases
1111  ///
1112  /// This iterates the alias node tree to compute the total length
1113  /// of all sequences in all volumes included in the database.
1114  /// This may count volumes several times (depending on alias tree
1115  /// structure). Alias files can override this value (stopping
1116  /// traversal at that depth). It is normally used to describe the
1117  /// amount of sequence data remaining after filtering has been
1118  /// applied.
1119  ///
1120  /// @param volset
1121  /// The set of database volumes
1122  /// @return
1123  /// The total length of all included sequences
1124  Uint8 GetTotalLength(const CSeqDBVolSet & volset) const;
1125 
1126  /// Get the total length of the set of databases
1127  ///
1128  /// This iterates the alias node tree to compute the total length
1129  /// of all sequences in all volumes included in the database.
1130  /// This may count volumes several times (depending on alias tree
1131  /// structure). Alias files can override this value (stopping
1132  /// traversal at that depth). It is normally used to describe the
1133  /// amount of sequence data remaining after filtering has been
1134  /// applied. This is like GetTotalLength but uses STATS_TOTLEN.
1135  ///
1136  /// @param volset
1137  /// The set of database volumes
1138  /// @return
1139  /// The total length of all included sequences
1140  Uint8 GetTotalLengthStats(const CSeqDBVolSet & volset) const;
1141 
1142  /// Get the sum of the volume lengths
1143  ///
1144  /// This iterates the alias node tree to compute the total length
1145  /// of all sequences in all volumes as encountered in traversal.
1146  /// This may count volumes several times (depending on alias tree
1147  /// structure). Alias files cannot override this value.
1148  ///
1149  /// @param volset
1150  /// The set of database volumes
1151  /// @return
1152  /// The sum of all volumes lengths as traversed
1153  Uint8 GetVolumeLength(const CSeqDBVolSet & volset) const;
1154 
1155  /// Get the membership bit
1156  ///
1157  /// This iterates the alias node tree to find the membership bit,
1158  /// if there is one. If more than one alias node provides a
1159  /// membership bit, only one will be used. This value can only be
1160  /// found in alias files (volumes do not have a single internal
1161  /// membership bit).
1162  ///
1163  /// @param volset
1164  /// The set of database volumes
1165  /// @return
1166  /// The membership bit, or zero if none was found.
1167  int GetMembBit(const CSeqDBVolSet & volset) const;
1168 
1169  /// Check whether a db scan is needed to compute correct totals.
1170  ///
1171  /// This traverses this node and its subnodes to determine whether
1172  /// accurate estimation of the total number of sequences and bases
1173  /// requires a linear time scan of the index files.
1174  ///
1175  /// @param volset
1176  /// The set of database volumes.
1177  /// @return
1178  /// True if the database scan is required.
1179  bool NeedTotalsScan(const CSeqDBVolSet & volset) const;
1180 
1181  /// Check if any volume filtering exists.
1182  ///
1183  /// This method computes and caches the sequence filtering for
1184  /// this node and any subnodes, and returns true if any filtering
1185  /// exists. Subsequent calls will just return the cached value.
1186  ///
1187  /// @return True if any filtering exists.
1188  bool HasFilters()
1189  {
1190  x_ComputeMasks();
1191  return m_HasFilters;
1192  }
1193 
1194  /// Get filtering tree for all volumes.
1195  ///
1196  /// This method applies the filtering options found in the alias
1197  /// node tree to all associated volumes (iterating over the tree
1198  /// recursively). The virtual OID lists are not built as a result
1199  /// of this process, but the data necessary for virtual OID
1200  /// construction is copied to the volume objects.
1201  ///
1202  /// @return A filter tree for all volumes.
1203  CRef<CSeqDB_FilterTree> GetFilterTree();
1204 
1205  /// Get Name/Value Data From Alias Files
1206  ///
1207  /// SeqDB treats each alias file as a map from a variable name to
1208  /// a value. This method will return a map from the basename of
1209  /// the filename of each alias file, to a mapping from variable
1210  /// name to value for each entry in that file. For example, the
1211  /// value of the "DBLIST" entry in the "wgs.nal" file would be
1212  /// values["wgs"]["DBLIST"]. The lines returned have been
1213  /// processed somewhat by SeqDB, including normalizing tabs to
1214  /// whitespace, trimming leading and trailing whitespace, and
1215  /// removal of comments and other non-value lines. Care should be
1216  /// taken when using the values returned by this method. SeqDB
1217  /// uses an internal "virtual" alias file entry to aggregate the
1218  /// values passed into SeqDB by the user. This mapping uses a
1219  /// filename of "-" and contains a single entry mapping "DBLIST"
1220  /// to SeqDB's database name input. This entry is the root of the
1221  /// alias file inclusion tree. Also note that alias files that
1222  /// appear in several places in the alias file inclusion tree only
1223  /// have one entry in the returned map (and are only parsed once
1224  /// by SeqDB).
1225  ///
1226  /// @param afv
1227  /// The alias file values will be returned here.
1228  /// @param volset
1229  /// The set of database volumes
1230  void GetAliasFileValues(TAliasFileValues & afv,
1231  const CSeqDBVolSet & volset);
1232 
1233  /// Is the top node alias file associated with Gi based masks?
1234  ///
1235  /// This will return true if the MASKLIST field of the top alias
1236  /// node is set.
1237  ///
1238  /// @return TRUE if MASKLIST field is present
1239  bool HasGiMask() const
1240  {
1241  return (m_Node->HasGiMask());
1242  }
1243 
1244  /// Get Gi-based Mask Names From Alias Files
1245  ///
1246  /// This will return the MASKLIST field of the top alias node.
1247  ///
1248  /// @param mask_list
1249  /// The mask names will be returned here.
1250  void GetMaskList(vector <string> &mask_list)
1251  {
1252  m_Node->GetMaskList(mask_list);
1253  }
1254 
1255  /// Dump debug information for this object
1256  /// @sa CDebugDumpable
1257  void DebugDump(CDebugDumpContext ddc, unsigned int depth) const;
1258 
1259  /// Get the Oid Mask Type
1260  ///
1261  /// This iterates the alias node tree to find the oid mask type,
1262  /// if there is one.
1263  ///
1264  /// @param volset
1265  /// The set of database volumes
1266  /// @return
1267  /// Oid mask type, or zero if none was found.
1268  int GetOidMaskType(const CSeqDBVolSet & volset) const;
1269 
1270 private:
1271  /// Compute filtering options for all volumes.
1272  ///
1273  /// This method applies the filtering options found in the alias
1274  /// node tree to all associated volumes (iterating over the tree
1275  /// recursively). The virtual OID lists are not built as a result
1276  /// of this process, but the data necessary for virtual OID
1277  /// construction is copied to the volume objects.
1279  {
1280  m_Node->ComputeMasks(m_HasFilters);
1281  }
1282 
1283  /// Combined alias files.
1285 
1286  /// This is the alias node tree's "artificial" topmost node, which
1287  /// aggregates the user provided database names.
1289 
1290  /// The cached output of the topmost node's FindVolumePaths(recursive).
1291  vector<string> m_VolumeNames;
1292 
1293  /// The cached alias names from the topmost node's FindVolumePaths(recursive).
1294  vector<string> m_AliasNames;
1295 
1296  /// True if this is a protein database.
1298 
1299  /// Shortest sequence length
1301 
1302  /// Number of sequences.
1303  mutable Int8 m_NumSeqs;
1304 
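1305  /// Number of sequences for statistics purposes. NOTE(review): declared int although GetNumSeqsStats() returns Int8 -- confirm the narrowing is intended.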
1306  mutable int m_NumSeqsStats;
1307 
1308  /// Number of OIDs.
1309  mutable Int8 m_NumOIDs;
1310 
1311  /// Total length.
1313 
1314  /// Total length for statistics purposes.
1316 
1317  /// Total length ignoring filtering.
1319 
1320  /// Membership bit.
1321  mutable int m_MembBit;
1322 
1323  /// True if we have the database title.
1324  mutable bool m_HasTitle;
1325 
1326  /// Database title.
1327  mutable string m_Title;
1328 
1329  /// 1 if we need a totals scan, 0 if not, -1 if not known.
1330  mutable int m_NeedTotalsScan;
1331 
1332  /// Filter tree representing all alias file filtering.
1334 
1335  /// Are there filters for this database?
1337 
1338  /// Oid Mask Type
1339  mutable int m_OidMaskType;
1340 
1341  /// Disable copy operator.
1342  CSeqDBAliasFile & operator =(const CSeqDBAliasFile &);
1343 
1344  /// Disable copy constructor.
1346 };
1347 
1348 
1350 
1351 #endif // OBJTOOLS_READERS_SEQDB__SEQDBALIAS_HPP
1352 
1353 
CObject –.
Definition: ncbiobj.hpp:180
CSeqDBAliasFile class.
Definition: seqdbalias.hpp:982
CSeqDBAliasSets m_AliasSets
Combined alias files.
int m_MembBit
Membership bit.
bool HasGiMask() const
Is the top node alias file associated with Gi based masks?
TSeqDBAliasFileValues TAliasFileValues
Import type to allow shorter name.
Definition: seqdbalias.hpp:984
bool m_HasTitle
True if we have the database title.
bool m_HasFilters
Are there filters for this database?
vector< string > m_AliasNames
The cached output of the topmost node's FindVolumePaths(recursive).
Int4 m_MinLength
Shortest sequence length.
void x_ComputeMasks()
Compute filtering options for all volumes.
bool m_IsProtein
True if this is a protein database.
void GetMaskList(vector< string > &mask_list)
Get Gi-based Mask Names From Alias Files.
Int8 m_TotalLength
Total length.
Int8 m_NumOIDs
Number of OIDs.
Int8 m_NumSeqs
Number of sequences.
int m_NumSeqsStats
Number of sequences for statistics purposes.
CRef< CSeqDB_FilterTree > m_TopTree
Filter tree representing all alias file filtering.
int m_NeedTotalsScan
1 if we need a totals scan, 0 if not, -1 if not known.
void FindVolumePaths(vector< string > &vols, vector< string > *alias, bool recursive) const
Find the base names of volumes.
CSeqDBAliasFile(const CSeqDBAliasFile &)
Disable copy constructor.
int m_OidMaskType
Oid Mask Type.
bool HasFilters()
Check if any volume filtering exists.
Int8 m_VolumeLength
Total length ignoring filtering.
const vector< string > & GetVolumeNames() const
Get the list of volume names.
string m_Title
Database title.
CRef< CSeqDBAliasNode > m_Node
This is the alias node tree's "artificial" topmost node, which aggregates the user provided database ...
vector< string > m_VolumeNames
The cached output of the topmost node's FindVolumePaths(recursive).
Int8 m_TotalLengthStats
Total length for statistics purposes.
CSeqDBAliasNode class.
Definition: seqdbalias.hpp:428
TVolNames m_VolNames
Set of volume names associated with this node.
Definition: seqdbalias.hpp:941
vector< bool > m_SkipLocal
Should we skip local DB search for this DBLIST?
Definition: seqdbalias.hpp:957
map< string, string > TVarList
Type of set used for KEY/VALUE pairs within each node.
Definition: seqdbalias.hpp:430
vector< CRef< CSeqDB_AliasMask > > m_NodeMasks
Mask objects for this node.
Definition: seqdbalias.hpp:963
vector< CSeqDB_BasePath > TVolNames
Type used to store a set of volume names for each node.
Definition: seqdbalias.hpp:925
bool m_HasGiMask
Do we have Gi masks for the top node? (only applicable to the top node)
Definition: seqdbalias.hpp:954
bool HasGiMask() const
Is the top node alias file associated with Gi based masks?
Definition: seqdbalias.hpp:736
CSeqDBAliasNode(const CSeqDBAliasNode &)
Disable copy constructor.
CSeqDB_DirName m_DBPath
The common prefix for the DB paths.
Definition: seqdbalias.hpp:935
vector< CSeqDB_BasePath > m_DBList
Tokenized version of DBLIST.
Definition: seqdbalias.hpp:950
CSeqDBAtlas & m_Atlas
The memory management layer for this SeqDB instance.
Definition: seqdbalias.hpp:932
TVarList m_Values
List of KEY/VALUE pairs from this alias file.
Definition: seqdbalias.hpp:938
TSeqDBAliasFileValues TAliasFileValues
Import type to allow shorter name.
Definition: seqdbalias.hpp:433
TSubNodeList m_SubNodes
List of subnodes contained by this node.
Definition: seqdbalias.hpp:944
vector< CRef< CSeqDBAliasNode > > TSubNodeList
Type used to store the set of subnodes for this node.
Definition: seqdbalias.hpp:928
bool m_ExpandLinks
Do not expand link when resolving paths.
Definition: seqdbalias.hpp:966
void SetMasks(CSeqDBVolSet &volset)
Set filtering options for all volumes.
CSeqDBAliasSets & m_AliasSets
Combined alias files.
Definition: seqdbalias.hpp:960
CSeqDB_Path m_ThisName
Filename of this alias file.
Definition: seqdbalias.hpp:947
CSeqDBAliasSets class.
Definition: seqdbalias.hpp:229
bool FindBlastDBPath(const CSeqDB_Path &dbname, CSeqDB_Path &resolved)
Find a file given a partial path and name.
Definition: seqdbalias.hpp:285
bool FindBlastDBPath(const CSeqDB_BasePath &dbname, char dbtype, CSeqDB_BasePath &resolved)
Find a file given a partial path and name.
Definition: seqdbalias.hpp:316
map< string, string > TAliasGroup
Aggregated alias file - maps filename to file contents.
Definition: seqdbalias.hpp:385
CSeqDBAliasSets(const CSeqDBAliasSets &)
Disable copy constructor.
CSeqDBAtlas & m_Atlas
Reference to the memory management layer.
Definition: seqdbalias.hpp:382
map< string, TAliasGroup > TAliasGroupMap
Full index filename to aggregated alias file.
Definition: seqdbalias.hpp:388
CSeqDBAliasSets(CSeqDBAtlas &atlas)
Constructor.
Definition: seqdbalias.hpp:232
map< string, string > m_PathLookup
Caches results of FindBlastDBPath.
Definition: seqdbalias.hpp:394
TAliasGroupMap m_Groups
Alias groups.
Definition: seqdbalias.hpp:391
CSeqDBAliasStack.
Definition: seqdbalias.hpp:125
bool Exists(const CSeqDB_Path &name)
Check whether the stack contains the specified string.
Definition: seqdbalias.hpp:143
void Push(const CSeqDB_Path &name)
Push a new string onto to the stack.
Definition: seqdbalias.hpp:159
vector< CSeqDB_Path > m_NodeNames
List of node names.
Definition: seqdbalias.hpp:209
CSeqDBAliasStack()
Constructor.
Definition: seqdbalias.hpp:128
CSeqDBAliasStack(const CSeqDBAliasStack &)
Disable copy constructor.
void Pop()
Remove the top element of the stack.
Definition: seqdbalias.hpp:195
unsigned Size()
Return the number of in-use elements.
Definition: seqdbalias.hpp:202
unsigned m_Count
Number of in-use node names.
Definition: seqdbalias.hpp:212
CSeqDBAtlas class.
Definition: seqdbatlas.hpp:298
CSeqDBLockHold.
Definition: seqdbatlas.hpp:167
CSeqDBVolSet.
CSeqDBVol class.
Definition: seqdbvol.hpp:169
CSeqDBAliasExplorer class.
Definition: seqdbalias.hpp:90
virtual bool Explore(const TVarList &values)=0
This will be called with the map of key/value pairs associated with this alias file.
map< string, string > TVarList
Type of set used for KEY/VALUE pairs within each node.
Definition: seqdbalias.hpp:93
virtual void Accumulate(const CSeqDBVol &volumes)=0
This will be called with each CVolume that is in the alias file tree structure (in order of traversal...
virtual ~CSeqDB_AliasExplorer()
Destructor.
Definition: seqdbalias.hpp:96
CSeqDBAliasWalker class.
Definition: seqdbalias.hpp:64
virtual void AddString(const string &)=0
This will be called with the value associated with this key in the alias file.
virtual void Accumulate(const CSeqDBVol &)=0
This will be called with each CVolume that is in the alias file tree structure (in order of traversal...
virtual const char * GetFileKey() const =0
Override to provide the alias file KEY name for the type of summary data you want to gather,...
virtual ~CSeqDB_AliasWalker()
Destructor.
Definition: seqdbalias.hpp:67
CSeqDB_BaseName.
CSeqDB_BasePath.
void Assign(const CSeqDB_Substring &sub)
Assign the value from a substring.
CSeqDB_DirName.
CSeqDB_FileName.
Tree of nodes describing filtering of database sequences.
CSeqDB_Path.
const string & GetPathS() const
Get the path as a string.
void Assign(const string &path)
Assigns the provided value to this path.
static unsigned char depth[2 *(256+1+29)+1]
NCBI_XOBJUTIL_EXPORT string GetTitle(const CBioseq_Handle &hnd, TGetTitleFlags flags=0)
Definition: seqtitle.cpp:106
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:6929
int i
static const char * x_ReadLine(const char *path, char *line, size_t size)
Definition: ncbi_namerd.c:1081
Defines BLAST database access classes.
Defines exception class and several constants for SeqDB.
File access objects for CSeqDB.
This file defines several SeqDB utility functions related to byte order and file system portability.
The SeqDB oid filtering layer.
Defines database volume access classes.
Manages a set of database volumes.
#define _ASSERT
Modified on Fri Jan 05 07:25:28 2024 by modify_doxy.py rev. 669887