NCBI C++ ToolKit
huge_asn_reader.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: huge_asn_reader.hpp 103011 2024-08-21 17:32:06Z kans $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Sergiy Gotvyanskyy
27 *
28 * File Description:
29 *
30 *
31 */
32 
33 #ifndef _HUGE_ASN_READER_HPP_INCLUDED_
34 #define _HUGE_ASN_READER_HPP_INCLUDED_
35 
36 #include <corelib/ncbistd.hpp>
37 #include <corelib/ncbiutil.hpp>
43 #include <objects/seq/Seq_inst.hpp>
44 #include <objects/seq/Seqdesc.hpp>
46 #include <optional>
47 
49 
50 class CObjectIStream;
51 
53 
54 class CBioseq;
55 class CSeq_submit;
56 class CSeqdesc;
57 
59 
60 
62  public IHugeAsnSource,
63  public CObject
64 {
65 public:
66  using TFileSize = std::streamoff;
67 
69  CHugeAsnReader(CHugeFile* file, ILineErrorListener * pMessageListener);
70  virtual ~CHugeAsnReader();
71 
72  void Open(CHugeFile* file, ILineErrorListener * pMessageListener) override;
73  bool GetNextBlob() override;
74  CRef<CSeq_entry> GetNextSeqEntry() override;
75  CConstRef<CSubmit_block> GetSubmitBlock() const override;
76  bool IsNotJustLocalOrGeneral() const;
77  bool HasRefSeq() const;
78  CRef<CSerialObject> ReadAny();
79 
80  struct TBioseqInfo;
81  struct TBioseqSetInfo;
82  using TBioseqSetList = std::list<TBioseqSetInfo>;
83  using TBioseqList = std::list<TBioseqInfo>;
84 
85  struct TBioseqInfo // Per-bioseq index record; stored in TBioseqList (std::list<TBioseqInfo>).
86  { // NOTE(review): listing lines 87, 90, 92-93 are absent from this rendering, so further members may exist.
88  TBioseqSetList::const_iterator m_parent_set; // iterator into a TBioseqSetList; presumably the enclosing Bioseq-set -- confirm
89  TSeqPos m_length = static_cast<TSeqPos>(-1); // defaults to the maximum TSeqPos value; presumably an "unknown length" sentinel -- confirm
91  std::list<CConstRef<CSeq_id>> m_ids; // Seq-ids held for this record
94  };
95 
97  {
99  TBioseqSetList::const_iterator m_parent_set;
102  bool m_HasAnnot{false};
103  optional<int> m_Level;
104  };
105 
107 
108  using TBioseqIndex = std::map<CConstRef<CSeq_id>, TBioseqList::const_iterator, CRefLess>;
109  using TBioseqSetIndex = std::map<CConstRef<CSeq_id>, TBioseqSetList::const_iterator, CRefLess>;
110 
111  auto& GetBioseqs() const { return m_bioseq_list; }; // read-only accessor: returns a reference to m_bioseq_list
112  auto& GetBiosets() const { return m_bioseq_set_list; }; // read-only accessor: returns a reference to m_bioseq_set_list
113  auto GetFormat() const { return m_file->m_format; }; // returns m_file->m_format by value; m_file is dereferenced without a null check here
114  auto GetMaxLocalId() const { return m_max_local_id; }; // returns m_max_local_id by value (member is initialised to 0)
115 
116  // These methods are for CDataLoader; each top object is a 'blob'
117  const TBioseqSetInfo* FindTopObject(CConstRef<CSeq_id> seqid) const;
118 
119  enum class eAddTopEntry{ yes, no };
120  virtual CRef<CSeq_entry> LoadSeqEntry(const TBioseqSetInfo& info, eAddTopEntry add_top_entry = eAddTopEntry::yes) const;
121 
122  const TBioseqInfo* FindBioseq(CConstRef<CSeq_id> seqid) const;
123  CConstRef<CSeqdesc> GetClosestDescriptor(const TBioseqInfo& info, CSeqdesc::E_Choice choice) const;
124  CConstRef<CSeqdesc> GetClosestDescriptor(const CSeq_id& id, CSeqdesc::E_Choice choice) const;
125 
126  // Direct loading methods
127  CRef<CSeq_entry> LoadSeqEntry(CConstRef<CSeq_id> seqid) const;
128  CRef<CBioseq> LoadBioseq(CConstRef<CSeq_id> seqid) const;
129 
130  bool IsMultiSequence() const override;
131  bool HasHugeSetAnnot() const { return m_HasHugeSetAnnot; } // returns the m_HasHugeSetAnnot flag, which is initialised to false in its declaration
132  static bool IsHugeSet(CBioseq_set::TClass setClass);
133 
134 
135  virtual void FlattenGenbankSet();
136  auto& GetTopEntry() const { return m_top_entry; } // returns a reference to m_top_entry; no copy of the CRef is made
137  auto& GetFlattenedIndex() const { return m_FlattenedIndex; } // returns a reference to m_FlattenedIndex (a TBioseqSetIndex map)
138  auto& GetTopIds() const { return m_top_ids; } // returns a reference to m_top_ids (list of CConstRef<CSeq_id>)
139  unique_ptr<CObjectIStream> MakeObjStream(TFileSize pos) const;
140 
141  const CBioseq_set::TClass* GetTopLevelClass() const;
142 
143  using t_more_hooks = std::function<void(CObjectIStream&)>;
144  void ExtendReadHooks(t_more_hooks hooks);
145  void ResetTopEntry();
146 
147 protected:
148  // temporary structure for indexing
150  {
151  list<CConstRef<CSeq_id>> m_ids;
152  TSeqPos m_length = 0;
156  };
157 
158  struct TContext // State object passed by reference to x_SetHooks and the other x_Set*Hooks methods.
159  {
160  std::deque<TBioseqInfoRec> bioseq_stack; // deque of temporary TBioseqInfoRec records; presumably used as a stack while reading -- confirm
161  std::deque<TBioseqSetList::iterator> bioseq_set_stack; // deque of mutable iterators into a TBioseqSetList; presumably the currently open sets -- confirm
162  };
163 
164  virtual void x_SetHooks(CObjectIStream& objStream, TContext& context);
165  virtual void x_SetFeatIdHooks(CObjectIStream& objStream, TContext& context);
166  virtual void x_SetBioseqHooks(CObjectIStream& objStream, TContext& context);
167  virtual void x_SetBioseqSetHooks(CObjectIStream& objStream, TContext& context);
168 
169  using TStreamPos = streampos;
170  TStreamPos GetCurrentPos() const;
171 
172 
173 private:
174  void x_ResetIndex();
175  void x_IndexNextAsn1();
176  void x_ThrowDuplicateId(
177  const TBioseqSetInfo& existingInfo,const TBioseqSetInfo& newInfo, const CSeq_id& duplicateId);
178 
179  CRef<CSeq_descr> x_GetTopLevelDescriptors() const;
180  bool x_HasNestedGenbankSets() const;
181 
182  ILineErrorListener * mp_MessageListener = nullptr;
183  TStreamPos m_current_pos = 0; // points to current blob in concatenated ASN.1 file
185  std::list<t_more_hooks> m_more_hooks;
186 
187 // global lists, readonly after indexing
188 protected:
190  TStreamPos m_next_pos = 0; // points to next unprocessed blob in concatenated ASN.1 file
191  int m_max_local_id = 0;
194  std::list<CConstRef<CSeq_id>> m_top_ids;
195  bool m_HasHugeSetAnnot{ false };
196 private:
200 
201 // flattening structures, read-only after flattening, except m_Current
205  TBioseqSetList::const_iterator m_Current;
206  const CBioseq_set::TClass* m_pTopLevelClass { nullptr };
207 };
208 
212 
213 #endif // _HUGE_ASN_READER_HPP_INCLUDED_
User-defined methods of the data storage class.
std::streamoff TFileSize
auto & GetTopEntry() const
std::list< TBioseqInfo > TBioseqList
auto & GetBiosets() const
auto GetFormat() const
std::map< CConstRef< CSeq_id >, TBioseqSetList::const_iterator, CRefLess > TBioseqSetIndex
std::list< CConstRef< CSeq_id > > m_top_ids
TBioseqIndex m_bioseq_index
std::map< CConstRef< CSeq_id >, TBioseqList::const_iterator, CRefLess > TBioseqIndex
auto & GetTopIds() const
auto & GetBioseqs() const
bool HasHugeSetAnnot() const
CConstRef< CSubmit_block > m_submit_block
TBioseqSetList::const_iterator m_Current
std::list< TBioseqSetInfo > TBioseqSetList
TBioseqSetList m_FlattenedSets
auto GetMaxLocalId() const
TBioseqList m_bioseq_list
TBioseqSetList m_bioseq_set_list
TBioseqSetIndex m_FlattenedIndex
auto & GetFlattenedIndex() const
std::list< t_more_hooks > m_more_hooks
std::function< void(CObjectIStream &)> t_more_hooks
CRef< CHugeFile > m_file
CRef< CSeq_entry > m_top_entry
CObjectIStream –.
Definition: objistr.hpp:93
CObject –.
Definition: ncbiobj.hpp:180
Include a standard set of the NCBI C++ Toolkit most basic headers.
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define NCBI_XHUGEASN_EXPORT
Definition: ncbi_export.h:809
ERepr
representation class
Definition: Seq_inst_.hpp:91
E_Choice
Choice variants.
Definition: Seqdesc_.hpp:109
EMol
molecule class in living organism
Definition: Seq_inst_.hpp:108
@ eRepr_not_set
empty
Definition: Seq_inst_.hpp:92
@ eMol_not_set
> cdna = rna
Definition: Seq_inst_.hpp:109
fallback to Cassandra storage: yes
FILE * file
static MDB_envinfo info
Definition: mdb_load.c:37
Definition: fix_pub.hpp:45
Useful/utility classes and methods.
list< CConstRef< CSeq_id > > m_ids
std::list< CConstRef< CSeq_id > > m_ids
TBioseqSetList::const_iterator m_parent_set
CConstRef< CSeq_descr > m_descr
TBioseqSetList::const_iterator m_parent_set
CConstRef< CSeq_descr > m_descr
std::deque< TBioseqInfoRec > bioseq_stack
std::deque< TBioseqSetList::iterator > bioseq_set_stack
Compare objects pointed to by (smart) pointer.
Definition: ncbiutil.hpp:67
static CS_CONTEXT * context
Definition: will_convert.c:21
Modified on Fri Sep 20 14:57:53 2024 by modify_doxy.py rev. 669887