NCBI C++ ToolKit
huge_asn_reader.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: huge_asn_reader.hpp 102385 2024-04-29 14:33:08Z foleyjp $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Sergiy Gotvyanskyy
27 *
28 * File Description:
29 *
30 *
31 */
32 
33 #ifndef _HUGE_ASN_READER_HPP_INCLUDED_
34 #define _HUGE_ASN_READER_HPP_INCLUDED_
35 
36 #include <corelib/ncbistd.hpp>
37 #include <corelib/ncbiutil.hpp>
43 #include <objects/seq/Seq_inst.hpp>
44 #include <objects/seq/Seqdesc.hpp>
46 #include <optional>
47 
49 
50 class CObjectIStream;
51 
53 
54 class CBioseq;
55 class CSeq_submit;
56 class CSeqdesc;
57 
59 
60 
62  public IHugeAsnSource,
63  public CObject
64 {
65 public:
66  using TFileSize = std::streamoff;
67 
69  CHugeAsnReader(CHugeFile* file, ILineErrorListener * pMessageListener);
70  virtual ~CHugeAsnReader();
71 
72  void Open(CHugeFile* file, ILineErrorListener * pMessageListener) override; // overrides a base-class virtual; takes the same two arguments as the two-argument constructor
73  bool GetNextBlob() override; // overrides a base-class virtual; the meaning of the bool result is defined by the implementation (not in this header)
74  CRef<CSeq_entry> GetNextSeqEntry() override; // overrides a base-class virtual; returns a ref-counted CSeq_entry handle
75  CConstRef<CSubmit_block> GetSubmitBlock() const override; // const override; returns a read-only ref-counted CSubmit_block handle
76  CRef<CSerialObject> ReadAny(); // non-virtual; returns a ref-counted handle typed as the generic CSerialObject
77 
78  struct TBioseqInfo;
79  struct TBioseqSetInfo;
80  using TBioseqSetList = std::list<TBioseqSetInfo>;
81  using TBioseqList = std::list<TBioseqInfo>;
82 
83  struct TBioseqInfo // Record type stored in TBioseqList. NOTE(review): listing lines 85, 88, 90 and 91 are absent from this extract, so further members may exist.
84  {
86  TBioseqSetList::const_iterator m_parent_set; // const iterator into a TBioseqSetList; presumably the set that contains this Bioseq -- TODO confirm
87  TSeqPos m_length = static_cast<TSeqPos>(-1); // defaults to -1 converted to TSeqPos; presumably a "length not known" sentinel -- TODO confirm against the .cpp
89  std::list<CConstRef<CSeq_id>> m_ids; // list of read-only Seq-id references held by this record
92  };
93 
95  {
97  TBioseqSetList::const_iterator m_parent_set;
100  bool m_HasAnnot{false};
101  optional<int> m_Level;
102  };
103 
105 
106  using TBioseqIndex = std::map<CConstRef<CSeq_id>, TBioseqList::const_iterator, CRefLess>; // ordered map: Seq-id reference -> const iterator into a TBioseqList, ordered by CRefLess
107  using TBioseqSetIndex = std::map<CConstRef<CSeq_id>, TBioseqSetList::const_iterator, CRefLess>; // ordered map: Seq-id reference -> const iterator into a TBioseqSetList, ordered by CRefLess
108 
109  auto& GetBioseqs() const { return m_bioseq_list; }; // returns a reference to m_bioseq_list; const-qualified because the accessor is a const member
110  auto& GetBiosets() const { return m_bioseq_set_list; }; // returns a reference to m_bioseq_set_list; const-qualified because the accessor is a const member
111  auto GetFormat() const { return m_file->m_format; }; // returns m_file->m_format by value; dereferences m_file -- NOTE(review): confirm callers only use this once a file is attached
112  auto GetMaxLocalId() const { return m_max_local_id; }; // returns m_max_local_id by value
113 
114  // These methods are for CDataLoader, each top object is a 'blob'
115  const TBioseqSetInfo* FindTopObject(CConstRef<CSeq_id> seqid) const;
116 
117  enum class eAddTopEntry{ yes, no }; // scoped two-value flag type; used as the second parameter of LoadSeqEntry below
118  virtual CRef<CSeq_entry> LoadSeqEntry(const TBioseqSetInfo& info, eAddTopEntry add_top_entry = eAddTopEntry::yes) const; // add_top_entry defaults to yes. NOTE(review): a default argument on a virtual function is chosen from the static type at the call site, so overriders should keep the same default
119 
120  const TBioseqInfo* FindBioseq(CConstRef<CSeq_id> seqid) const; // returns a pointer to a read-only TBioseqInfo; presumably nullptr when seqid is not indexed -- TODO confirm in the .cpp
121  CConstRef<CSeqdesc> GetClosestDescriptor(const TBioseqInfo& info, CSeqdesc::E_Choice choice) const; // overload keyed by an already-found TBioseqInfo; choice selects the descriptor variant
122  CConstRef<CSeqdesc> GetClosestDescriptor(const CSeq_id& id, CSeqdesc::E_Choice choice) const; // overload keyed by a Seq-id instead of a TBioseqInfo
123 
124  // Direct loading methods
125  CRef<CSeq_entry> LoadSeqEntry(CConstRef<CSeq_id> seqid) const; // non-virtual overload that takes a Seq-id rather than a TBioseqSetInfo
126  CRef<CBioseq> LoadBioseq(CConstRef<CSeq_id> seqid) const; // same parameter as above but returns a CBioseq handle
127 
128  bool IsMultiSequence() const override; // const override of a base-class virtual
129  bool HasHugeSetAnnot() const { return m_HasHugeSetAnnot; } // returns the current value of the m_HasHugeSetAnnot flag
130  static bool IsHugeSet(CBioseq_set::TClass setClass); // static: depends only on the set-class value passed in, not on any reader instance
131 
132 
133  virtual void FlattenGenbankSet(); // virtual and non-const: derived classes may override it, and it may modify the reader
134  auto& GetTopEntry() const { return m_top_entry; } // returns a reference to m_top_entry; const-qualified because the accessor is a const member
135  auto& GetFlattenedIndex() const { return m_FlattenedIndex; } // returns a reference to m_FlattenedIndex; const-qualified because the accessor is a const member
136  auto& GetTopIds() const { return m_top_ids; } // returns a reference to m_top_ids; const-qualified because the accessor is a const member
137  unique_ptr<CObjectIStream> MakeObjStream(TFileSize pos) const; // returns a uniquely-owned object stream; pos is a TFileSize (std::streamoff), presumably the offset to start reading at -- TODO confirm
138 
139  const CBioseq_set::TClass* GetTopLevelClass() const;
140 
141  using t_more_hooks = std::function<void(CObjectIStream&)>; // callable taking the object input stream by reference and returning nothing
142  void ExtendReadHooks(t_more_hooks hooks); // takes the callable by value; presumably stored in m_more_hooks -- TODO confirm in the .cpp
143  void ResetTopEntry();
144 
145 protected:
146  // temporary structure for indexing
148  {
149  list<CConstRef<CSeq_id>> m_ids;
150  TSeqPos m_length = 0;
154  };
155 
156  struct TContext // State object passed by reference to the x_Set*Hooks methods declared below.
157  {
158  std::deque<TBioseqInfoRec> bioseq_stack; // deque of temporary TBioseqInfoRec records; used as a stack, judging by the name
159  std::deque<TBioseqSetList::iterator> bioseq_set_stack; // deque of mutable iterators into a TBioseqSetList; used as a stack, judging by the name
160  };
161 
162  virtual void x_SetHooks(CObjectIStream& objStream, TContext& context); // protected virtual: derived readers can override; receives the stream and the shared TContext
163  virtual void x_SetFeatIdHooks(CObjectIStream& objStream, TContext& context); // same signature; presumably installs the feature-id hooks -- TODO confirm
164  virtual void x_SetBioseqHooks(CObjectIStream& objStream, TContext& context); // same signature; presumably installs the Bioseq hooks -- TODO confirm
165  virtual void x_SetBioseqSetHooks(CObjectIStream& objStream, TContext& context); // same signature; presumably installs the Bioseq-set hooks -- TODO confirm
166 
167  using TStreamPos = streampos;
168  TStreamPos GetCurrentPos() const;
169 
170 
171 private:
172  void x_ResetIndex();
173  void x_IndexNextAsn1();
174  void x_ThrowDuplicateId(
175  const TBioseqSetInfo& existingInfo,const TBioseqSetInfo& newInfo, const CSeq_id& duplicateId);
176 
177  CRef<CSeq_descr> x_GetTopLevelDescriptors() const;
178  bool x_HasNestedGenbankSets() const;
179 
180 
181  ILineErrorListener * mp_MessageListener = nullptr; // raw pointer, nullptr by default; presumably non-owning -- TODO confirm lifetime expectations
182  TStreamPos m_current_pos = 0; // points to current blob in concatenated ASN.1 file
184  std::list<t_more_hooks> m_more_hooks; // list of t_more_hooks callables; empty by default
185 
186 // global lists, readonly after indexing
187 protected:
189  TStreamPos m_next_pos = 0; // points to next unprocessed blob in concatenated ASN.1 file
190  int m_max_local_id = 0; // starts at 0; exposed read-only through GetMaxLocalId()
193  std::list<CConstRef<CSeq_id>> m_top_ids; // exposed read-only through GetTopIds()
194  bool m_HasHugeSetAnnot{ false }; // starts false; exposed through HasHugeSetAnnot()
195 private:
197 
198 // flattening structures, read-only after flattening, except m_Current
202  TBioseqSetList::const_iterator m_Current; // const iterator into a TBioseqSetList; no in-class initializer -- NOTE(review): confirm it is assigned before first use
203  const CBioseq_set::TClass* m_pTopLevelClass { nullptr }; // pointer to a read-only set-class value, nullptr by default; same type as GetTopLevelClass() returns
204 };
205 
209 
210 #endif // _HUGE_ASN_READER_HPP_INCLUDED_
User-defined methods of the data storage class.
std::streamoff TFileSize
auto & GetTopEntry() const
std::list< TBioseqInfo > TBioseqList
auto & GetBiosets() const
auto GetFormat() const
std::map< CConstRef< CSeq_id >, TBioseqSetList::const_iterator, CRefLess > TBioseqSetIndex
std::list< CConstRef< CSeq_id > > m_top_ids
TBioseqIndex m_bioseq_index
std::map< CConstRef< CSeq_id >, TBioseqList::const_iterator, CRefLess > TBioseqIndex
auto & GetTopIds() const
auto & GetBioseqs() const
bool HasHugeSetAnnot() const
CConstRef< CSubmit_block > m_submit_block
TBioseqSetList::const_iterator m_Current
std::list< TBioseqSetInfo > TBioseqSetList
TBioseqSetList m_FlattenedSets
auto GetMaxLocalId() const
TBioseqList m_bioseq_list
TBioseqSetList m_bioseq_set_list
TBioseqSetIndex m_FlattenedIndex
auto & GetFlattenedIndex() const
std::list< t_more_hooks > m_more_hooks
std::function< void(CObjectIStream &)> t_more_hooks
CRef< CHugeFile > m_file
CRef< CSeq_entry > m_top_entry
CObjectIStream –.
Definition: objistr.hpp:93
CObject –.
Definition: ncbiobj.hpp:180
Include a standard set of the NCBI C++ Toolkit most basic headers.
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define NCBI_XHUGEASN_EXPORT
Definition: ncbi_export.h:809
ERepr
representation class
Definition: Seq_inst_.hpp:91
E_Choice
Choice variants.
Definition: Seqdesc_.hpp:109
EMol
molecule class in living organism
Definition: Seq_inst_.hpp:108
@ eRepr_not_set
empty
Definition: Seq_inst_.hpp:92
@ eMol_not_set
> cdna = rna
Definition: Seq_inst_.hpp:109
fallback to Cassandra storage</td > n</tr > n</table > n</td > n< td > yes
FILE * file
static MDB_envinfo info
Definition: mdb_load.c:37
Definition: fix_pub.hpp:45
Useful/utility classes and methods.
list< CConstRef< CSeq_id > > m_ids
std::list< CConstRef< CSeq_id > > m_ids
TBioseqSetList::const_iterator m_parent_set
CConstRef< CSeq_descr > m_descr
TBioseqSetList::const_iterator m_parent_set
CConstRef< CSeq_descr > m_descr
std::deque< TBioseqInfoRec > bioseq_stack
std::deque< TBioseqSetList::iterator > bioseq_set_stack
Compare objects pointed to by (smart) pointer.
Definition: ncbiutil.hpp:67
static CS_CONTEXT * context
Definition: will_convert.c:21
Modified on Wed Jun 19 17:05:31 2024 by modify_doxy.py rev. 669887