NCBI C++ ToolKit
huge_file_process.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: huge_file_process.cpp 102385 2024-04-29 14:33:08Z foleyjp $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Sergiy Gotvyanskyy
27 * File Description:
28 * Utility class for processing ASN.1 files using Huge Files approach
29 *
30 */
31 #include <ncbi_pch.hpp>
32 
33 #include <objects/seq/Seqdesc.hpp>
38 
42 
44 
46 #include <objects/seq/Bioseq.hpp>
48 
50 #include <objmgr/scope.hpp>
52 
54 
57 
58 namespace
59 {
60 
61  class CAutoRevoker
62  {
63  public:
64  template<class TLoader>
65  CAutoRevoker(struct SRegisterLoaderInfo<TLoader>& info)
66  : m_loader{ info.GetLoader() } {}
67  ~CAutoRevoker()
68  {
70  }
71  private:
72  CDataLoader* m_loader = nullptr;
73  };
74 
75 }
76 
77 
79 {
80  CBioseq_set::GetTypeInfo(),
81  CBioseq::GetTypeInfo(),
82  CSeq_entry::GetTypeInfo(),
83  CSeq_submit::GetTypeInfo(),
84 };
85 
86 
88  m_pHugeFile { new CHugeFile },
89  m_pReader{ new CHugeAsnReader }
90 {}
91 
92 
93 
95  m_pHugeFile { new CHugeFile },
96  m_pReader { pReader }
97 {}
98 
99 
102 {
103  Open(file_name, types);
104 }
105 
107 {
108  return g_supported_types.find(info) != g_supported_types.end();
109 }
110 
112 {
114  OpenReader();
115 }
116 
118 {
120 }
121 
123 {
125 }
126 
128 {
129  m_pReader->Open(m_pHugeFile.GetPointer(), nullptr);
130 }
131 
133 {
134 }
135 
// Body of the Read(handler, seqid) overload.
//
// Walks every blob the reader yields and passes Seq-entries to `handler`
// together with the reader's submit block.
//   - Returns false when the very first GetNextBlob() call fails.
//   - Returns true once GetNextBlob() stops yielding blobs.
//   - With an empty `seqid`, every entry from GetNextSeqEntry() is handled.
//   - With a non-empty `seqid`, at most one entry per blob is handled: the
//     bioseq loaded for that id, wrapped as described below.
// NOTE(review): original listing line 144 (first statement of the outer
// loop) is absent from this listing.
137 {
138  if (!m_pReader->GetNextBlob()) {
139  return false;
140  }
141 
142  do
143  {
145  CRef<CSeq_entry> entry;
146  do
147  {
148  entry.Reset();
149 
150  if (seqid.Empty())
151  entry = m_pReader->GetNextSeqEntry();
152  else
153  {
154  auto seq = m_pReader->LoadBioseq(seqid);
155  if (seq.NotEmpty())
156  {
// Wrap the loaded bioseq in a fresh Seq-entry.
157  entry = Ref(new CSeq_entry);
158  entry->SetSeq(*seq);
// If the reader has a top entry, hand out a copy of it with the bioseq
// entry appended to its Seq_set instead of the bare bioseq entry.
159  if (auto pTopEntry = m_pReader->GetTopEntry(); pTopEntry) {
160  auto pNewEntry = Ref(new CSeq_entry());
161  pNewEntry->Assign(*pTopEntry);
162  pNewEntry->SetSet().SetSeq_set().push_back(entry);
163  entry = pNewEntry;
164  }
165  }
166  }
167 
168  if (entry)
169  {
170  handler(m_pReader->GetSubmitBlock(), entry);
171  }
172  }
// Keep pulling entries only in the "no seqid" mode; a seqid lookup is done
// once per blob.
173  while ( entry && seqid.Empty());
174  } while (m_pReader->GetNextBlob());
175 
176  return true;
177 }
178 
// For each blob the reader yields, calls `handler` with the reader pointer
// and the reader's top ids. Stops and returns false as soon as the handler
// returns false; returns true when GetNextBlob() yields no more blobs.
// NOTE(review): original listing line 182 (first statement inside the loop)
// is absent from this listing.
180 {
181  while (m_pReader->GetNextBlob()) {
183  bool processed = handler(m_pReader.GetPointer(), m_pReader->GetTopIds());
184  if (!processed)
185  return false;
186  }
187 
188  return true;
189 }
190 
192 {
193  if (m_pReader->GetNextBlob()) {
195  return true;
196  }
197 
198  return false;
199 }
200 
201 
// For each blob the reader yields, calls `handler` with this object.
// Stops and returns false as soon as the handler returns false; returns
// true when GetNextBlob() yields no more blobs.
// NOTE(review): original listing line 205 (first statement inside the loop)
// is absent from this listing.
203 {
204  while (m_pReader->GetNextBlob()) {
206  bool processed = handler(*this);
207  if (!processed)
208  return false;
209  }
210 
211  return true;
212 }
213 
// Body of ForEachEntry(scope, handler).
//
// Registers a data loader named after the absolute path of the current
// file, adds it to `scope` (creating a scope when none was passed), and
// calls `handler` with the top-level entry of every id in
// GetReader().GetTopIds(). Returns false when `handler` is empty, true
// otherwise.
// NOTE(review): the value returned by `handler` is discarded, so a handler
// cannot stop the iteration - confirm this is intended.
215 {
216  if (!handler)
217  return false;
218 
219  string loader_name = CDirEntry::CreateAbsolutePath(GetFile().m_filename);
// NOTE(review): original listing line 220 (the start of the call that
// produces `info`) is absent from this listing; the line below is the tail
// of its argument list.
221  *CObjectManager::GetInstance(), loader_name, &GetReader(), CObjectManager::eNonDefault, 1); //CObjectManager::kPriority_Local);
222 
223  CAutoRevoker autorevoker(info);
224 
225  if (!scope)
226  scope = Ref(new CScope(*CObjectManager::GetInstance()));
227 
228  scope->AddDataLoader(loader_name);
229 
230  try
231  {
232  for (auto id: GetReader().GetTopIds())
233  {
// Inner braces end the lifetime of both handles before ResetHistory() runs.
234  {
235  auto beh = scope->GetBioseqHandle(*id);
236  auto parent = beh.GetTopLevelEntry();
237  handler(parent);
238  }
239  scope->ResetHistory();
240  }
241  }
// Detach the loader from the scope before propagating the error.
// NOTE(review): `e` is unused, and the RemoveDataLoader() call is duplicated
// on the normal path below.
242  catch(const std::exception& e)
243  {
244  scope->RemoveDataLoader(loader_name);
245  throw;
246  }
247  scope->RemoveDataLoader(loader_name);
248 
249  return true;
250 }
251 
252 
254 {
255  CSeq_entry_Handle parent = beh.GetParentEntry();
256  while(parent)
257  {
258  if (parent.IsTopLevelEntry())
259  break;
260 
261  if (auto temp = parent.GetParentEntry(); temp) {
262  if (temp.IsSet() && temp.GetSet().IsSetClass() &&
263  CHugeAsnReader::IsHugeSet(temp.GetSet().GetClass())) {
264  break;
265  }
266 
267  parent = temp;
268  }
269  else
270  break;
271  }
272 
273  return parent;
274 }
275 
276 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CBioseq_Handle –.
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, const string &loader_name, CHugeAsnReader *reader, CObjectManager::EIsDefault is_default=CObjectManager::eNonDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_Default)
void Open(CHugeFile *file, ILineErrorListener *pMessageListener) override
bool GetNextBlob() override
auto & GetTopEntry() const
static bool IsHugeSet(CBioseq_set::TClass setClass)
CConstRef< CSubmit_block > GetSubmitBlock() const override
CRef< CSeq_entry > GetNextSeqEntry() override
auto & GetTopIds() const
virtual void FlattenGenbankSet()
CRef< CBioseq > LoadBioseq(CConstRef< CSeq_id > seqid) const
bool ForEachEntry(CRef< CScope > scope, THandlerEntries handler)
bool ForEachBlob(THandlerBlobs)
CRef< CHugeFile > m_pHugeFile
void Open(const string &file_name, const set< TTypeInfo > *types=&g_supported_types)
static CSeq_entry_Handle GetTopLevelEntry(CBioseq_Handle beh)
bool Read(THandler handler, CRef< CSeq_id > seqid)
CHugeAsnReader & GetReader()
virtual ~CHugeFileProcess(void)
destructor
CHugeFile & GetFile()
std::function< void(CConstRef< CSubmit_block >, CRef< CSeq_entry >)> THandler
static const set< TTypeInfo > g_supported_types
CRef< CHugeAsnReader > m_pReader
void OpenFile(const string &file_name)
std::function< bool(CSeq_entry_Handle &seh)> THandlerEntries
CHugeFileProcess()
constructors
std::function< bool(CHugeFileProcess &)> THandlerBlobs
static bool IsSupported(TTypeInfo info)
std::function< bool(CHugeAsnReader *, const std::list< CConstRef< CSeq_id > > &)> THandlerIds
void Open(const std::string &filename, const set< TTypeInfo > *supported_types)
Definition: huge_file.cpp:135
CScope –.
Definition: scope.hpp:92
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
CTypeInfo class contains all information about C++ types (both basic and classes): members and layout...
Definition: typeinfo.hpp:76
Definition: set.hpp:45
void(*)(CSeq_entry_Handle seh, IWorkbench *wb, const CSerialObject &obj) handler
const char * file_name[]
static const struct type types[]
Definition: type.c:22
static string CreateAbsolutePath(const string &path, ERelativeToWhat rtw=eRelativeToCwd)
Get an absolute path from some, possibly relative, path.
Definition: ncbifile.cpp:665
void ResetHistory(EActionIfLocked action=eKeepIfLocked)
Clean all unused TSEs from the scope's cache and release the memory.
Definition: scope.cpp:325
void AddDataLoader(const string &loader_name, TPriority pri=kPriority_Default)
Add data loader by name.
Definition: scope.cpp:510
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
void RemoveDataLoader(const string &loader_name, EActionIfLocked action=eThrowIfLocked)
Revoke data loader from the scope.
Definition: scope.cpp:369
bool RevokeDataLoader(CDataLoader &loader)
Revoke previously registered data loader.
CSeq_entry_Handle GetParentEntry(void) const
Get parent Seq-entry handle.
bool IsTopLevelEntry(void) const
Check if this handle is top-level entry.
CSeq_entry_Handle GetTopLevelEntry(void) const
Get top level Seq-entry handle.
CSeq_entry_Handle GetParentEntry(void) const
Get parent Seq-entry handle.
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
TObjectType * GetPointer(void) THROWS_NONE
Get pointer.
Definition: ncbiobj.hpp:998
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
static MDB_envinfo info
Definition: mdb_load.c:37
Definition: fix_pub.hpp:45
The Object manager core.
SRegisterLoaderInfo –.
Modified on Tue Jun 18 13:41:02 2024 by modify_doxy.py rev. 669887