NCBI C++ ToolKit
huge_file_process.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: huge_file_process.cpp 101225 2023-11-16 19:11:06Z gotvyans $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Sergiy Gotvyanskyy
27 * File Description:
28 * Utility class for processing ASN.1 files using Huge Files approach
29 *
30 */
31 #include <ncbi_pch.hpp>
32 
33 #include <objects/seq/Seqdesc.hpp>
38 
42 
44 
46 #include <objects/seq/Bioseq.hpp>
48 
50 #include <objmgr/scope.hpp>
52 
54 
57 
58 namespace
59 {
60 
// File-local helper that remembers the data loader taken from a loader
// registration record (info.GetLoader()) for the lifetime of the helper.
// NOTE(review): the destructor's only statement is missing from this listing
// (numbering skips 69). Judging by the class name it presumably revokes
// m_loader from the object manager -- confirm against the repository copy.
61  class CAutoRevoker
62  {
63  public:
 // Accepts the registration record of any loader type TLoader and stores
 // the loader pointer it reports.
64  template<class TLoader>
65  CAutoRevoker(struct SRegisterLoaderInfo<TLoader>& info)
66  : m_loader{ info.GetLoader() } {}
67  ~CAutoRevoker()
68  {
70  }
71  private:
 // Loader captured at construction; null only if GetLoader() returned null.
72  CDataLoader* m_loader = nullptr;
73  };
74 
75 }
76 
77 
// Brace initialiser listing four type-info objects: Bioseq-set, Bioseq,
// Seq-entry and Seq-submit.
// NOTE(review): the declaration line is missing from this listing (numbering
// skips 78). Presumably this initialises the static set g_supported_types
// that the membership test `g_supported_types.find(info)` consults -- confirm.
79 {
80  CBioseq_set::GetTypeInfo(),
81  CBioseq::GetTypeInfo(),
82  CSeq_entry::GetTypeInfo(),
83  CSeq_submit::GetTypeInfo(),
84 };
85 
86 
// Member initialisers: allocate a new CHugeFile and a new CHugeAsnReader.
// NOTE(review): the constructor signature line is missing from this listing
// (numbering skips 87); presumably the default constructor -- confirm.
88  m_pHugeFile { new CHugeFile },
89  m_pReader{ new CHugeAsnReader }
90 {}
91 
92 
// Member initialisers: allocate a new CHugeFile and adopt the caller-supplied
// reader pointer pReader instead of creating one.
// NOTE(review): the signature line is missing from this listing (numbering
// skips 93). The page's member list shows m_pReader as
// unique_ptr<CHugeAsnReader>, so pReader is presumably owned by this object
// from here on -- confirm.
94  m_pHugeFile { new CHugeFile },
95  m_pReader { pReader }
96 {}
97 
// Constructor body: immediately opens the given file with the given set of
// accepted types by delegating to Open(file_name, types).
// NOTE(review): the signature / initialiser lines are missing from this
// listing (numbering skips 98-99).
100 {
101  Open(file_name, types);
102 }
103 
// Returns true when `info` is an element of g_supported_types, false otherwise.
// NOTE(review): the signature line is missing from this listing (numbering
// skips 104); the page's member list shows `static bool IsSupported(TTypeInfo info)`.
105 {
106  return g_supported_types.find(info) != g_supported_types.end();
107 }
108 
// Finishes by calling OpenReader().
// NOTE(review): both the signature and the statement that precedes
// OpenReader() are missing from this listing (numbering skips 109 and 111).
// Presumably this is Open(file_name, types) and the missing statement opens
// the file -- confirm against the repository copy before editing.
110 {
112  OpenReader();
113 }
114 
// NOTE(review): the signature and the single statement of this definition are
// missing from this listing (numbering skips 115 and 117), so its behaviour
// cannot be determined here. Restore both from the repository copy before
// editing.
116 {
118 }
119 
// Forwards file_name and types unchanged to m_pHugeFile->Open().
// NOTE(review): the signature line is missing from this listing (numbering
// skips 120).
121 {
122  m_pHugeFile->Open(file_name, types);
123 }
124 
// Opens m_pReader on the CHugeFile object held by m_pHugeFile. The second
// argument is passed as nullptr; its meaning is defined by
// CHugeAsnReader::Open, which is not visible here.
// NOTE(review): the signature line is missing from this listing (numbering
// skips 125).
126 {
127  m_pReader->Open(m_pHugeFile.get(), nullptr);
128 }
129 
// Empty body.
// NOTE(review): the signature line is missing from this listing (numbering
// skips 130); presumably the destructor `virtual ~CHugeFileProcess(void)`
// shown in the page's member list -- confirm.
131 {
132 }
133 
// Walks every blob the reader yields and passes Seq-entries to `handler`
// together with the reader's current submit block.
//
// Returns false when the reader has no first blob; otherwise returns true
// after the last blob has been processed.
//
// Two modes, selected by `seqid`:
//  - seqid empty: every entry returned by GetNextSeqEntry() for the blob is
//    handed to the handler, until GetNextSeqEntry() returns a null reference.
//  - seqid set: at most one entry per blob is produced, from
//    LoadBioseq(seqid); nothing is passed to the handler for blobs where the
//    lookup returns an empty reference.
//
// NOTE(review): the signature line is missing from this listing (numbering
// skips 134); the page's member list shows
// `bool Read(THandler handler, CRef<CSeq_id> seqid)`.
// NOTE(review): `handler` is invoked without an emptiness check; if it is an
// empty std::function (THandler is one per the member list) the call would
// throw -- confirm whether callers guarantee a non-empty handler.
135 {
136  if (!m_pReader->GetNextBlob()) {
137  return false;
138  }
139 
140  do
141  {
142  m_pReader->FlattenGenbankSet();
143  CRef<CSeq_entry> entry;
144  do
145  {
146  entry.Reset();
147 
148  if (seqid.Empty())
149  entry = m_pReader->GetNextSeqEntry();
150  else
151  {
152  auto seq = m_pReader->LoadBioseq(seqid);
153  if (seq.NotEmpty())
154  {
155  entry = Ref(new CSeq_entry);
156  entry->SetSeq(*seq);
 // When the reader reports a top entry, hand out a copy of it with the
 // loaded sequence appended to its seq-set instead of the bare sequence.
157  if (auto pTopEntry = m_pReader->GetTopEntry(); pTopEntry) {
158  auto pNewEntry = Ref(new CSeq_entry());
159  pNewEntry->Assign(*pTopEntry);
160  pNewEntry->SetSet().SetSeq_set().push_back(entry);
161  entry = pNewEntry;
162  }
163  }
164  }
165 
166  if (entry)
167  {
168  handler(m_pReader->GetSubmitBlock(), entry);
169  }
170  }
 // Repeat only in "all entries" mode; with a seqid the inner loop runs once.
171  while ( entry && seqid.Empty());
172  } while (m_pReader->GetNextBlob());
173 
174  return true;
175 }
176 
// For each blob the reader yields: flattens it, then calls `handler` with the
// reader pointer and the reader's list of top ids. Stops and returns false as
// soon as the handler returns false; returns true once all blobs are consumed
// (including when there were none).
// NOTE(review): the signature line is missing from this listing (numbering
// skips 177). The call shape matches the THandlerIds typedef in the page's
// member list; presumably this is the Read(THandlerIds) overload -- confirm.
178 {
179  while (m_pReader->GetNextBlob()) {
180  m_pReader->FlattenGenbankSet();
181  bool processed = handler(m_pReader.get(), m_pReader->GetTopIds());
182  if (!processed)
183  return false;
184  }
185 
186  return true;
187 }
188 
// Advances the reader by one blob. When a blob is available it is flattened
// and true is returned; otherwise returns false.
// NOTE(review): the signature line is missing from this listing (numbering
// skips 189) and the function name does not appear in the page's member
// list -- the name used for this block is an assumption; confirm.
190 {
191  if (m_pReader->GetNextBlob()) {
192  m_pReader->FlattenGenbankSet();
193  return true;
194  }
195 
196  return false;
197 }
198 
199 
// For each blob the reader yields: flattens it, then calls `handler(*this)`.
// Stops and returns false as soon as the handler returns false; returns true
// once all blobs are consumed (including when there were none).
// NOTE(review): the signature line is missing from this listing (numbering
// skips 200); the page's member list shows `bool ForEachBlob(THandlerBlobs)`.
201 {
202  while (m_pReader->GetNextBlob()) {
203  m_pReader->FlattenGenbankSet();
204  bool processed = handler(*this);
205  if (!processed)
206  return false;
207  }
208 
209  return true;
210 }
211 
// Visits the top-level entry of every top id of the current reader through a
// scope backed by a data loader named after the file.
//
// Returns false without doing anything when `handler` is empty; otherwise
// returns true after all top ids have been visited. Exceptions derived from
// std::exception are rethrown after the loader is removed from the scope.
//
// NOTE(review): the signature line is missing from this listing (numbering
// skips 212); the page's member list shows
// `bool ForEachEntry(CRef<CScope> scope, THandlerEntries handler)`.
213 {
214  if (!handler)
215  return false;
216 
 // The loader is registered under the absolute path of the input file.
217  string loader_name = CDirEntry::CreateAbsolutePath(GetFile().m_filename);
 // NOTE(review): the line that begins this call is missing from the listing
 // (numbering skips 218). Presumably `auto info = ...RegisterInObjectManager(`,
 // given the argument list below and the use of `info` afterwards -- confirm.
 // The priority is the literal 1; the trailing comment suggests
 // kPriority_Local was the intended named constant.
219  *CObjectManager::GetInstance(), loader_name, &GetReader(), CObjectManager::eNonDefault, 1); //CObjectManager::kPriority_Local);
220 
221  CAutoRevoker autorevoker(info);
222 
 // A caller may pass a null scope; a fresh one is created in that case.
223  if (!scope)
224  scope = Ref(new CScope(*CObjectManager::GetInstance()));
225 
226  scope->AddDataLoader(loader_name);
227 
228  try
229  {
230  for (auto id: GetReader().GetTopIds())
231  {
 // Inner braces end the lifetime of both handles before ResetHistory()
 // runs -- presumably so the scope can release the entry; confirm.
232  {
233  auto beh = scope->GetBioseqHandle(*id);
234  auto parent = beh.GetTopLevelEntry();
 // NOTE(review): the handler's return value is discarded here, so it
 // cannot stop the iteration early (THandlerEntries returns bool per
 // the member list) -- confirm this is intended.
235  handler(parent);
236  }
237  scope->ResetHistory();
238  }
239  }
 // NOTE(review): `e` is unused, and exceptions not derived from
 // std::exception would bypass the RemoveDataLoader() call below.
240  catch(const std::exception& e)
241  {
242  scope->RemoveDataLoader(loader_name);
243  throw;
244  }
245  scope->RemoveDataLoader(loader_name);
246 
247  return true;
248 }
249 
250 
// Climbs from the parent entry of `beh` towards the root and returns the
// entry at which the climb stops. The climb stops at the first entry that
//  - is itself a top-level entry, or
//  - has no parent entry, or
//  - has a parent that is a set with a class for which
//    CHugeAsnReader::IsHugeSet() returns true (the parent is not returned).
// If `beh` has no parent entry at all, the null handle obtained from
// GetParentEntry() is returned unchanged.
// NOTE(review): the signature line is missing from this listing (numbering
// skips 251); the page's member list shows
// `static CSeq_entry_Handle GetTopLevelEntry(CBioseq_Handle beh)`.
252 {
253  CSeq_entry_Handle parent = beh.GetParentEntry();
254  while(parent)
255  {
256  if (parent.IsTopLevelEntry())
257  break;
258 
259  if (auto temp = parent.GetParentEntry(); temp) {
260  if (temp.IsSet() && temp.GetSet().IsSetClass() &&
261  CHugeAsnReader::IsHugeSet(temp.GetSet().GetClass())) {
262  break;
263  }
264 
265  parent = temp;
266  }
267  else
268  break;
269  }
270 
271  return parent;
272 }
273 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CBioseq_Handle –.
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, const string &loader_name, CHugeAsnReader *reader, CObjectManager::EIsDefault is_default=CObjectManager::eNonDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_Default)
static bool IsHugeSet(CBioseq_set::TClass setClass)
bool ForEachEntry(CRef< CScope > scope, THandlerEntries handler)
bool ForEachBlob(THandlerBlobs)
void Open(const string &file_name, const set< TTypeInfo > *types=&g_supported_types)
static CSeq_entry_Handle GetTopLevelEntry(CBioseq_Handle beh)
bool Read(THandler handler, CRef< CSeq_id > seqid)
CHugeAsnReader & GetReader()
virtual ~CHugeFileProcess(void)
destructor
CHugeFile & GetFile()
std::function< void(CConstRef< CSubmit_block >, CRef< CSeq_entry >)> THandler
unique_ptr< CHugeAsnReader > m_pReader
static const set< TTypeInfo > g_supported_types
unique_ptr< CHugeFile > m_pHugeFile
void OpenFile(const string &file_name)
std::function< bool(CSeq_entry_Handle &seh)> THandlerEntries
CHugeFileProcess()
constructors
std::function< bool(CHugeFileProcess &)> THandlerBlobs
static bool IsSupported(TTypeInfo info)
std::function< bool(CHugeAsnReader *, const std::list< CConstRef< CSeq_id > > &)> THandlerIds
CScope –.
Definition: scope.hpp:92
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
CTypeInfo class contains all information about C++ types (both basic and classes): members and layout...
Definition: typeinfo.hpp:76
Definition: set.hpp:45
void(*)(CSeq_entry_Handle seh, IWorkbench *wb, const CSerialObject &obj) handler
const char * file_name[]
static string CreateAbsolutePath(const string &path, ERelativeToWhat rtw=eRelativeToCwd)
Get an absolute path from some, possibly relative, path.
Definition: ncbifile.cpp:665
void ResetHistory(EActionIfLocked action=eKeepIfLocked)
Clean all unused TSEs from the scope's cache and release the memory.
Definition: scope.cpp:325
void AddDataLoader(const string &loader_name, TPriority pri=kPriority_Default)
Add data loader by name.
Definition: scope.cpp:510
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
void RemoveDataLoader(const string &loader_name, EActionIfLocked action=eThrowIfLocked)
Revoke data loader from the scope.
Definition: scope.cpp:369
bool RevokeDataLoader(CDataLoader &loader)
Revoke previously registered data loader.
CSeq_entry_Handle GetParentEntry(void) const
Get parent Seq-entry handle.
bool IsTopLevelEntry(void) const
Check if this handle is top-level entry.
CSeq_entry_Handle GetTopLevelEntry(void) const
Get top level Seq-entry handle.
CSeq_entry_Handle GetParentEntry(void) const
Get parent Seq-entry handle.
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
static MDB_envinfo info
Definition: mdb_load.c:37
Definition: fix_pub.hpp:45
The Object manager core.
SRegisterLoaderInfo –.
static const struct type types[]
Definition: type.c:22
Modified on Sat Dec 09 04:49:35 2023 by modify_doxy.py rev. 669887