NCBI C++ ToolKit
asn_object_loader.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: asn_object_loader.cpp 46398 2021-04-13 19:16:35Z shkeda $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Roman Katargin
27 */
28 
29 
30 #include <ncbi_pch.hpp>
31 
33 
34 #include <util/icanceled.hpp>
35 
38 #include <gui/objutils/label.hpp>
39 
40 #include <util/format_guess.hpp>
43 
48 
54 
59 
73 
74 #include <serial/serial.hpp>
75 #include <serial/pack_string.hpp>
76 
77 #include <wx/filename.h>
78 
81 
84 
85 ///
86 /// Class CSniffReader reads NCBI object files in a number of different formats.
87 /// Uses trial-and-error deserialization for binary ASN.1 files
88 ///
89 class CSniffReader : public CObjectsSniffer, public IAsnLoadRegister
90 {
91 public:
93 
94  // Event function called when parser finds a top level object
95  virtual void OnTopObjectFoundPre(const CObjectInfo& object,
96  CNcbiStreampos stream_pos);
97 
98  // Event function called after top object deserialization
99  virtual void OnTopObjectFoundPost(const CObjectInfo& object);
100 
101  // Overload from CObjectsSniffer
102  virtual void OnObjectFoundPre(const CObjectInfo& object,
103  CNcbiStreampos stream_pos);
104 
105  // Overload from CObjectsSniffer
106  virtual void OnObjectFoundPost(const CObjectInfo& object);
107 
108  virtual void Reset();
109 
110  bool ObjectLoaded() const { return m_ObjectLoaded; }
111 
112 /// IAsnLoadRegister implementation
113  virtual void Register(CObjectTypeInfo ti);
114 
115 private:
116  unsigned int m_ObjLevel; // Object level counter, used to identify
117  // the TSE
118 
121 };
122 
124 {
125 }
126 
128  : m_FileNames(filenames)
129 {
130 }
131 
132 CAsnObjectLoader::CAsnObjectLoader(const string& asnTextData)
133  : m_AsnTextData(asnTextData)
134 {
135 }
136 
138 {
139  return m_Objects;
140 }
141 
143 {
144  return "Loading NCBI ASN.1 Files";
145 }
146 
148 {
149  return true;
150 }
151 
152 static const wxChar* kMemoryObject = wxT("Memory object");
153 
155 {
157  m_Scope.Reset(new CScope(*obj_mgr));
158  m_Scope->AddDefaults();
159 }
160 
162 {
163  Init();
164 
165  ITERATE(vector<wxString>, it, m_FileNames) {
166  if (canceled.IsCanceled())
167  return false;
168 
169  const wxString& fn = *it;
170  try {
171  m_CurrentFileName = *it;
172 
173  CCompressedFile file(fn);
174  CFormatGuess::EFormat fmt = file.GuessFormat();
175  file.Reset();
176 
178  wxString ext;
179  switch (fmt) {
181  sfmt = eSerial_AsnBinary;
182  break;
184  sfmt = eSerial_AsnText;
185  break;
186  case CFormatGuess::eXml:
187  sfmt = eSerial_Xml;
188  break;
190  wxFileName::SplitPath(fn, nullptr, nullptr, nullptr, &ext);
191  if (ext.IsSameAs(wxT("asn"), false)) {
192  LOG_POST(Info << "Format Guess failed, format assumed to be BinaryASN, based on file extension");
193  sfmt = eSerial_AsnBinary;
194  }
195  break;
196  default:
197  break;
198  }
199 
200  if (sfmt == eSerial_None) {
201  x_UpdateHTMLResults(fn, 0, "Serial format could not be determined.");
202  }
203  else {
205  LoadFromStream(file.GetIstream(), objects, sfmt, &canceled);
206  for (auto& o : objects)
207  m_Objects.push_back(o);
208  }
209  }
210  catch (const CException& e) {
211  string err_msg = e.what();
212  vector<string> phrases;
213  NStr::Split(err_msg, "\r\n", phrases);
214  x_UpdateHTMLResults(fn, 0, phrases[1]);
215  }
216  catch (const exception& e) {
217  x_UpdateHTMLResults(fn, 0, e.what());
218  }
219 
220  m_CurrentFileName.clear();
221  } // ITERATE
222 
223  if (!m_AsnTextData.empty()) {
224  try {
226 
229  LoadFromStream(istr, objects, eSerial_AsnText, &canceled);
230 
231  for (auto& o : objects)
232  m_Objects.push_back(o);
233  }
234  catch (const CException& e) {
235  string err_msg = e.what();
236  vector<string> phrases;
237  NStr::Split(err_msg, "\r\n", phrases);
238  x_UpdateHTMLResults(m_CurrentFileName, 0, phrases[1]);
239  }
240  catch (const exception& e) {
242  }
243 
244  m_CurrentFileName.clear();
245  }
246 
247  return true;
248 }
249 
251 {
252  if (fmt == eSerial_None)
253  return;
254 
255  // use object sniffer to guess
256  CSniffReader sniffer(*this);
257  unique_ptr<CObjectIStream>
258  sinput(CObjectIStream::Open(fmt, istr));
259 
260  // memory profile optimization:
261  // we should use pooled strings to reduce the memory burden
262  // (from Eugene Vasilchenko)
263  //
266  info.FindVariant("str")
267  .SetLocalReadHook(*sinput, new CPackStringChoiceHook);
268 
270  info.FindMember("key")
271  .SetLocalReadHook(*sinput, new CPackStringClassHook(32, 128));
272 
274  info.FindMember("db")
275  .SetLocalReadHook(*sinput, new CPackStringClassHook);
276 
277  info = CType<CGb_qual>();
278  info.FindMember("qual")
279  .SetLocalReadHook(*sinput, new CPackStringClassHook);
280 
281  sinput->UseMemoryPool();
282 
283  // now, probe!
284  //
285 
286  sniffer.AddCandidate(ncbi::objects::CSeq_table::GetTypeInfo());
287 
288  if (sinput->GetDataFormat() == eSerial_AsnText || sinput->GetDataFormat() == eSerial_Xml)
289  sniffer.SetReportDataErrors();
290 
291  sniffer.Probe(*sinput);
292  if (!sniffer.ObjectLoaded()) {
293  LOG_POST(Info << "No Top objects found:\n" << m_CurrentFileName.ToUTF8());
294  }
295  else {
296  // We don't want to overwhelm the project tree view by creating
297  // a large number of project items. Specifically, we need to
298  // address many (>10) seq-aligns or seq-feats problem.
299  TOrigObjects align_objs;
300  TOrigObjects feat_objs;
301  TOrigObjects other_objs;
303  if (dynamic_cast<CSeq_align*>(iter->GetPointer())) {
304  align_objs.push_back(*iter);
305  }
306  else if (dynamic_cast<CSeq_feat*>(iter->GetPointer())) {
307  feat_objs.push_back(*iter);
308  }
309  else {
310  other_objs.push_back(*iter);
311  }
312  }
313  m_OrigObjects.clear();
314 
315  string annot_title;
316  if (!m_CurrentFileName.empty()) {
317  wxString fbase;
318  wxFileName::SplitPath(m_CurrentFileName, 0, &fbase, 0);
319  annot_title = ToStdString(fbase);
320  }
321 
322  const size_t max_obj_prj = 10;
323  if (align_objs.size() > max_obj_prj) {
324  // combine all seq-aligns into one seq-annot
325  CRef<CSeq_annot> annot(new CSeq_annot);
327  annot->SetTitleDesc(annot_title);
328  CSeq_annot::TData::TAlign& aligns = annot->SetData().SetAlign();
329  NON_CONST_ITERATE(TOrigObjects, iter, align_objs) {
330  CRef<CSeq_align> align(dynamic_cast<CSeq_align*>(iter->GetPointer()));
331  aligns.push_back(align);
332  }
333  other_objs.push_back(CRef<CObject>(annot.GetPointer()));
334 
335  }
336  else if (!align_objs.empty()) {
337  std::copy(align_objs.begin(), align_objs.end(), back_inserter(other_objs));
338  }
339 
340  if (feat_objs.size() > max_obj_prj) {
341  // combine all seq-feats into one seq-annot
342  CRef<CSeq_annot> annot(new CSeq_annot);
344  annot->SetTitleDesc(annot_title);
345  CSeq_annot::TData::TFtable& feats = annot->SetData().SetFtable();
346  NON_CONST_ITERATE(TOrigObjects, iter, feat_objs) {
347  CRef<CSeq_feat> feat(dynamic_cast<CSeq_feat*>(iter->GetPointer()));
348  feats.push_back(feat);
349  }
350 
351  other_objs.push_back(CRef<CObject>(annot.GetPointer()));
352  }
353  else if (!feat_objs.empty()) {
354  std::copy(feat_objs.begin(), feat_objs.end(), back_inserter(other_objs));
355  }
356 
357 
358  string annot_name = annot_title.empty() ? "Undefined" : annot_title;
359 
360  // initialize a label for each object added
361  NON_CONST_ITERATE(TOrigObjects, iter, other_objs) {
362  CSerialObject* so = static_cast<CSerialObject*>(iter->GetPointer());
363 
364  // if annot name is empty assign the file name
365  for (CTypeIterator<CAnnotdesc> it(*so); it; ++it) {
366  if (it->IsName() && it->GetName().empty())
367  it->SetName(annot_name);
368  }
369 
370  CBioTreeContainer* btrc = dynamic_cast<CBioTreeContainer*>(so);
371  if (btrc && !btrc->IsSetLabel()) {
372  // if bio-tree label is not set explicitly -- we assume the filename
373  btrc->SetLabel(annot_title);
374  }
375 
376  string label;
378  if (label.empty() && btrc && btrc->IsSetLabel()) {
379  label = btrc->GetLabel();
380  }
381 
383  string fileName(wxFileName(m_CurrentFileName).GetFullName().ToAscii());
384  label = fileName + " - " + label;
385  }
386 
387  // Convert Bioseq and Bioseq-set to Seq-entry
388  CRef<CSeq_entry> se;
389  if (CBioseq_set* bioseq_set = dynamic_cast<CBioseq_set*>(so)) {
390  se.Reset(new CSeq_entry());
391  se->SetSet(*bioseq_set);
392  }
393  else if (CBioseq* bioseq = dynamic_cast<CBioseq*>(so)) {
394  se.Reset(new CSeq_entry());
395  se->SetSeq(*bioseq);
396  }
397  SObject object(se ? *se : *so, label);
398 
400  string filePath(m_CurrentFileName.ToUTF8());
401  object.SetFileName(filePath);
402  }
403  objects.push_back(object);
404  }
405  }
406 }
407 
409 {
410  if (objInfo.GetTypeFamily() == eTypeFamilyClass ||
411  objInfo.GetTypeFamily() == eTypeFamilyChoice) {
412 
413  CObject* obj = static_cast<CObject*>(objInfo.GetObjectPtr());
414  m_OrigObjects.push_back(CRef<CObject>(obj));
415  return true;
416  }
417 
418  return false;
419 }
420 
422 {
423  x_ShowErrorsDlg(wxT("ASN import errors"));
424  return true;
425 }
426 
427 /////////////////////////////////////////////////////////////////////////////
428 
429 
431 : m_ObjLevel(0), m_Loader(loader), m_ObjectLoaded(false)
432 {
433  /// basic top-level data model types:
438 
439  /// annotations:
441 
442  /// alignments:
447 
448  /// locations/identifiers:
451 
452  /// features:
454 
455  /// miscellaneous:
463 
464 
465  /// add types from certain modules,
466  /// being sure not to duplicate what's above (that's trouble)
467 
468  /// collect types added above
470  const TCandidates& cands = GetCandidates();
471  ITERATE (TCandidates, cand, cands) {
472  types.insert(cand->type_info.GetTypeInfo()->GetName());
473  }
474 
475  /// register a bunch of modules
476  vector<string> modules;
477 
479  modules.push_back("NCBI-Sequence");
480 
482  modules.push_back("NCBI-Seqloc");
483 
485  modules.push_back("NCBI-Seqalign");
486 
488  modules.push_back("NCBI-Seqfeat");
489 
491  modules.push_back("NCBI-Seqset");
492 
494  modules.push_back("NCBI-Submit");
495 
497  modules.push_back("NCBI-BioTree");
498 
500  modules.push_back("NCBI-Entrezgene");
501 
503  modules.push_back("NCBI-GUI-Project");
504 
506  modules.push_back("NCBI-PubMed");
507 
508  /// add all types in those modules as candidates, avoiding duplication
509  ITERATE (vector<string>, module, modules) {
511  CClassTypeInfoBase::GetRegisteredClassNames(*module, type_names);
513  if (types.find(*type_name) == types.end()) {
514  const CTypeInfo* type_info =
516  AddCandidate(type_info);
517  }
518  }
519  }
520 
521  vector<IAsnLoadExtension*> extensions;
523 
524  ITERATE(vector<IAsnLoadExtension*>, it, extensions)
525  (*it)->RegisterAsnTypes(*this);
526 }
527 
529 {
530  AddCandidate(ti);
531 }
532 
533 
534 void CSniffReader::Reset()
535 {
536  m_ObjLevel = 0;
537 }
538 
540  CNcbiStreampos stream_pos)
541 {
542  if (m_ObjLevel == 0) {
543  OnTopObjectFoundPre(object, stream_pos);
544  }
545  ++m_ObjLevel;
546 }
547 
548 
550 {
551  _ASSERT(m_ObjLevel > 0);
552  if (m_ObjLevel == 1) {
553  OnTopObjectFoundPost(object);
554  }
555  --m_ObjLevel;
556 }
557 
558 
559 void CSniffReader::OnTopObjectFoundPre(const CObjectInfo& /*object*/,
560  CNcbiStreampos /*stream_pos*/)
561 {
562 }
563 
564 
566 {
568 }
569 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
void NCBI_BioTree_RegisterModuleClasses(void)
void NCBI_Entrezgene_RegisterModuleClasses(void)
void NCBI_GUI_Project_RegisterModuleClasses(void)
void NCBI_PubMed_RegisterModuleClasses(void)
void NCBI_Seqalign_RegisterModuleClasses(void)
void NCBI_Seqfeat_RegisterModuleClasses(void)
void NCBI_Seqloc_RegisterModuleClasses(void)
void NCBI_Seqset_RegisterModuleClasses(void)
void NCBI_Sequence_RegisterModuleClasses(void)
void NCBI_Submit_RegisterModuleClasses(void)
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
#define EXT_POINT__ASN_LOAD_EXTENSION
USING_SCOPE(objects)
static const wxChar * kMemoryObject
CAsnObjectLoader.
EFormat
The formats are checked in the same order as declared here.
@ eBinaryASN
Binary ASN.1.
@ eUnknown
unknown format
@ eTextASN
Text ASN.1.
CObjectInfo –.
Definition: objectinfo.hpp:597
CObjectTypeInfo –.
Definition: objectinfo.hpp:94
CObject –.
Definition: ncbiobj.hpp:180
Serialized objects sniffer.
Definition: obj_sniff.hpp:65
const TCandidates & GetCandidates() const
Return reference on the internal vector of object candidates.
Definition: obj_sniff.hpp:115
vector< SCandidateInfo > TCandidates
Definition: obj_sniff.hpp:101
void SetReportDataErrors(bool report=true)
Report syntax errors to client (as an exception)
Definition: obj_sniff.hpp:174
void AddCandidate(CObjectTypeInfo ti, EEventCallBackMode emode=eCallAlways)
Add new possible type to the recognition list.
Definition: obj_sniff.cpp:156
void Probe(CObjectIStream &input)
The main worker function.
Definition: obj_sniff.cpp:162
void x_ShowErrorsDlg(const wxString &title)
void x_UpdateHTMLResults(const wxString &object, objects::ILineErrorListener *errCont, const string &exception="", const string &error_msg="", const wxString &objectName=wxT("File:"))
CScope –.
Definition: scope.hpp:92
void SetNameDesc(const string &name)
Definition: Seq_annot.cpp:66
void SetTitleDesc(const string &title)
Definition: Seq_annot.cpp:96
Definition: Seq_entry.hpp:56
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
Base class for all serializable objects.
Definition: serialbase.hpp:150
Class CSniffReader reads NCBI object files in number of different formats.
Definition: fileloader.cpp:303
virtual void OnTopObjectFoundPre(const CObjectInfo &object, CNcbiStreampos stream_pos)
virtual void OnObjectFoundPost(const CObjectInfo &object)
Event handling virtual function, called when candidate is found and deserialized.
Definition: fileloader.cpp:324
bool ObjectLoaded() const
virtual void OnTopObjectFoundPre(const CObjectInfo &object, CNcbiStreampos stream_pos)
Definition: fileloader.cpp:308
CAsnObjectLoader & m_Loader
virtual void Register(CObjectTypeInfo ti)
IAsnLoadRegister implementation.
virtual void OnObjectFoundPre(const CObjectInfo &object, CNcbiStreampos stream_pos)
Event handling virtual function, called when candidate is found but before deserialization.
virtual void OnTopObjectFoundPost(const CObjectInfo &object)
virtual void OnTopObjectFoundPost(const CObjectInfo &object)
Definition: fileloader.cpp:434
unsigned int m_ObjLevel
Definition: fileloader.cpp:339
virtual void OnObjectFoundPost(const CObjectInfo &object)
Event handling virtual function, called when candidate is found and deserialized.
virtual void Reset()
Event indicates that the sniffer object needs to reset its status and get ready for the next probing.
Definition: fileloader.cpp:332
virtual void Reset()
Event indicates that the sniffer object needs to reset its status and get ready for the next probing.
virtual void OnObjectFoundPre(const CObjectInfo &object, CNcbiStreampos stream_pos)
Event handling virtual function, called when candidate is found but before deserialization.
Definition: fileloader.cpp:315
CSniffReader(list< CRef< CSerialObject > > &Results)
Definition: fileloader.cpp:342
CTypeInfo class contains all information about C++ types (both basic and classes): members and layout...
Definition: typeinfo.hpp:76
Template class for iteration on objects of class C.
Definition: iterator.hpp:673
IAsnLoadRegister - interface to register ASN types that could be loaded by ASN loader.
Interface for testing cancellation request in a long lasting operation.
Definition: icanceled.hpp:51
vector< SObject > TObjects
#define false
Definition: bool.h:36
static const struct type types[]
Definition: type.c:22
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
Definition: ncbiexpt.cpp:342
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
virtual bool Execute(ICanceled &canceled)
bool x_OnTopObjectFound(const CObjectInfo &objInfo)
virtual bool PostExecute()
virtual bool PreExecute()
const string m_AsnTextData
vector< wxString > m_FileNames
TOrigObjects m_OrigObjects
Original object list for current file.
CRef< objects::CScope > m_Scope
virtual TObjects & GetObjects()
void LoadFromStream(CNcbiIstream &istr, TObjects &objects, ESerialDataFormat fmt, ICanceled *canceled=0)
vector< CRef< CObject > > TOrigObjects
virtual string GetDescription() const
TObjects m_Objects
Final complete object list with description set.
static void GetLabel(const CObject &obj, string *label, ELabelType type=eDefault)
Definition: label.cpp:140
void GetExtensionAsInterface(const string &ext_point_id, vector< CIRef< I > > &interfaces)
GetExtensionAsInterface() is a helper function that extracts all extensions implementing the specifie...
@ eDefault
Definition: label.hpp:73
ESerialDataFormat
Data file format.
Definition: serialdef.hpp:71
@ eTypeFamilyClass
Definition: serialdef.hpp:140
@ eTypeFamilyChoice
Definition: serialdef.hpp:141
@ eSerial_AsnText
ASN.1 text.
Definition: serialdef.hpp:73
@ eSerial_Xml
XML.
Definition: serialdef.hpp:75
@ eSerial_None
Definition: serialdef.hpp:72
@ eSerial_AsnBinary
ASN.1 binary.
Definition: serialdef.hpp:74
TObjectPtr GetObjectPtr(void) const
Get pointer to object.
static CObjectIStream * Open(ESerialDataFormat format, CNcbiIstream &inStream, bool deleteInStream)
Create serial object reader and attach it to an input stream.
Definition: objistr.cpp:195
ETypeFamily GetTypeFamily(void) const
Get data type family.
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
TObjectType * GetPointer(void) THROWS_NONE
Get pointer.
Definition: ncbiobj.hpp:998
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
virtual bool IsCanceled(void) const =0
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
IO_PREFIX::streampos CNcbiStreampos
Portable alias for streampos.
Definition: ncbistre.hpp:134
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3452
static void GetRegisteredClassNames(const string &module, TRegClasses &names)
Definition: classinfob.cpp:275
static TTypeInfo GetClassInfoByName(const string &name)
Definition: classinfob.cpp:244
static const char label[]
void SetLabel(const TLabel &value)
Assign a value to Label data member.
bool IsSetLabel(void) const
bio-tree label (short name) Check if a value has been assigned to Label data member.
const TLabel & GetLabel(void) const
Get the Label member data.
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
list< CRef< CSeq_align > > TAlign
Definition: Seq_annot_.hpp:194
list< CRef< CSeq_feat > > TFtable
Definition: Seq_annot_.hpp:193
FILE * file
static MDB_envinfo info
Definition: mdb_load.c:37
#define wxT(x)
Definition: muParser.cpp:41
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
The Object manager core.
static int filenames
Definition: pcre2grep.c:247
#define _ASSERT
static const char * type_name(CS_INT value)
Definition: will_convert.c:122
string ToStdString(const wxString &s)
Definition: wx_utils.hpp:161
Modified on Fri Sep 20 14:57:34 2024 by modify_doxy.py rev. 669887