NCBI C++ ToolKit
thread_state.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: thread_state.cpp 102322 2024-04-23 12:29:28Z foleyjp $
2 * ========================================================================== =
3 *
4 *PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software / database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software / database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 *========================================================================== =
25 *
26 *Author: Frank Ludwig
27 *
28 * File Description :
29 *validator
30 *
31 */
32 
33 #include <ncbi_pch.hpp>
34 #include <common/ncbi_source_ver.h>
35 #include <corelib/ncbistd.hpp>
36 #include <corelib/ncbistre.hpp>
37 #include <corelib/ncbiapp.hpp>
38 #include <corelib/ncbienv.hpp>
39 #include <corelib/ncbiargs.hpp>
40 #include <corelib/error_codes.hpp>
41 
42 #include <serial/serial.hpp>
43 #include <serial/objistr.hpp>
44 #include <serial/objectio.hpp>
45 
47 #include <connect/ncbi_util.h>
48 
49 // Objects includes
53 #include <objects/seq/Bioseq.hpp>
57 #include <objects/seq/Seq_inst.hpp>
58 #include <objects/seq/Pubdesc.hpp>
59 #include <objects/seq/Seqdesc.hpp>
69 
71 
72 // Object Manager includes
74 #include <objmgr/scope.hpp>
75 #include <objmgr/seq_vector.hpp>
76 #include <objmgr/seq_descr_ci.hpp>
77 #include <objmgr/feat_ci.hpp>
78 #include <objmgr/align_ci.hpp>
79 #include <objmgr/graph_ci.hpp>
80 #include <objmgr/seq_annot_ci.hpp>
81 #include <objmgr/bioseq_ci.hpp>
83 
92 #include <future>
93 #include <util/message_queue.hpp>
94 #include "app_config.hpp"
95 #include "thread_state.hpp"
100 #include "message_handler.hpp"
101 
102 using namespace ncbi;
104 USING_SCOPE(validator);
106 
// File-local helpers (anonymous namespace: not visible outside this translation unit).
107 namespace
108 {
109 
// Small guard object that remembers the data loader obtained from a loader
// registration result (SRegisterLoaderInfo<TLoader>::GetLoader()).
// NOTE(review): the destructor body (listing line 118) is missing from this
// extract; presumably it revokes m_loader from the object manager when the
// guard goes out of scope -- confirm against the repository source.
110  class CAutoRevoker
111  {
112  public:
// Captures the loader pointer from the registration info; the info object
// itself is not stored.
113  template<class TLoader>
114  CAutoRevoker(struct SRegisterLoaderInfo<TLoader>& info)
115  : m_loader{ info.GetLoader() } {}
116  ~CAutoRevoker()
117  {
119  }
120  private:
// Loader captured at construction; defaults to nullptr.
121  CDataLoader* m_loader = nullptr;
122  };
123 }
124 
125 
126 const set<TTypeInfo> s_known_types{
127  CSeq_submit::GetTypeInfo(), CSeq_entry::GetTypeInfo(), CSeq_annot::GetTypeInfo(),
128  CSeq_feat::GetTypeInfo(), CBioSource::GetTypeInfo(), CPubdesc::GetTypeInfo(),
129  CBioseq_set::GetTypeInfo(), CBioseq::GetTypeInfo(), CSeqdesc::GetTypeInfo(),
130 };
131 
132 
134 {
135  CRef<objects::CSeq_entry> entry(new objects::CSeq_entry());
136  entry->SetSeq().SetInst().SetMol(objects::CSeq_inst::eMol_dna);
137  entry->SetSeq().SetInst().SetRepr(objects::CSeq_inst::eRepr_raw);
138  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCAA");
139  entry->SetSeq().SetInst().SetLength(60);
140 
141  CRef<objects::CSeq_id> id(new objects::CSeq_id());
142  id->SetLocal().SetStr("good");
143  entry->SetSeq().SetId().push_back(id);
144 
145  CRef<objects::CSeqdesc> mdesc(new objects::CSeqdesc());
146  mdesc->SetMolinfo().SetBiomol(objects::CMolInfo::eBiomol_genomic);
147  entry->SetSeq().SetDescr().Set().push_back(mdesc);
148 
149  return entry;
150 }
151 
152 // ============================================================================
154 // ============================================================================
155  mAppConfig(appConfig)
156 {
157  m_Options = appConfig.m_Options;
158  m_pContext.reset(new SValidatorContext());
160  m_pContext->m_taxon_update = taxon;
161 }
162 
164 {
165 }
166 
167 
169  const CObjectInfo::CMemberIterator& member,
170  IMessageHandler& msgHandler)
171 {
172  m_Level++;
173 
174  if (m_Level == 1) {
175  size_t n = 0;
176  // Read each element separately to a local TSeqEntry,
177  // process it somehow, and... not store it in the container.
178  for (CIStreamContainerIterator i(in, member); i; ++i) {
179  try {
180  // Get seq-entry to validate
182  i >> *se;
183 
184  // Validate Seq-entry
185  CValidator validator(*m_ObjMgr, m_pContext);
186  CRef<CScope> scope = BuildScope();
187  CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(*se);
188 
189  CBioseq_CI bi(seh);
190  if (bi) {
191  m_CurrentId = "";
192  bi->GetId().front().GetSeqId()->GetLabel(&m_CurrentId);
193  if (!mAppConfig.mQuiet) {
194  LOG_POST_XX(Corelib_App, 1, m_CurrentId);
195  }
196  }
197 
198  if (mAppConfig.mDoCleanup) {
199  m_Cleanup.SetScope(scope);
200  m_Cleanup.BasicCleanup(*se);
201  }
202 
203  if (mAppConfig.mOnlyAnnots) {
204  for (CSeq_annot_CI ni(seh); ni; ++ni) {
205  const CSeq_annot_Handle& sah = *ni;
206  validator.Validate(sah, m_Options, msgHandler);
207  m_NumRecords++;
208  }
209  }
210  else {
212  validator.Validate(seh, m_Options, msgHandler);
213  m_NumRecords++;
214  double elapsed = sw.Elapsed();
215  if (elapsed > m_Longest) {
216  m_Longest = elapsed;
218  }
219  }
220  scope->RemoveTopLevelSeqEntry(seh);
221  scope->ResetHistory();
222  n++;
223  }
224  catch (exception&) {
225  if (!mAppConfig.mContinue) {
226  throw;
227  }
228  // should we issue some sort of warning?
229  }
230  }
231  }
232  else {
233  in.ReadClassMember(member);
234  }
235  m_Level--;
236 }
237 
238 
239 // ============================================================================
241 // ============================================================================
242 {
243  CRef<CScope> scope(new CScope(*m_ObjMgr));
244  scope->AddDefaults();
245  return scope;
246 }
247 
248 
249 // ============================================================================
// Opens the input as an ASN.1 object stream.
//
// asn_info  [out] set to contentInfo.mInfoGenbank.mTypeInfo when an object
//                 stream is created; left untouched otherwise.
// fname     path of the file to open; when empty, NcbiCin is read instead.
// returns   the object stream, or an empty unique_ptr when the second switch
//           falls through to its default branch.
//
// NOTE(review): several listing lines are missing from this extract (253, 263,
// 267-268, 290-291). They presumably declare `own`, `method` and `format` and
// provide the ASN.1 `case` labels of the second switch -- confirm against the
// repository source before editing this function.
250 unique_ptr<CObjectIStream> CAsnvalThreadState::OpenFile(TTypeInfo& asn_info, const string& fname) const
251 // ============================================================================
252 {
254  unique_ptr<CNcbiIstream> hold_stream;
// Default input is standard input; replaced below when a file name is given.
255  CNcbiIstream* InputStream = &NcbiCin;
256 
257  if (!fname.empty()) {
// A stream opened here is marked as owned, unlike NcbiCin.
258  own = eTakeOwnership;
259  hold_stream = make_unique<CNcbiIfstream>(fname, ios::binary);
260  InputStream = hold_stream.get();
261  }
262 
264 
265  CFormatGuessEx FG(*InputStream);
266  CFileContentInfo contentInfo;
// Map a detected compression format to the matching decompression method.
269  switch (format)
270  {
271  case CFormatGuess::eGZip: method = CCompressStream::eGZipFile; break;
272  case CFormatGuess::eBZip2: method = CCompressStream::eBZip2; break;
273  case CFormatGuess::eLzo: method = CCompressStream::eLZO; break;
274  default: method = CCompressStream::eNone; break;
275  }
276  if (method != CCompressStream::eNone)
277  {
// Wrap the raw stream in a decompressor. The decompressor is constructed with
// `own`, so hold_stream gives up the raw stream (release) before taking the
// decompressor, then the format is guessed again on the decompressed data.
278  CDecompressIStream* decompress(new CDecompressIStream(*InputStream, method, CCompressStream::fDefault, own));
279  hold_stream.release();
280  hold_stream.reset(decompress);
281  InputStream = hold_stream.get();
282  own = eTakeOwnership;
283  CFormatGuessEx fg(*InputStream);
284  format = fg.GuessFormatAndContent(contentInfo);
285  }
286 
287  unique_ptr<CObjectIStream> objectStream;
288  switch (format)
289  {
// Binary vs. text ASN.1 is chosen from `format`; the stream is passed with
// `own`, and hold_stream is released so it does not also delete the stream.
292  objectStream.reset(CObjectIStream::Open(format == CFormatGuess::eBinaryASN ? eSerial_AsnBinary : eSerial_AsnText, *InputStream, own));
293  hold_stream.release();
294  asn_info = contentInfo.mInfoGenbank.mTypeInfo;
295  break;
296  default:
// Any other format: objectStream stays empty and asn_info is not modified.
297  break;
298  }
299  return objectStream;
300 }
301 
302 
303 static void s_StartWrite(IMessageHandler& msgHandler) // Commence write if necessary
304 { // does nothing if msgHandler.InvokeWrite() returns false
305  if (msgHandler.InvokeWrite()) {
306  msgHandler.RequestStop();
307  msgHandler.Write();
308  }
309 }
310 
312 {
313  CNcbiOstrstream os;
314  os << "Unable to read invalid ASN.1";
315  m_ReadFailure = true;
316 
317  const CSerialException* p_serial_exception = dynamic_cast<const CSerialException*>(p_exception);
318  if (p_serial_exception && mAppConfig.mVerbosity != CAppConfig::eVerbosity_XML) {
319  if (mpIstr) {
320  os << ": " << mpIstr->GetPosition();
321  }
322  if (p_serial_exception->GetErrCode() == CSerialException::eEOF) {
323  os << ": unexpected end of file";
324  }
326  // manually call ReportAll(0) because what() includes a lot of info
327  // that's not of interest to the submitter such as stacktraces and
328  // GetMsg() doesn't include enough info.
329  os << ": " + p_exception->ReportAll(0);
330  }
331  }
332 
333  string errstr = CNcbiOstrstreamToString(os);
334  // newlines don't play well with XML
335  errstr = NStr::Replace(errstr, "\n", " * ");
336  errstr = NStr::Replace(errstr, " * ", " * ");
337 
339  s_StartWrite(msgHandler);
340 }
341 
342 
343 
345 {
346  CRef<CSeq_submit> submit(new CSeq_submit);
347 
348  auto hook = [this, &msgHandler](CObjectIStream& in, const CObjectInfo::CMemberIterator& member) {ReadClassMember(in, member, msgHandler); };
350  SetLocalReadHook(info.FindMember("seq-set"), *mpIstr, hook);
351 
352  // Read the CSeq_submit, it will call the hook object each time we
353  // encounter a Seq-entry
354  try {
355  *mpIstr >> *submit;
356  }
357  catch (const CException&) {
358  LOG_POST_XX(Corelib_App, 1, "FAILURE: Record is not a batch Seq-submit, do not use -a u to process.");
359  ++m_Reported;
360  }
361 }
362 
363 
364 
366 {
367  CRef<CBioseq_set> seqset(new CBioseq_set);
368 
369  // Register the Seq-entry hook
370  auto hook = [this, &msgHandler](CObjectIStream& in, const CObjectInfo::CMemberIterator& member) {ReadClassMember(in, member, msgHandler); };
372  SetLocalReadHook(info.FindMember("seq-set"), *mpIstr, hook);
373 
374 
375  // Read the CBioseq_set, it will call the hook object each time we
376  // encounter a Seq-entry
377  try {
378  *mpIstr >> *seqset;
379  }
380  catch (const CException&) {
381  LOG_POST_XX(Corelib_App, 1, "FAILURE: Record is not a batch Bioseq-set, do not use -a t to process.");
382  ++m_Reported;
383  }
384 }
385 
386 
387 
389 {
390  // Validate Seq-entry
391  CValidator validator(*m_ObjMgr, m_pContext);
392  CRef<CScope> scope = BuildScope();
393  if (mAppConfig.mDoCleanup) {
394  m_Cleanup.SetScope(scope);
396  }
397  CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(se);
398  CBioseq_CI bi(seh);
399  if (bi) {
400  m_CurrentId = "";
401  bi->GetId().front().GetSeqId()->GetLabel(&m_CurrentId);
402  if (!mAppConfig.mQuiet) {
403  LOG_POST_XX(Corelib_App, 1, m_CurrentId);
404  }
405  }
406 
407  if (mAppConfig.mOnlyAnnots) {
408  for (CSeq_annot_CI ni(seh); ni; ++ni) {
409  const CSeq_annot_Handle& sah = *ni;
410  validator.Validate(sah, m_Options, msgHandler);
411  m_NumRecords++;
412  }
413  return;
414  }
415  CValidErrorSuppress::TCodes suppressed;
417  validator.Validate(se, scope, m_Options, msgHandler, &suppressed);
418  m_NumRecords++;
419  return;
420 }
421 
422 
424 {
426 
428 
429  CValidator validator(*m_ObjMgr, m_pContext);
430  CRef<CScope> scope = BuildScope();
432 
433  validator.Validate(*sd, *ctx, 0, msgHandler);
434  m_NumRecords++;
435 }
436 
437 
439 {
440  // Get seq-entry to validate
442 
443  auto se = Ref(new CSeq_entry);
444  se->SetSeq(*bioseq);
445 
446  // Validate Seq-entry
447  ProcessSeqEntry(*se, msgHandler);
448 }
449 
451 {
452  // Get seq-entry to validate
454 
455  auto se = Ref(new CSeq_entry);
456  se->SetSet(*bioseqset);
457 
458  // Validate Seq-entry
459  ProcessSeqEntry(*se, msgHandler);
460 }
461 
462 
464 {
466 
467  CValidator validator(*m_ObjMgr, m_pContext);
468  CRef<CScope> scope = BuildScope();
469  m_NumRecords++;
470  validator.Validate(*pd, scope, m_Options, msgHandler);
471 }
472 
473 
475 {
476  // Get seq-entry to validate
478 
479  ProcessSeqEntry(*se, msgHandler);
480 }
481 
482 
484 {
485 
487 
488  m_CurrentId = "";
489  // Validate Seq-submit
490  CValidator validator(*m_ObjMgr, m_pContext);
491  CRef<CScope> scope = BuildScope();
492  if (ss->GetData().IsEntrys()) {
493  for (auto& se: ss->SetData().SetEntrys() ) {
494  auto teh = scope->AddTopLevelSeqEntry(*se);
495  if (teh) {
496  CBioseq_CI bi(teh);
497  if (bi) {
498  bi->GetId().front().GetSeqId()->GetLabel(&m_CurrentId);
499  }
500  }
501  }
502  }
503  if (mAppConfig.mDoCleanup) {
504  m_Cleanup.SetScope(scope);
505  m_Cleanup.BasicCleanup(*ss);
506  }
507 
508  if (!mAppConfig.mQuiet) {
509  LOG_POST_XX(Corelib_App, 1, m_CurrentId);
510  }
511  CValidErrorSuppress::TCodes suppressed;
513  validator.Validate(*ss, scope, m_Options, msgHandler, &suppressed);
514  m_NumRecords++;
515 }
516 
517 
519 {
521 
522  // Validate Seq-annot
523  CValidator validator(*m_ObjMgr, m_pContext);
524  CRef<CScope> scope = BuildScope();
525  if (mAppConfig.mDoCleanup) {
526  m_Cleanup.SetScope(scope);
527  m_Cleanup.BasicCleanup(*sa);
528  }
529  CSeq_annot_Handle sah = scope->AddSeq_annot(*sa);
530  validator.Validate(sah, m_Options, msgHandler);
531  m_NumRecords++;
532 }
533 
534 
536 {
538 
539  CRef<CScope> scope = BuildScope();
540  if (mAppConfig.mDoCleanup) {
541  m_Cleanup.SetScope(scope);
542  m_Cleanup.BasicCleanup(*feat);
543  }
544 
545  CValidator validator(*m_ObjMgr, m_pContext);
546  m_NumRecords++;
547  validator.Validate(*feat, scope, m_Options, msgHandler);
548 }
549 
550 
552 {
554 
555  CValidator validator(*m_ObjMgr, m_pContext);
556  CRef<CScope> scope = BuildScope();
557  m_NumRecords++;
558  validator.Validate(*src, scope, m_Options, msgHandler);
559 }
560 
561 
563 {
564 
565  // Process file based on its content
566  if (asninfo==nullptr) {
567  auto content = mpIstr->GuessDataType(s_known_types);
568  if (content.size() == 1) {
569  asninfo = *content.begin();
570  }
571  }
572  if (asninfo==nullptr) {
573  NCBI_THROW(CException, eUnknown, "Unrecognized data type");
574  }
575 
576  CRef<CSerialObject> serial;
577  try
578  {
579  auto obj_info = mpIstr->Read(asninfo);
580  serial.Reset(static_cast<CSerialObject*>(obj_info.GetObjectPtr()));
581  }
582  catch (CException& e) {
584  ERR_POST(Error << e);
585  }
586  ReportReadFailure(&e, msgHandler);
587  return;
588  }
589 
590  string asn_type = asninfo->GetName();
591  bool unhandledType{false};
592 
593  try
594  {
595  if (asn_type == "Seq-submit") { // Seq-submit
596  ProcessSeqSubmit(serial, msgHandler);
597  } else if (asn_type == "Seq-entry") { // Seq-entry
598  ProcessSeqEntry(serial, msgHandler);
599  } else if (asn_type == "Seq-annot") { // Seq-annot
600  ProcessSeqAnnot(serial, msgHandler);
601  } else if (asn_type == "Seq-feat") { // Seq-feat
602  ProcessSeqFeat(serial, msgHandler);
603  } else if (asn_type == "BioSource") { // BioSource
604  ProcessBioSource(serial, msgHandler);
605  } else if (asn_type == "Pubdesc") { // Pubdesc
606  ProcessPubdesc(serial, msgHandler);
607  } else if (asn_type == "Bioseq-set") { // Bioseq-set
608  ProcessBioseqset(serial, msgHandler);
609  } else if (asn_type == "Bioseq") { // Bioseq
610  ProcessBioseq(serial, msgHandler);
611  } else if (asn_type == "Seqdesc") { // Seq-desc
612  ProcessSeqDesc(serial, msgHandler);
613  } else {
614  unhandledType = true;
615  }
616  }
617  catch(const CException& e)
618  {
619  if (NStr::StartsWith(e.GetMsg(), "duplicate Bioseq id", NStr::eNocase)) {
620  string errstr = e.GetMsg();
621  errstr = NStr::Replace(errstr, "\n", " * ");
622  errstr = NStr::Replace(errstr, " * ", " * ");
624  return;
625  }
626  throw;
627  }
628  if (unhandledType) {
629  NCBI_THROW(CException, eUnknown, "Unhandled type " + asn_type);
630  }
631 }
632 
633 
635  const string& loader_name,
636  CConstRef<CSubmit_block> pSubmitBlock,
637  CConstRef<CSeq_id> seqid,
638  IMessageHandler& msgHandler) const
639 {
640  CRef<CSeq_entry> pEntry;
641  CRef<CScope> scope = BuildScope();
642  if (!loader_name.empty())
643  scope->AddDataLoader(loader_name);
644 
645  CValidator validator(*m_ObjMgr, m_pContext);
646 
647  CSeq_entry_Handle top_h;
648  auto seq_id_h = CSeq_id_Handle::GetHandle(*seqid);
649  if (scope->Exists(seq_id_h)) {
650  if (auto bioseq_h = scope->GetBioseqHandle(seq_id_h); bioseq_h) {
651  top_h = bioseq_h.GetTopLevelEntry();
652  if (top_h) {
653  pEntry = Ref(const_cast<CSeq_entry*>(top_h.GetCompleteSeq_entry().GetPointer()));
654  }
655  }
656  }
657 
658  if (top_h) {
659  if (mAppConfig.mDoCleanup) {
661  cleanup.SetScope(scope);
662  cleanup.BasicCleanup(*pEntry);
663  }
664 
665  CValidErrorSuppress::TCodes suppressed;
666  if (pSubmitBlock) {
667  auto pSubmit = Ref(new CSeq_submit());
668  pSubmit->SetSub().Assign(*pSubmitBlock);
669  pSubmit->SetData().SetEntrys().push_back(pEntry);
670  CValidErrorSuppress::SetSuppressedCodes(*pSubmit, suppressed);
671  validator.Validate(*pSubmit, scope, m_Options, msgHandler, &suppressed);
672  }
673  else {
674  CValidErrorSuppress::SetSuppressedCodes(*pEntry, suppressed);
675  validator.Validate(*pEntry, scope, m_Options, msgHandler, &suppressed);
676  }
677  }
678 }
679 
680 
681 void CAsnvalThreadState::ValidateOneHugeBlob(edit::CHugeFileProcess& process, IMessageHandler& msgHandler)
682 {
683  string loader_name = CDirEntry::CreateAbsolutePath(process.GetFile().m_filename);
684  bool use_mt = true;
685  #ifdef _DEBUG
686  use_mt = false;
687  #endif
688 
689  auto& reader = process.GetReader();
690 
691  auto info = edit::CHugeAsnDataLoader::RegisterInObjectManager(
692  *m_ObjMgr, loader_name, &reader, CObjectManager::eNonDefault, 1); //CObjectManager::kPriority_Local);
693 
694  CAutoRevoker autorevoker(info);
695  CHugeFileValidator hugeFileValidator(reader, m_Options);
696  hugeFileValidator.UpdateValidatorContext(m_GlobalInfo, *m_pContext);
697 
698  if (!mAppConfig.mQuiet) {
699  if (const auto& topIds = reader.GetTopIds(); !topIds.empty()) {
700  m_CurrentId.clear();
701  topIds.front()->GetLabel(&m_CurrentId);
702  LOG_POST_XX(Corelib_App, 1, m_CurrentId);
703  }
704  }
705 
706  if (m_pContext->PreprocessHugeFile) {
707  hugeFileValidator.ReportGlobalErrors(m_GlobalInfo, msgHandler);
708  s_StartWrite(msgHandler);
709  }
710 
711 
712  if (use_mt) {
713  ValidateBlobAsync(loader_name, process, msgHandler);
714  } else {
715  ValidateBlobSequential(loader_name, process, msgHandler); // Need to revisit this
716  }
717 
718  hugeFileValidator.ReportPostErrors(*m_pContext, msgHandler);
719  s_StartWrite(msgHandler);
720 }
721 
722 
723 
// Reads the huge file blob by blob and validates each blob in turn.
// The loop ends when ReadNextBlob() returns false, or early (via return)
// when a read fails with a generic CException.
// NOTE(review): listing lines 728 and 738 are missing from this extract;
// line 738 presumably records the duplicate-id error on msgHandler before the
// write is started -- confirm against the repository source.
724 void CAsnvalThreadState::ValidateOneHugeFile(edit::CHugeFileProcess& process, IMessageHandler& msgHandler)
725 {
726  while (true)
727  {
729 
730  try {
// No more blobs: leave the loop.
731  if (!process.ReadNextBlob())
732  break;
733 
734  }
735  catch (const edit::CHugeFileException& e) {
// Duplicate sequence ids are counted as reported and the loop moves on to the
// next blob; any other huge-file error is rethrown to the caller.
736  if (e.GetErrCode() == edit::CHugeFileException::eDuplicateSeqIds)
737  {
739  s_StartWrite(msgHandler);
740  ++m_Reported;
741  continue;
742  }
743  throw;
744  }
745  catch (const CException& e) {
// Any other toolkit exception is reported as a read failure and stops
// processing of this file.
746  ReportReadFailure(&e, msgHandler);
747  return;
748  }
749 
// The blob was read successfully: count it and validate it.
750  m_NumRecords++;
751  ValidateOneHugeBlob(process, msgHandler);
752  }
753 }
754 
755 
757  CAsnvalThreadState* _this,
758  const string& loader_name,
759  CConstRef<CSubmit_block> pSubmitBlock,
760  CConstRef<CSeq_id> seqid,
761  IMessageHandler& msgHandler)
762 {
763  CThreadExitData exit_data;
764  try
765  {
767  _this->ValidateAsync(loader_name, pSubmitBlock, seqid, msgHandler);
768  double elapsed = sw.Elapsed();
769  exit_data.mLongest = elapsed;
770  }
771  catch (const CException& e) {
772  string errstr = e.GetMsg();
773  errstr = NStr::Replace(errstr, "\n", " * ");
774  errstr = NStr::Replace(errstr, " * ", " * ");
776  ERR_POST(e);
777  }
778  return exit_data;
779 }
780 
781 
// Validates every top-level id of the current blob using std::async tasks and
// collects the per-task results through `val_queue`.
// NOTE(review): the declaration of `val_queue` (listing line 788) is missing
// from this extract; from its use below it is a queue of
// std::future<CThreadExitData> with push_back/pop_front -- confirm its exact
// type and capacity against the repository source.
782 void CAsnvalThreadState::ValidateBlobAsync(const string& loader_name, edit::CHugeFileProcess& process,
783  IMessageHandler& msgHandler)
784 {
785  auto& reader = process.GetReader();
// Writer task: calls msgHandler.Write() only for handlers whose InvokeWrite()
// is true. Launched with the default std::async launch policy.
786  auto writer_task = std::async([this, &msgHandler] { if(msgHandler.InvokeWrite()){ msgHandler.Write(); } });
787 
789  // start a loop in a separate thread
790  auto topids_task = std::async(std::launch::async, [this, &val_queue, &loader_name, &reader, &msgHandler]()
791  {
792  auto pSubmitBlock = reader.GetSubmitBlock();
793  for (auto seqid : reader.GetTopIds())
794  {
// One ValidateWorker task per top-level id; its future is queued so the
// consumer loop below can collect the result.
795  auto fut = std::async(std::launch::async, ValidateWorker,
796  this, loader_name, pSubmitBlock, seqid, std::ref(msgHandler));
797  // std::future is not copiable, so passing it for move constructor
798  val_queue.push_back(std::move(fut));
799  }
800 
// A default-constructed (invalid) future marks the end of the work list.
801  val_queue.push_back({});
802  });
803 
804 
// Consumer loop: take futures off the queue until the end marker is seen.
805  while (true)
806  {
807  auto result = val_queue.pop_front();
808  if (!result.valid()) {
// End marker reached: ask a writing handler to stop, then leave the loop.
809  if (msgHandler.InvokeWrite()) {
810  msgHandler.RequestStop(); // stop write
811  }
812  break;
813  }
// Wait for this worker and keep track of the longest elapsed time seen.
814  auto exit_data = result.get();
815  if (exit_data.mLongest > m_Longest) {
816  m_Longest = exit_data.mLongest;
// The id recorded is the current m_CurrentId, not an id from exit_data.
817  m_LongestId = m_CurrentId; //exit_data.mLongestId;
818  }
819  }
820 
// Make sure both helper tasks have finished before returning.
821  topids_task.wait();
822  writer_task.wait();
823 }
824 
825 
826 
828  const string& loader_name,
829  edit::CHugeFileProcess& process,
830  IMessageHandler& msgHandler)
831 {
832  auto& reader = process.GetReader();
833 
834  for (auto seqid : reader.GetTopIds())
835  {
836  auto pSubmitBlock = reader.GetSubmitBlock();
837  ValidateAsync(loader_name, pSubmitBlock, seqid, msgHandler);
838  }
839  s_StartWrite(msgHandler);
840 }
841 
842 
844 {
846 
847  ValidateInput(asninfo, msgHandler);
848 
849  double elapsed = sw.Elapsed();
850  if (elapsed > m_Longest) {
851  m_Longest = elapsed;
853  }
854  s_StartWrite(msgHandler);
855 
856  if (m_ReadFailure) {
857  return false;
858  }
859  return true;
860 }
861 
862 
864 {
865  if (asninfo == CBioseq_set::GetTypeInfo()) {
866  ProcessBSSReleaseFile(msgHandler);
867  s_StartWrite(msgHandler);
868  return true;
869  }
870  else
871  if (asninfo == CSeq_submit::GetTypeInfo()) {
872  const auto commandLineOptions = m_Options;
874  try {
875  ProcessSSMReleaseFile(msgHandler);
876  }
877  catch (CException&) {
878  m_Options = commandLineOptions;
879  throw;
880  }
881  m_Options = commandLineOptions;
882  s_StartWrite(msgHandler);
883  return true;
884  }
885  else {
886  LOG_POST_XX(Corelib_App, 1, "FAILURE: Record is neither a Seq-submit nor Bioseq-set; do not use -batch to process.");
887  return false;
888  }
889 }
890 
891 
893 {
894  unique_ptr<IMessageHandler> pMsgHandler;
895  if (mAppConfig.mHugeFile) { // Also need to check input stream here
896  pMsgHandler.reset(new CAsyncMessageHandler(mAppConfig, ostr));
897  }
898  else {
899  pMsgHandler.reset(new CSerialMessageHandler(mAppConfig, ostr));
900  }
901  auto result = ValidateOneFile(filename, *pMsgHandler);
902  result.mReported += pMsgHandler->GetNumReported();
903  return result;
904 }
905 
906 
908 {
909  if (!mAppConfig.mQuiet) {
910  LOG_POST_XX(Corelib_App, 1, filename);
911  }
912 
913  TTypeInfo asninfo = nullptr;
914  unique_ptr<edit::CHugeFileProcess> mpHugeFileProcess;
915 
916  if (filename.empty())
917  mpIstr = OpenFile(asninfo, filename);
918  else {
919  auto huge_reader = std::make_unique<edit::CHugeAsnReader>();
920  huge_reader->ExtendReadHooks([this](CObjectIStream& istream)
921  {
923  });
924 
925  mpHugeFileProcess.reset(new edit::CHugeFileProcess(huge_reader.release()));
926  try {
927  mpHugeFileProcess->Open(filename, &s_known_types);
928  asninfo = mpHugeFileProcess->GetFile().m_content;
929  }
930  catch (const CObjReaderParseException&) {
931  mpHugeFileProcess.reset();
932  throw;
933  }
934 
935  if (asninfo) {
936  if (!mAppConfig.mHugeFile || mAppConfig.mBatch || !edit::CHugeFileProcess::IsSupported(asninfo)) {
937  mpIstr = mpHugeFileProcess->GetReader().MakeObjStream(0);
938  }
939  }
940  else {
941  mpIstr = OpenFile(asninfo, filename);
942  }
943  }
944 
945  if (mAppConfig.mHugeFile && !mpIstr) {
946  ValidateOneHugeFile(*mpHugeFileProcess, msgHandler);
947  } else {
948 
949  bool proceed = true;
950 
951  do {
952  if (!asninfo) {
953  ReportReadFailure(nullptr, msgHandler);
954  LOG_POST_XX(Corelib_App, 1, "FAILURE: Unable to process invalid ASN.1 file " + filename);
955  break;
956  }
957 
958  try {
959  if (mAppConfig.mBatch) {
960  if (!ValidateBatchMode(asninfo, msgHandler)) {
961  proceed = ValidateTraditionally(asninfo, msgHandler);
962  }
963  }
964  else
965  proceed = ValidateTraditionally(asninfo, msgHandler);
966 
967  if (mpIstr->EndOfData()) // force to SkipWhiteSpace
968  break;
969  else {
970  auto types = mpIstr->GuessDataType(s_known_types);
971  asninfo = types.empty() ? nullptr : *types.begin();
972  }
973  }
974  catch (const CException& e) {
975  string errstr = e.GetMsg();
976  errstr = NStr::Replace(errstr, "\n", " * ");
977  errstr = NStr::Replace(errstr, " * ", " * ");
979  ++m_Reported;
980  ERR_POST(e);
981  }
982  }
983  while (proceed);
984  }
985 
986  mpIstr.reset();
987 
989 }
990 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eErr_GENERIC_InvalidAsn
@ eErr_GENERIC_DuplicateIDs
@ eErr_INTERNAL_Exception
unsigned int m_Options
Definition: app_config.hpp:67
bool mDoCleanup
Definition: app_config.hpp:57
bool mOnlyAnnots
Definition: app_config.hpp:65
bool mHugeFile
Definition: app_config.hpp:66
bool mContinue
Definition: app_config.hpp:64
EVerbosity mVerbosity
Definition: app_config.hpp:58
void ProcessPubdesc(CRef< CSerialObject > serial, IMessageHandler &msgHandler)
void ValidateBlobAsync(const string &loader_name, edit::CHugeFileProcess &process, IMessageHandler &msgHandler)
bool ValidateTraditionally(TTypeInfo asninfo, IMessageHandler &msgHandler)
void ValidateBlobSequential(const string &loader_name, edit::CHugeFileProcess &process, IMessageHandler &msgHandler)
void ReportReadFailure(const CException *p_exception, IMessageHandler &msgHandler)
void ProcessSeqAnnot(CRef< CSerialObject > serial, IMessageHandler &msgHandler)
shared_ptr< SValidatorContext > m_pContext
static CThreadExitData ValidateWorker(CAsnvalThreadState *_this, const string &loader_name, CConstRef< CSubmit_block > pSubmitBlock, CConstRef< CSeq_id > seqid, IMessageHandler &msgHandler)
void ProcessBioSource(CRef< CSerialObject > serial, IMessageHandler &msgHandler)
void ReadClassMember(CObjectIStream &in, const CObjectInfo::CMemberIterator &member, IMessageHandler &msgHandler)
std::list< CConstRef< CValidError > > m_eval
void ProcessSeqEntry(CSeq_entry &se, IMessageHandler &msgHandler)
CHugeFileValidator::TGlobalInfo m_GlobalInfo
void ProcessBioseq(CRef< CSerialObject > serial, IMessageHandler &msgHandler)
CRef< CScope > BuildScope() const
bool ValidateBatchMode(TTypeInfo asninfo, IMessageHandler &msgHandler)
CAsnvalThreadState(const CAppConfig &, SValidatorContext::taxupdate_func_t taxon)
void ProcessBioseqset(CRef< CSerialObject > serial, IMessageHandler &msgHandler)
void ProcessSeqSubmit(CRef< CSerialObject > serial, IMessageHandler &msgHandler)
void ValidateOneHugeFile(edit::CHugeFileProcess &process, IMessageHandler &msgHandler)
CThreadExitData ValidateOneFile(const string &infilename, CNcbiOstream &ostr)
void ProcessSSMReleaseFile(IMessageHandler &msgHandler)
const CAppConfig & mAppConfig
void ProcessSeqFeat(CRef< CSerialObject > serial, IMessageHandler &msgHandler)
unique_ptr< CObjectIStream > OpenFile(TTypeInfo &asn_info, const string &filename) const
void ProcessBSSReleaseFile(IMessageHandler &msgHandler)
std::atomic< size_t > m_Reported
void ValidateInput(TTypeInfo asninfo, IMessageHandler &msgHandler)
void ValidateAsync(const string &loader_name, CConstRef< CSubmit_block > pSubmitBlock, CConstRef< CSeq_id > seqid, IMessageHandler &msgHandler) const
void ProcessSeqDesc(CRef< CSerialObject > serial, IMessageHandler &msgHandler)
void ValidateOneHugeBlob(edit::CHugeFileProcess &process, IMessageHandler &msgHandler)
CRef< CObjectManager > m_ObjMgr
unique_ptr< CObjectIStream > mpIstr
unsigned int m_Options
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
TChanges BasicCleanup(CSeq_entry &se, Uint4 options=0)
Definition: cleanup.cpp:132
void SetScope(CScope *scope)
Definition: cleanup.cpp:108
CDecompressIStream –.
Wraps CFormatGuess, and if CFormatGuess's result is Unknown, it tries every file reader until one wor...
CFormatGuess::EFormat GuessFormatAndContent(CFileContentInfo &contentInfo)
void SetRecognizedGenbankTypes(const set< TTypeInfo > &recognizedGenbankTypes)
EFormat
The formats are checked in the same order as declared here.
@ eBZip2
bzip2 compressed file
@ eBinaryASN
Binary ASN.1.
@ eLzo
lzo compressed file
@ eGZip
GNU zip compressed file.
@ eTextASN
Text ASN.1.
void ReportGlobalErrors(const TGlobalInfo &globalInfo, IValidError &errors) const
void UpdateValidatorContext(const TGlobalInfo &globalInfo, SValidatorContext &context) const
static void RegisterReaderHooks(CObjectIStream &objStream, SGlobalInfo &m_GlobalInfo)
void ReportPostErrors(const SValidatorContext &context, IValidError &errors) const
Reading (iterating through) elements of containers (SET OF, SEQUENCE OF).
Definition: objectio.hpp:164
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
CObjectIStream –.
Definition: objistr.hpp:93
CObjectInfoMI –.
Definition: objectiter.hpp:432
CObjectTypeInfo –.
Definition: objectinfo.hpp:94
CScope –.
Definition: scope.hpp:92
CSeq_annot_CI –.
CSeq_annot_Handle –.
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
Root class for all serialization exceptions.
Definition: exception.hpp:50
CStopWatch –.
Definition: ncbitime.hpp:1938
CTypeInfo class contains all information about C++ types (both basic and classes): members and layout...
Definition: typeinfo.hpp:76
static void SetSuppressedCodes(const CUser_object &user, TCodes &errCodes)
CRef< CValidError > Validate(const CSeq_entry &se, CScope *scope=nullptr, Uint4 options=0)
Definition: validator.cpp:100
@ eVal_seqsubmit_parent
Definition: validator.hpp:85
virtual bool InvokeWrite() const =0
void AddValidErrItem(EDiagSev sev, unsigned int ec, const string &msg, const string &desc, const CSerialObject &obj, const string &acc, const int ver, const string &location=kEmptyStr, const int seq_offset=0) override
virtual void Write()=0
virtual void RequestStop()=0
Definition: set.hpp:45
Include a standard set of the NCBI C++ Toolkit most basic headers.
static void cleanup(void)
Definition: ct_dynamic.c:30
CS_CONTEXT * ctx
Definition: t0006.c:12
static const struct type types[]
Definition: type.c:22
@ eTakeOwnership
An object can take ownership of another.
Definition: ncbi_types.h:136
@ eNoOwnership
No ownership is assumed.
Definition: ncbi_types.h:135
string
Definition: cgiapp.hpp:687
EMethod
Compression/decompression methods.
Definition: stream_util.hpp:98
@ eLZO
LZO (LZO1X)
@ eNone
no compression method (copy "as is")
Definition: stream_util.hpp:99
@ eGZipFile
.gz file (including concatenated files)
@ fDefault
Use algorithm-specific defaults.
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define LOG_POST_XX(error_name, err_subcode, message)
Definition: ncbidiag.hpp:569
@ eDiag_Fatal
Fatal error – guarantees exit(or abort)
Definition: ncbidiag.hpp:655
@ eDiag_Critical
Critical error message.
Definition: ncbidiag.hpp:654
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
TErrCode GetErrCode(void) const
Get error code.
Definition: ncbiexpt.cpp:453
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
string ReportAll(TDiagPostFlags flags=eDPF_Exception) const
Report all exceptions.
Definition: ncbiexpt.cpp:370
static string CreateAbsolutePath(const string &path, ERelativeToWhat rtw=eRelativeToCwd)
Get an absolute path from some, possibly relative, path.
Definition: ncbifile.cpp:665
@ eSerial_AsnText
ASN.1 text.
Definition: serialdef.hpp:73
@ eSerial_AsnBinary
ASN.1 binary.
Definition: serialdef.hpp:74
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
static CObjectIStream * Open(ESerialDataFormat format, CNcbiIstream &inStream, bool deleteInStream)
Create serial object reader and attach it to an input stream.
Definition: objistr.cpp:195
@ eEOF
Unexpected end-of-file.
Definition: exception.hpp:55
void ResetHistory(EActionIfLocked action=eKeepIfLocked)
Clean all unused TSEs from the scope's cache and release the memory.
Definition: scope.cpp:325
void AddDataLoader(const string &loader_name, TPriority pri=kPriority_Default)
Add data loader by name.
Definition: scope.cpp:510
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry; default priority is higher than for defaults or loaders. Add object to the scope with p...
Definition: scope.cpp:522
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
Definition: scope.cpp:504
CSeq_annot_Handle AddSeq_annot(CSeq_annot &annot, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add Seq-annot, return its CSeq_annot_Handle.
Definition: scope.cpp:538
void RemoveTopLevelSeqEntry(const CTSE_Handle &entry)
Revoke TSE previously added using AddTopLevelSeqEntry() or AddBioseq().
Definition: scope.cpp:376
bool RevokeDataLoader(CDataLoader &loader)
Revoke previously registered data loader.
bool Exists(const CSeq_id &id)
Check existence of sequence with this id.
Definition: scope.cpp:393
CSeq_entry_Handle GetTopLevelEntry(void) const
Get top level Seq-entry handle.
CConstRef< CSeq_entry > GetCompleteSeq_entry(void) const
Complete and get const reference to the seq-entry.
const TId & GetId(void) const
TObjectType * GetPointer(void) const THROWS_NONE
Get pointer.
Definition: ncbiobj.hpp:1684
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
#define NcbiCin
Definition: ncbistre.hpp:542
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
static string & Replace(const string &src, const string &search, const string &replace, string &dst, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3314
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
double Elapsed(void) const
Return time elapsed since first Start() or last Restart() call (in seconds).
Definition: ncbitime.hpp:2776
@ eStart
Start timer immediately after creating.
Definition: ncbitime.hpp:1942
const string & GetName(void) const
Get name of this type.
Definition: typeinfo.cpp:249
ENcbiOwnership
Ownership relations between objects.
Definition: ncbi_types.h:134
const TData & GetData(void) const
Get the Data member data.
void SetData(TData &value)
Assign a value to Data data member.
bool IsEntrys(void) const
Check if variant Entrys is selected.
static CStopWatch sw
Definition of all error codes used in corelib (xncbi.lib).
int i
yy_size_t n
static MDB_envinfo info
Definition: mdb_load.c:37
Definition: fix_pub.hpp:45
Magic spell ;-) needed for some weird compilers... very empirical.
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines command line argument related classes.
Defines unified interface to application:
#define nullptr
Definition: ncbimisc.hpp:45
NCBI C++ stream class wrappers for triggering between "new" and "old" C++ stream libraries.
static Format format
Definition: njn_ioutil.cpp:53
std::istream & in(std::istream &in_, double &x_)
The Object manager core.
void SetLocalReadHook(const CObjectTypeInfo &obj_type_info, CObjectIStream &ostr, _Func _func)
C++ I/O stream wrappers to compress/decompress data on-the-fly.
SRegisterLoaderInfo.
function< CRef< CTaxon3_reply >(const vector< CRef< COrg_ref > > &list)> taxupdate_func_t
USING_SCOPE(objects)
const set< TTypeInfo > s_known_types
static CRef< objects::CSeq_entry > s_BuildGoodSeq()
static void s_StartWrite(IMessageHandler &msgHandler)
else result
Definition: token2.c:20
CFileContentInfoGenbank mInfoGenbank
#define const
Definition: zconf.h:232
Modified on Fri Apr 26 16:28:08 2024 by modify_doxy.py rev. 669887