NCBI C++ ToolKit
thread_state.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: thread_state.cpp 102951 2024-08-09 00:17:13Z kans $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Frank Ludwig
27 *
28 * File Description:
29 *   validator
30 *
31 */
32 
33 #include <ncbi_pch.hpp>
34 #include <common/ncbi_source_ver.h>
35 #include <corelib/ncbistd.hpp>
36 #include <corelib/ncbistre.hpp>
37 #include <corelib/ncbiapp.hpp>
38 #include <corelib/ncbienv.hpp>
39 #include <corelib/ncbiargs.hpp>
40 #include <corelib/error_codes.hpp>
41 
42 #include <serial/serial.hpp>
43 #include <serial/objistr.hpp>
44 #include <serial/objectio.hpp>
45 
47 #include <connect/ncbi_util.h>
48 
49 // Objects includes
53 #include <objects/seq/Bioseq.hpp>
57 #include <objects/seq/Seq_inst.hpp>
58 #include <objects/seq/Pubdesc.hpp>
59 #include <objects/seq/Seqdesc.hpp>
69 
71 
72 // Object Manager includes
74 #include <objmgr/scope.hpp>
75 #include <objmgr/seq_vector.hpp>
76 #include <objmgr/seq_descr_ci.hpp>
77 #include <objmgr/feat_ci.hpp>
78 #include <objmgr/align_ci.hpp>
79 #include <objmgr/graph_ci.hpp>
80 #include <objmgr/seq_annot_ci.hpp>
81 #include <objmgr/bioseq_ci.hpp>
83 
92 #include <future>
93 #include <util/message_queue.hpp>
94 #include "app_config.hpp"
95 #include "thread_state.hpp"
100 #include "message_handler.hpp"
101 
102 using namespace ncbi;
104 USING_SCOPE(validator);
106 
107 namespace
108 {
109 
// Scope helper that remembers the data loader belonging to a loader
// registration result.
// NOTE(review): the destructor body (listing line 118) is missing from this
// excerpt. Judging by the class name it presumably revokes m_loader from the
// object manager when the helper goes out of scope - confirm against the
// repository copy before relying on that.
110  class CAutoRevoker
111  {
112  public:
// Captures the loader pointer returned by info.GetLoader(); works for any
// SRegisterLoaderInfo<TLoader> instantiation.
113  template<class TLoader>
114  CAutoRevoker(struct SRegisterLoaderInfo<TLoader>& info)
115  : m_loader{ info.GetLoader() } {}
// Destructor: no statements are visible in this excerpt (see note above).
116  ~CAutoRevoker()
117  {
119  }
120  private:
// Loader taken from the registration info; stored as a raw pointer.
121  CDataLoader* m_loader = nullptr;
122  };
123 }
124 
125 
126 const set<TTypeInfo> s_known_types{
127  CSeq_submit::GetTypeInfo(), CSeq_entry::GetTypeInfo(), CSeq_annot::GetTypeInfo(),
128  CSeq_feat::GetTypeInfo(), CBioSource::GetTypeInfo(), CPubdesc::GetTypeInfo(),
129  CBioseq_set::GetTypeInfo(), CBioseq::GetTypeInfo(), CSeqdesc::GetTypeInfo(),
130 };
131 
132 
134 {
135  CRef<objects::CSeq_entry> entry(new objects::CSeq_entry());
136  entry->SetSeq().SetInst().SetMol(objects::CSeq_inst::eMol_dna);
137  entry->SetSeq().SetInst().SetRepr(objects::CSeq_inst::eRepr_raw);
138  entry->SetSeq().SetInst().SetSeq_data().SetIupacna().Set("AATTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCAAAATTGGCCAA");
139  entry->SetSeq().SetInst().SetLength(60);
140 
141  CRef<objects::CSeq_id> id(new objects::CSeq_id());
142  id->SetLocal().SetStr("good");
143  entry->SetSeq().SetId().push_back(id);
144 
145  CRef<objects::CSeqdesc> mdesc(new objects::CSeqdesc());
146  mdesc->SetMolinfo().SetBiomol(objects::CMolInfo::eBiomol_genomic);
147  entry->SetSeq().SetDescr().Set().push_back(mdesc);
148 
149  return entry;
150 }
151 
152 // ============================================================================
154 // ============================================================================
155  mAppConfig(appConfig)
156 {
157  m_Options = appConfig.m_Options;
158  m_pContext.reset(new SValidatorContext());
160  m_pContext->m_taxon_update = taxon;
161 }
162 
164 {
165 }
166 
167 
169  const CObjectInfo::CMemberIterator& member,
170  IMessageHandler& msgHandler)
171 {
172  m_Level++;
173 
174  if (m_Level == 1) {
175  size_t n = 0;
176  // Read each element separately to a local TSeqEntry,
177  // process it somehow, and... not store it in the container.
178  for (CIStreamContainerIterator i(in, member); i; ++i) {
179  try {
180  // Get seq-entry to validate
182  i >> *se;
183 
184  // Validate Seq-entry
185  CValidator validator(*m_ObjMgr, m_pContext);
186  CRef<CScope> scope = BuildScope();
187  CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(*se);
188 
189  CBioseq_CI bi(seh);
190  if (bi) {
191  m_CurrentId = "";
192  bi->GetId().front().GetSeqId()->GetLabel(&m_CurrentId);
193  if (!mAppConfig.mQuiet) {
194  LOG_POST_XX(Corelib_App, 1, m_CurrentId);
195  }
196  }
197 
198  if (mAppConfig.mDoCleanup) {
199  m_Cleanup.SetScope(scope);
200  m_Cleanup.BasicCleanup(*se);
201  }
202 
203  if (mAppConfig.mOnlyAnnots) {
204  for (CSeq_annot_CI ni(seh); ni; ++ni) {
205  const CSeq_annot_Handle& sah = *ni;
206  validator.Validate(sah, m_Options, msgHandler);
207  m_NumRecords++;
208  }
209  }
210  else {
212  validator.Validate(seh, m_Options, msgHandler);
213  m_NumRecords++;
214  double elapsed = sw.Elapsed();
215  if (elapsed > m_Longest) {
216  m_Longest = elapsed;
218  }
219  }
220  scope->RemoveTopLevelSeqEntry(seh);
221  scope->ResetHistory();
222  n++;
223  }
224  catch (exception&) {
225  if (!mAppConfig.mContinue) {
226  throw;
227  }
228  // should we issue some sort of warning?
229  }
230  }
231  }
232  else {
233  in.ReadClassMember(member);
234  }
235  m_Level--;
236 }
237 
238 
239 // ============================================================================
241 // ============================================================================
242 {
243  CRef<CScope> scope(new CScope(*m_ObjMgr));
244  scope->AddDefaults();
245  return scope;
246 }
247 
248 
249 // ============================================================================
// Opens the input named by `fname` (or uses NcbiCin when `fname` is empty),
// guesses its format, wraps compressed input in a decompressing stream and
// creates an object stream on top of it.
//
//   asn_info  [out] assigned contentInfo.mInfoGenbank.mTypeInfo when an object
//             stream is created; not written otherwise.
//   fname     input path; an empty string selects NcbiCin.
//   returns   the created object stream, or an empty unique_ptr when the
//             guessed format ends up in the `default` branch of the last switch.
//
// NOTE(review): listing lines 253, 263, 267-268 and 290-291 are absent from
// this excerpt, so the declarations of `own`, `method` and `format`, and the
// case labels guarding the CObjectIStream::Open call, are not visible here.
250 unique_ptr<CObjectIStream> CAsnvalThreadState::OpenFile(TTypeInfo& asn_info, const string& fname) const
251 // ============================================================================
252 {
// hold_stream owns whatever stream this function allocates; InputStream is
// the stream actually read from and starts out as the process' standard input.
254  unique_ptr<CNcbiIstream> hold_stream;
255  CNcbiIstream* InputStream = &NcbiCin;
256 
257  if (!fname.empty()) {
// A real file was requested: open it in binary mode and remember that the
// stream is ours to hand over later.
258  own = eTakeOwnership;
259  hold_stream = make_unique<CNcbiIfstream>(fname, ios::binary);
260  InputStream = hold_stream.get();
261  }
262 
264 
265  CFormatGuessEx FG(*InputStream);
266  CFileContentInfo contentInfo;
// Map the guessed container format onto a decompression method; anything that
// is not gzip/bzip2/lzo is read as-is.
269  switch (format)
270  {
271  case CFormatGuess::eGZip: method = CCompressStream::eGZipFile; break;
272  case CFormatGuess::eBZip2: method = CCompressStream::eBZip2; break;
273  case CFormatGuess::eLzo: method = CCompressStream::eLZO; break;
274  default: method = CCompressStream::eNone; break;
275  }
276  if (method != CCompressStream::eNone)
277  {
// The decompressor is constructed with the current `own` flag for the
// underlying stream. release() must come before reset(): reset() alone would
// delete the underlying stream the decompressor was just built on.
278  CDecompressIStream* decompress(new CDecompressIStream(*InputStream, method, CCompressStream::fDefault, own));
279  hold_stream.release();
280  hold_stream.reset(decompress);
281  InputStream = hold_stream.get();
// The decompressor itself is always heap-allocated here, so from now on the
// stream is owned regardless of whether the input came from a file or stdin.
282  own = eTakeOwnership;
// Guess again, this time on the decompressed content.
283  CFormatGuessEx fg(*InputStream);
284  format = fg.GuessFormatAndContent(contentInfo);
285  }
286 
287  unique_ptr<CObjectIStream> objectStream;
288  switch (format)
289  {
// (case labels not visible in this excerpt - presumably binary and text ASN.1)
// The serial format is binary only for eBinaryASN, text otherwise. `own` is
// passed to Open(), after which hold_stream gives up its pointer so the stream
// is not deleted when this function returns.
292  objectStream.reset(CObjectIStream::Open(format == CFormatGuess::eBinaryASN ? eSerial_AsnBinary : eSerial_AsnText, *InputStream, own));
293  hold_stream.release();
294  asn_info = contentInfo.mInfoGenbank.mTypeInfo;
295  break;
296  default:
// Unsupported format: objectStream stays empty and hold_stream (if any)
// cleans up the stream on return.
297  break;
298  }
299  return objectStream;
300 }
301 
302 
303 static void s_StartWrite(IMessageHandler& msgHandler, bool ignoreInferences = false) // Commence write if necessary
304 { // does nothing if msgHandler.InvokeWrite() returns false
305  if (msgHandler.InvokeWrite()) {
306  msgHandler.RequestStop();
307  msgHandler.Write(ignoreInferences);
308  }
309 }
310 
312 {
313  CNcbiOstrstream os;
314  os << "Unable to read invalid ASN.1";
315  m_ReadFailure = true;
316 
317  const CSerialException* p_serial_exception = dynamic_cast<const CSerialException*>(p_exception);
318  if (p_serial_exception && mAppConfig.mVerbosity != CAppConfig::eVerbosity_XML) {
319  if (mpIstr) {
320  os << ": " << mpIstr->GetPosition();
321  }
322  if (p_serial_exception->GetErrCode() == CSerialException::eEOF) {
323  os << ": unexpected end of file";
324  }
326  // manually call ReportAll(0) because what() includes a lot of info
327  // that's not of interest to the submitter such as stacktraces and
328  // GetMsg() doesn't include enough info.
329  os << ": " + p_exception->ReportAll(0);
330  }
331  }
332 
333  string errstr = CNcbiOstrstreamToString(os);
334  // newlines don't play well with XML
335  errstr = NStr::Replace(errstr, "\n", " * ");
336  errstr = NStr::Replace(errstr, " * ", " * ");
337 
339 
340  bool ignoreInferences = (m_pContext->CumulativeInferenceCount >= InferenceAccessionCutoff);
341  s_StartWrite(msgHandler, ignoreInferences);
342 }
343 
344 
345 
347 {
348  CRef<CSeq_submit> submit(new CSeq_submit);
349 
350  auto hook = [this, &msgHandler](CObjectIStream& in, const CObjectInfo::CMemberIterator& member) {ReadClassMember(in, member, msgHandler); };
352  SetLocalReadHook(info.FindMember("seq-set"), *mpIstr, hook);
353 
354  // Read the CSeq_submit, it will call the hook object each time we
355  // encounter a Seq-entry
356  try {
357  *mpIstr >> *submit;
358  }
359  catch (const CException&) {
360  LOG_POST_XX(Corelib_App, 1, "FAILURE: Record is not a batch Seq-submit, do not use -a u to process.");
361  ++m_Reported;
362  }
363 }
364 
365 
366 
368 {
369  CRef<CBioseq_set> seqset(new CBioseq_set);
370 
371  // Register the Seq-entry hook
372  auto hook = [this, &msgHandler](CObjectIStream& in, const CObjectInfo::CMemberIterator& member) {ReadClassMember(in, member, msgHandler); };
374  SetLocalReadHook(info.FindMember("seq-set"), *mpIstr, hook);
375 
376 
377  // Read the CBioseq_set, it will call the hook object each time we
378  // encounter a Seq-entry
379  try {
380  *mpIstr >> *seqset;
381  }
382  catch (const CException&) {
383  LOG_POST_XX(Corelib_App, 1, "FAILURE: Record is not a batch Bioseq-set, do not use -a t to process.");
384  ++m_Reported;
385  }
386 }
387 
388 
389 
391 {
392  // Validate Seq-entry
393  CValidator validator(*m_ObjMgr, m_pContext);
394  CRef<CScope> scope = BuildScope();
395  if (mAppConfig.mDoCleanup) {
396  m_Cleanup.SetScope(scope);
398  }
399  CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(se);
400  CBioseq_CI bi(seh);
401  if (bi) {
402  m_CurrentId = "";
403  bi->GetId().front().GetSeqId()->GetLabel(&m_CurrentId);
404  if (!mAppConfig.mQuiet) {
405  LOG_POST_XX(Corelib_App, 1, m_CurrentId);
406  }
407  }
408 
409  if (mAppConfig.mOnlyAnnots) {
410  for (CSeq_annot_CI ni(seh); ni; ++ni) {
411  const CSeq_annot_Handle& sah = *ni;
412  validator.Validate(sah, m_Options, msgHandler);
413  m_NumRecords++;
414  }
415  return;
416  }
417  CValidErrorSuppress::TCodes suppressed;
419  validator.Validate(se, scope, m_Options, msgHandler, &suppressed);
420  m_NumRecords++;
421  return;
422 }
423 
424 
426 {
428 
430 
431  CValidator validator(*m_ObjMgr, m_pContext);
432  CRef<CScope> scope = BuildScope();
434 
435  validator.Validate(*sd, *ctx, 0, msgHandler);
436  m_NumRecords++;
437 }
438 
439 
441 {
442  // Get seq-entry to validate
444 
445  auto se = Ref(new CSeq_entry);
446  se->SetSeq(*bioseq);
447 
448  // Validate Seq-entry
449  ProcessSeqEntry(*se, msgHandler);
450 }
451 
453 {
454  // Get seq-entry to validate
456 
457  auto se = Ref(new CSeq_entry);
458  se->SetSet(*bioseqset);
459 
460  // Validate Seq-entry
461  ProcessSeqEntry(*se, msgHandler);
462 }
463 
464 
466 {
468 
469  CValidator validator(*m_ObjMgr, m_pContext);
470  CRef<CScope> scope = BuildScope();
471  m_NumRecords++;
472  validator.Validate(*pd, scope, m_Options, msgHandler);
473 }
474 
475 
477 {
478  // Get seq-entry to validate
480 
481  ProcessSeqEntry(*se, msgHandler);
482 }
483 
484 
486 {
487 
489 
490  m_CurrentId = "";
491  // Validate Seq-submit
492  CValidator validator(*m_ObjMgr, m_pContext);
493  CRef<CScope> scope = BuildScope();
494  if (ss->GetData().IsEntrys()) {
495  for (auto& se: ss->SetData().SetEntrys() ) {
496  auto teh = scope->AddTopLevelSeqEntry(*se);
497  if (teh) {
498  CBioseq_CI bi(teh);
499  if (bi) {
500  bi->GetId().front().GetSeqId()->GetLabel(&m_CurrentId);
501  }
502  }
503  }
504  }
505  if (mAppConfig.mDoCleanup) {
506  m_Cleanup.SetScope(scope);
507  m_Cleanup.BasicCleanup(*ss);
508  }
509 
510  if (!mAppConfig.mQuiet) {
511  LOG_POST_XX(Corelib_App, 1, m_CurrentId);
512  }
513  CValidErrorSuppress::TCodes suppressed;
515  validator.Validate(*ss, scope, m_Options, msgHandler, &suppressed);
516  m_NumRecords++;
517 }
518 
519 
521 {
523 
524  // Validate Seq-annot
525  CValidator validator(*m_ObjMgr, m_pContext);
526  CRef<CScope> scope = BuildScope();
527  if (mAppConfig.mDoCleanup) {
528  m_Cleanup.SetScope(scope);
529  m_Cleanup.BasicCleanup(*sa);
530  }
531  CSeq_annot_Handle sah = scope->AddSeq_annot(*sa);
532  validator.Validate(sah, m_Options, msgHandler);
533  m_NumRecords++;
534 }
535 
536 
538 {
540 
541  CRef<CScope> scope = BuildScope();
542  if (mAppConfig.mDoCleanup) {
543  m_Cleanup.SetScope(scope);
544  m_Cleanup.BasicCleanup(*feat);
545  }
546 
547  CValidator validator(*m_ObjMgr, m_pContext);
548  m_NumRecords++;
549  validator.Validate(*feat, scope, m_Options, msgHandler);
550 }
551 
552 
554 {
556 
557  CValidator validator(*m_ObjMgr, m_pContext);
558  CRef<CScope> scope = BuildScope();
559  m_NumRecords++;
560  validator.Validate(*src, scope, m_Options, msgHandler);
561 }
562 
563 
565 {
566 
567  // Process file based on its content
568  if (asninfo==nullptr) {
569  auto content = mpIstr->GuessDataType(s_known_types);
570  if (content.size() == 1) {
571  asninfo = *content.begin();
572  }
573  }
574  if (asninfo==nullptr) {
575  NCBI_THROW(CException, eUnknown, "Unrecognized data type");
576  }
577 
578  CRef<CSerialObject> serial;
579  try
580  {
581  auto obj_info = mpIstr->Read(asninfo);
582  serial.Reset(static_cast<CSerialObject*>(obj_info.GetObjectPtr()));
583  }
584  catch (CException& e) {
586  ERR_POST(Error << e);
587  }
588  ReportReadFailure(&e, msgHandler);
589  return;
590  }
591 
592  string asn_type = asninfo->GetName();
593  bool unhandledType{false};
594 
595  try
596  {
597  if (asn_type == "Seq-submit") { // Seq-submit
598  ProcessSeqSubmit(serial, msgHandler);
599  } else if (asn_type == "Seq-entry") { // Seq-entry
600  ProcessSeqEntry(serial, msgHandler);
601  } else if (asn_type == "Seq-annot") { // Seq-annot
602  ProcessSeqAnnot(serial, msgHandler);
603  } else if (asn_type == "Seq-feat") { // Seq-feat
604  ProcessSeqFeat(serial, msgHandler);
605  } else if (asn_type == "BioSource") { // BioSource
606  ProcessBioSource(serial, msgHandler);
607  } else if (asn_type == "Pubdesc") { // Pubdesc
608  ProcessPubdesc(serial, msgHandler);
609  } else if (asn_type == "Bioseq-set") { // Bioseq-set
610  ProcessBioseqset(serial, msgHandler);
611  } else if (asn_type == "Bioseq") { // Bioseq
612  ProcessBioseq(serial, msgHandler);
613  } else if (asn_type == "Seqdesc") { // Seq-desc
614  ProcessSeqDesc(serial, msgHandler);
615  } else {
616  unhandledType = true;
617  }
618  }
619  catch(const CException& e)
620  {
621  if (NStr::StartsWith(e.GetMsg(), "duplicate Bioseq id", NStr::eNocase)) {
622  string errstr = e.GetMsg();
623  errstr = NStr::Replace(errstr, "\n", " * ");
624  errstr = NStr::Replace(errstr, " * ", " * ");
626  return;
627  }
628  throw;
629  }
630  if (unhandledType) {
631  NCBI_THROW(CException, eUnknown, "Unhandled type " + asn_type);
632  }
633 }
634 
635 
637  const string& loader_name,
638  CConstRef<CSubmit_block> pSubmitBlock,
639  CConstRef<CSeq_id> seqid,
640  IMessageHandler& msgHandler) const
641 {
642  CRef<CSeq_entry> pEntry;
643  CRef<CScope> scope = BuildScope();
644  if (!loader_name.empty())
645  scope->AddDataLoader(loader_name);
646 
647  CValidator validator(*m_ObjMgr, m_pContext);
648 
649  CSeq_entry_Handle top_h;
650  auto seq_id_h = CSeq_id_Handle::GetHandle(*seqid);
651  if (scope->Exists(seq_id_h)) {
652  if (auto bioseq_h = scope->GetBioseqHandle(seq_id_h); bioseq_h) {
653  top_h = bioseq_h.GetTopLevelEntry();
654  if (top_h) {
655  pEntry = Ref(const_cast<CSeq_entry*>(top_h.GetCompleteSeq_entry().GetPointer()));
656  }
657  }
658  }
659 
660  if (top_h) {
661  if (mAppConfig.mDoCleanup) {
663  cleanup.SetScope(scope);
664  cleanup.BasicCleanup(*pEntry);
665  }
666 
667  CValidErrorSuppress::TCodes suppressed;
668  if (pSubmitBlock) {
669  auto pSubmit = Ref(new CSeq_submit());
670  pSubmit->SetSub().Assign(*pSubmitBlock);
671  pSubmit->SetData().SetEntrys().push_back(pEntry);
672  CValidErrorSuppress::SetSuppressedCodes(*pSubmit, suppressed);
673  validator.Validate(*pSubmit, scope, m_Options, msgHandler, &suppressed);
674  }
675  else {
676  if (mAppConfig.mOnlyAnnots) {
677  for (CSeq_annot_CI ni(top_h); ni; ++ni) {
678  const CSeq_annot_Handle& sah = *ni;
679  validator.Validate(sah, m_Options, msgHandler);
680  }
681  return;
682  }
683  CValidErrorSuppress::SetSuppressedCodes(*pEntry, suppressed);
684  validator.Validate(*pEntry, scope, m_Options, msgHandler, &suppressed);
685  }
686  }
687 }
688 
689 
// Validates the blob currently loaded in `process`: registers a huge-file data
// loader for it, reports file-level errors, validates every top-level entry
// (asynchronously or sequentially) and finally triggers a write on
// `msgHandler`.
//
//   process     huge-file processor positioned on the blob to validate.
//   msgHandler  receiver of all validation messages.
690 void CAsnvalThreadState::ValidateOneHugeBlob(edit::CHugeFileProcess& process, IMessageHandler& msgHandler)
691 {
// The absolute path of the input file doubles as the data loader name.
692  string loader_name = CDirEntry::CreateAbsolutePath(process.GetFile().m_filename);
// Multi-threaded validation is the default; builds that define _DEBUG fall
// back to the sequential path.
693  bool use_mt = true;
694  #ifdef _DEBUG
695  use_mt = false;
696  #endif
697 
698  auto& reader = process.GetReader();
699 
// Register the loader as non-default; the trailing literal 1 is presumably the
// loader priority (see the commented-out constant) - TODO confirm.
700  auto info = edit::CHugeAsnDataLoader::RegisterInObjectManager(
701  *m_ObjMgr, loader_name, &reader, CObjectManager::eNonDefault, 1); //CObjectManager::kPriority_Local);
702 
// Scope guard built from the registration info (see CAutoRevoker).
703  CAutoRevoker autorevoker(info);
704  CHugeFileValidator hugeFileValidator(reader, m_Options);
705  hugeFileValidator.UpdateValidatorContext(m_GlobalInfo, *m_pContext);
706 
// Unless running quietly, log the label of the first top-level id and keep it
// in m_CurrentId.
707  if (!mAppConfig.mQuiet) {
708  if (const auto& topIds = reader.GetTopIds(); !topIds.empty()) {
709  m_CurrentId.clear();
710  topIds.front()->GetLabel(&m_CurrentId);
711  LOG_POST_XX(Corelib_App, 1, m_CurrentId);
712  }
713  }
714 
// With preprocessing enabled, file-wide errors are reported up front (skipped
// in annots-only mode) and written out immediately.
715  if (m_pContext->PreprocessHugeFile) {
716  if (! mAppConfig.mOnlyAnnots) {
717  hugeFileValidator.ReportGlobalErrors(m_GlobalInfo, msgHandler);
718  }
// NOTE(review): the declaration of `ignoreInferences` used below (listing
// line 719) is missing from this excerpt.
720  s_StartWrite(msgHandler, ignoreInferences);
721  }
722 
723 
724  if (use_mt) {
725  ValidateBlobAsync(loader_name, process, msgHandler);
726  } else {
727  ValidateBlobSequential(loader_name, process, msgHandler); // Need to revisit this
728  }
729 
730  hugeFileValidator.ReportPostErrors(*m_pContext, msgHandler);
731 
// Final write; inferences are ignored once the cumulative inference count has
// reached InferenceAccessionCutoff.
732  bool ignoreInferences = (m_pContext->CumulativeInferenceCount >= InferenceAccessionCutoff);
733  s_StartWrite(msgHandler, ignoreInferences);
734 }
735 
736 
737 
// Reads blobs from `process` one after another and validates each with
// ValidateOneHugeBlob until ReadNextBlob() reports that no blob is left.
//
// Error handling while reading a blob:
//   - CHugeFileException with code eDuplicateSeqIds: a write is triggered,
//     m_Reported is incremented and the loop moves on to the next blob;
//   - any other CHugeFileException is rethrown to the caller;
//   - any other CException is passed to ReportReadFailure and processing of
//     this file stops.
//
// NOTE(review): listing lines 742 and 752 are missing from this excerpt; line
// 752 presumably records the duplicate-id error on `msgHandler` - confirm
// against the repository copy.
738 void CAsnvalThreadState::ValidateOneHugeFile(edit::CHugeFileProcess& process, IMessageHandler& msgHandler)
739 {
740  while (true)
741  {
743 
744  try {
// A false return ends the loop: there is nothing more to read.
745  if (!process.ReadNextBlob())
746  break;
747 
748  }
749  catch (const edit::CHugeFileException& e) {
750  if (e.GetErrCode() == edit::CHugeFileException::eDuplicateSeqIds)
751  {
753  bool ignoreInferences = (m_pContext->CumulativeInferenceCount >= InferenceAccessionCutoff);
754  s_StartWrite(msgHandler, ignoreInferences);
755  ++m_Reported;
// Skip validation of this blob but keep reading the rest of the file.
756  continue;
757  }
758  throw;
759  }
760  catch (const CException& e) {
761  ReportReadFailure(&e, msgHandler);
762  return;
763  }
764 
// Each successfully read blob counts as one record.
765  m_NumRecords++;
766  ValidateOneHugeBlob(process, msgHandler);
767  }
768 }
769 
770 
772  CAsnvalThreadState* _this,
773  const string& loader_name,
774  CConstRef<CSubmit_block> pSubmitBlock,
775  CConstRef<CSeq_id> seqid,
776  IMessageHandler& msgHandler)
777 {
778  CThreadExitData exit_data;
779  try
780  {
782  _this->ValidateAsync(loader_name, pSubmitBlock, seqid, msgHandler);
783  double elapsed = sw.Elapsed();
784  exit_data.mLongest = elapsed;
785  }
786  catch (const CException& e) {
787  string errstr = e.GetMsg();
788  errstr = NStr::Replace(errstr, "\n", " * ");
789  errstr = NStr::Replace(errstr, " * ", " * ");
791  ERR_POST(e);
792  }
793  return exit_data;
794 }
795 
796 
// Validates every top-level id of the current blob concurrently.
//
// Three kinds of tasks cooperate:
//   - a writer task that calls msgHandler.Write() when the handler wants it;
//   - a producer task that starts one ValidateWorker per top-level id and
//     pushes the resulting futures into `val_queue`, followed by an empty
//     (invalid) future as end marker;
//   - this thread, which drains `val_queue`, collects the workers' timing data
//     and, on the end marker, asks the handler to stop.
//
//   loader_name  name of the data loader the workers add to their scope.
//   process      huge-file processor supplying the reader for this blob.
//   msgHandler   receiver of all validation messages.
//
// NOTE(review): the declaration of `val_queue` (listing line 804) is missing
// from this excerpt.
797 void CAsnvalThreadState::ValidateBlobAsync(const string& loader_name, edit::CHugeFileProcess& process,
798  IMessageHandler& msgHandler)
799 {
// Evaluated once, before any worker runs; the writer lambda sees this value by
// reference.
800  bool ignoreInferences = (m_pContext->CumulativeInferenceCount >= InferenceAccessionCutoff);
801  auto& reader = process.GetReader();
// NOTE(review): no launch policy is given here, so the implementation is free
// to defer this task until writer_task.wait() at the end of the function,
// i.e. after RequestStop() has already been issued. The other std::async calls
// below pass std::launch::async explicitly - confirm whether this one should
// too. `this` is captured but not used inside the lambda.
802  auto writer_task = std::async([this, &ignoreInferences, &msgHandler] { if(msgHandler.InvokeWrite()){ msgHandler.Write(ignoreInferences); } });
803 
805  // start a loop in a separate thread
806  auto topids_task = std::async(std::launch::async, [this, &val_queue, &loader_name, &reader, &msgHandler]()
807  {
808  auto pSubmitBlock = reader.GetSubmitBlock();
809  for (auto seqid : reader.GetTopIds())
810  {
// One asynchronous ValidateWorker call per top-level id; arguments other than
// msgHandler are copied into the task.
811  auto fut = std::async(std::launch::async, ValidateWorker,
812  this, loader_name, pSubmitBlock, seqid, std::ref(msgHandler));
813  // std::future is not copiable, so passing it for move constructor
814  val_queue.push_back(std::move(fut));
815  }
816 
// A default-constructed entry is not valid() and serves as the end-of-work
// marker for the consumer loop below.
817  val_queue.push_back({});
818  });
819 
820 
821  while (true)
822  {
823  auto result = val_queue.pop_front();
824  if (!result.valid()) {
825  if (msgHandler.InvokeWrite()) {
826  msgHandler.RequestStop(); // stop write
827  }
828  break;
829  }
// get() waits for the worker and yields its CThreadExitData.
830  auto exit_data = result.get();
831  if (exit_data.mLongest > m_Longest) {
832  m_Longest = exit_data.mLongest;
// The id recorded is this object's m_CurrentId, not an id reported by the
// worker (see the commented-out alternative).
833  m_LongestId = m_CurrentId; //exit_data.mLongestId;
834  }
835  }
836 
// Both helper tasks reference locals of this function, so they are waited for
// before returning.
837  topids_task.wait();
838  writer_task.wait();
839 }
840 
841 
842 
844  const string& loader_name,
845  edit::CHugeFileProcess& process,
846  IMessageHandler& msgHandler)
847 {
848  auto& reader = process.GetReader();
849 
850  for (auto seqid : reader.GetTopIds())
851  {
852  auto pSubmitBlock = reader.GetSubmitBlock();
853  ValidateAsync(loader_name, pSubmitBlock, seqid, msgHandler);
854  }
855  bool ignoreInferences = (m_pContext->CumulativeInferenceCount >= InferenceAccessionCutoff);
856  s_StartWrite(msgHandler, ignoreInferences);
857 }
858 
859 
861 {
863 
864  ValidateInput(asninfo, msgHandler);
865 
866  double elapsed = sw.Elapsed();
867  if (elapsed > m_Longest) {
868  m_Longest = elapsed;
870  }
871  bool ignoreInferences = (m_pContext->CumulativeInferenceCount >= InferenceAccessionCutoff);
872  s_StartWrite(msgHandler, ignoreInferences);
873 
874  if (m_ReadFailure) {
875  return false;
876  }
877  return true;
878 }
879 
880 
882 {
883  if (asninfo == CBioseq_set::GetTypeInfo()) {
884  ProcessBSSReleaseFile(msgHandler);
885  bool ignoreInferences = (m_pContext->CumulativeInferenceCount >= InferenceAccessionCutoff);
886  s_StartWrite(msgHandler, ignoreInferences);
887  return true;
888  }
889  else
890  if (asninfo == CSeq_submit::GetTypeInfo()) {
891  const auto commandLineOptions = m_Options;
893  try {
894  ProcessSSMReleaseFile(msgHandler);
895  }
896  catch (CException&) {
897  m_Options = commandLineOptions;
898  throw;
899  }
900  m_Options = commandLineOptions;
901  bool ignoreInferences = (m_pContext->CumulativeInferenceCount >= InferenceAccessionCutoff);
902  s_StartWrite(msgHandler, ignoreInferences);
903  return true;
904  }
905  else {
906  LOG_POST_XX(Corelib_App, 1, "FAILURE: Record is neither a Seq-submit nor Bioseq-set; do not use -batch to process.");
907  return false;
908  }
909 }
910 
911 
913 {
914  unique_ptr<IMessageHandler> pMsgHandler;
915  if (mAppConfig.mHugeFile) { // Also need to check input stream here
916  pMsgHandler.reset(new CAsyncMessageHandler(mAppConfig, ostr));
917  }
918  else {
919  pMsgHandler.reset(new CSerialMessageHandler(mAppConfig, ostr));
920  }
921  auto result = ValidateOneFile(filename, *pMsgHandler);
922  result.mReported += pMsgHandler->GetNumReported();
923  return result;
924 }
925 
926 
928 {
929  if (!mAppConfig.mQuiet) {
930  LOG_POST_XX(Corelib_App, 1, filename);
931  }
932 
933  TTypeInfo asninfo = nullptr;
934  unique_ptr<edit::CHugeFileProcess> mpHugeFileProcess;
935 
936  if (filename.empty())
937  mpIstr = OpenFile(asninfo, filename);
938  else {
939  auto huge_reader = Ref(new edit::CHugeAsnReader());
940  huge_reader->ExtendReadHooks([this](CObjectIStream& istream)
941  {
943  });
944 
945  mpHugeFileProcess.reset(new edit::CHugeFileProcess(huge_reader.GetPointer()));
946  try {
947  mpHugeFileProcess->Open(filename, &s_known_types);
948  asninfo = mpHugeFileProcess->GetFile().m_content;
949  }
950  catch (const CObjReaderParseException&) {
951  mpHugeFileProcess.reset();
952  throw;
953  }
954 
955  if (asninfo) {
956  if (!mAppConfig.mHugeFile || mAppConfig.mBatch || !edit::CHugeFileProcess::IsSupported(asninfo)) {
957  mpIstr = mpHugeFileProcess->GetReader().MakeObjStream(0);
958  }
959  }
960  else {
961  mpIstr = OpenFile(asninfo, filename);
962  }
963  }
964 
965  if (mAppConfig.mHugeFile && !mpIstr) {
966  ValidateOneHugeFile(*mpHugeFileProcess, msgHandler);
967  } else {
968 
969  bool proceed = true;
970 
971  do {
972  if (!asninfo) {
973  ReportReadFailure(nullptr, msgHandler);
974  LOG_POST_XX(Corelib_App, 1, "FAILURE: Unable to process invalid ASN.1 file " + filename);
975  break;
976  }
977 
978  try {
979  if (mAppConfig.mBatch) {
980  if (!ValidateBatchMode(asninfo, msgHandler)) {
981  proceed = ValidateTraditionally(asninfo, msgHandler);
982  }
983  }
984  else
985  proceed = ValidateTraditionally(asninfo, msgHandler);
986 
987  if (mpIstr->EndOfData()) // force to SkipWhiteSpace
988  break;
989  else {
990  auto types = mpIstr->GuessDataType(s_known_types);
991  asninfo = types.empty() ? nullptr : *types.begin();
992  }
993  }
994  catch (const CException& e) {
995  string errstr = e.GetMsg();
996  errstr = NStr::Replace(errstr, "\n", " * ");
997  errstr = NStr::Replace(errstr, " * ", " * ");
999  ++m_Reported;
1000  ERR_POST(e);
1001  }
1002  }
1003  while (proceed);
1004  }
1005 
1006  mpIstr.reset();
1007 
1009 }
1010 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eErr_GENERIC_InvalidAsn
@ eErr_GENERIC_DuplicateIDs
@ eErr_INTERNAL_Exception
unsigned int m_Options
Definition: app_config.hpp:67
bool mDoCleanup
Definition: app_config.hpp:57
bool mOnlyAnnots
Definition: app_config.hpp:65
int mNumInstances
Definition: app_config.hpp:68
bool mHugeFile
Definition: app_config.hpp:66
bool mContinue
Definition: app_config.hpp:64
EVerbosity mVerbosity
Definition: app_config.hpp:58
void ProcessPubdesc(CRef< CSerialObject > serial, IMessageHandler &msgHandler)
void ValidateBlobAsync(const string &loader_name, edit::CHugeFileProcess &process, IMessageHandler &msgHandler)
bool ValidateTraditionally(TTypeInfo asninfo, IMessageHandler &msgHandler)
void ValidateBlobSequential(const string &loader_name, edit::CHugeFileProcess &process, IMessageHandler &msgHandler)
void ReportReadFailure(const CException *p_exception, IMessageHandler &msgHandler)
void ProcessSeqAnnot(CRef< CSerialObject > serial, IMessageHandler &msgHandler)
shared_ptr< SValidatorContext > m_pContext
static CThreadExitData ValidateWorker(CAsnvalThreadState *_this, const string &loader_name, CConstRef< CSubmit_block > pSubmitBlock, CConstRef< CSeq_id > seqid, IMessageHandler &msgHandler)
void ProcessBioSource(CRef< CSerialObject > serial, IMessageHandler &msgHandler)
void ReadClassMember(CObjectIStream &in, const CObjectInfo::CMemberIterator &member, IMessageHandler &msgHandler)
std::list< CConstRef< CValidError > > m_eval
void ProcessSeqEntry(CSeq_entry &se, IMessageHandler &msgHandler)
CHugeFileValidator::TGlobalInfo m_GlobalInfo
void ProcessBioseq(CRef< CSerialObject > serial, IMessageHandler &msgHandler)
CRef< CScope > BuildScope() const
bool ValidateBatchMode(TTypeInfo asninfo, IMessageHandler &msgHandler)
CAsnvalThreadState(const CAppConfig &, SValidatorContext::taxupdate_func_t taxon)
void ProcessBioseqset(CRef< CSerialObject > serial, IMessageHandler &msgHandler)
void ProcessSeqSubmit(CRef< CSerialObject > serial, IMessageHandler &msgHandler)
void ValidateOneHugeFile(edit::CHugeFileProcess &process, IMessageHandler &msgHandler)
CThreadExitData ValidateOneFile(const string &infilename, CNcbiOstream &ostr)
void ProcessSSMReleaseFile(IMessageHandler &msgHandler)
const CAppConfig & mAppConfig
void ProcessSeqFeat(CRef< CSerialObject > serial, IMessageHandler &msgHandler)
unique_ptr< CObjectIStream > OpenFile(TTypeInfo &asn_info, const string &filename) const
void ProcessBSSReleaseFile(IMessageHandler &msgHandler)
std::atomic< size_t > m_Reported
void ValidateInput(TTypeInfo asninfo, IMessageHandler &msgHandler)
void ValidateAsync(const string &loader_name, CConstRef< CSubmit_block > pSubmitBlock, CConstRef< CSeq_id > seqid, IMessageHandler &msgHandler) const
void ProcessSeqDesc(CRef< CSerialObject > serial, IMessageHandler &msgHandler)
void ValidateOneHugeBlob(edit::CHugeFileProcess &process, IMessageHandler &msgHandler)
CRef< CObjectManager > m_ObjMgr
unique_ptr< CObjectIStream > mpIstr
unsigned int m_Options
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
TChanges BasicCleanup(CSeq_entry &se, Uint4 options=0)
Definition: cleanup.cpp:132
void SetScope(CScope *scope)
Definition: cleanup.cpp:108
CDecompressIStream –.
Wraps CFormatGuess, and if CFormatGuess's result is Unknown, it tries every file reader until one wor...
CFormatGuess::EFormat GuessFormatAndContent(CFileContentInfo &contentInfo)
void SetRecognizedGenbankTypes(const set< TTypeInfo > &recognizedGenbankTypes)
EFormat
The formats are checked in the same order as declared here.
@ eBZip2
bzip2 compressed file
@ eBinaryASN
Binary ASN.1.
@ eLzo
lzo compressed file
@ eGZip
GNU zip compressed file.
@ eTextASN
Text ASN.1.
void ReportGlobalErrors(const TGlobalInfo &globalInfo, IValidError &errors) const
void UpdateValidatorContext(const TGlobalInfo &globalInfo, SValidatorContext &context) const
static void RegisterReaderHooks(CObjectIStream &objStream, SGlobalInfo &m_GlobalInfo)
void ReportPostErrors(const SValidatorContext &context, IValidError &errors) const
Reading (iterating through) elements of containers (SET OF, SEQUENCE OF).
Definition: objectio.hpp:164
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
CObjectIStream –.
Definition: objistr.hpp:93
CObjectInfoMI –.
Definition: objectiter.hpp:432
CObjectTypeInfo –.
Definition: objectinfo.hpp:94
CScope –.
Definition: scope.hpp:92
CSeq_annot_CI –.
CSeq_annot_Handle –.
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
Root class for all serialization exceptions.
Definition: exception.hpp:50
CStopWatch –.
Definition: ncbitime.hpp:1937
CTypeInfo class contains all information about C++ types (both basic and classes): members and layout...
Definition: typeinfo.hpp:76
static void SetSuppressedCodes(const CUser_object &user, TCodes &errCodes)
CRef< CValidError > Validate(const CSeq_entry &se, CScope *scope=nullptr, Uint4 options=0)
Definition: validator.cpp:101
@ eVal_seqsubmit_parent
Definition: validator.hpp:85
virtual void Write(bool ignoreInferences=true)=0
virtual bool InvokeWrite() const =0
void AddValidErrItem(EDiagSev sev, unsigned int ec, const string &msg, const string &desc, const CSerialObject &obj, const string &acc, const int ver, const string &location=kEmptyStr, const int seq_offset=0) override
virtual void RequestStop()=0
Definition: set.hpp:45
Include a standard set of the NCBI C++ Toolkit most basic headers.
static void cleanup(void)
Definition: ct_dynamic.c:30
CS_CONTEXT * ctx
Definition: t0006.c:12
static const struct type types[]
Definition: type.c:22
@ eTakeOwnership
An object can take ownership of another.
Definition: ncbi_types.h:136
@ eNoOwnership
No ownership is assumed.
Definition: ncbi_types.h:135
string
Definition: cgiapp.hpp:690
EMethod
Compression/decompression methods.
Definition: stream_util.hpp:98
@ eLZO
LZO (LZO1X)
@ eNone
no compression method (copy "as is")
Definition: stream_util.hpp:99
@ eGZipFile
.gz file (including concatenated files)
@ fDefault
Use algorithm-specific defaults.
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define LOG_POST_XX(error_name, err_subcode, message)
Definition: ncbidiag.hpp:569
@ eDiag_Fatal
Fatal error – guarantees exit (or abort)
Definition: ncbidiag.hpp:655
@ eDiag_Critical
Critical error message.
Definition: ncbidiag.hpp:654
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
TErrCode GetErrCode(void) const
Get error code.
Definition: ncbiexpt.cpp:453
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
string ReportAll(TDiagPostFlags flags=eDPF_Exception) const
Report all exceptions.
Definition: ncbiexpt.cpp:370
static string CreateAbsolutePath(const string &path, ERelativeToWhat rtw=eRelativeToCwd)
Get an absolute path from some, possibly relative, path.
Definition: ncbifile.cpp:665
@ eSerial_AsnText
ASN.1 text.
Definition: serialdef.hpp:73
@ eSerial_AsnBinary
ASN.1 binary.
Definition: serialdef.hpp:74
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
static CObjectIStream * Open(ESerialDataFormat format, CNcbiIstream &inStream, bool deleteInStream)
Create serial object reader and attach it to an input stream.
Definition: objistr.cpp:195
@ eEOF
Unexpected end-of-file.
Definition: exception.hpp:55
void ResetHistory(EActionIfLocked action=eKeepIfLocked)
Clean all unused TSEs from the scope's cache and release the memory.
Definition: scope.cpp:325
void AddDataLoader(const string &loader_name, TPriority pri=kPriority_Default)
Add data loader by name.
Definition: scope.cpp:510
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry; default priority is higher than for defaults or loaders. Add object to the scope with p...
Definition: scope.cpp:522
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
Definition: scope.cpp:504
CSeq_annot_Handle AddSeq_annot(CSeq_annot &annot, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add Seq-annot, return its CSeq_annot_Handle.
Definition: scope.cpp:538
void RemoveTopLevelSeqEntry(const CTSE_Handle &entry)
Revoke TSE previously added using AddTopLevelSeqEntry() or AddBioseq().
Definition: scope.cpp:376
bool RevokeDataLoader(CDataLoader &loader)
Revoke previously registered data loader.
bool Exists(const CSeq_id &id)
Check existence of sequence with this id.
Definition: scope.cpp:393
CSeq_entry_Handle GetTopLevelEntry(void) const
Get top level Seq-entry handle.
CConstRef< CSeq_entry > GetCompleteSeq_entry(void) const
Complete and get const reference to the seq-entry.
const TId & GetId(void) const
TObjectType * GetPointer(void) const THROWS_NONE
Get pointer.
Definition: ncbiobj.hpp:1684
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
#define NcbiCin
Definition: ncbistre.hpp:542
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
static string & Replace(const string &src, const string &search, const string &replace, string &dst, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3305
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5406
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
double Elapsed(void) const
Return time elapsed since first Start() or last Restart() call (in seconds).
Definition: ncbitime.hpp:2775
@ eStart
Start timer immediately after creating.
Definition: ncbitime.hpp:1941
const string & GetName(void) const
Get name of this type.
Definition: typeinfo.cpp:249
ENcbiOwnership
Ownership relations between objects.
Definition: ncbi_types.h:134
const TData & GetData(void) const
Get the Data member data.
void SetData(TData &value)
Assign a value to Data data member.
bool IsEntrys(void) const
Check if variant Entrys is selected.
static CStopWatch sw
Definition of all error codes used in corelib (xncbi.lib).
int i
yy_size_t n
static MDB_envinfo info
Definition: mdb_load.c:37
Definition: fix_pub.hpp:45
Magic spell ;-) needed for some weird compilers... very empiric.
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines command line argument related classes.
Defines unified interface to application:
#define nullptr
Definition: ncbimisc.hpp:45
NCBI C++ stream class wrappers for triggering between "new" and "old" C++ stream libraries.
static Format format
Definition: njn_ioutil.cpp:53
std::istream & in(std::istream &in_, double &x_)
The Object manager core.
void SetLocalReadHook(const CObjectTypeInfo &obj_type_info, CObjectIStream &ostr, _Func _func)
C++ I/O stream wrappers to compress/decompress data on-the-fly.
SRegisterLoaderInfo –.
function< CRef< CTaxon3_reply >(const vector< CRef< COrg_ref > > &list)> taxupdate_func_t
USING_SCOPE(objects)
const set< TTypeInfo > s_known_types
static void s_StartWrite(IMessageHandler &msgHandler, bool ignoreInferences=false)
static CRef< objects::CSeq_entry > s_BuildGoodSeq()
else result
Definition: token2.c:20
CFileContentInfoGenbank mInfoGenbank
const int InferenceAccessionCutoff
#define const
Definition: zconf.h:232
Modified on Fri Sep 20 14:57:29 2024 by modify_doxy.py rev. 669887