1 /* $Id: asn_cleanup.cpp 101210 2023-11-16 14:19:11Z gotvyans $
2  * ===========================================================================
3  *
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Aaron Ucko, Mati Shomrat, Colleen Bollin, NCBI
27  *
28  * File Description:
29  * runs ExtendedCleanup on ASN.1 files
30  *
31  * ===========================================================================
32  */
33 #include <ncbi_pch.hpp>
34 #include <common/ncbi_source_ver.h>
35 #include <corelib/ncbiapp.hpp>
37 #include <objmgr/util/sequence.hpp>
53 #include "huge_file_cleanup.hpp"
55 #include "read_hooks.hpp"
56 #include "bigfile_processing.hpp"
58 #include <common/ncbi_revision.h>
60 #ifndef NCBI_SC_VERSION
62 #elif (NCBI_SC_VERSION == 0)
64 #endif
77 };
// Mutable processing state kept by CCleanupApp; x_ProcessOneFile() replaces it
// with a default-constructed instance at the start of every input file.
79 struct TThreadState {
 // True while the blob being processed is a multi-sequence one; assigned from
 // the reader's IsMultiSequence() in x_ProcessHugeFile() and reset to false
 // for each new file.
81  bool m_IsMultiSeq = false;
83 };
// Application class of the ASN.1 cleanup tool. Besides being the
// CNcbiApplication entry point it also serves as the callback object for
// submit-block / Seq-entry handling and for big-file processing
// (IProcessorCallback).
85 class CCleanupApp :
86  public CNcbiApplication,
88  public ISubmitBlockHandler,
89  public IProcessorCallback
90 {
91 public:
92  CCleanupApp();
93  void Init() override;
94  int Run() override;
 // Callbacks invoked for each submit block / Seq-entry encountered in the input.
96  bool HandleSubmitBlock(CSubmit_block& block) override;
97  bool HandleSeqEntry(CRef<CSeq_entry>& se) override;
 // Looks up the sequence for a textual seq ID via a scope, runs the entry
 // handler and writes the entry to the output stream.
99  bool HandleSeqID(const string& seqID);
101  // IProcessorCallback interface functionality
102  void Process(CRef<CSerialObject>& obj) override;
104 private:
105  // types
 // Output stream management: x_OpenOStream() (re)creates m_Out for a file
 // (optionally placed under `dir`) or for stdout when `filename` is blank;
 // x_CloseOStream() closes and releases it.
107  void x_OpenOStream(const string& filename, const string& dir = kEmptyStr, bool remove_orig_dir = true);
108  void x_CloseOStream();
 // Per-file / per-directory processing drivers.
109  bool x_ProcessBigFile(unique_ptr<CObjectIStream>& is, TTypeInfo asn_type);
110  void x_ProcessOneFile(unique_ptr<CObjectIStream>& is, EProcessingMode mode, TTypeInfo asn_type);
111  void x_ProcessOneFile(const string& filename);
112  void x_ProcessOneDirectory(const string& dirname, const string& suffix);
 // Huge-file mode drivers and the object-at-a-time ("traditional") path.
114  bool x_ProcessHugeFile(edit::CHugeFileProcess& process);
115  bool x_ProcessHugeFileBlob(edit::CHugeFileProcess& process);
116  CConstRef<CSerialObject> x_ProcessTraditionally(edit::CHugeAsnReader& reader);
117  void x_ProcessTraditionally(edit::CHugeFileProcess& process);
 // Validators for the -F, -K and -X option strings; each throws
 // CArgException when the string contains an unrecognized letter.
119  void x_FeatureOptionsValid(const string& opt);
120  void x_KOptionsValid(const string& opt);
121  void x_XOptionsValid(const string& opt);
 // Appliers for the -F and -X option strings; return true if anything changed.
122  bool x_ProcessFeatureOptions(const string& opt, CSeq_entry_Handle seh);
124  bool x_ProcessXOptions(const string& opt, CSeq_entry_Handle seh, Uint4 options);
125  bool x_GFF3Batch(CSeq_entry_Handle seh);
 // NOTE(review): the declaration closed by the next line is not visible in
 // this view of the file.
130  };
133  bool x_FixCDS(CSeq_entry_Handle seh, Uint4 options, const string& missing_prot_name);
135  bool x_BasicAndExtended(CSeq_entry_Handle entry, const string& label, Uint4 options = 0);
 // Logs the descriptions held in `changes` under `prefix`; returns true when
 // there was at least one.
137  bool x_ReportChanges(const string_view prefix, CCleanupChangeCore changes);
139  // template<typename T> void x_WriteToFile(const T& s);
141  // data
142  unique_ptr<edit::CRemoteUpdater> m_remote_updater;
144  unique_ptr<CObjectOStream> m_Out; // output
145  CRef<CObjectManager> m_Objmgr; // Object Manager
 // Which cleanup passes to run; both are decided once from the command line.
146  bool m_do_basic = false;
147  bool m_do_extended = false;
 // When set, the huge-set cleanup flag is passed to cleanup and the
 // -X i (small genome set) step is skipped.
149  bool m_IsHugeSet = false;
150 };
154 {
156 }
159 {
160  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
161  arg_desc->SetUsageContext("", "Perform ExtendedCleanup on an ASN.1 Seq-entry into a flat report");
163  // input
164  {
165  // name
166  arg_desc->AddOptionalKey("i", "InputFile",
167  "Input file name", CArgDescriptions::eInputFile);
169  // input file serial format (AsnText\AsnBinary\XML, default: AsnText)
170  arg_desc->AddOptionalKey("serial", "SerialFormat", "Obsolete; Input file format is now autodetected",
173  // output file serial format (AsnText\AsnBinary\XML, default: AsnText)
174  arg_desc->AddOptionalKey("outformat", "OutputSerialFormat", "Output file format",
176  arg_desc->SetConstraint("outformat", &(*new CArgAllow_Strings,
177  "text", "binary", "XML", "JSON"));
179  // id
180  arg_desc->AddOptionalKey("id", "ID",
181  "Specific ID to display", CArgDescriptions::eString);
183  // input type:
184  arg_desc->AddOptionalKey("type", "AsnType", "Obsolete; ASN.1 object type is now autodetected",
187  // path
188  arg_desc->AddOptionalKey("indir", "path", "Path to files", CArgDescriptions::eDirectory);
190  // suffix
191  arg_desc->AddDefaultKey("x", "suffix", "File Selection Suffix", CArgDescriptions::eString, ".ent");
193  // results
194  arg_desc->AddOptionalKey("outdir", "results", "Path for Results", CArgDescriptions::eDirectory);
195  }
197  // batch processing
198  {
199  arg_desc->AddFlag("batch", "Process NCBI release file (Deprecated)",
201  }
203  // big file processing
204  {
205  arg_desc->AddFlag("bigfile", "Process big files containing many bioseqs");
206  }
208  // output
209  {
210  // name
211  arg_desc->AddOptionalKey("o", "OutputFile",
212  "Output file name", CArgDescriptions::eOutputFile);
213  }
215  // normal cleanup options (will replace -nocleanup and -basic)
216  {
217  arg_desc->AddOptionalKey("K", "Cleanup", "Systemic Cleaning Options\n"
218  "\tb Basic\n"
219  "\ts Extended\n"
220  "\tn Normalize Descriptor Order\n"
221  "\tu Remove Cleanup User-object\n",
223  }
225  // extra cleanup options
226  {
227  arg_desc->AddOptionalKey("F", "Feature", "Feature Cleaning Options\n"
228  "\tr Remove Redundant Gene xref\n"
229  "\ta Adjust for Missing Stop Codon\n"
230  "\tp Clear internal partials\n"
231  "\tz Delete or Update EC Numbers\n"
232  "\td Remove duplicate features\n",
235  arg_desc->AddOptionalKey("X", "Miscellaneous", "Other Cleaning Options\n"
236  "\td Automatic Definition Line\n"
237  "\tw GFF/WGS Genome Cleanup\n"
238  "\tr Regenerate Definition Lines\n"
239  "\tb Batch Cleanup of Multireader Output\n"
240  "\ta Remove Assembly Gaps\n"
241  "\ti Make Influenza Small Genome Sets\n"
242  "\tf Make IRD misc_feats\n",
245  arg_desc->AddFlag("T", "TaxonomyLookup");
246  }
248  // misc
249  {
250  // no-cleanup
251  arg_desc->AddFlag("nocleanup",
252  "Do not perform extended data cleanup prior to formatting");
253  arg_desc->AddFlag("basic",
254  "Perform basic data cleanup prior to formatting");
255  arg_desc->AddFlag("noobj",
256  "Do not create Ncbi_cleanup object");
258  // show progress
259  arg_desc->AddFlag("showprogress",
260  "List ID for which cleanup is occuring");
261  arg_desc->AddFlag("debug", "Save before.sqn");
263  // huge mode
264  arg_desc->AddFlag("huge",
265  "Process file in huge files mode");
266  arg_desc->AddFlag("disable-huge",
267  "Explicitly disable huge files mode");
268  arg_desc->SetDependency("disable-huge",
270  "huge");
271  }
273  // remote
276  SetupArgDescriptions(arg_desc.release());
277 }
280 void CCleanupApp::x_FeatureOptionsValid(const string& opt)
281 {
282  if (NStr::IsBlank(opt)) {
283  return;
284  }
285  string unrecognized;
286  for (char c : opt) {
287  if (! isspace(c)) {
288  if (c != 'r' && c != 'a' && c != 'p' && c != 'z' && c != 'd') {
289  unrecognized += c;
290  }
291  }
292  }
293  if (! unrecognized.empty()) {
294  NCBI_THROW(CArgException, eInvalidArg, "Invalid -F arguments:" + unrecognized);
295  }
296 }
299 void CCleanupApp::x_KOptionsValid(const string& opt)
300 {
301  if (NStr::IsBlank(opt)) {
302  return;
303  }
304  string unrecognized;
305  for (char c : opt) {
306  if (! isspace(c)) {
307  if (c != 'b' && c != 's' && c != 'u' && c != 'n') {
308  unrecognized += c;
309  }
310  }
311  }
312  if (! unrecognized.empty()) {
313  NCBI_THROW(CArgException, eInvalidArg, "Invalid -K arguments:" + unrecognized);
314  }
315 }
318 void CCleanupApp::x_XOptionsValid(const string& opt)
319 {
320  if (NStr::IsBlank(opt)) {
321  return;
322  }
323  string unrecognized;
324  for (char c : opt) {
325  if (! isspace(c)) {
326  if (c != 'w' && c != 'r' && c != 'b' && c != 'a' &&
327  c != 'i' && c != 'f' && c != 'd') {
328  unrecognized += c;
329  }
330  }
331  }
332  if (! unrecognized.empty()) {
333  NCBI_THROW(CArgException, eInvalidArg, "Invalid -X arguments:" + unrecognized);
334  }
335 }
338 bool CCleanupApp::x_ProcessBigFile(unique_ptr<CObjectIStream>& is, TTypeInfo asn_type)
339 {
340  EBigFileContentType content_type = eContentUndefined;
341  if (asn_type == CSeq_entry::GetTypeInfo()) {
342  content_type = eContentSeqEntry;
343  } else if (asn_type == CBioseq_set::GetTypeInfo()) {
344  content_type = eContentBioseqSet;
345  } else if (asn_type == CSeq_submit::GetTypeInfo()) {
346  content_type = eContentSeqSubmit;
347  } else {
348  _ASSERT(0);
349  }
351  return ProcessBigFile(*is, *m_Out, *this, content_type);
352 }
354 void CCleanupApp::x_ProcessOneFile(unique_ptr<CObjectIStream>& is, EProcessingMode mode, TTypeInfo asn_type)
355 {
356  if (mode == eModeBatch) {
357  CGBReleaseFile in(*is.release());
358  in.RegisterHandler([this](CRef<CSeq_entry>& entry) -> bool
359  {
360  return HandleSeqEntry(entry);
361  });
362  in.Read(); // HandleSeqEntry will be called from this function
363  } else if (mode == eModeBigfile) {
364  x_ProcessBigFile(is, asn_type);
365  }
366 }
368 static bool s_IsHugeMode(const CArgs& args, const CNcbiRegistry& cfg)
369 {
370  if (args["disable-huge"])
371  return false;
372  if (args["huge"])
373  return true;
374  return cfg.GetBool("asn_cleanup", "UseHugeFiles", false);
375 }
// Processes one input file: opens it through the huge-file reader, takes the
// detected ASN.1 content type, picks a processing mode, runs the matching
// pipeline and adds the changes recorded by the reader to m_state.
//
// Throws CArgException (eInvalidArg) when the file content is not recognized.
377 void CCleanupApp::x_ProcessOneFile(const string& filename)
378 {
379  const CArgs& args = GetArgs();
 // start every file with fresh per-file state
381  m_state = TThreadState();
385  _ASSERT(! NStr::IsBlank(filename));
 // -type and -serial are still accepted but only produce a warning
387  if (args["type"]) {
388  cerr << "Warning: -type argument should not be used; ASN.1 object type is now autodetected." << endl;
389  }
390  if (args["serial"]) {
391  cerr << "Warning: -serial argument should not be used; Input file format is now autodetected." << endl;
392  }
 // Reader options start out empty.
 // NOTE(review): the bodies of the three conditionals below are not visible
 // in this view, so which option each one sets cannot be stated here.
394  CCleanupHugeAsnReader::TOptions options{ 0 };
395  if (m_do_extended) {
397  }
398  if (args["noobj"]) {
400  }
402  if (args["X"] && (NStr::Find(args["X"].AsString(), "i") != NPOS)) {
404  }
406  edit::CHugeFileProcess huge_process(new CCleanupHugeAsnReader(options));
407  huge_process.OpenFile(filename);
 // a null content type means the file content was not recognized
409  TTypeInfo asn_type = huge_process.GetFile().m_content;
410  if (! asn_type) {
411  string msg = "Unable to open input file " + filename + ". Content not recognized.";
412  NCBI_THROW(CArgException, eInvalidArg, msg);
413  }
415  // need to set output if -o not specified
416  bool opened_output = false;
 // with -outdir and no -o, each input file gets its own output stream
418  if (! args["o"] && args["outdir"]) {
419  x_OpenOStream(filename, args["outdir"].AsString());
420  opened_output = true;
421  }
 // Mode selection: a bare Bioseq is always handled in regular mode; otherwise
 // huge mode is tested first, then -batch, then -bigfile.
 // NOTE(review): the declaration of `mode` and the body of the huge-mode
 // branch are not visible in this view.
424  m_state.m_IsMultiSeq = false;
425  if (asn_type == CBioseq::GetTypeInfo()) {
426  // always regular mode
427  mode = eModeRegular;
428  } else if (s_IsHugeMode(args, GetConfig())) {
430  } else if (args["batch"]) {
431  mode = eModeBatch;
432  } else if (args["bigfile"]) {
433  mode = eModeBigfile;
434  }
 // huge and regular modes go through the huge-file reader; batch and bigfile
 // modes read from a plain object stream created over the same file
436  if (mode == eModeHugefile) {
437  huge_process.OpenReader();
438  x_ProcessHugeFile(huge_process);
439  } else if (mode == eModeRegular) {
440  huge_process.OpenReader();
441  x_ProcessTraditionally(huge_process);
442  } else {
443  unique_ptr<CObjectIStream> is = huge_process.GetFile().MakeObjStream(0);
444  x_ProcessOneFile(is, mode, asn_type);
445  }
 // the reader was constructed above as a CCleanupHugeAsnReader, so the
 // reference cast is expected to succeed
447  m_state.m_changes += dynamic_cast<CCleanupHugeAsnReader&>(huge_process.GetReader()).GetChanges();
449  if (opened_output) {
450  // close output file if we opened one
451  x_CloseOStream();
452  }
453 }
455 void CCleanupApp::x_ProcessTraditionally(edit::CHugeFileProcess& process)
456 {
457  bool proceed = true;
458  size_t num_cleaned = 0;
459  auto reader = process.GetReader();
461  while (proceed) {
463  auto anytop = x_ProcessTraditionally(reader);
464  proceed = anytop;
466  if (anytop) {
467  *m_Out << *anytop;
468  }
470  if (proceed) {
471  ++num_cleaned;
472  }
473  }
475  // if (num_cleaned == 0 || (! first_only && (is->GetFailFlags() & CObjectIStream::fEOF) != CObjectIStream::fEOF)) {
476  // NCBI_THROW(CArgException, eInvalidArg, "Unable to construct Seq-entry object");
477  // }
478 }
481 {
482  auto anytop = reader.ReadAny();
483  if (anytop.Empty())
484  return {};
486  CConstRef<CSerialObject> topobject;
487  CRef<CSeq_entry> topentry;
489  if (anytop->GetThisTypeInfo() == CSeq_entry::GetTypeInfo()) {
491  HandleSeqEntry(topentry);
492  topobject = topentry;
493  } else if (anytop->GetThisTypeInfo() == CSeq_submit::GetTypeInfo()) {
494  auto submit = Ref(CTypeConverter<CSeq_submit>::SafeCast(anytop));
495  if (submit->IsEntrys()) {
496  topentry = submit->SetData().SetEntrys().front();
497  if (submit->IsSetSub()) {
498  HandleSubmitBlock(submit->SetSub());
499  }
500  submit->SetData().SetEntrys().clear();
501  HandleSeqEntry(topentry);
502  submit->SetData().SetEntrys().push_back(topentry);
503  }
504  topobject = submit;
505  } else if (anytop->GetThisTypeInfo() == CBioseq_set::GetTypeInfo()) {
506  auto bioset = Ref(CTypeConverter<CBioseq_set>::SafeCast(anytop));
507  topentry = Ref(new CSeq_entry);
508  topentry->SetSet(*bioset);
509  bioset.Reset();
510  HandleSeqEntry(topentry);
511  if (topentry->IsSet())
512  topobject.Reset(&topentry->GetSet());
513  else
514  topobject.Reset(&topentry->GetSeq());
515  } else if (anytop->GetThisTypeInfo() == CBioseq::GetTypeInfo()) {
516  auto bioseq = Ref(CTypeConverter<CBioseq>::SafeCast(anytop));
517  topentry = Ref(new CSeq_entry);
518  topentry->SetSeq(*bioseq);
519  bioseq.Reset();
520  HandleSeqEntry(topentry);
521  if (topentry->IsSet())
522  topobject.Reset(&topentry->GetSet());
523  else
524  topobject.Reset(&topentry->GetSeq());
525  } else {
526  //_ASSERT(0);
527  }
528  return topobject;
529 }
// Processes one multi-sequence blob in huge-file mode: builds the top-level
// wrapper object (Seq-submit or Seq-entry), starts the writer with it, then
// cleans and pushes every entry delivered by ForEachEntry().
//
// Returns the value reported by ForEachEntry(). If anything throws while
// entries are being processed the writer is cancelled and the exception is
// rethrown.
531 bool CCleanupApp::x_ProcessHugeFileBlob(edit::CHugeFileProcess& process)
532 {
535  CConstRef<CSerialObject> topobject; // top object is used to write output, can be submit, entry, bioseq, bioseq_set
537  CRef<CSeq_submit> submit;
538  CRef<CSeq_entry> topentry;
542  topentry = Ref(new CSeq_entry);
543  auto& reader = dynamic_cast<CCleanupHugeAsnReader&>(process.GetReader());
 // start from a copy of the reader's top entry when it has one, otherwise
 // from a set with an empty Seq-set list
545  if (reader.GetTopEntry()) {
546  topentry->Assign(*reader.GetTopEntry());
547  } else {
549  topentry->SetSet().SetSeq_set().clear();
550  }
 // when the input carried a submit block, wrap the top entry in a Seq-submit
 // and run the submit-block handler on the copied block
553  if (reader.GetSubmitBlock()) {
554  submit.Reset(new CSeq_submit);
555  submit->SetSub().Assign(*reader.GetSubmitBlock());
556  submit->SetData().SetEntrys().clear();
557  submit->SetData().SetEntrys().push_back(topentry);
558  HandleSubmitBlock(submit->SetSub());
559  }
561  if (submit)
562  topobject = submit;
563  else
564  topobject = topentry;
 // NOTE(review): the declaration of `writer` is not visible in this view.
566  writer.StartWriter(topobject);
567  try {
568  bool proceed = process.ForEachEntry(
 // per-entry callback: clean through an edit handle, then hand the complete
 // entry to the writer; always asks to continue
570  [this, &writer](CSeq_entry_Handle seh) -> bool {
571  HandleSeqEntry(seh.GetEditHandle());
572  writer.PushNextEntry(seh.GetCompleteSeq_entry());
573  return true;
574  });
575  writer.FinishWriter();
576  return proceed;
577  } catch(...) {
578  writer.CancelWriter();
579  throw;
580  }
581 }
583 bool CCleanupApp::x_ProcessHugeFile(edit::CHugeFileProcess& process)
584 {
585  return process.ForEachBlob([this](edit::CHugeFileProcess& p_process) -> bool {
586  m_state.m_IsMultiSeq = p_process.GetReader().IsMultiSequence();
587  if (m_state.m_IsMultiSeq) {
588  bool proceed = x_ProcessHugeFileBlob(p_process);
589  if (! proceed)
590  return false;
591  } else {
592  auto topobject = x_ProcessTraditionally(p_process.GetReader());
593  m_Out->ResetLocalHooks();
594  *m_Out << *topobject;
595  }
596  return true;
597  });
598 }
// Runs x_ProcessOneFile() on every regular file among the directory entries.
//
// Throws CArgException (eInvalidArg) when no file was processed.
601 void CCleanupApp::x_ProcessOneDirectory(const string& dirname, const string& suffix)
602 {
603  CDir dir(dirname);
 // file-name mask: anything ending in `suffix`
605  string mask = "*" + suffix;
606  size_t num_files = 0;
 // NOTE(review): the declaration of `files` is not visible in this view;
 // presumably it lists the entries of `dir` matching `mask` - confirm.
609  for (CDir::TEntry ii : files) {
610  if (ii->IsFile()) {
611  string fname = CDirEntry::MakePath(dirname, ii->GetName());
612  x_ProcessOneFile(fname);
613  num_files++;
614  }
615  }
616  if (num_files == 0) {
617  NCBI_THROW(CArgException, eInvalidArg, "No files found!");
618  }
619 }
623 {
624  // initialize conn library
627  const CArgs& args = GetArgs();
629  // flag validation
630  if (args["F"]) {
631  x_FeatureOptionsValid(args["F"].AsString());
632  }
633  if (args["K"]) {
634  x_KOptionsValid(args["K"].AsString());
635  }
636  if (args["X"]) {
637  x_XOptionsValid(args["X"].AsString());
638  }
639  if (args["batch"] && args["bigfile"]) {
640  NCBI_THROW(CArgException, eInvalidArg, "\"batch\" and \"bigfile\" arguments are incompatible. Only one of them may be used.");
641  }
642  if (args["X"] && args["bigfile"]) {
643  NCBI_THROW(CArgException, eInvalidArg, "\"X\" and \"bigfile\" arguments are incompatible. Only one of them may be used.");
644  }
646  if (args["K"]) {
647  if (NStr::Find(args["K"].AsString(), "b") != string::npos) {
648  m_do_basic = true;
649  }
650  if (NStr::Find(args["K"].AsString(), "s") != string::npos) {
651  m_do_basic = true;
652  m_do_extended = true;
653  }
654  } else if (args["X"]) {
655  m_do_basic = true;
656  if (NStr::Find(args["X"].AsString(), "w") != string::npos) {
657  // Extended Cleanup is part of -X w
658  m_do_extended = false;
659  }
660  } else if (args["F"]) {
661  m_do_basic = true;
662  } else {
663  if (args["basic"]) {
664  m_do_basic = true;
665  }
666  if (! args["nocleanup"]) {
667  m_do_extended = true;
668  }
669  }
671  // create object manager
673  if ( !m_Objmgr ) {
674  NCBI_THROW(CArgException, eInvalidArg, "Could not create object manager");
675  }
679  m_remote_updater.reset(new edit::CRemoteUpdater(nullptr));
681  // need to set output (-o) if specified, if not -o and not -outdir need to use standard output
682  bool opened_output = false;
683  if (args["o"]) {
684  string abs_output_path = CDirEntry::CreateAbsolutePath(args["o"].AsString());
685  if (args["i"]) {
686  string fname = args["i"].AsString();
687  if (args["indir"]) {
688  fname = CDirEntry::MakePath(args["indir"].AsString(), fname);
689  }
690  if (abs_output_path == CDirEntry::CreateAbsolutePath(fname)) {
691  ERR_POST("Input and output files should be different");
692  return 1;
693  }
694  }
695  x_OpenOStream(args["o"].AsString(),
696  args["outdir"] ? args["outdir"].AsString() : kEmptyStr,
697  false);
698  opened_output = true;
699  } else if (! args["outdir"] || args["id"]) {
701  opened_output = true;
702  }
704  if (args["id"]) {
705  string seqID = args["id"].AsString();
706  HandleSeqID(seqID);
707  } else if (args["i"]) {
708  string fname = args["i"].AsString();
709  if (args["indir"]) {
710  fname = CDirEntry::MakePath(args["indir"].AsString(), fname);
711  }
712  x_ProcessOneFile(fname);
713  } else if (args["outdir"]) {
714  x_ProcessOneDirectory(args["indir"].AsString(), args["x"].AsString());
715  } else {
716  cerr << "Error: stdin is no longer supported; please use -i" << endl;
717  }
719  if (opened_output) {
720  // close output file if we opened one
721  x_CloseOStream();
722  }
724  if (m_do_basic && !m_do_extended)
725  x_ReportChanges("BasicCleanup", m_state.m_changes);
726  if (m_do_extended)
727  x_ReportChanges("ExtendedCleanup", m_state.m_changes);
730  m_remote_updater->ReportStats(std::cerr);
731  #endif
733  return 0;
734 }
// Handles the -id case: resolves `seq_id` to a bioseq handle through a new
// scope with default data loaders, runs the entry handler and writes the
// entry to m_Out.
//
// Returns true after the entry has been written; the `return false` path
// follows an ERR_FATAL report for an unretrievable sequence.
736 bool CCleanupApp::HandleSeqID(const string& seq_id)
737 {
738  CRef<CScope> scope(new CScope(*m_Objmgr));
739  scope->AddDefaults();
741  CBioseq_Handle bsh;
742  try {
743  CSeq_id SeqId(seq_id);
744  bsh = scope->GetBioseqHandle(SeqId);
745  } catch (CException&) {
746  ERR_FATAL("The ID " << seq_id << " is not a valid seq ID.");
747  }
749  if (! bsh) {
750  ERR_FATAL("Sequence for " << seq_id << " cannot be retrieved.");
751  return false;
752  }
 // NOTE(review): in this view nothing copies the retrieved sequence into
 // `entry` before it is handled - a line appears to be missing here; confirm.
754  CRef<CSeq_entry> entry(new CSeq_entry());
756  HandleSeqEntry(entry);
757  *m_Out << *entry;
759  return true;
760 }
763 {
764  if (NStr::IsBlank(opt)) {
765  return false;
766  }
767  bool any_changes = false;
768  if (NStr::Find(opt, "r") != string::npos) {
769  any_changes |= CCleanup::RemoveUnnecessaryGeneXrefs(seh);
770  }
771  if (NStr::Find(opt, "a") != string::npos) {
772  any_changes |= x_FixCDS(seh, eFixCDS_ExtendToStop, kEmptyStr);
773  }
774  if (NStr::Find(opt, "p") != string::npos) {
775  any_changes |= CCleanup::ClearInternalPartials(seh);
776  }
777  if (NStr::Find(opt, "z") != string::npos) {
778  any_changes |= CCleanup::FixECNumbers(seh);
779  }
780  if (NStr::Find(opt, "d") != string::npos) {
781  any_changes |= x_RemoveDuplicateFeatures(seh);
782  }
783  return any_changes;
784 }
787 {
788  bool any_change = false;
790  if (deleted_feats.empty()) {
791  return false;
792  }
794  for (auto df : deleted_feats) {
796  eh.Remove();
797  any_change = true;
798  }
799  for (auto orph : orphans) {
800  CBioseq_EditHandle eh(orph);
801  eh.Remove();
802  any_change = true;
803  }
804  any_change |= CCleanup::RenormalizeNucProtSets(seh);
805  return any_change;
807 }
// Applies the cleanup steps selected by the letters of the -X option string
// to `seh`, in the fixed order w, r, b, a, i, f, d.
//
// `options` is forwarded to the WGS cleanup only. Returns true if any step
// reported (or is assumed to have made) a change.
809 bool CCleanupApp::x_ProcessXOptions(const string& opt, CSeq_entry_Handle seh, Uint4 options)
810 {
811  bool any_changes = false;
 // w: WGS cleanup (first step, so plain assignment is enough)
812  if (NStr::Find(opt, "w") != string::npos) {
813  any_changes = CCleanup::WGSCleanup(seh, true, options);
814  }
 // r: regenerate definition lines
815  if (NStr::Find(opt, "r") != string::npos) {
816  bool change_defline = CAutoDefWithTaxonomy::RegenerateDefLines(seh);
817  if (change_defline) {
818  any_changes = true;
820  }
821  }
 // b: batch cleanup (x_GFF3Batch)
822  if (NStr::Find(opt, "b") != string::npos) {
823  any_changes |= x_GFF3Batch(seh);
824  }
 // a: convert delta sequences to raw
825  if (NStr::Find(opt, "a") != string::npos) {
826  any_changes |= CCleanup::ConvertDeltaSeqToRaw(seh);
827  }
 // i: make small genome sets - skipped when m_IsHugeSet is set
828  if (! m_IsHugeSet && (NStr::Find(opt, "i") != string::npos)) {
829  if (CCleanup::MakeSmallGenomeSet(seh) > 0) {
830  any_changes = true;
831  }
832  }
 // f: NOTE(review): the call guarding the assignment below is not visible
 // in this view.
833  if (NStr::Find(opt, "f") != string::npos) {
835  any_changes = true;
836  }
837  }
 // d: AutodefId; a change is always reported for this step
838  if (NStr::Find(opt, "d") != string::npos) {
839  CCleanup::AutodefId(seh);
840  any_changes = true;
841  }
842  return any_changes;
843 }
847 {
848  if (! sf.GetData().IsCdregion()) {
849  // not coding region
850  return false;
851  }
852  if (sequence::IsPseudo(sf, b.GetScope())) {
853  return false;
854  }
856  // check for existing stop codon
857  string translation;
858  try {
859  CSeqTranslator::Translate(sf, b.GetScope(), translation, true);
860  } catch (CSeqMapException& e) {
861  cout << e.what() << endl;
862  return false;
863  } catch (CSeqVectorException& e) {
864  cout << e.what() << endl;
865  return false;
866  }
867  if (NStr::EndsWith(translation, "*")) {
868  // already has stop codon
869  return false;
870  }
872  if (CCleanup::ExtendToStopCodon(sf, b, 50)) {
873  feature::RetranslateCDS(sf, b.GetScope());
874  return true;
875  } else {
876  return false;
877  }
878 }
// Applies the coding-region fixes selected by the `options` bit mask
// (eFixCDS_FrameFromLoc, eFixCDS_Retranslate, eFixCDS_ExtendToStop) to the
// nucleotide bioseqs under `seh`, and optionally names unnamed proteins.
//
// missing_prot_name: when not blank, assigned as the protein name of every
// processed feature whose current protein name is blank.
// Returns true if anything was changed.
//
// NOTE(review): the loop header that declares `fi` and the declarations of
// `mrna`, `gene`, `efh` and `ofh` are not visible in this view.
881 bool CCleanupApp::x_FixCDS(CSeq_entry_Handle seh, Uint4 options, const string& missing_prot_name)
882 {
883  bool any_changes = false;
884  for (CBioseq_CI bi(seh, CSeq_inst::eMol_na); bi; ++bi) {
885  any_changes |= CCleanup::SetGeneticCodes(*bi);
 // work on a private copy of the feature; it replaces the original only if
 // one of the fixes below changed it
887  CConstRef<CSeq_feat> orig = fi->GetSeq_feat();
888  CRef<CSeq_feat> sf(new CSeq_feat());
889  sf->Assign(*orig);
890  bool feat_change = false;
891  if ((options & eFixCDS_FrameFromLoc) &&
892  CCleanup::SetFrameFromLoc(sf->SetData().SetCdregion(), sf->GetLocation(), bi.GetScope())) {
893  feat_change = true;
894  }
895  if ((options & eFixCDS_Retranslate)) {
896  feat_change |= feature::RetranslateCDS(*sf, bi.GetScope());
897  }
 // when the feature was extended, the mRNA and gene features are extended
 // too if their locations may be extended to match
898  if ((options & eFixCDS_ExtendToStop) &&
899  x_BatchExtendCDS(*sf, *bi)) {
901  if (mrna && CCleanup::LocationMayBeExtendedToMatch(mrna->GetLocation(), sf->GetLocation())) {
902  CRef<CSeq_feat> new_mrna(new CSeq_feat());
903  new_mrna->Assign(*mrna);
904  if (CCleanup::ExtendStopPosition(*new_mrna, sf)) {
906  efh.Replace(*new_mrna);
907  }
908  }
910  if (gene && CCleanup::LocationMayBeExtendedToMatch(gene->GetLocation(), sf->GetLocation())) {
911  CRef<CSeq_feat> new_gene(new CSeq_feat());
912  new_gene->Assign(*gene);
913  if (CCleanup::ExtendStopPosition(*new_gene, sf)) {
915  efh.Replace(*new_gene);
916  }
917  }
919  feat_change = true;
920  }
921  if (feat_change) {
923  ofh.Replace(*sf);
924  any_changes = true;
925  }
926  // also set protein name if missing, change takes place on protein bioseq
927  if (! NStr::IsBlank(missing_prot_name)) {
928  string current_name = CCleanup::GetProteinName(*sf, seh);
929  if (NStr::IsBlank(current_name)) {
930  CCleanup::SetProteinName(*sf, missing_prot_name, false, seh.GetScope());
931  any_changes = true;
932  }
933  }
934  }
935  }
936  return any_changes;
937 }
941 {
942  bool any_changes = x_FixCDS(seh, kGFF3CDSFixOptions, kEmptyStr);
944  cleanup.SetScope(&(seh.GetScope()));
946  auto changes = cleanup.BasicCleanup(seh, options);
947  any_changes |= (! changes.Empty());
948  changes = cleanup.ExtendedCleanup(seh, options);
949  any_changes |= (! changes.Empty());
950  any_changes |= x_FixCDS(seh, 0, "unnamed protein product");
952  return any_changes;
953 }
957 {
958  if (! m_do_basic && ! m_do_extended) {
959  return false;
960  }
962  bool any_changes = false;
964  cleanup.SetScope(&(entry.GetScope()));
966  if (m_state.m_IsMultiSeq) {
968  // if (submit)
969  // options |= CCleanup::eScope_UseInPlace; // RW-1070 - CCleanup::eScope_UseInPlace is essential
970  }
972  if (m_do_basic && ! m_do_extended) {
973  // perform BasicCleanup
974  try {
975  auto changes = *cleanup.BasicCleanup(entry, options);
976  m_state.m_changes += changes;
977  any_changes = ! changes.Empty();
978  } catch (CException& e) {
979  LOG_POST(Error << "error in basic cleanup: " << e.GetMsg() << label);
980  }
981  }
983  if (m_do_extended) {
984  // perform ExtendedCleanup
985  try {
986  auto changes = *cleanup.ExtendedCleanup(entry, options);
987  m_state.m_changes += changes;
988  any_changes = ! changes.Empty();
989  } catch (CException& e) {
990  LOG_POST(Error << "error in extended cleanup: " << e.GetMsg() << label);
991  }
992  }
993  return any_changes;
994 }
998 {
1000  bool any_changes = false;
1001  try {
1002  auto changes = *cleanup.BasicCleanup(block);
1003  any_changes = x_ReportChanges("BasicCleanup of SubmitBlock", changes);
1004  } catch (CException& e) {
1005  LOG_POST(Error << "error in cleanup of SubmitBlock: " << e.GetMsg());
1006  }
1007  return any_changes;
1008 }
1012 {
1013  string label;
1016  const CArgs& args = GetArgs();
1018  if (args["showprogress"]) {
1019  LOG_POST(Error << label + "\n");
1020  }
1022  if (args["debug"]) {
1023  ESerialDataFormat outFormat = eSerial_AsnText;
1025  unique_ptr<CObjectOStream> debug_out(CObjectOStream::Open(outFormat, "before.sqn",
1028  *debug_out << *(entry.GetCompleteSeq_entry());
1029  }
1031  bool any_changes = false;
1033  if (args["T"]) {
1034  validator::CTaxValidationAndCleanup tval(m_remote_updater->GetUpdateFunc());
1035  any_changes |= tval.DoTaxonomyUpdate(entry, true);
1036  }
1038  if (args["K"] && NStr::Find(args["K"].AsString(), "u") != string::npos) {
1039  CRef<CSeq_entry> se(const_cast<CSeq_entry*>(entry.GetCompleteSeq_entry().GetPointer()));
1040  any_changes |= CCleanup::RemoveNcbiCleanupObject(*se);
1041  }
1043  Uint4 options = 0;
1044  if (args["noobj"]) {
1046  }
1047  if (m_IsHugeSet) {
1048  options |= CCleanup::eClean_InHugeSeqSet;
1049  }
1051  any_changes |= x_BasicAndExtended(entry, label, options);
1053  if (args["F"]) {
1054  any_changes |= x_ProcessFeatureOptions(args["F"].AsString(), entry);
1055  }
1056  if (args["X"]) {
1057  any_changes |= x_ProcessXOptions(args["X"].AsString(), entry, options);
1058  }
1059  if (args["K"] && NStr::Find(args["K"].AsString(), "n") != string::npos && ! m_do_extended) {
1060  any_changes |= CCleanup::NormalizeDescriptorOrder(entry);
1061  }
1063  return true;
1064 }
1067 {
1068  if (! se) {
1069  return false;
1070  }
1072  auto entryHandle = m_state.m_Scope->AddTopLevelSeqEntry(*se);
1073  if (! entryHandle) {
1074  NCBI_THROW(CArgException, eInvalidArg, "Failed to insert entry to scope.");
1075  }
1077  if (HandleSeqEntry(entryHandle)) {
1078  if (entryHandle.GetCompleteSeq_entry().GetPointer() != se.GetPointer()) {
1079  se->Assign(*entryHandle.GetCompleteSeq_entry());
1080  }
1081  m_state.m_Scope->RemoveTopLevelSeqEntry(entryHandle);
1082  return true;
1083  }
1084  m_state.m_Scope->RemoveTopLevelSeqEntry(entryHandle);
1085  return false;
1086 }
1088 void CCleanupApp::x_OpenOStream(const string& filename, const string& dir, bool remove_orig_dir)
1089 {
1090  ESerialDataFormat outFormat = eSerial_AsnText;
1092  const CArgs& args = GetArgs();
1093  if (args["outformat"]) {
1094  if (args["outformat"].AsString() == "binary") {
1095  outFormat = eSerial_AsnBinary;
1096  } else if (args["outformat"].AsString() == "XML") {
1097  outFormat = eSerial_Xml;
1098  } else if (args["outformat"].AsString() == "JSON") {
1099  outFormat = eSerial_Json;
1100  }
1101  }
1103  if (NStr::IsBlank(filename)) {
1104  m_Out.reset(CObjectOStream::Open(outFormat, cout));
1105  } else if (! NStr::IsBlank(dir)) {
1106  string base = filename;
1107  if (remove_orig_dir) {
1108  const char buf[2] = { CDirEntry::GetPathSeparator(), 0 };
1109  size_t pos = NStr::Find(base, buf, NStr::eCase, NStr::eReverseSearch);
1110  if (pos != string::npos) {
1111  base = base.substr(pos + 1);
1112  }
1113  }
1114  string fname = CDirEntry::MakePath(dir, base);
1115  m_Out.reset(CObjectOStream::Open(outFormat, fname, eSerial_StdWhenAny));
1116  } else {
1117  m_Out.reset(CObjectOStream::Open(outFormat, filename, eSerial_StdWhenAny));
1118  }
1119 }
1123 {
1124  m_Out->Close();
1125  m_Out.reset();
1126 }
1128 // IProcessorCallback interface functionality
1130 {
1131  // static long long cnt;
1132  // cerr << ++cnt << ' ' << obj->GetThisTypeInfo()->GetName() << '\n';
1133  if (obj->GetThisTypeInfo() == CSeq_entry::GetTypeInfo()) {
1134  CRef<CSeq_entry> entry(dynamic_cast<CSeq_entry*>(obj.GetPointer()));
1135  HandleSeqEntry(entry);
1136  }
1137 }
1140 {
1141  bool any_changes = false;
1142  auto changes_str = changes.GetDescriptions();
1143  if (changes_str.empty()) {
1144  LOG_POST(Error << "No changes from " << prefix << "\n");
1145  } else {
1146  LOG_POST(Error << "Changes from " << prefix << ":\n");
1147  for (auto it : changes_str) {
1148  LOG_POST(Error << it);
1149  }
1150  any_changes = true;
1151  }
1152  return any_changes;
1153 }
1160 /////////////////////////////////////////////////////////////////////////////
1161 //
1162 // Main
1164 int main(int argc, const char** argv)
1165 {
1166  // scan and replace deprecated arguments; RW-1324
1167  for (int i = 1; i < argc; ++i) {
1168  string a = argv[i];
1169  if (a == "-r") {
1170  if ((i+1) < argc) {
1171  string param = argv[i+1];
1172  if (!param.empty() && param[0] != '-') {
1173  argv[i] = "-outdir";
1174  ++i; // skip parameter
1175  cerr << "Warning: deprecated use of -r argument. Please use -outdir instead." << endl;
1176  }
1177  }
1178  } else if (a == "-p") {
1179  argv[i] = "-indir";
1180  cerr << "Warning: argument -p is deprecated. Please use -indir instead." << endl;
1181  } else if (a == "-R") {
1182  argv[i] = "-r";
1183  cerr << "Warning: argument -R is deprecated. Please use -r instead." << endl;
1184  } else if (a == "-gbload") {
1185  argv[i] = "-genbank";
1186  cerr << "Warning: argument -gbload is deprecated. Please use -genbank instead." << endl;
1187  }
1188  }
1190  // this code converts single argument into multiple, just to simplify testing
1191  list<string> split_args;
1192  vector<const char*> new_argv;
1194  if (argc==2 && argv && argv[1] && strchr(argv[1], ' ')) {
1195  NStr::Split(argv[1], " ", split_args);
1197  auto it = split_args.begin();
1198  while (it != split_args.end()) {
1199  auto next = it; ++next;
1200  if (next != split_args.end() &&
1201  ((it->front() == '"' && it->back() != '"') ||
1202  (it->front() == '\'' && it->back() != '\'')))
1203  {
1204  it->append(" "); it->append(*next);
1205  next = split_args.erase(next);
1206  } else
1207  it = next;
1208  }
1209  for (auto& rec: split_args) {
1210  if (rec.front()=='\'' && rec.back()=='\'')
1211  rec=rec.substr(1, rec.length()-2);
1212  }
1213  argc = 1 + int(split_args.size());
1214  new_argv.reserve(argc);
1215  new_argv.push_back(argv[0]);
1216  for (const string& s : split_args) {
1217  new_argv.push_back(s.c_str());
1218  std::cerr << s.c_str() << " ";
1219  }
1220  std::cerr << "\n";
1222  argv =;
1223  }
1225  return CCleanupApp().AppMain(argc, argv);
1226 }
