NCBI C++ ToolKit
table2asn.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: table2asn.cpp 102023 2024-03-19 19:18:50Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Jonathan Kans, Clifford Clausen,
27  * Aaron Ucko, Sergiy Gotvyanskyy
28  *
29  * File Description:
30  * Converter of various files into ASN.1 format, main application function
31  *
32  */
33 
34 #include <ncbi_pch.hpp>
35 #include <common/ncbi_source_ver.h>
36 #include <corelib/ncbistd.hpp>
37 #include <corelib/ncbistre.hpp>
38 #include <corelib/ncbiapp.hpp>
39 #include <corelib/ncbienv.hpp>
40 #include <corelib/ncbiargs.hpp>
41 #include <corelib/ncbi_mask.hpp>
42 
44 #include <connect/ncbi_util.h>
45 
46 // Object Manager includes
48 #include <objmgr/scope.hpp>
49 #include <objmgr/bioseq_ci.hpp>
50 
51 #include <util/line_reader.hpp>
54 
55 #include "multireader.hpp"
56 #include "table2asn_context.hpp"
57 #include "struc_cmt_reader.hpp"
58 #include "feature_table_reader.hpp"
59 #include "fcs_reader.hpp"
60 #include "src_quals.hpp"
61 
64 #include <objects/general/Date.hpp>
65 
67 #include <objects/seq/Seq_gap.hpp>
68 
72 
74 
75 #include "table2asn_validator.hpp"
76 
77 #include <objmgr/feat_ci.hpp>
78 #include "visitors.hpp"
79 
81 
83 
88 
89 #include "table2asn.hpp"
90 #include "suspect_feat.hpp"
91 #include "utils.hpp"
92 
93 #include <common/ncbi_revision.h>
94 
95 #ifndef NCBI_SC_VERSION
96 # define THIS_IS_TRUNK_BUILD
97 #elif (NCBI_SC_VERSION == 0)
98 # define THIS_IS_TRUNK_BUILD
99 #endif
100 
101 #include <common/test_assert.h> /* This header must go last */
102 
103 using namespace ncbi;
104 using namespace objects;
105 
namespace
{
    /// Marker exception thrown by s_FailOnBadInput() after the fatal message
    /// has been handed to the listener.
    ///
    /// The std::exception base is private (the default access for a `class`
    /// base), so handlers written for std::exception do not match this type;
    /// it has to be caught as CMissingInputException.
    class CMissingInputException : private std::exception
    {
    };
} // anonymous namespace
112 
113 static void s_FailOnBadInput(const string& specifics, IObjtoolsListener& listener)
114 {
115  listener.PutMessage(CObjtoolsMessage(specifics, eDiag_Fatal));
116  throw CMissingInputException();
117 }
118 
120 
122 {
123 public:
124  CObjtoolsDiagMessage(const string& txt, EDiagSev sev) :
125  m_txt(txt),
126  m_sev(sev)
127  {
128  }
129 
130  IObjtoolsMessage* Clone() const override { return new CObjtoolsDiagMessage(m_txt, m_sev); }
131 
132  void Write(CNcbiOstream& out) const override { out << m_txt; }
133  void Dump(CNcbiOstream& out) const override { out << m_txt; }
134  void WriteAsXML(CNcbiOstream& out) const override
135  {
136  out << "<message severity=\"" << NStr::XmlEncode(CNcbiDiag::SeverityName(m_sev))
137  << "\" problem=\"" << NStr::XmlEncode(m_txt) << "\" />";
138  }
139  void DumpAsXML(CNcbiOstream& out) const override
140  {
141  WriteAsXML(out);
142  }
143  string GetText() const override { return m_txt; }
144  EDiagSev GetSeverity() const override { return m_sev; }
145  int GetCode() const override { return 0; }
146  int GetSubCode() const override { return 0; }
147 
148 private:
149  string m_txt;
151 };
152 
// Application logger: a lenient message listener that is also installed as
// the diagnostics handler, so posted diagnostics end up in the same listener.
// NOTE(review): this listing appears to have dropped two lines of the class
// (presumably the m_enable_log member declaration and the first line of the
// PutProgress signature) - confirm against the repository copy.
153 class CTable2AsnLogger : public CMessageListenerLenient, public CDiagHandler
154 {
155 public:
 // Progress reporting starts disabled.
156  CTable2AsnLogger() : m_enable_log(false) {}
158

 // Progress override: forwards to the base listener only while m_enable_log
 // is set; otherwise the progress report is dropped.
160  const string& sMessage,
161  const Uint8 iNumDone = 0,
162  const Uint8 iNumTotal = 0) override
163  {
164  if (m_enable_log)
165  CMessageListenerLenient::PutProgress(sMessage, iNumDone, iNumTotal);
166  }
167

 // Remote-updater messages are forwarded only when their error is
 // citation_not_found; any other remote-updater message is rejected with
 // `false` and never reaches the base listener. All other message types are
 // forwarded unchanged.
168  bool PutMessage(const IObjtoolsMessage& message) override
169  {
170  auto edit = dynamic_cast<const edit::CRemoteUpdaterMessage*>(&message);
171  if (edit) {
172  if (edit->m_error != edit::EPubmedError::citation_not_found)
173  return false;
174  }
175  return CMessageListenerLenient::PutMessage(message);
176  }
177

 // Diagnostics-handler entry point: renders the diagnostic to text, wraps it
 // in a CObjtoolsDiagMessage and routes it through PutMessage() above.
178  void Post(const SDiagMessage& mess) override
179  {
180  stringstream ss;
181  mess.Write(ss, SDiagMessage::fNoEndl);
182  string str = ss.str();
 // Diagnostics flagged as notes are recorded with Info severity instead of
 // their own severity.
183  EDiagSev sev = (mess.m_Flags & eDPF_IsNote) ? eDiag_Info : mess.m_Severity;
184  this->PutMessage(CObjtoolsDiagMessage(str, sev));
185  }
186 };
187 
188 void g_LogDiagMessage(ILineErrorListener* logger, EDiagSev sev, const string& msg)
189 {
190  logger->PutMessage(CObjtoolsDiagMessage(msg, sev));
191 }
192 
193 
195 {
197 }
198 
199 
201 {
202  arg_desc.AddOptionalKey(
203  "aln-file", "InFile", "Alignment input file",
205 
206  arg_desc.SetDependency(
207  "aln-file",
209  "i");
210 
211  arg_desc.AddDefaultKey(
212  "aln-gapchar", "STRING", "Alignment missing indicator",
214  "-");
215 
216  arg_desc.AddDefaultKey(
217  "aln-missing", "STRING", "Alignment missing indicator",
219  "");
220 
221  arg_desc.AddDefaultKey(
222  "aln-alphabet", "STRING", "Alignment alphabet",
224  "prot");
225 
226  arg_desc.SetConstraint(
227  "aln-alphabet",
228  &(*new CArgAllow_Strings,
229  "nuc",
230  "prot"));
231 }
232 
233 
235 {
236  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
237  HideStdArgs(fHideDryRun);
238 
239  // Print -h document if no arguments supplied
240  arg_desc->SetMiscFlags(CArgDescriptions::fUsageIfNoArgs);
241 
242  // Prepare command line descriptions, inherit them from tbl2asn legacy application
243 
244  arg_desc->AddOptionalKey(
245  "indir", "Directory", "Path to input files",
247 
248  arg_desc->AddOptionalKey(
249  "outdir", "Directory", "Path to results",
251 
252  arg_desc->AddFlag("E", "Recurse");
253 
254  arg_desc->AddDefaultKey(
255  "x", "String", "Suffix", CArgDescriptions::eString, ".fsa");
256 
257  arg_desc->AddOptionalKey(
258  "i", "InFile", "Single Input File",
260 
261  x_SetAlnArgs(*arg_desc);
262 
263  arg_desc->AddOptionalKey(
264  "o", "OutFile", "Single Output File",
266 
267  arg_desc->AddDefaultKey(
268  "out-suffix", "String", "ASN.1 files suffix", CArgDescriptions::eString, ".sqn");
269 
270  arg_desc->AddFlag("binary", "Output binary ASN.1");
271 
272  arg_desc->AddOptionalKey("t", "InFile", "Template File",
274 
275  arg_desc->AddDefaultKey(
276  "a", "String", "File Type\n"
277 " a Any\n"
278 " s FASTA Set (s Batch, s1 Pop, s2 Phy, s3 Mut, s4 Eco,\n"
279 " s9 Small-genome)\n"
280 " d FASTA Delta, di FASTA Delta with Implicit Gaps\n"
281 #if 0
282 may be implemented in the future; RW-1253
283 " l FASTA+Gap Alignment (l Batch, l1 Pop, l2 Phy, l3 Mut, l4 Eco,\n"
284 " l9 Small-genome)\n"
285 #endif
286 " z FASTA with Gap Lines"
287 #if 0
288 may be implemented in the future; RW-1253
289 "\n"
290 " e PHRAP/ACE"
291 #endif
293 
294  arg_desc->AddFlag("J", "Delayed Genomic Product Set "); // done
295 
296  arg_desc->AddOptionalKey(
297  "A", "String", "Accession", CArgDescriptions::eString); // done
298  arg_desc->AddOptionalKey(
299  "C", "String", "Genome Center Tag", CArgDescriptions::eString); // done
300 // arg_desc->AddOptionalKey(
301 // "n", "String", "Organism Name", CArgDescriptions::eString); // done
302  arg_desc->AddOptionalKey(
303  "j", "String", "Source Qualifiers.\nThese qualifier values override any conflicting values read from a file (See -src-file)",
304  CArgDescriptions::eString); // done
305  arg_desc->AddOptionalKey("src-file", "InFile", "Single source qualifiers file. The qualifiers in this file override any conflicting qualifiers automically read from a .src file, which, in turn, take precedence over source qualifiers specified in a fasta defline", CArgDescriptions::eInputFile); //done
306  arg_desc->AddFlag("accum-mods", "Accumulate non-conflicting modifier values from different sources. For example, with this option, a 'note' modifier specified on the command line no longer overwrites a 'note' modifier read from a .src file. Both notes will appear in the output ASN.1. If modifier values conflict, the rules of precedence specified above apply");
307  arg_desc->AddOptionalKey(
308  "y", "String", "Comment", CArgDescriptions::eString); // done
309  arg_desc->AddOptionalKey(
310  "Y", "InFile", "Comment File", CArgDescriptions::eInputFile); // done
311  arg_desc->AddOptionalKey(
312  "D", "InFile", "Descriptors File", CArgDescriptions::eInputFile); // done
313  arg_desc->AddOptionalKey(
314  "f", "InFile", "Single 5 column table file or other annotations", CArgDescriptions::eInputFile); // done
315 
316  arg_desc->AddOptionalKey(
317  "V", "String", "Verification (combine any of the following letters)\n\
318  v Validate with Normal Stringency\n\
319  b Generate GenBank Flatfile\n\
320  t Validate with TSA Check", CArgDescriptions::eString);
321 
322  arg_desc->AddFlag("q", "Seq ID from File Name"); // done
323 
324  arg_desc->AddFlag("U", "Remove Unnecessary Gene Xref");
325  arg_desc->AddFlag("T", "Remote Taxonomy Lookup"); // done
326  arg_desc->AddFlag("P", "Remote Publication Lookup"); // done
327  arg_desc->AddFlag("W", "Log Progress"); // done
328  arg_desc->AddFlag("K", "Save Bioseq-set"); // done
329 
330  arg_desc->AddOptionalKey("H", "String", "Hold Until Publish\n\
331  y Hold for One Year\n\
332  mm/dd/yyyy", CArgDescriptions::eString); // done
333 
334  arg_desc->AddFlag("Z", "Output discrepancy report");
335  arg_desc->AddFlag("split-dr", "Create unique discrepancy report for each output file");
336 
337  arg_desc->AddOptionalKey("c", "String", "Cleanup (combine any of the following letters)\n\
338  b Basic cleanup (default)\n\
339  e Extended cleanup\n\
340  f Fix product names\n\
341  s Add exception to short introns\n\
342  w WGS cleanup (only needed when using a GFF3 file)\n\
343  d Correct Collection Dates (assume month first)\n\
344  D Correct Collection Dates(assume day first)\n\
345  x Extend ends of features by one or two nucleotides to abut gaps or sequence ends\n\
346  - avoid cleanup", CArgDescriptions::eString);
347 
348  arg_desc->AddOptionalKey("z", "OutFile", "Cleanup Log File", CArgDescriptions::eOutputFile);
349 
350  arg_desc->AddOptionalKey("N", "String", "Project Version Number", CArgDescriptions::eString); //done
351 
352  arg_desc->AddOptionalKey("w", "InFile", "Single Structured Comment File", CArgDescriptions::eInputFile); //done
353  arg_desc->AddOptionalKey("M", "String", "Master Genome Flags\n\
354  n Normal\n\
355  t TSA", CArgDescriptions::eString);
356 
357  arg_desc->AddOptionalKey("l", "String", "Add type of evidence used to assert linkage across assembly gaps. May be used multiple times. Must be one of the following:\n\
358  paired-ends\n\
359  align-genus\n\
360  align-xgenus\n\
361  align-trnscpt\n\
362  within-clone\n\
363  clone-contig\n\
364  map\n\
365  strobe\n\
366  unspecified\n\
367  pcr\n\
369 
370  arg_desc->AddOptionalKey("linkage-evidence-file", "InFile", "File listing linkage evidence for gaps of different lengths", CArgDescriptions::eInputFile);
371 
372  arg_desc->AddOptionalKey("gap-type", "String", "Set gap type for runs of Ns. Must be one of the following:\n\
373  scaffold\n\
374  short-arm\n\
375  heterochromatin\n\
376  centromere\n\
377  telomere\n\
378  repeat\n\
379  contamination\n\
380  contig\n\
381  unknown (obsolete)\n\
382  fragment\n\
383  clone\n\
384  other (for future use)", CArgDescriptions::eString);
385 
386  arg_desc->AddOptionalKey("m", "String", "Lineage to use for Discrepancy Report tests", CArgDescriptions::eString);
387 
388  // all new options are done
389  // arg_desc->AddOptionalKey("taxid", "Integer", "Organism taxonomy ID", CArgDescriptions::eInteger);
390  // arg_desc->AddOptionalKey("taxname", "String", "Taxonomy name", CArgDescriptions::eString);
391  arg_desc->AddOptionalKey("ft-url", "String", "FileTrack URL for the XML file retrieval", CArgDescriptions::eString);
392  arg_desc->AddOptionalKey("ft-url-mod", "String", "FileTrack URL for the XML file base modifications", CArgDescriptions::eString);
393 
394  arg_desc->AddOptionalKey("gaps-min", "Integer", "minimum run of Ns recognised as a gap", CArgDescriptions::eInteger);
395  arg_desc->AddOptionalKey("gaps-unknown", "Integer", "exact number of Ns recognised as a gap with unknown length", CArgDescriptions::eInteger);
396 
397  // disabled per RW-589
398  //arg_desc->AddOptionalKey("min-threshold", "Integer", "minimum length of sequence", CArgDescriptions::eInteger);
399  //arg_desc->AddOptionalKey("fcs-file", "FileName", "FCS report file", CArgDescriptions::eInputFile);
400  //arg_desc->AddFlag("fcs-trim", "Trim FCS regions instead of annotate");
401  arg_desc->AddFlag("postprocess-pubs", "Postprocess pubs: convert authors to standard");
402  arg_desc->AddOptionalKey("locus-tag-prefix", "String", "Add prefix to locus tags in annotation files", CArgDescriptions::eString);
403  arg_desc->AddFlag("no-locus-tags-needed", "Submission data does not require locus tags");
404  arg_desc->AddFlag("euk", "Assume eukaryote, and create missing mRNA features");
405  arg_desc->AddOptionalKey("suspect-rules", "String", "Path to a file containing suspect rules set. Overrides environment variable PRODUCT_RULES_LIST", CArgDescriptions::eString);
406  arg_desc->AddFlag("allow-acc", "Allow accession recognition in sequence IDs. Default is local");
407  arg_desc->AddFlag("augustus-fix", "(Deprecated) Special handling of unusual problems in Augustus annotations",
409  arg_desc->AddFlag("intronless", "Intronless alignments");
410  arg_desc->AddFlag("refine-prt-alignments", "Refine ProSplign aligments when processing .prt input");
411  arg_desc->AddOptionalKey("prt-alignment-filter-query", "String",
412  "Filter query string for .prt alignments", CArgDescriptions::eString);
413 
414  arg_desc->AddOptionalKey("logfile", "LogFile", "Error Log File", CArgDescriptions::eOutputFile);
415  arg_desc->AddOptionalKey("logxml", "LogFile", "XML Error Log File", CArgDescriptions::eOutputFile);
416  arg_desc->SetDependency("logxml",
418  "logfile");
419  arg_desc->AddFlag("split-logs", "Create unique log file for each output file");
420  arg_desc->AddFlag("verbose", "Be verbose on reporting");
421  arg_desc->AddFlag("huge", "Execute in huge-file mode");
422  arg_desc->AddFlag("disable-huge", "Explicitly disable huge-files mode");
423  arg_desc->SetDependency("disable-huge",
425  "huge");
426 
427  arg_desc->AddOptionalKey("usemt", "String", "Try to use as many threads as:\n\
428  one\n\
429  two\n\
431 
433  arg_desc->AddFlag("fetchall", "Search data in all available databases");
434 
435  // Program description
436  arg_desc->SetUsageContext("", "Converts files of various formats to ASN.1");
437 
438  // Pass argument descriptions to the application
439  SetupArgDescriptions(arg_desc.release());
440 }
441 
442 static void s_PubCleanup(CRef<CPub>& pub)
443 {
444  if (pub->IsArticle()) {
445  CCitArtCleaner::CleanArticle(pub->SetArticle(), true, true);
446  }
447 }
448 
449 
451 {
452  const auto& args = GetArgs();
453  string numThreadsConfig;
454  if (args["usemt"]) {
455  numThreadsConfig = args["usemt"].AsString();
456  } else {
457  numThreadsConfig = GetConfig().GetString("table2asn", "UseThreads", "one");
458  }
459  static constexpr array<string_view, 3> numThreadsValues{ "one", "two", "many" };
460  auto numThreads = distance(begin(numThreadsValues),
461  find(begin(numThreadsValues), end(numThreadsValues), numThreadsConfig));
462  numThreads += 1;
463  if (1 <= numThreads && numThreads <= 3) {
464  cerr << "Will be using " << numThreadsConfig << " threads" << endl;
465  return numThreads;
466  }
467  // default to using a single thread
468  return 1;
469 }
470 
472 {
473  const CArgs& args = GetArgs();
474 
475  Setup(args);
476 
477  CTime expires = GetFullVersion().GetBuildInfo().GetBuildTime();
478  if (! expires.IsEmpty()) {
479  expires.AddYear();
480  if (CTime(CTime::eCurrent) > expires) {
481  NcbiCerr << "This copy of " << GetProgramDisplayName()
482  << " is more than 1 year old. Please download the current version if it is newer." << endl;
483  }
484  }
485 
486  m_context.m_disable_huge_files = args["disable-huge"];
487  if (! m_context.m_disable_huge_files) {
488  m_context.m_can_use_huge_files = args["huge"] || GetConfig().GetBool("table2asn", "UseHugeFiles", false);
489  if (m_context.m_can_use_huge_files) {
490  std::cerr << "Will be using huge files scenario" << std::endl;
491  }
492  }
493 
494  m_context.m_split_log_files = args["split-logs"].AsBoolean();
495  if (m_context.m_split_log_files && args["logfile"]) {
496  NCBI_THROW(CArgException, eConstraint,
497  "-logfile cannot be used with -split-logs");
498  }
499  m_context.m_verbose = args["verbose"].AsBoolean();
500 
501  CTable2AsnLogger* app_logger = new CTable2AsnLogger;
502  CNcbiOstream* error_log = args["logfile"] ? &(args["logfile"].AsOutputFile()) : &NcbiCerr;
503  app_logger->SetProgressOstream(error_log);
504  SetDiagHandler(app_logger, false);
505  m_logger.Reset(app_logger);
506  m_context.m_logger = m_logger;
507  m_logger->m_enable_log = args["W"].AsBoolean();
508  m_context.m_remote_updater.reset(new edit::CRemoteUpdater(m_logger));
509  m_context.m_remote_updater->SetPubmedInterceptor(s_PubCleanup);
510  m_validator.Reset(new CTable2AsnValidator(m_context));
511 
512  m_context.m_SetIDFromFile = args["q"].AsBoolean();
513  m_context.m_allow_accession = args["allow-acc"].AsBoolean();
514  m_context.m_delay_genprodset = args["J"].AsBoolean();
515  m_context.m_accumulate_mods = args["accum-mods"].AsBoolean();
516  m_context.m_binary_asn1_output = args["binary"].AsBoolean();
517 
518  if (args["c"]) {
519  if (args["c"].AsString().find_first_not_of("-befwsdDx") != string::npos) {
520  NCBI_THROW(CArgException, eConvert,
521  "Unrecognized cleanup type " + args["c"].AsString());
522  }
523 
524  m_context.m_cleanup = args["c"].AsString();
525  } else
526  m_context.m_cleanup = "b"; // always cleanup
527 
528  if (args["M"]) {
529  m_context.m_master_genome_flag = args["M"].AsString();
530  m_context.m_delay_genprodset = true;
531  m_context.m_HandleAsSet = true;
532  m_context.m_cleanup += "fU";
533  m_context.m_validate = "v"; // do we still need that?
534  if (m_context.m_master_genome_flag.find('n') != string::npos) {
535  m_context.m_discrepancy_group = NDiscrepancy::eSubmitter;
536  } else if (m_context.m_master_genome_flag.find('t') != string::npos) {
537  m_context.m_discrepancy_group = NDiscrepancy::eTSA;
538  }
539  }
540 
541  m_reader.reset(new CMultiReader(m_context));
542 
543  // excluded per RW-589
544 #if 0
545  if (args["fcs-file"]) {
546  m_fcs_reader.reset(new CForeignContaminationScreenReportReader(m_context));
547  CRef<ILineReader> reader(ILineReader::New(args["fcs-file"].AsInputFile()));
548 
549  m_fcs_reader->LoadFile(*reader);
550  m_context.m_fcs_trim = args["fcs-trim"];
551 
552  if (args["min-threshold"])
553  m_context.m_minimal_sequence_length = args["min-threshold"].AsInteger();
554  }
555 #endif
556 
557  // if (args["n"])
558  // m_context.m_OrganismName = args["n"].AsString();
559 
560  if (args["y"])
561  m_context.m_Comment = args["y"].AsString();
562  else if (args["Y"]) {
563  CRef<ILineReader> reader(ILineReader::New(args["Y"].AsInputFile()));
564  while (! reader->AtEOF()) {
565  reader->ReadLine();
566  m_context.m_Comment += reader->GetCurrentLine();
567  m_context.m_Comment += " ";
568  }
569  }
570  NStr::TruncateSpacesInPlace(m_context.m_Comment);
571 
572  if (args["U"] && args["U"].AsBoolean())
573  m_context.m_cleanup += 'U';
574 
575  if (args["m"]) {
576  m_context.m_disc_lineage = args["m"].AsString();
577  }
578 
579  m_context.m_asn1_suffix = args["out-suffix"].AsString();
580 
581  m_context.m_save_bioseq_set = args["K"].AsBoolean();
582  m_context.prtAlnOptions.intronless = args["intronless"].AsBoolean();
583  m_context.prtAlnOptions.refineAlignment = args["refine-prt-alignments"].AsBoolean();
584  if (args["prt-alignment-filter-query"]) {
585  m_context.prtAlnOptions.filterQueryString = args["prt-alignment-filter-query"].AsString();
586  }
587  // if (args["taxname"])
588  // m_context.m_OrganismName = args["taxname"].AsString();
589  // if (args["taxid"])
590  // m_context.m_taxid = args["taxid"].AsInteger();
591  if (args["ft-url"])
592  m_context.m_ft_url = args["ft-url"].AsString();
593  if (args["ft-url-mod"])
594  m_context.m_ft_url_mod = args["ft-url-mod"].AsString();
595  if (args["A"])
596  m_context.m_accession.Reset(new CSeq_id(args["A"].AsString()));
597  if (args["j"]) {
598  m_context.mCommandLineMods = args["j"].AsString();
599  }
600  if (args["w"])
601  m_context.m_single_structure_cmt = args["w"].AsString();
602 
603  m_context.m_RemotePubLookup = args["P"].AsBoolean();
604  if (! m_context.m_RemotePubLookup) // those are always postprocessed
605  m_context.m_postprocess_pubs = args["postprocess-pubs"].AsBoolean();
606 
607  m_context.m_RemoteTaxonomyLookup = args["T"].AsBoolean();
608  if (m_context.m_RemoteTaxonomyLookup) {
609  m_context.m_cleanup += 'T';
610  }
611 
612  if (args["a"]) {
613  const string& a_arg = args["a"].AsString();
614  if (a_arg == "s" || a_arg == "z") {
615  m_context.m_HandleAsSet = true;
616  } else if (a_arg == "s1") {
617  m_context.m_HandleAsSet = true;
618  m_context.m_ClassValue = CBioseq_set::eClass_pop_set;
619  } else if (a_arg == "s2") {
620  m_context.m_HandleAsSet = true;
621  m_context.m_ClassValue = CBioseq_set::eClass_phy_set;
622  } else if (a_arg == "s3") {
623  m_context.m_HandleAsSet = true;
624  m_context.m_ClassValue = CBioseq_set::eClass_mut_set;
625  } else if (a_arg == "s4") {
626  m_context.m_HandleAsSet = true;
627  m_context.m_ClassValue = CBioseq_set::eClass_eco_set;
628  } else if (a_arg == "s9") {
629  m_context.m_HandleAsSet = true;
630  m_context.m_ClassValue = CBioseq_set::eClass_small_genome_set;
631  } else if (a_arg == "di") {
632  m_context.m_di_fasta = true;
633  } else if (a_arg == "d") {
634  m_context.m_d_fasta = true;
635  }
636  }
637  if (args["gaps-min"]) {
638  int gaps_min = args["gaps-min"].AsInteger();
639  if (gaps_min < 0) {
640  NCBI_THROW(CArgException, eConvert,
641  "Invalid value: gaps-min cannot be negative.");
642  }
643  m_context.m_gapNmin = static_cast<TSeqPos>(gaps_min);
644  }
645  if (args["gaps-unknown"]) {
646  int gaps_unknown = args["gaps-unknown"].AsInteger();
647  if (gaps_unknown < 0) {
648  NCBI_THROW(CArgException, eConvert,
649  "Invalid value: gaps-unknown cannot be negative.");
650  }
651  m_context.m_gap_Unknown_length = static_cast<TSeqPos>(gaps_unknown);
652  }
653  if (m_context.m_gap_Unknown_length > 0 && m_context.m_gapNmin == 0) {
654  m_context.m_gapNmin = m_context.m_gap_Unknown_length;
655  }
656 
657  if (args["linkage-evidence-file"]) {
658  // auto lefile_cstr = args["linkage-evidence-file"].AsString().c_str();
659  // auto pLEStream = make_unique<CNcbiIfstream>(lefile_cstr,ios::binary);
660 
662  args["linkage-evidence-file"].AsString(),
663  m_context.m_GapsizeToEvidence,
664  m_context.m_logger);
665  m_context.m_gap_type = CSeq_gap::eType_scaffold; // for compatibility with tbl2asn
666  }
667 
668  if (args["l"]) {
669  auto linkage_evidence_to_value = CLinkage_evidence::ENUM_METHOD_NAME(EType)();
670  for (auto& arg_it : args["l"].GetStringList()) {
671  try {
672  auto value = linkage_evidence_to_value->FindValue(arg_it);
673  m_context.m_DefaultEvidence.insert(value);
674  m_context.m_gap_type = CSeq_gap::eType_scaffold; // for compatibility with tbl2asn
675  } catch (...) {
676  NCBI_THROW(CArgException, eConvert,
677  "Unrecognized linkage evidence " + arg_it);
678  }
679  }
680  }
681 
682  if (args["gap-type"]) {
683  auto gaptype_to_value = CSeq_gap::ENUM_METHOD_NAME(EType)();
684  try {
685  auto value = gaptype_to_value->FindValue(args["gap-type"].AsString());
686  m_context.m_gap_type = value;
687  } catch (...) {
688  NCBI_THROW(CArgException, eConvert,
689  "Unrecognized gap type " + args["gap-type"].AsString());
690  }
691  }
692 
693  if (args["H"]) {
694  string sdate = args["H"].AsString();
695  if (sdate == "Y" || sdate == "y") {
696  m_context.m_HoldUntilPublish.SetCurrent();
697  m_context.m_HoldUntilPublish.SetYear(m_context.m_HoldUntilPublish.Year() + 1);
698  } else {
699  try {
700  if (sdate[0] == '\'' && sdate.length() > 0 && sdate[sdate.length() - 1] == '\'') {
701  sdate.erase(0, 1);
702  sdate.erase(sdate.length() - 1, 1);
703  }
704  m_context.m_HoldUntilPublish = CTime(sdate, "M/D/Y");
705  } catch (const CException&) {
706  int years = NStr::StringToInt(args["H"].AsString());
707  m_context.m_HoldUntilPublish.SetCurrent();
708  m_context.m_HoldUntilPublish.SetYear(m_context.m_HoldUntilPublish.Year() + years);
709  }
710  }
711  }
712 
713  if (args["N"])
714  m_context.m_ProjectVersionNumber = args["N"].AsString();
715 
716  if (args["C"]) {
717  m_context.m_genome_center_id = args["C"].AsString();
718  if (! m_context.m_ProjectVersionNumber.empty())
719  m_context.m_genome_center_id += m_context.m_ProjectVersionNumber;
720  }
721 
722  if (args["V"]) {
723  m_context.m_validate += args["V"].AsString();
724  size_t p;
725  while ((p = m_context.m_validate.find("b")) != string::npos) {
726  m_context.m_validate.erase(p, 1);
727  m_context.m_make_flatfile = true;
728  }
729  while ((p = m_context.m_validate.find("t")) != string::npos) {
730  //m_context.m_discrepancy = eTriState_False;
731  m_context.m_validate.erase(p, 1);
732  }
733  }
734 
735  if (args["Z"]) {
736  m_context.m_run_discrepancy = true;
737  if (args["split-dr"])
738  m_context.m_split_discrepancy = true;
739  }
740 
741  if (args["locus-tag-prefix"] || args["no-locus-tags-needed"]) {
742  if (args["locus-tag-prefix"] && args["no-locus-tags-needed"]) {
743  // mutually exclusive
744  NCBI_THROW(CArgException, eConstraint,
745  "-no-locus-tags-needed and -locus-tag-prefix are mutually exclusive");
746  }
747  if (args["no-locus-tags-needed"]) {
748  m_context.m_locus_tag_prefix = "";
749  m_context.m_locus_tags_needed = false;
750  } else {
751  m_context.m_locus_tag_prefix = args["locus-tag-prefix"].AsString();
752  m_context.m_locus_tags_needed = true;
753  }
754  }
755 
756  if (m_context.m_HandleAsSet) {
757  if (false)
758  NCBI_THROW(CArgException, eConstraint, "-s flag cannot be used with -d, -e, -l or -z");
759  }
760 
761  // Designate where do we output files: local folder, specified folder or a specific single output file
762  if (args["outdir"])
763  m_context.m_ResultsDirectory = CDir::AddTrailingPathSeparator(args["outdir"].AsString());
764 
765  if (args["o"]) {
766  m_context.m_output_filename = args["o"].AsString();
767  m_context.m_output = &args["o"].AsOutputFile();
768  } else {
769  if (args["outdir"]) {
770  CDir outputdir(m_context.m_ResultsDirectory);
771  if (! IsDryRun())
772  if (! outputdir.Exists())
773  outputdir.Create();
774  }
775  }
776 
777  m_context.m_eukaryote = args["euk"].AsBoolean();
778 
779  if (m_context.m_cleanup.find('f') != string::npos)
780  m_context.m_use_hypothetic_protein = true;
781 
782  if (args["suspect-rules"])
783  m_context.m_suspect_rules->SetRulesFilename(args["suspect-rules"].AsString());
784 
785  try {
786  if (args["t"]) {
787  m_context.m_t = true;
788  m_reader->LoadTemplate(args["t"].AsString());
789  }
790  } catch (const CException&) {
791  g_LogGeneralParsingError("Error loading template file", *m_logger);
792  }
793 
794  try {
795  if (args["D"]) {
796  m_reader->LoadDescriptors(args["D"].AsString(), m_global_files.m_descriptors);
797  }
798  } catch (const CException&) {
799  g_LogGeneralParsingError("Error loading descriptors file", *m_logger);
800  }
801 
802  if (m_logger->Count() == 0)
803  try {
804  if (args["f"]) {
805  string annot_file = args["f"].AsString();
806  if (! CFile(annot_file).Exists()) {
808  "The specified annotation file \"" + annot_file + "\" does not exist.",
809  *m_logger);
810  }
811  m_context.m_single_annot_file = args["f"].AsString();
812  }
813  if (args["src-file"]) {
814  string src_file = args["src-file"].AsString();
815  if (! CFile(src_file).Exists()) {
817  "The specified source qualifier file \"" + src_file + "\" does not exist.",
818  *m_logger);
819  }
820  m_context.m_single_source_qual_file = args["src-file"].AsString();
821  }
822 
823  // Designate where do we get input: single file or a folder or folder structure
824  if (args["i"]) {
825  m_context.m_current_file = args["i"].AsString();
826  CFile argAsFile(m_context.m_current_file);
827  if (! argAsFile.Exists()) {
829  "The specified input file \"" + m_context.m_current_file + "\" does not exist.",
830  *m_logger);
831  }
832  if (argAsFile.GetLength() > TBL2ASN_MAX_ALLOWED_FASTA_SIZE && m_context.m_disable_huge_files) {
833  if (CFormatGuess::Format(m_context.m_current_file) == CFormatGuess::eFasta) {
835  "The specified input file \"" +
836  m_context.m_current_file +
837  "\" is too long. The maximum permissible file size for a FASTA sequence is " +
838  NStr::NumericToString(TBL2ASN_MAX_ALLOWED_FASTA_SIZE, NStr::fWithCommas) +
839  " bytes. Consider allowing huge mode (remove \"-disable-huge\" from the command line).",
840  *m_logger);
841  }
842  }
843  ProcessOneFile(false);
844  } else if (args["indir"]) {
845  // initiate validator output
846  string indir = args["indir"].AsString();
847  CDir directory(indir);
848  if (! directory.Exists()) {
850  "The specified input directory \"" + indir + "\" does not exist.",
851  *m_logger);
852  }
853  string basename = m_context.m_output_filename.empty() ?
855  m_context.m_output_filename;
856 
857  m_context.m_base_name = basename;
858 
859  CMaskFileName masks;
860  masks.Add("*" + args["x"].AsString());
861 
862  ProcessOneDirectory(directory, masks, args["E"].AsBoolean());
863  } else if (args["aln-file"]) {
864  m_context.m_current_file = args["aln-file"].AsString();
865  if (! CFile(m_context.m_current_file).Exists()) {
867  "The specified alignment file \"" + m_context.m_current_file + "\" does not exist.",
868  *m_logger);
869  }
870  const bool isAlignment = true;
871  ProcessOneFile(isAlignment);
872  }
873 
874  // RW-927
875  if (m_context.m_verbose &&
876  m_global_files.mp_src_qual_map &&
877  ! m_global_files.mp_src_qual_map->Empty()) {
878  m_global_files.mp_src_qual_map->ReportUnusedIds();
879  }
880 
881  if (m_validator->ValTotalErrors() > 0) {
882  std::ofstream ostr;
883  ostr.exceptions(ios::failbit | ios::badbit);
884  ostr.open(m_context.GenerateOutputFilename(eFiles::stats, m_context.m_base_name));
885  m_validator->ValReportErrorStats(ostr);
886  }
887  m_validator->ReportDiscrepancies(m_context.GenerateOutputFilename(eFiles::dr, m_context.m_base_name));
888  } catch (const CMissingInputException&) {
889  // Error message has already been logged
890  } catch (const CException& ex) {
891  const CException* original = &ex;
892  // The ASN writer populates the exception with the whole stack of nested exceptions, which is not necessary;
893  // we need the original exception
894  while (original->GetPredecessor()) original = original->GetPredecessor();
895 
896  auto msg = original->GetMsg();
897  auto bad_res_exc = dynamic_cast<const CBadResiduesException*>(original);
898  if (bad_res_exc) {
899  int line = 0;
901  if (bad_res_exc->GetBadResiduePositions().m_BadIndexMap.size() == 1) {
902  line = bad_res_exc->GetBadResiduePositions().m_BadIndexMap.begin()->first;
903  } else {
904  lines.reserve(bad_res_exc->GetBadResiduePositions().m_BadIndexMap.size());
905  for(auto rec: bad_res_exc->GetBadResiduePositions().m_BadIndexMap) {
906  lines.push_back(rec.first);
907  }
908  }
909 
910  unique_ptr<CLineError> le(
912  bad_res_exc->GetBadResiduePositions().m_SeqId->AsFastaString(),
913  line, "", "", "", msg, lines));
914  m_logger->PutError(*le);
915  } else {
916  auto seq_map_exc = dynamic_cast<const CSeqMapException*>(original);
917  if (seq_map_exc) {
918  if (!args["r"] && !(args.Exist("vdb") && args["vdb"]) && msg.find("Cannot resolve") != string::npos) {
919  msg += " - try running with -r to enable remote retrieval of sequences";
920  }
921  }
922  g_LogGeneralParsingError(msg, *m_logger);
923  }
924  }
925 
926  int ret = 0;
927  if (m_logger->Count() == 0) {
928  #ifdef THIS_IS_TRUNK_BUILD
929  m_context.m_remote_updater->ReportStats(std::cerr);
930  #endif
931  } else {
932  m_logger->Dump();
933  if (args["logxml"]) {
934  CNcbiOstream& log_xml = args["logxml"].AsOutputFile();
935  log_xml << "<logmessages>" << endl;
936  m_logger->DumpAsXML(log_xml);
937  log_xml << "</logmessages>" << endl;
938  }
939 
940  size_t errors = m_logger->LevelCount(eDiag_Critical) +
941  m_logger->LevelCount(eDiag_Error) +
942  m_logger->LevelCount(eDiag_Fatal);
943  // all errors reported as failure
944  if (errors > 0) {
945  ret = 1;
946  } else {
947  // only warnings reported as 2
948  if (m_logger->LevelCount(eDiag_Warning) > 0)
949  ret = 2;
950  else // otherwise it's ok
951  ret = 0;
952  }
953  }
954 
955  // prevent further logging after m_logger is autodestroyed; RW-2219, RW-2233
956  SetDiagHandler(nullptr, false);
957  return ret;
958 }
959 
961  CFormatGuess::EFormat inputFormat,
964 {
965  auto scope = Ref(new CScope(*m_context.m_ObjMgr));
966  scope->AddDefaults();
967 
968  CRef<CSeq_entry> entry;
969  CRef<CSeq_submit> submit;
970 
971  m_reader->GetSeqEntry(entry, submit, obj);
972 
973  bool avoid_submit_block = false;
974 
975  entry->Parentize();
976 
977  if (m_context.m_SetIDFromFile) {
978  m_context.SetSeqId(*entry);
979  }
980 
981  m_context.ApplyAccession(*entry);
982 
983  if (! IsDryRun()) {
984  std::function<std::ostream&()> f = [this]() -> std::ostream& { return m_context.GetOstream(eFiles::fixedproducts); };
985  m_context.m_suspect_rules->SetupOutput(f);
986  }
987  m_context.ApplyFileTracks(*entry);
988 
989  const bool readModsFromTitle =
990  inputFormat == CFormatGuess::eFasta ||
991  inputFormat == CFormatGuess::eAlignment;
992  xProcessSecretFiles1Phase(readModsFromTitle, *entry);
993 
994  if (m_context.m_RemoteTaxonomyLookup) {
995  m_context.m_remote_updater->UpdateOrgFromTaxon(*entry);
996  } else {
998  }
999 
1000  m_secret_files->m_feature_table_reader->m_replacement_protein = m_secret_files->m_replacement_proteins;
1001  TAsyncToken token;
1002  m_secret_files->m_feature_table_reader->MergeCDSFeatures(*entry, token);
1003 
1004  entry->Parentize();
1005  m_secret_files->m_feature_table_reader->MoveProteinSpecificFeats(*entry);
1006 
1007  m_context.CorrectCollectionDates(*entry);
1008 
1009  if (m_secret_files->m_possible_proteins.NotEmpty())
1010  m_secret_files->m_feature_table_reader->AddProteins(*m_secret_files->m_possible_proteins, *entry);
1011 
1012  if (m_context.m_HandleAsSet) {
1013  m_secret_files->m_feature_table_reader->ConvertNucSetToSet(entry);
1014  }
1015 
1016  if ((inputFormat == CFormatGuess::eTextASN) ||
1017  (inputFormat == CFormatGuess::eBinaryASN)) {
1018  // if create-date exists apply update date
1019  m_context.ApplyCreateUpdateDates(*entry);
1020  }
1021 
1022  m_context.ApplyComments(*entry);
1023  ProcessSecretFiles2Phase(*entry);
1024 
1025  // These methods neither remove nor change 'entry'; they create a 'result' object that is either
1026  // equal to 'entry' or contains a reference to 'entry'.
1027  if (avoid_submit_block)
1028  result = m_context.CreateSeqEntryFromTemplate(entry);
1029  else
1030  result = m_context.CreateSubmitFromTemplate(entry, submit);
1031 
1032  m_secret_files->m_feature_table_reader->MakeGapsFromFeatures(*entry);
1033 
1034  if (m_context.m_delay_genprodset) {
1035  VisitAllFeatures(*entry, [this](CSeq_feat& feature) { m_context.RenameProteinIdsQuals(feature); });
1036  } else {
1037  VisitAllFeatures(*entry, [this](CSeq_feat& feature) { m_context.RemoveProteinIdsQuals(feature); });
1038  }
1039 
1040  CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(*entry);
1042 
1043  if (m_context.m_RemotePubLookup) {
1044  m_context.m_remote_updater->UpdatePubReferences(*obj);
1045  }
1046  if (m_context.m_postprocess_pubs) {
1047  edit::CRemoteUpdater::PostProcessPubs(*entry);
1048  }
1049 
1050  if (m_context.m_cleanup.find('-') == string::npos) {
1051  m_validator->Cleanup(submit, seh, m_context.m_cleanup);
1052  }
1053 
1054  // make the ASN.1 look nicer
1055  edit::SortSeqDescr(*entry);
1056 
1057  m_secret_files->m_feature_table_reader->ChangeDeltaProteinToRawProtein(*entry);
1058 
1059  if (! IsDryRun()) {
1060  m_validator->UpdateECNumbers(*entry);
1061 
1062  if (! m_context.m_validate.empty()) {
1063  m_validator->ValCollect(submit, entry, m_context.m_validate);
1064  }
1065 
1066  m_validator->CollectDiscrepancies(submit, seh);
1067 
1068  if (m_context.m_make_flatfile) {
1069  MakeFlatFile(seh, submit, m_context.GetOstream(eFiles::gbf));
1070  }
1071  }
1072 }
1073 
1074 void CTbl2AsnApp::ProcessTopEntry(CFormatGuess::EFormat inputFormat, bool need_update_date, CRef<CSeq_submit>& submit, CRef<CSeq_entry>& entry)
1075 {
1076  m_context.ApplyComments(*entry);
1077 
1078  if (m_global_files.m_descriptors)
1079  m_reader->ApplyDescriptors(*entry, *m_global_files.m_descriptors);
1080 
1081  if (m_secret_files->m_descriptors)
1082  m_reader->ApplyDescriptors(*entry, *m_secret_files->m_descriptors);
1083 
1084  if (need_update_date) {
1085  m_context.ApplyUpdateDate(*entry);
1086  }
1087 
1088  if (submit) {
1089  if (m_context.m_RemotePubLookup) {
1090  m_context.m_remote_updater->UpdatePubReferences(*submit);
1091  }
1092 
1093  CCleanup cleanup(nullptr, CCleanup::eScope_UseInPlace); // RW-1070 - CCleanup::eScope_UseInPlace is essential
1094  cleanup.ExtendedCleanup(*submit, CCleanup::eClean_NoNcbiUserObjects);
1095  }
1096 
1097  bool need_report = (inputFormat == CFormatGuess::eFasta) && ! m_context.m_HandleAsSet;
1098  if (need_report) {
1100  eDiag_Warning,
1101  "File " + m_context.m_current_file + " contains multiple sequences",
1102  *(m_context.m_logger));
1103  }
1104 }
1105 
1107 {
1108  auto& submit = token.submit;
1109  auto& entry = token.top_entry;
1110  auto& scope = token.scope;
1111  auto& seh = token.seh;
1112 
1113  scope = Ref(new CScope(*m_context.m_ObjMgr));
1114  scope->AddDefaults();
1115 
1116  /*
1117  for FASTA inputs 'entry' argument is:
1118  - always a single seq object
1119  for ASN.1 inputs:
1120  - either a single seq object, or a seq-set if it's a nuc-prot-set
1121  'submit' has already been processed and cleaned and may contain a single entry that points to the 'entry' argument;
1122  the 'submit' object is not necessarily the one going to the output
1123  */
1124 
1125  CRef<CSerialObject> obj;
1126  if (submit)
1127  obj = submit;
1128  else
1129  obj = entry;
1130 
1131  if (m_context.m_SetIDFromFile) {
1132  m_context.SetSeqId(*entry);
1133  }
1134 
1135  m_context.ApplyAccession(*entry);
1136  m_context.ApplyFileTracks(*entry);
1137 
1138  const bool readModsFromTitle =
1139  inputFormat == CFormatGuess::eFasta ||
1140  inputFormat == CFormatGuess::eAlignment;
1141  ProcessSecretFiles1Phase(readModsFromTitle, token);
1142 
1143  if (m_context.m_RemoteTaxonomyLookup) {
1144  m_context.m_remote_updater->UpdateOrgFromTaxon(*entry);
1145  } else {
1147  }
1148 
1149  m_secret_files->m_feature_table_reader->m_replacement_protein = m_secret_files->m_replacement_proteins;
1150  m_secret_files->m_feature_table_reader->MergeCDSFeatures(*entry, token);
1151  entry->Parentize();
1152 
1153  m_secret_files->m_feature_table_reader->MoveProteinSpecificFeats(*entry);
1154 
1155  if (m_secret_files->m_possible_proteins.NotEmpty())
1156  m_secret_files->m_feature_table_reader->AddProteins(*m_secret_files->m_possible_proteins, *entry);
1157 
1158  m_context.CorrectCollectionDates(*entry);
1159 
1160  if (m_context.m_HandleAsSet) {
1161  //m_secret_files->m_feature_table_reader->ConvertNucSetToSet(entry);
1162  }
1163 
1164  if ((inputFormat == CFormatGuess::eTextASN) ||
1165  (inputFormat == CFormatGuess::eBinaryASN)) {
1166  // if create-date exists apply update date
1167  m_context.ApplyCreateUpdateDates(*entry);
1168  }
1169 
1170  m_context.ApplyComments(*entry);
1171 
1172  ProcessSecretFiles2Phase(*entry);
1173 
1174  m_secret_files->m_feature_table_reader->MakeGapsFromFeatures(*entry);
1175 
1176  if (m_context.m_delay_genprodset) {
1177  VisitAllFeatures(*entry, [this](CSeq_feat& feature) { m_context.RenameProteinIdsQuals(feature); });
1178  } else {
1179  VisitAllFeatures(*entry, [this](CSeq_feat& feature) { m_context.RemoveProteinIdsQuals(feature); });
1180  }
1181 
1182  seh = scope->AddTopLevelSeqEntry(*entry);
1184 
1185  // Do not repeat expensive processing of the top-level entry; RW-2107
1186  if ((inputFormat != CFormatGuess::eFasta) || ! *token.pPubLookupDone) {
1187  if (m_context.m_RemotePubLookup) {
1188  m_context.m_remote_updater->UpdatePubReferences(*obj);
1189  *token.pPubLookupDone = true;
1190  }
1191  if (m_context.m_postprocess_pubs) {
1192  m_context.m_remote_updater->PostProcessPubs(*entry);
1193  }
1194  }
1195 
1196  if (m_context.m_cleanup.find('-') == string::npos) {
1197  if (token.cleanup_mutex) {
1198  std::lock_guard<std::mutex> g{ *token.cleanup_mutex };
1199  m_validator->Cleanup(submit, seh, m_context.m_cleanup);
1200  } else {
1201  m_validator->Cleanup(submit, seh, m_context.m_cleanup);
1202  }
1203  }
1204 
1205  // make the ASN.1 look nicer
1206  edit::SortSeqDescr(*entry);
1207 
1208  m_secret_files->m_feature_table_reader->ChangeDeltaProteinToRawProtein(*entry);
1209 
1210  m_validator->UpdateECNumbers(*entry);
1211 
1212  if (! m_context.m_validate.empty()) {
1213  m_validator->ValCollect(submit, entry, m_context.m_validate);
1214  }
1215 
1216  m_validator->CollectDiscrepancies(submit, seh);
1217 
1218  // ff generator is invoked in other places
1219 }
1220 
1221 void CTbl2AsnApp::MakeFlatFile(CSeq_entry_Handle seh, CRef<CSeq_submit> submit, std::ostream& ostream)
1222 {
1223  CFlatFileGenerator ffgenerator(
1226 
1227  if (submit.Empty())
1228  ffgenerator.Generate(seh, ostream);
1229  else
1230  ffgenerator.Generate(*submit, seh.GetScope(), ostream);
1231 }
1232 
1234 {
1236  m_context.SetOutputFilename(e, m_context.GenerateOutputFilename(e));
1237  }
1238 
1239  if (m_context.m_split_discrepancy) {// otherwise leave it unopened
1240  m_context.SetOutputFilename(eFiles::dr, m_context.GenerateOutputFilename(eFiles::dr));
1241  }
1242  m_context.OpenDiagnosticOutputs();
1243 }
1244 
1246 {
1247  if (m_context.m_output) {
1248  m_context.SetOutputFile(eFiles::asn, *m_context.m_output);
1249  } else {
1250  m_context.SetOutputFilename(eFiles::asn, m_context.GenerateOutputFilename(eFiles::asn));
1251  }
1252  m_context.OpenDataOutputs();
1253 }
1254 
1256 {
1257  m_context.CloseDiagnosticOutputs();
1258 }
1259 
1261 {
1262  m_context.CloseDataOutputs();
1263 }
1264 
1266 {
1267  if (context.m_disable_huge_files) {
1268  return false;
1269  }
1270  return (context.m_can_use_huge_files ||
1273 }
1274 
1275 
1276 void CTbl2AsnApp::ProcessOneFile(bool isAlignment, bool manageDiagnosticStreams, bool manageDataStreams)
1277 {
1278  if (m_context.m_split_log_files)
1279  m_context.m_logger->ClearAll();
1280 
1281  CFile log_name;
1282  if (! IsDryRun() && m_context.m_split_log_files) {
1283  log_name = m_context.GenerateOutputFilename(eFiles::log);
1284  CNcbiOstream* error_log = new CNcbiOfstream(log_name.GetPath());
1285  m_logger->SetProgressOstream(error_log);
1286  SetDiagStream(error_log);
1287  }
1288 
1289  try {
1290  if (manageDiagnosticStreams) {
1291  SetupAndOpenDiagnosticStreams();
1292  }
1293  if (manageDataStreams) {
1294  SetupAndOpenDataStreams();
1295  }
1296  CNcbiOstream* output = &m_context.GetOstream(eFiles::asn);
1297 
1298  std::function<std::ostream&()> f = [this]() -> std::ostream& { return m_context.GetOstream(eFiles::fixedproducts); };
1299  m_context.m_suspect_rules->SetupOutput(f);
1300 
1301  m_context.m_huge_files_mode = false;
1302 
1303  LoadAdditionalFiles();
1304 
1305  if (isAlignment) {
1306  ProcessAlignmentFile(output);
1307  } else {
1308  m_validator->Clear();
1309 
1310  edit::CHugeFile hugeFile;
1311  try {
1312  hugeFile.Open(m_context.m_current_file, &CMultiReader::kSupportedTypes);
1313  } catch (CObjReaderParseException& e) {
1314  auto message = e.GetMsg();
1315  if (message == "File format not supported") {
1316  hugeFile.m_format = CFormatGuess::eFasta;
1317  } else {
1318  throw;
1319  }
1320  }
1321 
1322  if (s_UseHugeFileMode(m_context, hugeFile.m_format)) {
1323 
1324  if (! m_context.m_use_threads) {
1325  m_context.m_use_threads = xGetNumThreads();
1326  }
1327 
1328  ProcessHugeFile(hugeFile, output);
1329  } else {
1330  const string objectType =
1331  hugeFile.m_content ?
1332  hugeFile.m_content->GetName() :
1333  "";
1334  ProcessOneFile(
1335  hugeFile.m_format,
1336  objectType,
1337  // *(hugeFile.m_stream),
1338  hugeFile.m_stream,
1339  output);
1340  }
1341 
1342  if (! m_context.m_validate.empty())
1343  m_validator->ValReportErrors();
1344 
1345  if (m_context.m_split_discrepancy)
1346  m_validator->ReportDiscrepancies();
1347 
1348  ReportUnusedSourceQuals();
1349 
1350  } // !isAlignment
1351 
1352  if (! log_name.GetPath().empty()) {
1353  m_logger->SetProgressOstream(&NcbiCout);
1354  }
1355  if (manageDiagnosticStreams) {
1356  CloseDiagnosticStreams();
1357  }
1358  if (manageDataStreams) {
1359  CloseDataStreams();
1360  }
1361  } catch (...) {
1362  if (! log_name.GetPath().empty()) {
1363  m_logger->SetProgressOstream(&NcbiCout);
1364  }
1365 
1366  m_context.DeleteOutputs();
1367  if (m_context.m_output) {
1368  GetArgs()["o"].CloseFile();
1369  _ASSERT(! m_context.m_output_filename.empty());
1370  CFile(m_context.m_output_filename).Remove(CDirEntry::fIgnoreMissing);
1371  }
1372 
1373  throw;
1374  }
1375 }
1376 
1379  const string& contentType,
1380  unique_ptr<CNcbiIstream>& pIstr,
1382 {
1383  CMultiReader::TAnnotMap annotMap;
1384  CRef<CSerialObject> pInputObject =
1385  m_reader->FetchEntry(format,
1386  contentType,
1387  pIstr,
1388  annotMap);
1389 
1390  xProcessOneFile(format, pInputObject, annotMap, output);
1391 }
1392 
1393 
1395 {
1396  CRef<CSerialObject> input_obj;
1397  CMultiReader::TAnnotMap annotMap;
1398  CFormatGuess::EFormat format = m_reader->OpenFile(m_context.m_current_file, input_obj, annotMap);
1399  xProcessOneFile(format, input_obj, annotMap, output);
1400 }
1401 
1404  CRef<CSerialObject> input_obj,
1405  TAnnotMap& annotMap,
1407 {
1408  if (! annotMap.empty()) {
1409  for (auto entry : annotMap) {
1410  auto it = m_secret_files->m_AnnotMap.find(entry.first);
1411  if (it == m_secret_files->m_AnnotMap.end()) {
1412  m_secret_files->m_AnnotMap.emplace(entry.first, entry.second);
1413  } else {
1414  it->second.splice(it->second.end(), entry.second);
1415  }
1416  }
1417  annotMap.clear();
1418  }
1419 
1420  do {
1422  ProcessOneEntry(format, input_obj, result);
1423 
1424  if (! IsDryRun() && result.NotEmpty()) {
1425  const CSerialObject* to_write = result;
1426  if (m_context.m_save_bioseq_set) {
1427  if (result->GetThisTypeInfo()->IsType(CSeq_entry::GetTypeInfo())) {
1428  const CSeq_entry* se = static_cast<const CSeq_entry*>(result.GetPointer());
1429  if (se->IsSet())
1430  to_write = &se->GetSet();
1431  }
1432  }
1433 
1434  m_reader->WriteObject(*to_write, *output);
1435  }
1436  input_obj = m_reader->ReadNextEntry();
1437  } while (input_obj.NotEmpty());
1438 }
1439 
1441 {
1442  const string& filename = m_context.m_current_file;
1443  unique_ptr<CNcbiIstream> pIstream(new CNcbiIfstream(filename));
1444 
1445  CRef<CSeq_entry> pEntry = m_reader->ReadAlignment(*pIstream, GetArgs());
1446  pEntry->Parentize();
1447  m_context.MergeWithTemplate(*pEntry);
1448 
1449  CRef<CSerialObject> pResult;
1450 
1452  ProcessOneEntry(inputFormat, pEntry, pResult);
1453 
1454  if (IsDryRun() || ! pResult) {
1455  return;
1456  }
1457 
1458  if (m_context.m_save_bioseq_set &&
1459  pResult->GetThisTypeInfo()->IsType(CSeq_entry::GetTypeInfo())) {
1460 
1461  const CSeq_entry* pTempEntry
1462  = static_cast<const CSeq_entry*>(pResult.GetPointer());
1463  if (pTempEntry->IsSet()) {
1464  m_reader->WriteObject(pTempEntry->GetSet(), *output);
1465  return;
1466  }
1467  }
1468  m_reader->WriteObject(*pResult, *output);
1469 }
1470 
1471 
1472 bool CTbl2AsnApp::ProcessOneDirectory(const CDir& directory, const CMask& mask, bool recurse)
1473 {
1474  unique_ptr<CDir::TEntries> entries(directory.GetEntriesPtr("*", CDir::fCreateObjects | CDir::fIgnoreRecursive));
1475  vector<unique_ptr<CDir::CDirEntry>> vec(entries->size());
1476  auto vec_it = vec.begin();
1477  for (CDir::TEntry& it : *entries) {
1478  vec_it->reset(it.release());
1479  ++vec_it;
1480  }
1481 
1482  auto compareNames = [](const auto& l, const auto& r) { return l->GetPath() < r->GetPath(); };
1483  sort(vec.begin(), vec.end(), compareNames);
1484 
1485  bool commonOutputStream = GetArgs()["o"];
1486  if (commonOutputStream) {
1487  SetupAndOpenDataStreams();
1488  }
1489  SetupAndOpenDiagnosticStreams();
1490 
1491  for (const auto& it : vec) {
1492  // first process files and then recursivelly access other folders
1493  if (! it->IsDir()) {
1494  auto pathName = it->GetPath();
1495  if (mask.Match(pathName)) {
1496  m_context.m_current_file = pathName;
1497  ProcessOneFile(false, ! commonOutputStream, ! commonOutputStream);
1498  }
1499  } else if (recurse) {
1500  ProcessOneDirectory(*it, mask, recurse);
1501  }
1502  }
1503  if (commonOutputStream) {
1504  CloseDataStreams();
1505  }
1506  CloseDiagnosticStreams();
1507  return true;
1508 }
1509 
1510 
1511 void CTbl2AsnApp::Setup(const CArgs& args)
1512 {
1513  // initialize conn library
1514  CONNECT_Init(&GetConfig());
1515 
1516  // Create object manager and scope
1517 
1518  m_context.m_ObjMgr = CObjectManager::GetInstance();
1519  CDataLoadersUtil::SetupObjectManager(args, *m_context.m_ObjMgr, default_loaders);
1520 }
1521 
1522 /*
1523 .tbl   5-column Feature Table
1524 .src   tab-delimited table with source qualifiers
1525 .qvl   PHRAP/PHRED/consed quality scores
1526 .dsc  One or more descriptors in ASN.1 format
1527 .cmt  Tab-delimited file for structured comment
1528 .pep  Replacement proteins for coding regions on this sequence, use to mark conflicts
1529 .rna   Replacement mRNA sequences for RNA editing
1530 .prt   Proteins for suggest intervals
1531 */
1533 {
1534  auto modMergePolicy =
1535  m_context.m_accumulate_mods ?
1538 
1539  g_ApplyMods(
1540  m_global_files.mp_src_qual_map.get(),
1541  m_secret_files->mp_src_qual_map.get(),
1542  m_context.mCommandLineMods,
1543  readModsFromTitle,
1544  m_context.m_verbose,
1545  modMergePolicy,
1546  m_logger,
1547  result);
1548 
1549  if (! m_context.m_huge_files_mode) {
1550  if (m_global_files.m_descriptors)
1551  m_reader->ApplyDescriptors(result, *m_global_files.m_descriptors);
1552  if (m_secret_files->m_descriptors)
1553  m_reader->ApplyDescriptors(result, *m_secret_files->m_descriptors);
1554  }
1555 
1556  if (! m_global_files.m_AnnotMap.empty() || ! m_secret_files->m_AnnotMap.empty()) {
1557  AddAnnots(result);
1558  }
1559 }
1560 
1561 void CTbl2AsnApp::ProcessSecretFiles1Phase(bool readModsFromTitle, TAsyncToken& token)
1562 {
1563  auto modMergePolicy =
1564  m_context.m_accumulate_mods ?
1567 
1568  g_ApplyMods(
1569  m_global_files.mp_src_qual_map.get(),
1570  m_secret_files->mp_src_qual_map.get(),
1571  m_context.mCommandLineMods,
1572  readModsFromTitle,
1573  m_context.m_verbose,
1574  modMergePolicy,
1575  m_logger,
1576  *token.top_entry);
1577 
1578  if (! m_context.m_huge_files_mode) {
1579  if (m_global_files.m_descriptors)
1580  m_reader->ApplyDescriptors(*token.top_entry, *m_global_files.m_descriptors);
1581  if (m_secret_files->m_descriptors)
1582  m_reader->ApplyDescriptors(*token.top_entry, *m_secret_files->m_descriptors);
1583  }
1584 
1585  if (! m_global_files.m_AnnotMap.empty() || ! m_secret_files->m_AnnotMap.empty()) {
1586  AddAnnots(*token.top_entry);
1587  }
1588 }
1589 
1590 
1592 {
1593  ProcessCMTFiles(result);
1594 }
1595 
1596 void CTbl2AsnApp::LoadDSCFile(const string& pathname)
1597 {
1598  CFile file(pathname);
1599  if (! file.Exists() || file.GetLength() == 0)
1600  return;
1601  m_reader->LoadDescriptors(pathname, m_secret_files->m_descriptors);
1602 }
1603 
1605 {
1606  if (m_global_files.m_struct_comments)
1607  m_global_files.m_struct_comments->ProcessComments(result);
1608  if (m_secret_files && m_secret_files->m_struct_comments)
1609  m_secret_files->m_struct_comments->ProcessComments(result);
1610 }
1611 
1612 void CTbl2AsnApp::LoadPEPFile(const string& pathname)
1613 {
1614  CFile file(pathname);
1615  if (! file.Exists() || file.GetLength() == 0)
1616  return;
1617 
1618  CRef<ILineReader> reader(ILineReader::New(pathname));
1619 
1620  m_secret_files->m_replacement_proteins = m_secret_files->m_feature_table_reader->ReadProtein(*reader);
1621 }
1622 
1623 void CTbl2AsnApp::LoadRNAFile(const string& pathname)
1624 {
1625  CFile file(pathname);
1626  if (! file.Exists() || file.GetLength() == 0)
1627  return;
1628 }
1629 
1630 void CTbl2AsnApp::LoadPRTFile(const string& pathname)
1631 {
1632  CFile file(pathname);
1633  if (! file.Exists() || file.GetLength() == 0)
1634  return;
1635 
1636  CRef<ILineReader> reader(ILineReader::New(pathname));
1637 
1638  m_secret_files->m_possible_proteins = m_secret_files->m_feature_table_reader->ReadProtein(*reader);
1639 }
1640 
1641 
1642 void CTbl2AsnApp::LoadAnnotMap(const string& pathname, TAnnotMap& annotMap)
1643 {
1644  CFile file(pathname);
1645 
1646  if (! file.Exists())
1647  return;
1648 
1649  if (file.IsIdentical(m_context.m_current_file)) {
1651  eDiag_Warning,
1652  "Ignorning annotation " + pathname + " because it was already used as input source",
1653  *m_logger);
1654  return;
1655  }
1656 
1657  if (file.GetLength() == 0) {
1658  g_LogGeneralParsingError(eDiag_Warning, "Empty file: " + pathname, *m_logger);
1659  return;
1660  }
1661 
1662  m_reader->LoadAnnotMap(pathname, annotMap);
1663 }
1664 
1665 
1667 {
1668  if (entry.IsSeq()) {
1669  m_reader->AddAnnots(m_global_files.m_AnnotMap, m_global_files.m_MatchedAnnots, entry.SetSeq());
1670  if (m_secret_files) {
1671  m_reader->AddAnnots(m_secret_files->m_AnnotMap, m_secret_files->m_MatchedAnnots, entry.SetSeq());
1672  }
1673  return;
1674  }
1675 
1676  if (! entry.GetSet().IsSetSeq_set()) {
1677  return;
1678  }
1679 
1680  // If this is a nuc-prot set, only add annotations to the nucleotide sequence
1681  if (entry.GetSet().IsSetClass() &&
1683  { // We expect the nucleotide sequence to appear first, but this will work even if it doesn't.
1684  for (auto pSubEntry : entry.SetSet().SetSeq_set()) {
1685  if (pSubEntry && pSubEntry->IsSeq() && pSubEntry->GetSeq().IsNa()) {
1686  AddAnnots(*pSubEntry);
1687  return;
1688  }
1689  }
1690  }
1691 
1692  for (auto pSubEntry : entry.SetSet().SetSeq_set()) {
1693  if (pSubEntry) {
1694  AddAnnots(*pSubEntry);
1695  }
1696  }
1697 }
1698 
1699 void CTbl2AsnApp::LoadCMTFile(const string& pathname, unique_ptr<CTable2AsnStructuredCommentsReader>& comments)
1700 {
1701  if (! comments) {
1702  CFile file(pathname);
1703  if (file.Exists() && file.GetLength()) {
1704  comments.reset(new CTable2AsnStructuredCommentsReader(pathname, m_logger, m_context.m_verbose));
1705  }
1706  }
1707 }
1708 
1710 {
1711  string dir;
1712  string base;
1713  string ext;
1714  CDirEntry::SplitPath(m_context.m_current_file, &dir, &base, &ext);
1715 
1716  string name = dir + base;
1717 
1718  // always reset secret file
1719  m_secret_files.reset(new TAdditionalFiles);
1720  m_secret_files->m_feature_table_reader.reset(new CFeatureTableReader(m_context));
1721 
1722  const auto& namedSrcFile = m_context.m_single_source_qual_file;
1723  if (! NStr::IsBlank(namedSrcFile) && ! m_global_files.mp_src_qual_map) {
1724  m_global_files.mp_src_qual_map.reset(new CMemorySrcFileMap(m_logger));
1725  m_global_files.mp_src_qual_map->MapFile(namedSrcFile, m_context.m_allow_accession);
1726  }
1727 
1728  const string defaultSrcFile = name + ".src";
1729  if (CFile(defaultSrcFile).Exists()) {
1730  m_secret_files->mp_src_qual_map.reset(new CMemorySrcFileMap(m_logger));
1731  m_secret_files->mp_src_qual_map->MapFile(defaultSrcFile, m_context.m_allow_accession);
1732  }
1733 
1734  LoadPEPFile(name + ".pep");
1735  LoadRNAFile(name + ".rna");
1736  LoadPRTFile(name + ".prt");
1737  LoadDSCFile(name + ".dsc");
1738 
1739  LoadCMTFile(m_context.m_single_structure_cmt, m_global_files.m_struct_comments);
1740  LoadCMTFile(name + ".cmt", m_secret_files->m_struct_comments);
1741 
1742  // if (m_context.m_can_use_huge_files && ! m_context.m_disable_huge_files) {
1743  {
1744  if (! m_context.m_single_annot_file.empty() && m_global_files.m_AnnotMap.empty())
1745  { // load only once
1746  LoadAnnotMap(m_context.m_single_annot_file, m_global_files.m_AnnotMap);
1747  } else {
1748  for (auto suffix : { ".tbl", ".gff", ".gff3", ".gff2", ".gtf" }) {
1749  LoadAnnotMap(name + suffix, m_secret_files->m_AnnotMap);
1750  }
1751 #ifdef THIS_IS_TRUNK_BUILD
1752  for (auto suffix : { ".gbf" }) {
1753  LoadAnnotMap(name + suffix, m_secret_files->m_AnnotMap);
1754  }
1755 #endif
1756  }
1757  }
1758 }
1759 
1761 {
1762  if (m_context.m_verbose && m_secret_files && m_secret_files->mp_src_qual_map)
1763  m_secret_files->mp_src_qual_map->ReportUnusedIds();
1764 }
1765 
1767 
1768 /////////////////////////////////////////////////////////////////////////////
1769 // MAIN
1770 
1771 int main(int argc, const char* argv[])
1772 {
1773  #ifdef _DEBUG
1774  // this code converts single argument into multiple, just to simplify testing
1775  list<string> split_args;
1776  vector<const char*> new_argv;
1777 
1778  if (argc==2 && argv && argv[1] && strchr(argv[1], ' ')) {
1779  NStr::Split(argv[1], " ", split_args);
1780 
1781  auto it = split_args.begin();
1782  while (it != split_args.end()) {
1783  auto next = it; ++next;
1784  if (next != split_args.end() &&
1785  ((it->front() == '"' && it->back() != '"') ||
1786  (it->front() == '\'' && it->back() != '\'')))
1787  {
1788  it->append(" "); it->append(*next);
1789  next = split_args.erase(next);
1790  } else it = next;
1791  }
1792  for (auto& rec : split_args) {
1793  if (rec.front() == '\'' && rec.back()=='\'')
1794  rec = rec.substr(1, rec.length()-2);
1795  }
1796  argc = 1 + split_args.size();
1797  new_argv.reserve(argc);
1798  new_argv.push_back(argv[0]);
1799  for (const string& s : split_args) {
1800  new_argv.push_back(s.c_str());
1801  std::cerr << s.c_str() << " ";
1802  }
1803  std::cerr << "\n";
1804 
1805 
1806  argv = new_argv.data();
1807  }
1808  #endif
1809  return CTbl2AsnApp().AppMain(argc, argv, 0, eDS_Default, "table2asn.conf");
1810 }
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
static const CDataLoadersUtil::TLoaders default_loaders
Definition: annotwriter.cpp:76
void g_LogGeneralParsingError(EDiagSev sev, const string &idString, const string &msg, objects::ILineErrorListener &listener)
Definition: utils.cpp:41
ncbi::TMaskedQueryRegions mask
AutoPtr –.
Definition: ncbimisc.hpp:401
CArgAllow_Strings –.
Definition: ncbiargs.hpp:1641
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgException –.
Definition: ncbiargs.hpp:120
CArgs –.
Definition: ncbiargs.hpp:379
const SBadResiduePositions & GetBadResiduePositions(void) const THROWS_NONE
static bool CleanArticle(CCit_art &, bool fix_initials, bool strip_serial)
static bool ConvertPubFeatsToPubDescs(CSeq_entry_Handle seh)
Convert full-length publication features to publication descriptors.
Definition: cleanup.cpp:3388
@ eScope_UseInPlace
Definition: cleanup.hpp:85
@ eClean_NoNcbiUserObjects
Definition: cleanup.hpp:75
static void SetupObjectManager(const CArgs &args, objects::CObjectManager &obj_mgr, TLoaders loaders=fDefault)
Set up the standard object manager data loaders according to the arguments provided above.
static void AddArgumentDescriptions(CArgDescriptions &arg_desc, TLoaders loaders=fDefault)
Add a standard set of arguments used to configure the object manager.
CDir –.
Definition: ncbifile.hpp:1695
CFile –.
Definition: ncbifile.hpp:1604
void Generate(const CSeq_entry_Handle &entry, CFlatItemOStream &item_os, const multiout &={})
EFormat
The formats are checked in the same order as declared here.
@ eBinaryASN
Binary ASN.1.
@ eGff3
GFF3, CGff3Reader.
@ eFasta
FASTA format sequence record, CFastaReader.
@ eTextASN
Text ASN.1.
@ eAlignment
Text alignment.
static EFormat Format(const string &path, EOnError onerror=eDefault)
Guess file format.
static CLineError * Create(EProblem eProblem, EDiagSev eSeverity, const std::string &strSeqId, unsigned int uLine, const std::string &strFeatureName=string(""), const std::string &strQualifierName=string(""), const std::string &strQualifierValue=string(""), const std::string &strErrorMessage=string(""), const TVecOfLines &vecOfOtherLines=TVecOfLines())
Use this because the constructor is protected.
Definition: line_error.cpp:42
CMaskFileName –.
Definition: ncbi_mask.hpp:107
CMask –.
Definition: ncbi_mask.hpp:59
void PutProgress(const string &sMessage, const Uint8 iNumDone, const Uint8 iNumTotal) override
This is used for processing progress messages.
bool PutMessage(const IObjtoolsMessage &message) override
static const set< TTypeInfo > kSupportedTypes
Definition: multireader.hpp:46
int GetCode() const override
Definition: table2asn.cpp:145
EDiagSev GetSeverity() const override
Definition: table2asn.cpp:144
void DumpAsXML(CNcbiOstream &out) const override
Definition: table2asn.cpp:139
void WriteAsXML(CNcbiOstream &out) const override
Definition: table2asn.cpp:134
int GetSubCode() const override
Definition: table2asn.cpp:146
CObjtoolsDiagMessage(const string &txt, EDiagSev sev)
Definition: table2asn.cpp:124
void Write(CNcbiOstream &out) const override
Definition: table2asn.cpp:132
string GetText() const override
Definition: table2asn.cpp:143
void Dump(CNcbiOstream &out) const override
Definition: table2asn.cpp:133
IObjtoolsMessage * Clone() const override
Definition: table2asn.cpp:130
CScope –.
Definition: scope.hpp:92
SeqMap related exceptions.
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
void Parentize(void)
Definition: Seq_entry.cpp:71
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
static void UpdateTaxonFromTable(objects::CBioseq &bioseq)
bool PutMessage(const IObjtoolsMessage &message) override
Definition: table2asn.cpp:168
void Post(const SDiagMessage &mess) override
Post message to handler.
Definition: table2asn.cpp:178
void PutProgress(const string &sMessage, const Uint8 iNumDone=0, const Uint8 iNumTotal=0) override
Definition: table2asn.cpp:159
void LoadCMTFile(const string &pathname, unique_ptr< CTable2AsnStructuredCommentsReader > &comments)
Definition: table2asn.cpp:1699
void LoadDSCFile(const string &pathname)
Definition: table2asn.cpp:1596
void ProcessAlignmentFile(CNcbiOstream *output)
Definition: table2asn.cpp:1440
void Init() override
Initialize the application.
Definition: table2asn.cpp:234
void ProcessOneEntry(CFormatGuess::EFormat inputFormat, CRef< CSerialObject > obj, CRef< CSerialObject > &result)
Definition: table2asn.cpp:960
void SetupAndOpenDataStreams()
Definition: table2asn.cpp:1245
void AddAnnots(CSeq_entry &entry)
Definition: table2asn.cpp:1666
void ProcessSecretFiles2Phase(CSeq_entry &result) const
Definition: table2asn.cpp:1591
void CloseDataStreams()
Definition: table2asn.cpp:1260
void SetupAndOpenDiagnosticStreams()
Definition: table2asn.cpp:1233
int Run() override
Run the application.
Definition: table2asn.cpp:471
void ProcessSecretFiles1Phase(bool readModsFromTitle, TAsyncToken &)
Definition: table2asn.cpp:1561
void MakeFlatFile(CSeq_entry_Handle seh, CRef< CSeq_submit > submit, std::ostream &ostream)
Definition: table2asn.cpp:1221
void xProcessSecretFiles1Phase(bool readModsFromTitle, CSeq_entry &result)
Definition: table2asn.cpp:1532
void Setup(const CArgs &args)
Definition: table2asn.cpp:1511
void ProcessOneFile(bool isAlignment, bool manageDiagnosticStreams=true, bool manageDataStream=true)
Definition: table2asn.cpp:1276
void x_SetAlnArgs(CArgDescriptions &arg_desc)
Definition: table2asn.cpp:200
void ReportUnusedSourceQuals()
Definition: table2asn.cpp:1760
void ProcessCMTFiles(CSeq_entry &result) const
Definition: table2asn.cpp:1604
void LoadAnnotMap(const string &pathname, TAnnotMap &annotMap)
Definition: table2asn.cpp:1642
bool ProcessOneDirectory(const CDir &directory, const CMask &mask, bool recurse)
Definition: table2asn.cpp:1472
void LoadPEPFile(const string &pathname)
Definition: table2asn.cpp:1612
void LoadAdditionalFiles()
Definition: table2asn.cpp:1709
void CloseDiagnosticStreams()
Definition: table2asn.cpp:1255
void xProcessOneFile(CFormatGuess::EFormat format, CRef< CSerialObject > pInputObject, TAnnotMap &annotMap, CNcbiOstream *output)
Definition: table2asn.cpp:1402
void ProcessSingleEntry(CFormatGuess::EFormat inputFormat, TAsyncToken &token)
Definition: table2asn.cpp:1106
void LoadPRTFile(const string &pathname)
Definition: table2asn.cpp:1630
size_t xGetNumThreads() const
Definition: table2asn.cpp:450
void LoadRNAFile(const string &pathname)
Definition: table2asn.cpp:1623
void ProcessTopEntry(CFormatGuess::EFormat inputFormat, bool need_update_date, CRef< CSeq_submit > &submit, CRef< CSeq_entry > &entry)
Definition: table2asn.cpp:1074
CTime –.
Definition: ncbitime.hpp:296
CVersionInfo –.
bool PutMessage(const IObjtoolsMessage &message) override
@ eProblem_GeneralParsingError
Definition: line_error.hpp:106
vector< unsigned int > TVecOfLines
Definition: line_error.hpp:128
virtual bool PutMessage(const IObjtoolsMessage &message)=0
const_iterator begin() const
Definition: map.hpp:151
bool empty() const
Definition: map.hpp:149
Definition: map.hpp:338
Include a standard set of the NCBI C++ Toolkit most basic headers.
constexpr auto begin(const ct_const_array< T, N > &in) noexcept
constexpr auto end(const ct_const_array< T, N > &in) noexcept
@ eTSA
@ eSubmitter
std::ofstream out("events_result.xml")
main entry point for tests
static void cleanup(void)
Definition: ct_dynamic.c:30
#define false
Definition: bool.h:36
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
static SQLCHAR output[256]
Definition: print.c:5
static const char * str(char *buf, int n)
Definition: stats.c:84
#define basename(path)
Definition: replacements.h:116
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:819
void Reset(void)
Remove all arguments.
Definition: ncbiargs.cpp:1910
void SetConstraint(const string &name, const CArgAllow *constraint, EConstraintNegate negate=eConstraint)
Set additional user defined constraint on argument value.
Definition: ncbiargs.cpp:2591
void SetDependency(const string &arg1, EDependency dep, const string &arg2)
Define a dependency.
Definition: ncbiargs.cpp:2618
bool Exist(const string &name) const
Check existence of argument description.
Definition: ncbiargs.cpp:1813
void AddOptionalKey(const string &name, const string &synopsis, const string &comment, EType type, TFlags flags=0)
Add description for optional key without default value.
Definition: ncbiargs.cpp:2427
void AddDefaultKey(const string &name, const string &synopsis, const string &comment, EType type, const string &default_value, TFlags flags=0, const string &env_var=kEmptyStr, const char *display_value=nullptr)
Add description for optional key with default value.
Definition: ncbiargs.cpp:2442
@ fAllowMultiple
Repeated key arguments are legal (use with AddKey)
Definition: ncbiargs.hpp:635
@ fHidden
Hide it in Usage.
Definition: ncbiargs.hpp:662
@ fUsageIfNoArgs
Force printing USAGE (and then exit) if no command line args are present.
Definition: ncbiargs.hpp:1029
@ eExcludes
One argument excludes another.
Definition: ncbiargs.hpp:957
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eOutputFile
Name of file (must be writable)
Definition: ncbiargs.hpp:596
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
void Write(string &str, TDiagWriteFlags flags=fNone) const
Binary OR of "EDiagWriteFlags".
Definition: ncbidiag.cpp:5355
EDiagSev m_Severity
Severity level.
Definition: ncbidiag.hpp:1651
TDiagPostFlags m_Flags
Bitwise OR of "EDiagPostFlag".
Definition: ncbidiag.hpp:1661
static const char * SeverityName(EDiagSev sev)
Get a common symbolic name for the severity levels.
void SetDiagHandler(CDiagHandler *handler, bool can_delete=true)
Set the diagnostic handler using the specified diagnostic handler class.
Definition: ncbidiag.cpp:6288
EDiagSev
Severity level for the posted diagnostics.
Definition: ncbidiag.hpp:650
void SetDiagStream(CNcbiOstream *os, bool quick_flush=true, FDiagCleanup cleanup=0, void *cleanup_data=0, const string &stream_name="")
Set diagnostic stream.
Definition: ncbidiag.cpp:8083
@ eDPF_IsNote
Print "Note[X]" severity name.
Definition: ncbidiag.hpp:757
@ fNoEndl
No end of line.
Definition: ncbidiag.hpp:1704
@ eDS_Default
Try standard log file (app.name + ".log") in /log/, use stderr on failure.
Definition: ncbidiag.hpp:1790
@ eDiag_Info
Informational message.
Definition: ncbidiag.hpp:651
@ eDiag_Error
Error message.
Definition: ncbidiag.hpp:653
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
@ eDiag_Fatal
Fatal error – guarantees exit (or abort)
Definition: ncbidiag.hpp:655
@ eDiag_Critical
Critical error message.
Definition: ncbidiag.hpp:654
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
const CException * GetPredecessor(void) const
Get "previous" exception from the backlog.
Definition: ncbiexpt.hpp:1041
static string CreateAbsolutePath(const string &path, ERelativeToWhat rtw=eRelativeToCwd)
Get an absolute path from some, possibly relative, path.
Definition: ncbifile.cpp:665
string GetBase(void) const
Get the base entry name without extension.
Definition: ncbifile.hpp:3924
Int8 GetLength(void) const
Get size of file.
Definition: ncbifile.cpp:3204
virtual bool Remove(TRemoveFlags flags=eRecursive) const
Remove a directory entry.
Definition: ncbifile.cpp:2595
static string AddTrailingPathSeparator(const string &path)
Add trailing path separator, if needed.
Definition: ncbifile.cpp:455
virtual bool Exists(void) const
Check if directory "dirname" exists.
Definition: ncbifile.hpp:4065
bool Create(TCreateFlags flags=fCreate_Default) const
Create the directory using "dirname" passed in the constructor.
Definition: ncbifile.cpp:4071
TEntries * GetEntriesPtr(const string &mask=kEmptyStr, TGetEntriesFlags flags=0) const
Get directory entries based on the specified "mask".
Definition: ncbifile.cpp:3856
const string & GetPath(void) const
Get entry path.
Definition: ncbifile.hpp:3910
virtual bool Exists(void) const
Check existence of file.
Definition: ncbifile.hpp:4038
static void SplitPath(const string &path, string *dir=0, string *base=0, string *ext=0)
Split a path string into its basic components.
Definition: ncbifile.cpp:358
@ eRelativeToCwd
Relative to the current working directory.
Definition: ncbifile.hpp:436
@ fIgnoreRecursive
Suppress "self recursive" elements (the directories "." and "..").
Definition: ncbifile.hpp:1755
@ fCreateObjects
Create appropriate subclasses of CDirEntry (CFile,CDir,...), not just CDirEntry objects.
Definition: ncbifile.hpp:1758
@ fIgnoreMissing
Ignore missing entries.
Definition: ncbifile.hpp:720
#define ENUM_METHOD_NAME(EnumName)
Definition: serialbase.hpp:994
virtual const CTypeInfo * GetThisTypeInfo(void) const =0
CTempString GetCurrentLine(void) const
static CRef< ILineReader > New(const string &filename)
Return a new ILineReader object corresponding to the given filename, taking "-" (but not "./-") to mean standard input.
Definition: line_reader.cpp:49
void ReadLine(void)
Definition: line_reader.hpp:88
virtual bool AtEOF(void) const =0
Indicates (negatively) whether there is any more input.
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CScope & GetScope(void) const
Get scope this handle belongs to.
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
TObjectType * GetPointer(void) THROWS_NONE
Get pointer.
Definition: ncbiobj.hpp:998
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – i.e., it points to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
Definition: ncbistre.hpp:500
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
#define NcbiCout
Definition: ncbistre.hpp:543
#define NcbiCerr
Definition: ncbistre.hpp:544
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3201
static string XmlEncode(const CTempString str, TXmlEncode flags=eXmlEnc_Contents)
Encode a string for XML.
Definition: ncbistr.cpp:4036
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
@ fWithCommas
Use commas as thousands separator.
Definition: ncbistr.hpp:254
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
virtual bool IsType(TTypeInfo type) const
Definition: typeinfo.cpp:314
void CONNECT_Init(const IRWRegistry *reg=0, CRWLock *lock=0, TConnectInitFlags flag=eConnectInit_OwnNothing, FSSLSetup ssl=0)
Init [X]CONNECT library with the specified "reg" and "lock" (ownership for either or both can be detailed in the "flag" parameter).
void Add(const string &mask)
Add an inclusion mask.
Definition: ncbi_mask.hpp:67
bool IsArticle(void) const
Check if variant Article is selected.
Definition: Pub_.hpp:629
TArticle & SetArticle(void)
Select the variant.
Definition: Pub_.cpp:239
bool IsSetClass(void) const
Check if a value has been assigned to Class data member.
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
TClass GetClass(void) const
Get the Class member data.
const TSet & GetSet(void) const
Get the variant data.
Definition: Seq_entry_.cpp:124
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
bool IsSetSeq_set(void) const
Check if a value has been assigned to Seq_set data member.
bool IsSet(void) const
Check if variant Set is selected.
Definition: Seq_entry_.hpp:263
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
@ eClass_pop_set
population study
@ eClass_phy_set
phylogenetic study
@ eClass_mut_set
set of mutations
@ eClass_eco_set
ecological sample study
@ eClass_nuc_prot
nuc acid and coded proteins
Definition: Bioseq_set_.hpp:99
@ eClass_small_genome_set
viral segments or mitochondrial minicircles
FILE * file
if(yy_accept[yy_current_state])
Lightweight interface for getting lines of data with minimal memory copying.
constexpr auto sort(_Init &&init)
Definition: fix_pub.hpp:45
const struct ncbi::grid::netcache::search::fields::EXPIRES expires
Magic spell ;-) needed for some weird compilers... very empiric.
void VisitAllFeatures(objects::CSeq_entry_EditHandle &entry_h, _M m)
Definition: visitors.hpp:120
void VisitAllBioseqs(objects::CSeq_entry &entry, _M &&m)
Definition: visitors.hpp:14
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
#define NCBI_SC_VERSION_PROXY
#define NCBI_TEAMCITY_BUILD_NUMBER_PROXY
Classes to match a string against a set of masks.
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
#define GetArgs
Avoid preprocessor name clash with the NCBI C Toolkit.
Definition: ncbiapp_api.hpp:54
Defines command line argument related classes.
Defines unified interface to application:
NCBI C++ stream class wrappers for triggering between "new" and "old" C++ stream libraries.
bool le(T x_, T y_, T round_)
Definition: njn_approx.hpp:84
static Format format
Definition: njn_ioutil.cpp:53
std::istream & in(std::istream &in_, double &x_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
The Object manager core.
static const char * suffix[]
Definition: pcregrep.c:408
void SortSeqDescr(CSeq_entry &entry)
void g_ApplyMods(CMemorySrcFileMap *pNamedSrcFileMap, CMemorySrcFileMap *pDefaultSrcFileMap, const string &commandLineStr, bool readModsFromTitle, bool isVerbose, CModHandler::EHandleExisting mergePolicy, ILineErrorListener *pEC, CSeq_entry &entry)
Definition: src_quals.cpp:778
SDiagMessage –.
Definition: ncbidiag.hpp:1599
CSeq_entry_Handle seh
Definition: async_token.hpp:53
CRef< CScope > scope
Definition: async_token.hpp:52
CRef< CSeq_submit > submit
Definition: async_token.hpp:50
atomic_bool * pPubLookupDone
Definition: async_token.hpp:59
CRef< CSeq_entry > top_entry
Definition: async_token.hpp:51
std::mutex * cleanup_mutex
Definition: async_token.hpp:60
static void s_PubCleanup(CRef< CPub > &pub)
Definition: table2asn.cpp:442
static bool s_UseHugeFileMode(const CTable2AsnContext &context, CFormatGuess::EFormat format)
Definition: table2asn.cpp:1265
int main(int argc, const char *argv[])
Definition: table2asn.cpp:1771
void g_LogDiagMessage(ILineErrorListener *logger, EDiagSev sev, const string &msg)
Definition: table2asn.cpp:188
static void s_FailOnBadInput(const string &specifics, IObjtoolsListener &listener)
Definition: table2asn.cpp:113
void g_LoadLinkageEvidence(const string &linkageEvidenceFilename, CGapsEditor::TCountToEvidenceMap &gapsizeToEvidence, ILineErrorListener *pEC)
@ fixedproducts
#define _ASSERT
int g(Seg_Gsm *spe, Seq_Mtf *psm, Thd_Gsm *tdg)
Definition: thrddgri.c:44
else result
Definition: token2.c:20
static CS_CONTEXT * context
Definition: will_convert.c:21
static wxAcceleratorEntry entries[3]
Modified on Mon May 13 04:36:25 2024 by modify_doxy.py rev. 669887