45 arg_desc->SetUsageContext
47 "Microbial Genome Submission Check Tool (subcheck) is for the validation of "
48 "genome records prior to submission to GenBank. It utilizes a series of "
49 "self-consistency checks as well as comparison of submitted annotations to "
50 "computed annotations. Some of specified computed annotations could be "
51 "pre-computed using BLAST and its modifications and tRNAscanSE. Currently "
52 "there is no specific tool for predicting rRNA annotations. Please use the "
53 "format specified in documentation"
59 "input file in the ASN.1 format, must be either Seq-entry or Seq-submit",
62 arg_desc->AddOptionalKey
64 "output file in the ASN.1 format, of the same type (Seq-entry or Seq-submit)",
69 "keep frameshifted sequences and make misc_features at the same time. Needs editing after run!");
71 arg_desc->AddOptionalKey
72 (
"inblast",
"blast_res_proteins",
73 "input file which contains the standard BLAST output results (ran with -IT option) "
74 "for all query proteins "
75 "sequences specified in the input genome against a protein database (recommended: bact_prot "
76 "database of Refseq proteins supplied with the distributed standalone version of this tool)",
79 arg_desc->AddOptionalKey
80 (
"inblastcdd",
"blast_res_cdd",
81 "input file which contains the standard BLAST output results for all query proteins "
82 "sequences specified in input_asn against the CDD database",
85 arg_desc->AddOptionalKey
86 (
"intrna",
"input_trna",
87 "input tRNAscan predictions in default output format, default value is <-in parameter>.nfsa.tRNA",
90 arg_desc->AddOptionalKey
91 (
"inrrna",
"input_rrna",
92 "input ribosomal RNA predictions (5S, 16S, 23S), see the manual for format, default value is <-in parameter>.nfsa.rRNA",
95 arg_desc->AddOptionalKey(
96 "parentacc",
"parent_genome_accession",
97 "Refseq accession of the genome which protein annotations need to be excluded from BLAST output results",
100 arg_desc->AddOptionalKey(
101 "inparents",
"InputParentsFile",
102 "contains a list of all protein accessions/GIs for each Refseq accession/GI",
105 arg_desc->AddOptionalKey(
106 "intagmap",
"InputTagMap",
107 "use the file to map tags in BLAST",
110 arg_desc->AddDefaultKey(
"infmt",
"InputFormat",
"format of input file",
112 arg_desc->SetConstraint
115 arg_desc->AddOptionalKey
116 (
"outTbl",
"OutputTblFile",
117 "name of file to write additional TBL output (/dev/null by default)",
120 arg_desc->AddOptionalKey
121 (
"outPartial",
"OutputFilePartial",
122 "name of the output file for reporting \"partial hit\" problems",
125 arg_desc->AddOptionalKey
126 (
"outOverlap",
"OutputFileOverlap",
127 "name of the output file for reporting overlap problems",
130 arg_desc->AddOptionalKey
131 (
"outRnaOverlap",
"OutputFileRnaOverlap",
132 "name of the output file for reporting RNA overlap problems",
135 arg_desc->AddOptionalKey
136 (
"outCompleteOverlap",
"OutputFileCompleteOverlap",
137 "name of the output file for reporting complete overlap problems",
140 arg_desc->AddOptionalKey
141 (
"outOther",
"OutputFileOther",
142 "name of the output file for reporting other problems",
145 arg_desc->AddDefaultKey(
"outfmt",
"OutputFormat",
"format of output file",
147 arg_desc->SetConstraint
151 arg_desc->AddDefaultKey(
152 "verbosity",
"Verbosity",
153 "Verbosity level threshold",
157 arg_desc->AddDefaultKey(
158 "small_tails_threshold",
"small_tails_threshold",
159 "the sum of the left and right tails outside the aligned region for "
160 "the given sum less than this threshold will make it \"small tails\"",
163 arg_desc->AddDefaultKey(
164 "n_best_hit",
"n_best_hit",
165 "number of BLAST best hits imported for each sequence",
168 arg_desc->AddDefaultKey(
169 "m_eThreshold",
"m_eThreshold",
170 "only CDD hits below this threshold will be used for partial hit definition",
173 arg_desc->AddDefaultKey(
174 "m_entireThreshold",
"m_entireThreshold",
175 "at least this part of the query needs to be in the alignment to be considered for partial hit candidate",
178 arg_desc->AddDefaultKey(
179 "m_partThreshold",
"m_partThreshold",
180 "if aligned region with CDD is less than this threshold, this hit will be considered for partial hit candidate",
183 arg_desc->AddDefaultKey(
184 "m_rna_overlapThreshold",
"m_rna_overlapThreshold",
185 "if protein and RNA annotations overlapping more than that threshold, it will be reported",
188 arg_desc->AddDefaultKey(
189 "m_cds_overlapThreshold",
"m_cds_overlapThreshold",
190 "if CDS annotations overlapping more than that threshold, it will be reported",
193 arg_desc->AddDefaultKey(
194 "m_trnascan_scoreThreshold",
"m_trnascan_scoreThreshold",
195 "tRNA-scan predictions below that threshold are ignored",
198 arg_desc->AddDefaultKey(
199 "m_shortProteinThreshold",
"m_shortProteinThreshold",
200 "proteins shorter than that will be reported and removed",
242 string base = args[
"in"].AsString();
253 unique_ptr<CObjectIStream>
in
255 args[
"in"].AsInputFile()));
268 NcbiCerr <<
"WARNING: tbl file will be read but nothing more will be done." <<
NcbiEndl;
269 if(!
m_tbl.
Read(args[
"in"].AsInputFile()))
271 NcbiCerr <<
"FATAL: tbl file does not have any records or have been corrupted" <<
NcbiEndl;
279 NcbiCerr <<
"FATAL: only tbl, Seq-submit or Seq-entry formats are accepted at this time. Seq-set has to be present as well" <<
NcbiEndl;
299 if( args[
"out"].
HasValue() &&
false)
301 unique_ptr<CObjectOStream>
out
303 args[
"out"].AsOutputFile()));
336 if(!
ReadBlast(args[
"inblast"].AsString().c_str(), blastMap))
349 ReadBlast(args[
"inblastcdd"].AsString().c_str(), cddMap);
358 tRNA_file = args[
"intrna"].AsString();
363 tRNA_file +=
".nfsa.tRNA";
374 rRNA_file =args[
"inrrna"].AsString();
379 rRNA_file +=
".nfsa.rRNA";
411 NcbiCerr <<
"Dumping FASTA file for subsequent HTML blast output..." <<
NcbiEndl;
416 bool report_and_forget =
false;
419 string sout = args[
"outPartial"].HasValue() ?
420 args[
"outPartial"].AsString() :
421 base +
".partial.problems.log";
433 string sout = args[
"outOverlap"].HasValue() ?
434 args[
"outOverlap"].AsString() :
435 base +
".overlap.problems.log";
447 string sout = args[
"outRnaOverlap"].HasValue() ?
448 args[
"outRnaOverlap"].AsString() :
449 base +
".rna.overlap.problems.log";
462 string sout = args[
"outCompleteOverlap"].HasValue() ?
463 args[
"outCompleteOverlap"].AsString() :
464 base +
".complete.overlap.problems.log";
468 <<
"(eCompleteOverlap)"
476 string sout = base +
".overlap.resolved.problems.log";
480 <<
"(eRemoveOverlap)"
488 string sout = base +
".tRNA.missing.log";
502 string sout = base +
".tRNA.bad.strand.log";
506 <<
"(eTRNABadStrand)"
514 string sout = base +
".tRNA.undef.strand.log";
518 <<
"(eTRNAUndefStrand)"
526 string sout = base +
".tRNA.complete.mismatch.log";
530 <<
"(eTRNAComMismatch)"
538 string sout = base +
".tRNA.mismatch.log";
550 string sout = base +
".short.annotation.log";
566 string sout = args[
"outOther"].HasValue() ?
567 args[
"outOther"].AsString() :
568 base +
".frameshifts.problems.log";
572 <<
"(eRelFrameShift)"
606 args[
"out"].AsOutputFile().seekp(0);
607 unique_ptr<CObjectOStream>
out
609 args[
"out"].AsOutputFile()));
void printGeneralInfo(ostream &out=NcbiCerr)
int CollectFrameshiftedSeqs(map< string, string > &problem_names)
static int m_verbosity_threshold
static bool PrintDetails(int current_verbosity=m_current_verbosity)
map< string, string > m_tagmap
static stack< int > m_saved_verbosity
list< long > m_previous_genome
int ProcessCDD(map< string, blastStr > &blastMap)
int ReadRRNA2(const string &file)
static double m_trnascan_scoreThreshold
int simple_overlaps(void)
static int m_current_verbosity
bool ReadPreviousAcc(const string &file, list< long > &input_acc)
int RemoveProblems(map< string, string > &problem_seqs, LocMap &loc_map)
virtual void Init(void)
Initialize the application.
static int m_cds_overlapThreshold
int CopyInfoFromGenesToProteins(void)
===========================================================================
int ReadBlast(const char *file, map< string, blastStr > &blastMap)
static double m_entireThreshold
static int m_rna_overlapThreshold
static ECoreDataType getCoreDataType(istream &in)
virtual int Run(void)
Run the application.
int StoreBlast(map< string, blastStr > &blastMap)
int AnalyzeSeqsViaBioseqs(bool in_pool_prot, bool against_prot)
TSimpleSeqs m_extRNAtable2
int ReadTagMap(const char *file)
int ReadParents(CNcbiIstream &in, const list< long > &nacc)
static bool less_simple_seq(const TSimpleSeq &first, const TSimpleSeq &second)
static int m_shortProteinThreshold
static void PopVerbosity(void)
ECoreDataType m_coreDataType
void reportProblems(const bool report_and_forget, diagMap &diag, ostream &out, const CBioseq::TAnnot &annots, const EProblem type)
static double m_small_tails_threshold
static double m_partThreshold
static double m_eThreshold
void dump_fasta_for_pretty_blast(diagMap &diag)
int ReadTRNA2(const string &file)
static void PushVerbosity(void)
std::ofstream out("events_result.xml")
main entry point for tests
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
string & Print(string &str) const
Print (append) all arguments to the string "str" and return "str".
@ fPreOpen
Open file right away; for eInputFile, eOutputFile, eIOFile.
@ eInputFile
Name of file (must exist and be readable)
@ eDouble
Convertible into a floating point number (double)
@ eString
An arbitrary string.
@ eOutputFile
Name of file (must be writable)
@ eInteger
Convertible into an integer number (int or Int8)
static CObjectOStream * Open(ESerialDataFormat format, CNcbiOstream &outStream, bool deleteOutStream)
Create serial object writer and attach it to an output stream.
static CObjectIStream * Open(ESerialDataFormat format, CNcbiIstream &inStream, bool deleteInStream)
Create serial object reader and attach it to an input stream.
@ eOverlap
CSeq_locs overlap.
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
void SetData(TData &value)
Assign a value to Data data member.
std::istream & in(std::istream &in_, double &x_)
ESerialDataFormat s_GetFormat(const string &name)