75 virtual void Init(
void);
76 virtual int Run(
void);
77 virtual void Exit(
void);
114 m_comp2len, m_comp2range_coll)
130 version_str+=
", AGP Specification v2.1";
132 str=
"Validate data in the AGP format:\n"
133 "https://www.ncbi.nlm.nih.gov/assembly/agp/AGP_Specification/\n"
135 "Version: " + version_str +
"\n"
137 "USAGE: agp_validate [-options] [FASTA files...] [AGP files...]\n"
139 "There are 3 validations modes:\n"
140 "no mode option: (default mode) report component, gap, scaffold and object statistics, perform checks\n"
141 " that do not require component sequences to be available in GenBank (see: -list).\n"
142 "-alt, -species: Check component Accessions, Lengths and Taxonomy ID using GenBank data;\n"
143 " -species allows components from different subspecies during Taxid checks.\n"
145 "-comp Check that the supplied object sequences (in FASTA files) match what can be\n"
146 " constructed from the AGP and the component sequences (in FASTA files or in GenBank).\n"
147 " Run \"agp_validate -comp\" to see the options for this mode.\n"
149 "OPTIONS (default and -alt modes):\n"
150 " -g Check that component names look like Nucleotide accessions\n"
151 " (this does not require components to be in GenBank).\n"
152 " -out FILE Save the AGP file, adding missing version 1 to the component accessions (need -alt),\n"
153 " or adding gaps where runs of Ns longer than 10 bp are found in components (need FASTA files).\n"
154 " -obj Use FASTA files to read names and lengths of objects (the default is components).\n"
155 " -v VER AGP version (1 or 2). The default is to choose automatically. Version 2 is chosen\n"
156 " when the linkage evidence (column 9) is not empty in the first gap line encountered.\n"
157 " -xml Report results in XML format.\n"
158 " -sub Treat serious warnings as errors, put summary and stats at the top.\n"
160 " Extra checks specific to an object type:\n"
161 " -un Unplaced/unlocalized scaffolds:\n"
162 " any single-component scaffold must use the whole component in orientation '+'\n"
163 " -scaf Scaffold from component AGP: no scaffold-breaking gaps allowed\n"
164 " -chr Chromosome from scaffold AGP: ONLY scaffold-breaking gaps allowed\n"
165 " Use both of the last 2 options in this order: -scaf Scaf_AGP_file(s) -chr Chr_AGP_file(s)\n"
166 " to check that all scaffolds in Scaf_AGP_file(s) are wholly included in Chr_AGP_file(s)\n"
169 " -list List error and warning messages.\n"
170 " -limit COUNT Print only the first COUNT messages of each type.\n"
171 " Default=100. To print all, use: -limit 0\n"
172 " -skip, -only WHAT Skip, or report only a particular error or warning.\n"
173 " -show WHAT Show the warning hidden by default (w40, w45, w46, w52).\n"
174 " 'WHAT' could be a part of the message text, an error code (e11, w22, etc; see -list),\n"
175 " or a keyword: all, warn, err, alt.\n"
177 "If component FASTA files are given in front of AGP files, also check that:\n"
178 "- component_id from AGP is present in FASTA;\n"
179 "- component_end does not exceed sequence length.\n"
180 "If FASTA files for objects are given (after -obj), check that:\n"
181 "- object_id from AGP is present in FASTA;\n"
182 "- object lengths in FASTA and in AGP match.\n"
194 auto arg_desc = make_unique<CArgDesc_agp_validate>(
GetVersion());
196 arg_desc->SetUsageContext(
198 "Validate AGP data",
false);
201 arg_desc->AddFlag(
"alt",
"");
203 arg_desc->AddFlag(
"g" ,
"");
204 arg_desc->AddFlag(
"obj" ,
"");
205 arg_desc->AddFlag(
"un" ,
"");
206 arg_desc->AddFlag(
"scaf",
"");
207 arg_desc->AddFlag(
"chr" ,
"");
208 arg_desc->AddFlag(
"comp",
"");
209 arg_desc->AddFlag(
"xml" ,
"");
210 arg_desc->AddFlag(
"sub" ,
"");
213 arg_desc->AddOptionalKey(
"loadlog",
"FILE",
214 "specifies where we write our loading log for -comp",
216 arg_desc->AddFlag(
"ignoreagponly",
"");
217 arg_desc->AddFlag(
"ignoreobjfileonly",
"");
218 arg_desc->AddDefaultKey(
"diffstofind",
"",
"",
221 arg_desc->AddFlag(
"species",
"allow components from different subspecies");
223 arg_desc->AddOptionalKey(
"out",
"FILE",
224 "add missing version 1 to component accessions",
227 arg_desc->AddOptionalKey(
"v",
"ver",
231 arg_desc->AddOptionalKey(
"skip",
"error_or_warning",
232 "Message or message code to skip",
236 arg_desc->AddOptionalKey(
"only",
"error_or_warning",
237 "Message or message code to print (hide other)",
241 arg_desc->AddOptionalKey(
"show",
"error_or_warning",
242 "Message or message code to print (if not printed by default)",
246 arg_desc->AddDefaultKey(
"limit",
"ErrorCount",
247 "Print at most ErrorCount lines with a particular error",
251 arg_desc->AddFlag(
"list",
"all possible errors and warnings");
254 arg_desc->AddExtra(0, 10000,
"files to be processed",
273 pAgpErr->PrintAllMessages(cout);
288 pAgpErr->m_out = error_details_out;
301 cerr <<
"Error -- cannot specify -un with -chr/-scaf.\n";
305 cerr <<
"Error -- cannot specify -chr/-scaf with -alt/-species.\n";
311 cerr <<
"Error -- -scaf and -chr must precede different files.\n";
317 else if( args[
"scaf"].
HasValue() ) {
323 cerr <<
"Error -- cannot specify -obj with -alt/-species.\n";
330 bool checkCompNames=args[
"g"].HasValue();
352 bool onlyNotSkip = args[
"only"].HasValue();
356 cerr <<
"Error -- cannot specify both -only and -skip.\n";
359 err_warn = &( args[
"skip"].GetStringList() );
360 action=
"Skipping messages:\n";
362 else if(onlyNotSkip) {
364 cerr <<
"Error -- cannot specify both -only and -show; please use multiple -only instead.\n";
368 err_warn = &( args[
"only"].GetStringList() );
370 action=
"Allowed messages:\n";
374 bool needHeading=
true;
375 for( CArgValue::TStringArray::const_iterator it =
376 err_warn->begin(); it != err_warn->end(); ++it
378 string res =
pAgpErr->SkipMsg(*it, onlyNotSkip);
380 cerr <<
"WARNING: no matches for " << *it <<
"\n";
384 if ( res[0] ==
' ' && needHeading) {
385 if(needHeading) cerr << action;
398 err_warn = &( args[
"show"].GetStringList() );
399 for( CArgValue::TStringArray::const_iterator it =
400 err_warn->begin(); it != err_warn->end(); ++it
407 args[
"limit"].HasValue() ? args[
"limit"].AsInteger() : 100;
410 if( args[
"v"].AsString()[0]==
'1' ) {
413 else if( args[
"v"].AsString()[0]==
'2' ) {
417 cerr <<
"Error -- invalid AGP version after -v (must start with 1 or 2).\n";
425 if( ! args[
"comp"] ) {
428 if( args[
"loadlog"] || args[
"ignoreagponly"] ||
429 args[
"ignoreobjfileonly"] ||
430 args[
"diffstofind"].AsInteger() > 0 )
432 cerr <<
"Error -- -comp mode options without -comp" << endl;
437 bool taxid_check_failed=
false;
439 cout <<
"<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n<page>\n";
454 else if(error_details_out) {
455 cout <<
"\n\n===== Details =====" << endl;
457 delete error_details_out;
468 if( ! filename.empty() && filename[0] !=
'-' ) {
474 if( args[
"loadlog"] ) {
475 comploadlog = args[
"loadlog"].AsString();
478 string agp_as_fasta_file;
480 agp_as_fasta_file = args[
"out"].AsString();
484 if( args[
"ignoreagponly"] ) {
487 if( args[
"ignoreobjfileonly"] ) {
491 int diffsToFind = args[
"diffstofind"].AsInteger();
496 agp_as_fasta_file, diffsToHide,
499 cerr <<
"AGP/FASTA comparison failed." << endl;
512 << s <<
" and length" << s <<
" loaded from FASTA." << endl;
516 runs_of_Ns += it->second.size();
521 if(!
m_use_xml) cout <<
"No runs of Ns longer than 10 bp found in FASTA sequences." << endl;
531 cout <<
"===== Reading Chromosome from scaffold AGP =====" << endl;
533 if(
out) *
out <<
"===== Chromosome from scaffold AGP =====" << endl;
540 cout <<
"===== Reading Scaffold from component AGP =====" << endl;
541 if(
out) *
out <<
"===== Scaffold from component AGP =====" << endl;
561 cerr <<
"Error -- second -chr is not supported.\n";
565 cerr <<
"Error -- -chr after a file, but no preceding -scaf. Expecting:\n"
566 <<
" -scaf Scaffold_AGP_file(s) -chr Chromosome_AGP_file(s)\n";
575 cout <<
"\n===== Reading Chromosome from scaffold AGP =====" << endl;
576 if(
out) *
out <<
"\n===== Chromosome from scaffold AGP =====" << endl;
590 istr.get(ch); istr.putback(ch);
632 if(
code==-1)
continue;
634 bool comp2len_check_failed=
false;
637 if( !agp_row->
IsGap() ) {
664 if(
code!=0 || comp2len_check_failed ||
673 pAgpErr->m_messages = tmp_messages;
691 string acc, acc_long;
696 int header_line_num=0;
703 bool mfa_bMasked=
false;
704 bool mfa_prevMasked=
false;
719 if(prev_len)
goto LengthRedefinedFa;
722 if(mfa_pos-mfa_firstMasked > 10)
723 range_coll +=
TSeqRange(mfa_firstMasked, mfa_pos-1);
725 if(!range_coll.
empty()) {
730 mfa_firstMasked=mfa_pos=0;
732 mfa_prevMasked=
false;
738 if(pos2<pos1) pos1 = pos2;
741 if(pos1>0 && line[pos1]==
'|') pos1--;
744 acc_long=line.substr(1, pos1);
752 cerr<<
"ERROR - expecting >fasta_header at start of file " << filename <<
", got:\n"
753 << line.substr(0, 100) <<
"\n\n";
759 cerr<<
"ERROR - non-alphabetic character in the FASTA:\n"
760 " file " << filename <<
"\n line " <<
line_num <<
"\n column " <<
i+1 <<
"\n\n";
765 mfa_bMasked =
toupper(line[
i]) ==
'N';
766 if(mfa_bMasked!=mfa_prevMasked) {
768 mfa_firstMasked=mfa_pos;
771 if(mfa_pos-mfa_firstMasked > 10)
772 range_coll +=
TSeqRange(mfa_firstMasked, mfa_pos-1);
775 mfa_prevMasked=mfa_bMasked;
798 if(prev_len)
goto LengthRedefinedFa;
801 if(mfa_pos-mfa_firstMasked > 10)
802 range_coll +=
TSeqRange(mfa_firstMasked, mfa_pos-1);
804 if(!range_coll.
empty()) {
809 cerr<<
"WARNING - empty file " << filename <<
"\n";
814 cerr<<
"ERROR - sequence length redefined from " << prev_len <<
" to " <<
len <<
"\n"
815 <<
" sequence id: " << acc_long <<
"\n"
816 <<
" File: " << filename <<
"\n"
817 <<
" Lines: "<< header_line_num <<
".." <<
line_num <<
"\n\n";
831 if(runs_of_Ns && runs_of_Ns->
size()) {
834 cerr <<
"FATAL: need AGP version (for adding gap lines). Please use -v 1 or -v 2\n";
848 "\t1\t100\t1\tN\t100\t"+
861 (*m_out) << tmp_row->
ToString() << endl;
866 tmp_gap_row->
object_beg = comp2obj_ofs + it->GetFrom();
867 tmp_gap_row->
object_end = comp2obj_ofs + it->GetTo();
868 tmp_gap_row->
gap_length = it->GetTo() - it->GetFrom() + 1;
871 (*m_out) << tmp_gap_row->
ToString(
true) << endl;
887 (*m_out) << tmp_row->
ToString() << endl;
892 (*m_out) << s << endl;
899 int main(
int argc,
const char* argv[])
901 if(argc==1+1 &&
string(
"-comp")==argv[1]) {
902 cout <<
"agp_validate -comp (formerly agp_fasta_compare):\n"
904 "check that the object sequences FASTA matches the AGP.\n"
907 "USAGE: agp_validate -comp [-options] FASTA file(s)... AGP file(s)...\n"
909 " -loadlog OUTPUT_FILE Save the list of all loaded sequences.\n"
910 " -ignoreagponly Do not report objects present in AGP file(s) only.\n"
911 " -ignoreobjfileonly Do not report objects present in FASTA file(s) only.\n"
912 " -diffstofind NUM (EXPERIMENTAL) If specified, list the first NUM lines of each difference.\n"
913 " -out OUTPUT_FILE Save the assembled AGP sequences as FASTA.\n"
915 "FASTA files for components can be provided (along with object FASTA files) if components are not yet in GenBank.\n"
void OverrideLenIfAccession(const string &acc, int &in_out_len)
string ExtractAccession(const string &long_acc)
@ eAgpVersion_auto
auto-detect using the first gap line
@ eAgpVersion_1_1
AGP spec 1.1.
@ eAgpVersion_2_0
AGP spec 2.0 or later.
CRef< CAgpErrEx > pAgpErr
int main(int argc, const char *argv[])
static unsigned int line_num
virtual ~CAgpCompSpanSplitter()
CAgpCompSpanSplitter(CNcbiOstream *out=NULL)
virtual void SaveRow(const string &s, CRef< CAgpRow > row, TRangeColl *runs_of_Ns)
Correctly print multiple errors and warnings on consecutive lines; suppress undesired or highly repet...
EResult Run(const std::list< std::string > &files, const std::string &loadlog, const std::string &agp_as_fasta_file, TDiffsToHide diffsToHide, int diffs_to_find)
@ fDiffsToHide_ObjfileOnly
virtual void SetVersion(EAgpVersion ver)
Change what AGP version to use for the next input that's read.
virtual int ReadStream(CNcbiIstream &is, EFinalize eFinalize=eFinalize_Yes)
Read an AGP file from the given input stream.
string & GetComponentId()
static bool CheckComponentEnd(const string &comp_id, TAgpPos comp_end, TAgpLen comp_len, CAgpErr &agp_err)
static CRef< CAgpRow > New(CAgpErr *arg, EAgpVersion agp_version=eAgpVersion_auto, CAgpReader *reader=nullptr)
string ToString(bool reorder_linkage_evidences=false)
static bool IsGap(char c)
int FromString(const string &line)
CRef< CAgpRow > Clone(void) const
CAgpValidateReader m_reader
void x_LoadLenFa(CNcbiIstream &istr, const string &filename)
enum CAgpValidateApplication::EValidationType m_ValidationType
EAgpVersion m_agp_version
TMapStrRangeColl m_comp2range_coll
CAgpValidateApplication()
virtual void Init(void)
Initialize the application.
virtual int Run(void)
Run the application.
void x_ReportFastaSeqCount()
CAltValidator * m_AltValidator
void x_ValidateUsingFiles(const CArgs &args, CNcbiOstream *out=NULL)
void x_ValidateFile(CNcbiIstream &istr)
virtual void Exit(void)
Cleanup on application exit.
void PrintTotals(CNcbiOstream &out=cout, bool use_xml=false)
void Reset(bool for_chr_from_scaf=false)
void SetRowOutput(IAgpRowOutput *row_output)
bool IsSetOstream(void) const
bool CheckTaxids(CNcbiOstream &out, bool use_xml)
void QueueLine(const string &orig_line, const string &comp_id, int line_num, int comp_end)
void SetOstream(CNcbiOstream *pOstr)
void PrintTotals(CNcbiOstream &out, bool use_xml)
void SetSpeciesLevelTaxonCheck(bool check=true)
CVersionInfo m_VersionInfo
string & PrintUsage(string &str, bool) const
Print usage message to end of specified string.
CArgDesc_agp_validate(CVersionInfo &&versionInfo)
TAgpLen AddCompLen(const string &acc, TAgpLen len, bool increment_count=true)
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string. Sample usage:
TRangeVector::const_iterator const_iterator
const_iterator end() const
const_iterator begin() const
container_type::iterator iterator
const_iterator begin() const
const_iterator end() const
const_iterator find(const key_type &key) const
std::ofstream out("events_result.xml")
main entry point for tests
const CNcbiRegistry & GetConfig(void) const
Get the application's cached configuration parameters (read-only).
unsigned int TSeqPos
Type for sequence locations and lengths.
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
CVersionInfo GetVersion(void) const
Get the program version information.
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
void SetVersion(const CVersionInfo &version)
Set the version number for the program.
vector< string > TStringArray
Some values types can contain several value lists.
size_t GetNExtra(void) const
Get the number of unnamed positional (a.k.a. extra) args.
@ fAllowMultiple
Repeated key arguments are legal (use with AddKey)
@ eString
An arbitrary string.
@ eOutputFile
Name of file (must be writable)
@ eInteger
Convertible into an integer number (int or Int8)
void SetDiagStream(CNcbiOstream *os, bool quick_flush=true, FDiagCleanup cleanup=0, void *cleanup_data=0, const string &stream_name="")
Set diagnostic stream.
CRange< TSeqPos > TSeqRange
typedefs for sequence ranges
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
CNcbiIstream & NcbiGetline(CNcbiIstream &is, string &str, char delim, string::size_type *count=NULL)
Read from "is" to "str" up to the delimiter symbol "delim" (or EOF)
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
CNcbistrstream_Base< IO_PREFIX::ostrstream, IOS_BASE::out > CNcbiOstrstream
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
NCBI_NS_STD::string::size_type SIZE_TYPE
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
void CONNECT_Init(const IRWRegistry *reg=0, CRWLock *lock=0, TConnectInitFlags flag=eConnectInit_OwnNothing, FSSLSetup ssl=0)
Init [X]CONNECT library with the specified "reg" and "lock" (ownership for either or both can be deta...
virtual string Print(void) const
Print version information.
#define NCBI_SC_VERSION_PROXY
#define NCBI_TEAMCITY_BUILD_NUMBER_PROXY
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
static const char * str(char *buf, int n)