NCBI C++ ToolKit
agp_validate.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: agp_validate.cpp 100571 2023-08-11 13:06:42Z gotvyans $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:
27  * Victor Sapojnikov
28  *
29  * File Description:
30  * Validate AGP data. A command line option to choose either context
31  * or GenBank validation. Context validation uses only the information
32  * in the AGP file. GenBank validation queries sequence length and taxid
33  * via ObjectManager or CEntrez2Client.
34  *
35  */
36 
37 #include <ncbi_pch.hpp>
38 #include <common/ncbi_source_ver.h>
39 
40 #include <corelib/ncbiapp.hpp>
43 
44 #include "AltValidator.hpp"
45 #include "AgpFastaComparator.hpp"
46 
48 
50 
52 
54 {
55 public:
58  string object_name;
61  {
62  m_out=out;
63  part_number=0;
64  comp_or_gap_printed=false;
65  }
66 
67  virtual void SaveRow(const string& s, CRef<CAgpRow> row, TRangeColl* runs_of_Ns);
68 
69  virtual ~CAgpCompSpanSplitter() {}
70 };
71 
73 {
74 private:
75  virtual void Init(void);
76  virtual int Run(void);
77  virtual void Exit(void);
78  //string Run(const CArgs& args);
79 
82  bool m_use_xml;
83 
88 
91 
92  //void x_LoadLen (CNcbiIstream& istr, const string& filename);
93  void x_LoadLenFa(CNcbiIstream& istr, const string& filename);
94 
96  //CAgpContextValidator* m_ContextValidator;
98 
99  // out is only for printing headers: "Reading Chromosome from scaffold / Scaffold from component AGP"
100  // (which are only used with -scaf and -chr)
101  void x_ValidateUsingFiles(const CArgs& args, CNcbiOstream* out=NULL);
102 
103  void x_ValidateFile(CNcbiIstream& istr);
104  void x_ReportFastaSeqCount();
105 
106 public:
108 };
109 
111  : m_agp_version(eAgpVersion_auto),
112  m_use_xml(false),
113  m_reader( (pAgpErr.Reset(new CAgpErrEx), *pAgpErr),
114  m_comp2len, m_comp2range_coll)
115 {
117 }
118 
119 // Print a nicer usage message
121 {
122 public:
125  m_VersionInfo(move(versionInfo)) {}
126 
127  string& PrintUsage(string& str, bool /*detailed*/) const
128  {
129  auto version_str = m_VersionInfo.Print();
130  version_str+=", AGP Specification v2.1";
131 
132  str="Validate data in the AGP format:\n"
133  "https://www.ncbi.nlm.nih.gov/assembly/agp/AGP_Specification/\n"
134  "\n"
135  "Version: " + version_str + "\n"
136  "\n"
137  "USAGE: agp_validate [-options] [FASTA files...] [AGP files...]\n"
138  "\n"
139  "There are 3 validations modes:\n"
140  "no mode option: (default mode) report component, gap, scaffold and object statistics, perform checks\n"
141  " that do not require component sequences to be available in GenBank (see: -list).\n"
142  "-alt, -species: Check component Accessions, Lengths and Taxonomy ID using GenBank data;\n"
143  " -species allows components from different subspecies during Taxid checks.\n"
144  //"-comp Check that the supplied object sequences (in FASTA or ASN.1 file) match what can be\n"
145  "-comp Check that the supplied object sequences (in FASTA files) match what can be\n"
146  " constructed from the AGP and the component sequences (in FASTA files or in GenBank).\n"
147  " Run \"agp_validate -comp\" to see the options for this mode.\n"
148  "\n"
149  "OPTIONS (default and -alt modes):\n"
150  " -g Check that component names look like Nucleotide accessions\n"
151  " (this does not require components to be in GenBank).\n"
152  " -out FILE Save the AGP file, adding missing version 1 to the component accessions (need -alt),\n"
153  " or adding gaps where runs of Ns longer than 10 bp are found in components (need FASTA files).\n"
154  " -obj Use FASTA files to read names and lengths of objects (the default is components).\n"
155  " -v VER AGP version (1 or 2). The default is to choose automatically. Version 2 is chosen\n"
156  " when the linkage evidence (column 9) is not empty in the first gap line encountered.\n"
157  " -xml Report results in XML format.\n"
158  " -sub Treat serious warnings as errors, put summary and stats at the top.\n"
159  "\n"
160  " Extra checks specific to an object type:\n"
161  " -un Unplaced/unlocalized scaffolds:\n"
162  " any single-component scaffold must use the whole component in orientation '+'\n"
163  " -scaf Scaffold from component AGP: no scaffold-breaking gaps allowed\n"
164  " -chr Chromosome from scaffold AGP: ONLY scaffold-breaking gaps allowed\n" // + -cc
165  " Use both of the last 2 options in this order: -scaf Scaf_AGP_file(s) -chr Chr_AGP_file(s)\n"
166  " to check that all scaffolds in Scaf_AGP_file(s) are wholly included in Chr_AGP_file(s)\n"
167  //" -cc Chromosome from component: check telomere/centromere/short-arm gap counts per chromosome\n"
168  "\n"
169  " -list List error and warning messages.\n"
170  " -limit COUNT Print only the first COUNT messages of each type.\n"
171  " Default=100. To print all, use: -limit 0\n"
172  " -skip, -only WHAT Skip, or report only a particular error or warning.\n"
173  " -show WHAT Show the warning hidden by default (w40, w45, w46, w52).\n"
174  " 'WHAT' could be a part of the message text, an error code (e11, w22, etc; see -list),\n"
175  " or a keyword: all, warn, err, alt.\n"
176  "\n"
177  "If component FASTA files are given in front of AGP files, also check that:\n"
178  "- component_id from AGP is present in FASTA;\n"
179  "- component_end does not exceed sequence length.\n"
180  "If FASTA files for objects are given (after -obj), check that:\n"
181  "- object_id from AGP is present in FASTA;\n"
182  "- object lengths in FASTA and in AGP match.\n"
183  "\n"
184  ;
185  return str;
186  // To do: -taxon "taxname or taxid" ?
187  }
188 private:
190 };
191 
193 {
194  auto arg_desc = make_unique<CArgDesc_agp_validate>(GetVersion());
195 
196  arg_desc->SetUsageContext(
197  GetArguments().GetProgramBasename(),
198  "Validate AGP data", false);
199 
200  // component_id checks that involve GenBank: Accession Length Taxid
201  arg_desc->AddFlag("alt", "");
202 
203  arg_desc->AddFlag("g" , "");
204  arg_desc->AddFlag("obj" , "");
205  arg_desc->AddFlag("un" , "");
206  arg_desc->AddFlag("scaf", "");
207  arg_desc->AddFlag("chr" , "");
208  arg_desc->AddFlag("comp", "");
209  arg_desc->AddFlag("xml" , "");
210  arg_desc->AddFlag("sub" , "");
211 
212  // -comp args
213  arg_desc->AddOptionalKey( "loadlog", "FILE",
214  "specifies where we write our loading log for -comp",
216  arg_desc->AddFlag("ignoreagponly", "");
217  arg_desc->AddFlag("ignoreobjfileonly", "");
218  arg_desc->AddDefaultKey( "diffstofind", "", "",
220 
221  arg_desc->AddFlag("species", "allow components from different subspecies");
222 
223  arg_desc->AddOptionalKey( "out", "FILE",
224  "add missing version 1 to component accessions",
226 
227  arg_desc->AddOptionalKey( "v", "ver",
228  "AGP version",
230 
231  arg_desc->AddOptionalKey( "skip", "error_or_warning",
232  "Message or message code to skip",
235 
236  arg_desc->AddOptionalKey( "only", "error_or_warning",
237  "Message or message code to print (hide other)",
240 
241  arg_desc->AddOptionalKey( "show", "error_or_warning",
242  "Message or message code to print (if not printed by default)",
245 
246  arg_desc->AddDefaultKey("limit", "ErrorCount",
247  "Print at most ErrorCount lines with a particular error",
249  "100");
250 
251  arg_desc->AddFlag("list", "all possible errors and warnings");
252 
253  // file list for file processing
254  arg_desc->AddExtra(0, 10000, "files to be processed",
256  //CArgDescriptions::eInputFile
257  );
258  // Setup arg.descriptions for this application
259  SetupArgDescriptions(arg_desc.release());
260 
261 }
262 
263 
265 {
266  //// Setup registry, error log, MT-lock for CONNECT library
268 
269  //// Process command line arguments
270  const CArgs& args = GetArgs();
271 
272  if( args["list"].HasValue() ) {
273  pAgpErr->PrintAllMessages(cout);
274  exit(0);
275  }
276 
277  if( args["xml"].HasValue() ) {
278  m_use_xml=true;
279  pAgpErr->m_use_xml=true;
280  pAgpErr->m_out = &cout; // not the default &cerr
281  }
282 
283  CNcbiOstrstream* error_details_out=NULL; // using cerr or cout directly
284  if( args["sub"].HasValue() ) {
285  pAgpErr->m_strict=true;
286  if(!m_use_xml) {
287  error_details_out = new CNcbiOstrstream();
288  pAgpErr->m_out = error_details_out;
289  }
290  }
291 
292  m_reader.m_CheckObjLen=args["obj"].HasValue();
293  m_reader.m_unplaced =args["un" ].HasValue();
294 
295  if (m_reader.m_unplaced) {
296  pAgpErr->UpgradeToError(CAgpErrEx::W_SingleOriNotPlus);
297  }
298 
299  if(args["chr" ].HasValue() || args["scaf" ].HasValue()) {
300  if( m_reader.m_unplaced ) {
301  cerr << "Error -- cannot specify -un with -chr/-scaf.\n";
302  exit(1);
303  }
304  if( args["alt"].HasValue() || args["species"].HasValue() ) {
305  cerr << "Error -- cannot specify -chr/-scaf with -alt/-species.\n";
306  exit(1);
307  }
308  }
309  if( args["chr"].HasValue() ) {
310  if( args["scaf"].HasValue() ) {
311  cerr << "Error -- -scaf and -chr must precede different files.\n";
312  exit(1);
313  }
314  m_reader.m_is_chr=true;
316  }
317  else if( args["scaf"].HasValue() ) {
319  }
320 
321  if( args["alt"].HasValue() || args["species"].HasValue() ) {
322  if(m_reader.m_CheckObjLen) {
323  cerr << "Error -- cannot specify -obj with -alt/-species.\n";
324  exit(1);
325  }
327  }
328  else {
330  bool checkCompNames=args["g"].HasValue();
331  // m_ContextValidator = new CAgpContextValidator(checkCompNames);
332  if(checkCompNames) {
333  // also print WGS component_id/component_type mismatches.
334  pAgpErr->SkipMsg(CAgpErr::W_CompIsWgsTypeIsNot, true);
335  pAgpErr->SkipMsg(CAgpErr::W_CompIsNotWgsTypeIs, true);
337  }
338 
339  }
340  if(m_ValidationType & VT_Acc) {
342  m_AltValidator->Init();
343  if( args["species"].HasValue() ) {
345  }
346  if( args["out"].HasValue() ) {
347  m_AltValidator->SetOstream(&(args["out"].AsOutputFile()));
348  }
349  }
350 
351  const CArgValue::TStringArray* err_warn=NULL;
352  bool onlyNotSkip = args["only"].HasValue();
353  string action;
354  if( args["skip"].HasValue() ) {
355  if( onlyNotSkip ) {
356  cerr << "Error -- cannot specify both -only and -skip.\n";
357  exit(1);
358  }
359  err_warn = &( args["skip"].GetStringList() );
360  action="Skipping messages:\n";
361  }
362  else if(onlyNotSkip) {
363  if( args["show"].HasValue() ) {
364  cerr << "Error -- cannot specify both -only and -show; please use multiple -only instead.\n";
365  exit(1);
366  }
367 
368  err_warn = &( args["only"].GetStringList() );
369  pAgpErr->SkipMsg("all");
370  action="Allowed messages:\n";
371  }
372  if(err_warn) {
373  // Inform pAgpErr what to skip; show messages that we skip.
374  bool needHeading=true; // avoid printing >action when not needed
375  for( CArgValue::TStringArray::const_iterator it =
376  err_warn->begin(); it != err_warn->end(); ++it
377  ) {
378  string res = pAgpErr->SkipMsg(*it, onlyNotSkip);
379  if(res=="") {
380  cerr << "WARNING: no matches for " << *it << "\n";
381  needHeading=true;
382  }
383  else {
384  if ( res[0] == ' ' && needHeading) {
385  if(needHeading) cerr << action;
386  cerr << res;
387  needHeading=false;
388  }
389  else {
390  cerr << res << "\n";
391  needHeading=true;
392  }
393  }
394  }
395  }
396 
397  if( args["show"].HasValue() ) {
398  err_warn = &( args["show"].GetStringList() );
399  for( CArgValue::TStringArray::const_iterator it =
400  err_warn->begin(); it != err_warn->end(); ++it
401  ) {
402  pAgpErr->SkipMsg(*it, true);
403  }
404  }
405 
406  pAgpErr->m_MaxRepeat =
407  args["limit"].HasValue() ? args["limit"].AsInteger() : 100;
408 
409  if(args["v"].HasValue() ) {
410  if( args["v"].AsString()[0]=='1' ) {
412  }
413  else if( args["v"].AsString()[0]=='2' ) {
415  }
416  else {
417  cerr << "Error -- invalid AGP version after -v (must start with 1 or 2).\n";
418  exit(1);
419  }
420  }
421  else {
422  m_agp_version=eAgpVersion_auto; // save for CAgpRow; it is default for CAgpValidateReader
423  }
424 
425  if( ! args["comp"] ) {
426  // if "-comp" not specified, neither should the other
427  // comp-related args
428  if( args["loadlog"] || args["ignoreagponly"] ||
429  args["ignoreobjfileonly"] ||
430  args["diffstofind"].AsInteger() > 0 )
431  {
432  cerr << "Error -- -comp mode options without -comp" << endl;
433  exit(1);
434  }
435 
436  //// Process files, print results
437  bool taxid_check_failed=false;
438  if(m_use_xml) {
439  cout << "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n<page>\n";
440  }
441  x_ValidateUsingFiles(args, error_details_out);
444  }
445  else if(m_ValidationType & VT_Acc) {
446  if(!m_use_xml) cout << "\n";
447  if(m_ValidationType & VT_Taxid) taxid_check_failed = !m_AltValidator->CheckTaxids(cout, m_use_xml);
449  }
450  if(m_use_xml) {
451  cout << "</page>\n";
452  // return 0;
453  }
454  else if(error_details_out) {
455  cout << "\n\n===== Details =====" << endl;
456  cout << (string)CNcbiOstrstreamToString(*error_details_out);
457  delete error_details_out;
458  }
459  return (pAgpErr->CountTotals(CAgpErrEx::E_Last)>0 || taxid_check_failed) ? 2 : 0;
460  }
461  else {
462  // Note: traditional validation (now in the "if" clause above) used to be done regardless of args["comp"].
463  // Doing it separately now since it does not yet work properly when both object and component FASTA files are given at the same time.
464 
465  list<string> filenames;
466  for (unsigned int i = 1; i <= args.GetNExtra(); i++) {
467  const string filename = args['#' + NStr::IntToString(i)].AsString();
468  if( ! filename.empty() && filename[0] != '-' ) {
469  filenames.push_back(filename);
470  }
471  }
472 
473  string comploadlog;
474  if( args["loadlog"] ) {
475  comploadlog = args["loadlog"].AsString();
476  }
477 
478  string agp_as_fasta_file;
479  if( args["out"] ) {
480  agp_as_fasta_file = args["out"].AsString();
481  }
482 
483  CAgpFastaComparator::TDiffsToHide diffsToHide = 0;
484  if( args["ignoreagponly"] ) {
486  }
487  if( args["ignoreobjfileonly"] ) {
489  }
490 
491  int diffsToFind = args["diffstofind"].AsInteger();
492 
493  CAgpFastaComparator agpFastaComparator;
495  agpFastaComparator.Run( filenames, comploadlog,
496  agp_as_fasta_file, diffsToHide,
497  diffsToFind) )
498  {
499  cerr << "AGP/FASTA comparison failed." << endl;
500  }
501  }
502 
503  return 0;
504 }
505 
507 {
508  string s;
509  if(m_comp2len.m_count!=1) s="s";
510  if(!m_use_xml) cout<< m_comp2len.m_count << " "
511  << (m_reader.m_CheckObjLen?"object name":"component_id")
512  << s <<" and length" << s << " loaded from FASTA." << endl;
513  if(m_comp2range_coll.size()) {
514  int runs_of_Ns=0;
516  runs_of_Ns += it->second.size();
517  }
518  if(!m_use_xml) cout << m_comp2range_coll.size() << " component sequences have masked spans (" << runs_of_Ns << " spans)." << endl;
519  }
520  else if(!m_reader.m_CheckObjLen) {
521  if(!m_use_xml) cout << "No runs of Ns longer than 10 bp found in FASTA sequences." << endl;
522  }
523 
524 }
525 
527 {
528  if(m_reader.m_is_chr) {
530  if(!m_use_xml) {
531  cout << "===== Reading Chromosome from scaffold AGP =====" << endl;
532  // second header - for details that are printed below the summary and stats
533  if(out) *out << "===== Chromosome from scaffold AGP =====" << endl;
534  }
535  }
536  // else: cout << "===== Reading Chromosome from component AGP =====" << endl;
537  }
538  else if(m_reader.m_explicit_scaf) {
539  if(!m_use_xml) {
540  cout << "===== Reading Scaffold from component AGP =====" << endl;
541  if(out) *out << "===== Scaffold from component AGP =====" << endl; // header for details that are printed below
542  }
543  }
544 
545  if( 0==(m_ValidationType&VT_Acc) && args["out"].HasValue()) {
546  CAgpCompSpanSplitter *comp_splitter = new CAgpCompSpanSplitter(&(args["out"].AsOutputFile()));
547  m_reader.SetRowOutput(comp_splitter);
548  }
549 
550  if (args.GetNExtra() == 0) {
551  x_ValidateFile(cin);
552  }
553  else {
554  SIZE_TYPE num_fasta_files=0;
555  bool allowFasta = !m_reader.m_explicit_scaf;
556  for (unsigned int i = 1; i <= args.GetNExtra(); i++) {
557 
558  m_CurrentFileName = args['#' + NStr::IntToString(i)].AsString();
559  if(m_CurrentFileName=="-chr") {
560  if(m_reader.m_is_chr) {
561  cerr << "Error -- second -chr is not supported.\n";
562  exit(1);
563  }
565  cerr << "Error -- -chr after a file, but no preceding -scaf. Expecting:\n"
566  << " -scaf Scaffold_AGP_file(s) -chr Chromosome_AGP_file(s)\n";
567  exit(1);
568  }
569 
571  m_reader.Reset(true);
572  pAgpErr->ResetTotals();
573 
574  if(!m_use_xml) {
575  cout << "\n===== Reading Chromosome from scaffold AGP =====" << endl;
576  if(out) *out << "\n===== Chromosome from scaffold AGP =====" << endl;// header for details that are printed below
577  }
578  continue;
579  }
580 
581  //CNcbiIstream& istr = args['#' + NStr::IntToString(i)].AsInputFile();
582  CNcbiIfstream istr(m_CurrentFileName.c_str());
583  if (!istr) {
584  cerr << "Error -- unable to open file : " << m_CurrentFileName << "\n";
585  exit (1);
586  }
587 
588  char ch=0;
589  if(allowFasta) {
590  istr.get(ch); istr.putback(ch);
591  }
592  if(ch=='>') {
594  num_fasta_files++;
595  }
596  else {
597  if(allowFasta && num_fasta_files) x_ReportFastaSeqCount();
598  if( args.GetNExtra()-num_fasta_files>1 ) pAgpErr->StartFile(m_CurrentFileName);
599  x_ValidateFile(istr);
600  allowFasta=false;
601  }
602 
603  }
604  if(num_fasta_files==args.GetNExtra()) {
605  //cerr << "No AGP files."; exit (1);
606  if(allowFasta && num_fasta_files) x_ReportFastaSeqCount();
607  x_ValidateFile(cin);
608  }
609  }
610 
611 }
612 
614  CNcbiIstream& istr)
615 {
616 
617  if( 0==(m_ValidationType&VT_Acc) ) {
618  // CAgpReader
620  m_reader.ReadStream(istr); // , false
621  }
622  else {
623  int line_num = 0;
624  string line;
625  CRef<CAgpRow> agp_row( CAgpRow::New(pAgpErr.GetPointer(), m_agp_version));
626 
627  // Allow Unix, DOS, Mac EOL characters
628  while( NcbiGetline(istr, line, "\r\n") ) {
629  line_num++;
630 
631  int code=agp_row->FromString(line);
632  if(code==-1) continue; // skip a comment line
633  bool queued=false;
634  bool comp2len_check_failed=false;
635 
636  if(code==0) {
637  if( !agp_row->IsGap() ) {
638  if( m_comp2len.size() && !agp_row->IsGap() ) {
640  if( it!=m_comp2len.end() ) {
641  comp2len_check_failed=!agp_row->CheckComponentEnd(it->second);
642  // Skip regular genbank-based validation for this line;
643  // will print it verbatim, same as gap or error line.
644  m_AltValidator->QueueLine(line);
645  queued=true;
646  }
647  // else: will try Entrez and ObjMan
648  }
649  if(!queued){
650  // component line - queue for batch lookup
652  agp_row->GetComponentId(), line_num, agp_row->component_end);
653  queued=true;
654  }
655  }
656  }
657  // else: the error message already reached the error handler
658 
659  if(m_AltValidator->IsSetOstream() && !queued) {
660  // error or gap line - queue for verbatim reprinting
661  m_AltValidator->QueueLine(line);
662  }
663 
664  if( code!=0 || comp2len_check_failed || // process the batch now so that error lines are printed in the correct order
665  m_AltValidator->QueueSize() >= 1000
666  ) {
667  AutoPtr<CNcbiOstrstream> tmp_messages = pAgpErr->m_messages;
668  pAgpErr->m_messages.reset( new CNcbiOstrstream );
669 
670  // process a batch of preceding lines
672 
673  pAgpErr->m_messages = tmp_messages;
674  }
675 
676  pAgpErr->LineDone(line, line_num, code!=0 );
677  }
679  }
680 }
681 
683 {
684  SetDiagStream(0);
685 }
686 
687 // To be moved to MapCompLen.cpp
688 void CAgpValidateApplication::x_LoadLenFa(CNcbiIstream& istr, const string& filename)
689 {
690  string line;
691  string acc, acc_long;
692  int line_num=0;
693  int acc_count=0;
694 
695  // these are initialized only to suppress the warnings
696  int header_line_num=0;
697  int len=0;
698  int prev_len=0;
699 
700  TRangeColl range_coll; // runs of Ns in the fasta of the current component
701  TSeqPos mfa_firstMasked=0;
702  TSeqPos mfa_pos=0;
703  bool mfa_bMasked=false;
704  bool mfa_prevMasked=false;
705 
706  while( NcbiGetline(istr, line, "\r\n") ) {
707  line_num++;
708  //if(line.size()==0) continue;
709 
710  if(line[0]=='>') {
711  if( acc.size() ) {
712  // close off the previous acc
713 
714  // warn if acc could also be an accession
716 
717  prev_len = m_comp2len.AddCompLen(acc, len);
718  if(acc_long!=acc) prev_len = m_comp2len.AddCompLen(acc_long, len, false);
719  if(prev_len) goto LengthRedefinedFa;
720 
721  if(mfa_bMasked) {
722  if(mfa_pos-mfa_firstMasked > 10)
723  range_coll += TSeqRange(mfa_firstMasked, mfa_pos-1);
724  }
725  if(!range_coll.empty()) {
726  m_comp2range_coll[acc] = range_coll;
727  }
728 
729  range_coll.clear();
730  mfa_firstMasked=mfa_pos=0;
731  mfa_bMasked=false;
732  mfa_prevMasked=false;
733  }
734 
735  // Get first word, trim final '|' (if any).
736  SIZE_TYPE pos1=line.find(' ' , 1);
737  SIZE_TYPE pos2=line.find('\t', 1);
738  if(pos2<pos1) pos1 = pos2;
739  if(pos1!=NPOS) {
740  pos1--;
741  if(pos1>0 && line[pos1]=='|') pos1--;
742  }
743 
744  acc_long=line.substr(1, pos1);
745  acc=ExtractAccession( acc_long );
746  len=0;
747  header_line_num=line_num;
748  acc_count++;
749  }
750  else {
751  if(acc.size()==0) {
752  cerr<< "ERROR - expecting >fasta_header at start of file " << filename << ", got:\n"
753  << line.substr(0, 100) << "\n\n";
754  exit(1);
755  }
756 
757  for(SIZE_TYPE i=0; i<line.size(); i++ ) {
758  if(!isalpha(line[i])) {
759  cerr<< "ERROR - non-alphabetic character in the FASTA:\n"
760  " file " << filename << "\n line " << line_num << "\n column " << i+1 << "\n\n";
761  exit(1);
762  }
763 
764  mfa_pos++;
765  mfa_bMasked = toupper(line[i]) == 'N';
766  if(mfa_bMasked!=mfa_prevMasked) {
767  if(mfa_bMasked) {
768  mfa_firstMasked=mfa_pos;
769  }
770  else{
771  if(mfa_pos-mfa_firstMasked > 10)
772  range_coll += TSeqRange(mfa_firstMasked, mfa_pos-1);
773  }
774  }
775  mfa_prevMasked=mfa_bMasked;
776 
777  }
778 
779  len+=line.size();
780 
781  /* to do: save runs of Ns as CRangeCollection<TSeqPos>
782  later, will test component spans with:
783 
784  // returns iterator pointing to the TRange that has ToOpen > pos
785  const_iterator find(position_type pos) const
786  {
787  PRangeLessPos<TRange, position_type> p;
788  return lower_bound(begin(), end(), pos, p);
789  }
790  */
791  }
792  }
793 
794  if( acc.size() ) {
795  // close off the last acc
796  prev_len = m_comp2len.AddCompLen(acc, len);
797  if(acc_long!=acc) prev_len = m_comp2len.AddCompLen(acc_long, len, false);
798  if(prev_len) goto LengthRedefinedFa;
799 
800  if(mfa_bMasked) {
801  if(mfa_pos-mfa_firstMasked > 10)
802  range_coll += TSeqRange(mfa_firstMasked, mfa_pos-1);
803  }
804  if(!range_coll.empty()) {
805  m_comp2range_coll[acc] = range_coll;
806  }
807  }
808  if(acc_count==0) {
809  cerr<< "WARNING - empty file " << filename << "\n";
810  }
811  return;
812 
813 LengthRedefinedFa:
814  cerr<< "ERROR - sequence length redefined from " << prev_len << " to " << len << "\n"
815  << " sequence id: " << acc_long << "\n"
816  << " File: " << filename << "\n"
817  << " Lines: "<< header_line_num << ".." << line_num << "\n\n";
818  exit(1);
819 }
820 
821 void CAgpCompSpanSplitter::SaveRow(const string& s, CRef<CAgpRow> row, TRangeColl* runs_of_Ns)
822 {
823  if( row ) {
824  comp_or_gap_printed=true;
825  if(object_name != row->GetObject() ) {
826  object_name = row->GetObject();
827  part_number = 1; // row->GetPartNumber();
828  }
829  CRef<CAgpRow> tmp_row( row->Clone() );
830 
831  if(runs_of_Ns && runs_of_Ns->size()) {
832 
833  if( row->GetVersion() == eAgpVersion_auto ) {
834  cerr << "FATAL: need AGP version (for adding gap lines). Please use -v 1 or -v 2\n";
835  exit(1);
836  }
837  /*
838  CAgpRow tmp_gap_row = *row; // to retain the object name
839  tmp_gap_row.GetComponentType() = "N";
840  tmp_gap_row.is_gap = true;
841  tmp_gap_row.linkage = true;
842  tmp_gap_row.gap_type = row->GetVersion() == eAgpVersion_1_1 ? CAgpRow::eGapFragment : CAgpRow::eGapScaffold;
843  tmp_gap_row.linkage_evidence_flags = CAgpRow::fLinkageEvidence_unspecified;'
844  */
845  CRef<CAgpRow> tmp_gap_row( CAgpRow::New(NULL, row->GetVersion(), NULL) );
846  tmp_gap_row->FromString(
847  row->GetObject()+
848  "\t1\t100\t1\tN\t100\t"+
849  string(row->GetVersion() == eAgpVersion_1_1 ? "fragment\tyes\t" : "scaffold\tyes\tunspecified")
850  );
851 
852  int comp2obj_ofs = row->object_beg - row->component_beg;
853 
854  for(TRangeColl::const_iterator it = runs_of_Ns->begin(); it != runs_of_Ns->end(); ++it) {
855  if( (TSeqPos) tmp_row->component_beg < it->GetFrom() ) {
856  // component line
857  tmp_row->component_end = it->GetFrom()-1;
858  tmp_row->object_end = comp2obj_ofs + tmp_row->component_end;
859 
860  tmp_row->part_number = part_number;
861  (*m_out) << tmp_row->ToString() << endl;
862  part_number++;
863  }
864 
865  // gap line
866  tmp_gap_row->object_beg = comp2obj_ofs + it->GetFrom();
867  tmp_gap_row->object_end = comp2obj_ofs + it->GetTo();
868  tmp_gap_row->gap_length = it->GetTo() - it->GetFrom() + 1;
869 
870  tmp_gap_row->part_number = part_number;
871  (*m_out) << tmp_gap_row->ToString(true) << endl; // true: use linkage_evidence_flags
872  part_number++;
873 
874  tmp_row->component_beg = it->GetTo() + 1;
875  tmp_row->object_beg = comp2obj_ofs + tmp_row->component_beg;
876  }
877 
878  if(tmp_row->component_beg <= row->component_end) {
879  // this component does not end with Ns => need to print the final component span
880  tmp_row->component_end = row->component_end;
881  tmp_row->object_end = row->object_end;
882  }
883  else return; // ends with Ns => skip printing the component row below
884  }
885 
886  tmp_row->part_number = part_number;
887  (*m_out) << tmp_row->ToString() << endl;
888  part_number++;
889  }
890  else if(!comp_or_gap_printed){
891  // comment line (only at the head of file, to comply with AGP 2.0)
892  (*m_out) << s << endl;
893  }
894 }
895 
897 
898 
899 int main(int argc, const char* argv[])
900 {
901  if(argc==1+1 && string("-comp")==argv[1]) {
902  cout << "agp_validate -comp (formerly agp_fasta_compare):\n"
903  // "check that the object sequences (in FASTA or ASN.1 file) match the AGP.\n" //
904  "check that the object sequences FASTA matches the AGP.\n" //
905  "\n"
906  //"USAGE: agp_validate -comp [-options] ASN.1/FASTA file(s)... AGP file(s)...\n"
907  "USAGE: agp_validate -comp [-options] FASTA file(s)... AGP file(s)...\n"
908  "OPTIONS:\n"
909  " -loadlog OUTPUT_FILE Save the list of all loaded sequences.\n"
910  " -ignoreagponly Do not report objects present in AGP file(s) only.\n"
911  " -ignoreobjfileonly Do not report objects present in FASTA file(s) only.\n"
912  " -diffstofind NUM (EXPERIMENTAL) If specified, list the first NUM lines of each difference.\n"
913  " -out OUTPUT_FILE Save the assembled AGP sequences as FASTA.\n"
914  "\n"
915  "FASTA files for components can be provided (along with object FASTA files) if components are not yet in GenBank.\n"
916  ;
917  return 0;
918  }
919 
920  return CAgpValidateApplication().AppMain(argc, argv);
921 }
void OverrideLenIfAccession(const string &acc, int &in_out_len)
string ExtractAccession(const string &long_acc)
EAgpVersion
Definition: agp_util.hpp:55
@ eAgpVersion_auto
auto-detect using the first gap line
Definition: agp_util.hpp:56
@ eAgpVersion_1_1
AGP spec 1.1.
Definition: agp_util.hpp:57
@ eAgpVersion_2_0
AGP spec 2.0 or later.
Definition: agp_util.hpp:58
CRef< CAgpErrEx > pAgpErr
int main(int argc, const char *argv[])
USING_NCBI_SCOPE
static unsigned int line_num
Definition: attributes.c:11
#define false
Definition: bool.h:36
AutoPtr –.
Definition: ncbimisc.hpp:401
virtual ~CAgpCompSpanSplitter()
CAgpCompSpanSplitter(CNcbiOstream *out=NULL)
virtual void SaveRow(const string &s, CRef< CAgpRow > row, TRangeColl *runs_of_Ns)
CNcbiOstream * m_out
Correctly print multiple errors and warnings on consecutive lines; suppress undesired or highly repet...
Definition: agp_util.hpp:649
@ W_CompIsNotWgsTypeIs
Definition: agp_util.hpp:574
@ W_SingleOriNotPlus
Definition: agp_util.hpp:580
@ W_CompIsWgsTypeIsNot
Definition: agp_util.hpp:572
EResult Run(const std::list< std::string > &files, const std::string &loadlog, const std::string &agp_as_fasta_file, TDiffsToHide diffsToHide, int diffs_to_find)
virtual void SetVersion(EAgpVersion ver)
Change what AGP version to use for the next input that's read.
Definition: agp_util.cpp:1073
virtual int ReadStream(CNcbiIstream &is, EFinalize eFinalize=eFinalize_Yes)
Read an AGP file from the given input stream.
Definition: agp_util.cpp:1082
TAgpPos object_beg
Definition: agp_util.hpp:153
TAgpPos part_number
Definition: agp_util.hpp:153
EAgpVersion GetVersion()
Definition: agp_util.hpp:317
string & GetComponentId()
Definition: agp_util.hpp:126
static bool CheckComponentEnd(const string &comp_id, TAgpPos comp_end, TAgpLen comp_len, CAgpErr &agp_err)
Definition: agp_util.cpp:846
string & GetObject()
Definition: agp_util.hpp:120
static CRef< CAgpRow > New(CAgpErr *arg, EAgpVersion agp_version=eAgpVersion_auto, CAgpReader *reader=nullptr)
Definition: agp_util.hpp:90
string ToString(bool reorder_linkage_evidences=false)
Definition: agp_util.cpp:805
static bool IsGap(char c)
Definition: agp_util.hpp:221
TAgpPos component_beg
Definition: agp_util.hpp:158
int FromString(const string &line)
Definition: agp_util.cpp:423
CRef< CAgpRow > Clone(void) const
Definition: agp_util.hpp:103
TAgpPos component_end
Definition: agp_util.hpp:158
TAgpLen gap_length
Definition: agp_util.hpp:173
TAgpPos object_end
Definition: agp_util.hpp:153
CAgpValidateReader m_reader
void x_LoadLenFa(CNcbiIstream &istr, const string &filename)
enum CAgpValidateApplication::EValidationType m_ValidationType
TMapStrRangeColl m_comp2range_coll
virtual void Init(void)
Initialize the application.
virtual int Run(void)
Run the application.
CAltValidator * m_AltValidator
void x_ValidateUsingFiles(const CArgs &args, CNcbiOstream *out=NULL)
void x_ValidateFile(CNcbiIstream &istr)
virtual void Exit(void)
Cleanup on application exit.
void PrintTotals(CNcbiOstream &out=cout, bool use_xml=false)
void Reset(bool for_chr_from_scaf=false)
void SetRowOutput(IAgpRowOutput *row_output)
size_t QueueSize() const
bool IsSetOstream(void) const
bool CheckTaxids(CNcbiOstream &out, bool use_xml)
void QueueLine(const string &orig_line, const string &comp_id, int line_num, int comp_end)
void SetOstream(CNcbiOstream *pOstr)
void PrintTotals(CNcbiOstream &out, bool use_xml)
void SetSpeciesLevelTaxonCheck(bool check=true)
CVersionInfo m_VersionInfo
string & PrintUsage(string &str, bool) const
Print usage message to end of specified string.
CArgDesc_agp_validate(CVersionInfo &&versionInfo)
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
TAgpLen AddCompLen(const string &acc, TAgpLen len, bool increment_count=true)
CNcbiOstrstreamToString class helps convert a CNcbiOstrstream to a string (sample usage is given in ncbistre.hpp).
Definition: ncbistre.hpp:802
TRangeVector::const_iterator const_iterator
Definition: range_coll.hpp:70
size_type size() const
Definition: range_coll.hpp:98
const_iterator end() const
Definition: range_coll.hpp:86
bool empty() const
Definition: range_coll.hpp:102
const_iterator begin() const
Definition: range_coll.hpp:82
CRef –.
Definition: ncbiobj.hpp:618
CVersionInfo –.
size_type size() const
Definition: map.hpp:148
const_iterator begin() const
Definition: map.hpp:151
const_iterator end() const
Definition: map.hpp:152
const_iterator find(const key_type &key) const
Definition: map.hpp:153
std::ofstream out("events_result.xml")
main entry point for tests
const CNcbiRegistry & GetConfig(void) const
Get the application's cached configuration parameters (read-only).
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:285
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:799
CVersionInfo GetVersion(void) const
Get the program version information.
Definition: ncbiapp.cpp:1164
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1175
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
void SetVersion(const CVersionInfo &version)
Set the version number for the program.
Definition: ncbiapp.cpp:1135
vector< string > TStringArray
Some value types can contain several value lists.
Definition: ncbiargs.hpp:293
size_t GetNExtra(void) const
Get the number of unnamed positional (a.k.a. extra) args.
Definition: ncbiargs.hpp:422
@ fAllowMultiple
Repeated key arguments are legal (use with AddKey)
Definition: ncbiargs.hpp:635
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eOutputFile
Name of file (must be writable)
Definition: ncbiargs.hpp:596
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
void SetDiagStream(CNcbiOstream *os, bool quick_flush=true, FDiagCleanup cleanup=0, void *cleanup_data=0, const string &stream_name="")
Set diagnostic stream.
Definition: ncbidiag.cpp:8083
CRange< TSeqPos > TSeqRange
typedefs for sequence ranges
Definition: range.hpp:419
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
CNcbiIstream & NcbiGetline(CNcbiIstream &is, string &str, char delim, string::size_type *count=NULL)
Read from "is" to "str" up to the delimiter symbol "delim" (or EOF)
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
CNcbistrstream_Base< IO_PREFIX::ostrstream, IOS_BASE::out > CNcbiOstrstream
Definition: ncbistre.hpp:286
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
#define NPOS
Definition: ncbistr.hpp:133
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5083
void CONNECT_Init(const IRWRegistry *reg=0, CRWLock *lock=0, TConnectInitFlags flag=eConnectInit_OwnNothing, FSSLSetup ssl=0)
Init [X]CONNECT library with the specified "reg" and "lock" (ownership for either or both can be detailed in the "flag" parameter).
virtual string Print(void) const
Print version information.
Definition: version.cpp:120
exit(2)
int i
int len
#define NCBI_SC_VERSION_PROXY
#define NCBI_TEAMCITY_BUILD_NUMBER_PROXY
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int toupper(Uchar c)
Definition: ncbictype.hpp:73
static int filenames
Definition: pcregrep.c:172
static const char * str(char *buf, int n)
Definition: stats.c:84
Definition: inftrees.h:24
Modified on Thu Dec 07 10:10:27 2023 by modify_doxy.py rev. 669887