NCBI C++ ToolKit
agp_renumber.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: agp_renumber.cpp 91857 2020-12-15 13:06:07Z gouriano $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Victor Sapojnikov; usage wording: Paul Kitts.
27  *
28  * File Description:
29  * Repair an AGP file, if possible.
30  * Includes a custom error handler,
31  * processing of the input stream in chunks.
32  */
33 
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbiapp.hpp>
36 #include <corelib/ncbiargs.hpp>
37 #include <corelib/ncbienv.hpp>
38 #include <corelib/ncbifile.hpp>
39 
41 
43 
// Help text for the tool. main() writes it to stdout when the first
// command-line argument starts with '-' or when more than one argument
// is given. The list below enumerates the clean-ups the tool advertises.
44 const char* usage=
45 "Clean up an AGP file:\n"
46 "https://www.ncbi.nlm.nih.gov/assembly/agp/AGP_Specification/\n"
47 "\n"
48 "USAGE: agp_renumber <in.agp >out.agp\n"
49 "\n"
50 " - Recalculate the object begin and end coordinates from\n"
51 " the length of the component span or gap length.\n"
52 " - Renumber the part numbers for each object.\n"
53 " - Lowercase gap type and linkage.\n"
54 " - Reorder linkage evidence terms: paired-ends;align_genus;align_xgenus;align_trnscpt;within_clone;clone_contig;map;strobe;pcr\n"
55 " - Reformat white space to conform to the AGP format specification:\n"
56 " - add missing tabs at the ends of gap lines;\n"
57 " - drop blank lines;\n"
58 " - remove extra tabs and spaces at the end of lines;\n"
59 " - add a missing line separator at the end of the file;\n"
60 " - replace spaces with tabs (except in comments).\n";
61 
// Batch size, in lines, used by ProcessStream(): once this many
// whitespace-normalized lines have been accumulated, the line counter is
// reset and the accumulation buffer is replaced with a fresh one.
62 const int MAX_BUF_LINES=100;
63 //int line_num_in[MAX_BUF_LINES]; // filled in ProcessStream(), read in CAgpRenumber and CCustomErrorHandler
64 
66 {
 // Error handler that filters the messages passed to Msg() before
 // forwarding them to CAgpErr::Msg(), and records two facts about the most
 // recent / any message in the flags bad_part_number and had_missing_tab.
 // NOTE(review): the class header line and the declarations of the two
 // flags are on lines absent from this listing.
67 public:
 // Predicate over an error code; Msg() below uses it to decide which
 // messages are dropped instead of being forwarded.
 // NOTE(review): the returned expression is on lines absent from this
 // listing, so the exact set of codes cannot be stated here.
68  static bool MustRenumber(int code)
69  {
70  return
77  }
78 
 // Body of a member whose signature is on lines absent from this listing
 // (presumably the constructor - TODO confirm). It clears had_missing_tab.
82  {
83  had_missing_tab=false;
84  }
85 
 // Handles one message.
 //  - bad_part_number is reset on every call, so it only reflects the
 //    latest message.
 //  - E_MustBePositive whose details equal "part_number (column 4)" sets
 //    bad_part_number and is still forwarded to CAgpErr::Msg().
 //  - Any other code for which MustRenumber() is true is dropped (early
 //    return, nothing forwarded).
 //  - W_GapLineMissingCol9 sets had_missing_tab (never cleared here) and is
 //    forwarded.
 //  - Everything else is forwarded unchanged.
86  virtual void Msg(int code, const string& details, int appliesTo=fAtThisLine)
87  {
88  bad_part_number=false;
89  // GCOL-2021: fix zero values in: component number (column 4),
90  if( code==E_MustBePositive && details == "part_number (column 4)") {
91  bad_part_number=true;
92  }
93  else if( MustRenumber(code) ) return;
94  else if( code==W_GapLineMissingCol9 ) had_missing_tab=true;
95  CAgpErr::Msg(code, details, appliesTo);
96  }
97  // copied from CAgpErr - only to get rid of the pointless compiler warning
 // Convenience overload without details: delegates to the three-argument
 // Msg() above with an empty details string.
98  virtual void Msg(int code, int appliesTo=fAtThisLine)
99  {
100  Msg(code, NcbiEmptyString, appliesTo);
101  }
102 };
103 
// AGP reader subclass that re-emits every row to an output stream.
// Comment lines are copied as read; gap/component rows are rebuilt into
// m_adjusted and written out. OnError() decides which reader errors can be
// repaired on the fly and which abort processing.
// NOTE(review): a number of member declarations and statements of this
// class are on lines absent from this listing; the comments below describe
// only what is visible.
104 class CAgpRenumber : public CAgpReader
105 {
106 protected:
 // Destination for the rewritten AGP lines.
108  ostream& m_out;
111 
112  // Callbacks
113  //virtual void OnScaffoldEnd();
 // Called by the reader on an object boundary.
 // Unless at the beginning of input, renum_current_obj is reset for the
 // next object (other statements in that branch are not visible here).
 // Unless at the end of input, the new object's name is inserted into
 // m_obj_names; insert(...).second being false means the name was already
 // present (the handling of that case is on a line absent from this
 // listing).
114  virtual void OnObjectChange()
115  {
117  if(!m_at_beg) {
119  renum_current_obj=false;
120  }
121  if(!m_at_end) {
122  if(! m_obj_names.insert(m_this_row->GetObject()).second ) {
124  }
125  }
126  }
127 
 // Called by the reader for every gap or component row. Rebuilds the row
 // in m_adjusted, writes it to m_out followed by "\n", and counts it in
 // m_line_num_out.
128  virtual void OnGapOrComponent()
129  {
 // The rebuilt line starts with the object name and a tab; the rest of
 // this expression and part of the next one are on lines absent from
 // this listing.
130  m_adjusted =
131  m_this_row->GetObject() + "\t" +
136 
137  m_adjusted+=
138  NStr::IntToString(m_object_beg-1) + "\t" +
140  m_part_num++;
141 
142  // cannot simply append the tail of m_line - it might have a missing tab...
 // Two renderings of the row are compared: ToString(true) versus
 // ToString(false). A difference is counted in reordered_ln_ev.
143  string s = m_this_row->ToString(true);
144  string s_orig_linkage_evidence = m_this_row->ToString(false);
145  if(s!=s_orig_linkage_evidence) reordered_ln_ev++;
 // The four nested find() calls locate the 4th tab in s; everything from
 // that tab to the end of s is appended to m_adjusted.
146  m_adjusted+= s.substr(
147  s.find( "\t", 1+
148  s.find( "\t", 1+
149  s.find( "\t", 1+
150  s.find( "\t" ))))
151  );
152 
 // NOTE(review): the statement controlled by this `if` is on a line
 // absent from this listing.
153  if(m_this_row->pcomment!=NPOS)
155 
156  m_out << m_adjusted << "\n";
157  m_line_num_out++;
158  }
159 
 // Called by the reader for comment lines: the line is written to m_out
 // as read, followed by "\n", and counted in m_line_num_out.
160  virtual void OnComment()
161  {
162  m_out << m_line << "\n";
163  m_line_num_out++;
164  }
165 
 // Called by the reader when m_error_code is set. Returns true to continue
 // processing and false to stop ("die").
 // NOTE(review): the first `case` label and parts of the conditions in
 // the default branch are on lines absent from this listing.
166  virtual bool OnError()
167  {
168  switch(m_error_code) {
 // First visible branch: remembers that an empty line was seen (reported
 // later by ProcessStream) and keeps checks enabled for the next line.
170  had_empty_line=true;
171  m_prev_line_skipped=false; // do not skip checks on the next line
172  break;
173 
174  // case CAgpErr::E_ObjRangeNeGap:
175  // case CAgpErr::E_ObjRangeNeComp:
176  default: if( custom_err->bad_part_number ||
179 
180  // Use m_line_skipped to prevent endless recursion
 // If the reader marked the line as skipped, the mark is cleared and the
 // row is processed again via ProcessThisRow(); a false result from that
 // call aborts. On success the current object is flagged as renumbered.
181  if(m_line_skipped) {
182  m_line_skipped=false;
183  //m_error_code=0; -- another possible way to prevent endless recursion
184  if( !ProcessThisRow() ) return false;
185  renum_current_obj=true;
186  break;
187  }
188  // else: print diags
189  }
190  else {
191  //case CAgpErr::E_ObjEndLtBeg: -- component/gap-specific columns were not parsed...
192  // die (could also: warn, set End=Beg, retry/resume parsing?)
 // Diagnostic written to stderr for a corrected line: the error text,
 // then the original line prefixed with "- " and the rebuilt line
 // prefixed with "+ ".
196  cerr <<"Corrected: " << CAgpErr::GetMsg(m_error_code) << "\n";
197  cerr << "- " << m_line << "\n+ " << m_adjusted << "\n";
198  cerr << "\n";
199 
200  renum_current_obj=true;
201  }
202  else return false; // die
203  }
204  }
205  return true;
206  }
207 
208 public:
 // Rebuilt text of the current gap/component row (filled in by
 // OnGapOrComponent(), also printed by OnError()).
212  string m_adjusted;
213 
 // Body of a member whose signature is on lines absent from this listing
 // (presumably the constructor taking the output stream - TODO confirm).
 // The visible statements zero the counters and clear the flags.
216  {
219  had_empty_line = false;
221  m_line_num_out = 0;
222  renum_current_obj=false;
223  reordered_ln_ev = 0;
224  }
225 
226 };
227 
// Reads AGP text from `in` line by line, normalizes the white space of each
// line into a buffer, processes the buffer in batches of MAX_BUF_LINES
// lines with a CAgpRenumber that writes to `out`, and finally prints a
// summary of the corrections to stderr.
//
// Returns 0 on success. Returns 1, after printing the reader's error
// message and "Renumbering not completed because of errors." to stderr,
// when a non-zero code was produced during processing or by Finalize().
//
// NOTE(review): the declaration of `buf` and the statements that convert
// the buffer to `s` and assign `code` are on lines absent from this listing.
228 int ProcessStream(istream &in, ostream& out)
229 {
230  CAgpRenumber renum(out);
231 
232  string s;
234  int buf_lines=0;
235  int code=0;
236 
237  // for reporting
 // These flags are function-wide: once set by any line they stay set.
238  bool had_space =false;
239  bool had_extra_tab=false;
240  bool no_eol_at_eof=false;
241  bool bad_case_gap =false;
242 
243  while( NcbiGetline(in, s, "\r\n") ) {
244  // get rid of spaces except in or in front of EOL #comments
 // Per-line state: previous emitted character, number of tabs emitted so
 // far, whether only leading spaces have been seen, and the first
 // character of the field that follows the 4th tab.
245  char prev_ch=0;
246  int tab_count=0;
247  bool at_beg=true;
248 
249  char component_type=0;
250  for(SIZE_TYPE i=0; i<s.size(); i++) {
251  char ch=s[i];
252  switch(ch) {
253  case ' ':
 // Leading spaces are dropped; `continue` resumes the for loop, so
 // prev_ch is not updated for them. Any other space is treated as a tab.
254  if(at_beg) continue;
255  had_space=true;
256  ch='\t';
 // NOTE: intentional fall-through into the tab case (no break above).
257  case '\t':
 // A run of tabs/spaces yields a single tab: one is emitted only when the
 // previous character was not a tab.
258  if(prev_ch!='\t') {
259  tab_count++;
260  *buf<<'\t';
261  if(tab_count>8) {
262  if( tab_count==9 && i<s.size()-1 && s[i+1]=='#' ) {
263  // don't bark at the tab we keep (for aesthetic reasons)
264  // in front of EOL comment in component lines
265  }
 // NOTE(review): had_space is never reset per line, so after the first
 // converted space anywhere in the input, extra tabs are no longer
 // reported - confirm this is intended.
266  else if(!had_space){
267  had_extra_tab=true;
268  }
269  }
270  }
271  else if(!had_space){
272  // not necessarily a complete diags, but at least true
273  had_extra_tab=true;
274  }
275 
276  break;
277  case '#':
 // Start of a comment: the rest of the line is copied unchanged and the
 // character loop ends.
278  *buf << s.substr(i);
279  goto EndFor;
280  default:
281  // 2010/09/14 lowercase gap type and linkage
 // The first character after the 4th tab is remembered; when it is 'N' or
 // 'U', characters located after the 6th and 7th tabs are lowercased.
282  if(prev_ch=='\t' && tab_count==4) {
283  component_type=ch;
284  }
285  if( (component_type=='N' || component_type=='U') &&
286  (tab_count==6 || tab_count==7) && tolower(ch)!=ch
287  ) {
288  ch=tolower(ch); bad_case_gap=true;
289  }
290 
291  if(tab_count>8) {
292  // A fatal error - let CAgpRow catch it and complain
 // Text after more than 8 tabs: a tab plus the remainder of the line is
 // emitted as-is and the character loop ends.
293  *buf << '\t' << s.substr(i);
294  goto EndFor;
295  }
296  at_beg=false;
297  *buf << ch;
298  }
299  prev_ch=ch;
300  }
301  EndFor:
302 
 // Every input line is terminated with '\n' in the buffer.
303  *buf << '\n';
304  if(++buf_lines>=MAX_BUF_LINES) {
 // Batch is full: reset the counter, process the accumulated text through
 // an input stream built from `s`, stop on a non-zero code, and replace
 // the buffer with a fresh one.
305  buf_lines=0;
306 
308  CNcbiIstrstream is(s);
310  if(code) break;
311 
312  delete buf;
313  buf=new CNcbiOstrstream();
314  }
315 
 // Presumably true only when the last line ended at end-of-file without a
 // line terminator - depends on NcbiGetline(); TODO confirm.
316  if(in.eof()) no_eol_at_eof=true;
317  }
318 
 // Process the final, partially filled batch, if any.
319  if(buf_lines) {
321  CNcbiIstrstream is(s);
323  }
324 
 // Finalize() is called only if no error code has been produced so far.
325  if(!code) code=renum.Finalize();
326  if( code) {
 // NOTE(review): this early return does not reach the `delete buf` at the
 // end of the function.
327  cerr << renum.GetErrorMessage()<<"\nRenumbering not completed because of errors.\n";
328  return 1;
329  }
330 
 // Summary of the corrections that were applied, written to stderr.
331  if(had_space ) cerr << "Spaces converted to tabs.\n";
332  if(had_extra_tab ) cerr << "Extra tabs removed.\n";
333  if(renum.had_empty_line) cerr << "Empty line(s) removed.\n";
334  if(renum.custom_err->had_missing_tab) cerr << "Missing tabs added at the ends of gap lines.\n";
335  //if(renum.custom_err.bad_part_number) cerr << "Invalid part numbers corrected.\n";
336  if(no_eol_at_eof ) cerr << "Line break added at the end of file.\n";
337  if(bad_case_gap ) cerr << "Gap type/linkage converted to lower case.\n";
338  if(renum.reordered_ln_ev) cerr << "Linkage evidence terms reordered.\n";
339  if(renum.renum_objs ) cerr << renum.renum_objs << " object(s) renumbered.\n";
340  if(renum.no_renum_objs ) {
341  if(renum.renum_objs)
342  cerr << renum.no_renum_objs << " object(s) did not need renumbering.\n";
343  else
344  cerr << "All lines have proper object_beg, object_end, part_number.\n";
345  }
346 
347  delete buf;
348  return 0;
349 }
350 
351 // to do:
352 // - prints these warnings in handler: W_GapObjBegin, W_GapObjEnd, W_ConseqGaps
353 
// Program entry point.
//  - No arguments: filter standard input to standard output.
//  - First argument starts with '-', or more than one argument is given:
//    print the usage text to stdout and return 1.
//  - Exactly one argument: treat it as the input file name; if it cannot be
//    opened for reading, report that on stderr and return 1, otherwise
//    filter the file to standard output.
// In the filtering cases the return value is that of ProcessStream().
354 int main(int argc, char* argv[])
355 {
356  if(argc==1) {
357  return ProcessStream(cin, cout);
358  }
 // argv[1] is safe to inspect here because argc is at least 2.
359  else if(argv[1][0]=='-' || argc > 1+1) {
360  cout << usage;
361  return 1;
362  }
363  else {
364  CNcbiIfstream in(argv[1]);
365  if( !in.good() ) {
366  cerr << "Error - cannot open for reading: " << argv[1] << "\n";
367  return 1;
368  }
369  return ProcessStream(in, cout);
370  }
371 }
int main(int argc, char *argv[])
int ProcessStream(istream &in, ostream &out)
const int MAX_BUF_LINES
USING_NCBI_SCOPE
const char * usage
@ eAgpVersion_auto
auto-detect using the first gap line
Definition: agp_util.hpp:56
virtual void Msg(int code, const string &details, int appliesTo=fAtThisLine)
Definition: agp_util.cpp:322
static const char * GetMsg(int code)
Definition: agp_util.cpp:278
@ E_ObjRangeNeComp
Definition: agp_util.hpp:530
@ E_PartNumberNotPlus1
Definition: agp_util.hpp:538
@ E_MustBePositive
Definition: agp_util.hpp:525
@ E_PartNumberNot1
Definition: agp_util.hpp:537
@ E_EmptyLine
Definition: agp_util.hpp:521
@ E_ObjMustBegin1
Definition: agp_util.hpp:536
@ E_ObjRangeNeGap
Definition: agp_util.hpp:529
@ E_DuplicateObj
Definition: agp_util.hpp:535
@ E_ObjBegNePrevEndPlus1
Definition: agp_util.hpp:541
@ W_GapLineMissingCol9
Definition: agp_util.hpp:566
@ fAtThisLine
Definition: agp_util.hpp:497
Detects scaffolds, object boundaries, errors that involve 2 consecutive lines, and is intended as a s...
Definition: agp_util.hpp:327
string m_line
Definition: agp_util.hpp:418
CAgpErr * GetErrorHandler()
Definition: agp_util.hpp:481
int m_error_code
Definition: agp_util.hpp:408
virtual int Finalize()
This is called at the end of the file, usually automatically but can be called manually if the automa...
Definition: agp_util.cpp:1160
bool m_at_beg
Definition: agp_util.hpp:394
CRef< CAgpRow > m_this_row
Definition: agp_util.hpp:416
virtual string GetErrorMessage(const string &filename=NcbiEmptyString)
Return a string with one (or two, depending on error) source line(s) on which the error occured,...
Definition: agp_util.cpp:1194
virtual int ReadStream(CNcbiIstream &is, EFinalize eFinalize=eFinalize_Yes)
Read an AGP file from the given input stream.
Definition: agp_util.cpp:1084
bool m_line_skipped
Definition: agp_util.hpp:400
bool m_at_end
Definition: agp_util.hpp:396
bool m_prev_line_skipped
Definition: agp_util.hpp:401
bool ProcessThisRow()
Invoked from ReadStream(), after the row has been parsed, and seldom needs to be invoked by user.
Definition: agp_util.cpp:1001
void SetErrorHandler(CAgpErr *arg)
Definition: agp_util.cpp:1187
CRef< CCustomErrorHandler > custom_err
virtual void OnComment()
virtual bool OnError()
virtual void OnObjectChange()
ostream & m_out
bool renum_current_obj
virtual void OnGapOrComponent()
set< string > m_obj_names
CAgpRenumber(ostream &out)
SIZE_TYPE pcomment
Definition: agp_util.hpp:117
string & GetObject()
Definition: agp_util.hpp:120
string ToString(bool reorder_linkage_evidences=false)
Definition: agp_util.cpp:807
static bool IsGap(char c)
Definition: agp_util.hpp:222
TAgpPos component_beg
Definition: agp_util.hpp:158
TAgpPos component_end
Definition: agp_util.hpp:158
TAgpLen gap_length
Definition: agp_util.hpp:173
static bool MustRenumber(int code)
virtual void Msg(int code, int appliesTo=fAtThisLine)
virtual void Msg(int code, const string &details, int appliesTo=fAtThisLine)
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
std::ofstream out("events_result.xml")
main entry point for tests
TObjectType * GetPointer(void) THROWS_NONE
Get pointer,.
Definition: ncbiobj.hpp:998
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
CNcbiIstream & NcbiGetline(CNcbiIstream &is, string &str, char delim, string::size_type *count=NULL)
Read from "is" to "str" up to the delimiter symbol "delim" (or EOF)
CNcbistrstream_Base< IO_PREFIX::ostrstream, IOS_BASE::out > CNcbiOstrstream
Definition: ncbistre.hpp:286
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
#define NPOS
Definition: ncbistr.hpp:133
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
#define NcbiEmptyString
Definition: ncbistr.hpp:122
char * buf
int i
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines command line argument related classes.
int tolower(Uchar c)
Definition: ncbictype.hpp:72
Defines unified interface to application:
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
std::istream & in(std::istream &in_, double &x_)
Definition: inftrees.h:24
Modified on Thu May 02 14:28:44 2024 by modify_doxy.py rev. 669887