NCBI C++ ToolKit
gff_deconcat.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: gff_deconcat.cpp 90467 2020-06-16 18:23:04Z foleyjp $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Justin Foley
27 *
28 * File Description:
29 * Feature deconcatenator for .gff files
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbiapp.hpp>
36 #include <corelib/ncbistd.hpp>
37 #include <corelib/ncbiargs.hpp>
38 #include <corelib/ncbifile.hpp>
39 #include <util/line_reader.hpp>
40 #include <corelib/ncbiexpt.hpp>
41 
43 
44 
// Body of the application's exception class. The `class ...` header line is
// absent from this listing, as are the enumerator list and the line that
// follows GetErrCodeString().
46 {
47 public:
 // Error codes specific to this exception type. The enumerator line is
 // missing here; eInvalidOutputDir is the one referenced by the switch below.
48  enum EErrCode {
50  };
51 
 // Returns a printable name for the current error code. Codes not handled
 // here are delegated to CException::GetErrCodeString().
52  virtual const char* GetErrCodeString() const override {
53  switch (GetErrCode()) {
54  case eInvalidOutputDir:
55  return "eInvalidOutputDir";
56 
57  default: return CException::GetErrCodeString();
58  }
59  }
60 
62 };
63 
64 
// Body of the application class. The `class ...` header line is absent from
// this listing.
66 {
67 public:
 // Init() registers the command-line arguments; Run() resolves the output
 // directory and hands the input stream to xProcessFile().
68  void Init(void);
69  int Run(void);
70 
 // NOTE(review): the TIdmap declaration is on a line missing from this
 // listing. Its use elsewhere (id_map[columns[0]].push_back(line), and values
 // passed as const list<string>&) suggests a map from string to
 // list<string> -- TODO confirm.
72 
73 private:
 // Reads the whole input, then writes one output file per id_map key.
74  void xProcessFile(CNcbiIstream& istr);
 // Fills `header` and `id_map` from the input stream.
75  void xReadFile(CNcbiIstream& istr, string& header, TIdmap& id_map);
 // Appends one input line to the id_map entry keyed by its first column.
76  void xProcessLine(const string& line, TIdmap& id_map);
 // Writes the optional header followed by every line of `body`.
77  void xWriteFile(CNcbiOfstream& ostr,
78  const string& header,
79  const list<string>& body);
 // Chooses m_Extension from the input filename, defaulting to ".gff".
80  void xSetExtension(const string& input_filename);
81 
 // Output directory set by Run() from the "dir" argument; when blank,
 // xProcessFile() writes to ".".
82  string m_OutputDir;
 // Suffix appended to every generated filename; set by xSetExtension().
83  string m_Extension;
84 };
85 
86 
// Body of Init() (the function header line is absent from this listing).
// Declares the "i" and "dir" command-line arguments and registers them.
88 {
89  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions());
90 
91 
 // Mandatory input file. The final argument of this call (the argument
 // type) is on a line missing from this listing.
92  arg_desc->AddKey("i", "InputFile",
93  "GFF input filename",
95 
 // Optional output directory. The final argument of this call is likewise
 // missing from this listing.
96  arg_desc->AddOptionalKey("dir",
97  "OutputDirectory",
98  "Output Directory. Defaults to CWD",
100 
 // release() hands the raw pointer to SetupArgDescriptions().
101  SetupArgDescriptions(arg_desc.release());
102 }
103 
104 
105 void CGffDeconcatApp::xSetExtension(const string& input_filename)
106 {
107  CDir input_file(input_filename);
108  string trial_ext(input_file.GetExt());
109 
110  if (!NStr::IsBlank(trial_ext)) {
111  try {
112  NStr::StringToDouble(trial_ext); // Extension cannot be numeric
113  }
114  catch(...) {
115  m_Extension = trial_ext;
116  return;
117  }
118  }
119 
120  m_Extension = ".gff";
121 }
122 
123 
// Body of Run() (the function header line is absent from this listing).
// Opens the input, picks the output extension, validates the optional output
// directory, then processes the file. Returns 0 on completion.
125 {
126  const CArgs& args = GetArgs();
127  CNcbiIstream& istr = args["i"].AsInputFile();
128 
 // The extension is derived from the input filename string.
129  xSetExtension(args["i"].AsString());
130 
131  if (args["dir"]) {
132  auto dirname = args["dir"].AsString();
133 
 // Convert a relative directory to an absolute path before checking it.
134  if (!CDir::IsAbsolutePath(dirname)) {
135  dirname = CDir::CreateAbsolutePath(dirname);
136  }
137 
138  CDir output_dir(dirname);
139  if (!output_dir.Exists()) {
140  string err_msg = dirname
141  + " does not exist";
 // The line opening the throw statement is missing from this listing;
 // what remains are its error-code and message arguments.
143  eInvalidOutputDir,
144  err_msg);
145  }
146 
147  m_OutputDir = dirname;
148  }
149 
150  xProcessFile(istr);
151 
152  return 0;
153 }
154 
155 
// Body of xProcessFile() (the function header line is absent from this
// listing). Reads the whole input into id_map, then writes one file per key.
157 
158  TIdmap id_map;
159  string header;
160  xReadFile(istr, header, id_map);
161 
 // Fall back to the current directory when no output directory was set.
162  string output_dir = ".";
163  if (!NStr::IsBlank(m_OutputDir)) {
164  output_dir = m_OutputDir;
165  }
166 
 // Each key becomes a filename: <output_dir>/<key><m_Extension>.
 // NOTE(review): the key is used verbatim, so a key containing '/' or other
 // path-special characters changes the target path -- confirm acceptable.
167  for (const auto& key_val : id_map) {
168  string filename = output_dir + "/"
169  + key_val.first
170  + m_Extension;
171 
 // NOTE(review): the stream is not checked after opening; a failed open
 // is not reported here.
172  unique_ptr<CNcbiOfstream> ostr(new CNcbiOfstream(filename.c_str()));
173  xWriteFile(*ostr, header, key_val.second);
174  }
175  return;
176 }
177 
178 
// Files one input line under the key taken from its first column.
// Lines that yield one column or fewer are ignored.
179 void CGffDeconcatApp::xProcessLine(const string& line, TIdmap& id_map) {
180  vector<string> columns;
 // NOTE(review): the statement that fills `columns` from `line` is on a line
 // missing from this listing; delimiters and split flags cannot be confirmed
 // from here.
182  if (columns.size() <= 1) {
183  return;
184  }
 // operator[] creates the entry for a key seen for the first time.
185  id_map[columns[0]].push_back(line);
186 }
187 
188 
189 void CGffDeconcatApp::xReadFile(CNcbiIstream& istr, string& header, TIdmap& id_map)
190 {
191  id_map.clear();
192  CStreamLineReader lr(istr);
193 
194  string line;
195  while ( !lr.AtEOF() ) {
196  line = *++lr;
197  if (NStr::IsBlank(line) ||
198  line[0] == '#') {
199  if (line.size() > 2 &&
200  line[1] == '#' &&
201  header.empty() ) {
202  header = line;
203  }
204  continue;
205  }
206  xProcessLine(line, id_map);
207  }
208  return;
209 }
210 
211 
// Remainder of xWriteFile(); the first line of its signature is absent from
// this listing. Writes `header` (when not blank) and then every line of
// `body`, each terminated by "\n".
213  const string& header,
214  const list<string>& body)
215 {
 // A blank header is skipped so that no empty first line is written.
216  if (!NStr::IsBlank(header)) {
217  ostr << header << "\n";
218  }
219 
220  for (const string& line : body) {
221  ostr << line << "\n";
222  }
223 
224  return;
225 }
226 
227 
229 
231 
232 int main(int argc, const char* argv[])
233 {
234  return CGffDeconcatApp().AppMain(argc, argv, 0, eDS_ToStderr, 0);
235 }
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
CDir –.
Definition: ncbifile.hpp:1695
void xProcessFile(CNcbiIstream &istr)
void xProcessLine(const string &line, TIdmap &id_map)
void xReadFile(CNcbiIstream &istr, string &header, TIdmap &id_map)
int Run(void)
Run the application.
void xWriteFile(CNcbiOfstream &ostr, const string &header, const list< string > &body)
void xSetExtension(const string &input_filename)
void Init(void)
Initialize the application.
NCBI_EXCEPTION_DEFAULT(CGffDeconcatException, CException)
virtual const char * GetErrCodeString() const override
Get error code interpreted as text.
Simple implementation of ILineReader for i(o)streams.
void clear()
Definition: map.hpp:169
Include a standard set of the NCBI C++ Toolkit most basic headers.
static FILE * input_file
Definition: common.c:35
static const column_t columns[]
Definition: utf8_2.c:22
int main(int argc, const char *argv[])
USING_NCBI_SCOPE
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:819
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1195
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eOutputFile
Name of file (must be writable)
Definition: ncbiargs.hpp:596
@ eDS_ToStderr
To standard error stream.
Definition: ncbidiag.hpp:1782
TErrCode GetErrCode(void) const
Get error code.
Definition: ncbiexpt.cpp:453
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
EErrCode
Error types that an application can generate.
Definition: ncbiexpt.hpp:884
virtual const char * GetErrCodeString(void) const
Get error code interpreted as text.
Definition: ncbiexpt.cpp:444
static string CreateAbsolutePath(const string &path, ERelativeToWhat rtw=eRelativeToCwd)
Get an absolute path from some, possibly relative, path.
Definition: ncbifile.cpp:665
static bool IsAbsolutePath(const string &path)
Check if a "path" is absolute for the current OS.
Definition: ncbifile.cpp:508
virtual bool Exists(void) const
Check if directory "dirname" exists.
Definition: ncbifile.hpp:4065
bool AtEOF(void) const
Indicates (negatively) whether there is any more input.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
Definition: ncbistre.hpp:500
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
static double StringToDouble(const CTempStringEx str, TStringToNumFlags flags=0)
Convert string to double.
Definition: ncbistr.cpp:1387
@ fSplit_Truncate
Definition: ncbistr.hpp:2501
@ fSplit_MergeDelimiters
Merge adjacent delimiters.
Definition: ncbistr.hpp:2498
Lightweight interface for getting lines of data with minimal memory copying.
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines command line argument related classes.
Defines NCBI C++ exception handling.
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
Modified on Wed Apr 17 13:10:21 2024 by modify_doxy.py rev. 669887