NCBI C++ ToolKit
biotree_attr_reader.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: biotree_attr_reader.cpp 45111 2020-06-02 20:15:06Z asztalos $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Anatoliy Kuznetsov
27  *
28  * File Description:
29  * BioTree attribute reader
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbifile.hpp>
35 #include <corelib/ncbistr.hpp>
36 #include <util/line_reader.hpp>
37 
39 
41 
42 
44 {
    // Intentionally empty body.
    // NOTE(review): the signature that precedes this brace (listing line 43)
    // is not visible here; presumably this is the class constructor -- confirm
    // against the repository before relying on that.
45 }
46 
47 
48 // Format description/example:
49 // Note that the first column will be used as the key, whether or not it
50 // is a seq-id
51 //
52 //
53 // #BKBTA-1
54 // #seq-id cluster-id label
55 // #optional comments related to file, dates, file origin, generation params, etc
56 //
57 // 9456789 12 "sequence name 1"
58 // 9478900 12 "sequence name 2"
59 // 8456789 15 "sequence name 3"
60 
61 
63 {
    // Body of Read(is, attr_table): parses a BioTree attribute stream.
    //
    //  1. Header: the first line is compared with the "#BKBTA-1" signature.
    //     A '#'-prefixed line of column names is then required; one table
    //     column is added per name.
    //  2. Content: every following line that is neither empty nor starts
    //     with '#' is split on tab characters, each value is passed through
    //     TableValueDecode(), and the result is stored in the cell
    //     (row, column-index) of attr_table.
    //
    // NOTE(review): the signature line (listing line 62) is not visible in
    // this listing; parameter names `is` and `attr_table` are taken from
    // their uses below.
64  CStreamLineReader line_reader(is);
65  CTempString l;
66 
67  string ls;
68  vector<string> str_arr;
69  //int seq_id_idx = -1;
70 
71  // read the header
72  //
73  {{
    // Fetch the first line; CheckLineReader() is called right after every
    // header-phase advance to react to the reader reporting EOF.
74  ++line_reader;
75  CheckLineReader(line_reader);
76 
77  l = *line_reader;
78 
79  // signature match? If not we will try to parse this line for column
80  // headers
81  if (l != "#BKBTA-1") {
82  //NCBI_THROW2(CObjReaderParseException, eFormat,
83  // "BioTree attributes format: header signature is missing ",
84  // line_reader.GetLineNumber());
    // A missing signature is only logged; the reader is not advanced, so
    // this same line is treated as the column-name line below.
85  LOG_POST(Warning << "BioTree attributes format: header signature '#BKBTA-1' is missing ");
86  }
87  else {
88  // found header, advance to next line for column names
89  ++line_reader;
90  CheckLineReader(line_reader);
91  }
92 
93  // get list of columns
94  ls = *line_reader;
95 
    // The column-name line must be non-empty and start with '#'.
    // NOTE(review): the line that opens the statement inside this `if`
    // (listing line 97) is not visible here. The message + line-number
    // arguments match the NCBI_THROW2(CObjReaderParseException, eFormat, ...)
    // form seen in the commented-out code above -- confirm.
96  if (ls.empty() || ls[0] != '#') {
98  "BioTree attributes format: line with column names expected: #name1 name2 name3... ",
99  line_reader.GetLineNumber());
100  }
101  ls.erase(0, 1); // remove '#'
    // Column names are separated by spaces and/or tabs; fSplit_Tokenize
    // merges adjacent delimiters so only non-empty names are produced.
102  NStr::Split(ls, " \t", str_arr, NStr::fSplit_Tokenize);
    // NOTE(review): as above, the opening line of the statement inside this
    // `if` (listing line 104) is not visible -- presumably another
    // NCBI_THROW2; confirm.
103  if (str_arr.size() == 0) {
105  "BioTree attributes format: column name list is missing ",
106  line_reader.GetLineNumber());
107  }
108 
    // Register one table column per parsed name, in file order.
109  ITERATE(vector<string>, it, str_arr) {
110  const string& attr_name = *it;
111  attr_table.AddColumn(attr_name);
112  }
113 
114  }}
115 
116  // Read the content
117  //
118  {{
119  vector<string> row_values;
    // `row` counts only the lines that actually produced a table row;
    // skipped lines (empty / comment) do not advance it.
120  unsigned row = 0;
121 
122  // check eof Before reading next line since reading a valid
123  // line can trigger AtEOF to be true (and you don't want to
124  // ignore the line you just read)
125  bool at_eof = line_reader.AtEOF();
    // The ++line_reader in the for-init fetches the first data line; the
    // loop condition tests the EOF state sampled *before* that fetch.
126  for (++line_reader;!at_eof; ++line_reader) {
127  at_eof = line_reader.AtEOF();
128  ls = *line_reader;
    // Skip blank lines and '#' comment lines.
129  if (ls.empty() || (ls[0] == '#')) {
130  continue;
131  }
132  row_values.resize(0);
    // Unlike the column-name line, data lines are split on tab only and
    // without fSplit_Tokenize.
133  NStr::Split(ls, "\t", row_values);
134  if (row_values.size() == 0) {
135  continue;
136  }
137 
138  //attr_table.Resize(row+1, (unsigned)str_arr.size());
    // AddRow() is passed the running row counter.
139  attr_table.AddRow(row);
140 
141  for (size_t i = 0; i < row_values.size(); ++i) {
    // Values beyond the declared number of columns are reported once
    // for this line and the remainder of the line is dropped.
142  if (i >= attr_table.Cols()) {
143  ERR_POST("Attribute reader too many columns at line=" << line_reader.GetLineNumber());
144  break;
145  }
146  string& s = row_values[i];
    // Decode the value in place before storing it.
147  TableValueDecode(s);
148 
149  /*
150  if (i == (size_t)seq_id_idx) {
151  if (s.empty()) {
152  NCBI_THROW2(CObjReaderParseException, eFormat,
153  "BioTree attributes: seq-id attribute missing",
154  line_reader.GetLineNumber());
155  }
156  }
157  */
158 
    // Copy the decoded value into the table cell for (row, i).
159  string& tcell = attr_table.GetCell(row, (unsigned)i);
160  tcell = s;
161  } // for
162 
163  ++row;
164 
165  } // for
166  }}
167 
168 }
169 
171 {
    // Body of TableValueDecode(s): edits `s` in place, removing one leading
    // and one trailing double-quote character when present. The two quotes
    // are handled independently, so an unmatched quote at either end is
    // still removed.
    //
    // NOTE(review): listing line 172 (between the brace and the first `if`)
    // is not visible here; a statement may be missing -- presumably a
    // whitespace trim such as NStr::TruncateSpacesInPlace(s); confirm
    // against the repository.
173  if (s.empty()) return;
174 
    // Drop a leading quote; a lone '"' becomes empty and we are done.
175  if (s[0] == '"') {
176  s.erase(0, 1);
177  if (s.empty()) return;
178  }
    // `s` is known to be non-empty here, so length()-1 is a valid index.
179  if (s[s.length()-1] == '"') {
180  s.erase(s.length()-1, 1);
181  if (s.empty()) return;
182  }
183 }
184 
185 
187 {
    // Body of CheckLineReader(ilr): reacts to the line reader reporting EOF,
    // passing a fixed message and the reader's current line number.
    //
    // NOTE(review): the line that opens the statement inside the `if`
    // (listing line 189) is not visible here. The message + line-number
    // arguments match the NCBI_THROW2(exception_class, err_code, message,
    // extra) form, so this presumably throws a parse exception -- confirm.
188  if (ilr.AtEOF()) {
190  "Input stream no longer valid ",
191  ilr.GetLineNumber());
192  }
193 }
194 
Template class to create a table with custom row-column access.
Definition: ncbi_table.hpp:66
void AddColumn(const TColumn &col)
Add column to the table, column receives name "col".
Definition: ncbi_table.hpp:252
const TValueType & GetCell(unsigned int row_idx, unsigned int col_idx) const
Access table element by index.
Definition: ncbi_table.hpp:467
void AddRow(const TRow &row)
Add row to the table, row receives name "row".
Definition: ncbi_table.hpp:264
unsigned int Cols() const
Number of columns.
Definition: ncbi_table.hpp:246
Simple implementation of ILineReader for i(o)streams.
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
Abstract base class for lightweight line-by-line reading.
Definition: line_reader.hpp:54
The NCBI C++ standard methods for dealing with std::string.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
#define NCBI_THROW2(exception_class, err_code, message, extra)
Throw exception with extra parameter.
Definition: ncbiexpt.hpp:1754
void Read(CNcbiIstream &is, TAttrTable &attr_table)
Read attributes stream into the table.
void TableValueDecode(string &s)
void CheckLineReader(ILineReader &ilr)
Uint8 GetLineNumber(void) const
Returns the current line number (counting from 1, not 0).
bool AtEOF(void) const
Indicates (negatively) whether there is any more input.
virtual Uint8 GetLineNumber(void) const =0
Returns the current line number (counting from 1, not 0).
virtual bool AtEOF(void) const =0
Indicates (negatively) whether there is any more input.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3457
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3197
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2508
int i
Lightweight interface for getting lines of data with minimal memory copying.
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
Modified on Wed Feb 21 09:55:00 2024 by modify_doxy.py rev. 669887