NCBI C++ ToolKit
row_reader_iana_csv.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef UTIL___ROW_READER_IANA_CSV__HPP
2 #define UTIL___ROW_READER_IANA_CSV__HPP
3 
4 /* $Id: row_reader_iana_csv.hpp 84612 2018-11-21 14:24:48Z ucko $
5 * ===========================================================================
6 *
7 * PUBLIC DOMAIN NOTICE
8 * National Center for Biotechnology Information
9 *
10 * This software/database is a "United States Government Work" under the
11 * terms of the United States Copyright Act. It was written as part of
12 * the author's official duties as a United States Government employee and
13 * thus cannot be copyrighted. This software/database is freely available
14 * to the public for use. The National Library of Medicine and the U.S.
15 * Government have not placed any restriction on its use or reproduction.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Authors: Denis Vakatov, Sergey Satskiy
30 *
31 * File Description:
32 * Implementation of the CRowReader<> traits for IANA CSV
33 * https://tools.ietf.org/html/rfc4180
34 *
35 * ===========================================================================
36 */
37 
39 
40 
42 
43 
44 /// Note 1: Empty rows are allowed and silently skipped
45 /// Note 2: Both CRLF and LF are allowed
46 /// Note 3: Number of fields is not enforced
47 /// Note 4: The row with names does not appear in the iteration loop
48 
49 
50 /// Exception specific to IANA CSV sources
/// The row-reading code in this file reports eUnexpectedDoubleQuote for a
/// double quote in a disallowed position and eUnbalancedDoubleQuote when the
/// input cannot be read further while a quoted field is still open.
/// NOTE(review): the class head, the enumerators, the case labels and the
/// body of the default branch are not visible in this listing - confirm
/// against the real header.
52 {
53 public:
54  enum EErrCode {
57  };
58 
 /// Returns the symbolic name of the current error code (as obtained from
 /// GetErrCode()) as a C string.
59  virtual const char * GetErrCodeString(void) const override
60  {
61  switch (GetErrCode()) {
63  return "eUnbalancedDoubleQuote";
65  return "eUnexpectedDoubleQuote";
66  default:
68  }
69  }
70 
72 };
73 
74 
75 
76 /// IANA CSV traits.
77 /// It follows the specs at:
78 /// https://tools.ietf.org/html/rfc4180
79 /// @note by default the source is considered as the one which has a header
80 /// line. If your source does not feature a header then use the
81 /// CRowReaderStream_IANA_CSV_no_header traits or use the SetHasHeader()
82 /// member to switch between modes at runtime.
84 {
85 public:
88  {
 // x_ReadOneLine() stores either "\n" or "\r\n" in m_LineSeparator, so a
 // capacity of two characters covers the longest value.
 // NOTE(review): m_PreviousLineSeparator presumably holds a copy of an
 // earlier m_LineSeparator; its assignment is not visible in this listing.
89  m_LineSeparator.reserve(2);
90  m_PreviousLineSeparator.reserve(2);
91  }
92 
93  // There could be more than one raw line in one row: while a quoted field
 // is still open at the end of a raw line, the next raw line is read and
 // appended to the same row.
 //
 // Reads one row from `is` into `*data` (cleared first) and records in
 // m_Tokens the offset within `*data` at which every field begins.
 // Returns the number of raw lines consumed for this row.
 // Misplaced or unbalanced double quotes are reported with the codes
 // eUnexpectedDoubleQuote / eUnbalancedDoubleQuote.
 // NOTE(review): the opening lines of the three throw statements are not
 // visible in this listing; only their error codes and messages are.
94  size_t ReadRowData(CNcbiIstream& is, string* data)
95  {
96  data->clear();
97  m_Tokens.clear();
98 
 // All four variables live outside the read loop, so the scan state
 // survives when the row continues on another raw line.
99  size_t current_index= 0;
100  size_t token_begin_index = 0;
101  size_t lines_read = 0;
102  bool in_quotes = false;
103  for (;;) {
 // For every raw line after the first one the helper is called in
 // "joining" mode, i.e. it appends a line separator before the text.
104  x_ReadOneLine(is, data, lines_read > 0);
105  ++lines_read;
106 
 // current_index is not reset, so only the newly appended part of
 // *data is scanned on a continuation line.
107  while (current_index < data->size()) {
108  auto current_char = (*data)[current_index];
109  if (current_char == ',') {
 // A comma outside quotes ends the current field: store where
 // that field started; the next field starts after the comma.
 // A comma inside quotes is ordinary field content.
110  if (!in_quotes) {
111  m_Tokens.emplace_back(token_begin_index);
112  token_begin_index = current_index + 1;
113  }
114  } else if (current_char == '"') {
 // An opening quote is accepted only as the very first
 // character of a field.
115  if (!in_quotes && token_begin_index != current_index) {
117  eUnexpectedDoubleQuote,
118  "Unexpected double quote. "
119  "If a field is not quoted then a "
120  "double quote may not appear "
121  "in the middle.");
122  }
123  if (!in_quotes) {
124  in_quotes = true;
125  } else {
 // Inside a quoted field: two consecutive quotes are an
 // escaped quote, so step over the second one and stay in
 // quoted mode.
126  if (current_index + 1 < data->size() &&
127  (*data)[current_index + 1] == '"') {
128  ++current_index;
129  } else {
 // Otherwise this is the closing quote; it has to be the
 // last character read so far or be followed by a comma.
130  if (current_index + 1 < data->size() &&
131  (*data)[current_index + 1] != ',')
133  eUnexpectedDoubleQuote,
134  "Unexpected double quote. "
135  "Closing double quote must be the "
136  "last in a line or be followed "
137  "by a comma character");
138 
139  in_quotes = false;
140  }
141  }
142  }
143 
144  ++current_index;
145  }
146 
 // No quoted field is open: the row is complete.
147  if (!in_quotes)
148  break;
149 
150  // Here: need to read one more line because of the quotes.
151  // So check if we still can read.
152  if (!bool(is)) {
154  eUnbalancedDoubleQuote,
155  "Unbalanced double quote detected");
156  }
157  }
158 
 // The last field is not terminated by a comma, so its start offset is
 // stored here.
159  m_Tokens.push_back(token_begin_index);
160  return lines_read;
161  }
162 
164  {
 // Empty rows are skipped; every other row is processed as data.
 // NOTE(review): the signature line is not visible in this listing.
165  if (raw_line.empty())
166  return eRR_Skip;
167  return eRR_Continue_Data;
168  }
169 
170 
171  // The tokenization is actually done in the ReadRowData() member
 // which stores the start offset of every field in m_Tokens. This member
 // only turns those offsets into CTempString slices of raw_line and
 // appends them to `tokens` (the vector is not cleared here).
 // The slices are raw: a quoted field still includes its quotes, which
 // Translate() removes.
 // Returns eRR_Skip for the row used as the header, eRR_Continue_Data
 // otherwise.
 // NOTE(review): assumes raw_line is the same text ReadRowData() scanned,
 // otherwise the offsets in m_Tokens would not match - confirm against the
 // caller. The first line of the signature is not visible in this listing.
173  vector<CTempString>& tokens)
174  {
 // In header mode the row seen while the stream reports current line
 // number 0 supplies the field names and is not delivered as data.
175  if (m_HasHeader) {
176  if (GetMyStream().GetCurrentLineNo() == 0) {
177  x_SetFieldNames(raw_line);
178  return eRR_Skip;
179  }
180  }
181 
182  size_t field_size;
183  for (TFieldNo field_no = 0; field_no < m_Tokens.size(); ++field_no) {
 // Not the last field: it ends one character (the comma) before the
 // start of the next field. The last field runs to the end of raw_line.
184  if (field_no + 1 < m_Tokens.size())
185  field_size = m_Tokens[field_no + 1] - m_Tokens[field_no] - 1;
186  else
187  field_size = raw_line.size() - m_Tokens[field_no];
188  tokens.emplace_back(raw_line.data() + m_Tokens[field_no],
189  field_size);
190  }
191  return eRR_Continue_Data;
192  }
193 
195  ERR_FieldValidationMode field_validation_mode)
196  {
 // Validation callback for one row. Every visible path returns eRR_Skip.
 // NOTE(review): the head of the signature, the condition guarding the
 // second early return and the two validation calls inside the loop are
 // not visible in this listing - confirm against the real header.
197  if (field_validation_mode == eRR_NoFieldValidation)
198  return eRR_Skip;
200  return eRR_Skip;
201 
202  if (raw_line.empty())
203  return eRR_Skip;
204 
205  // Here: the field values need to be validated and there is some type
206  // information
207  m_ValidationTokens.clear();
208  ERR_Action action = this->Tokenize(raw_line, m_ValidationTokens);
209 
 // Tokenize() returns eRR_Skip for the header row; nothing to validate.
210  if (action == eRR_Skip)
211  return eRR_Skip;
212 
 // info.first is the field number, info.second the (type, props) pair
 // stored for that field in m_FieldsToValidate.
213  for (const auto& info : m_FieldsToValidate) {
 // Fields this row does not have are silently ignored.
214  if (info.first < m_Tokens.size()) {
215  string translated;
216  ERR_TranslationResult translation_result =
217  this->Translate((TFieldNo)info.first, m_ValidationTokens[info.first], translated);
 // Pass on the raw slice when Translate() left the value untouched,
 // the translated (unquoted) string otherwise.
218  if (translation_result == eRR_UseOriginal) {
220  m_ValidationTokens[info.first],
221  info.second.first, info.second.second);
222  } else {
224  translated, info.second.first, info.second.second);
225  }
226  }
227  }
228  return eRR_Skip;
229  }
230 
232  const CTempString raw_value,
233  string& translated_value)
234  {
 // A value that starts with a double quote is treated as a quoted field:
 // its first and last characters are dropped and every doubled quote is
 // replaced by a single one; the result goes to translated_value and
 // eRR_Translated is returned. Any other value (including an empty one)
 // is left alone and eRR_UseOriginal is returned.
 // NOTE(review): the first line of the signature is not visible in this
 // listing.
235  if (!raw_value.empty()) {
236  if (raw_value[0] == '"') {
 // NOTE(review): size() - 2 assumes at least two characters (opening
 // and closing quote); presumably guaranteed by the quote checks in
 // ReadRowData() - confirm for values coming from other callers.
237  translated_value = string(raw_value.data() + 1,
238  raw_value.size() - 2);
239  NStr::ReplaceInPlace(translated_value, "\"\"", "\"");
240  return eRR_Translated;
241  }
242  }
243  return eRR_UseOriginal;
244  }
245 
246  /// Tell if the source has a header. The new value will be taken into
247  /// account for the further sources which are read from the beginning.
 /// Tokenize() consults the flag only while the stream reports current line
 /// number 0; in that case the row supplies the field names and is skipped.
248  /// @param has_header
249  /// flag for the following data sources
250  void SetHasHeader(bool has_header)
251  { m_HasHeader = has_header; }
252 
254  ERR_EventMode event_mode)
255  {
 // Event callback of the traits.
 // NOTE(review): the head of the signature, the first case label, the
 // statement executed in validating mode, the statement under the
 // eRR_Event_SourceEnd label and the final return are not visible in this
 // listing - confirm against the real header before relying on the
 // comments here.
256  switch (event) {
 // Presumably discards the field information the traits supplied for the
 // previous source (judging by the callee's name only).
258  GetMyStream().x_ClearTraitsProvidedFieldsInfo();
259 
260  if (event_mode == eRR_EventMode_Validating)
262 
263  // fall through
264  case eRR_Event_SourceEnd:
266  default:
267  ;
268  }
270  }
271 
272 private:
 // Reads one raw line from `is` with std::getline() and appends its text to
 // `*data`. A trailing '\r' is removed from the line, and m_LineSeparator
 // records whether the line looked LF- or CRLF-terminated.
 // When `joining` is true, m_PreviousLineSeparator is appended to `*data`
 // before the line text.
 // NOTE(review): listing line 287 is not visible; presumably it saves
 // m_LineSeparator into m_PreviousLineSeparator for the next call - confirm.
273  void x_ReadOneLine(CNcbiIstream& is, string* data, bool joining)
274  {
275  m_RawLine.clear();
276  std::getline(is, m_RawLine);
 // Assume LF first; switch to CRLF if the line ends with a carriage return.
277  m_LineSeparator = "\n";
278  if(!m_RawLine.empty() && m_RawLine.back() == '\r') {
279  m_RawLine.pop_back();
280  m_LineSeparator = "\r\n";
281  }
282 
283  if (joining)
284  data->append(m_PreviousLineSeparator);
285  data->append(m_RawLine);
286 
288  }
289 
 // Registers the field names found in a header row.
 // Each field is sliced out of raw_line using the start offsets in m_Tokens,
 // passed through Translate() (which strips the quotes of a quoted name) and
 // handed to the stream via x_SetFieldName() under its field number.
290  void x_SetFieldNames(const CTempString& raw_line)
291  {
292  string translated;
293  size_t field_size;
294  for (TFieldNo field_no = 0; field_no < m_Tokens.size(); ++field_no) {
 // Not the last field: it ends one character (the comma) before the
 // start of the next field. The last field runs to the end of raw_line.
295  if (field_no + 1 < m_Tokens.size())
296  field_size = m_Tokens[field_no + 1] - m_Tokens[field_no] - 1;
297  else
298  field_size = raw_line.size() - m_Tokens[field_no];
299 
300  CTempString raw_field_name(raw_line.data() + m_Tokens[field_no],
301  field_size);
302 
 // Use the raw text when no translation happened, the translated string
 // otherwise.
303  if (Translate(field_no, raw_field_name, translated) == eRR_UseOriginal)
304  GetMyStream().x_SetFieldName(field_no,
305  string(raw_field_name.data(),
306  raw_field_name.size()));
307  else
308  GetMyStream().x_SetFieldName(field_no, translated);
309  }
310  }
311 
313  {
 // Fills m_FieldsToValidate from the stream's field meta info: for every
 // field whose type is initialized and is boolean, integer, double or
 // date/time, the (type, props) pair is stored under the field number.
 // NOTE(review): the signature of this member and its first statement are
 // not visible in this listing - confirm against the real header.
315  for (const auto& info : GetMyStream().GetFieldsMetaInfo()) {
316  if (info.is_type_initialized) {
317  auto field_type = info.type.GetType();
318  if (field_type == eRR_Boolean || field_type == eRR_Integer ||
319  field_type == eRR_Double || field_type == eRR_DateTime)
320  m_FieldsToValidate[info.field_no] =
321  make_pair(field_type, info.type.GetProps());
322  }
323  }
324  }
325 
326 private:
328  vector<size_t> m_Tokens;
331  string m_RawLine;
332 
334  vector<CTempString> m_ValidationTokens;
335 
337 };
338 
339 
340 
341 /// IANA CSV traits which by default treat the source as having no
342 /// header (can be switched at runtime using the SetHasHeader() member).
343 /// It follows the specs at:
344 /// https://tools.ietf.org/html/rfc4180
/// NOTE(review): the class head and the constructor signature are not
/// visible in this listing.
346 {
347 public:
 // The only visible difference from the default mode: the header flag is
 // switched off on construction.
349  { SetHasHeader(false); }
350 
351 private:
353 };
354 
355 
357 
358 #endif /* UTIL___ROW_READER_IANA_CSV__HPP */
#define true
Definition: bool.h:35
Note 1: Empty rows are allowed and silently skipped Note 2: Both CRLF and LF are allowed Note 3: Numb...
NCBI_EXCEPTION_DEFAULT(CCRowReaderStream_IANA_CSV_Exception, CException)
virtual const char * GetErrCodeString(void) const override
Get error code interpreted as text.
static void ValidateBasicTypeFieldValue(const CTempString &str_value, ERR_FieldType field_type, const string &props)
Definition: row_reader.inl:467
IANA CSV traits which by default consider the source as the one without a header (can be switched at ...
RR_TRAITS_PARENT_STREAM(CRowReaderStream_IANA_CSV_no_header)
vector< CTempString > m_ValidationTokens
RR_TRAITS_PARENT_STREAM(CRowReaderStream_IANA_CSV)
ERR_TranslationResult Translate(TFieldNo, const CTempString raw_value, string &translated_value)
ERR_EventAction OnEvent(ERR_Event event, ERR_EventMode event_mode)
ERR_Action Validate(CTempString raw_line, ERR_FieldValidationMode field_validation_mode)
void x_ReadOneLine(CNcbiIstream &is, string *data, bool joining)
void x_SetFieldNames(const CTempString &raw_line)
map< size_t, pair< ERR_FieldType, string > > m_FieldsToValidate
void SetHasHeader(bool has_header)
Tell if the source has a header.
ERR_Action OnNextLine(CTempString raw_line)
ERR_Action Tokenize(const CTempString raw_line, vector< CTempString > &tokens)
size_t ReadRowData(CNcbiIstream &is, string *data)
Partial specialization of the CRowReaderStream_CharDelimited<...> template for the case when the data...
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
bool empty() const
Definition: map.hpp:149
void clear()
Definition: map.hpp:169
Definition: map.hpp:338
string
Definition: cgiapp.hpp:687
TErrCode GetErrCode(void) const
Get error code.
Definition: ncbiexpt.cpp:453
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
EErrCode
Error types that an application can generate.
Definition: ncbiexpt.hpp:884
virtual const char * GetErrCodeString(void) const
Get error code interpreted as text.
Definition: ncbiexpt.cpp:444
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
const char * data(void) const
Return a pointer to the array represented.
Definition: tempstr.hpp:313
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
Definition: tempstr.hpp:334
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3401
size_type size(void) const
Return the length of the represented array.
Definition: tempstr.hpp:327
static MDB_envinfo info
Definition: mdb_load.c:37
const struct ncbi::grid::netcache::search::fields::SIZE size
Uint4 TFieldNo
Field number (zero based)
Definition: row_reader.hpp:53
ERR_FieldValidationMode
Whether to check validity of the fields (names and/or values)
Definition: row_reader.hpp:117
@ eRR_NoFieldValidation
don't validate fields' value and name
Definition: row_reader.hpp:118
@ eRR_Double
double
Definition: row_reader.hpp:66
@ eRR_DateTime
CTime.
Definition: row_reader.hpp:67
@ eRR_Integer
int
Definition: row_reader.hpp:65
@ eRR_Boolean
bool
Definition: row_reader.hpp:64
ERR_TranslationResult
The Translate() callback result. It is used to translate field values.
Definition: row_reader.inl:71
@ eRR_UseOriginal
No translation done.
Definition: row_reader.inl:72
@ eRR_Translated
The value has been translated to another string.
Definition: row_reader.inl:73
ERR_Action
Delimited stream traits use the ERR_Action members to instruct what should be done next.
Definition: row_reader.inl:48
@ eRR_Continue_Data
Continue processing this line, in full.
Definition: row_reader.inl:52
@ eRR_Skip
Skip this line.
Definition: row_reader.inl:50
ERR_EventAction
How to react to the potentially disruptive events.
Definition: row_reader.inl:110
@ eRR_EventAction_Default
Do some default action.
Definition: row_reader.inl:111
ERR_EventMode
Indicate whether the "ERR_Event" event (passed to the OnEvent() callback) occurred during regular read...
Definition: row_reader.inl:102
@ eRR_EventMode_Validating
We are performing data validation.
Definition: row_reader.inl:104
ERR_Event
CRowReader passes such events to the Traits via OnEvent() callback.
Definition: row_reader.inl:92
@ eRR_Event_SourceEnd
Data source has hit EOF.
Definition: row_reader.inl:95
@ eRR_Event_SourceBegin
Data source has started or been switched (no reads yet though).
Definition: row_reader.inl:93
@ eRR_Event_SourceError
Data source has hit an error on read.
Definition: row_reader.inl:96
Modified on Sat Dec 02 09:20:39 2023 by modify_doxy.py rev. 669887