NCBI C++ ToolKit
row_reader_excel_csv.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef UTIL___ROW_READER_EXCEL_CSV__HPP
2 #define UTIL___ROW_READER_EXCEL_CSV__HPP
3 
4 /* $Id: row_reader_excel_csv.hpp 82351 2018-05-23 12:13:23Z ivanov $
5 * ===========================================================================
6 *
7 * PUBLIC DOMAIN NOTICE
8 * National Center for Biotechnology Information
9 *
10 * This software/database is a "United States Government Work" under the
11 * terms of the United States Copyright Act. It was written as part of
12 * the author's official duties as a United States Government employee and
13 * thus cannot be copyrighted. This software/database is freely available
14 * to the public for use. The National Library of Medicine and the U.S.
15 * Government have not placed any restriction on its use or reproduction.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Authors: Denis Vakatov, Sergey Satskiy
30 *
31 * File Description:
32 * Implementation of the CRowReader<> traits for MS EXCEL CSV
33 *
34 * ===========================================================================
35 */
36 
38 
39 
41 
42 
43 /// Note 1: Empty rows are allowed and treated as 0 fields rows
44 /// Note 2: Both CRLF and LF are allowed
45 /// Note 3: Number of fields is not enforced
46 /// Note 4: There is no formal MS Excel CSV spec. So the implementation is
47 /// based on experiments made on MS Excel 2013.
48 /// See the description in JIRA: CXX-9221
49 /// Note 5: Two field cases in a data source
50 /// - empty, i.e. ,,
51 /// - "", i.e. ,"",
52 /// are translated to a Null field
53 /// Note 6: trailing Null fields in a data source are stripped
54 
55 
57 
58 
59 /// MS Excel CSV traits.
61 {
62 public:
64  {
     // Pre-size both separator strings for the longest separator this class
     // ever stores in them ("\r\n", see x_ReadOneLine()).
65  m_LineSeparator.reserve(2);
66  m_PreviousLineSeparator.reserve(2);
67  }
68 
69  // It could be more than one raw line in one row
     // Reads one logical CSV row from 'is' into '*data' and records in m_Tokens
     // the offset within '*data' at which each field starts. A field that is
     // still inside double quotes at the end of a raw line continues on the
     // next raw line, so several raw lines may be joined into one row.
     // Returns the number of raw lines consumed for this row.
70  size_t ReadRowData(CNcbiIstream& is, string* data)
71  {
72  data->clear();
73  m_Tokens.clear();
74 
75  size_t current_index= 0;
76  size_t token_begin_index = 0;
77  size_t lines_read = 0;
78  bool in_quotes = false;
79  for (;;) {
     // Append the next raw line to '*data'; for every line after the first
     // one x_ReadOneLine() is asked to join (third argument is true).
80  x_ReadOneLine(is, data, lines_read > 0);
81  ++lines_read;
82 
     // current_index is not reset between raw lines, so only the characters
     // appended by the latest x_ReadOneLine() call are scanned here.
83  while (current_index < data->size()) {
84  auto current_char = (*data)[current_index];
85  if (current_char == ',') {
     // A comma ends a field only when it is outside of a quoted field.
86  if (!in_quotes) {
87  m_Tokens.emplace_back(token_begin_index);
88  token_begin_index = current_index + 1;
89  }
90  } else if (current_char == '"') {
     // A double quote opens a quoted field only when it is the very first
     // character of the field.
91  if (token_begin_index == current_index) {
92  in_quotes = true;
93  } else {
     // A double quote seen outside of a quoted field is left alone.
94  if (in_quotes) {
     // Inside a quoted field "" is an escaped quote: step over the second
     // character of the pair. A single " closes the quoted field.
95  if (current_index + 1 < data->size() &&
96  (*data)[current_index + 1] == '"') {
97  ++current_index;
98  } else {
99  in_quotes = false;
100  }
101  }
102  }
103  }
104 
105  ++current_index;
106  }
107 
     // All quotes are closed => the row is complete.
108  if (!in_quotes)
109  break;
110 
111  // Here: need to read one more line because of the double quotes.
112  // So check if we still can read.
113  if (!bool(is))
114  break;
115  }
116 
     // The last field has no terminating comma, so its start is recorded here.
     // As a result m_Tokens holds at least one offset even for an empty row.
117  m_Tokens.push_back(token_begin_index);
118  return lines_read;
119  }
120 
122  {
     // Unconditionally asks for the row to be processed; no row is skipped at
     // this stage.
123  return eRR_Continue_Data;
124  }
125 
126 
127  // The tokenization is actually done in the ReadRowData() member
     // This member only turns the field start offsets kept in m_Tokens into
     // CTempString views over 'raw_line' and appends them to 'tokens'
     // ('tokens' is not cleared here). Always returns eRR_Continue_Data.
     // NOTE(review): this relies on 'raw_line' being the same row data that
     // ReadRowData() scanned when it filled m_Tokens -- confirm with the caller.
129  vector<CTempString>& tokens)
130  {
131  // Special case in accordance with CXX-9221: empty line => no fields
132  if (!raw_line.empty()) {
133  size_t field_size;
134  for (TFieldNo field_no = 0;
135  field_no < m_Tokens.size(); ++field_no) {
     // A field which is not the last one ends right before the comma that
     // precedes the next field, hence the extra "- 1".
136  if (field_no + 1 < m_Tokens.size())
137  field_size = m_Tokens[field_no + 1] - m_Tokens[field_no] - 1;
138  else
     // The last field extends to the end of the row.
139  field_size = raw_line.size() - m_Tokens[field_no];
140  tokens.emplace_back(raw_line.data() + m_Tokens[field_no],
141  field_size);
142  }
143 
     // NOTE(review): original line 144 is missing from this listing;
     // presumably it is the x_StripTrailingNullFields(tokens) call which
     // implements Note 6 of the class comments -- confirm against the header.
145  }
146  return eRR_Continue_Data;
147  }
148 
150  ERR_FieldValidationMode field_validation_mode)
151  {
     // Validates the values of the fields listed in m_FieldsToValidate for
     // one row. Every visible path returns eRR_Skip.
     // NOTE(review): original lines 154, 174 and 178 are missing from this
     // listing (a second early-exit condition and the heads of the two
     // validation calls); they are intentionally left as they are here.
152  if (field_validation_mode == eRR_NoFieldValidation)
153  return eRR_Skip;
155  return eRR_Skip;
156 
157  if (raw_line.empty())
158  return eRR_Skip;
159 
160  // Here: the field values need to be validated and there is some type
161  // information
162  m_ValidationTokens.clear();
163  ERR_Action action = this->Tokenize(raw_line, m_ValidationTokens);
164 
165  if (action == eRR_Skip)
166  return eRR_Skip;
167 
168  for (const auto& info : m_FieldsToValidate) {
     // The bound must come from m_ValidationTokens because that is the
     // container indexed below. It may hold fewer entries than m_Tokens
     // (trailing Null fields are stripped, see Note 6 in the class comments),
     // so checking against m_Tokens.size() allowed an out-of-range access.
169  if (info.first < m_ValidationTokens.size()) {
170  string translated;
171  ERR_TranslationResult translation_result =
172  this->Translate((TFieldNo)info.first, m_ValidationTokens[info.first], translated);
     // Validate the raw token when no translation was done, otherwise
     // validate the translated value.
173  if (translation_result == eRR_UseOriginal) {
175  m_ValidationTokens[info.first],
176  info.second.first, info.second.second);
177  } else {
179  translated, info.second.first, info.second.second);
180  }
181  }
182  }
183  return eRR_Skip;
184  }
185 
187  const CTempString raw_value,
188  string& translated_value)
189  {
     // Converts a raw field into its value. Rules implemented below:
     //  - Null field (see x_IsNull())        -> eRR_Null
     //  - =text, no double quotes at all     -> text
     //  - ="text", even number of quotes     -> text, "" collapsed into "
     //  - any other value starting with =    -> eRR_UseOriginal
     //  - "text"tail                         -> text ("" collapsed into ")
     //                                          followed by tail; if the result
     //                                          has the ="..." form the wrapper
     //                                          is removed too
     //  - "text without a closing quote      -> text
     //  - anything else                      -> eRR_UseOriginal
     // 'translated_value' is assigned only when eRR_Translated is returned.
190  if (x_IsNull(raw_value))
191  return eRR_Null;
192 
     // Here: raw_value is not empty (x_IsNull() is true for an empty value),
     // so raw_value[0] is a valid access.
193  if (raw_value[0] == '=') {
194  size_t dbl_quote_cnt = 0;
195  for (size_t index = 0; index < raw_value.size(); ++index)
196  if (raw_value[index] == '"')
197  ++dbl_quote_cnt;
198 
199  if (dbl_quote_cnt == 0) {
     // Plain =text: drop the leading =
200  translated_value = string(raw_value.data() + 1,
201  raw_value.size() - 1);
202  return eRR_Translated;
203  }
204 
205  // Here: there are " in the field. They may need to be stripped
206  // together with = if:
207  // - " follows = immediately
208  // - " is the last character in a field
209  // - there is an even number of "
210  // If so then "" need to be translated into " inside the field
211  // as well
212  if (dbl_quote_cnt % 2 == 0) {
213  if (raw_value[1] == '"' &&
214  raw_value[raw_value.size() - 1] == '"') {
215  // Balanced double quote and properly surround the field
216  // value => strip the = and surrounding " plus replace
217  // "" with "
     // Skip 2 characters (= and the opening ") and drop 3 in total
     // (those two plus the closing ").
218  translated_value = string(raw_value.data() + 2,
219  raw_value.size() - 3);
220  NStr::ReplaceInPlace(translated_value, "\"\"", "\"");
221  return eRR_Translated;
222  }
223  }
224 
225  // Non balanced double quotes or they are not surrounding the
226  // value after =
227  // There is no translation for this case
228  return eRR_UseOriginal;
229  }
230 
231  if (raw_value[0] == '"') {
     // Look for the quote which closes the one at index 0; a "" pair is an
     // escaped quote and is stepped over.
232  size_t match_index = 1;
233  for (; match_index < raw_value.size(); ++match_index) {
234  if (raw_value[match_index] == '"') {
235  if (match_index + 1< raw_value.size() &&
236  raw_value[match_index + 1] == '"')
237  ++match_index;
238  else
239  break;
240  }
241  }
242 
243  // Here: match_index points beyond the end of the field or to a
244  // matching "
245  if (match_index < raw_value.size()) {
246  // matching " found
247  translated_value = string(raw_value.data() + 1,
248  match_index - 1);
249  NStr::ReplaceInPlace(translated_value, "\"\"", "\"");
250  if (match_index < raw_value.size() - 1) {
251  // tail of the field needs to be attached as is
252  translated_value.append(
253  raw_value.data() + match_index + 1,
254  raw_value.size() - match_index - 1);
255  }
256  } else {
257  // Unbalanced " case
     // Only the opening " is dropped; the rest is taken as is.
258  translated_value = string(raw_value.data() + 1,
259  raw_value.size() - 1);
260  }
261 
262  // This could be a case with a leading = which may need to be
263  // stripped as well...
264  if (!translated_value.empty()) {
265  if (translated_value[0] == '=') {
266  size_t dbl_quote_cnt = 0;
267  for (size_t index = 0;
268  index < translated_value.size(); ++index)
269  if (translated_value[index] == '"')
270  ++dbl_quote_cnt;
271 
     // A non-zero even count implies at least three characters here
     // (= plus two "), so index 1 and the size() - 3 length are in range.
272  if (dbl_quote_cnt > 0 && (dbl_quote_cnt % 2 == 0)) {
273  if (translated_value[1] == '"' &&
274  translated_value[translated_value.size() - 1] == '"') {
275  translated_value = translated_value.substr(2, translated_value.size() - 3);
276  }
277  }
278  }
279  }
280 
281  return eRR_Translated;
282  }
283  return eRR_UseOriginal;
284  }
285 
287  ERR_EventMode event_mode)
288  {
     // NOTE(review): this listing lacks original lines 286, 290, 294, 298 and
     // 302 of the handler (apparently the first line of the signature, a case
     // label, the statement guarded by the validating-mode check below, and
     // the statements after the last case label / the switch). Consult the
     // real header before changing anything here.
289  switch (event) {
291  GetMyStream().x_ClearTraitsProvidedFieldsInfo();
292 
293  if (event_mode == eRR_EventMode_Validating)
295 
296  // fall through
297  case eRR_Event_SourceEnd:
299  default:
300  ;
301  }
303  }
304 
305 private:
306  void x_ReadOneLine(CNcbiIstream& is, string* data, bool joining)
307  {
     // Reads one raw line from 'is' and appends it, without its line
     // terminator, to '*data'. When 'joining' is true the content of
     // m_PreviousLineSeparator is appended before the line.
     // The result of std::getline() is not checked here; m_RawLine is cleared
     // up front, presumably so that a failed read contributes an empty line.
308  m_RawLine.clear();
309  std::getline(is, m_RawLine);
     // getline() has consumed the '\n'; a trailing '\r' means the line was
     // terminated with CRLF, so drop it and remember the separator used.
310  m_LineSeparator = "\n";
311  if(!m_RawLine.empty() && m_RawLine.back() == '\r') {
312  m_RawLine.pop_back();
313  m_LineSeparator = "\r\n";
314  }
315 
316  if (joining)
317  data->append(m_PreviousLineSeparator);
318  data->append(m_RawLine);
319 
     // NOTE(review): original line 320 is missing from this listing;
     // presumably it stores m_LineSeparator into m_PreviousLineSeparator for
     // the next joining call -- confirm against the real header.
321  }
322 
324  {
     // Fills m_FieldsToValidate: for every field whose type has been set and
     // is one of boolean, integer, double or date/time, the field number is
     // mapped to the pair (field type, type properties).
     // NOTE(review): original line 325 is missing from this listing --
     // check the real header for what precedes the loop.
326  for (const auto& info : GetMyStream().GetFieldsMetaInfo()) {
327  if (info.is_type_initialized) {
328  auto field_type = info.type.GetType();
329  if (field_type == eRR_Boolean || field_type == eRR_Integer ||
330  field_type == eRR_Double || field_type == eRR_DateTime)
331  m_FieldsToValidate[info.field_no] =
332  make_pair(field_type, info.type.GetProps());
333  }
334  }
335  }
336 
337  bool x_IsNull(const CTempString& raw_field_value)
338  {
339  return raw_field_value.empty() ||
340  (raw_field_value == kNullFieldRepresentation);
341  }
342 
343  void x_StripTrailingNullFields(vector<CTempString>& tokens)
344  {
345  while (!tokens.empty()) {
346  if (x_IsNull(tokens.back()))
347  tokens.pop_back();
348  else
349  break;
350  }
351  }
352 
353 private:
354  vector<size_t> m_Tokens;
357  string m_RawLine;
358 
360  vector<CTempString> m_ValidationTokens;
361 
363 };
364 
365 
366 
368 
369 #endif /* UTIL___ROW_READER_EXCEL_CSV__HPP */
static void ValidateBasicTypeFieldValue(const CTempString &str_value, ERR_FieldType field_type, const string &props)
Definition: row_reader.inl:467
ERR_TranslationResult Translate(TFieldNo, const CTempString raw_value, string &translated_value)
map< size_t, pair< ERR_FieldType, string > > m_FieldsToValidate
ERR_Action OnNextLine(CTempString)
ERR_EventAction OnEvent(ERR_Event event, ERR_EventMode event_mode)
ERR_Action Tokenize(const CTempString raw_line, vector< CTempString > &tokens)
void x_ReadOneLine(CNcbiIstream &is, string *data, bool joining)
bool x_IsNull(const CTempString &raw_field_value)
RR_TRAITS_PARENT_STREAM(CRowReaderStream_Excel_CSV)
vector< CTempString > m_ValidationTokens
void x_StripTrailingNullFields(vector< CTempString > &tokens)
size_t ReadRowData(CNcbiIstream &is, string *data)
ERR_Action Validate(CTempString raw_line, ERR_FieldValidationMode field_validation_mode)
Partial specialization of the CRowReaderStream_CharDelimited<...> template for the case when the data...
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
bool empty() const
Definition: map.hpp:149
void clear()
Definition: map.hpp:169
Definition: map.hpp:338
char data[12]
Definition: iconv.c:80
string
Definition: cgiapp.hpp:687
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
const char * data(void) const
Return a pointer to the array represented.
Definition: tempstr.hpp:313
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
Definition: tempstr.hpp:334
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3405
size_type size(void) const
Return the length of the represented array.
Definition: tempstr.hpp:327
static MDB_envinfo info
Definition: mdb_load.c:37
const struct ncbi::grid::netcache::search::fields::SIZE size
Uint4 TFieldNo
Field number (zero based)
Definition: row_reader.hpp:53
ERR_FieldValidationMode
Whether to check validity of the fields (names and/or values)
Definition: row_reader.hpp:117
@ eRR_NoFieldValidation
don't validate fields' value and name
Definition: row_reader.hpp:118
@ eRR_Double
double
Definition: row_reader.hpp:66
@ eRR_DateTime
CTime.
Definition: row_reader.hpp:67
@ eRR_Integer
int
Definition: row_reader.hpp:65
@ eRR_Boolean
bool
Definition: row_reader.hpp:64
ERR_TranslationResult
The Translate() callback result. It is used to translate field values.
Definition: row_reader.inl:71
@ eRR_UseOriginal
No translation done.
Definition: row_reader.inl:72
@ eRR_Translated
The value has been translated to another string.
Definition: row_reader.inl:73
@ eRR_Null
The value has been translated to NULL.
Definition: row_reader.inl:74
ERR_Action
Delimited stream traits use the ERR_Action members to instruct what should be done next.
Definition: row_reader.inl:48
@ eRR_Continue_Data
Continue processing this line, in full.
Definition: row_reader.inl:52
@ eRR_Skip
Skip this line.
Definition: row_reader.inl:50
ERR_EventAction
How to react to the potentially disruptive events.
Definition: row_reader.inl:110
@ eRR_EventAction_Default
Do some default action.
Definition: row_reader.inl:111
ERR_EventMode
Indicate whether the "ERR_Event" event (passed to the OnEvent() callback) occured during regular read...
Definition: row_reader.inl:102
@ eRR_EventMode_Validating
We are performing data validation.
Definition: row_reader.inl:104
ERR_Event
CRowReader passes such events to the Traits via OnEvent() callback.
Definition: row_reader.inl:92
@ eRR_Event_SourceEnd
Data source has hit EOF.
Definition: row_reader.inl:95
@ eRR_Event_SourceBegin
Data source has started or been switched (no reads yet though).
Definition: row_reader.inl:93
@ eRR_Event_SourceError
Data source has hit an error on read.
Definition: row_reader.inl:96
const CTempString kNullFieldRepresentation
Note 1: Empty rows are allowed and treated as 0 fields rows Note 2: Both CRLF and LF are allowed Note...
Modified on Sun Jun 23 05:16:41 2024 by modify_doxy.py rev. 669887