NCBI C++ ToolKit
table_import_data_source.cpp

1 /* $Id: table_import_data_source.cpp 47485 2023-05-02 14:46:59Z ucko $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Bob Falk
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 
35 #include <corelib/ncbistre.hpp>
36 #include <corelib/ncbifile.hpp>
37 
40 
54 
55 #include <functional>
56 #include <math.h>
57 #include <sstream>
58 
59 
61 
63 
64 /*****************************************************************************/
65 /*************************** CCharHistogram **********************************/
66 
68 : m_RowCount(0), m_CharCount(0), m_CaptureTarget(100)
69 {
70  // Initialize objects used to check for repeated characters or
71  // character combinations in table entries.
72  CMergedChar space_repeat(256);
73  space_repeat.m_Chars = " ";
74  m_Repeats.push_back(space_repeat);
75 
76  CMergedChar tab_repeat(257);
77  tab_repeat.m_Chars = "\t";
78  m_Repeats.push_back(tab_repeat);
79 
80  CMergedChar whitespace_repeat(258);
81  whitespace_repeat.m_Chars = " \t";
82  m_Repeats.push_back(whitespace_repeat);
83 }
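// Note: slots 0..255 of the per-row frequency counts correspond to single
// ASCII characters; the CMergedChar entries above reserve the extra slots
// 256 (blank runs), 257 (tab runs) and 258 (mixed blank/tab runs), which
// lets repeated whitespace compete as a "merged" delimiter candidate
// alongside the individual characters.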
84 
86 {
87  std::vector<int> buf(s_NumDelimiters, 0);
88  size_t i, j;
89 
90  if (row.size() >= m_CharFrequency.size())
91  x_UpdateMaxRowLen((int)row.size());
92 
93 
94  // Add to the total number of characters in all rows (used to compute
95  // average row length)
96  m_CharCount += (int) row.size();
97  // Update total number of rows used in computing the statistics
98  ++m_RowCount;
99 
100  // Initialize checks for repeated characters
101  for (j=0; j<m_Repeats.size(); ++j) {
102  m_Repeats[j].NewLine();
103  }
104 
105  // Count the number of occurrences of each character in row "row" and
106  // save them in 'buf' - only consider ASCII characters
107  for (i=0; i<row.size(); ++i) {
108  unsigned int idx = (unsigned int) row[i];
109  if (idx <256) {
110  buf[idx] += 1;
111 
112  // perform check for repeated characters
113  for (j=0; j<m_Repeats.size(); ++j) {
114  if (m_Repeats[j].NextChar(row[i]))
115  buf[m_Repeats[j].m_FreqArrayIdx] += 1;
116  }
117  }
118  }
119 
120 
121  // Given the number of times each character appears in the current
122  // row, update the corresponding row occurrence count for that character
123  for (i=0; i<(size_t)s_NumDelimiters; ++i) {
124  int occurances = buf[i];
125  m_CharFrequency[occurances][i] += 1;
126  }
127 
128  /// Record some rows in case the initial delimiter analysis gives
129  /// more than one option. We might want to extend this to randomly
130  /// updating from different parts of the stream (after accumulating the
131  /// first 'm_CaptureTarget' rows).
132  if (m_RecordedRows.size() < m_CaptureTarget)
133  m_RecordedRows.push_back(row);
134 }
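// Layout reminder: m_CharFrequency[n][c] counts the rows in which character
// (or merged-character slot) 'c' occurred exactly 'n' times. For example,
// after updating with the two rows
//     a,b,c
//     d,e
// the ',' column holds m_CharFrequency[2][','] == 1 and
// m_CharFrequency[1][','] == 1.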
135 
136 void CCharHistogram::UpdateHistogramWithDelim(const string& row, char delim)
137 {
138  std::vector<int> buf(s_NumDelimiters, 0);
139  size_t i, j;
140 
141  if (row.length() > m_CharFrequency.size())
142  x_UpdateMaxRowLen((int)row.length());
143 
144  // Add to the total number of characters in all rows (used to compute
145  // average row length)
146  m_CharCount += (int) row.size();
147  // Update total number of rows used in computing the statistics
148  ++m_RowCount;
149 
150  // Initialize checks for repeated characters
151  for (j=0; j<m_Repeats.size(); ++j) {
152  m_Repeats[j].NewLine();
153  }
154 
155  // Update occurrence counts for all characters not enclosed in the quote character 'delim'
156  bool quoted = false;
157  for (i=0; i<row.size(); ++i) {
158 
159  /// Start quoted string
160  if (!quoted && row[i] == delim) {
161  quoted = true;
162  continue;
163  }
164 
165  // In a quoted string, a new quote mark ends it unless it is
166  // escaped. In CSV format, quotes are escaped by doubling them, e.g.
167  // "". If quotes are escaped in another way, e.g. \" we will not
168  // recognize them.
169  if (quoted && row[i] == delim) {
170  if (i+1 < row.size() && row[i+1] == '"')
171  ++i;
172  else quoted = false;
173  continue;
174  }
175 
176  if (!quoted) {
177 
178  unsigned int idx = (unsigned int) row[i];
179  if (idx < 256) {
180  buf[idx] += 1;
181 
182  // perform check for repeated characters
183  for (j=0; j<m_Repeats.size(); ++j) {
184  if (m_Repeats[j].NextChar(row[i]))
185  buf[m_Repeats[j].m_FreqArrayIdx] += 1;
186  }
187  }
188  }
189  }
190 
191 
192  for (i=0; i<(size_t)s_NumDelimiters; ++i) {
193  int occurances = buf[i];
194  m_CharFrequency[occurances][i] += 1;
195  }
196 
197  /// Record some rows in case the initial delimiter analysis gives
198  /// more than one option. We might want to extend this to randomly
199  /// updating from different parts of the stream (after accumulating the
200  /// first 'm_CaptureTarget' rows).
201  if (m_RecordedRows.size() < m_CaptureTarget)
202  m_RecordedRows.push_back(row);
203 }
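// Example: with 'delim' set to '"', the row
//     one,"two, three",four
// contributes only two ',' occurrences to the histogram, since the comma
// inside the quoted section (and the quote characters themselves) are
// skipped.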
204 
206 {
207  while (m_CharFrequency.size() <= (size_t)len) {
208  vector<int> zerovec(s_NumDelimiters, 0);
209  m_CharFrequency.push_back(zerovec);
210  }
211 }
212 
214  char to_char,
215  float& certainty,
216  int& first_non_header) const
217 {
218  certainty = 0.0f;
219  first_non_header = -1;
220 
221  // Need some data to analyze...
222  if (m_InitialRows.size() < 5) {
223  return;
224  }
225 
226  float num_rows = (float)m_RowCount;
227 
228  // average number of occurrences of each character over selected rows
229  int num_chars = to_char - from_char + 1;
230  vector<float> mean_occurances(num_chars, 0.0f);
231  vector<char> chars(num_chars, ' ');
232 
233  // For each character, determine how clustered it is (determine if the majority
234  // of rows have the same number of occurrences of a character).
235  size_t i;
236  size_t j;
237  size_t k;
238  for (i=(size_t)from_char; i<=(size_t)to_char; ++i) {
239  // Compute mean number of occurrences for the current character (the average
240  // number of times the character appears in a row over all rows)
241  for (j=0; j<m_CharFrequency.size(); ++j) {
242  // Number of occurrences in a row (j) * number of rows that have that
243  // number of occurrences (m_CharFrequency[j][i]) divided by total
244  // number of rows (num_rows).
245  mean_occurances[i-from_char] += (((float)j)*((float)m_CharFrequency[j][i]))/num_rows;
246  }
247  chars[i-from_char] = (char)i;
248  }
249 
250  // Determine how close all occurrences are to the mean (the best result
251  // is when all rows have the same number of occurrences) for the current ASCII
252  // character 'i'. Start at 1 to ignore (penalize) rows with 0 occurrences
253  vector<float> mean_delta;
254  for (i=0; i<m_InitialRows.size(); ++i) {
255  /// Find number of occurrences for (from_char..to_char) in this row
256  vector<float> row_occurances(num_chars, 0.0f);
257 
258  for (j=0; j<m_InitialRows[i].size(); ++j) {
259  char c = m_InitialRows[i][j];
260  if (c>=from_char && c<=to_char) {
261  row_occurances[c-from_char] += 1.0f;
262  }
263  }
264 
265  // This would be better weighted if we took into account the
266  // distribution of occurrences by character for non-header
267  // rows. (e.g. if a character appears a consistent # of times
268  // in non-header rows, weight that more heavily).
269  float abs_delta = 0.0f;
270  for (k=0; k<(size_t)num_chars; ++k) {
271  float delta = row_occurances[k] - mean_occurances[k];
272  if (delta < 0.0f)
273  delta *= -1.0f;
274  abs_delta += delta;
275  }
276  //_TRACE("Row: " << i << " delta: " << abs_delta);
277 
278  // get delta as an absolute %
279  mean_delta.push_back(abs_delta);
280  }
281 
282  //*************************************************************************
283  // This is good for weighting deltas of individual characters, but we
284  // could also look at occurrences of groups, specifically numeric vs non-
285  // numeric
286  //*************************************************************************
287 
288  // Find range of normal deltas in score for lines in second half of
289  // m_InitialRows (which should all be non-headers)
290  float max_normal_delta = 0.0f;
291  float avg_row_len = 0.0f;
292  float avg_row_count = 0;
293 
294  for (j=mean_delta.size()-1; j>=mean_delta.size()/2; --j) {
295  if (mean_delta[j] > max_normal_delta) {
296  max_normal_delta = mean_delta[j];
297  }
298  avg_row_len += (float)m_InitialRows[j].size();
299  avg_row_count += 1.0f;
300  }
301 
302  avg_row_len /= avg_row_count;
303 
304  // Find first row at which row scores become similar to the 'normal' rows.
305  // Check two rows for consistency.
306  for (j=0; j<mean_delta.size()-2; ++j) {
307  // Get delta row lengths of current row with average row as an absolute % (0..1)
308  float row_len_delta = 1.0f - ((float)m_InitialRows[j].size())/avg_row_len;
309  if (row_len_delta < 0.0f)
310  row_len_delta *= -1.0f;
311 
312  float row_len_delta1 = 1.0f - ((float)m_InitialRows[j+1].size())/avg_row_len;
313  if (row_len_delta1 < 0.0f)
314  row_len_delta1 *= -1.0f;
315 
316  if (mean_delta[j] + row_len_delta*5 <= max_normal_delta*1.6f &&
317  mean_delta[j+1] + row_len_delta1*5 <= max_normal_delta*1.6f) {
318  first_non_header = static_cast<int>(j);
319  break;
320  }
321  }
322 
323  // For scoring, validate that deltas on one side
324  // of 'first_non_header' are generally higher than on the
325  // other side.
326  float prev_delta_avg = 0.0f;
327  float post_delta_avg = 0.0f;
328 
329  if (first_non_header > 0 &&
330  first_non_header < (int)mean_delta.size()-1) {
331  for (j=0; j<mean_delta.size(); ++j) {
332  if (j<(size_t)first_non_header) {
333  prev_delta_avg += mean_delta[j]*(1.0f/(float)first_non_header);
334  }
335  else if (j>(size_t)first_non_header) {
336  post_delta_avg += mean_delta[j]*(1.0f/(float)(mean_delta.size()-first_non_header));
337  }
338  }
339  }
340 
341  // Larger is better (delta in header should be higher than delta after). Below approx.
342  // 1.4 probably implies no header.
343  if (post_delta_avg > 0)
344  certainty = prev_delta_avg/post_delta_avg;
345  else
346  certainty = prev_delta_avg;
347 }
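// In short: mean_delta measures how much each initial row deviates from the
// average per-character counts. Header rows should deviate more than data
// rows, so 'certainty' is returned as the ratio of the average deviation
// before 'first_non_header' to the average deviation after it; values well
// above 1 (the caller checks for > 1.5) suggest the leading rows really are
// a header.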
348 
350  vector<char>& delims,
351  NStr::EMergeDelims& merge)
352 {
353  std::vector<CDelimScore> m_Scores;
354 
355  float max_frequency_score = 0.0f;
356  max_score = -1.0f;
357 
358  // For each character, determine how clustered it is (determine if the majority
359  // of rows have the same number of occurrences of a character).
360  size_t i;
361  for (i=0; i<(size_t)s_NumDelimiters; ++i) {
362  size_t j;
363  float num_rows = (float)m_RowCount;
364  float num_rows_inv = 1.0f/num_rows;
365 
366  // average number of occurrences of current character over all rows
367  float mean_occurance = 0.0f;
368  float score = 0;
369 
370  // Compute mean number of occurrences for the current character (the average
371  // number of times the character appears in a row over all rows)
372  for (j=0; j<m_CharFrequency.size(); ++j) {
373  // Number of occurrences in a row (j) * number of rows that have that
374  // number of occurrences (m_CharFrequency[j][i]) divided by total
375  // number of rows (num_rows).
376  mean_occurance += (((float)j)*((float)m_CharFrequency[j][i]))/num_rows;
377  }
378 
379  // Determine how close all occurrences are to the mean (the best result
380  // is when all rows have the same number of occurrences) for the current ASCII
381  // character 'i'. Start at 1 to ignore (penalize) rows with 0 occurrences
382  for (j=1; j<m_CharFrequency.size(); ++j) {
383  float occurances = (float)j;
384 
385  // get abs(delta)
386  double delta = (double)(occurances-mean_occurance);
387  if (delta < 0.0) delta *= -1.0;
388 
389  score += (((float)m_CharFrequency[j][i])/(1.0f + (float)pow(delta,2.0)))*num_rows_inv;
390  }
391 
392  // Generally, consistent characters with higher mean occurrences are better.
393  // This is because one or two random special characters can appear at a
394  // certain point in every field, giving them a high consistency score.
395  CDelimScore char_score(static_cast<int>(i));
396  char_score.m_OccuranceScore = score;
397  char_score.m_FrequencyScore = mean_occurance;
398 
399  m_Scores.push_back(char_score);
400 
401  if (mean_occurance > max_frequency_score)
402  max_frequency_score = mean_occurance;
403  }
404 
405  // Normalize the frequency score to 0..1. Having the most characters per row
406  // isn't necessarily better than having a few - this measure is meant primarily
407  // to filter out characters that occur only once or twice per row. We cap the
408  // frequency score at 1.0 when it reaches "average row length"/8
409  float desired_frequency = ((float)(m_CharCount/m_RowCount))/8.0f;
410 
411  // If the max frequency is higher than the 'desired' frequency, use
412  // the desired (best-guess) frequency.
413  desired_frequency = std::min(max_frequency_score, desired_frequency);
414 
415  // So in the end anything with a frequency count at or over once per 8 chars
416  // gets a frequency score of 1.
417  for (i=0; i<(size_t)s_NumDelimiters; ++i) {
418  m_Scores[i].m_FrequencyScore = std::min(m_Scores[i].m_FrequencyScore,
419  desired_frequency)/desired_frequency;
420  }
421 
422 
423  // The last thing we check is how effectively the delimiters
424  // break rows up into fields. One reason is this:
425  // my | dog | has | fleas | | |
426  // Is the delimiter space or pipe? Obviously pipe, but the measures so far
427  // don't enforce that (space and pipe score the same on frequency and
428  // occurrences. With longer fields, space could win on frequency.)
429 
430  // To check for this, pick highest scoring delimiters and then tokenize
431  // the saved rows with each of them and check the results for token length.
432 
433  // Compute a combined score and then sort potential character delimiters
434  // by the combined score.
435  for (i=0; i<m_Scores.size(); ++i) {
436  m_Scores[i].m_CombinedScore = (m_Scores[i].m_OccuranceScore +
437  m_Scores[i].m_FrequencyScore*0.25f)/1.25f;
438  }
439 
440  int delim_candidates_count = 0;
441  std::sort(m_Scores.begin(), m_Scores.end(), greater<CDelimScore>());
442 
443  /// Get number of delimiters that score over a set value (0.5):
444  for (i=0; i<m_Scores.size(); ++i) {
445  if (m_Scores[i].m_CombinedScore > 0.0f)
446 // _TRACE("Score for char: " << m_Scores[i].m_DelimChar << " is: (" <<
447 // m_Scores[i].m_OccuranceScore << ", " << m_Scores[i].m_FrequencyScore << ", " <<
448 // m_Scores[i].m_CombinedScore << ")" );
449 
450  // This is a relatively low score, but we should support delimiters even
451  // with relatively inconsistent data
452  if (m_Scores[i].m_CombinedScore > 0.5f)
453  ++delim_candidates_count;
454  }
455 
456  // If there is more than 1 possible delimiter, add a new criterion based
457  // on the length of tokens parsed using each of the (remaining) candidate
458  // delimiters.
459  if (delim_candidates_count > 1) {
460 
461  // Erase delimiters we are no longer considering
462  m_Scores.erase(m_Scores.begin() + delim_candidates_count, m_Scores.end());
463 
464  vector<string> token_array;
465  for (i=0; i<m_Scores.size(); ++i) {
466 
467  merge = NStr::eNoMergeDelims;
468 
469  string delim_str;
470 
471  // If it's < 256, it's a single ASCII character
472  if (m_Scores[i].m_DelimChar < 256) {
473  delim_str = string(1, (char)m_Scores[i].m_DelimChar);
474  }
475  // >= 256 means the current candidate is a set of characters (e.g. whitespace)
476  else {
477  merge = NStr::eMergeDelims;
478 
479  CMergedChar c(m_Scores[i].m_DelimChar);
480  vector<CMergedChar>::iterator iter;
481  iter = std::find(m_Repeats.begin(), m_Repeats.end(), c);
482 
483  /// Get delimiter string from m_Repeats.
484  if (iter != m_Repeats.end()) {
485  delim_str = (*iter).m_Chars;
486  }
487  else {
488  _TRACE("Execution error - missing repeat character: " <<
489  m_Scores[i].m_DelimChar);
490  continue;
491  }
492  }
493 
494  // We have saved a set of unparsed rows in m_RecordedRows and here
495  // we go through those and see which delimiter produces better
496  // 'quality' tokens.
497  for (size_t j=0; j<m_RecordedRows.size(); ++j) {
498  token_array.clear();
499  NStr::Split(m_RecordedRows[j], delim_str, token_array, merge == NStr::eMergeDelims ? NStr::fSplit_Tokenize : 0);
500  m_Scores[i].m_TokenLenScore = (float)token_array.size();
501 
502  // Score delimiters by % of non-trivial tokens. A trivial token
503  // is empty or is just another candidate delimiter character.
504  for (size_t tok = 0; tok<token_array.size(); ++tok) {
505  if (token_array[tok].size() == 0) {
506  m_Scores[i].m_TokenLenScore -= 2.0f;
507  }
508  else if (token_array[tok].size() == 1) {
509  for (size_t k=0; k<(size_t)delim_candidates_count; ++k) {
510  if (k != i) {
511  if (token_array[tok] == string(1, m_Scores[k].m_DelimChar) ) {
512  m_Scores[i].m_TokenLenScore -= 2.0f;
513  }
514  }
515  }
516  }
517  }
518 
519  // normalize by the token count so the result reflects the fraction of
520  // non-trivial tokens (empty tokens, and tokens equal to another
521  // delimiter candidate, count against the score)
522  m_Scores[i].m_TokenLenScore = m_Scores[i].m_TokenLenScore/(float)token_array.size();
523 
524  // If a merged delimiter score is identical to a non-merged score,
525  // prefer the non-merged
526  float merge_penalty = 0.0f;
527  if ( m_Scores[i].m_DelimChar > 255 ) {
528  merge_penalty = 0.01f;
529 
530  // whitespace will always score equal to or better than blanks and tabs.
531  // This penalizes whitespace for being more general, so if scores
532  // are exactly the same, fewer delimiters will be preferred.
533  merge_penalty += (m_Scores[i].m_DelimChar == 258) ? 0.1f : 0.0f;
534  }
535 
536  // Favor 'standard' delimiters over nonstandard ones generally, e.g. in
537  // seq_id 29
538  // seq_id 33
539  // the '_' and ' ' should have identical scores but we choose ' '
540  // because it is more often used as a delimiter.
541  float token_bonus = 0.0f;
542  if (char(m_Scores[i].m_DelimChar) == ',' ||
543  char(m_Scores[i].m_DelimChar) == ';' ||
544  char(m_Scores[i].m_DelimChar) == '|' ||
545  char(m_Scores[i].m_DelimChar) == ' ' ||
546  char(m_Scores[i].m_DelimChar) == '\t' ||
547  m_Scores[i].m_DelimChar > 255)
548  token_bonus = 0.2f;
549 
550 
551  // Compute a combined score:
552  m_Scores[i].m_CombinedScore = (m_Scores[i].m_OccuranceScore +
553  m_Scores[i].m_FrequencyScore*0.2f +
554  (m_Scores[i].m_TokenLenScore*0.25f)/1.50f) - merge_penalty + token_bonus;
555 
556  }
557  }
558 
559  // Re-sort with updated weights. If scores are even, the sort favors
560  // candidates with lower indices.
561  std::sort(m_Scores.begin(), m_Scores.end(), greater<CDelimScore>());
562 
563 
564  // /* for debugging delimiter scores
565  for (i=0; i<m_Scores.size(); ++i) {
566  if (m_Scores[i].m_CombinedScore > 0.0f && i<5) {
567  _TRACE("Score for char: " << m_Scores[i].m_DelimChar << " is: (" <<
568  m_Scores[i].m_OccuranceScore << ", " <<
569  m_Scores[i].m_FrequencyScore << ", " <<
570  m_Scores[i].m_TokenLenScore << ", " <<
571  m_Scores[i].m_CombinedScore << ")" );
572  }
573  }
574  //*/
575  }
576 
577 
578  // Return most likely delimiter
579  max_score = m_Scores[0].m_CombinedScore;
580 
581  if (m_Scores[0].m_DelimChar < 256) {
582  delims.push_back((char)m_Scores[0].m_DelimChar);
583  merge = NStr::eNoMergeDelims;
584 
585  }
586  else {
587  merge = NStr::eMergeDelims;
588 
589  CMergedChar c(m_Scores[0].m_DelimChar);
590  vector<CMergedChar>::iterator iter;
591  iter = std::find(m_Repeats.begin(), m_Repeats.end(), c);
592 
593  /// Should always be found:
594  if (iter != m_Repeats.end()) {
595  for (size_t j=0; j<(*iter).m_Chars.size(); ++j)
596  delims.push_back((*iter).m_Chars[j]);
597  }
598  else {
599  _TRACE("Execution error - missing repeat character: " <<
600  m_Scores[0].m_DelimChar);
601  max_score = 0.0f;
602  }
603  }
604 }
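// Rough worked example of the occurrence score computed above: if every
// sampled row contains exactly three ',' characters, mean_occurance for ','
// is 3, every row's delta is 0, and the score sums to
//     (rows_with_3_commas / (1 + 0^2)) / num_rows == 1.0,
// the best possible value. A character appearing three times in half the
// rows and five times in the other half scores noticeably lower, since each
// row's contribution is divided by (1 + delta^2).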
605 
606 
607 
608 /*****************************************************************************/
609 /*************************** CTableDelimiterRules ****************************/
610 
611 bool CTableDelimiterRules::MatchingDelimiters(std::vector<char> other_delims) const
612 {
613  // Return true if we have all the same delimiters in m_Delimiters as
614  // we have in other_delims (even if order is different)
615  vector<char> cur_delims = m_Delimiters;
616 
617  sort(cur_delims.begin(), cur_delims.end());
618  sort(other_delims.begin(), other_delims.end());
619 
620  return (cur_delims == other_delims);
621 }
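// Example: if m_Delimiters holds {',', '\t'}, MatchingDelimiters returns true
// for {'\t', ','} and false for {','} - only the set of characters matters,
// not their order.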
622 
624 {
625  string delims;
626  for (size_t i=0; i<m_Delimiters.size(); ++i) {
627  if (m_Delimiters[i] == '\t')
628  delims += "\\t";
629  else
630  delims +=m_Delimiters[i];
631  }
632 
633  LOG_POST(Info << "Import Table Delimiters: \"" << delims << "\"");
634  LOG_POST(Info << " Quote Character: '" << m_QuoteChar << "'");
635  LOG_POST(Info << " Multi-line Quotes: " << m_MultiLineQuotes);
636  LOG_POST(Info << " Merge Delimiters: " << m_MergeDelimiters);
637 }
638 
639 /// Export delimiter rules in ASN user-data format
641 {
642  // convert character array to ints to store in the user object.
643  vector<int> delims;
644  for (size_t i=0; i<m_Delimiters.size(); ++i)
645  delims.push_back((int)m_Delimiters[i]);
646 
647  user_field.AddField("delim-chars", delims);
648  user_field.AddField("quote-char", (int)m_QuoteChar);
649  user_field.AddField("multi-line-quotes", m_MultiLineQuotes);
650  user_field.AddField("merge-delimiters", m_MergeDelimiters);
651 }
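// The resulting user field therefore contains, roughly:
//     delim-chars       : ints (e.g. {44} for a comma-delimited table)
//     quote-char        : int  (e.g. 34 for '"')
//     multi-line-quotes : bool
//     merge-delimiters  : bool
// The import routine below reads these same fields back.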
652 
654 {
655  if (delimiter_object.HasField("delim-chars") &&
656  delimiter_object.GetField("delim-chars").GetData().IsInts()) {
657  vector<int> delims = delimiter_object.GetField("delim-chars").GetData().GetInts();
658 
659  m_Delimiters.clear();
660  for (size_t i=0; i<delims.size(); ++i) {
661  m_Delimiters.push_back((char)delims[i]);
662  }
663  }
664 
665  if (delimiter_object.HasField("quote-char") &&
666  delimiter_object.GetField("quote-char").GetData().IsInt()) {
667  m_QuoteChar = (char)delimiter_object.
668  GetField("quote-char").GetData().GetInt();
669  }
670 
671  if (delimiter_object.HasField("multi-line-quotes") &&
672  delimiter_object.GetField("multi-line-quotes").GetData().IsBool()) {
673  m_MultiLineQuotes = delimiter_object.
674  GetField("multi-line-quotes").GetData().GetBool();
675  }
676 
677  if (delimiter_object.HasField("merge-delimiters") &&
678  delimiter_object.GetField("merge-delimiters").GetData().IsBool()) {
679  m_MergeDelimiters = delimiter_object.
680  GetField("merge-delimiters").GetData().GetBool();
681  }
682 }
683 
684 /*****************************************************************************/
685 /*************************** CTableImportRow *********************************/
686 
688 : m_TableEntry(s)
689 {
690  m_Fields.push_back(pair<size_t,size_t>(0,m_TableEntry.size()));
691 }
692 
693 string CTableImportRow::GetField(int column_idx) const
694 {
695  string field("");
696 
697  if (column_idx < (int)m_Fields.size()) {
698  pair<size_t,size_t> field_idx = m_Fields[column_idx];
699 
700  field = m_TableEntry.substr(field_idx.first, field_idx.second);
701  }
702 
703  return field;
704 }
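// Each entry in m_Fields is a (start, length) pair into m_TableEntry, so
// GetField() simply returns the corresponding substring; an out-of-range
// column index yields an empty string.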
705 
706 /*****************************************************************************/
707 /*************************** CTableImportDataSource **************************/
708 
710 : m_TableType(eDelimitedTable)
711 , m_FileType(eUndefinedFile)
712 , m_MaxRowLen(0)
713 , m_ImportFromRow(0)
714 , m_NumImportedRows(0)
715 , m_CommentChar(' ')
716 , m_MaxNonImportedRowLength(0)
717 , m_ColumnHeaderRow(-1)
718 , m_UseCurrentDelimiters(false)
719 {
720 }
721 
723 {
724  m_TableEntries.clear();
725  m_Columns.clear();
726  m_MaxRowLen = 0;
728  m_DelimRules.Reset();
729  SetCommentChar(' ');
730  m_ImportFromRow = 0;
731  m_NumImportedRows = 0;
732  m_ColumnHeaderRow = -1;
733  m_UseCurrentDelimiters = false;
734 }
735 
736 bool CTableImportDataSource::LoadTable(const wxString& fname, CUser_object& user_object)
737 {
738  Int8 filesize = -1;
739  {
740  CFile tstfile(string(fname.ToUTF8()));
741 
742  if (!tstfile.IsFile()) {
743  LOG_POST("Error opening file: " + fname);
744  return false;
745  }
746 
747  filesize = tstfile.GetLength();
748  }
749 
750  if (!LoadTable(fname, filesize, NULL))
751  return false;
752 
754 
755  // Now update all parameters to their pre-set values,
756  // except possibly for delimiters.
757  ImportTableParms(user_object);
759  m_DelimRules = delims;
760 
761  // Reparse the data but keep the columns read in from the
762  // user_object
764  RecomputeFields(false);
765  else //eFixedWidthTable
767 
768  return true;
769 }
770 
771 
772 bool CTableImportDataSource::LoadTable(const wxString& fname,
773  Int8 filesize,
774  ICanceled* call)
775 {
776  ClearTable();
777 
778  CCompressedFile file(fname);
779  CNcbiIstream* ifs = &(file.GetIstream());
780 
781  if (!ifs->good() || ifs->eof())
782  return false;
783 
784  m_FileName = fname;
785  LOG_POST(Info << "Importing Table: " << m_FileName.ToUTF8());
786 
787  try {
789  c.SetName("Column 0");
790  m_Columns.push_back(c);
791 
792  c.SetName("#");
793  m_Columns.push_back(c);
794 
796 
797  CCharHistogram char_counter;
798  int row_count = 0;
799  int total_char_count = 0;
800 
801  int hist_count1 = 100 + m_ImportFromRow;
802  int hist_count2 = 1000 + m_ImportFromRow;
803 
804  /// Read all the rows. Read directly into target row
805  // to avoid a copy. This would probably be faster if we
806  // knew the number of rows in advance (could estimate
807  // based on a few lines and the file size...)
809  //while (NcbiGetlineEOL(*ifs, row.GetValue())) {
810  while (NcbiGetline(*ifs, row.GetValue(), "\n\r" )) {
811 
812  if (row_count < m_ImportFromRow)
813  m_MaxNonImportedRowLength = std::max(row.GetValue().length(),
815 
816  // Don't add any completely blank rows
817  if (!NStr::IsBlank(row.GetValue())) {
818  m_TableEntries.push_back(row);
819  total_char_count += row.GetValue().size();
820  ++row_count;
821  }
822  else continue;
823 
824  // Store a copy of the initial rows in the character histogram class so we
825  // can later inspect them there for (possible) headers.
826  if (row_count < 50)
827  char_counter.AddInitialRows(row.GetValue());
828 
829  // Gather some statistical info for picking delimiters. Up to hist_count1,
830  // gather from all rows; after that, gather less frequently (since the data is
831  // probably already adequate to take a good guess). Try to ignore some
832  // initial rows to avoid including headers in this process.
833  if ((row_count >= m_ImportFromRow && row_count < hist_count1) ||
834  (row_count > hist_count1 && row_count < hist_count2 && row_count%10 == 0 ) ||
835  (row_count > hist_count2 && row_count%100 == 0)) {
836 
837  // Ignore first "few" (<5) rows to try to skip headers, but don't
838  // ignore initial rows for very small files (e.g. 3 row file)
839  bool analyze_row = true;
840  if (row_count < 5) {
841  Int8 average_rowlen = (Int8)(total_char_count/row_count);
842  int projected_rowcount = (int)(filesize/average_rowlen);
843  if (projected_rowcount >= 10)
844  analyze_row = false;
845  else if (row_count < projected_rowcount-5)
846  analyze_row = false;
847  }
848 
849  if (analyze_row)
850  char_counter.UpdateHistogram(row.GetValue());
851  }
852 
853  // Estimate total file size to avoid resizing array multiple times
854  // (efficiency for reading large files - provides modest improvement -
855  // about 20%)
856  if (row_count == 80 && filesize != -1) {
857  Int8 average_rowlen = (Int8)(char_counter.GetCharCount()/row_count);
858  int projected_rowcount = (int)(filesize/average_rowlen);
859  //_TRACE("Projected rows = " << projected_rowcount);
860  m_TableEntries.reserve(projected_rowcount + 0.2*projected_rowcount);
861  }
862 
863 
864  row.GetValue().clear();
865 
866  if (call != NULL && call->IsCanceled()) {
867  m_TableEntries.clear();
868  return false;
869  }
870  }
871 
872  if (row_count == 0) {
873  m_TableEntries.clear();
874  return false;
875  }
876 
877 
878  // If the table type is known, then we can use that info
879  // to directly fill in separator character, comment character etc.
880  if (x_PickFileType()) {
882  LOG_POST(Info << "Imported Table Type Guess: Delimited Type");
884 
885  RecomputeFields(true);
886 
887  // Save max row len for displaying table in single-column mode
888  m_MaxRowLen = char_counter.GetMaxRowLen();
889  }
890  else {
891  // Before we try to find a separating character, let's try to figure
892  // out how many lines are part of the header and if there is a
893  // comment character at the front of the header
894  x_FindHeaderRows(char_counter);
895 
896  float max_score = 0.0f;
897 
898  vector<char> delims;
899  NStr::EMergeDelims merge;
900 
901  // Save max row len for displaying table in single-column mode
902  m_MaxRowLen = char_counter.GetMaxRowLen();
903  char_counter.GetDelimiterProbablities(max_score, delims, merge);
904 
905  /// Clear current delims and set other options to defaults
907 
908  // Set the table type (for now) based on the max_score. 0.8 is somewhat arbitrary
909  // (scale is 0..1).
910  if (max_score > 0.8f) {
912 
913  m_DelimRules.SetDelimiters(delims);
915  RecomputeFields(true);
916 
917  LOG_POST(Info << "Imported Table Type Guess: Delimited Type");
919  }
920  else {
922  LOG_POST(Info << "Imported Table Type Guess: Fixed Width");
923 
924  vector<char> delim;
925 
926  // No delimiters - will have to assign widths on fixed-width page
927  // unless choice is overridden.
929  RecomputeFields(true);
930  }
931  }
932 
933  }
934  catch (...) {
935  ClearTable();
936 
937  return false;
938  }
939 
940  return true;
941 }
942 
944 {
945  // Go through each entry in the table
946  for (size_t row=0; row<m_TableEntries.size(); ++row) {
947  ofs << m_TableEntries[row].GetValue() << endl;
948  }
949 }
950 
952 {
953  m_TableType = e;
954 
955  // Update column information based on type
956  if (m_TableType == eDelimitedTable) {
957  RecomputeFields(true);
958  }
959  else {
960  // This is called prior to having fixed column widths available,
961  // so we just clear out the column info
962  if (m_Columns.size() > 2) {
963  m_Columns.erase(m_Columns.begin()+2, m_Columns.end());
964  m_Columns[1].SetWidth(static_cast<int>(m_MaxRowLen));
965  }
966  }
967 }
968 
969 
971 {
973 
974  string::size_type pos = m_TableEntries[0].GetValue().find_first_not_of(" \t");
975  char comment_char = ' ';
976 
977  if (pos != string::npos)
978  comment_char = m_TableEntries[0].GetValue()[pos];
979 
980  if (comment_char == '#' &&
981  m_TableEntries[0].GetValue().find("BLAST") != string::npos) {
983 
984  std::vector<char> delimiters;
985  delimiters.push_back('\t');
986  delimiters.push_back('|');
987 
988  m_DelimRules.SetDelimiters(delimiters);
990 
991  SetCommentChar('#');
992 
993  // Synchronize first import row to number of initial comment lines:
994  m_ImportFromRow = 0;
995  for (size_t i=0; i<m_TableEntries.size(); ++i) {
996  string::size_type spos =
997  m_TableEntries[i].GetValue().find_first_not_of(" \n");
998 
999  if (spos != string::npos &&
1000  m_TableEntries[i].GetValue()[spos] == m_CommentChar) {
1001  ++m_ImportFromRow;
1002  }
1003  else break;
1004  }
1005 
1006  return true;
1007  }
1008 
1009  return false;
1010 }
1011 
1013 {
1014  // First, get the first non-blank character from the first line and see if it is
1015  // a header comment (if it appears in the initial rows as the first character
1016  // but is never the first character thereafter, it is a header comment. It must
1017  // also not be a letter or number: a-z, A-Z, 0-9)
1018  int header_rows = 1;
1019  int non_header_rows = 0;
1020  bool has_header = true;
1021  char comment_char = '0'; // Not a valid comment char
1022  string::size_type pos = m_TableEntries[0].GetValue().find_first_not_of(" \t");
1023  if (pos != string::npos)
1024  comment_char = m_TableEntries[0].GetValue()[pos];
1025 
1026  // Standard characters are not supported as header comments:
1027  if ((comment_char >= '0' && comment_char <= '9') ||
1028  (comment_char >= 'A' && comment_char <= 'Z') ||
1029  (comment_char >= 'a' && comment_char <= 'z') ||
1030  comment_char == ' ' || comment_char == '\t') {
1031  has_header = false;
1032  header_rows = 0;
1033  }
1034 
1035  for (size_t i=1; i<std::min((size_t)500, m_TableEntries.size()) && has_header; ++i) {
1036  // Get first character in row:
1037  char first_char = '0';
1038  pos = m_TableEntries[i].GetValue().find_first_not_of(" \t");
1039  if (pos != string::npos)
1040  first_char = m_TableEntries[i].GetValue()[pos];
1041 
1042  if (first_char == comment_char) {
1043  // possible comment character has appeared after initial block, so it
1044  // must not be a comment character:
1045  if (non_header_rows > 0) {
1046  has_header = false;
1047  break;
1048  }
1049  ++header_rows;
1050  }
1051  else {
1052  ++non_header_rows;
1053  }
1054  }
1055 
1056  // If many rows begin with same char, it is probably not a header char
1057  if (has_header) {
1058  if (header_rows > 20 || header_rows > non_header_rows) {
1059  has_header = false;
1060  header_rows = 0;
1061  }
1062  else {
1063  m_ImportFromRow = header_rows;
1064  SetCommentChar(comment_char);
1065  return;
1066  }
1067  }
1068 
1069  // If there is no obvious header comment character,
1070  // look for differences in length and frequency of character
1071  // data to identify a possible header
1072  float certainty = 0.0f;
1073  int first_non_header_row = -1;
1074 
1075  // ! and ~ are the first and last typical, non-space, ASCII characters
1076  char_counter.GetGroupOccuranceAverage('!', '~', certainty, first_non_header_row);
1077  if (certainty > 1.5f && first_non_header_row > 0) {
1078  m_ImportFromRow = first_non_header_row;
1079  }
1080  // Last check - check if first row contains certain keywords often found in headers
1081  else {
1082  string first_row = m_TableEntries[0].GetValue();
1083  NStr::ToLower(first_row);
1084 
1085  if (NStr::Find(first_row, "seqid") != NPOS ||
1086  NStr::Find(first_row, "accession") != NPOS) {
1087  m_ImportFromRow = 1;
1088  }
1089  }
1090 
1091  SetCommentChar(' ');
1092 }
1093 
1094 
1096  const CTempString& delim,
1097  NStr::EMergeDelims merge,
1098  bool multiple_spaces_only,
1099  vector<std::pair<size_t,size_t> >& token_pos)
1100 {
1101  // Special cases
1102  if (str.empty()) {
1103  return;
1104  } else if (delim.empty() && !multiple_spaces_only) {
1105  token_pos.push_back(pair<size_t,size_t>(0, str.length()));
1106  return;
1107  }
1108 
1109  // Tokenization
1110  //
1111  string::size_type pos, prev_pos;
1112  for (pos = 0;;) {
1113  prev_pos = ((merge == NStr::eMergeDelims && delim != "") ?
1114  str.find_first_not_of(delim, pos) : pos);
1115 
1116  // don't allow merging of delimiters between multiple blanks and other chars
1117  if (multiple_spaces_only && str[pos] == ' ')
1118  prev_pos = str.find_first_not_of(CTempString(" "), pos);
1119 
1120  if (prev_pos == CTempString::npos) {
1121  break;
1122  }
1123  pos = str.find_first_of(delim, prev_pos);
1124  if (multiple_spaces_only) {
1125  pos = std::min(pos, str.find(CTempString(" "), prev_pos));
1126  }
1127  if (pos == CTempString::npos) {
1128  token_pos.push_back(pair<size_t, size_t>(prev_pos, str.length() - prev_pos));
1129  break;
1130  } else {
1131  token_pos.push_back(pair<size_t, size_t>(prev_pos, pos-prev_pos));
1132  ++pos;
1133  }
1134  } // for
1135 }
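// Example: for the entry "a,,b" with delim "," (and multiple_spaces_only
// false), the positions recorded in token_pos are (0,1), (2,0) and (3,1) -
// i.e. tokens "a", "" and "b" - when merge == NStr::eNoMergeDelims, and just
// (0,1) and (3,1) when merge == NStr::eMergeDelims.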
1136 
1138  const CTempString& delim,
1139  const CTempString& delim_and_quote,
1140  NStr::EMergeDelims merge,
1141  bool multiple_spaces_only,
1142  char quote_char,
1143  vector<std::pair<size_t,size_t> >& token_pos)
1144 {
1145  // Special cases
1146  if (str.empty()) {
1147  return;
1148  } else if (delim.empty() && !multiple_spaces_only) {
1149  token_pos.push_back(pair<size_t,size_t>(0, str.length()));
1150  return;
1151  }
1152 
1153  // Tokenization
1154  //
1155  string::size_type pos;
1156  string::size_type prev_pos = string::npos;
1157  string::size_type search_pos = string::npos;
1158 
1159  bool token_added = true;
1160 
1161  for (pos = 0;;) {
1162  if (token_added) {
1163  prev_pos = ((merge == NStr::eMergeDelims && delim != "") ?
1164  str.find_first_not_of(delim, pos) : pos);
1165 
1166  if (multiple_spaces_only && str[pos] == ' ')
1167  prev_pos = str.find_first_not_of(CTempString(" "), pos);
1168 
1169  search_pos = prev_pos;
1170  }
1171  if (prev_pos == CTempString::npos) {
1172  break;
1173  }
1174 
1175  // Find the next delimiter OR beginning of a quoted string
1176  pos = str.find_first_of(delim_and_quote, search_pos);
1177  if (multiple_spaces_only) {
1178  pos = std::min(pos, str.find(CTempString(" "), search_pos));
1179  }
1180  if (pos == CTempString::npos) {
1181  token_pos.push_back(pair<size_t, size_t>(prev_pos, str.length() - prev_pos));
1182  break;
1183  } else {
1184  if (str[pos] == quote_char) {
1185  // proceed to close-quote then search again for next delimiter.
1186  // Assume CSV rules for embedding quotes within quoted strings
1187  // - any embedded quotes are doubled, e.g.
1188  // "A quote within a ""quote"" looks like this"
1189  for (++pos;
1190  pos < str.length() &&
1191  (str[pos]!=quote_char ||
1192  (str[pos]==quote_char && str[pos-1]==quote_char) ||
1193  (str[pos]==quote_char && pos<str.length()-1 &&
1194  str[pos+1]==quote_char)); ++pos) {
1195  }
1196 
1197  /// Quote ended at EOL OR may be unbalanced, e.g.: "not balanced\n
1198  if (pos >= str.length()-1) {
1199  token_pos.push_back(pair<size_t, size_t>(prev_pos, str.length() - prev_pos));
1200  break;
1201  }
1202 
1203  search_pos = ++pos;
1204  token_added = false;
1205  continue;
1206  }
1207  else {
1208  token_added = true;
1209  token_pos.push_back(pair<size_t,size_t>(prev_pos, pos-prev_pos));
1210  ++pos;
1211  search_pos = prev_pos;
1212  }
1213  }
1214  } // for
1215 }
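// Example: with delim ",", quote_char '"' and the entry
//     a,"b,c",d
// the recorded tokens are a, "b,c" and d: the comma inside the quoted
// section does not split the field, and the surrounding quotes are kept as
// part of the token.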
1216 
1218  const CTempString& delims_ts,
1219  const CTempString& delims_quote_ts,
1220  NStr::EMergeDelims merge_delims,
1221  bool multiple_spaces_only)
1222 {
1223  // parse fields based on current delimiters
1224  // Use the tokens to set position and length of tokens for the current row
1225  vector<std::pair<size_t,size_t> >& fields = row.GetFields();
1226  fields.clear();
1227 
1228  if (m_DelimRules.GetQuoteChar() == ' ') {
1229  x_ParseEntry(CTempString(row.GetValue()),
1230  delims_ts,
1231  merge_delims,
1232  multiple_spaces_only,
1233  fields);
1234  }
1235  else {
1236  x_ParseQuotedEntry(CTempString(row.GetValue()),
1237  delims_ts,
1238  delims_quote_ts,
1239  merge_delims,
1240  multiple_spaces_only,
1242  fields);
1243  }
1244 
1245 
1246  // Update field positions for this specific table entry
1247  // NOTE: We could also do this only when a row is actually displayed
1248  // if performance is an issue (and maybe tag it as 'updated')
1249  for (size_t i=0; i<fields.size(); ++i) {
1250  // Update (if needed) columns for the overall table (although all rows
1251  // will often have the same number of columns, that may not always be
1252  // true). Also, track maximum column widths for formatting.
1253  if (i >= m_Columns.size()-1) {
1255  c.SetWidth(static_cast<int>(fields[i].second));
1256 
1257  if (m_ColumnHeaderRow != -1 &&
1258  i <m_TableEntries[m_ColumnHeaderRow].GetNumFields()) {
1259  string column_name = m_TableEntries[m_ColumnHeaderRow].GetField(static_cast<int>(i));
1260  // First field could be preceded by the comment character for header
1261  // rows, if there is one.
1262  if (i==0 && column_name.size() > 0 &&
1263  column_name[0] == m_CommentChar) {
1264  column_name = column_name.substr(1, column_name.size()-1);
1265  }
1266  c.SetName(column_name);
1267 
1268  // make sure there is space for the name
1269  c.SetWidth(static_cast<int>(column_name.size()));
1270  }
1271  else {
1272  c.SetName("Col " + NStr::NumericToString(i+1));
1273  c.SetWidth(static_cast<int>(c.GetName().size()));
1274  }
1275 
1276  m_Columns.push_back(c);
1277  }
1278  m_Columns[i+1].SetWidth(std::max(m_Columns[i+1].GetWidth(),
1279  (int)fields[i].second));
1280  }
1281 }
1282 
1284 {
1285  // Concatenate all delimiters into one string for use by tokenize
1286  string delims_str;
1287  string delims_quote_str;
1288  CTempString delims_ts;
1289  CTempString delims_quote_ts;
1290  for (size_t j=0; j<m_DelimRules.GetDelimiters().size(); ++j) {
1292  delims_str.push_back(m_DelimRules.GetDelimiters()[j]);
1293  }
1294 
1295  delims_ts = delims_str;
1296 
1297  delims_quote_str = delims_str + m_DelimRules.GetQuoteChar();
1298  delims_quote_ts = delims_quote_str;
1299 
1302 
1304  delims_ts,
1305  delims_quote_ts,
1306  merge_delims,
1308 }
1309 
1310 void CTableImportDataSource::RecomputeFields(bool recreate_columns,
1311  int recompute_count)
1312 {
1313  // Concatenate all delimiters into one string for use by tokenize
1314  string delims_str;
1315  string delims_quote_str;
1316  CTempString delims_ts;
1317  CTempString delims_quote_ts;
1318  for (size_t j=0; j<m_DelimRules.GetDelimiters().size(); ++j) {
1320  delims_str.push_back(m_DelimRules.GetDelimiters()[j]);
1321  }
1322 
1323  delims_ts = delims_str;
1324 
1325  delims_quote_str = delims_str + m_DelimRules.GetQuoteChar();
1326  delims_quote_ts = delims_quote_str;
1327 
1330 
1331  int row_count = 0;
1333 
1334  // Iterate over the data to determine the number of fields and set
1335  // up those fields as columns with undefined types and
1336  // simple names (column 1, column 2....)
1337  vector<CTableImportRow>::iterator iter;
1338 
1339  // If we are not reparsing all the rows, need to remember the widths
1340  vector<CTableImportColumn> prev_columns = m_Columns;
1341 
1342  if (recreate_columns) {
1343  m_Columns.clear();
1345  c.SetName("#");
1346  m_Columns.push_back(c);
1347  }
1348  int count = 0;
1349 
1350  // If user wants to use one of the rows to get column names, parse that
1351  // first and then use its results for column headers (until they run out)
1352  if (m_ColumnHeaderRow != -1) {
1353  string header_row = m_TableEntries[m_ColumnHeaderRow].GetValue();
1354 
1355  vector<std::pair<size_t,size_t> >& fields = m_TableEntries[m_ColumnHeaderRow].GetFields();
1356  fields.clear();
1357 
1358  if (m_DelimRules.GetQuoteChar() == ' ') {
1359  x_ParseEntry(CTempString(header_row),
1360  delims_ts,
1361  merge_delims,
1363  fields);
1364  }
1365  else {
1366  x_ParseQuotedEntry(CTempString(header_row),
1367  delims_ts,
1368  delims_quote_ts,
1369  merge_delims,
1372  fields);
1373  }
1374  }
1375 
1376 
1377  // Go through each entry in the table
1378  for (iter = m_TableEntries.begin(); iter != m_TableEntries.end(); ++iter) {
1379 
1380  // Don't parse rows that are not designated to be imported
1381  if (row_count++ < m_ImportFromRow || (*iter).GetRowNum() == -1) {
1382  m_MaxNonImportedRowLength = std::max((*iter).GetValue().length(),
1384  continue;
1385  }
1386 
1387  x_RecomputeRowFields(*iter,
1388  delims_ts,
1389  delims_quote_ts,
1390  merge_delims,
1392 
1393 
1394  // Caller may set a limited number of (initial) rows to
1395  // be updated (used when # of header rows is updated)
1396  if (recompute_count != -1 && ++count > recompute_count) {
1397 
1398  // Retain widths from previous update
1399  for (size_t i=1; i<m_Columns.size(); ++i) {
1400  if (prev_columns.size() > i) {
1401  m_Columns[i].SetWidth(max(m_Columns[i].GetWidth(), prev_columns[i].GetWidth()));
1402  }
1403  else break;
1404  }
1405 
1406  RecomputeHeaders();
1407  return;
1408  }
1409  }
1410 
1411  RecomputeHeaders();
1412 }
1413 
1415 {
1416  int row_count = 0;
1418 
1419  // Iterate over the data to determine the number of fields and set
1420  // up those fields as columns with undefined types and
1421  // simple names (column 1, column 2....)
1422  vector<CTableImportRow>::iterator iter;
1423 
1424  // Go through each entry in the table
1425  for (iter = m_TableEntries.begin(); iter != m_TableEntries.end(); ++iter) {
1426 
1427  // Don't parse rows that are not designated to be imported
1428  if (row_count++ < m_ImportFromRow || (*iter).GetRowNum() == -1) {
1429  m_MaxNonImportedRowLength = std::max((*iter).GetValue().length(),
1431  continue;
1432  }
1433 
1434  size_t len = (*iter).GetValue().length();
1435  std::vector<std::pair<size_t,size_t> >& fields = (*iter).GetFields();
1436 
1437  fields.clear();
1438 
1439  size_t start_idx = 0;
1440  for (size_t i=1; i<m_Columns.size(); ++i) {
1441  size_t w = (size_t)m_Columns[i].GetWidth();
1442  if (start_idx < len) {
1443  w = std::min(w, len-start_idx);
1444  fields.push_back(std::pair<size_t,size_t>(start_idx,w));
1445  }
1446  else {
1447  w = 0;
1448  fields.push_back(std::pair<size_t,size_t>(len-1, w));
1449  }
1450 
1451  start_idx += w;
1452  }
1453  }
1454 
1455  RecomputeHeaders();
1456 }
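// Example: with column widths 10 and 5, a 13-character entry is split into
// the field ranges (0,10) and (10,3); a column that would start past the
// end of the line is recorded as a zero-length field.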
1457 
1459 {
1460  LOG_POST(Info << "Table Import: Fixed field widths: ");
1461  size_t start_idx = 0;
1462  for (size_t i=1; i<m_Columns.size(); ++i) {
1463  size_t w = (size_t)m_Columns[i].GetWidth();
1464 
1465  LOG_POST(Info << "Field #: " << i << " (" << start_idx << ", "
1466  << start_idx + w << ")");
1467 
1468  start_idx += w;
1469  }
1470 }
1471 
1473 {
1474  // If a row has been designated as the source of the column names, parse
1475  // it according to the same rules as the other rows and then use
1476  // the results to update the column names.
1477  if (m_ColumnHeaderRow == -1 ||
1478  m_TableEntries.size() <= (size_t)m_ColumnHeaderRow)
1479  return;
1480 
1481  // Concatenate all delimiters into one string for use by tokenize
1482  if (m_TableType == eDelimitedTable) {
1483  string delims_str;
1484  string delims_quote_str;
1485  CTempString delims_ts;
1486  CTempString delims_quote_ts;
1487 
1488  for (size_t j=0; j<m_DelimRules.GetDelimiters().size(); ++j) {
1490  delims_str.push_back(m_DelimRules.GetDelimiters()[j]);
1491  }
1492 
1493  delims_ts = delims_str;
1494 
1495  delims_quote_str = delims_str + m_DelimRules.GetQuoteChar();
1496  delims_quote_ts = delims_quote_str;
1497 
1500 
1501  // If user wants to use one of the rows to get column names, parse that
1502  // first and then use its results for column headers (until they run out)
1503  string header_row = m_TableEntries[m_ColumnHeaderRow].GetValue();
1504 
1505  vector<std::pair<size_t,size_t> >& fields = m_TableEntries[m_ColumnHeaderRow].GetFields();
1506  fields.clear();
1507 
1508  if (m_DelimRules.GetQuoteChar() == ' ') {
1509  x_ParseEntry(CTempString(header_row),
1510  delims_ts,
1511  merge_delims,
1513  fields);
1514  }
1515  else {
1516  x_ParseQuotedEntry(CTempString(header_row),
1517  delims_ts,
1518  delims_quote_ts,
1519  merge_delims,
1522  fields);
1523  }
1524  }
1525  else {// (m_TableType == eFixedWidthTable)
1526  size_t len = m_TableEntries[m_ColumnHeaderRow].GetValue().length();
1527  std::vector<std::pair<size_t,size_t> >& fields = m_TableEntries[m_ColumnHeaderRow].GetFields();
1528 
1529  fields.clear();
1530 
1531  size_t start_idx = 0;
1532  for (size_t i=1; i<m_Columns.size(); ++i) {
1533  size_t w = (size_t)m_Columns[i].GetWidth();
1534  if (start_idx < len) {
1535  w = std::min(w, len-start_idx);
1536  fields.push_back(std::pair<size_t,size_t>(start_idx,w));
1537  }
1538  else {
1539  w = 0;
1540  fields.push_back(std::pair<size_t,size_t>(len-1, w));
1541  }
1542 
1543  start_idx += w;
1544  }
1545  }
1546 
1547  // Update column names with parsed values or generated names (Column 1..n)
1548  size_t parsed_field_idx = 0;
1549  for (size_t i=0; i<m_Columns.size()-1; ++i) {
1550  if (i < m_TableEntries[m_ColumnHeaderRow].GetNumFields()) {
1551  string column_name = m_TableEntries[m_ColumnHeaderRow].GetField(static_cast<int>(parsed_field_idx++));
1552  // First field could be preceded by the comment character for header
1553  // rows, if there is one. The comment character could either be
1554  // attached to the name: "#accession " or separate: "# accession".
1555  // Check for both.
1556  if (i==0 && column_name.size() > 0 &&
1557  column_name[0] == m_CommentChar) {
1558  column_name = column_name.substr(1, column_name.size()-1);
1559 
1560  // If the field was just the comment character, jump to the next
1561  // parsed field. GetField() returns empty strings if it
1562  // runs out of tokens.
1563  if (column_name.length() == 0) {
1564  column_name = m_TableEntries[m_ColumnHeaderRow].GetField(static_cast<int>(parsed_field_idx++));
1565  }
1566  }
1567 
1568  column_name = NStr::TruncateSpaces(column_name);
1569  if (column_name == "")
1570  column_name = "Column " + NStr::NumericToString(i+1);
1571 
1572  m_Columns[i+1].SetName(column_name);
1573 
1574 
1575  // make sure there is space for the name, which we don't do for
1576  // the generated names.
1577  m_Columns[i+1].SetWidth(std::max(m_Columns[i+1].GetWidth(),
1578  (int)column_name.size()));
1579  }
1580  else {
1581  m_Columns[i+1].SetName("Column " + NStr::NumericToString(i+1));
1582  }
1583  }
1584 }
1585 
1586 string CTableImportDataSource::GetField(size_t row, size_t col) const
1587 {
1588  // Get a field out of the specified row 'row'. This function is
1589  // used by the wxListCtrl to get fields for display.
1590  if (m_TableEntries.size() <= row) {
1591  return "";
1592  }
1593 
1594  return (m_TableEntries[row].GetField(static_cast<int>(col)));
1595 }
1596 
1597 
1599 {
1600  m_ImportFromRow = r;
1601  LOG_POST(Info << "Import table: first import row: " << m_ImportFromRow);
1602 
1605 }
1606 
1608 {
1609  m_ColumnHeaderRow = c;
1610 
1611  // Need to update row numbers if this will change where import starts (import
1612  // will always start after this line). SetCommentChar will do this update so
1613  // just reset the same character.
1615 
1616  LOG_POST(Info << "Import table: column header row: " << m_ColumnHeaderRow);
1617 }
1618 
1620  int first_row)
1621  {
1622  m_ColumnHeaderRow = column_header_row;
1623  m_ImportFromRow = first_row;
1624 
1625  LOG_POST(Info << "Import table: column header row: " << m_ColumnHeaderRow);
1626  LOG_POST(Info << "Import table: first import row: " << m_ImportFromRow);
1627 
1628  // Need to update row numbers if this will change where import starts (import
1629  // will always start after this line). SetCommentChar will do this update so
1630  // just reset the same character.
1632  RecomputeFields(true);
1633  }
1634 
1636 {
1637  m_CommentChar = c;
1638  LOG_POST(Info << "Import table: row comment character: " << m_CommentChar);
1639 
1641  int row_num = 0;
1642  int count = 0;
1643 
1644  vector<CTableImportRow>::iterator iter;
1645  for (iter = m_TableEntries.begin(); iter != m_TableEntries.end(); ++iter, ++count) {
1646  if (count >= m_ImportFromRow && count > m_ColumnHeaderRow) {
1647  string::size_type spos = (*iter).GetValue().find_first_not_of(" \n");
1648 
1649  if (spos == string::npos || (*iter).GetValue()[spos] != m_CommentChar) {
1650  (*iter).SetRowNum(row_num++);
1651  }
1652  else {
1653  (*iter).SetRowNum(-1);
1654  m_MaxNonImportedRowLength = std::max((*iter).GetValue().length(),
1656  }
1657  }
1658  else {
1659  (*iter).SetRowNum(-1);
1660  m_MaxNonImportedRowLength = std::max((*iter).GetValue().length(),
1662  }
1663  }
1664 
1665  m_NumImportedRows = row_num;
1666 
1667  RecomputeHeaders();
1668 }
1669 
1670 void CTableImportDataSource::MergeColumns(vector<size_t> col_indices, char ch,
1671  bool no_merge_char)
1672 {
1673  std::sort(col_indices.begin(), col_indices.end());
1674 
1675  // make sure indices not > number of columns
1676  if (col_indices.back() >= m_Columns.size())
1677  return;
1678 
1679  // We are interested here in fields, not columns. The first column is
1680  // always the row # column which is not a field parsed from the text. So
1681  // we subtract 1 here to convert from columns to fields
1682  for (size_t i=0; i<col_indices.size(); ++i)
1683  col_indices[i] -= 1;
1684 
1685  // Go through each entry in the table
1686  for (size_t row=0; row<m_TableEntries.size(); ++row) {
1687 
1688  // Don't parse comment rows (but do get the header, if any)
1689  if (row != m_ColumnHeaderRow &&
1690  (row < (size_t)m_ImportFromRow || m_TableEntries[row].GetRowNum() == -1)) {
1691  continue;
1692  }
1693 
1694  // The number of columns in this row is below the number specified for merging:
1695  if (col_indices.back() >= size_t(m_TableEntries[row].GetNumFields()))
1696  continue;
1697 
1698  vector<size_t> merge_cols;
1699  // Remove any columns which are not available in this row:
1700  for (size_t i=0; i<col_indices.size(); ++i)
1701  if ( col_indices[i] < m_TableEntries[row].GetNumFields())
1702  merge_cols.push_back(col_indices[i]);
1703 
1704  // Get edited row (replace each separation between given cols with new
1705  // single character). Don't care here if separators were merged (or multiple) -
1706  // each is replaced with a single char.
1707  string& str = m_TableEntries[row].GetValue();
1708  // get string up to end of first col (the 'from_col')
1709  size_t field_end_idx = m_TableEntries[row].GetFields()[ merge_cols.front()].first +
1710  m_TableEntries[row].GetFields()[ merge_cols.front()].second;
1711  string merged_str = str.substr(0, field_end_idx);
1712 
1713  // Can also merge without inserting a character (no_merge_char)
1714  for (size_t i=1; i<merge_cols.size(); ++i) {
1715  size_t col = merge_cols[i];
1716  if (!no_merge_char)
1717  merged_str += ch;
1718  merged_str += str.substr(m_TableEntries[row].GetFields()[col].first,
1719  m_TableEntries[row].GetFields()[col].second);
1720  }
1721 
1722  for (size_t col=merge_cols.front()+1; col<m_TableEntries[row].GetFields().size(); ++col) {
1723  // If this is not one of the merged columns, add it to the end of the row
1724  // we are (re)building
1725  if (std::find(merge_cols.begin(), merge_cols.end(), col) == merge_cols.end()) {
1726  // get field start plus preceding delimiter
1727  size_t field_start_idx = m_TableEntries[row].GetFields()[col].first-1;
1729  merged_str += " ";
1730  }
1731 
1732  merged_str += str.substr(field_start_idx,
1733  m_TableEntries[row].GetFields()[col].second+1);
1734  }
1735  }
1736 
1737  m_TableEntries[row].GetValue() = merged_str;
1738  }
1739 
1740  RecomputeFields(true);
1741 }
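// Example: with a blank delimiter, merging columns 2 and 3 (the fields "bb"
// and "cc") of the row
//     aa bb cc dd
// with merge character '_' rewrites the entry as "aa bb_cc dd"; the
// RecomputeFields(true) call above then re-parses every row, so the table
// shows three columns instead of four.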
1742 
1743 
1744 bool CTableImportDataSource::SplitColumn(size_t col_idx, char ch,
1745  bool split_on_whitespace)
1746 {
1747  // We are interested here in fields, not columns. The first column is
1748  // always the row # column which is not a field parsed from the text. So
1749  // we subtract 1 here to convert from columns to fields
1750  col_idx -= 1;
1751 
1752  if (col_idx >= m_Columns.size())
1753  return false;
1754 
1755  // What is our delimiter character? If there are multiple we can choose any.
1756  // If the delimiter character is space, with multiple spaces required,
1757  // we will use 2 blanks.
1758  string delimiter = "";
1760  delimiter = " ";
1761  }
1762  // should be true unless user unchecked all delims...
1763  else if (m_DelimRules.GetDelimiters().size() > 0) {
1765  }
1766  else {
1767  return false;
1768  }
1769 
1770  // If any of the rows are split, we will add a new column to each row,
1771  // although that column will be empty in rows without a split. That is
1772  // likely what the user wants in most cases.
1773 
1774  // Initial run to determine if there are splits in any rows. If not
1775  // return.
1776  bool has_splits = false;
1777  for (size_t row=0; row<m_TableEntries.size() && !has_splits; ++row) {
1778  // Don't parse header or comment rows for this check
1779  if (row < (size_t)m_ImportFromRow || m_TableEntries[row].GetRowNum() == -1) {
1780  continue;
1781  }
1782 
1783  // Skip if this particular row has fewer than 'col_idx' fields
1784  if (col_idx >= m_TableEntries[row].GetNumFields())
1785  continue;
1786 
1787  // Get the row and replace the first instance of 'ch' in column 'col_idx' with
1788  // a separator character. If there are multiple instances of 'ch', only the
1789  // first is updated; if there are none, nothing changes.
1790  string& str = m_TableEntries[row].GetValue();
1791 
1792  size_t start_idx = m_TableEntries[row].GetFields()[col_idx].first;
1793  size_t chars = m_TableEntries[row].GetFields()[col_idx].second;
1794 
1795  for (size_t i=start_idx; i<start_idx+chars; ++i) {
1796  if (split_on_whitespace && (str[i]==' ' || str[i]=='\t')) {
1797  has_splits = true;
1798  break;
1799  }
1800  else if (!split_on_whitespace && str[i] == ch) {
1801  has_splits = true;
1802  break;
1803  }
1804  }
1805  }
1806 
1807  if (!has_splits)
1808  return false;
1809 
1810  // Go through each entry in the table
1811  for (size_t row=0; row<m_TableEntries.size(); ++row) {
1812 
1813  // Don't parse comment rows (but do get the header, if any)
1814  if (row != m_ColumnHeaderRow &&
1815  (row < (size_t)m_ImportFromRow || m_TableEntries[row].GetRowNum() == -1)) {
1816  continue;
1817  }
1818 
1819  // Skip if this particular row has fewer than 'col_idx' fields
1820  if (col_idx >= m_TableEntries[row].GetNumFields())
1821  continue;
1822 
1823  // Get the row and replace the first instance of 'ch' in column 'col_idx' with
1824  // a separator character. If there are multiple instances of 'ch', only the
1825  // first is updated; if there are none, nothing changes.
1826  string& str = m_TableEntries[row].GetValue();
1827 
1828  size_t start_idx = m_TableEntries[row].GetFields()[col_idx].first;
1829  size_t chars = m_TableEntries[row].GetFields()[col_idx].second;
1830 
1831  // get string up to first character of split column
1832  string merged_str = str.substr(0, start_idx);
1833 
1834  bool col_split = false;
1835  for (size_t i=start_idx; i<start_idx+chars; ++i) {
1836  // We only split once per row, maximum
1837  if (split_on_whitespace && !col_split &&
1838  (str[i]==' ' || str[i]=='\t')) {
1839  merged_str += delimiter;
1840  col_split = true;
1841 
1842  // Get end of the whitespace and restart there
1843  string ws(" \t");
1844  size_t next_non_whitespace_idx = str.find_first_not_of(ws, i+1);
1845  if (next_non_whitespace_idx == string::npos ||
1846  next_non_whitespace_idx >= start_idx+chars)
1847  break;
1848  else
1849  i = next_non_whitespace_idx-1;
1850  }
1851  else if (str[i] == ch && !col_split) {
1852  merged_str += delimiter;
1853  col_split = true;
1854  }
1855  else {
1856  merged_str += str[i];
1857  }
1858  }
1859 
1860  // Add a delimiter to the end of the field to create a new, blank field.
1861  // Note that this won't work if the user chooses to merge delimiters.
1862  if (!col_split)
1863  merged_str += delimiter;
1864 
1865  // Add all remaining characters after the split column
1866  merged_str += str.substr(start_idx + chars, str.length()-(start_idx+chars));
1867  m_TableEntries[row].GetValue() = merged_str;
1868  }
1869 
1870  RecomputeFields(true);
1871  return true;
1872 }
1873 
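// Illustrative usage sketch (not part of the original source; 'ds' is a
// hypothetical, already-loaded CTableImportDataSource). SplitColumn()
// replaces the first occurrence of 'ch' (or of a whitespace run) within the
// given column of each row with the table delimiter, turning one column
// into two:
//
//     ds.SplitColumn(3, '-');         // e.g. "100-200" => fields "100", "200"
//     ds.SplitColumn(3, ' ', true);   // or split at the first whitespace run
//
// The call returns false if nothing could be split.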
1874 bool CTableImportDataSource::ReplaceSpaces()
1875 {
1876  // What is our delimiter character? If there are multiple we can choose any.
1877  // If the delimiter character is space, with multiple spaces required,
1878  // we will use 2 blanks.
1879  string delimiter = "";
1881  delimiter = " ";
1882  }
1883  // should be true unless the user unchecked all delims...
1884  else if (m_DelimRules.GetDelimiters().size() > 0) {
1886  }
1887  else {
1888  return false;
1889  }
1890 
1891  // If any of the rows are split, we will add a new column to each row,
1892  // although that column will be empty in rows without a split. That is
1893  // likely what the user wants in most cases.
1894 
1895  // Initial run to determine if there are splits in any rows. If not
1896  // return.
1897  bool has_splits = false;
1898  for (size_t row=0; row<m_TableEntries.size() && !has_splits; ++row) {
1899  // Don't parse header or comment rows for this check
1900  if (row < (size_t)m_ImportFromRow || m_TableEntries[row].GetRowNum() == -1) {
1901  continue;
1902  }
1903 
1904  // Check whether this row contains spaces that may need to be collapsed
1905  // into the delimiter. If no row does, there is nothing to replace and
1906  // we return without modifying the table.
1907  string& str = m_TableEntries[row].GetValue();
1908 
1909  if (NStr::Find(str, " ") != NPOS)
1910  has_splits = true;
1911  }
1912 
1913  if (!has_splits)
1914  return false;
1915 
1916  // Go through each entry in the table
1917  for (size_t row=0; row<m_TableEntries.size(); ++row) {
1918 
1919  // Don't parse comment rows (but do get the header, if any)
1920  if (row != m_ColumnHeaderRow &&
1921  (row < (size_t)m_ImportFromRow || m_TableEntries[row].GetRowNum() == -1)) {
1922  continue;
1923  }
1924 
1925  // Get row and replace any instance of 2 or more spaces with a single instance of 'delimiter'
1926  string& str = m_TableEntries[row].GetValue();
1927 
1928  string result;
1929  int spaces = 0;
1930  for (size_t i=0; i<str.size(); ++i) {
1931  if (str[i] != ' ' && (spaces==0)) {
1932  result += str[i];
1933  }
1934  else if (str[i] == ' ') {
1935  spaces += 1;
1936  }
1937  else if (str[i] != ' ' && (spaces > 0)) {
1938  if (spaces == 1) {
1939  result += ' ';
1940  }
1941  else {
1942  result += delimiter;
1943  }
1944  result += str[i];
1945  spaces = 0;
1946  }
1947  }
1948 
1949  // handle trailing space or spaces
1950  if (spaces > 1)
1951  result += delimiter;
1952  else if (spaces == 1)
1953  result += ' ';
1954 
1955  m_TableEntries[row].GetValue() = result;
1956  }
1957 
1958  RecomputeFields(true);
1959  return true;
1960 }
1961 
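// Illustrative example (not part of the original source) of the collapsing
// rule implemented above, assuming the current delimiter is a tab:
//
//     "gene A    1200   protein kinase"
//  => "gene A\t1200\tprotein kinase"
//
// Runs of two or more spaces become a single delimiter; single spaces (as in
// "gene A" and "protein kinase") remain part of the field text.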
1962 void CTableImportDataSource::ConvertToSeqAnnot(CRef<CSeq_annot> annot_container)
1963 {
1964  // The seq-table
1966  table.Reset(new CSeq_table());
1967 
1968  CRef<CUser_object> column_meta_info;
1969  column_meta_info.Reset(new CUser_object());
1970 
1971  table->SetNum_rows(m_NumImportedRows);
1972 
1973  // Field id for data type - initialized based on type of each column
1976 
1977  // Skip first column since it's just a row number
1978  for (size_t i=1; i<m_Columns.size(); ++i) {
1979  CRef<CSeqTable_column_info> cinfo;
1980  cinfo.Reset(new CSeqTable_column_info());
1981 
1982  string label;
1983  string value_type;
1984  string properties;
1985 
1986  label = "Column.";
1988 
1990 
1991  switch (m_Columns[i].GetType()) {
1993  continue;
1994 
1996  {
1997  switch (m_Columns[i].GetDataType()) {
2000  break;
2002  // Want to use location IDs in all cases because then table
2003  // can do broadcasting (doesn't work with GIs which are stored
2004  // as ints)
2005  //field_id = CSeqTable_column_info_Base::eField_id_location_gi;
2007  break;
2010  break;
2013  if (m_Columns[i].GetAssembly().GetUseMapping() && !m_Columns[i].GetAssembly().GetAssemblyAcc().empty()) {
2014  properties += " &genome_assembly=" + m_Columns[i].GetAssembly().GetAssemblyAcc();
2015  }
2016  break;
2019  break;
2021  // None of the different column types seem to apply to snps/variations
2023  if (m_Columns[i].GetAssembly().GetUseMapping() && !m_Columns[i].GetAssembly().GetAssemblyAcc().empty()) {
2024  properties += " &genome_assembly=" + m_Columns[i].GetAssembly().GetAssemblyAcc();
2025  }
2026  break;
2027  default:
2029  break;
2030  };
2031  break;
2032  }
2033 
2035  {
2036  switch (m_Columns[i].GetDataType()) {
2039  break;
2042  break;
2045  break;
2048  break;
2049  default:
2051  break;
2052  };
2053 
2054  // If start or stop positions are one-based, we subtract one when
2055  // we copy the field to the seq-annot, so we set the property false
2056  if (m_Columns[i].GetOneBased() &&
2057  m_Columns[i].GetDataType() != CTableImportColumn::eStartPosition &&
2058  m_Columns[i].GetDataType() != CTableImportColumn::eStopPosition)
2059  properties += " &one_based=true";
2060  else
2061  properties += " &one_based=false";
2062  break;
2063  }
2064 
2066  {
2067  switch (m_Columns[i].GetDataType()) {
2070  break;
2071  default:
2073  break;
2074  };
2075  break;
2076  }
2077 
2079  {
2080  switch (m_Columns[i].GetDataType()) {
2083  break;
2086  break;
2089  break;
2092  break;
2095  break;
2098  break;
2101  break;
2104  break;
2105  default:
2107  break;
2108  };
2109  break;
2110  }
2111 
2112  default:
2113  // All options already accounted for.
2114  break;
2115  };
2116 
2117  properties = "&xtype=" + value_type + properties;
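 // For example (illustrative, with hypothetical values): a column whose
 // value_type is "Length" and which is flagged one-based would end up with
 // a properties string along the lines of "&xtype=Length &one_based=true".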
2118  cinfo->SetField_id(field_id);
2119  cinfo->SetTitle(m_Columns[i].GetName());
2120 
2121  // Add any other properties set by the caller
2122  const map<string,string>& pmap = m_Columns[i].GetPropertyValues();
2123  map<string,string>::const_iterator iter;
2124 
2125  for (iter=pmap.begin(); iter!=pmap.end(); ++iter) {
2126  string prop = "&" + (*iter).first + "=" + (*iter).second;
2127  properties += prop;
2128  }
2129 
2130  CRef<CSeqTable_column> column;
2131  column.Reset(new CSeqTable_column());
2132 
2133  column->SetHeader(*cinfo);
2134  CRef<CSeqTable_multi_data> data;
2135  data.Reset(new CSeqTable_multi_data());
2136 
2137  switch (m_Columns[i].GetType()) {
2141  else
2143  break;
2146  break;
2149  break;
2152  break;
2154  break;
2155  default:
2156  break;
2157  }
2158 
2159  // The number of columns is one greater than the number of fields since
2160  // the first column is the row number (and the only rows to import are
2161  // those rows where the row number is an integer, not '-').
2162  int field_num = static_cast<int>(i-1);
2163 
2164  // subtract 1 from numeric start/stop position fields that are one based
2165  int num_delta = 0;
2166  if (m_Columns[i].GetOneBased() &&
2167  (m_Columns[i].GetDataType() == CTableImportColumn::eStartPosition ||
2168  m_Columns[i].GetDataType() == CTableImportColumn::eStopPosition)) {
2169  num_delta = 1;
2170  }
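 // For example (illustrative): in a one-based start-position column a field
 // value of "100" is stored in the Seq-table as 99, and the column's
 // meta-info property was written as "&one_based=false" above so the value
 // is not shifted again later.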
2171 
2172  string field_string_value;
2173 
2174  for (size_t j=m_ImportFromRow; j<m_TableEntries.size(); ++j) {
2175 
2176  // If this is a comment row, ignore it. (Header rows can also
2177  // be ignored this way, but we start at m_ImportFromRow, which is
2178  // after the header rows)
2179  if (m_TableEntries[j].GetRowNum() == -1)
2180  continue;
2181 
2182  field_string_value = "";
2183 
2184  // Some rows may have fewer fields (data problems). If so,
2185  // we will put in default values (since each column is in a separate
2186  // array, missing entries would cause fields not to line up as
2187  // in the original rows)
2188  if (field_num < (int)m_TableEntries[j].GetNumFields()) {
2189  field_string_value = m_TableEntries[j].GetField(field_num);
2190 
2191  NStr::TruncateSpacesInPlace(field_string_value);
2192 
2193  switch (m_Columns[i].GetType()) {
2195  {
2196  if ( value_type == "Chromosome" || value_type == "Rsid" ) {
2197  data->SetString().push_back(field_string_value);
2198  }
2199  else {
2200  try {
2201  // Currently we save gi's as ids so we won't use this (int) gi
2202  // since it doesn't work with broadcasting when table is loaded.
2204  CSeq_id* gid = new CSeq_id(field_string_value);
2205  data->SetInt().push_back(GI_TO(int, gid->GetGi()));
2206  }
2207  }
2208  catch (CException& e) {
2209  e.ReportThis();
2210  data->SetInt().push_back(0);
2211  }
2212 
2213  try {
2215  CRef<CSeq_id> id(new CSeq_id(
2216  CSeq_id_Base::e_Local, field_string_value));
2217  data->SetId().push_back(id);
2218  }
2219  else {
2220  CRef<CSeq_id> id(new CSeq_id(field_string_value));
2221  data->SetId().push_back(id);
2222  }
2223  }
2224  catch (CException& e) {
2225  e.ReportThis(); //e_not_set 0
2226  CRef<CSeq_id> id(new CSeq_id());
2227  data->SetId().push_back(id);
2228  }
2229  }
2230  }
2231  break;
2233  {
2234  // We use the unsigned conversion since it supports
2235  // suffixes like KB, MB. But since it's unsigned
2236  // we have to handle negative numbers ourselves
2237  int sign_val = 1;
2238 
2239  if (field_string_value.length() > 1 &&
2240  field_string_value[0] == '-') {
2241  field_string_value = field_string_value.substr(1, field_string_value.length()-1);
2242  sign_val = -1;
2243  }
2244 
2245  int val = (int)NStr::StringToUInt8_DataSize(field_string_value,
2247 
2248  // need to set 1-based ints to 0-based in some cases. Only do this
2249  // if number is positive since a 0 or negative number in a field that is
2250  // supposed to be 'one-based' is not really meaningful
2251  if (sign_val == 1 && val > 0)
2252  val -= num_delta;
2253 
2254  val *= sign_val;
2255 
2256  data->SetInt().push_back(val);
2257  }
2258  break;
2260  {
2261  double val = NStr::StringToDouble(field_string_value,
2263  data->SetReal().push_back(val);
2264  }
2265  break;
2267  {
2268  data->SetString().push_back(field_string_value);
2269  }
2270  break;
2271  default:
2272  break;
2273  }
2274  }
2275  else {
2276  // No data was in the table for this field - log an error showing what
2277  // data was missing (1-based indices for log)
2278  ERR_POST(Error << "Table missing column value for row, column: ("
2279  << j-m_ImportFromRow + 1 << ", " << i << ")");
2280 
2281  // Put in some default value in based on column type
2282  switch (m_Columns[i].GetType()) {
2284  {
2285  if ( value_type == "Chromosome") {
2286  data->SetString().push_back("");
2287  }
2288  else {
2289  try {
2290  // Currently we save gi's as ids so we won't use this (int) gi
2291  // since it doesn't work with broadcasting when table is loaded.
2293  data->SetInt().push_back(0); //??
2294  }
2296  CRef<CSeq_id> id(new CSeq_id(
2297  CSeq_id_Base::e_Local, ""));
2298  data->SetId().push_back(id);
2299  }
2300  else {
2301  CRef<CSeq_id> id(new CSeq_id(""));
2302  data->SetId().push_back(id);
2303  }
2304  }
2305  catch (CException& e) {
2306  e.ReportThis();
2307  }
2308  }
2309  }
2310  break;
2312  {
2313  // What is a reasonable default for a number? -1, 0, maxint?
2314  // Use -1 to force an error if an attempt is made to convert
2315  // it to a seq-loc or feature.
2316  data->SetInt().push_back(-1);
2317  }
2318  break;
2320  {
2321  // default: -1.0
2322  data->SetReal().push_back(-1.0);
2323  }
2324  break;
2326  {
2327  data->SetString().push_back("");
2328  }
2329  break;
2330  default:
2331  break;
2332  }
2333  }
2334  }
2335 
2336 
2337  column->SetData(*data);
2338  table->SetColumns().push_back(column);
2339 
2340  // Not required for qualifier tables
2341  column_meta_info->AddField(label, properties);
2342  }
2343 
2344  CRef<CObject_id> column_meta_info_id;
2345  column_meta_info_id.Reset(new CObject_id());
2346  column_meta_info_id->SetStr("Column Meta Info");
2347 
2348  column_meta_info->SetType(*column_meta_info_id);
2349  // Clear any existing user objects;
2350  annot_container->SetDesc().Set().clear();
2351  // Add user object with meta info for all fields in this table
2352  annot_container->AddUserObject(*column_meta_info);
2353 
2354  /// file name of source table (fname is a temporary asn file)
2355  CFile f(string(m_FileName.ToUTF8()));
2356  annot_container->SetTitleDesc(f.GetName());
2357 
2358  /// See e:\src\gbench\include\objects\seqfeat\SeqFeatData_.hpp
2359  table->SetFeat_type(CSeqFeatData_Base::e_not_set);
2360 
2361  CRef<CSeq_annot::TData> d;
2362  d.Reset(new CSeq_annot::TData());
2363  d->SetSeq_table(*table);
2364 
2365  annot_container->SetData(*d);
2366  }
2367 
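// Illustrative usage sketch (not part of the original source; 'ds' is a
// hypothetical, fully configured data source). ConvertToSeqAnnot() fills a
// caller-supplied Seq-annot with the Seq-table and column meta-info built
// above:
//
//     CRef<CSeq_annot> annot(new CSeq_annot());
//     ds.ConvertToSeqAnnot(annot);
//     // annot now carries one CSeqTable_column per imported column plus a
//     // "Column Meta Info" user object describing each column's type.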
2369  {
2370  LOG_POST(Info << "Table Import column descriptions: ");
2371  for (size_t i=1; i<m_Columns.size(); ++i) {
2372  m_Columns[i].LogColumnInfo();
2373  }
2374  }
2375 
2376 void CTableImportDataSource::ExportTableParms(CUser_object& user_object)
2377 {
2378  user_object.AddField("comment-char", (int)m_CommentChar);
2379  user_object.AddField("table-type", (int)m_TableType);
2380  user_object.AddField("file-type", (int)m_FileType);
2381  user_object.AddField("first-row", m_ImportFromRow);
2382  user_object.AddField("column-header-row", m_ColumnHeaderRow);
2383  user_object.AddField("recompute-delimiters", m_UseCurrentDelimiters);
2384 
2385  CRef<CUser_field> delimiter_object(new CUser_field());
2386  delimiter_object->SetLabel().SetStr() = "delimiters";
2387  m_DelimRules.SaveAsn(delimiter_object.GetObject());
2388  user_object.SetData().push_back(delimiter_object);
2389 
2390  CRef<CUser_field> column_vec(new CUser_field());
2391  column_vec->SetLabel().SetStr() = "column-array";
2392 
2393  vector<CRef<CUser_field> > columns;
2394 
2395  for (size_t i=1; i<m_Columns.size(); ++i) {
2397  column->SetLabel().SetStr() = "column";
2398 
2399  m_Columns[i].SaveAsn(column.GetObject());
2400 
2401  columns.push_back(column);
2402  }
2403 
2404  column_vec->SetData().SetFields() = columns;
2405  user_object.SetData().push_back(column_vec);
2406  }
2407 
2408 void CTableImportDataSource::ImportTableParms(CUser_object& user_object)
2409 {
2410  if (user_object.HasField("comment-char") &&
2411  user_object.GetField("comment-char").GetData().IsInt()) {
2412  m_CommentChar = (char)user_object.
2413  GetField("comment-char").GetData().GetInt();
2414  }
2415 
2416  if (user_object.HasField("table-type") &&
2417  user_object.GetField("table-type").GetData().IsInt()) {
2418  m_TableType = (EFieldSeparatorType)user_object.
2419  GetField("table-type").GetData().GetInt();
2420  }
2421 
2422  if (user_object.HasField("file-type") &&
2423  user_object.GetField("file-type").GetData().IsInt()) {
2424  m_FileType = (ETableFileType)user_object.
2425  GetField("file-type").GetData().GetInt();
2426  }
2427 
2428  if (user_object.HasField("first-row") &&
2429  user_object.GetField("first-row").GetData().IsInt()) {
2430  m_ImportFromRow = user_object.
2431  GetField("first-row").GetData().GetInt();
2432  }
2433 
2434  if (user_object.HasField("column-header-row") &&
2435  user_object.GetField("column-header-row").GetData().IsInt()) {
2436  m_ColumnHeaderRow = user_object.
2437  GetField("column-header-row").GetData().GetInt();
2438  }
2439 
2440  if (user_object.HasField("recompute-delimiters") &&
2441  user_object.GetField("recompute-delimiters").GetData().IsBool()) {
2442  m_UseCurrentDelimiters = user_object.
2443  GetField("recompute-delimiters").GetData().GetBool();
2444  }
2445 
2446  if (user_object.HasField("delimiters")) {
2447  CUser_field& delimiter_object = user_object.SetField("delimiters");
2448  m_DelimRules.LoadAsn(delimiter_object);
2449  }
2450 
2451  if (user_object.HasField("column-array")) {
2452  const CUser_field& columns = user_object.GetField("column-array");
2453 
2454  // Erase any existing columns except the first (line number) column since that
2455  // is never stored
2456  if (m_Columns.size() > 1)
2457  m_Columns.erase(m_Columns.begin()+1, m_Columns.end());
2458 
2459  if (columns.GetData().IsFields()) {
2460  vector<CRef<CUser_field> > col_fields = columns.GetData().GetFields();
2461 
2462  for (size_t i=0; i<col_fields.size(); ++i) {
2463  CTableImportColumn col;
2464  col.LoadAsn(col_fields[i].GetObject());
2465  m_Columns.push_back(col);
2466  }
2467  }
2468  }
2469 }
2470 
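// Illustrative round-trip sketch (not part of the original source; 'ds' is a
// hypothetical data source). ExportTableParms()/ImportTableParms() above
// serialize and restore the load parameters (comment char, table type,
// delimiters, column setup) through a CUser_object, e.g. so a previous
// import configuration can be re-applied:
//
//     CUser_object parms;
//     ds.ExportTableParms(parms);    // capture the current settings
//     // ... store or transfer 'parms' ...
//     ds.ImportTableParms(parms);    // later, restore them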
2471 
User-defined methods of the data storage class.
Data storage class.
void GetGroupOccuranceAverage(char from_char, char to_char, float &certainty, int &first_non_header) const
Capture the point at which the # of occurrences of characters between from_char and to_char approaches 'normal' ...
size_t m_CaptureTarget
Number of rows we will try to capture (max)
int m_CharCount
Total number of characters that went into updating the histogram.
void AddInitialRows(const string &row)
First rows in files may contain headers. Record these for later analysis.
void UpdateHistogramWithDelim(const string &row, char delim)
Updates occurrence data used by GetDelimiterProbabilites while ignoring characters enclosed by string ...
vector< vector< int > > m_CharFrequency
The outer vector m_CharFrequency[i] represents the number of times a character occurs 'i' times among...
void GetDelimiterProbablities(float &max_score, vector< char > &delims, NStr::EMergeDelims &merge)
Return the most likely delimiter token(s) and its weighted probability (max_score: 0....
void UpdateHistogram(const string &row)
Updates occurrence data used by GetDelimiterProbabilities() based on the current row.
void x_UpdateMaxRowLen(int len)
Updates m_CharFrequency to reflect the current maximum row length.
vector< CMergedChar > m_Repeats
int GetCharCount() const
Return number of characters processed so far.
vector< string > m_RecordedRows
Up to m_CaptureTarget rows recorded in calls to UpdateHistogram*()
int m_RowCount
The number of rows that went into updating the histogram (this is the number of times that UpdateHist...
vector< string > m_InitialRows
Set of initial rows in file (may contain headers)
static const int s_NumDelimiters
Number of entries in m_CharFrequency (256 + merged delimiters we check)
CFile –.
Definition: ncbifile.hpp:1604
void AddUserObject(CUser_object &obj)
Definition: Seq_annot.cpp:169
void SetTitleDesc(const string &title)
Definition: Seq_annot.cpp:96
CTableDelimiterRules -.
vector< char > m_Delimiters
One or more single characters that divide the table entries into separate fields.
const vector< char > & GetDelimiters() const
bool m_MultiLineQuotes
CSV formats allows quotes to extend over multiple lines.
void SetDelimiters(const vector< char > &d)
Get/set delimiter characters.
void LoadAsn(CUser_field &delimiter_object)
Import delimiter rules from ASN user-object format.
void SaveAsn(CUser_field &user_field) const
Export delimiter rules in ASN user-object format.
void Reset()
Clear delimiters and set all values to defaults.
void SetMergeDelimiters(bool b)
Get/set merge delimiters rule for tokenizing table into fields.
char m_QuoteChar
Any delimiters inside strings enclosed in m_QuoteChar are not considered as field separators.
bool m_MergeDelimiters
If m_MergeDelimiters is true, adjacent delimiters (characters from m_Delimiters) next to each other a...
void LogDelims() const
Write delims information to log.
bool MatchingDelimiters(vector< char > other_delims) const
Return true if the delimiters match (even if order is different)
CTableImportColumn -.
void SetName(const string &n)
static string GetStringFromDataType(eDataType t)
Return a string version of a data-type (e.g. "Length" for eLength)
void LoadAsn(CUser_field &user_field)
Import column info from ASN user-object format.
ETableFileType
Possible file types - allows special processing for table types that are not totally generic.
int m_NumImportedRows
Number of rows to actually be imported.
void SetColumnHeaderRow(int c)
set/get (optional) row from which to parse column names
void SetTableType(EFieldSeparatorType e)
Update current table type.
vector< CTableImportRow > m_TableEntries
Holds a string and field position and size for each line from the file.
char m_CommentChar
Optional comment character - lines beginning with this character are not imported.
vector< CTableImportColumn > m_Columns
Descriptors for columns.
void LogColumnInfo() const
Log column information.
int m_ImportFromRow
A generic mechanism to avoid loading comment or header column rows at the top of the file - a user-se...
void SetCommentChar(char c)
set/get comment character (lines beginning with this are not imported)
void MergeColumns(vector< size_t > col_indices, char ch, bool no_merge_char=false)
Replace all delimiter characters separating cols in the array 'col_indices' with the delimiter char '...
void x_FindHeaderRows(const CCharHistogram &hist)
Examines input data and makes a best guess at how many header rows there are and if there is a speci...
void RecomputeFields(bool recreate_columns, int recompute_count=-1)
Updates the individual rows and columns to match the current delimiter choice.
int m_ColumnHeaderRow
If != -1, parse selected row to get column names.
CTableDelimiterRules m_DelimRules
For character-delimited tables, the delimiter character(s), merge rule, and quote-handling option.
void ExtractFixedFields()
Update fields in rows to reflect column widths in fixed tables (use character widths in m_Columns)
void SetHeaderAndFirstRow(int column_header_row, int first_row)
Set column header row and first row (more efficient when updating both)
bool LoadTable(const wxString &fname, CUser_object &user_object)
static void x_ParseQuotedEntry(const CTempString &tr, const CTempString &delim, const CTempString &delim_and_quote, NStr::EMergeDelims merge, bool multiple_spaces_only, char quote_char, vector< pair< size_t, size_t > > &token_pos)
Parse fields from 'str' returning position and lengths, respectively, of parsed fields in token_pos.
void LogFixedFieldWidths() const
Log fixed field widths.
EFieldSeparatorType m_TableType
Tells if table fields are delimited by characters or are fixed width.
void ExportTableParms(CUser_object &user_object)
Export table load parameters in ASN user-data format.
string GetField(size_t row, size_t col) const
return a specific field from a specific row, based on current table type and delimiter
EFieldSeparatorType
Possible delimiter options.
void SaveTable(CNcbiOfstream &ofs)
Save possibly edited table (edits possible w/ merge/split cols and row edits)
ETableFileType m_FileType
The underlying file type (or undefined)
wxString m_FileName
Name of file from which table was loaded.
bool x_PickFileType()
After loading rows, this tests for any distinctive file types (which would allow delimiter and header...
size_t m_MaxRowLen
Maximum row length - useful for displaying data in single-column mode.
void RecomputeRowFields(size_t row_idx)
Do same but only for 1 row.
void ImportTableParms(CUser_object &user_object)
Import table load parameters in ASN user-data format.
size_t m_MaxNonImportedRowLength
Rows that are not imported are displayed differently so it's helpful to know their maximum width (in ...
void RecomputeHeaders()
Update columns to generated names or names parsed from row m_ColumnHeaderRow.
void ClearTable()
clears all columns rows and delimiters
void x_RecomputeRowFields(CTableImportRow &row, const CTempString &delims_ts, const CTempString &delims_quote_ts, NStr::EMergeDelims merge_delims, bool multiple_spaces_only)
Recompute the fields for the provided row.
void ConvertToSeqAnnot(CRef< CSeq_annot > annot_container)
Save data in table into annot_container.
void SetFirstImportRow(int r)
set/get first row for import (0-based)
bool ReplaceSpaces()
Replace all instances of multiple spaces with the current delimiter.
bool SplitColumn(size_t col_idx, char ch, bool split_on_whitespace=false)
Split column col_idx into 2 columns using the character 'ch' (if 'ch' does not appear in the column,...
bool m_UseCurrentDelimiters
If true we should use pre-determined delimiters when loading.
static void x_ParseEntry(const CTempString &str, const CTempString &delim, NStr::EMergeDelims merge, bool multiple_spaces_only, vector< pair< size_t, size_t > > &token_pos)
Parse fields from 'str' returning position and lengths, respectively, of parsed fields in token_pos.
string m_TableEntry
String content of this row in the table.
vector< pair< size_t, size_t > > m_Fields
Each entry represents a field as a start/length pair.
string GetField(int column_idx) const
Get a specific field or "" if column_idx > m_Fields.size()
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
const CUser_field & GetField(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Access a named field in this user field.
Definition: User_field.cpp:211
CUser_field & AddField(const string &label, int value)
add fields to the current user field
Definition: User_field.cpp:92
bool HasField(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Verify that a named field exists.
Definition: User_field.cpp:393
CUser_object & AddField(const string &label, const string &value, EParseField parse=eParse_String)
add a data field to the user object that holds a given value
CUser_field & SetField(const string &str, const string &delim=".", const string &obj_subtype=kEmptyStr, NStr::ECase use_case=NStr::eCase)
Access a named field in this user object.
const CUser_field & GetField(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Access a named field in this user object.
Definition: User_object.cpp:71
Interface for testing cancellation request in a long lasting operation.
Definition: icanceled.hpp:51
const_iterator begin() const
Definition: map.hpp:151
const_iterator end() const
Definition: map.hpp:152
#define GI_TO(T, gi)
Definition: ncbimisc.hpp:1085
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
#define _TRACE(message)
Definition: ncbidbg.hpp:122
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
string ReportThis(TDiagPostFlags flags=eDPF_Exception) const
Report this exception only.
Definition: ncbiexpt.cpp:397
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
Int8 GetLength(void) const
Get size of file.
Definition: ncbifile.cpp:3204
bool IsFile(EFollowLinks follow=eFollowLinks) const
Check whether a directory entry is a file.
Definition: ncbifile.hpp:3940
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
TObjectType & GetObject(void)
Get object.
Definition: ncbiobj.hpp:1011
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
virtual bool IsCanceled(void) const =0
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
Definition: ncbistre.hpp:500
CNcbiIstream & NcbiGetline(CNcbiIstream &is, string &str, char delim, string::size_type *count=NULL)
Read from "is" to "str" up to the delimiter symbol "delim" (or EOF)
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
EMergeDelims
Whether to merge adjacent delimiters.
Definition: ncbistr.hpp:2514
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
static double StringToDouble(const CTempStringEx str, TStringToNumFlags flags=0)
Convert string to double.
Definition: ncbistr.cpp:1387
static Uint8 StringToUInt8_DataSize(const CTempString str, TStringToNumFlags flags=0)
Convert string that can contain "software" qualifiers to Uint8.
Definition: ncbistr.cpp:1539
#define NPOS
Definition: ncbistr.hpp:133
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3201
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2891
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
Definition: tempstr.hpp:334
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
Definition: ncbistr.cpp:3186
static string & ToLower(string &str)
Convert string to lower case – string& version.
Definition: ncbistr.cpp:405
static const size_type npos
Definition: tempstr.hpp:72
@ fConvErr_NoThrow
Do not throw an exception on error.
Definition: ncbistr.hpp:285
@ fAllowCommas
Allow commas. See 'ENumToStringFlags::fWithCommas'.
Definition: ncbistr.hpp:293
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2508
@ eNoMergeDelims
Definition: ncbistr.hpp:2516
@ eMergeDelims
Definition: ncbistr.hpp:2515
static const char label[]
const TData & GetData(void) const
Get the Data member data.
TBool GetBool(void) const
Get the variant data.
bool IsInt(void) const
Check if variant Int is selected.
TData & SetData(void)
Assign a value to Data data member.
void SetLabel(TLabel &value)
Assign a value to Label data member.
TInt GetInt(void) const
Get the variant data.
TStr & SetStr(void)
Select the variant.
Definition: Object_id_.hpp:304
bool IsInts(void) const
Check if variant Ints is selected.
void SetType(TType &value)
Assign a value to Type data member.
void SetData(TData &value)
Assign a value to Data data member.
bool IsBool(void) const
Check if variant Bool is selected.
const TInts & GetInts(void) const
Get the variant data.
EField_id
identification of the column data in the objects described by the table known column data types posit...
void SetTitle(const TTitle &value)
Assign a value to Title data member.
void SetField_id(TField_id value)
Assign a value to Field_id data member.
@ eField_id_location_strand
location strand
@ eField_id_id_local
main feature fields id.local.id
@ e_Real
a set of reals, one per row
@ e_String
a set of strings, one per row
@ e_Int
a set of 4-byte integers, one per row
@ e_not_set
No variant selected.
TGi GetGi(void) const
Get the variant data.
Definition: Seq_id_.hpp:889
@ e_Local
local use
Definition: Seq_id_.hpp:95
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
void SetDesc(TDesc &value)
Assign a value to Desc data member.
Definition: Seq_annot_.cpp:223
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
NCBI C++ stream class wrappers for triggering between "new" and "old" C++ stream libraries.
T max(T x_, T y_)
T min(T x_, T y_)
Holds all scoring parameters for a given character.
float m_FrequencyScore
Scores based on mean number of occurrences per row (small #'s are bad)
float m_OccuranceScore
Reflects consistency in the number of times the character appears in each row.
Holds properties for a single character or set of characters that are candidates for merging,...
USING_SCOPE(objects)