68 : m_RowCount(0), m_CharCount(0), m_CaptureTarget(100)
81 whitespace_repeat.
m_Chars =
" \t";
107 for (
i=0;
i<
row.size(); ++
i) {
108 unsigned int idx = (
unsigned int)
row[
i];
124 int occurances =
buf[
i];
157 for (
i=0;
i<
row.size(); ++
i) {
160 if (!quoted &&
row[
i] == delim) {
169 if (quoted &&
row[
i] == delim) {
170 if (
i+1 <
row.size() &&
row[
i+1] ==
'"')
178 unsigned int idx = (
unsigned int)
row[
i];
193 int occurances =
buf[
i];
216 int& first_non_header)
const
219 first_non_header = -1;
229 int num_chars = to_char - from_char + 1;
230 vector<float> mean_occurances(num_chars, 0.0f);
231 vector<char>
chars(num_chars,
' ');
238 for (
i=(
size_t)from_char;
i<=(size_t)to_char; ++
i) {
245 mean_occurances[
i-from_char] += (((float)j)*((float)
m_CharFrequency[j][
i]))/num_rows;
253 vector<float> mean_delta;
256 vector<float> row_occurances(num_chars, 0.0f);
260 if (c>=from_char && c<=to_char) {
261 row_occurances[c-from_char] += 1.0f;
269 float abs_delta = 0.0f;
270 for (k=0; k<(size_t)num_chars; ++k) {
271 float delta = row_occurances[k] - mean_occurances[k];
279 mean_delta.push_back(abs_delta);
290 float max_normal_delta = 0.0f;
291 float avg_row_len = 0.0f;
292 float avg_row_count = 0;
294 for (j=mean_delta.size()-1; j>=mean_delta.size()/2; --j) {
295 if (mean_delta[j] > max_normal_delta) {
296 max_normal_delta = mean_delta[j];
299 avg_row_count += 1.0f;
302 avg_row_len /= avg_row_count;
306 for (j=0; j<mean_delta.size()-2; ++j) {
309 if (row_len_delta < 0.0f)
310 row_len_delta *= -1.0f;
313 if (row_len_delta1 < 0.0f)
314 row_len_delta1 *= -1.0f;
316 if (mean_delta[j] + row_len_delta*5 <= max_normal_delta*1.6f &&
317 mean_delta[j+1] + row_len_delta1*5 <= max_normal_delta*1.6f) {
318 first_non_header =
static_cast<int>(j);
326 float prev_delta_avg = 0.0f;
327 float post_delta_avg = 0.0f;
329 if (first_non_header > 0 &&
330 first_non_header < (
int)mean_delta.size()-1) {
331 for (j=0; j<mean_delta.size(); ++j) {
332 if (j<(
size_t)first_non_header) {
333 prev_delta_avg += mean_delta[j]*(1.0f/(float)first_non_header);
335 else if (j>(
size_t)first_non_header) {
336 post_delta_avg += mean_delta[j]*(1.0f/(float)(mean_delta.size()-first_non_header));
343 if (post_delta_avg > 0)
344 certainty = prev_delta_avg/post_delta_avg;
346 certainty = prev_delta_avg;
350 vector<char>& delims,
353 std::vector<CDelimScore> m_Scores;
355 float max_frequency_score = 0.0f;
364 float num_rows_inv = 1.0f/num_rows;
367 float mean_occurance = 0.0f;
383 float occurances = (float)j;
386 double delta = (double)(occurances-mean_occurance);
399 m_Scores.push_back(char_score);
401 if (mean_occurance > max_frequency_score)
402 max_frequency_score = mean_occurance;
413 desired_frequency =
std::min(max_frequency_score, desired_frequency);
418 m_Scores[
i].m_FrequencyScore =
std::min(m_Scores[
i].m_FrequencyScore,
419 desired_frequency)/desired_frequency;
435 for (
i=0;
i<m_Scores.size(); ++
i) {
436 m_Scores[
i].m_CombinedScore = (m_Scores[
i].m_OccuranceScore +
437 m_Scores[
i].m_FrequencyScore*0.25f)/1.25f;
440 int delim_candidates_count = 0;
441 std::sort(m_Scores.begin(), m_Scores.end(), greater<CDelimScore>());
444 for (
i=0;
i<m_Scores.size(); ++
i) {
445 if (m_Scores[
i].m_CombinedScore > 0.0f)
452 if (m_Scores[
i].m_CombinedScore > 0.5f)
453 ++delim_candidates_count;
459 if (delim_candidates_count > 1) {
462 m_Scores.erase(m_Scores.begin() + delim_candidates_count, m_Scores.end());
464 vector<string> token_array;
465 for (
i=0;
i<m_Scores.size(); ++
i) {
472 if (m_Scores[
i].m_DelimChar < 256) {
473 delim_str =
string(1, (
char)m_Scores[
i].m_DelimChar);
480 vector<CMergedChar>::iterator iter;
485 delim_str = (*iter).m_Chars;
488 _TRACE(
"Execution error - missing repeat character: " <<
489 m_Scores[
i].m_DelimChar);
500 m_Scores[
i].m_TokenLenScore = (float)token_array.size();
504 for (
size_t tok = 0; tok<token_array.size(); ++tok) {
505 if (token_array[tok].
size() == 0) {
506 m_Scores[
i].m_TokenLenScore -= 2.0f;
508 else if (token_array[tok].
size() == 1) {
509 for (
size_t k=0; k<(size_t)delim_candidates_count; ++k) {
511 if (token_array[tok] ==
string(1, m_Scores[k].m_DelimChar) ) {
512 m_Scores[
i].m_TokenLenScore -= 2.0f;
522 m_Scores[
i].m_TokenLenScore = m_Scores[
i].m_TokenLenScore/(float)token_array.size();
526 float merge_penalty = 0.0f;
527 if ( m_Scores[
i].m_DelimChar > 255 ) {
528 merge_penalty = 0.01f;
533 merge_penalty += (m_Scores[
i].m_DelimChar == 258) ? 0.1f : 0.0f;
541 float token_bonus = 0.0f;
542 if (
char(m_Scores[
i].m_DelimChar) ==
',' ||
543 char(m_Scores[
i].m_DelimChar) ==
';' ||
544 char(m_Scores[
i].m_DelimChar) ==
'|' ||
545 char(m_Scores[
i].m_DelimChar) ==
' ' ||
546 char(m_Scores[
i].m_DelimChar) ==
'\t' ||
547 m_Scores[
i].m_DelimChar > 255)
552 m_Scores[
i].m_CombinedScore = (m_Scores[
i].m_OccuranceScore +
553 m_Scores[
i].m_FrequencyScore*0.2f +
554 (m_Scores[
i].m_TokenLenScore*0.25f)/1.50f) - merge_penalty + token_bonus;
561 std::sort(m_Scores.begin(), m_Scores.end(), greater<CDelimScore>());
565 for (
i=0;
i<m_Scores.size(); ++
i) {
566 if (m_Scores[
i].m_CombinedScore > 0.0f &&
i<5) {
567 _TRACE(
"Score for char: " << m_Scores[
i].m_DelimChar <<
" is: (" <<
568 m_Scores[
i].m_OccuranceScore <<
", " <<
569 m_Scores[
i].m_FrequencyScore <<
", " <<
570 m_Scores[
i].m_TokenLenScore <<
", " <<
571 m_Scores[
i].m_CombinedScore <<
")" );
579 max_score = m_Scores[0].m_CombinedScore;
581 if (m_Scores[0].m_DelimChar < 256) {
582 delims.push_back((
char)m_Scores[0].m_DelimChar);
590 vector<CMergedChar>::iterator iter;
595 for (
size_t j=0; j<(*iter).m_Chars.size(); ++j)
596 delims.push_back((*iter).m_Chars[j]);
599 _TRACE(
"Execution error - missing repeat character: " <<
600 m_Scores[
i].m_DelimChar);
617 sort(cur_delims.begin(), cur_delims.end());
618 sort(other_delims.begin(), other_delims.end());
620 return (cur_delims == other_delims);
633 LOG_POST(
Info <<
"Import Table Delimiters: \"" << delims <<
"\"");
647 user_field.
AddField(
"delim-chars", delims);
655 if (delimiter_object.
HasField(
"delim-chars") &&
660 for (
size_t i=0;
i<delims.size(); ++
i) {
665 if (delimiter_object.
HasField(
"quote-char") &&
671 if (delimiter_object.
HasField(
"multi-line-quotes") &&
677 if (delimiter_object.
HasField(
"merge-delimiters") &&
697 if (column_idx < (
int)
m_Fields.size()) {
698 pair<size_t,size_t> field_idx =
m_Fields[column_idx];
700 field =
m_TableEntry.substr(field_idx.first, field_idx.second);
710 : m_TableType(eDelimitedTable)
711 , m_FileType(eUndefinedFile)
714 , m_NumImportedRows(0)
716 , m_MaxNonImportedRowLength(0)
717 , m_ColumnHeaderRow(-1)
718 , m_UseCurrentDelimiters(
false)
740 CFile tstfile(
string(fname.ToUTF8()));
743 LOG_POST(
"Error opening file: " + fname);
781 if (!ifs->good() || ifs->eof())
799 int total_char_count = 0;
819 total_char_count +=
row.GetValue().size();
834 (row_count > hist_count1 && row_count < hist_count2 && row_count%10 == 0 ) ||
835 (row_count > hist_count2 && row_count%100 == 0)) {
839 bool analyze_row =
true;
841 Int8 average_rowlen = (
Int8)(total_char_count/row_count);
842 int projected_rowcount = (
int)(filesize/average_rowlen);
843 if (projected_rowcount >= 10)
845 else if (row_count < projected_rowcount-5)
856 if (row_count == 80 && filesize != -1) {
858 int projected_rowcount = (
int)(filesize/average_rowlen);
860 m_TableEntries.reserve(projected_rowcount + 0.2*projected_rowcount);
864 row.GetValue().clear();
872 if (row_count == 0) {
882 LOG_POST(
Info <<
"Imported Table Type Guess: Delimited Type");
896 float max_score = 0.0f;
910 if (max_score > 0.8f) {
917 LOG_POST(
Info <<
"Imported Table Type Guess: Delimited Type");
922 LOG_POST(
Info <<
"Imported Table Type Guess: Fixed Width");
974 string::size_type pos =
m_TableEntries[0].GetValue().find_first_not_of(
" \t");
975 char comment_char =
' ';
977 if (pos != string::npos)
980 if (comment_char ==
'#' &&
984 std::vector<char> delimiters;
985 delimiters.push_back(
'\t');
986 delimiters.push_back(
'|');
996 string::size_type spos =
999 if (spos != string::npos &&
1018 int header_rows = 1;
1019 int non_header_rows = 0;
1020 bool has_header =
true;
1021 char comment_char =
'0';
1022 string::size_type pos =
m_TableEntries[0].GetValue().find_first_not_of(
" \t");
1023 if (pos != string::npos)
1027 if ((comment_char >=
'0' && comment_char <=
'9') ||
1028 (comment_char >=
'A' && comment_char <=
'Z') ||
1029 (comment_char >=
'a' && comment_char <=
'z') ||
1030 comment_char ==
' ' || comment_char ==
'\t') {
1037 char first_char =
'0';
1039 if (pos != string::npos)
1042 if (first_char == comment_char) {
1045 if (non_header_rows > 0) {
1058 if (header_rows > 20 || header_rows > non_header_rows) {
1072 float certainty = 0.0f;
1073 int first_non_header_row = -1;
1077 if (certainty > 1.5f && first_non_header_row > 0) {
1098 bool multiple_spaces_only,
1099 vector<std::pair<size_t,size_t> >& token_pos)
1104 }
else if (delim.
empty() && !multiple_spaces_only) {
1105 token_pos.push_back(pair<size_t,size_t>(0,
str.length()));
1111 string::size_type pos, prev_pos;
1114 str.find_first_not_of(delim, pos) : pos);
1117 if (multiple_spaces_only &&
str[pos] ==
' ')
1123 pos =
str.find_first_of(delim, prev_pos);
1124 if (multiple_spaces_only) {
1128 token_pos.push_back(pair<size_t, size_t>(prev_pos,
str.length() - prev_pos));
1131 token_pos.push_back(pair<size_t, size_t>(prev_pos, pos-prev_pos));
1141 bool multiple_spaces_only,
1143 vector<std::pair<size_t,size_t> >& token_pos)
1148 }
else if (delim.
empty() && !multiple_spaces_only) {
1149 token_pos.push_back(pair<size_t,size_t>(0,
str.length()));
1155 string::size_type pos;
1156 string::size_type prev_pos = string::npos;
1157 string::size_type search_pos = string::npos;
1159 bool token_added =
true;
1164 str.find_first_not_of(delim, pos) : pos);
1166 if (multiple_spaces_only &&
str[pos] ==
' ')
1169 search_pos = prev_pos;
1176 pos =
str.find_first_of(delim_and_quote, search_pos);
1177 if (multiple_spaces_only) {
1181 token_pos.push_back(pair<size_t, size_t>(prev_pos,
str.length() - prev_pos));
1184 if (
str[pos] == quote_char) {
1190 pos <
str.length() &&
1191 (
str[pos]!=quote_char ||
1192 (
str[pos]==quote_char &&
str[pos-1]==quote_char) ||
1193 (
str[pos]==quote_char && pos<
str.length()-1 &&
1194 str[pos+1]==quote_char)); ++pos) {
1198 if (pos >=
str.length()-1) {
1199 token_pos.push_back(pair<size_t, size_t>(prev_pos,
str.length() - prev_pos));
1204 token_added =
false;
1209 token_pos.push_back(pair<size_t,size_t>(prev_pos, pos-prev_pos));
1211 search_pos = prev_pos;
1221 bool multiple_spaces_only)
1225 vector<std::pair<size_t,size_t> >& fields =
row.GetFields();
1232 multiple_spaces_only,
1240 multiple_spaces_only,
1249 for (
size_t i=0;
i<fields.size(); ++
i) {
1255 c.
SetWidth(
static_cast<int>(fields[
i].second));
1262 if (
i==0 && column_name.size() > 0 &&
1264 column_name = column_name.substr(1, column_name.size()-1);
1269 c.
SetWidth(
static_cast<int>(column_name.size()));
1279 (
int)fields[
i].second));
1287 string delims_quote_str;
1295 delims_ts = delims_str;
1298 delims_quote_ts = delims_quote_str;
1311 int recompute_count)
1315 string delims_quote_str;
1323 delims_ts = delims_str;
1326 delims_quote_ts = delims_quote_str;
1337 vector<CTableImportRow>::iterator iter;
1340 vector<CTableImportColumn> prev_columns =
m_Columns;
1342 if (recreate_columns) {
1396 if (recompute_count != -1 && ++count > recompute_count) {
1400 if (prev_columns.size() >
i) {
1422 vector<CTableImportRow>::iterator iter;
1434 size_t len = (*iter).GetValue().length();
1435 std::vector<std::pair<size_t,size_t> >& fields = (*iter).GetFields();
1439 size_t start_idx = 0;
1442 if (start_idx <
len) {
1444 fields.push_back(std::pair<size_t,size_t>(start_idx,w));
1448 fields.push_back(std::pair<size_t,size_t>(
len-1, w));
1460 LOG_POST(
Info <<
"Table Import: Fixed field widths: ");
1461 size_t start_idx = 0;
1465 LOG_POST(
Info <<
"Field #: " <<
i <<
" (" << start_idx <<
", "
1466 << start_idx + w <<
")");
1484 string delims_quote_str;
1493 delims_ts = delims_str;
1496 delims_quote_ts = delims_quote_str;
1531 size_t start_idx = 0;
1534 if (start_idx <
len) {
1536 fields.push_back(std::pair<size_t,size_t>(start_idx,w));
1540 fields.push_back(std::pair<size_t,size_t>(
len-1, w));
1548 size_t parsed_field_idx = 0;
1556 if (
i==0 && column_name.size() > 0 &&
1558 column_name = column_name.substr(1, column_name.size()-1);
1563 if (column_name.length() == 0) {
1569 if (column_name ==
"")
1578 (
int)column_name.size()));
1644 vector<CTableImportRow>::iterator iter;
1647 string::size_type spos = (*iter).GetValue().find_first_not_of(
" \n");
1649 if (spos == string::npos || (*iter).GetValue()[spos] !=
m_CommentChar) {
1650 (*iter).SetRowNum(row_num++);
1653 (*iter).SetRowNum(-1);
1659 (*iter).SetRowNum(-1);
1673 std::sort(col_indices.begin(), col_indices.end());
1676 if (col_indices.back() >=
m_Columns.size())
1682 for (
size_t i=0;
i<col_indices.size(); ++
i)
1683 col_indices[
i] -= 1;
1698 vector<size_t> merge_cols;
1700 for (
size_t i=0;
i<col_indices.size(); ++
i)
1702 merge_cols.push_back(col_indices[
i]);
1709 size_t field_end_idx =
m_TableEntries[
row].GetFields()[ merge_cols.front()].first +
1711 string merged_str =
str.substr(0, field_end_idx);
1714 for (
size_t i=1;
i<merge_cols.size(); ++
i) {
1715 size_t col = merge_cols[
i];
1722 for (
size_t col=merge_cols.front()+1; col<
m_TableEntries[
row].GetFields().size(); ++col) {
1725 if (std::find(merge_cols.begin(), merge_cols.end(), col) == merge_cols.end()) {
1732 merged_str +=
str.substr(field_start_idx,
1745 bool split_on_whitespace)
1776 bool has_splits =
false;
1795 for (
size_t i=start_idx;
i<start_idx+
chars; ++
i) {
1796 if (split_on_whitespace && (
str[
i]==
' ' ||
str[
i]==
'\t')) {
1800 else if (!split_on_whitespace &&
str[
i] == ch) {
1832 string merged_str =
str.substr(0, start_idx);
1834 bool col_split =
false;
1835 for (
size_t i=start_idx;
i<start_idx+
chars; ++
i) {
1837 if (split_on_whitespace && !col_split &&
1844 size_t next_non_whitespace_idx =
str.find_first_not_of(ws,
i+1);
1845 if (next_non_whitespace_idx == string::npos ||
1846 next_non_whitespace_idx >= start_idx+
chars)
1849 i = next_non_whitespace_idx-1;
1851 else if (
str[
i] == ch && !col_split) {
1856 merged_str +=
str[
i];
1866 merged_str +=
str.substr(start_idx +
chars,
str.length()-(start_idx+
chars));
1897 bool has_splits =
false;
1930 for (
size_t i=0;
i<
str.size(); ++
i) {
1931 if (
str[
i] !=
' ' && (spaces==0)) {
1934 else if (
str[
i] ==
' ') {
1937 else if (
str[
i] !=
' ' && (spaces > 0)) {
1952 else if (spaces == 1)
2014 properties +=
" &genome_assembly=" +
m_Columns[
i].GetAssembly().GetAssemblyAcc();
2024 properties +=
" &genome_assembly=" +
m_Columns[
i].GetAssembly().GetAssemblyAcc();
2059 properties +=
" &one_based=true";
2061 properties +=
" &one_based=false";
2117 properties =
"&xtype=" +
value_type + properties;
2125 for (iter=pmap.
begin(); iter!=pmap.
end(); ++iter) {
2126 string prop =
"&" + (*iter).first +
"=" + (*iter).second;
2133 column->SetHeader(*cinfo);
2162 int field_num =
static_cast<int>(
i-1);
2172 string field_string_value;
2182 field_string_value =
"";
2197 data->SetString().push_back(field_string_value);
2210 data->SetInt().push_back(0);
2217 data->SetId().push_back(
id);
2221 data->SetId().push_back(
id);
2227 data->SetId().push_back(
id);
2239 if (field_string_value.length() > 1 &&
2240 field_string_value[0] ==
'-') {
2241 field_string_value = field_string_value.substr(1, field_string_value.length()-1);
2251 if (sign_val == 1 &&
val > 0)
2256 data->SetInt().push_back(
val);
2263 data->SetReal().push_back(
val);
2268 data->SetString().push_back(field_string_value);
2278 ERR_POST(
Error <<
"Table missing column value for row, column: ("
2286 data->SetString().push_back(
"");
2293 data->SetInt().push_back(0);
2298 data->SetId().push_back(
id);
2302 data->SetId().push_back(
id);
2316 data->SetInt().push_back(-1);
2322 data->SetReal().push_back(-1.0);
2327 data->SetString().push_back(
"");
2346 column_meta_info_id->
SetStr(
"Column Meta Info");
2348 column_meta_info->
SetType(*column_meta_info_id);
2350 annot_container->
SetDesc().Set().clear();
2363 d->SetSeq_table(*
table);
2370 LOG_POST(
Info <<
"Table Import column descriptions: ");
2386 delimiter_object->
SetLabel().SetStr() =
"delimiters";
2388 user_object.
SetData().push_back(delimiter_object);
2391 column_vec->
SetLabel().SetStr() =
"column-array";
2393 vector<CRef<CUser_field> >
columns;
2397 column->SetLabel().SetStr() =
"column";
2405 user_object.
SetData().push_back(column_vec);
2410 if (user_object.
HasField(
"comment-char") &&
2413 GetField(
"comment-char").GetData().GetInt();
2416 if (user_object.
HasField(
"table-type") &&
2419 GetField(
"table-type").GetData().GetInt();
2422 if (user_object.
HasField(
"file-type") &&
2425 GetField(
"file-type").GetData().GetInt();
2428 if (user_object.
HasField(
"first-row") &&
2431 GetField(
"first-row").GetData().GetInt();
2434 if (user_object.
HasField(
"column-header-row") &&
2437 GetField(
"column-header-row").GetData().GetInt();
2440 if (user_object.
HasField(
"recompute-delimiters") &&
2443 GetField(
"recompute-delimiters").GetData().GetBool();
2446 if (user_object.
HasField(
"delimiters")) {
2451 if (user_object.
HasField(
"column-array")) {
2459 if (
columns.GetData().IsFields()) {
2460 vector<CRef<CUser_field> > col_fields =
columns.GetData().GetFields();
2462 for (
size_t i=0;
i<col_fields.size(); ++
i) {
2464 col.
LoadAsn(col_fields[
i].GetObject());
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
void GetGroupOccuranceAverage(char from_char, char to_char, float &certainty, int &first_non_header) const
Capture the point at which the number of occurrences of characters between from_char and to_char approaches 'normal' ...
size_t m_CaptureTarget
Number of rows we will try to capture (max)
int m_CharCount
Total number of characters that went into updating the histogram.
size_t GetMaxRowLen() const
void AddInitialRows(const string &row)
First rows in files may contain headers. Record these for later analysis.
void UpdateHistogramWithDelim(const string &row, char delim)
Updates occurrence data used by GetDelimiterProbablities() while ignoring characters enclosed by string ...
vector< vector< int > > m_CharFrequency
The outer vector m_CharFrequency[i] represents the number of times a character occurs 'i' times among...
void GetDelimiterProbablities(float &max_score, vector< char > &delims, NStr::EMergeDelims &merge)
Return the most likely delimiter token(s) and its weighted probability (max_score: 0....
void UpdateHistogram(const string &row)
Updates occurrence data used by GetDelimiterProbablities() based on the current row.
void x_UpdateMaxRowLen(int len)
Updates m_CharFrequency to reflect the current maximum row length.
vector< CMergedChar > m_Repeats
int GetCharCount() const
Return number of characters processed so far.
vector< string > m_RecordedRows
Up to m_CaptureTarget rows recorded in calls to UpdateHistogram*()
int m_RowCount
The number of rows that went into updating the histogram (this is the number of times that UpdateHist...
vector< string > m_InitialRows
Set of initial rows in file (may contain headers)
static const int s_NumDelimiters
Number of entries in m_CharFrequency (256 + merged delimiters we check)
void AddUserObject(CUser_object &obj)
void SetTitleDesc(const string &title)
vector< char > m_Delimiters
One or more single characters that divide the table entries into separate fields.
const vector< char > & GetDelimiters() const
bool m_MultiLineQuotes
CSV formats allows quotes to extend over multiple lines.
void SetDelimiters(const vector< char > &d)
Get/set delimiter characters.
bool GetMergeDelimiters() const
void LoadAsn(CUser_field &delimiter_object)
Import delimiter rules from ASN user-object format.
void SaveAsn(CUser_field &user_field) const
Export delimiter rules in ASN user-object format.
char GetQuoteChar() const
void Reset()
Clear delimiters and set all values to defaults.
void SetMergeDelimiters(bool b)
Get/set merge delimiters rule for tokenizing table into fields.
char m_QuoteChar
Any delimiters inside strings enclosed in m_QuoteChar are not considered as field separators.
bool GetMultipleSpacesOnly() const
bool m_MergeDelimiters
If m_MergeDelimiters is true, adjacent delimiters (characters from m_Delimiters) next to each other a...
void LogDelims() const
Write delims information to log.
bool MatchingDelimiters(vector< char > other_delims) const
Return true if the delimiters match (even if order is different)
void SetName(const string &n)
static string GetStringFromDataType(eDataType t)
Return a string version of a data-type (e.g. "Length" for eLength)
void LoadAsn(CUser_field &user_field)
Import column info from ASN user-object format.
ETableFileType
Possible file types - allows special processing for table types that are not totally generic.
int m_NumImportedRows
Number of rows to actually be imported.
void SetColumnHeaderRow(int c)
set/get (optional) row from which to parse column names
void SetTableType(EFieldSeparatorType e)
Update current table type.
vector< CTableImportRow > m_TableEntries
Holds a string and field position and size for each line from the file.
char m_CommentChar
Optional comment character - lines beginning with this character are not imported.
vector< CTableImportColumn > m_Columns
Descriptors for columns.
void LogColumnInfo() const
Log column information.
int m_ImportFromRow
A generic mechanism to avoid loading comment or header column rows at the top of the file - a user-se...
void SetCommentChar(char c)
set/get comment character (lines beginning with this are not imported)
void MergeColumns(vector< size_t > col_indices, char ch, bool no_merge_char=false)
Replace all delimiter characters separating cols in the array 'col_indices' with the delimiter char '...
char GetCommentChar() const
CTableImportDataSource()
ctor
void x_FindHeaderRows(const CCharHistogram &hist)
Examines input data and makes a best guess at how many header rows there are and if there is a speci...
void RecomputeFields(bool recreate_columns, int recompute_count=-1)
Updates the individual rows and columns to match the current delimiter choice.
int m_ColumnHeaderRow
If != -1, parse selected row to get column names.
CTableDelimiterRules m_DelimRules
For character-delimited tables, the delimiter character(s), merge rule, and quote-handling option.
void ExtractFixedFields()
Update fields in rows to reflect column widths in fixed tables (use character widths in m_Columns)
void SetHeaderAndFirstRow(int column_header_row, int first_row)
Set column header row and first row (more efficient when updating both)
bool LoadTable(const wxString &fname, CUser_object &user_object)
static void x_ParseQuotedEntry(const CTempString &tr, const CTempString &delim, const CTempString &delim_and_quote, NStr::EMergeDelims merge, bool multiple_spaces_only, char quote_char, vector< pair< size_t, size_t > > &token_pos)
Parse fields from 'str' returning position and lengths, respectively, of parsed fields in token_pos.
void LogFixedFieldWidths() const
Log fixed field widths.
EFieldSeparatorType m_TableType
Tells if table fields are delimited by characters or are fixed width.
void ExportTableParms(CUser_object &user_object)
Export table load parameters in ASN user-data format.
string GetField(size_t row, size_t col) const
return a specific field from a specific row, based on current table type and delimiter
EFieldSeparatorType
Possible delimiter options.
void SaveTable(CNcbiOfstream &ofs)
Save possible edited table (edits possible w/merge-split cols and row edits)
ETableFileType m_FileType
The underlying file type (or undefined)
wxString m_FileName
Name of file from which table was loaded.
bool x_PickFileType()
After loading rows, this tests for any distinctive file types (which would allow delimiter and header...
size_t m_MaxRowLen
Maximum row length - useful for displaying data in single-column mode.
void RecomputeRowFields(size_t row_idx)
Do same but only for 1 row.
void ImportTableParms(CUser_object &user_object)
Import table load parameters in ASN user-data format.
size_t m_MaxNonImportedRowLength
Rows that are not imported are displayed differently so it's helpful to know their maximum width (in ...
void RecomputeHeaders()
Update columns to generated names or names parsed from row m_ColumnHeaderRow.
void ClearTable()
Clears all columns, rows, and delimiters.
void x_RecomputeRowFields(CTableImportRow &row, const CTempString &delims_ts, const CTempString &delims_quote_ts, NStr::EMergeDelims merge_delims, bool multiple_spaces_only)
Recompute the fields for the provided row.
void ConvertToSeqAnnot(CRef< CSeq_annot > annot_container)
Save data in table into annot_container.
void SetFirstImportRow(int r)
set/get first row for import (0-based)
bool ReplaceSpaces()
Replace all instances of multiple spaces with the current delimiter.
bool SplitColumn(size_t col_idx, char ch, bool split_on_whitespace=false)
Split column col_idx into 2 columns using the character 'ch' (if 'ch' does not appear in the column,...
bool m_UseCurrentDelimiters
If true we should use pre-determined delimiters when loading.
static void x_ParseEntry(const CTempString &str, const CTempString &delim, NStr::EMergeDelims merge, bool multiple_spaces_only, vector< pair< size_t, size_t > > &token_pos)
Parse fields from 'str' returning position and lengths, respectively, of parsed fields in token_pos.
string m_TableEntry
String content of this row in the table.
vector< pair< size_t, size_t > > m_Fields
Each entry represents a field as a start/length pair.
string GetField(int column_idx) const
Get a specific field or "" if column_idx >= m_Fields.size()
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
const CUser_field & GetField(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Access a named field in this user field.
CUser_field & AddField(const string &label, int value)
add fields to the current user field
bool HasField(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Verify that a named field exists.
bool HasField(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Verify that a named field exists.
CUser_object & AddField(const string &label, const string &value, EParseField parse=eParse_String)
add a data field to the user object that holds a given value
CUser_field & SetField(const string &str, const string &delim=".", const string &obj_subtype=kEmptyStr, NStr::ECase use_case=NStr::eCase)
Access a named field in this user object.
const CUser_field & GetField(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Access a named field in this user object.
Interface for testing cancellation request in a long lasting operation.
const_iterator begin() const
const_iterator end() const
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
static const char * str(char *buf, int n)
static const char * column
static const column_t columns[]
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
void Error(CExceptionArgs_Base &args)
string ReportThis(TDiagPostFlags flags=eDPF_Exception) const
Report this exception only.
void Info(CExceptionArgs_Base &args)
Int8 GetLength(void) const
Get size of file.
bool IsFile(EFollowLinks follow=eFollowLinks) const
Check whether a directory entry is a file.
void Reset(void)
Reset reference object.
TObjectType & GetObject(void)
Get object.
int64_t Int8
8-byte (64-bit) signed integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
virtual bool IsCanceled(void) const =0
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
CNcbiIstream & NcbiGetline(CNcbiIstream &is, string &str, char delim, string::size_type *count=NULL)
Read from "is" to "str" up to the delimiter symbol "delim" (or EOF)
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
EMergeDelims
Whether to merge adjacent delimiters.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
static double StringToDouble(const CTempStringEx str, TStringToNumFlags flags=0)
Convert string to double.
static Uint8 StringToUInt8_DataSize(const CTempString str, TStringToNumFlags flags=0)
Convert string that can contain "software" qualifiers to Uint8.
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string.
static string & ToLower(string &str)
Convert string to lower case – string& version.
static const size_type npos
@ fConvErr_NoThrow
Do not throw an exception on error.
@ fAllowCommas
Allow commas. See 'ENumToStringFlags::fWithCommas'.
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
static const char label[]
const TData & GetData(void) const
Get the Data member data.
TBool GetBool(void) const
Get the variant data.
bool IsInt(void) const
Check if variant Int is selected.
TData & SetData(void)
Assign a value to Data data member.
void SetLabel(TLabel &value)
Assign a value to Label data member.
TInt GetInt(void) const
Get the variant data.
TStr & SetStr(void)
Select the variant.
bool IsInts(void) const
Check if variant Ints is selected.
void SetType(TType &value)
Assign a value to Type data member.
void SetData(TData &value)
Assign a value to Data data member.
bool IsBool(void) const
Check if variant Bool is selected.
const TInts & GetInts(void) const
Get the variant data.
EField_id
identification of the column data in the objects described by the table known column data types posit...
void SetTitle(const TTitle &value)
Assign a value to Title data member.
void SetField_id(TField_id value)
Assign a value to Field_id data member.
@ eField_id_location_id
location Seq-id
@ eField_id_location_strand
location strand
@ eField_id_id_local
main feature fields id.local.id
@ eField_id_location_to
interval to
@ eField_id_location_gi
gi
@ eField_id_location_from
interval from
@ e_Real
a set of reals, one per row
@ e_String
a set of strings, one per row
@ e_Int
a set of 4-byte integers, one per row
@ e_not_set
No variant selected.
TGi GetGi(void) const
Get the variant data.
void SetData(TData &value)
Assign a value to Data data member.
void SetDesc(TDesc &value)
Assign a value to Desc data member.
unsigned int
A callback function used to compare two keys in a database.
<!DOCTYPE HTML >< html > n< header > n< title > PubSeq Gateway Help Page</title > n< style > n table
constexpr auto sort(_Init &&init)
constexpr bool empty(list< Ts... >) noexcept
double value_type
The numeric datatype used by the parser.
const struct ncbi::grid::netcache::search::fields::SIZE size
static const BitmapCharRec *const chars[]
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
NCBI C++ stream class wrappers for triggering between "new" and "old" C++ stream libraries.
Int4 delta(size_t dimension_, const Int4 *score_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
double f(double x_, const double &y_)
static const char delimiter[]
#define row(bind, expected)
Holds all scoring parameters for a given character.
float m_FrequencyScore
Scores based on mean number of occurrences per row (small #'s are bad)
float m_OccuranceScore
Reflects consistency in the number of times the character appears in each row.
Hold properties for a single character or set of characters that are candidates for merging,...