NCBI C++ ToolKit
table_column_type_guesser.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: table_column_type_guesser.cpp 47485 2023-05-02 14:46:59Z ucko $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Bob Falk
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 
35 
37 #include <misc/hgvs/sequtils.hpp>
38 
41 
42 #include <math.h>
43 
44 
47 
49  : m_ImportedTableData(ds)
50 {
52  m_Scope->AddDefaults();
53 
54 }
55 
57 {
59  return;
60 
61  // Guess types of individual columns. The first column is the line number, so we skip it.
62  for (size_t i=1; i<m_ImportedTableData->GetColumns().size(); ++i) {
63  GuessColumn(i);
64  }
65 
66  // Given individual column types, do guessing based on combinations
67  // of columns. At this point, the only combination we are looking
68  // for is start/stop following an ID column.
69  for (size_t i=1; i<m_ImportedTableData->GetColumns().size(); ++i) {
71 
72  // If the current column is a seq-id and the next two are integers,
73  // then assume they are start and stop (we could also require that the
74  // numbers in the second column be >= the numbers in the first column)
75  if (m_ImportedTableData->GetColumns().size() >= i+3 &&
78 
79  m_ImportedTableData->GetColumns()[i+1].SetDataType(
81  m_ImportedTableData->GetColumns()[i+2].SetDataType(
83  }
84  }
85  }
86 
87 }
88 
90 {
91  size_t row = 0;
92 
93  // Examine rows and for each row guess its type and add that result
94  // to the entry in these arrays for type info. (one array for basic type,
95  // int, string, etc and one for semantic type, e.g. strand, start position)
96  vector<size_t> type_matches((int)CTableImportColumn::eTypeUndefined, 0);
97  vector<size_t> property_matches((int)CTableImportColumn::eLastType, 0);
98 
101 
102  // Stop checking after 300 rows (that should be enough to pick a type)
103  size_t check_count = std::min(m_ImportedTableData->GetNumRows(), (size_t)300);
104  for (row=0; row<check_count; ++row) {
105  // GetRowNum() == -1 for header rows
106  if (m_ImportedTableData->GetRow(row).GetRowNum() != -1) {
107 
108  // The first column for display purposes is the row number which
109  // is not parsed from the data. This returns a parsed field so
110  // we subtract 1 to get the right field.
111  string field_value = m_ImportedTableData->GetRow(static_cast<int>(row)).GetField(static_cast<int>(col_num-1));
112 
113  NStr::TruncateSpacesInPlace(field_value);
114 
115  ct = x_GuessType(field_value, dt);
116 
118  type_matches[ct] += 1;
119 
121  property_matches[dt] += 1;
122  }
123  }
124 
125  size_t max_matches = 0;
126 
127  // If all fields were blank, unspecified text will be the result
130 
131  bool has_real = false;
132  if (type_matches[(size_t)CTableImportColumn::eRealNumberColumn] > 0)
133  has_real = true;
134 
135  for (size_t j=0; j<type_matches.size(); ++j) {
136  if (type_matches[j] > max_matches) {
137  max_matches = type_matches[j];
139  }
140  }
141 
142  // if a column has mostly integers but also 1 or more real numbers
143  // then we make the column type real (rather than the 'most popular'
144  // type of int)
145  if (ct == CTableImportColumn::eNumberColumn && has_real)
147 
148  m_ImportedTableData->GetColumns()[col_num].SetType(ct);
149 
150 
151  max_matches = 0;
152  for (size_t j=0; j<property_matches.size(); ++j) {
153  if (property_matches[j] > max_matches) {
154  max_matches = property_matches[j];
156  }
157  }
158 
159  m_ImportedTableData->GetColumns()[col_num].SetDataType(dt);
160 
161  // Can use imported column headers, if any, to override the guess
162  // made on data analysis alone. For now, we are only checking for
163  // 'chromosome' since it can be missed if a column is just numbers.
165  string colname = m_ImportedTableData->GetColumnName(col_num);
166  NStr::ToLower(colname);
167 
168  if (colname == "chromosome" &&
170  m_ImportedTableData->GetColumns()[col_num].
171  SetDataType(CTableImportColumn::eChromosome);
172  m_ImportedTableData->GetColumns()[col_num].
174  }
175  }
176 }
177 
179  const string& field,
181 {
183 
184  // If blank, return undefined so that this result doesn't affect the
185  // final choice for a field. If all entries are blank though,
186  // the type should be text.
187  if (field.length() == 0)
189 
190  if ( (field[0] == '\'' && field[field.length()-1] == '\'') ||
191  (field[0] == '\"' && field[field.length()-1] == '\"') ) {
194  }
195 
196 
197  bool is_int;
198  try {
199  // Test for an integer. The StringToUInt8_DataSize test only looks for
200  // positive (unsigned) integers, but we want to allow negative ones as well.
201  string test_field = field;
202  if (test_field.length() > 1 && test_field[0] == '-') {
203  test_field[0] = ' ';
204  NStr::TruncateSpacesInPlace(test_field);
205  }
207  is_int = true;
208  }
209  catch ( CStringException&) {
210  is_int = false;
211  }
212 
213  // Is it a chromosome? Strictly, an assembly should be needed to decide this...
214  // The largest known number of chromosomes is 1260 (adder's tongue).
215  // Maybe we could look this up or call a library function to check this?
216 
217  // M is for mitochondrial (MT)
218  size_t first_chr = field.find_first_of("01234567890xymXYM", 0);
219 
220  // Check for a chromosome of the form "ch##", "chr##", etc.
221  if (first_chr != string::npos && first_chr >=2) {
222  string prefix = field.substr(0, first_chr);
223  string suffix = field.substr(first_chr, field.length()-first_chr);
224 
225  int cnum = 0;
226  try {
227  cnum = NStr::StringToInt(suffix);
228  }
229  // Not a number - check to see if it's of the form chrX/Y/x/y/MT/mt:
230  catch (CStringException&) {
232 
233  if ((suffix == "X" || suffix == "Y" || suffix == "mt" || suffix == "MT") &&
234  (!NStr::CompareNocase(prefix, "chr") ||
235  !NStr::CompareNocase(prefix, "ch") ||
236  !NStr::CompareNocase(prefix, "ch#") ||
237  !NStr::CompareNocase(prefix, "chr#"))) {
238 
241  }
242  }
243 
244  if ((cnum > 0 && cnum < 1261) &&
245  (!NStr::CompareNocase(prefix, "chr") ||
246  !NStr::CompareNocase(prefix, "ch") ||
247  !NStr::CompareNocase(prefix, "ch#") ||
248  !NStr::CompareNocase(prefix, "chr#"))) {
249 
252  }
253  }
254  /// Chromosomes are also the best guess if the field is one of {x,y,X,Y, mt, MT}
255  string field_upper = field;
256  NStr::ToUpper(field_upper);
257  if (field_upper == "X" || field_upper == "Y" || field_upper == "MT") {
260  }
261 
262  ////////////////////////////////////////////////
263  /// Check if it is an ID
264  ////////////////////////////////////////////////
265 
266  // First check for special case of GI|xxxxxxxx or lcl|xxxxxxxxxx
267  // allow pipe | or colon : to be used as a separator
268  // CSeq_id constructor seems to recognize '|' but not ':'
269  // so we do a check first here
270  size_t gi_separator = field.find_first_of("|:", 0);
271  if (gi_separator != string::npos &&
272  gi_separator != field.length()-1) {
273  string prefix = field.substr(0, gi_separator);
275 
276  string ginum = field.substr(gi_separator+1, field.length()-gi_separator);
277 
278  // For this to be a valid ID, ginum has to be an integer (it probably
279  // also needs to be a certain character length...)
280  bool ginum_int;
281  try {
283  ginum_int = true;
284  }
285  catch ( CStringException&) {
286  ginum_int = false;
287  }
288 
289  if (ginum_int) {
290  // GI
291  if (prefix == "GI") {
293  }
294  // Local ID
295  else {
297  }
298 
300  }
301  }
302 
303  if (is_int) {
306  }
307 
308  // Check for RSID (snp id): RSnnnnn
309  if (field.length() > 2) {
310  string rsstr = field.substr(0,2);
311  rsstr = NStr::ToUpper(rsstr);
312  if (rsstr == "RS") {
313  // Is remainder of id an integer?
314  string rsint = field.substr(2, field.length()-2);
315 
316  try {
317  // Test for an integer. The StringToUInt8_DataSize test only looks for
318  // positive integers, and that's ok since there should be no negative numbers here.
320 
323  }
324  catch ( CStringException&) {
325  }
326  }
327  }
328 
329 
330  // Check if it is an ID and, if so, what type of id. Default is
331  // eUnspecifiedID if we can't match it to anything else.
332  bool is_id = false;
334 
335  {
336  try {
337  CSeq_id seqid(field);
338 
339  // if no exception, it's some kind of id:
340  is_id = true;
341  CSeq_id_Base::E_Choice w = seqid.Which();
342 
343  // In case previous code missed a GI
344  if (w == CSeq_id::e_Gi) {
346  }
347  else if (seqid.IsLocal() || seqid.IdentifyAccession() == CSeq_id::eAcc_local) {
348  id_type = CTableImportColumn::eLocalID;
349  }
350  else {
351  // GetSequenceType to check if it is some kind of accession
352  Uint4 type_flags = GetSequenceType(m_Scope->GetBioseqHandle(seqid));
355  }
356  }
357 
358  //_TRACE("type?: " << CTableImportColumn::GetStringFromDataType(id_type) <<
359  //" Which: " << w);
360  }
361  catch (CException&) {
362  is_id = false;
363  }
364  }
365 
366  if (is_id) {
367  // It's an id of some sort:
368  dt = id_type;
370  }
371 
372  // Check for strand
373  if (field == "+" || field == "-" ||
374  !NStr::CompareNocase(field, "negative") ||
375  !NStr::CompareNocase(field, "positive") ||
376  !NStr::CompareNocase(field, "neg") ||
377  !NStr::CompareNocase(field, "pos")) {
378 
381  }
382 
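383  // Check for genotype (ACTG, or '-' when these are missing; no specific number of characters).
384  // Currently we do not consider '-', since if all rows had that it would be an undefined
385  // column, although it would be acceptable for a small % of rows to have dashes. NOTE(review): find_first_not_of() yields npos (non-zero) when every character is in "ACTG", but also a non-zero index for any mismatch after the first character, so the test below looks too permissive - confirm whether "== string::npos" was intended.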
386  if (field_upper.find_first_not_of("ACTG")) {
389  }
390 
391  bool is_float;
392  try {
393  NStr::StringToDouble(field);
394  is_float = true;
395  }
396  catch ( CStringException&) {
397  is_float = false;
398  }
399 
400  if (is_float) {
403  }
404 
407 }
408 
409 
CScope –.
Definition: scope.hpp:92
CStringException –.
Definition: ncbistr.hpp:4505
CRef< CTableImportDataSource > m_ImportedTableData
Data table to be rendered in the list.
CTableImportColumn::eColumnType x_GuessType(const string &field, CTableImportColumn::eDataType &dt)
Guess the type for a specific field (string)
CRef< objects::CScope > m_Scope
Scope is used in guessing what kind of ids we have.
void GuessColumns()
Update columns in data source with type info based on best-guesses.
CTableColumnTypeGuesser(CRef< CTableImportDataSource > ds)
ctor
void GuessColumn(size_t col_num)
Iterate over the data in a specific column (but not necessarily all rows) to guess type information.
eColumnType GetType() const
const CTableImportRow & GetRow(size_t row) const
Return a specific row.
vector< CTableImportColumn > & GetColumns()
return the array of column data
const CTableImportColumn & GetColumn(size_t col) const
Return the specified column.
size_t GetNumRows() const
return total number of rows read
string GetColumnName(size_t col) const
Return name of specified column.
string GetField(int column_idx) const
Get a specific field or "" if column_idx > m_Fields.size()
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
Definition: Seq_id.cpp:1634
@ eAcc_local
Definition: Seq_id.hpp:309
@ eAcc_unknown
Definition: Seq_id.hpp:294
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool IsNull(void) const THROWS_NONE
Check if pointer is null – same effect as Empty().
Definition: ncbiobj.hpp:735
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static double StringToDouble(const CTempStringEx str, TStringToNumFlags flags=0)
Convert string to double.
Definition: ncbistr.cpp:1387
static Uint8 StringToUInt8_DataSize(const CTempString str, TStringToNumFlags flags=0)
Convert string that can contain "software" qualifiers to Uint8.
Definition: ncbistr.cpp:1539
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3197
static string & ToUpper(string &str)
Convert string to upper case – string& version.
Definition: ncbistr.cpp:424
static string & ToLower(string &str)
Convert string to lower case – string& version.
Definition: ncbistr.cpp:405
@ fDS_ProhibitFractions
StringToUInt8_DataSize(): Ignore any fraction part of a value, "1.2K" ~ "1K".
Definition: ncbistr.hpp:307
E_Choice Which(void) const
Which variant is currently selected.
Definition: Seq_id_.hpp:746
E_Choice
Choice variants.
Definition: Seq_id_.hpp:93
bool IsLocal(void) const
Check if variant Local is selected.
Definition: Seq_id_.hpp:775
@ e_Gi
GenInfo Integrated Database.
Definition: Seq_id_.hpp:106
int i
const struct ncbi::grid::netcache::search::fields::SIZE size
T min(T x_, T y_)
The Object manager core.
static const char * suffix[]
Definition: pcregrep.c:408
static const char * prefix[]
Definition: pcregrep.c:405
type_flags
Definition: proto.h:431
Uint4 GetSequenceType(const CBioseq_Handle &bsh)
Return a (corrected) set of flags identifying the sequence type.
Definition: sequtils.cpp:42
USING_SCOPE(objects)
Modified on Wed Nov 29 02:24:09 2023 by modify_doxy.py rev. 669887