NCBI C++ ToolKit
seqtable_util.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seqtable_util.cpp 47464 2023-04-20 00:19:10Z evgeniev $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Colleen Bollin
27  */
28 
29 
30 #include <ncbi_pch.hpp>
31 
35 
36 
39 
40 // note: rows must have been sorted prior to calling
41 void RemoveRowsFromColumn(CRef<CSeqTable_column> col, const vector<size_t>& rows)
42 {
43  if (rows.size() == 0) {
44  return;
45  }
46  vector<size_t>::const_iterator row_it = rows.begin();
47 
48  if (col->GetData().IsString()) {
49  size_t orig_pos = 0;
50  CSeqTable_column::TData::TString::iterator it = col->SetData().SetString().begin();
51  while (row_it != rows.end() && it != col->SetData().SetString().end()) {
52  if (orig_pos == *row_it) {
53  it = col->SetData().SetString().erase(it);
54  row_it++;
55  } else {
56  it++;
57  }
58  orig_pos++;
59  }
60  } else if (col->GetData().IsId()) {
61  size_t orig_pos = 0;
62  CSeqTable_column::TData::TId::iterator it = col->SetData().SetId().begin();
63  while (row_it != rows.end() && it != col->SetData().SetId().end()) {
64  if (orig_pos == *row_it) {
65  it = col->SetData().SetId().erase(it);
66  row_it++;
67  } else {
68  it++;
69  }
70  orig_pos++;
71  }
72  } else if (col->GetData().IsInt()) {
73  size_t orig_pos = 0;
74  CSeqTable_column::TData::TInt::iterator it = col->SetData().SetInt().begin();
75  while (row_it != rows.end() && it != col->SetData().SetInt().end()) {
76  if (orig_pos == *row_it) {
77  it = col->SetData().SetInt().erase(it);
78  row_it++;
79  } else {
80  it++;
81  }
82  orig_pos++;
83  }
84  } else if (col->GetData().IsBit()) {
85  size_t orig_pos = 0;
86  CSeqTable_column::TData::TBit::iterator it = col->SetData().SetBit().begin();
87  while (row_it != rows.end() && it != col->SetData().SetBit().end()) {
88  if (orig_pos == *row_it) {
89  it = col->SetData().SetBit().erase(it);
90  row_it++;
91  } else {
92  it++;
93  }
94  orig_pos++;
95  }
96  } else if (col->GetData().IsBytes()) {
97  size_t orig_pos = 0;
98  CSeqTable_column::TData::TBytes::iterator it = col->SetData().SetBytes().begin();
99  while (row_it != rows.end() && it != col->SetData().SetBytes().end()) {
100  if (orig_pos == *row_it) {
101  it = col->SetData().SetBytes().erase(it);
102  row_it++;
103  } else {
104  it++;
105  }
106  orig_pos++;
107  }
108  } else if (col->GetData().IsReal()) {
109  size_t orig_pos = 0;
110  CSeqTable_column::TData::TReal::iterator it = col->SetData().SetReal().begin();
111  while (row_it != rows.end() && it != col->SetData().SetReal().end()) {
112  if (orig_pos == *row_it) {
113  it = col->SetData().SetReal().erase(it);
114  row_it++;
115  } else {
116  it++;
117  }
118  orig_pos++;
119  }
120  }
121 }
122 
123 
124 // note: rows must have been sorted prior to calling
125 void RemoveRowsFromTable(CRef<CSeq_table> table, const vector<size_t>& rows)
126 {
127  if (rows.size() > 0) {
128  NON_CONST_ITERATE(CSeq_table::TColumns, it, table->SetColumns()) {
129  RemoveRowsFromColumn (*it, rows);
130  }
131  table->SetNum_rows(table->GetNum_rows() - static_cast<int>(rows.size()));
132  }
133 }
134 
135 
137  const string& field,
138  CRef<edit::CStringConstraint> string_constraint)
139 {
141  if (!column) {
142  return;
143  }
144  vector<size_t> rows_to_remove;
145  if (column->GetData().IsInt()) {
146  CSeqTable_column::TData::TInt::const_iterator it = column->GetData().GetInt().begin();
147  size_t row = 0;
148  while (it != column->GetData().GetInt().end()) {
149  string num = NStr::NumericToString(*it);
150  if (!string_constraint->DoesTextMatch(num)) {
151  rows_to_remove.push_back(row);
152  }
153  row++;
154  it++;
155  }
156  } else if (column->GetData().IsString()) {
157  CSeqTable_column::TData::TString::const_iterator it = column->GetData().GetString().begin();
158  size_t row = 0;
159  while (it != column->GetData().GetString().end()) {
160  if (!string_constraint->DoesTextMatch(*it)) {
161  rows_to_remove.push_back(row);
162  }
163  row++;
164  it++;
165  }
166  } else if (column->GetData().IsId()) {
167  CSeqTable_column::TData::TId::const_iterator it = column->GetData().GetId().begin();
168  size_t row = 0;
169  while (it != column->GetData().GetId().end()) {
170  string id_string = (*it)->AsFastaString();
171  if (!string_constraint->DoesTextMatch(id_string)) {
172  rows_to_remove.push_back(row);
173  }
174  row++;
175  it++;
176  }
177  }
178 
179  RemoveRowsFromTable(table, rows_to_remove);
180 }
181 
182 
183 void ApplyToTable(CRef<CSeq_table> table, const string& field, const string& val, edit::EExistingText existing_text)
184 {
186  if (!column) {
187  column = new objects::CSeqTable_column();
188  column->SetHeader().SetTitle(field);
189  table->SetColumns().push_back(column);
190  }
191  for (int row = 0; row < table->GetNum_rows(); row++) {
192  AddValueToColumn(column, val, row, existing_text);
193  }
194 }
195 
196 
197 void RemoveFromTable (CRef<CSeq_table> table, const string& field)
198 {
200  if (column) {
201  for (int row = 0; row < table->GetNum_rows(); row++) {
203  }
204  }
205 }
206 
207 
208 void ConvertTableColumns(CRef<CSeq_table> table, const string& field1, const string& field2, edit::EExistingText existing_text)
209 {
211  if (!column1) {
212  return;
213  }
215  if (!column2) {
216  column2 = new objects::CSeqTable_column();
217  column2->SetHeader().SetTitle(field2);
218  table->SetColumns().push_back(column2);
219  }
220  for (size_t row = 0; row < column1->GetData().GetString().size(); row++) {
221  string orig = column1->GetData().GetString()[row];
222  if (!NStr::IsBlank(orig)) {
223  AddValueToColumn (column2, orig, row, existing_text);
224  }
225  column1->SetData().SetString()[row] = "";
226  }
227 }
228 
229 
230 void SwapTableColumns (CRef<CSeq_table> table, const string& field1, const string& field2)
231 {
234  if (!column1 && !column2) {
235  return;
236  }
237  if (!column1) {
238  column1 = new objects::CSeqTable_column();
239  column1->SetHeader().SetTitle(field1);
240  column1->SetData().SetString();
241  table->SetColumns().push_back(column1);
242  }
243  if (!column2) {
244  column2 = new objects::CSeqTable_column();
245  column2->SetHeader().SetTitle(field2);
246  column2->SetData().SetString();
247  table->SetColumns().push_back(column2);
248  }
249  for (size_t row = 0; row < column1->GetData().GetString().size(); row++) {
250  string orig1 = column1->GetData().GetString()[row];
251  string orig2 = "";
252  if (row < column2->GetData().GetString().size()) {
253  orig2 = column2->GetData().GetString()[row];
254  }
255  column1->SetData().SetString()[row] = orig2;
256  column2->SetData().SetString()[row] = orig1;
257  }
258 }
259 
260 
261 void CopyTableColumns(CRef<CSeq_table> table, const string& field1, const string& field2, edit::EExistingText existing_text)
262 {
264  if (!column1) {
265  return;
266  }
268  if (!column2) {
269  column2 = new objects::CSeqTable_column();
270  column2->SetHeader().SetTitle(field2);
271  table->SetColumns().push_back(column2);
272  }
273  for (size_t row = 0; row < column1->GetData().GetString().size(); row++) {
274  string orig = column1->GetData().GetString()[row];
275  if (!NStr::IsBlank(orig)) {
276  AddValueToColumn (column2, orig, row, existing_text);
277  }
278  }
279 }
280 
281 
283 {
284  ITERATE (objects::CSeq_table::TColumns, cit, values_table->GetColumns()) {
285  if ((*cit)->IsSetHeader() && (*cit)->GetHeader().IsSetTitle()
286  && MatchColumnName ((*cit)->GetHeader().GetTitle(), column_name)) {
287  return *cit;
288  }
289  }
291  return empty;
292 }
293 
294 
296 {
297  bool found = false;
298  objects::CSeq_table::TColumns::iterator cit = table->SetColumns().begin();
299  while (cit != table->SetColumns().end()) {
300  if ((*cit)->IsSetHeader() && (*cit)->GetHeader().IsSetTitle()
301  && MatchColumnName ((*cit)->GetHeader().GetTitle(), column_name)) {
302  cit = table->SetColumns().erase(cit);
303  found = true;
304  } else {
305  ++cit;
306  }
307  }
308  return found;
309 }
310 
311 
313 {
315  if (!col) {
316  col = new objects::CSeqTable_column();
317  col->SetHeader().SetTitle(label);
318  col->SetData().SetString();
319  table->SetColumns().push_back(col);
320  }
321  return col;
322 }
323 
324 
326 {
328  if (!col) {
329  col = new objects::CSeqTable_column();
330  col->SetHeader().SetTitle(label);
331  col->SetData().SetInt();
332  table->SetColumns().push_back(col);
333  }
334  return col;
335 }
336 
337 
339 {
340  while (column->SetData().SetString().size() < row + 1) {
341  column->SetData().SetString().push_back ("");
342  }
343 
344  string orig_val = column->GetData().GetString()[row];
345  edit::AddValueToString(orig_val, value, existing_text);
346 
347  column->SetData().SetString()[row] = orig_val;
348 }
349 
350 
351 void AddValueToTable(CRef<objects::CSeq_table> table, string subtype_name, string value, size_t row, edit::EExistingText existing_text)
352 {
353  // do we already have a column for this subtype?
354  bool found = false;
355  NON_CONST_ITERATE (objects::CSeq_table::TColumns, cit, table->SetColumns()) {
356  if ((*cit)->IsSetHeader() && (*cit)->GetHeader().IsSetTitle()
357  && NStr::EqualNocase((*cit)->GetHeader().GetTitle(), subtype_name)) {
358  AddValueToColumn((*cit), value, row, existing_text);
359  found = true;
360  break;
361  }
362  }
363  if (!found) {
364  CRef<objects::CSeqTable_column> new_col(new objects::CSeqTable_column());
365  new_col->SetHeader().SetTitle(subtype_name);
366  while (new_col->SetData().SetString().size() < row) {
367  new_col->SetData().SetString().push_back ("");
368  }
369  new_col->SetData().SetString().push_back(value);
370  table->SetColumns().push_back(new_col);
371  }
372 }
373 
374 
375 void AddValueToTable (CRef<objects::CSeq_table> table, string subtype_name, int value, size_t row)
376 {
377  // do we already have a column for this subtype?
378  bool found = false;
379  NON_CONST_ITERATE (objects::CSeq_table::TColumns, cit, table->SetColumns()) {
380  if ((*cit)->IsSetHeader() && (*cit)->GetHeader().IsSetTitle()
381  && NStr::EqualNocase((*cit)->GetHeader().GetTitle(), subtype_name)) {
382  while ((*cit)->SetData().SetInt().size() < row + 1) {
383  (*cit)->SetData().SetInt().push_back (0);
384  }
385  (*cit)->SetData().SetInt()[row] = value;
386  found = true;
387  break;
388  }
389  }
390  if (!found) {
391  CRef<objects::CSeqTable_column> new_col(new objects::CSeqTable_column());
392  new_col->SetHeader().SetTitle(subtype_name);
393  while (new_col->SetData().SetInt().size() < row) {
394  new_col->SetData().SetInt().push_back (0);
395  }
396  new_col->SetData().SetInt().push_back(value);
397  table->SetColumns().push_back(new_col);
398  }
399 }
400 
401 
403 {
404  if (!col || !col->IsSetData() || !col->GetData().IsString() || col->GetData().GetSize() < 1) {
405  return;
406  }
407 
408  size_t num_rows = col->GetData().GetSize();
409  for (size_t row = 0; row < num_rows; row++) {
410  col->SetData().SetString()[row] = val;
411  }
412 }
413 
414 
415 bool QualifierNamesAreEquivalent (string name1, string name2)
416 {
417  // ignore protein at beginning
418  const string protein("protein");
419  if (NStr::StartsWith(name1, protein)) {
420  name1 = name1.substr(protein.length());
421  }
422  if (NStr::StartsWith(name2, protein)) {
423  name2 = name2.substr(protein.length());
424  }
425  if ((NStr::EqualNocase(name1, "authors") && NStr::EqualNocase(name2, "author name list")) ||
426  (NStr::EqualNocase(name2, "authors") && NStr::EqualNocase(name1, "author name list")))
427  return true;
428 
429  // spaces, dashes, and underscores do not count
430  NStr::ReplaceInPlace (name1, " ", "");
431  NStr::ReplaceInPlace (name1, "_", "");
432  NStr::ReplaceInPlace (name1, "-", "");
433  NStr::ReplaceInPlace (name2, " ", "");
434  NStr::ReplaceInPlace (name2, "_", "");
435  NStr::ReplaceInPlace (name2, "-", "");
436 
437  return NStr::EqualNocase(name1, name2);
438 }
439 
440 
441 bool MatchColumnName (string name1, string name2)
442 {
443  size_t pos = NStr::Find(name1, "\n");
444  if (pos != string::npos) {
445  name1 = name1.substr(0, pos);
446  }
447  pos = NStr::Find(name2, "\n");
448  if (pos != string::npos) {
449  name2 = name2.substr(0, pos);
450  }
451  if (QualifierNamesAreEquivalent (name1, name2)) {
452  return true;
453  } else if (IsOrgColumnName(name1) && IsOrgColumnName(name2)) {
454  return true;
455  } else {
456  return false;
457  }
458 }
459 
460 
461 bool IsOrgColumnName (string name)
462 {
463  if (NStr::EqualNocase (name, "Organism Name")
464  || NStr::EqualNocase (name, "org")
465  || NStr::EqualNocase (name, "organism")
466  || NStr::EqualNocase (name, "taxname")) {
467  return true;
468  } else {
469  return false;
470  }
471 }
472 
473 
474 bool IsSubSourceNoteName (const string& name)
475 {
476  if (NStr::EqualNocase(name, "note-subsource")
477  || NStr::EqualNocase(name, "subsource-note")
478  || NStr::EqualNocase(name, "subsrc-note")
479  || NStr::EqualNocase(name, "note-subsrc")) {
480  return true;
481  } else {
482  return false;
483  }
484 }
485 
486 
487 bool IsOrgModNoteName (const string& name)
488 {
489  if (NStr::EqualNocase(name, "note-orgmod")
490  || NStr::EqualNocase(name, "orgmod-note")) {
491  return true;
492  } else {
493  return false;
494  }
495 }
496 
497 
498 // CountColumnValueConflicts
499 // For two string columns, counts the number of rows for which both columns have a non-blank
500 // value that does not match
501 // For two int columns, counts the number of rows for which the values do not match
502 // For any other type of column, returns -1 (error, cannot compare)
503 
505 {
506  int num_conflicts = 0;
507 
508  if (!orig_col || !orig_col->IsSetData() || !new_col || !new_col->IsSetData()) {
509  return 0;
510  }
511  if (orig_col->GetData().IsString() && new_col->GetData().IsString()) {
512  CSeqTable_column::TData::TString::const_iterator it1 = orig_col->GetData().GetString().begin();
513  CSeqTable_column::TData::TString::const_iterator it2 = new_col->GetData().GetString().begin();
514  while (it1 != orig_col->GetData().GetString().end()
515  && it2 != new_col->GetData().GetString().end()) {
516  if (NStr::IsBlank(*it1) || NStr::IsBlank (*it2)) {
517  // ok, ignore
518  } else if (NStr::Equal (*it1, *it2)) {
519  // no change
520  } else {
521  num_conflicts++;
522  }
523  ++it1;
524  ++it2;
525  }
526  } else if (orig_col->GetData().IsInt() && new_col->GetData().IsInt()) {
527  CSeqTable_column::TData::TInt::const_iterator it1 = new_col->GetData().GetInt().begin();
528  CSeqTable_column::TData::TInt::const_iterator it2 = new_col->GetData().GetInt().begin();
529  while (it1 != orig_col->GetData().GetInt().end()
530  && it2 != new_col->GetData().GetInt().end()) {
531  if (*it1 != *it2) {
532  num_conflicts++;
533  }
534  ++it1;
535  ++it2;
536  }
537  } else {
538  return -1;
539  }
540  return num_conflicts;
541 }
542 
543 
545 {
546  if (!values_table || !values_table->IsSetNum_rows()) return;
547 
548  size_t num_rows = values_table->GetNum_rows();
549 
550  NON_CONST_ITERATE (objects::CSeq_table::TColumns, it, values_table->SetColumns()) {
551  if ((*it)->IsSetData() && (*it)->GetData().IsString()) {
552  while ((*it)->GetData().GetString().size() < num_rows) {
553  (*it)->SetData().SetString().push_back("");
554  }
555  }
556  }
557 }
558 
559 
560 
static const char * column
Definition: stats.c:23
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2882
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5406
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5347
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5378
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3396
static const char label[]
bool IsBit(void) const
Check if variant Bit is selected.
void SetHeader(THeader &value)
Assign a value to Header data member.
bool IsId(void) const
Check if variant Id is selected.
bool IsString(void) const
Check if variant String is selected.
bool IsReal(void) const
Check if variant Real is selected.
vector< CRef< CSeqTable_column > > TColumns
Definition: Seq_table_.hpp:92
bool IsInt(void) const
Check if variant Int is selected.
bool IsBytes(void) const
Check if variant Bytes is selected.
void SetData(TData &value)
Assign a value to Data data member.
bool IsSetData(void) const
row data Check if a value has been assigned to Data data member.
const TInt & GetInt(void) const
Get the variant data.
const TString & GetString(void) const
Get the variant data.
const TData & GetData(void) const
Get the Data member data.
where boath are integers</td > n< td ></td > n</tr > n< tr > n< td > tse</td > n< td > optional</td > n< td > String</td > n< td class=\"description\"> TSE option controls what blob is orig
<!DOCTYPE HTML >< html > n< header > n< title > PubSeq Gateway Help Page</title > n< style > n table
constexpr bool empty(list< Ts... >) noexcept
const struct ncbi::grid::netcache::search::fields::SIZE size
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
void ApplyToTable(CRef< CSeq_table > table, const string &field, const string &val, edit::EExistingText existing_text)
bool MatchColumnName(string name1, string name2)
void RemoveFromTable(CRef< CSeq_table > table, const string &field)
void AddValueToColumn(CRef< objects::CSeqTable_column > column, string value, size_t row, edit::EExistingText existing_text)
void CopyTableColumns(CRef< CSeq_table > table, const string &field1, const string &field2, edit::EExistingText existing_text)
CRef< objects::CSeqTable_column > AddIntColumnToTable(CRef< objects::CSeq_table > table, string label)
CRef< objects::CSeqTable_column > FindSeqTableColumnByName(CRef< objects::CSeq_table > values_table, string column_name)
void RemoveTableRowsThatDoNotMatchStringConstraint(CRef< CSeq_table > table, const string &field, CRef< edit::CStringConstraint > string_constraint)
bool RemoveSeqTableColumnByName(CRef< objects::CSeq_table > table, string column_name)
USING_SCOPE(ncbi::objects)
void AddValueToTable(CRef< objects::CSeq_table > table, string subtype_name, string value, size_t row, edit::EExistingText existing_text)
void ConvertTableColumns(CRef< CSeq_table > table, const string &field1, const string &field2, edit::EExistingText existing_text)
bool IsOrgColumnName(string name)
CRef< objects::CSeqTable_column > AddStringColumnToTable(CRef< objects::CSeq_table > table, string label)
void SwapTableColumns(CRef< CSeq_table > table, const string &field1, const string &field2)
bool QualifierNamesAreEquivalent(string name1, string name2)
void RemoveRowsFromTable(CRef< CSeq_table > table, const vector< size_t > &rows)
void RemoveRowsFromColumn(CRef< CSeqTable_column > col, const vector< size_t > &rows)
void FillShortColumns(CRef< objects::CSeq_table > values_table)
void SetColumnValue(CRef< objects::CSeqTable_column > col, string val)
int CountColumnValueConflicts(CRef< CSeqTable_column > orig_col, CRef< CSeqTable_column > new_col)
bool IsOrgModNoteName(const string &name)
bool IsSubSourceNoteName(const string &name)
#define row(bind, expected)
Definition: string_bind.c:73
EExistingText
bool AddValueToString(string &str, const string &value, EExistingText existing_text)
Add text to an existing string, using the "existing_text" directive to combine new text with existing...
Modified on Fri Sep 20 14:57:43 2024 by modify_doxy.py rev. 669887