NCBI C++ ToolKit
cleanup_user_object.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: cleanup_user_object.cpp 97508 2022-07-25 12:54:07Z ludwigf $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Colleen Bollin
27  *
28  * File Description:
29  * Code for cleaning up user objects
30  *
31  */
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistd.hpp>
34 #include <serial/serialbase.hpp>
38 
40 #include <objmgr/util/sequence.hpp>
41 #include <objmgr/seqdesc_ci.hpp>
43 #include "cleanup_utils.hpp"
44 
45 
48 
49 
50 bool CCleanup::CleanupUserObject( CUser_object &user_object )
51 {
52  bool any_change = false;
53 
54  // clean type str
55  if( user_object.IsSetType() && user_object.GetType().IsStr() ) {
56  any_change |= CleanVisString (user_object.SetType().SetStr());
57  }
58 
59  // clean fields
60  if (user_object.IsSetData()) {
61  for (auto field : user_object.SetData()) {
62  any_change |= x_CleanupUserField(*field);
63  }
64  }
65 
66  any_change |= s_CleanupGeneOntology(user_object);
67  any_change |= s_CleanupStructuredComment(user_object);
68  any_change |= s_CleanupDBLink(user_object);
69 
70  return any_change;
71 }
72 
73 
// NOTE(review): the declaration line that should precede this brace is absent
// from this listing (the embedded numbering skips 74). Judging by the recursive
// call near the end of the body, this is presumably the body of
// x_CleanupUserField(CUser_field &field) -- confirm against the repository.
//
// Cleans one user field: the label string first, then the payload according to
// which data variant is selected. Returns the any_change flag, OR-ed from the
// result of every helper called.
75 {
76  bool any_change = false;
77 
// Label: only string labels are cleaned.
78  if (field.IsSetLabel() && field.GetLabel().IsStr()) {
79  any_change |= CleanVisString(field.SetLabel().SetStr());
80  }
81 
82  if (field.IsSetData()) {
// Bring the Num member in line with the payload before touching the payload.
83  any_change |= s_AddNumToUserField(field);
// NOTE(review): every `case` label of this switch is missing from the listing
// (embedded numbers 85, 89, 92, 97 and 105 are skipped). The variant handled by
// each arm can only be inferred from the accessor it uses (SetStr, SetObject,
// SetObjects, SetStrs, SetFields) -- confirm the labels against the repository.
84  switch (field.GetData().Which()) {
// Single string: compress spaces, then CleanVisString.
86  any_change |= Asn2gnbkCompressSpaces(field.SetData().SetStr());
87  any_change |= CleanVisString(field.SetData().SetStr());
88  break;
// Nested user object: recurse.
90  any_change |= CleanupUserObject(field.SetData().SetObject());
91  break;
// List of nested user objects: recurse into each.
93  for (auto sub_obj : field.SetData().SetObjects()) {
94  any_change |= CleanupUserObject(*sub_obj);
95  }
96  break;
// List of strings: same two-step cleaning as the single-string arm.
98  // NOTE: for some reason, using the auto range
99  // does not work here
100  for (auto str = field.SetData().SetStrs().begin(); str != field.SetData().SetStrs().end(); str++) {
101  any_change |= Asn2gnbkCompressSpaces(*str);
102  any_change |= CleanVisString(*str);
103  }
104  break;
// List of sub-fields: recurse into each. There is no `break` after this loop,
// so control falls into `default`, which only breaks.
106  for (auto sub_field : field.SetData().SetFields()) {
107  any_change |= x_CleanupUserField(*sub_field);
108  }
109  default:
110  break;
111  }
112  }
113  return any_change;
114 }
115 
116 
// NOTE(review): the declaration that opens this table is missing from the
// listing (embedded numbers 117-118 are skipped); presumably this is
// k_ontology_term_cleanup_map -- confirm against the repository.
// Each entry pairs a field label with a value prefix. The GeneOntology cleanup
// looks a label up in sc_OntologyTermCleanupMap and erases the paired prefix
// from the start of the field's string value.
119  {"go id", "GO:"},
120  {"go ref", "GO_REF:"}
121 };
124 
// NOTE(review): the declaration line that should precede this brace is absent
// from this listing (embedded number 125 is skipped); from the "GeneOntology"
// check below this is presumably the body of
// s_CleanupGeneOntology(CUser_object &obj) -- confirm against the repository.
//
// For a user object whose type string equals "GeneOntology", walks three levels
// of fields (qualifier field -> term -> inner field) and erases a known prefix
// from inner string values. Returns true if at least one value was shortened.
126 {
127  bool any_change = false;
128 
129  // nothing to do if not GeneOntology object
130  if (!obj.IsSetType() || !obj.GetType().IsStr() ||
131  !NStr::Equal(obj.GetType().GetStr(), "GeneOntology")) {
132  return any_change;
133  }
134  // nothing to do if no fields
135  if (!obj.IsSetData()) {
136  return any_change;
137  }
138 
// Labels of the top-level fields that are examined; all other top-level fields
// are left untouched. The set type is declared with PNocase_CStr (presumably a
// case-insensitive comparison -- confirm).
139  static const char * const sc_bsecGoQualType[] = {
140  "", "Component", "Function", "Process"
141  };
142  typedef CStaticArraySet<const char*, PNocase_CStr> TGoQualTypeSet;
143  DEFINE_STATIC_ARRAY_MAP( TGoQualTypeSet, sc_GoQualArray, sc_bsecGoQualType );
144 
145  for (auto outer_field : obj.SetData()) {
// Level 1: string-labelled field holding sub-fields, label found in the set.
146  if (outer_field->IsSetLabel() && outer_field->GetLabel().IsStr() &&
147  outer_field->IsSetData() && outer_field->GetData().IsFields()
148  && sc_GoQualArray.find(outer_field->GetLabel().GetStr().c_str()) != sc_GoQualArray.end()) {
149  for (auto term : outer_field->SetData().SetFields()) {
150  CUser_field &field = *term;
// Level 2: each term must itself hold sub-fields.
151  if (field.IsSetData() && field.GetData().IsFields()) {
152  for (auto inner_term_iter : field.SetData().SetFields()) {
153  CUser_field &inner_field = *inner_term_iter;
// Level 3: string label + string value.
154  if (inner_field.IsSetLabel() &&
155  inner_field.GetLabel().IsStr() &&
156  inner_field.IsSetData() &&
157  inner_field.GetData().IsStr()) {
158  const string &inner_label = inner_field.GetLabel().GetStr();
159  auto find_term = sc_OntologyTermCleanupMap.find(inner_label.c_str());
// The prefix test ignores case (NStr::eNocase); on a match exactly
// strlen(prefix) leading characters are erased.
160  if (find_term != sc_OntologyTermCleanupMap.end() &&
161  NStr::StartsWith(inner_field.SetData().SetStr(), find_term->second, NStr::eNocase)) {
162  inner_field.SetData().SetStr().erase(0, strlen(find_term->second));
163  any_change = true;
164  }
165  }
166  }
167  }
168  }
169  }
170  }
171 
172  return any_change;
173 }
174 
175 
176 
// NOTE(review): the declaration line that should precede this brace is absent
// from this listing (embedded number 177 is skipped); from the calls and labels
// below this is presumably the body of
// s_CleanupStructuredComment(CUser_object &obj) -- confirm against the repository.
//
// Cleans a structured-comment user object: drops empty fields, rewrites the
// StructuredCommentPrefix / StructuredCommentSuffix values, adds whichever of
// the two is missing, and runs extra passes for Genome-Assembly-Data and iBOL
// comments. Returns true if anything was modified.
178 {
179  bool any_change = false;
180 
// NOTE(review): the condition guarding this early return is missing from the
// listing (embedded number 181 is skipped); presumably it rejects objects that
// are not structured comments -- confirm.
182  return any_change;
183  }
184 
185  any_change |= s_RemoveEmptyFields(obj);
186 
// Checked after the removal above, which may have erased fields.
187  if (!obj.IsSetData()) {
188  return any_change;
189  }
190 
191  bool genome_assembly_data = false;
192  bool ibol_data = false;
193 
194  bool prefix_present = false;
195  bool suffix_present = false;
// Holds the value taken from the most recent prefix/suffix field seen in the
// loop; it is reused after the loop to build a missing prefix or suffix.
196  string core;
197 
198  const string kBarcode = "International Barcode of Life (iBOL)Data";
199  const string kGenomeAssemblyData = "Genome-Assembly-Data";
200 
201  for (auto field_i : obj.SetData()) {
202  CUser_field &field = *field_i;
// Only string-labelled fields with a string value are considered.
203  if (field.IsSetLabel() && field.GetLabel().IsStr()
204  && field.IsSetData() && field.GetData().IsStr()) {
205  bool is_prefix = NStr::Equal(field.GetLabel().GetStr(), "StructuredCommentPrefix");
206  if (is_prefix) {
207  prefix_present = true;
208  }
209  bool is_suffix = NStr::Equal(field.GetLabel().GetStr(), "StructuredCommentSuffix");
210  if (is_suffix) {
211  suffix_present = true;
212  }
213  if (is_prefix || is_suffix) {
214  core = CUtf8::AsUTF8(field.GetData().GetStr(), eEncoding_Ascii);
// NOTE(review): one statement is missing from the listing here (embedded number
// 215 is skipped); the comparisons of `core` against kGenomeAssemblyData and
// kBarcode below suggest `core` is reduced to its root at this point -- confirm.
216  string new_val = is_prefix ? CComment_rule::MakePrefixFromRoot(core) : CComment_rule::MakeSuffixFromRoot(core);
// Rewrite the stored value only when the rebuilt form differs.
217  if (!NStr::Equal(new_val, field.GetData().GetStr())) {
218  field.SetData().SetStr(new_val);
219  any_change = true;
220  }
221  if (core == kGenomeAssemblyData) {
222  genome_assembly_data = true;
223  } else if( core == kBarcode ) {
224  ibol_data = true;
225  }
226  }
227  }
228  }
// Prefix without suffix: build the suffix from `core` and append it as the
// last field.
229  if (prefix_present && !suffix_present) {
230  string suffix_val = CComment_rule::MakeSuffixFromRoot(core);
231  auto& data = obj.SetData();
// NOTE(review): the declaration of `suffix` is missing from the listing
// (embedded number 232 is skipped).
233  suffix->SetLabel().SetStr("StructuredCommentSuffix");
234  suffix->SetString(suffix_val);
235  data.push_back(suffix);
236  any_change = true;
237  }
// Suffix without prefix: build the prefix from `core` and insert it as the
// first field.
238  if (!prefix_present && suffix_present) {
239  string prefix_val = CComment_rule::MakePrefixFromRoot(core);
240  auto& data = obj.SetData();
// NOTE(review): the declaration of `prefix` is missing from the listing
// (embedded number 241 is skipped).
242  prefix->SetLabel().SetStr("StructuredCommentPrefix");
243  prefix->SetString(prefix_val);
244  data.emplace(data.begin(), prefix);
245  any_change = true;
246  }
247 
248  if( genome_assembly_data ) {
249  any_change |= s_CleanupGenomeAssembly(obj);
250  }
251 
// iBOL comments: reorder the fields using the comment rule found for kBarcode,
// when both the rule set and the rule are available.
252  if( ibol_data ) {
// NOTE(review): the declaration of `rules` is missing from the listing
// (embedded number 253 is skipped).
254  if (rules) {
255  CConstRef<CComment_rule> ruler = rules->FindCommentRuleEx(kBarcode);
256  if (ruler) {
257  const CComment_rule& rule = *ruler;
258  any_change |= rule.ReorderFields(obj);
259  }
260  }
261  }
262  return any_change;
263 }
264 
265 
// NOTE(review): the declaration that opens this table is missing from the
// listing (embedded numbers 266-267 are skipped); presumably this is
// k_finishing_cleanup_map -- confirm against the repository.
// Each entry pairs a field value with its replacement. The genome-assembly
// cleanup looks values up in sc_FinishingCleanupMap and overwrites a matching
// value with the second element.
268  {"Annotation Directed", "Annotation-Directed Improvement"},
269  {"High Quality Draft", "High-Quality Draft"},
270  {"Improved High Quality Draft", "Improved High-Quality Draft"},
271  {"Non-contiguous Finished", "Noncontiguous Finished"}
272 };
275 
// NOTE(review): the declaration line that should precede this brace is absent
// from this listing (embedded number 276 is skipped); from the labels handled
// below this is presumably the body of
// s_CleanupGenomeAssembly(CUser_object &obj) -- confirm against the repository.
//
// Normalizes selected fields of a Genome-Assembly-Data structured comment:
// finishing-status values are replaced from sc_FinishingCleanupMap and the
// "Assembly Date" value is rebuilt from its parsed components.
// Returns true if any field value was changed.
// There is no IsSetData() check here; the loop calls obj.SetData() directly.
277 {
278  bool any_change = false;
279  for (auto field_i : obj.SetData()) {
280  CUser_field &field = *field_i;
// Skip anything that is not a string-labelled field with a string value.
281  if (!field.IsSetLabel() || !field.GetLabel().IsStr() ||
282  !field.IsSetData() || !field.GetData().IsStr()) {
283  continue;
284  }
285  if (field.GetLabel().GetStr() == "Finishing Goal" ||
286  field.GetLabel().GetStr() == "Current Finishing Status" ) {
// Replace the value when it is a key of the finishing map.
287  auto replace = sc_FinishingCleanupMap.find(field.GetData().GetStr().c_str());
288  if (replace != sc_FinishingCleanupMap.end()) {
289  field.SetData().SetStr(replace->second);
290  any_change = true;
291  }
292  } else if( field.GetLabel().GetStr() == "Assembly Date" ) {
293  string &field_str = field.SetData().SetStr();
294  bool ambiguous = false;
295  string altered = CSubSource::FixDateFormat (field_str, true, ambiguous);
// A blank result leaves the field untouched.
296  if (!NStr::IsBlank(altered)) {
297  CRef<CDate> coll_date = CSubSource::DateFromCollectionDate (altered);
// A year is required; day and month are optional.
298  if (coll_date && coll_date->IsStd() && coll_date->GetStd().IsSetYear()) {
299  string day;
300  string month;
301  string year;
302  string new_date;
// Day and month are only emitted when FixDateFormat did not set `ambiguous`.
303  if (!ambiguous && coll_date->GetStd().IsSetDay()) {
304  coll_date->GetDate(&day, "%2D");
305  }
306  if (!ambiguous && coll_date->GetStd().IsSetMonth()) {
307  coll_date->GetDate(&month, "%N");
// Keep the first three characters of the month text, upper-cased.
308  month = month.substr(0, 3);
309  month = NStr::ToUpper(month);
310  }
311  coll_date->GetDate(&year, "%Y");
// Assemble "day-month-year", omitting blank parts together with their dash.
312  if (!NStr::IsBlank(day)) {
313  new_date += day + "-";
314  }
315  if (!NStr::IsBlank(month)) {
316  new_date += month + "-";
317  }
318  if (!NStr::IsBlank(year)) {
319  new_date += year;
320  }
// field_str is a reference into the field, so the assignment updates it in place.
321  if (!NStr::Equal(field_str, new_date)) {
322  field_str = new_date;
323  any_change = true;
324  }
325  }
326  }
327  }
328  }
329  return any_change;
330 }
331 
332 
// NOTE(review): the declaration line that should precede this brace is absent
// from this listing (embedded number 333 is skipped); from the erase loop below
// this is presumably the body of s_RemoveEmptyFields(CUser_object &obj) --
// confirm against the repository.
//
// Erases every field of obj that carries no usable value and returns true if
// at least one field was erased.
334 {
335  bool any_change = false;
// NOTE(review): the condition guarding this early return is missing from the
// listing (embedded number 336 is skipped).
337  return any_change;
338  }
339  if (!obj.IsSetData()) {
340  return any_change;
341  }
342 
343  CUser_object::TData::iterator it = obj.SetData().begin();
344  while (it != obj.SetData().end()) {
// A field counts as blank when its data is unset, no variant is selected, or
// it is a string for which NStr::IsBlank is true. Other variants are kept.
345  bool is_blank = false;
346  if ((*it)->IsSetData()) {
347  if ((*it)->GetData().IsStr()) {
348  const string& val = (*it)->GetData().GetStr();
349  if (NStr::IsBlank(val)) {
350  is_blank = true;
351  }
352  } else if ((*it)->GetData().Which() == CUser_field::TData::e_not_set) {
353  is_blank = true;
354  }
355  } else {
356  is_blank = true;
357  }
358 
// Continue from the iterator returned by erase(); advance manually otherwise.
359  if (is_blank) {
360  it = obj.SetData().erase(it);
361  any_change = true;
362  } else {
363  ++it;
364  }
365  }
366  return any_change;
367 }
368 
369 
// NOTE(review): the declaration line that should precede this brace is absent
// from this listing (embedded number 370 is skipped); by elimination among the
// helpers called from CleanupUserObject this is presumably the body of
// s_CleanupDBLink(CUser_object &obj) -- confirm against the repository.
//
// Converts every field whose data is a single string into a string list that
// holds that one value. Returns true if at least one field was converted.
371 {
372  bool changed = false;
// NOTE(review): the condition guarding this early return is missing from the
// listing (embedded number 373 is skipped).
374  return changed;
375  }
376  if (!obj.IsSetData()) {
377  return changed;
378  }
379  for (auto& it : obj.SetData()) {
380  if (it->IsSetData() && it->GetData().IsStr()) {
// The value is copied into a local before SetStrs() is called on the same data
// object (presumably because selecting the list variant discards the string
// -- confirm).
381  string val = it->GetData().GetStr();
382  it->SetData().SetStrs().push_back(val);
383  changed = true;
384  }
385  }
386  return changed;
387 }
388 
389 
// NOTE(review): the declaration line that should precede this brace is absent
// from this listing (embedded number 390 is skipped); from the SetNum calls
// below this is presumably the body of s_AddNumToUserField(CUser_field &field)
// -- confirm against the repository.
//
// Makes the field's Num member agree with its payload: for the list payloads
// handled below Num becomes the element count; for any other payload an
// already-set Num other than 1 is reset to 1. Returns true if Num was written.
391 {
392  if (!field.IsSetData()) {
393  return false;
394  }
395  bool any_change = false;
// NOTE(review): the `case` labels of this switch are missing from the listing
// (embedded numbers 397, 403, 409 and 415 are skipped). The variant handled by
// each arm can only be inferred from the accessor it uses (GetStrs, GetInts,
// GetReals, GetOss) -- confirm the labels against the repository.
396  switch (field.GetData().Which()) {
// String list: Num is set when missing or different from the list size.
398  if (!field.IsSetNum() || field.GetNum() != field.GetData().GetStrs().size()) {
399  field.SetNum(field.GetData().GetStrs().size());
400  any_change = true;
401  }
402  break;
// Integer list: same rule.
404  if (!field.IsSetNum() || field.GetNum() != field.GetData().GetInts().size()) {
405  field.SetNum(field.GetData().GetInts().size());
406  any_change = true;
407  }
408  break;
// Real list: same rule.
410  if (!field.IsSetNum() || field.GetNum() != field.GetData().GetReals().size()) {
411  field.SetNum(field.GetData().GetReals().size());
412  any_change = true;
413  }
414  break;
// Octet-string list: same rule.
416  if (!field.IsSetNum() || field.GetNum() != field.GetData().GetOss().size()) {
417  field.SetNum(field.GetData().GetOss().size());
418  any_change = true;
419  }
420  break;
// Every other payload: an unset Num stays unset; a set Num is forced to 1.
421  default:
422  if (field.IsSetNum() && field.GetNum() != 1) {
423  field.SetNum(1);
424  any_change = true;
425  }
426  break;
427  }
428  return any_change;
429 }
430 
431 
User-defined methods of the data storage class.
User-defined methods of the data storage class.
static bool s_CleanupDBLink(CUser_object &obj)
static bool s_CleanupGeneOntology(CUser_object &obj)
static bool s_CleanupStructuredComment(CUser_object &obj)
static bool s_CleanupGenomeAssembly(CUser_object &obj)
static bool CleanupUserObject(CUser_object &object)
static bool x_CleanupUserField(CUser_field &field)
static bool s_AddNumToUserField(CUser_field &field)
static bool s_RemoveEmptyFields(CUser_object &obj)
static string MakeSuffixFromRoot(const string &root)
static string MakePrefixFromRoot(const string &root)
bool ReorderFields(CUser_object &user) const
static void NormalizePrefix(string &prefix)
static CConstRef< CComment_set > GetCommentRules()
CConstRef –.
Definition: ncbiobj.hpp:1266
void GetDate(string *label, bool year_only=false) const
Append a standardized string representation of the date to the label.
Definition: Date.hpp:149
class CStaticArrayMap<> provides access to a static array in much the same way as CStaticArraySet<>,...
Definition: static_map.hpp:175
static string FixDateFormat(const string &orig_date)
Attempt to fix the format of the date. Returns a blank if the format of the date cannot be determined.
Definition: SubSource.cpp:620
static CRef< CDate > DateFromCollectionDate(const string &str) THROWS((CException))
Definition: SubSource.cpp:287
@ eObjectType_StructuredComment
EObjectType GetObjectType() const
SStaticPair< const char *, const char * > TFinishingCleanupElem
DEFINE_STATIC_ARRAY_MAP(TOntologyCleanupMap, sc_OntologyTermCleanupMap, k_ontology_term_cleanup_map)
SStaticPair< const char *, const char * > TOntologyCleanupElem
CStaticArrayMap< const char *, const char *, PNocase_CStr > TFinishingCleanupMap
static const TOntologyCleanupElem k_ontology_term_cleanup_map[]
CStaticArrayMap< const char *, const char *, PNocase_CStr > TOntologyCleanupMap
static const TFinishingCleanupElem k_finishing_cleanup_map[]
bool CleanVisString(string &str)
bool Asn2gnbkCompressSpaces(string &val)
weird space compression from C Toolkit
Include a standard set of the NCBI C++ Toolkit most basic headers.
#define bool
Definition: bool.h:34
static const char * str(char *buf, int n)
Definition: stats.c:84
char data[12]
Definition: iconv.c:80
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
static CStringUTF8 AsUTF8(const CTempString &src, EEncoding encoding, EValidate validate=eNoValidate)
Convert into UTF8 from a C/C++ string.
Definition: ncbistr.hpp:3889
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5384
static string & ToUpper(string &str)
Convert string to upper case – string& version.
Definition: ncbistr.cpp:424
@ eEncoding_Ascii
Definition: ncbistr.hpp:202
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
const TStr & GetStr(void) const
Get the variant data.
bool IsSetData(void) const
the object itself Check if a value has been assigned to Data data member.
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
bool IsSetType(void) const
type of object within class Check if a value has been assigned to Type data member.
bool IsSetYear(void) const
full year (including 1900) Check if a value has been assigned to Year data member.
Definition: Date_std_.hpp:407
bool IsStd(void) const
Check if variant Std is selected.
Definition: Date_.hpp:320
const TStrs & GetStrs(void) const
Get the variant data.
const TData & GetData(void) const
Get the Data member data.
const TOss & GetOss(void) const
Get the variant data.
bool IsFields(void) const
Check if variant Fields is selected.
bool IsSetDay(void) const
day of month (1-31) Check if a value has been assigned to Day data member.
Definition: Date_std_.hpp:501
bool IsStr(void) const
Check if variant Str is selected.
bool IsSetLabel(void) const
field label Check if a value has been assigned to Label data member.
TData & SetData(void)
Assign a value to Data data member.
void SetNum(TNum value)
Assign a value to Num data member.
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
bool IsSetNum(void) const
required for strs, ints, reals, oss Check if a value has been assigned to Num data member.
void SetLabel(TLabel &value)
Assign a value to Label data member.
const TReals & GetReals(void) const
Get the variant data.
bool IsSetMonth(void) const
month (1-12) Check if a value has been assigned to Month data member.
Definition: Date_std_.hpp:454
void SetData(TData &value)
Assign a value to Data data member.
const TLabel & GetLabel(void) const
Get the Label member data.
const TType & GetType(void) const
Get the Type member data.
bool IsSetData(void) const
Check if a value has been assigned to Data data member.
const TStd & GetStd(void) const
Get the variant data.
Definition: Date_.cpp:109
E_Choice Which(void) const
Which variant is currently selected.
const TInts & GetInts(void) const
Get the variant data.
TNum GetNum(void) const
Get the Num member data.
@ e_Object
for using other definitions
@ e_not_set
No variant selected.
The Object manager core.
static const char * suffix[]
Definition: pcregrep.c:408
static const char * prefix[]
Definition: pcregrep.c:405
const string kGenomeAssemblyData
Template structure SStaticPair is a simplified replacement of STL pair<>. Main reason of introducing this...
Definition: static_set.hpp:60
Modified on Wed Apr 17 13:10:57 2024 by modify_doxy.py rev. 669887