NCBI C++ ToolKit
go_term_validation_and_cleanup.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: go_term_validation_and_cleanup.cpp 93578 2021-05-01 14:54:18Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Colleen Bollin
27  *
28  * File Description:
29  * validation and cleanup of GeneOntology User-object
30  * .......
31  *
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistr.hpp>
36 
40 
42 
45 BEGIN_SCOPE(validator)
46 
47 
48 static const string kGoTermText = "text string";
49 static const string kGoTermID = "go id";
50 static const string kGoTermPubMedID = "pubmed id";
51 static const string kGoTermRef = "go ref";
52 static const string kGoTermEvidence = "evidence";
53 
55 static const string kGoTermComponent = "Component";
56 static const string kGoTermFunction = "Function";
57 
58 static const string kGeneOntology = "GeneOntology";
59 
61 {
62 public:
63  CGoTermSortStruct (const CUser_object::TData& sublist); // parse from field list
64 
65 
66  static bool IsLegalGoTermType(const string& val);
67 
68  const string& GetTerm() const { return m_Term; }
69 
70  const string& GetGoid() const { return m_Goid; }
71 
72  int GetPmid() const { return m_Pmid; }
73 
74  const set<string>& GetEvidence() const { return m_Evidence; }
75 
76  const vector<string>& GetErrors() const { return m_Errors; }
77 
78 protected:
79  string m_Term;
80  string m_Goid;
81  int m_Pmid;
83  vector<string> m_Errors;
84 
85 };
86 
87 
88 
90  m_Term(kEmptyStr), m_Goid(kEmptyStr), m_Pmid(0)
91 {
92  m_Evidence.clear();
93  m_Errors.clear();
94  for (auto sub_it : sublist) {
95  string label;
96  if (sub_it->IsSetLabel() && sub_it->GetLabel().IsStr()) {
97  label = sub_it->GetLabel().GetStr();
98  }
99  if (NStr::IsBlank(label)) {
100  label = "[blank]";
101  }
102  if (NStr::Equal(label, kGoTermText)) {
103  if (sub_it->GetData().IsStr()) {
104  m_Term = sub_it->GetData().GetStr();
105  } else {
106  m_Errors.push_back("Bad data format for GO term qualifier term");
107  }
108  } else if (NStr::Equal(label, kGoTermID)) {
109  if (sub_it->GetData().IsInt()) {
110  m_Goid = NStr::IntToString(sub_it->GetData().GetInt());
111  } else if (sub_it->GetData().IsStr()) {
112  m_Goid = sub_it->GetData().GetStr();
113  } else {
114  m_Errors.push_back("Bad data format for GO term qualifier GO ID");
115  }
116  } else if (NStr::Equal(label, kGoTermPubMedID)) {
117  if (sub_it->GetData().IsInt()) {
118  m_Pmid = sub_it->GetData().GetInt();
119  } else {
120  m_Errors.push_back("Bad data format for GO term qualifier PMID");
121  }
122  } else if (NStr::Equal(label, kGoTermEvidence)) {
123  if (sub_it->GetData().IsStr()) {
124  m_Evidence.insert(sub_it->GetData().GetStr());
125  } else {
126  m_Errors.push_back("Bad data format for GO term qualifier evidence");
127  }
128  } else if (NStr::Equal(label, kGoTermRef)) {
129  // recognized term
130 
131  } else {
132  m_Errors.push_back("Unrecognized label on GO term qualifier field " + label);
133  }
134  }
135 }
136 
137 
139 {
140  int compare = NStr::Compare (l.GetTerm(), r.GetTerm());
141  if (compare == 0) {
142  compare = NStr::Compare (l.GetGoid(), r.GetGoid());
143  }
144  if (compare > 0) return false;
145  if (compare < 0) return true;
146 
147  if (l.GetPmid() > r.GetPmid()) {
148  return false;
149  } else if (l.GetPmid() < r.GetPmid()) {
150  return true;
151  }
152 
153  auto ev1 = l.GetEvidence();
154  auto ev2 = r.GetEvidence();
155  if (ev1.size() > ev2.size()) {
156  return false;
157  } else if (ev1.size() < ev2.size()) {
158  return true;
159  }
160  auto it1 = ev1.begin();
161  auto it2 = ev2.begin();
162  while (it1 != ev1.end() && it2 != ev2.end() && compare == 0) {
163  compare = NStr::Compare(*it1, *it2);
164  it1++;
165  it2++;
166  }
167 
168  return (compare < 0);
169 }
170 
171 
173 {
177  || NStr::IsBlank(val)) {
178  return true;
179  } else {
180  return false;
181  }
182 }
183 
184 
185 bool IsGeneOntology(const CUser_object& user_object)
186 {
187  if (user_object.IsSetType() && user_object.GetType().IsStr() &&
188  NStr::EqualCase(user_object.GetType().GetStr(), kGeneOntology)) {
189  return true;
190  } else {
191  return false;
192  }
193 }
194 
195 
196 void GetGoTermErrors(CUser_object::TData field_list, map<string, string>& id_terms, vector<TGoTermError>& errors)
197 {
199 
200  size_t num_terms = 0;
201  for (auto it : field_list) {
202  if (!it->IsSetData() || !it->GetData().IsFields()) {
203  errors.push_back(TGoTermError(eErr_SEQ_FEAT_BadGeneOntologyFormat, "Bad GO term format"));
204  continue;
205  }
206 
207  CUser_object::TData sublist = it->GetData().GetFields();
208  // create sort structure and add to set
209  CGoTermSortStruct a(it->GetData().GetFields());
210  terms.insert(a);
211  // report errors
212  for (auto msg : a.GetErrors()) {
213  errors.push_back(TGoTermError(eErr_SEQ_FEAT_BadGeneOntologyFormat, msg));
214  }
215  if (NStr::IsBlank(a.GetGoid())) {
216  errors.push_back(TGoTermError(eErr_SEQ_FEAT_GeneOntologyTermMissingGOID, "GO term does not have GO identifier"));
217  }
218 
219  // add id/term pair
220  pair<string, string> p(a.GetGoid(), a.GetTerm());
221  auto s = id_terms.find(a.GetGoid());
222  if (s == id_terms.end()) {
223  id_terms[a.GetGoid()] = a.GetTerm();
224  } else if (!NStr::Equal(a.GetTerm(), s->second)) {
226  "Inconsistent GO terms for GO ID " + a.GetGoid()));
227 
228  }
229  num_terms++;
230  }
231  if (num_terms > terms.size()) {
232  errors.push_back(TGoTermError(eErr_SEQ_FEAT_DuplicateGeneOntologyTerm, "Duplicate GO term on feature"));
233  }
234 }
235 
236 
237 vector<TGoTermError> GetGoTermErrors(const CSeq_feat& feat)
238 {
239  vector<TGoTermError> rval;
240 
241  if (!feat.IsSetExt()) {
242  return rval;
243  }
244  const CUser_object& user_object = feat.GetExt();
245  if (!IsGeneOntology(user_object) ||
246  !user_object.IsSetData()) {
247  return rval;
248  }
249 
250  map<string, string> id_terms;
251  // iterate through fields
252  for (auto it : user_object.GetData()) {
253  // validate terms if match accepted type
254  if (!it->GetData().IsFields()) {
255  rval.push_back(TGoTermError(eErr_SEQ_FEAT_BadGeneOntologyFormat, "Bad data format for GO term"));
256  } else if (!it->IsSetLabel() || !it->GetLabel().IsStr() || !it->IsSetData()) {
257  rval.push_back(TGoTermError(eErr_SEQ_FEAT_BadGeneOntologyFormat, "Unrecognized GO term label [blank]"));
258  } else {
259  string qualtype = it->GetLabel().GetStr();
260  if (CGoTermSortStruct::IsLegalGoTermType(qualtype)) {
261  if (it->IsSetData()
262  && it->GetData().IsFields()) {
263  GetGoTermErrors(it->GetData().GetFields(), id_terms, rval);
264  }
265  } else {
266  rval.push_back(TGoTermError(eErr_SEQ_FEAT_BadGeneOntologyFormat, "Unrecognized GO term label " + qualtype));
267  }
268  }
269  }
270  return rval;
271 }
272 
273 
274 //LCOV_EXCL_START
275 //not used by validation, will be used by Genome Workbench menu item
277 {
278  bool rval = false;
279 
281 
282  auto it = field_list.begin();
283  while (it != field_list.end()) {
284  if (!(*it)->IsSetData() || !(*it)->GetData().IsFields()) {
285  ++it;
286  continue;
287  }
288 
289  // create sort structure and add to list if not already found
290  CGoTermSortStruct a((*it)->GetData().GetFields());
291  if (terms.find(a) != terms.end()) {
292  it = field_list.erase(it);
293  rval = true;
294  } else {
295  terms.insert(a);
296  ++it;
297  }
298  }
299 
300  return rval;
301 }
302 
303 
305 {
306  bool rval = false;
307  if (!feat.IsSetExt()) {
308  return rval;
309  }
310  CUser_object& user_object = feat.SetExt();
311  if (!IsGeneOntology(user_object) ||
312  !user_object.IsSetData()) {
313  return rval;
314  }
315 
316  // iterate through fields
317  for (auto it : user_object.SetData()) {
318  // only remove duplicates from properly formatted fields with accepted type
319  if (!it->GetData().IsFields()) {
320  // skip it
321  } else if (!it->IsSetLabel() || !it->GetLabel().IsStr() || !it->IsSetData()) {
322  // skip it
323  } else {
324  string qualtype = it->GetLabel().GetStr();
325  if (CGoTermSortStruct::IsLegalGoTermType(qualtype)) {
326  if (it->IsSetData()
327  && it->GetData().IsFields()) {
328  rval |= RemoveDuplicateGoTerms(it->SetData().SetFields());
329  }
330  }
331  }
332  }
333  return rval;
334 }
335 
336 
337 void SetGoTermValue(CUser_field& field, const string& val, const string& val_name)
338 {
339  bool found_existing = false;
340  if (field.IsSetData() && field.GetData().IsFields()) {
341  auto it = field.SetData().SetFields().begin();
342  while (it != field.SetData().SetFields().end()) {
343  bool do_erase = false;
344  if ((*it)->IsSetLabel() && (*it)->GetLabel().IsStr() &&
345  NStr::Equal((*it)->GetLabel().GetStr(), val_name)) {
346  if (found_existing) {
347  do_erase = true;
348  } else {
349  (*it)->SetData().SetStr(val);
350  found_existing = true;
351  }
352  }
353  if (do_erase) {
354  it = field.SetData().SetFields().erase(it);
355  } else {
356  it++;
357  }
358  }
359  }
360  if (!found_existing) {
361  CRef<CUser_field> go_id(new CUser_field());
362  go_id->SetLabel().SetStr(val_name);
363  go_id->SetData().SetStr(val);
364  field.SetData().SetFields().push_back (go_id);
365  }
366 }
367 
368 
369 void SetGoTermValue(CUser_field& field, int val, const string& val_name)
370 {
371  bool found_existing = false;
372  if (field.IsSetData() && field.GetData().IsFields()) {
373  auto it = field.SetData().SetFields().begin();
374  while (it != field.SetData().SetFields().end()) {
375  bool do_erase = false;
376  if ((*it)->IsSetLabel() && (*it)->GetLabel().IsStr() &&
377  NStr::Equal((*it)->GetLabel().GetStr(), val_name)) {
378  if (found_existing) {
379  do_erase = true;
380  } else {
381  (*it)->SetData().SetInt(val);
382  found_existing = true;
383  }
384  }
385  if (do_erase) {
386  it = field.SetData().SetFields().erase(it);
387  } else {
388  it++;
389  }
390  }
391  }
392  if (!found_existing) {
393  CRef<CUser_field> go_id(new CUser_field());
394  go_id->SetLabel().SetStr(val_name);
395  go_id->SetData().SetInt(val);
396  field.SetData().SetFields().push_back (go_id);
397  }
398 }
399 
400 
401 void ClearGoTermValue(CUser_field& field, const string& val_name)
402 {
403  if (field.IsSetData() && field.GetData().IsFields()) {
404  auto it = field.SetData().SetFields().begin();
405  while (it != field.SetData().SetFields().end()) {
406  if ((*it)->IsSetLabel() && (*it)->GetLabel().IsStr() &&
407  NStr::Equal((*it)->GetLabel().GetStr(), val_name)) {
408  it = field.SetData().SetFields().erase(it);
409  } else {
410  it++;
411  }
412  }
413  }
414 }
415 
416 
417 void SetGoTermId(CUser_field& field, const string& val)
418 {
419  SetGoTermValue(field, val, kGoTermID);
420 }
421 
422 
423 void SetGoTermText(CUser_field& field, const string& val)
424 {
425  SetGoTermValue(field, val, kGoTermText);
426 }
427 
428 
429 void SetGoTermPMID(CUser_field& field, int pmid)
430 {
431  SetGoTermValue(field, pmid, kGoTermPubMedID);
432 }
433 
434 
435 void AddGoTermEvidence(CUser_field& field, const string& val)
436 {
437  CRef<CUser_field> go_id(new CUser_field());
438  go_id->SetLabel().SetStr(kGoTermEvidence);
439  go_id->SetData().SetStr(val);
440  field.SetData().SetFields().push_back (go_id);
441 }
442 
443 
445 {
447 }
448 
449 
451 {
453 }
454 
455 
456 void AddGoTermToList(CSeq_feat& feat, CRef<CUser_field> field, const string& val_name)
457 {
458  if (feat.IsSetExt() && !IsGeneOntology(feat.GetExt())) {
459  return;
460  } else if (!feat.IsSetExt()) {
461  feat.SetExt().SetType().SetStr(kGeneOntology);
462  }
463 
464  bool found_existing = false;
465  if (feat.GetExt().IsSetData()) {
466  for (auto it : feat.SetExt().SetData()) {
467  if (it->IsSetLabel() &&
468  it->GetLabel().IsStr() &&
469  NStr::Equal(it->GetLabel().GetStr(), val_name) &&
470  (!it->IsSetData() || it->GetData().IsFields())) {
471  it->SetData().SetFields().push_back(field);
472  found_existing = true;
473  }
474  }
475  }
476  if (!found_existing) {
477  CRef<CUser_field> new_list(new CUser_field());
478  new_list->SetLabel().SetStr(val_name);
479  new_list->SetData().SetFields().push_back(field);
480  feat.SetExt().SetData().push_back(new_list);
481  }
482 }
483 
484 
486 {
487  AddGoTermToList(feat, field, kGoTermProcess);
488 }
489 
490 
492 {
493  AddGoTermToList(feat, field, kGoTermComponent);
494 }
495 
496 
498 {
499  AddGoTermToList(feat, field, kGoTermFunction);
500 }
501 
502 
503 size_t CountGoTerms(const CSeq_feat& feat, const string& list_name)
504 {
505  if (!feat.IsSetExt() || !IsGeneOntology(feat.GetExt()) ||
506  !feat.GetExt().IsSetData()) {
507  return 0;
508  }
509  for (auto it : feat.GetExt().GetData()) {
510  if (it->IsSetLabel() && it->GetLabel().IsStr() &&
511  NStr::Equal(it->GetLabel().GetStr(), list_name) &&
512  it->IsSetData() &&
513  it->GetData().IsFields()) {
514  return it->GetData().GetFields().size();
515  }
516  }
517  return 0;
518 }
519 
520 
521 size_t CountProcessGoTerms(const CSeq_feat& feat)
522 {
523  return CountGoTerms(feat, kGoTermProcess);
524 }
525 
526 
527 size_t CountComponentGoTerms(const CSeq_feat& feat)
528 {
529  return CountGoTerms(feat, kGoTermComponent);
530 }
531 
532 
533 size_t CountFunctionGoTerms(const CSeq_feat& feat)
534 {
535  return CountGoTerms(feat, kGoTermFunction);
536 }
537 //LCOV_EXCL_STOP
538 
539 
540 END_SCOPE(validator)
#define static
@ eErr_SEQ_FEAT_BadGeneOntologyFormat
@ eErr_SEQ_FEAT_GeneOntologyTermMissingGOID
@ eErr_SEQ_FEAT_DuplicateGeneOntologyTerm
@ eErr_SEQ_FEAT_InconsistentGeneOntologyTermAndId
static bool IsLegalGoTermType(const string &val)
const string & GetGoid() const
const set< string > & GetEvidence() const
const vector< string > & GetErrors() const
CGoTermSortStruct(const CUser_object::TData &sublist)
const string & GetTerm() const
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
const_iterator end() const
Definition: map.hpp:152
const_iterator find(const key_type &key) const
Definition: map.hpp:153
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
const_iterator begin() const
Definition: set.hpp:135
void clear()
Definition: set.hpp:153
size_type size() const
Definition: set.hpp:132
const_iterator find(const key_type &key) const
Definition: set.hpp:137
const_iterator end() const
Definition: set.hpp:136
Include a standard set of the NCBI C++ Toolkit most basic headers.
The NCBI C++ standard methods for dealing with std::string.
static const string kGoTermText
void SetGoTermPMID(CUser_field &field, int pmid)
bool RemoveDuplicateGoTerms(CUser_object::TData &field_list)
size_t CountProcessGoTerms(const CSeq_feat &feat)
static const string kGeneOntology
static const string kGoTermComponent
void AddFunctionGoTerm(CSeq_feat &feat, CRef< CUser_field > field)
size_t CountComponentGoTerms(const CSeq_feat &feat)
void ClearGoTermPMID(CUser_field &field)
void SetGoTermText(CUser_field &field, const string &val)
void AddProcessGoTerm(CSeq_feat &feat, CRef< CUser_field > field)
static const string kGoTermEvidence
void SetGoTermValue(CUser_field &field, const string &val, const string &val_name)
size_t CountGoTerms(const CSeq_feat &feat, const string &list_name)
size_t CountFunctionGoTerms(const CSeq_feat &feat)
void ClearGoTermEvidence(CUser_field &field)
void SetGoTermId(CUser_field &field, const string &val)
void AddGoTermEvidence(CUser_field &field, const string &val)
void AddComponentGoTerm(CSeq_feat &feat, CRef< CUser_field > field)
bool operator<(const CGoTermSortStruct &l, const CGoTermSortStruct &r)
static const string kGoTermID
static const string kGoTermFunction
static const string kGoTermPubMedID
void AddGoTermToList(CSeq_feat &feat, CRef< CUser_field > field, const string &val_name)
void ClearGoTermValue(CUser_field &field, const string &val_name)
static const string kGoTermRef
static const string kGoTermProcess
bool IsGeneOntology(const CUser_object &user_object)
void GetGoTermErrors(CUser_object::TData field_list, map< string, string > &id_terms, vector< TGoTermError > &errors)
pair< EErrType, string > TGoTermError
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define kEmptyStr
Definition: ncbistr.hpp:123
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5083
static bool EqualCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive equality of a substring with another string.
Definition: ncbistr.hpp:5324
static int Compare(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Compare of a substring with another string.
Definition: ncbistr.hpp:5296
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5352
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5383
virtual void Process(SOCK sock)=0
Runs asynchronously (from a separate thread) for each request.
static const char label[]
bool IsSetData(void) const
the object itself Check if a value has been assigned to Data data member.
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
bool IsSetType(void) const
type of object within class Check if a value has been assigned to Type data member.
const TData & GetData(void) const
Get the Data member data.
bool IsFields(void) const
Check if variant Fields is selected.
TData & SetData(void)
Assign a value to Data data member.
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
void SetLabel(TLabel &value)
Assign a value to Label data member.
const TData & GetData(void) const
Get the Data member data.
void SetData(TData &value)
Assign a value to Data data member.
const TType & GetType(void) const
Get the Type member data.
bool IsSetData(void) const
Check if a value has been assigned to Data data member.
vector< CRef< CUser_field > > TData
bool IsSetExt(void) const
user defined structure extension Check if a value has been assigned to Ext data member.
Definition: Seq_feat_.hpp:1207
void SetExt(TExt &value)
Assign a value to Ext data member.
Definition: Seq_feat_.cpp:153
const TExt & GetExt(void) const
Get the Ext member data.
Definition: Seq_feat_.hpp:1219
static void text(MDB_val *v)
Definition: mdb_dump.c:62
unsigned int a
Definition: ncbi_localip.c:102
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
#define const
Definition: zconf.h:230
Modified on Fri Dec 01 04:47:47 2023 by modify_doxy.py rev. 669887