NCBI C++ ToolKit
tax_validation_and_cleanup.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: tax_validation_and_cleanup.hpp 101919 2024-03-01 19:51:04Z kans $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Colleen Bollin
27  *
28  * File Description:
29  * Tools for batch processing taxonomy-related validation and cleanup
30  * .......
31  *
32  */
33 
34 #ifndef VALIDATOR___TAX_VALIDATION_AND_CLEANUP__HPP
35 #define VALIDATOR___TAX_VALIDATION_AND_CLEANUP__HPP
36 
37 #include <corelib/ncbistd.hpp>
43 
46 BEGIN_SCOPE(validator)
47 
48 class CValidError_imp;
49 
50 
51 // For Taxonomy Lookups and Fixups
52 //
53 // For validation, we need to be able to look up an Org-ref and determine
54 // whether the tax ID in the record is the same as what is returned by
55 // the taxonomy service.
56 // For cleanup, we want to look up an Org-ref and replace the existing Org-ref
57 // in the record with what is returned by the taxonomy service.
58 //
59 // Several qualifiers other than Org-ref.taxname may also contain scientific names.
60 // It is possible that the scientific name is merely a portion of the string.
61 //
62 // In the case of specific host, we want to be able to identify names that are
63 // misspelled or unrecognized. Unfortunately, common names are also
64 // acceptable for specific host, and it can be difficult to detect whether a
65 // value is a scientific name or a common name. The current method requires
66 // the string to contain at least two words, the first of which must be capitalized.
67 // Unfortunately, this fails for "Rhesus monkey", "Atlantic white-sided dolphin",
68 // and others, and fails to catch the obvious miscapitalization "homo sapiens".
69 // See SQD-4325 for ongoing discussion.
70 // For validation, these values are reported. For cleanup, we replace the
71 // original value with a corrected value where possible.
72 //
73 // In the case of strain, scientific names should *not* be present in certain
74 // situations. For validation, these values will be reported, once TM-725 is
75 // resolved.
76 //
77 // Often the same value will occur many, many times in the same record, and we
78 // would like to avoid redundant lookups.
79 // Taxonomy requests should be separated into manageable chunks.
80 // In order for the undo commands to work correctly in Genome Workbench, we need
81 // a method that allows Genome Workbench to control when the updates are made.
82 //
83 // Note that Org-refs can be found in both features and source descriptors.
84 // It is necessary to record the parents of the Org-refs for which lookups are
85 // made and for which lookups of qualifiers are made, in order to report
86 // and/or clean them.
87 //
88 
89 typedef struct {
     // One taxonomy-related problem, as collected by the ListErrors() methods
     // declared in this header (they fill a vector<TTaxError>).
     // NOTE(review): listing lines 90-91 are absent from this extract, so any
     // members declared there are not shown here.
     // Text of the error; presumably the message that is ultimately reported --
     // confirm against the ListErrors() implementations.
92  string err_msg;
93 } TTaxError;
94 
95 
96 // This base class represents a request for a qualifier value.
97 // The same qualifier value will be found in multiple Org-refs, which will
98 // be represented in the parents (m_Descs and m_Feats).
99 // A single qualifier could have multiple strings to be sent to taxonomy
100 // (try the whole value, try just the first two tokens, etc.). These will be
101 // represented in m_ValuesToTry.
103 {
     // NOTE(review): the class declaration line (listing line 102) and the
     // constructor (listing line 105) are absent from this extract.
104 public:
106 // ~CQualifierRequest() override {}
107 
     // Register a parent that carries this qualifier value: either a source
     // descriptor together with its Seq-entry context, or a source feature.
     // Per the class comment, parents are represented in m_Descs and m_Feats.
108  void AddParent(CConstRef<CSeqdesc> desc, CConstRef<CSeq_entry> ctx);
109  void AddParent(CConstRef<CSeq_feat> feat);
110 
     // Adds this request's lookups to request_list (an output parameter).
     // Presumably one Org-ref per entry of m_ValuesToTry -- confirm in the .cpp.
111  void AddRequests(vector<CRef<COrg_ref> >& request_list) const;
     // Presumably reports whether val is one of the strings in m_ValuesToTry --
     // confirm in the .cpp.
112  bool MatchTryValue(const string& val) const;
     // Number of values to try minus the number of replies already processed.
     // NOTE(review): m_RepliesProcessed is declared on a listing line that is
     // absent from this extract (line 123); its type is not visible here.
113  size_t NumRemainingReplies() const { return m_ValuesToTry.size() - m_RepliesProcessed; }
114 
     // Consume one taxonomy reply for this request. Pure virtual: each
     // qualifier-specific request class supplies its own interpretation.
115  virtual void AddReply(const CT3Reply& reply, TTaxId descTaxID) = 0;
     // Post the accumulated problems to the validator implementation.
116  void PostErrors(CValidError_imp& imp);
     // Append this request's problems to errs. Pure virtual.
117  virtual void ListErrors(vector<TTaxError>& errs) const = 0;
118 
119 protected:
120  void x_Init();
121 
     // Strings to be sent to taxonomy for this qualifier (e.g. the whole value,
     // just the first two tokens, etc. -- see the class comment).
122  vector<string> m_ValuesToTry;
124 
     // Parents of the Org-refs in which this qualifier value was found:
     // descriptors paired with their Seq-entry context, and features.
125  typedef pair<CConstRef<CSeqdesc>, CConstRef<CSeq_entry> > TDescPair;
126  vector<TDescPair> m_Descs;
127  vector<CConstRef<CSeq_feat> > m_Feats;
128 };
129 
130 // Specific host values can be classified as normal, ambiguous, or unrecognized.
131 // We can also suggest a better value to use instead.
133 {
     // NOTE(review): the class declaration line (listing line 132) is absent
     // from this extract.
134 public:
     // orig_val is the specific-host qualifier value and org the Org-ref it
     // came from. for_fix defaults to false; presumably it selects cleanup
     // rather than validation behaviour -- confirm in the .cpp.
135  CSpecificHostRequest(const string& orig_val, const COrg_ref& org, bool for_fix = false);
136 // ~CSpecificHostRequest() override {}
137 
     // NOTE(review): the opening line of this enum (listing line 138) and the
     // enumerators on listing lines 140-141 are absent from this extract; only
     // the first and last enumerators are visible.
139  eNormal = 0,
142  eAlternateName
143  };
144  typedef int TResponseFlags;
145 
     // Overrides of the request interface for specific-host values.
146  void AddReply(const CT3Reply& reply, TTaxId descTaxID) override;
147  void ListErrors(vector<TTaxError>& errs) const override;
148 
     // Returns a reference to the suggested replacement value (the comment
     // preceding this class says a better value can be suggested). Which
     // member it refers to is not visible here.
149  const string& SuggestFix() const;
150 
151 private:
     // NOTE(review): listing lines 153-154 and 156 are absent from this
     // extract, so further private members are not shown.
152  string m_Host;
155  string m_Error;
157  string m_OrgLineage;
158 };
159 
160 
162 {
     // NOTE(review): the class declaration line (listing line 161) is absent
     // from this extract.
163 public:
     // strain is the strain qualifier value and org the Org-ref it came from.
164  CStrainRequest(const string& strain, const COrg_ref& org);
165 // ~CStrainRequest() override {}
166 
     // Overrides of the request interface for strain values.
167  void AddReply(const CT3Reply& reply, TTaxId descTaxID) override;
168  void ListErrors(vector<TTaxError>& errs) const override;
169 
     // Builds the de-duplication key for a strain lookup from the strain value
     // and the taxname; CStrainMap::GetKey calls this.
170  static string MakeKey(const string& strain, const string& taxname);
     // Presumably decides whether the taxname must be part of the key --
     // confirm in the .cpp.
171  static bool RequireTaxname(const string& taxname);
     // Whether the strain of this Org-ref should be examined at all;
     // CStrainMap::Check delegates to this.
172  static bool Check(const COrg_ref& org);
173 
174 private:
175  string m_Strain;
176  string m_Taxname;
     // NOTE(review): listing line 177 is absent from this extract.
178  static bool x_IsUnwanted(const string& str);
179  static bool x_IgnoreStrain(const string& str);
180 };
181 
182 
183 // The map is used to eliminate duplicate taxonomy requests.
184 // The keys used may depend on just the qualifier value or may
185 // be a combination of the qualifier value and other values from
186 // the Org-ref (in the case of strain, this is sometimes taxname).
188 {
     // NOTE(review): the class declaration line (listing line 187) is absent
     // from this extract.
189 public:
     // Stores the OrgMod subtype this map handles; the map starts out with
     // m_Populated set to false.
190  CQualLookupMap(COrgMod::ESubtype subtype) : m_Subtype(subtype), m_Populated(false) {}
191  virtual ~CQualLookupMap() {}
192 
     // Returns the current value of the m_Populated flag.
193  bool IsPopulated() const { return m_Populated; }
194 
195  void Clear();
196 
197  // GetKey gets a string key that is used to determine whether the lookup for two Org-refs
198  // will be the same.
199  // * For validating specific hosts, this would be the original value.
200  // * For fixing specific hosts, this would be the original value after default
201  // fixes have been applied
202  // * For validating strain, this might be the original value or it might be the original
203  // value plus the organism name.
204  virtual string GetKey(const string& orig_val, const COrg_ref& org) const = 0;
205 
206  // Check indicates whether this Org-ref should be examined or ignored.
207  // strain values are ignored for some values of lineage or taxname
     // The default implementation accepts every Org-ref (returns true).
208  virtual bool Check(const COrg_ref& /*org*/) const { return true; }
209 
210  // used to add items to be looked up, when appropriate for this
211  // descriptor or feature
212  void AddDesc(CConstRef<CSeqdesc> desc, CConstRef<CSeq_entry> ctx);
213  void AddFeat(CConstRef<CSeq_feat> feat);
214  void AddOrg(const COrg_ref& org);
215 
216  // add an item to be looked up independently of a feature or descriptor
217  void AddString(const string& val);
218 
219  // GetRequestList returns a list of Org-refs to be sent to taxonomy.
220  // Note that the number of requests may be greater than the number of
221  // values being checked.
222  vector<CRef<COrg_ref> > GetRequestList();
223 
224  // It is the responsibility of the calling program to chunk the request
225  // list and pass the input and reply to the map until all requests
226  // have responses
     // descTaxID defaults to ZERO_TAX_ID. The returned string is presumably an
     // error message that is empty on success -- confirm in the .cpp.
227  string IncrementalUpdate(const vector<CRef<COrg_ref> >& input, const CTaxon3_reply& reply, TTaxId descTaxID = ZERO_TAX_ID);
228 
229  // Indicates whether the map is waiting for more responses
230  bool IsUpdateComplete() const;
231 
232  // Posts errors to the validator based on responses
233  void PostErrors(CValidError_imp& imp);
234 
     // Appends the problems found by this map's requests to errs.
235  virtual void ListErrors(vector<TTaxError>& errs) const;
236 
237  // Applies the change to an Org-ref. Note that there might be multiple
238  // qualifiers of the same subtype on the Org-ref, and we need to be sure
239  // to apply the change to the correct qualifier
240  virtual bool ApplyToOrg(COrg_ref& org) const = 0;
241 
242 protected:
     // NOTE(review): listing lines 243 and 245-247 are absent from this
     // extract; the declarations of TQualifierRequests, m_Subtype and
     // m_Populated used above are presumably among them.
244 
248 
     // Looks up the request associated with val and returns an iterator into
     // the request map; presumably end() when there is none -- confirm.
249  TQualifierRequests::iterator x_FindRequest(const string& val);
250 
251  // x_MakeNewRequest creates a new CQualifierRequest object for the given pair of orig_val and org
252  virtual CRef<CQualifierRequest> x_MakeNewRequest(const string& orig_val, const COrg_ref& org) = 0;
253 };
254 
255 
257 {
     // Lookup map for specific-host (nat_host) qualifiers.
     // NOTE(review): the class declaration line (listing line 256) is absent
     // from this extract.
258 public:
     // Configures the base map for the nat_host OrgMod subtype.
259  CSpecificHostMap() : CQualLookupMap(COrgMod::eSubtype_nat_host) {}
260 // ~CSpecificHostMap() override {}
261 
     // The key is the original qualifier value, unchanged; the Org-ref is unused.
262  string GetKey(const string& orig_val, const COrg_ref& /*org*/) const override { return orig_val; }
     // This map never modifies an Org-ref: it always returns false.
263  bool ApplyToOrg(COrg_ref& /*org*/) const override { return false; }
264 
265 protected:
     // Creates the request object for one (value, Org-ref) pair.
266  CRef<CQualifierRequest> x_MakeNewRequest(const string& orig_val, const COrg_ref& org) override;
267 };
268 
270 {
     // Lookup map for specific-host (nat_host) qualifiers, used when fixing.
     // NOTE(review): the class declaration line (listing line 269) is absent
     // from this extract.
271 public:
     // Configures the base map for the nat_host OrgMod subtype.
272  CSpecificHostMapForFix() : CQualLookupMap(COrgMod::eSubtype_nat_host) {}
273 // ~CSpecificHostMapForFix() override {}
274 
     // The key is the original value after the default specific-host
     // adjustments have been applied; the Org-ref is unused.
275  string GetKey(const string& orig_val, const COrg_ref& /*org*/) const override { return x_DefaultSpecificHostAdjustments(orig_val); }
     // Applies the fix to org; presumably returns true when org was changed --
     // confirm in the .cpp.
276  bool ApplyToOrg(COrg_ref& org) const override;
277 
278 protected:
     // Returns host_val with the default adjustments applied; the adjustments
     // themselves are defined in the .cpp.
279  static string x_DefaultSpecificHostAdjustments(const string& host_val);
     // Creates the request object for one (value, Org-ref) pair.
280  CRef<CQualifierRequest> x_MakeNewRequest(const string& orig_val, const COrg_ref& org) override;
281 };
282 
283 
285 {
     // Lookup map for strain qualifiers.
     // NOTE(review): the class declaration line (listing line 284) is absent
     // from this extract.
286 public:
     // Configures the base map for the strain OrgMod subtype.
287  CStrainMap() : CQualLookupMap(COrgMod::eSubtype_strain) {}
288 // ~CStrainMap() override {}
289 
     // The key is built by CStrainRequest::MakeKey from the strain value and
     // the Org-ref's taxname, or kEmptyStr when no taxname is set.
290  string GetKey(const string& orig_val, const COrg_ref& org) const override { return CStrainRequest::MakeKey(orig_val, org.IsSetTaxname() ? org.GetTaxname() : kEmptyStr); }
     // Delegates the examine-or-ignore decision to CStrainRequest::Check.
291  bool Check(const COrg_ref& org) const override { return CStrainRequest::Check(org); }
     // This map never modifies an Org-ref: it always returns false.
292  bool ApplyToOrg(COrg_ref& /*org*/) const override { return false; }
293 
294 protected:
     // Creates the request object for one (value, Org-ref) pair.
295  CRef<CQualifierRequest> x_MakeNewRequest(const string& orig_val, const COrg_ref& org) override;
296 };
297 
299 
300 // This class handles complete org-ref lookups, specific-host lookups,
301 // and strain lookups.
302 // These activities are bundled together in order to avoid doing a scan
303 // of the record looking for source features and source descriptors
304 // multiple times.
306 {
     // NOTE(review): the class declaration line (listing line 305) and listing
     // lines 308-310 (presumably the constructors/destructor) are absent from
     // this extract.
307 public:
311 
     // Prepares this object for the given Seq-entry. Presumably gathers the
     // source descriptors and features via x_GatherSources -- confirm.
312  void Init(const CSeq_entry& se);
313 
314  // for complete Org-ref validation/replacement
315  vector< CRef<COrg_ref> > GetTaxonomyLookupRequest() const;
316  void ListTaxLookupErrors(const CT3Reply& reply, const COrg_ref& org, CBioSource::TGenome genome, bool is_insd_patent, bool is_wp, vector<TTaxError>& errs) const;
317  void ReportTaxLookupErrors(const CTaxon3_reply& reply, CValidError_imp& imp, bool is_insd_patent) const;
318  void ReportIncrementalTaxLookupErrors(const CTaxon3_reply& reply, CValidError_imp& imp, bool is_insd_patent, size_t offset) const;
     // NOTE(review): org_refs is taken by value (a copy of the vector of
     // CRefs); error_message is an output parameter.
319  bool AdjustOrgRefsWithTaxLookupReply(const CTaxon3_reply& reply,
320 vector<CRef<COrg_ref> > org_refs,
321 string& error_message,
322 bool use_error_orgrefs = false) const;
323 
324  // for specific host validation/replacement
325  vector<CRef<COrg_ref> > GetSpecificHostLookupRequest(bool for_fix);
326 
327  string IncrementalSpecificHostMapUpdate(const vector<CRef<COrg_ref> >& input, const CTaxon3_reply& reply);
328  bool IsSpecificHostMapUpdateComplete() const;
329  void ReportSpecificHostErrors(const CTaxon3_reply& reply, CValidError_imp& imp);
330  void ReportSpecificHostErrors(CValidError_imp& imp);
331  bool AdjustOrgRefsWithSpecificHostReply(
332 vector<CRef<COrg_ref>> requests,
333 const CTaxon3_reply& reply,
334 vector<CRef<COrg_ref>> org_refs);
335  bool AdjustOrgRefsForSpecificHosts(vector<CRef<COrg_ref> > org_refs);
336 
337  // for strain validation
338  vector<CRef<COrg_ref> > GetStrainLookupRequest();
     // descTaxID defaults to ZERO_TAX_ID.
339  string IncrementalStrainMapUpdate(const vector<CRef<COrg_ref> >& input, const CTaxon3_reply& reply, TTaxId descTaxID = ZERO_TAX_ID);
340  bool IsStrainMapUpdateComplete() const;
341  void ReportStrainErrors(CValidError_imp& imp);
342 
343  // Used when reporting a problem contacting the taxonomy service
344  CConstRef<CSeq_entry> GetTopReportObject() const;
345 
346  // Genome Workbench uses these methods to update individual descriptors and features
     // Counts of the stored source descriptors and source features.
347  size_t NumDescs() const { return m_SrcDescs.size(); }
348  size_t NumFeats() const { return m_SrcFeats.size(); }
349 
     // Unchecked indexed access: num must be less than NumDescs() / NumFeats()
     // respectively, since the vectors are indexed with operator[].
350  CConstRef<CSeqdesc> GetDesc(size_t num) const { return m_SrcDescs[num]; }
351  CConstRef<CSeq_feat> GetFeat(size_t num) const { return m_SrcFeats[num]; }
352  CConstRef<CSeq_entry> GetSeqContext(size_t num) const;
353 
354  bool DoTaxonomyUpdate(CSeq_entry_Handle seh, bool with_host);
355 
     // val is an in/out parameter for the fix; err_msg is an output parameter
     // for the validity check.
356  void FixOneSpecificHost(string& val);
357  bool IsOneSpecificHostValid(const string& val, string& err_msg);
358 
359  void CheckOneOrg(const COrg_ref& org, CBioSource::TGenome genome, CValidError_imp& imp);
360 
     // NOTE(review): listing line 361 is absent from this extract.
362 
363 protected:
364  void x_InterpretTaxonomyError(const CT3Error& error, const COrg_ref& org, const EErrType type, vector<TTaxError>& errs) const;
365  void x_GatherSources(const CSeq_entry& se);
366  void x_CreateSpecificHostMap(bool for_fix);
367  void x_UpdateSpecificHostMapWithReply(const CTaxon3_reply& reply, string& error_message);
368  bool x_ApplySpecificHostMap(COrg_ref& org_ref) const;
369  static string x_DefaultSpecificHostAdjustments(const string& host_val);
370  TSpecificHostRequests::iterator x_FindHostFixRequest(const string& val);
371 
372  void x_CreateStrainMap();
373  void x_CreateQualifierMap(CQualLookupMap& lookup);
374 
     // Clears all three qualifier lookup maps.
     // NOTE(review): the declarations of m_HostMap, m_HostMapForFix and
     // m_StrainMap are on listing lines absent from this extract (387-389).
375  void x_ClearMaps() { m_HostMap.Clear(); m_HostMapForFix.Clear(); m_StrainMap.Clear(); }
376 
     // Source descriptors, the Seq-entry contexts for descriptors, and source
     // features. Presumably m_DescCtxs is parallel to m_SrcDescs (same index)
     // -- confirm against GetSeqContext.
377  vector<CConstRef<CSeqdesc>> m_SrcDescs;
378  vector<CConstRef<CSeq_entry>> m_DescCtxs;
379  vector<CConstRef<CSeq_feat>> m_SrcFeats;
380 
     // NOTE(review): listing line 381 is absent from this extract.
     // State flags, all initialised to false.
382  bool m_SpecificHostRequestsBuilt{ false };
383  bool m_SpecificHostRequestsUpdated{ false };
384 
385  bool m_StrainRequestsBuilt{ false };
386 
390 
     // Owned taxonomy service interface, and an optional taxonomy update
     // callback that defaults to nullptr.
391  unique_ptr<ITaxon3> m_taxon3;
392  taxupdate_func_t m_tax_func = nullptr;
393 };
394 
395 
396 END_SCOPE(validator)
399 
400 #endif /* VALIDATOR___TAX_VALIDATION_AND_CLEANUP__HPP */
User-defined methods of the data storage class.
User-defined methods of the data storage class.
EErrType
static int lookup(const char *name, const struct lookup_int *table)
Definition: attributes.c:50
#define false
Definition: bool.h:36
CObject –.
Definition: ncbiobj.hpp:180
@OrgMod.hpp User-defined methods of the data storage class.
Definition: OrgMod.hpp:54
virtual bool ApplyToOrg(COrg_ref &org) const =0
virtual CRef< CQualifierRequest > x_MakeNewRequest(const string &orig_val, const COrg_ref &org)=0
CQualLookupMap(COrgMod::ESubtype subtype)
virtual bool Check(const COrg_ref &) const
virtual string GetKey(const string &orig_val, const COrg_ref &org) const =0
map< string, CRef< CQualifierRequest > > TQualifierRequests
pair< CConstRef< CSeqdesc >, CConstRef< CSeq_entry > > TDescPair
virtual void AddReply(const CT3Reply &reply, TTaxId descTaxID)=0
virtual void ListErrors(vector< TTaxError > &errs) const =0
vector< CConstRef< CSeq_feat > > m_Feats
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
string GetKey(const string &orig_val, const COrg_ref &) const override
bool ApplyToOrg(COrg_ref &) const override
string GetKey(const string &orig_val, const COrg_ref &) const override
bool ApplyToOrg(COrg_ref &) const override
string GetKey(const string &orig_val, const COrg_ref &org) const override
bool Check(const COrg_ref &org) const override
static bool Check(const COrg_ref &org)
static string MakeKey(const string &strain, const string &taxname)
CT3Reply –.
Definition: T3Reply.hpp:66
CConstRef< CSeq_feat > GetFeat(size_t num) const
vector< CConstRef< CSeq_feat > > m_SrcFeats
vector< CConstRef< CSeq_entry > > m_DescCtxs
TSpecificHostRequests m_SpecificHostRequests
CSpecificHostMapForFix m_HostMapForFix
vector< CConstRef< CSeqdesc > > m_SrcDescs
CConstRef< CSeqdesc > GetDesc(size_t num) const
void x_CreateSpecificHostMap(bool for_fix)
CTaxon3_reply –.
Definition: map.hpp:338
static string MakeKey(CScoreValue const &)
Include a standard set of the NCBI C++ Toolkit most basic headers.
CS_CONTEXT * ctx
Definition: t0006.c:12
static void Init(void)
Definition: cursor6.c:76
#define ZERO_TAX_ID
Definition: ncbimisc.hpp:1115
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
EDiagSev
Severity level for the posted diagnostics.
Definition: ncbidiag.hpp:650
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define kEmptyStr
Definition: ncbistr.hpp:123
#define NCBI_VALIDATOR_EXPORT
Definition: ncbi_export.h:913
const TTaxname & GetTaxname(void) const
Get the Taxname member data.
Definition: Org_ref_.hpp:372
bool IsSetTaxname(void) const
preferred formal name Check if a value has been assigned to Taxname data member.
Definition: Org_ref_.hpp:360
function< CRef< CTaxon3_reply >(const vector< CRef< COrg_ref > > &list)> taxupdate_func_t
Definition: itaxon3.hpp:60
static int input()
void Check(const string &value)
int offset
Definition: replacements.h:160
static const char * str(char *buf, int n)
Definition: stats.c:84
Definition: type.c:6
map< string, CSpecificHostRequest > TSpecificHostRequests
CRef< objects::CSeq_annot > AddFeat(CRef< objects::CSeq_feat > feat, CRef< objects::CSeq_entry > entry)
Modified on Mon Mar 04 05:12:42 2024 by modify_doxy.py rev. 669887