NCBI C++ ToolKit
validator_barcode.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: validator_barcode.cpp 84722 2018-12-04 12:05:14Z bollin $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Jonathan Kans, Clifford Clausen, Aaron Ucko, Mati Shomrat, ....
27  *
28  * File Description:
29  * Implementation of private parts of the validator
30  * .......
31  *
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistr.hpp>
37 
38 #include <corelib/ncbiexec.hpp>
44 #include <objmgr/feat_ci.hpp>
45 #include <objmgr/seqdesc_ci.hpp>
46 #include <objmgr/util/sequence.hpp>
47 
48 
51 BEGIN_SCOPE(validator)
52 //using namespace sequence;
53 
54 
55 
57 {
58  string accession;
59  string local;
60  string label;
61  for (CBioseq_Handle::TId::const_iterator it = bsh.GetId().begin(); it != bsh.GetId().end(); ++it)
62  {
63  const CSeq_id &id = *(it->GetSeqId());
64  if (id.IsGenbank() && id.GetGenbank().IsSetAccession())
65  accession = id.GetGenbank().GetAccession();
66  if (id.IsLocal() && id.GetLocal().IsStr())
67  local = id.GetLocal().GetStr();
68  }
69  if (!accession.empty())
70  return accession;
71  if (!local.empty())
72  return local;
73 
74  bsh.GetBioseqCore()->GetLabel(&label, CBioseq::eContent);
75  return label;
76 }
77 
79 {
80  string barcode;
81  string local;
82  for (CBioseq_Handle::TId::const_iterator it = bsh.GetId().begin(); it != bsh.GetId().end(); ++it)
83  {
84  const CSeq_id &id = *(it->GetSeqId());
85  if (id.IsGeneral() && id.GetGeneral().IsSetDb() && NStr::EqualNocase(id.GetGeneral().GetDb(), "uoguelph"))
86  {
87  id.GetLabel(&barcode, CSeq_id::eContent);
88  NStr::ReplaceInPlace(barcode, "uoguelph:", kEmptyStr);
89  }
90  if (id.IsLocal())
91  {
92  id.GetLabel(&local, CSeq_id::eContent);
93  }
94  }
95  if (!barcode.empty())
96  return barcode;
97  if (!local.empty())
98  return local;
99 
100  return "NO";
101 }
102 
103 
105 {
106  bool found_rbcl(false);
107  bool found_matk(false);
108 
109  for (CFeat_CI feat_it(bsh, CSeqFeatData::eSubtype_gene); feat_it; ++feat_it)
110  {
111  const CSeq_feat& gene = feat_it->GetOriginalFeature();
112  if (gene.IsSetData() && gene.GetData().IsGene() && gene.GetData().GetGene().IsSetLocus())
113  {
114  if (NStr::EqualNocase(gene.GetData().GetGene().GetLocus(), "rbcL"))
115  found_rbcl = true;
116  if (NStr::EqualNocase(gene.GetData().GetGene().GetLocus(), "matK"))
117  found_matk = true;
118  }
119  }
120  TSeqPos length = bsh.GetBioseqLength();
121  bool rval(false);
122 
123  if (found_matk)
124  {
125  if (length < 585)
126  {
127  rval = true;
128  }
129  } else if (found_rbcl)
130  {
131  if (length < 414)
132  {
133  rval = true;
134  }
135  } else if (length < 500)
136  {
137  rval = true;
138  }
139  return rval;
140 }
141 
143 {
144  bool forward(false);
145  bool reverse(false);
146  for (CSeqdesc_CI source_ci(bsh, CSeqdesc::e_Source); source_ci; ++source_ci)
147  {
148  if (source_ci->GetSource().IsSetPcr_primers())
149  {
150  FOR_EACH_PCRREACTION_IN_PCRREACTIONSET(reaction, source_ci->GetSource().GetPcr_primers())
151  {
152  if ((*reaction)->IsSetForward() && (*reaction)->GetForward().IsSet() && !(*reaction)->GetForward().Get().empty())
153  forward = true;
154  if ((*reaction)->IsSetReverse() && (*reaction)->GetReverse().IsSet() && !(*reaction)->GetReverse().Get().empty())
155  reverse = true;
156  }
157  }
158  }
159  return !(forward && reverse);
160 }
161 
163 {
164  bool country(false);
165  for (CSeqdesc_CI source_ci(bsh, CSeqdesc::e_Source); source_ci; ++source_ci)
166  {
167  FOR_EACH_SUBSOURCE_ON_BIOSOURCE(subsource, source_ci->GetSource())
168  {
169  if ((*subsource)->IsSetSubtype() && (*subsource)->GetSubtype() == CSubSource::eSubtype_country)
170  {
171  country = true;
172  }
173  }
174  }
175  return !country;
176 }
177 
179 {
180  bool rval(false);
181  for (CSeqdesc_CI source_ci(bsh, CSeqdesc::e_Source); source_ci; ++source_ci)
182  {
183  FOR_EACH_ORGMOD_ON_BIOSOURCE(orgmod, source_ci->GetSource())
184  {
185  if ((*orgmod)->IsSetSubtype() && (
186  (*orgmod)->GetSubtype() == COrgMod::eSubtype_specimen_voucher ||
187  (*orgmod)->GetSubtype() == COrgMod::eSubtype_bio_material ||
188  (*orgmod)->GetSubtype() == COrgMod::eSubtype_culture_collection))
189  {
190  rval = true;
191  }
192  }
193  }
194  return !rval;
195 }
196 
198 {
199  bool rval(false);
200  for (CSeqdesc_CI source_ci(bsh, CSeqdesc::e_Source); source_ci; ++source_ci)
201  {
202  FOR_EACH_ORGMOD_ON_BIOSOURCE(orgmod, source_ci->GetSource())
203  {
204  if ((*orgmod)->IsSetSubtype() && (
205  (*orgmod)->GetSubtype() == COrgMod::eSubtype_specimen_voucher ||
206  (*orgmod)->GetSubtype() == COrgMod::eSubtype_bio_material ||
207  (*orgmod)->GetSubtype() == COrgMod::eSubtype_culture_collection)
208  && (*orgmod)->IsSetSubname())
209  {
210  string subname = (*orgmod)->GetSubname();
211  if (NStr::Find(subname, ":") != NPOS)
212  rval = true;
213  }
214  }
215  }
216  return rval;
217 }
218 
220 {
221  TSeqPos num_n = 0;
222  CBioseqGaps_CI::Params params;
223  params.max_gap_len_to_ignore = 0;
224  for (CBioseqGaps_CI gap_it(bsh.GetSeq_entry_Handle(), params); gap_it; ++gap_it)
225  {
226  num_n += gap_it->length;
227  }
228  double p = double(100 * num_n) / bsh.GetBioseqLength();
229  string percent;
230  if (p > 1.)
231  percent = NStr::DoubleToString(p, 1);
232  return percent;
233 }
234 
236 {
237  bool has_date(false);
238  for (CSeqdesc_CI source_ci(bsh, CSeqdesc::e_Source); source_ci; ++source_ci)
239  {
240  FOR_EACH_SUBSOURCE_ON_BIOSOURCE(subsource, source_ci->GetSource())
241  {
242  if ((*subsource)->IsSetSubtype() && (*subsource)->GetSubtype() == CSubSource::eSubtype_collection_date && (*subsource)->IsSetName())
243  {
244  const string &date = (*subsource)->GetName();
245  bool bad_format(false);
246  bool in_future(false);
247  CSubSource::IsCorrectDateFormat(date, bad_format, in_future);
248  if (!bad_format && !in_future)
249  {
250  has_date = true;
251  }
252  }
253  }
254  }
255  return !has_date;
256 }
257 
259 {
260  bool has_order(false);
261  bool has_ibol(false);
262  for (CSeqdesc_CI desc_ci(bsh, CSeqdesc::e_User); desc_ci; ++desc_ci)
263  {
264  if (desc_ci->GetUser().IsSetType() && desc_ci->GetUser().GetType().IsStr() && NStr::EqualNocase(desc_ci->GetUser().GetType().GetStr(), "StructuredComment"))
265  {
266  bool is_ibol(false);
267  if (desc_ci->GetUser().HasField("StructuredCommentPrefix"))
268  {
269  const CUser_field& field = desc_ci->GetUser().GetField("StructuredCommentPrefix");
270  if (field.IsSetData() && field.GetData().IsStr() && NStr::EqualNocase(field.GetData().GetStr(), "##International Barcode of Life (iBOL)Data-START##"))
271  {
272  is_ibol = true;
273  has_ibol = true;
274  }
275  }
276  if (is_ibol && desc_ci->GetUser().HasField("Order Assignment"))
277  {
278  const CUser_field& field = desc_ci->GetUser().GetField("Order Assignment");
279  if (field.IsSetData() && field.GetData().IsStr() && !field.GetData().GetStr().empty())
280  {
281  has_order = true;
282  }
283  }
284  }
285  }
286  return has_ibol && !has_order;
287 }
288 
290 {
291  bool low_trace(true);
292  for (CSeqdesc_CI desc_ci(bsh, CSeqdesc::e_User); desc_ci; ++desc_ci)
293  {
294  if (desc_ci->GetUser().IsSetType() && desc_ci->GetUser().GetType().IsStr() && NStr::EqualNocase(desc_ci->GetUser().GetType().GetStr(), "Submission"))
295  {
296  if (desc_ci->GetUser().HasField("AdditionalComment"))
297  {
298  const CUser_field& field = desc_ci->GetUser().GetField("AdditionalComment");
299  if (field.IsSetData() && field.GetData().IsStr() && NStr::StartsWith(field.GetData().GetStr(), "Traces: "))
300  {
301  string str = field.GetData().GetStr();
302  NStr::ReplaceInPlace(str, "Traces: ", kEmptyStr);
304  if (traces >= 2)
305  low_trace = false;
306  }
307  }
308  }
309  }
310  return low_trace;
311 }
312 
313 
315 {
316  if (b.length || b.primers || b.country || b.voucher || !b.structured_voucher || // NO b.structured_voucher or b.has_keyword
317  !b.percent_n.empty() || b.collection_date || b.order_assignment || b.low_trace || b.frame_shift)
318  {
319  return true;
320  } else {
321  return false;
322  }
323 }
324 
325 
327 {
328  bool is_ibol(false);
329  for (CSeqdesc_CI desc_ci(bsh, CSeqdesc::e_User); desc_ci; ++desc_ci)
330  {
331  if (desc_ci->GetUser().IsSetType() && desc_ci->GetUser().GetType().IsStr() && NStr::EqualNocase(desc_ci->GetUser().GetType().GetStr(), "StructuredComment"))
332  {
333  if (desc_ci->GetUser().HasField("StructuredCommentPrefix"))
334  {
335  const CUser_field& field = desc_ci->GetUser().GetField("StructuredCommentPrefix");
336  if (field.IsSetData() && field.GetData().IsStr() && NStr::EqualNocase(field.GetData().GetStr(), "##International Barcode of Life (iBOL)Data-START##"))
337  {
338  is_ibol = true;
339  }
340  }
341  }
342  }
343 
344  bool frame_shift(false);
345  for (CSeqdesc_CI desc_ci(bsh, CSeqdesc::e_User); desc_ci; ++desc_ci)
346  {
347  if (desc_ci->GetUser().IsSetType() && desc_ci->GetUser().GetType().IsStr() && NStr::EqualNocase(desc_ci->GetUser().GetType().GetStr(), "multalin"))
348  {
349  if (desc_ci->GetUser().HasField("frameshift-nuc"))
350  {
351  const CUser_field& field = desc_ci->GetUser().GetField("frameshift-nuc");
352  if (field.IsSetData() && field.GetData().IsStr() && NStr::EqualNocase(field.GetData().GetStr(), "fail"))
353  {
354  frame_shift = true;
355  }
356  }
357  }
358  }
359 
360  return is_ibol && frame_shift;
361 }
362 
364 {
365  bool has_keyword(false);
366  for (CSeqdesc_CI desc_ci(bsh, CSeqdesc::e_Genbank); desc_ci; ++desc_ci)
367  {
368  FOR_EACH_KEYWORD_ON_GENBANKBLOCK(qual_it, desc_ci->GetGenbank())
369  {
370  const string &keyword = *qual_it;
371  if (NStr::EqualNocase(keyword, "BARCODE"))
372  has_keyword = true;
373  }
374  }
375  return has_keyword;
376 }
377 
379 {
380  bool res(false);
381  for (CSeqdesc_CI desc_it(bsh, CSeqdesc::e_Molinfo); desc_it; ++desc_it)
382  {
383  if (desc_it->GetMolinfo().IsSetTech() && desc_it->GetMolinfo().GetTech() == CMolInfo::eTech_barcode)
384  {
385  res = true;
386  }
387  }
388  return res;
389 }
390 
391 
393 {
394  b.bsh = bsh;
395  b.barcode = GetBarcodeId(bsh);
396  b.genbank = GetSeqTitle(bsh);
397  b.length = GetIsLength(bsh);
398  b.primers = GetIsPrimers(bsh);
399  b.country = GetIsCountry(bsh);
400  b.voucher = GetIsVoucher(bsh);
401  b.structured_voucher = GetIsStructuredVoucher(bsh);
402  b.percent_n = GetPercentN(bsh);
403  b.collection_date = GetHasCollectionDate(bsh);
404  b.order_assignment = GetHasOrderAssignment(bsh);
405  b.low_trace = GetLowTrace(bsh);
406  b.frame_shift = GetHasFrameShift(bsh);
407  b.has_keyword = GetHasKeyword(bsh);
408 }
409 
410 
412 {
413  TBarcodeResults BarcodeFailures;
414 
415  objects::CBioseq_CI b_iter(seh, objects::CSeq_inst::eMol_na);
416  for (; b_iter; ++b_iter)
417  {
418  if (IsTechBarcode(*b_iter)) {
419  SBarcode b;
420  BarcodeTestBioseq(*b_iter, b);
421  if (BarcodeTestFails(b)) {
422  BarcodeFailures.push_back(b);
423  }
424  }
425  }
426 
427  return BarcodeFailures;
428 }
429 
430 
431 END_SCOPE(validator)
This iterates over the runs of Ns of each sequence.
Definition: sequence.hpp:1573
CBioseq_Handle –.
@ eContent
Definition: Bioseq.hpp:103
CFeat_CI –.
Definition: feat_ci.hpp:64
CSeq_entry_Handle –.
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
static void IsCorrectDateFormat(const string &date_string, bool &bad_format, bool &in_future)
Definition: SubSource.cpp:454
const CUser_field & GetField(const string &str, const string &delim=".", NStr::ECase use_case=NStr::eCase) const
Access a named field in this user field.
Definition: User_field.cpp:211
Include a standard set of the NCBI C++ Toolkit most basic headers.
The NCBI C++ standard methods for dealing with std::string.
static const char * str(char *buf, int n)
Definition: stats.c:84
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
string
Definition: cgiapp.hpp:687
void GetLabel(string *label, ELabelType type=eDefault, TLabelFlags flags=fLabel_Default) const
Append a label for this Seq-id to the supplied string.
Definition: Seq_id.cpp:2040
@ eContent
Untagged human-readable accession or the like.
Definition: Seq_id.hpp:605
TSeqPos max_gap_len_to_ignore
We completely ignore any gaps we find that have this number of bases or fewer.
Definition: sequence.hpp:1589
TSeqPos GetBioseqLength(void) const
CSeq_entry_Handle GetSeq_entry_Handle(void) const
Get parent Seq-entry handle.
const TId & GetId(void) const
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static string DoubleToString(double value, int precision=-1, TNumToStringFlags flags=0)
Convert double to string.
Definition: ncbistr.hpp:5187
#define kEmptyStr
Definition: ncbistr.hpp:123
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
#define NPOS
Definition: ncbistr.hpp:133
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2891
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5353
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3405
@ fAllowTrailingSpaces
Ignore trailing space characters.
Definition: ncbistr.hpp:297
@ fConvErr_NoThrow
Do not throw an exception on error.
Definition: ncbistr.hpp:285
@ fAllowLeadingSpaces
Ignore leading spaces in converted string.
Definition: ncbistr.hpp:294
static const char label[]
@ eSubtype_collection_date
DD-MMM-YYYY format.
Definition: SubSource_.hpp:114
bool IsSetLocus(void) const
Official gene symbol Check if a value has been assigned to Locus data member.
Definition: Gene_ref_.hpp:493
const TLocus & GetLocus(void) const
Get the Locus member data.
Definition: Gene_ref_.hpp:505
const TStr & GetStr(void) const
Get the variant data.
const TData & GetData(void) const
Get the Data member data.
bool IsStr(void) const
Check if variant Str is selected.
bool IsSetData(void) const
Check if a value has been assigned to Data data member.
@ eSubtype_specimen_voucher
Definition: OrgMod_.hpp:106
@ eSubtype_bio_material
Definition: OrgMod_.hpp:119
@ eSubtype_culture_collection
Definition: OrgMod_.hpp:118
bool IsSetData(void) const
the specific data Check if a value has been assigned to Data data member.
Definition: Seq_feat_.hpp:913
bool IsGene(void) const
Check if variant Gene is selected.
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
const TGene & GetGene(void) const
Get the variant data.
@ eTech_barcode
barcode of life project
Definition: MolInfo_.hpp:144
@ e_User
user defined object
Definition: Seqdesc_.hpp:124
@ e_Genbank
GenBank specific info.
Definition: Seqdesc_.hpp:121
@ e_Molinfo
info on the molecule and techniques
Definition: Seqdesc_.hpp:134
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
static char * subname
Definition: mdb_load.c:26
Defines a portable execute class.
The Object manager core.
#define FOR_EACH_KEYWORD_ON_GENBANKBLOCK(Itr, Var)
FOR_EACH_KEYWORD_ON_GENBANKBLOCK EDIT_EACH_KEYWORD_ON_GENBANKBLOCK.
#define FOR_EACH_PCRREACTION_IN_PCRREACTIONSET(Itr, Var)
FOR_EACH_PCRREACTION_IN_PCRREACTIONSET.
#define FOR_EACH_ORGMOD_ON_BIOSOURCE(Itr, Var)
FOR_EACH_ORGMOD_ON_BIOSOURCE EDIT_EACH_ORGMOD_ON_BIOSOURCE.
#define FOR_EACH_SUBSOURCE_ON_BIOSOURCE(Itr, Var)
FOR_EACH_SUBSOURCE_ON_BIOSOURCE EDIT_EACH_SUBSOURCE_ON_BIOSOURCE.
The params that control the behavior of CBioseqGaps_CI.
Definition: sequence.hpp:1577
bool GetIsPrimers(CBioseq_Handle bsh)
bool BarcodeTestFails(const SBarcode &b)
bool GetLowTrace(CBioseq_Handle bsh)
string GetPercentN(CBioseq_Handle bsh)
bool GetHasCollectionDate(CBioseq_Handle bsh)
string GetSeqTitle(CBioseq_Handle bsh)
bool IsTechBarcode(CBioseq_Handle bsh)
bool GetIsVoucher(CBioseq_Handle bsh)
bool GetHasOrderAssignment(CBioseq_Handle bsh)
TBarcodeResults GetBarcodeValues(CSeq_entry_Handle seh)
bool GetHasFrameShift(CBioseq_Handle bsh)
void BarcodeTestBioseq(CBioseq_Handle bsh, SBarcode &b)
bool GetIsStructuredVoucher(CBioseq_Handle bsh)
string GetBarcodeId(CBioseq_Handle bsh)
bool GetHasKeyword(CBioseq_Handle bsh)
bool GetIsCountry(CBioseq_Handle bsh)
bool GetIsLength(CBioseq_Handle bsh)
vector< SBarcode > TBarcodeResults
#define local
Definition: zutil.h:33
Modified on Sat Apr 20 12:20:50 2024 by modify_doxy.py rev. 669887