NCBI C++ ToolKit
validerror_annot.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: validerror_annot.cpp 100232 2023-07-11 14:09:56Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Jonathan Kans, Clifford Clausen, Aaron Ucko......
27  *
28  * File Description:
29  * validation of seq_annot
30  * .......
31  *
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
37 
40 
44 
45 #include <objmgr/bioseq_ci.hpp>
47 
48 
51 BEGIN_SCOPE(validator)
52 
53 
55  CValidError_base(imp),
56  m_GraphValidator(imp),
57  m_AlignValidator(imp),
58  m_FeatValidator(imp)
59 {
60 }
61 
62 
64 {
65 }
66 
67 
69 {
71 }
72 
73 
75 {
76  if (annot.IsAlign()) {
77  if (annot.IsSetDesc()) {
78  for (const CRef<CAnnotdesc>& iter : annot.GetDesc().Get()) {
79  if (iter->IsUser()) {
80  const CObject_id& oid = iter->GetUser().GetType();
81  if (oid.IsStr()) {
82  if (oid.GetStr() == "Blast Type") {
84  "Record contains BLAST alignments", annot); // !!!
85  break;
86  }
87  }
88  }
89  }
90  }
92  int order = 1;
93  for (const auto& align : annot.GetData().GetAlign()) {
94  m_AlignValidator.ValidateSeqAlign(*align, order++);
95  }
96  }
97  } else if (annot.IsIds()) {
99  "Record contains Seq-annot.data.ids", annot);
100  } else if (annot.IsLocs()) {
102  "Record contains Seq-annot.data.locs", annot);
103  } else if (annot.IsGraph()) {
104  for (const auto& graph : annot.GetData().GetGraph()) {
106  }
107  } else if (annot.IsFtable()) {
108  CSeq_entry_Handle appropriate_parent;
109  if (m_Imp.ShouldSubdivide() && m_Scope) {
111  if (ah) {
113  if (seh) {
114  appropriate_parent = GetAppropriateXrefParent(seh);
115  }
116  }
117  }
118  if (appropriate_parent) {
119  CRef<CScope> tmp_scope(new CScope(*(CObjectManager::GetInstance())));
120  tmp_scope->AddDefaults();
121  CSeq_entry_Handle this_seh = tmp_scope->AddTopLevelSeqEntry(*(appropriate_parent.GetCompleteSeq_entry()));
122  m_FeatValidator.SetScope(*tmp_scope);
123  m_FeatValidator.SetTSE(this_seh);
124  for (const auto& feat : annot.GetData().GetFtable()) {
126  }
129  } else {
132  for (const auto& feat : annot.GetData().GetFtable()) {
134  }
135  }
136  }
137 }
138 
139 
141 {
142  if (annot.IsGraph()) {
143  for (const auto& graph : annot.GetData().GetGraph()) {
145  }
146  } else if (annot.IsFtable()) {
147  for (const auto& feat_it : annot.GetData().GetFtable()) {
148  string label = seq.GetId().front()->AsFastaString();
149  ReportLocationGI0(*feat_it, label);
150  if (! feat_it->IsSetLocation() || IsLocationUnindexed(feat_it->GetLocation())) {
152  "Feature is not indexed on Bioseq " + label, *feat_it);
153  } else {
154  // check feature packaging
155  // a feature packaged on a bioseq should have at least one location on the bioseq
156  bool found = false;
157  for (CSeq_loc_CI loc_it(feat_it->GetLocation()); loc_it; ++loc_it) {
158  const CSeq_id& id = loc_it.GetSeq_id();
159  if (seq.IsSetId()) {
160  for (const auto& id_it : seq.GetId()) {
161  if (id.Compare(*id_it) == CSeq_id::e_YES) {
162  found = true;
163  break;
164  }
165  }
166  }
167  if (! found && seq.GetInst().GetRepr() == CSeq_inst::eRepr_seg) {
168  const CBioseq_Handle& part =
170  m_Scope,
171  loc_it.GetEmbeddingSeq_loc(),
172  m_Imp.GetTSE_Handle());
173  if (part) {
174  CSeq_entry_Handle parent = part.GetParentEntry();
175  if (parent && parent.IsSeq()) {
176  parent = parent.GetParentEntry();
177  if (parent && parent.IsSet() && parent.GetSet().GetClass() == CBioseq_set::eClass_parts) {
178  parent = parent.GetParentEntry();
179  if (parent && parent.IsSet() && parent.GetSet().GetClass() == CBioseq_set::eClass_segset) {
180  CBioseq_CI bi(parent);
181  if (bi && bi->GetCompleteBioseq()->Equals(seq)) {
182  found = true;
183  break;
184  }
185  }
186  }
187  }
188  }
189  }
190  if (! found && seq.GetInst().GetRepr() == CSeq_inst::eRepr_raw) {
192  if (part) {
193  CSeq_entry_Handle parent = part.GetParentEntry();
194  if (parent && parent.IsSeq()) {
195  parent = parent.GetParentEntry();
196  if (parent && parent.IsSet() && parent.GetSet().GetClass() == CBioseq_set::eClass_parts) {
197  found = true;
198  break;
199  }
200  }
201  }
202  }
203  if (! found) {
204  if (m_Imp.IsSmallGenomeSet()) {
206  break;
207  }
208  }
209  }
210  if (! found) {
212  break;
213  }
214  }
215  }
216  }
217 }
218 
220 {
221  bool answer = false;
222  for (CBioseq_CI b_ci(set); b_ci && ! answer; ++b_ci) {
223  // actually looks only at the first seq-id
224  const CBioseq* bioseq = b_ci->GetCompleteBioseq();
225  if (bioseq->IsSetId()) {
226  for (const auto& id_it : bioseq->GetId()) {
227  switch (id_it->Which()) {
228  case CSeq_id::e_Embl:
229  case CSeq_id::e_Ddbj:
230  case CSeq_id::e_Tpe:
231  case CSeq_id::e_Tpd:
232  answer = true;
233  break;
234  default:
235  break;
236  }
237  }
238  }
239  }
240 
241  return answer;
242 }
243 
244 
246 {
248  while (parent) {
249  if (parent.GetCompleteBioseq_set().GetPointer() == &set) {
250  return true;
251  }
252  parent = parent.GetParentBioseq_set();
253  }
254  return false;
255 }
256 
257 
// Reports whether at least one interval of `loc` lies on a sequence that
// belongs to `set`.
//
// loc   - location whose intervals are examined one at a time.
// set   - Bioseq-set the feature is packaged on.
// scope - scope used to resolve each interval's Seq-id to a Bioseq handle.
// tse   - top-level entry to which the Seq-id lookup is restricted.
//
// Returns true as soon as one interval's sequence is accepted by
// s_IsBioseqInSet(); returns false when no interval qualifies, which
// includes the case where `loc` yields no intervals at all.
258 bool s_HasOneIntervalInSet(const CSeq_loc& loc, const CBioseq_set& set, CScope& scope, const CSeq_entry& tse)
259 {
260  for (CSeq_loc_CI loc_it(loc); loc_it; ++loc_it) {
261  const CSeq_id& id = loc_it.GetSeq_id();
// Look the id up only within the given TSE.
262  CBioseq_Handle in_record = scope.GetBioseqHandleFromTSE(id, tse);
// An id that does not resolve in this record cannot satisfy the check;
// move on to the next interval.
263  if (! in_record)
264  continue;
// One interval on a member sequence is sufficient.
265  if (s_IsBioseqInSet(in_record, set)) {
266  return true;
267  }
268  }
269  return false;
270 }
271 
272 
274 {
275  if (annot.IsGraph()) {
276  for (const auto& graph : annot.GetData().GetGraph()) {
278  }
279  } else if (annot.IsFtable()) {
280  // if a feature is packaged on a set, the bioseqs in the locations should be in the set
282  bool is_embl_or_ddbj_on_set = x_IsEmblOrDdbjOnSet(bssh);
283 
284  for (const auto& feat_it : annot.GetData().GetFtable()) {
285  ReportLocationGI0(*feat_it, "?");
286  if (! feat_it->IsSetLocation() || IsLocationUnindexed(feat_it->GetLocation())) {
288  "Feature is not indexed on Bioseq ?", *feat_it);
289  } else if (is_embl_or_ddbj_on_set) {
290  // don't check packaging
291  } else if (! set.IsSetClass() ||
292  (set.GetClass() != CBioseq_set::eClass_nuc_prot && set.GetClass() != CBioseq_set::eClass_gen_prod_set)) {
294  } else if (feat_it->IsSetLocation() &&
295  ! s_HasOneIntervalInSet(feat_it->GetLocation(), set, *m_Scope, m_Imp.GetTSE())) {
296  if (m_Imp.IsSmallGenomeSet()) {
298  } else {
300  }
301  }
302  }
303  }
304 }
305 
306 
307 // feature must have location on at least one sequence in this record
308 // feature location must not extend past end of sequence
310 {
311  bool found_one = false;
312  for (CSeq_loc_CI loc_it(loc); loc_it; ++loc_it) {
313  const CSeq_id& id = loc_it.GetSeq_id();
314 #if 0
316  if (in_record) {
317  found_one = true;
318  if (! loc_it.IsWhole() && loc_it.GetRange().GetFrom() > in_record.GetBioseqLength() - 1) {
319  return true;
320  }
321  }
322 #else
324  if (seq_len != kInvalidSeqPos) {
325  found_one = true;
326  if (! loc_it.IsWhole() && loc_it.GetRange().GetFrom() > seq_len - 1) {
327  return true;
328  }
329  }
330 #endif
331  }
332 
333  return ! found_one;
334 }
335 
336 
338 {
339  if (! f.IsSetLocation()) {
340  return;
341  }
342 
343  unsigned int zero_gi = 0;
344 
345  for (CSeq_loc_CI lit(f.GetLocation()); lit; ++lit) {
346  if (lit.GetSeq_id().IsGi() && lit.GetSeq_id().GetGi() == ZERO_GI) {
347  zero_gi++;
348  }
349  }
350 
351  if (zero_gi > 0) {
353  "Feature has " + NStr::UIntToString(zero_gi) + " gi|0 location" + (zero_gi > 1 ? "s" : "") + " on Bioseq " + label,
354  f);
355  }
356 }
357 
358 
359 END_SCOPE(validator)
User-defined methods of the data storage class.
User-defined methods of the data storage class.
@ eErr_SEQ_ANNOT_AnnotIDs
@ eErr_SEQ_FEAT_FeatureLocationIsGi0
@ eErr_SEQ_ALIGN_BlastAligns
@ eErr_SEQ_FEAT_UnindexedFeature
@ eErr_SEQ_ANNOT_AnnotLOCs
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
CBioseq_Handle –.
CBioseq_set_Handle –.
CScope –.
Definition: scope.hpp:92
CSeq_annot_Handle –.
bool IsGraph(void) const
Definition: Seq_annot.cpp:187
bool IsAlign(void) const
Definition: Seq_annot.cpp:182
bool IsIds(void) const
Definition: Seq_annot.cpp:192
bool IsLocs(void) const
Definition: Seq_annot.cpp:197
bool IsFtable(void) const
Definition: Seq_annot.cpp:177
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Definition: Seq_loc.hpp:453
void ValidateSeqAlign(const CSeq_align &align, int order=-1)
~CValidError_annot() override
void ValidateSeqAnnot(const CSeq_annot_Handle &annot)
void ValidateSeqAnnotContext(const CSeq_annot &annot, const CBioseq &seq)
CValidError_align m_AlignValidator
bool IsLocationUnindexed(const CSeq_loc &loc)
void ReportLocationGI0(const CSeq_feat &f, const string &label)
CValidError_feat m_FeatValidator
CValidError_graph m_GraphValidator
CCacheImpl & GetCache()
static CSeq_entry_Handle GetAppropriateXrefParent(CSeq_entry_Handle seh)
CValidError_imp & m_Imp
void PostErr(EDiagSev sv, EErrType et, const string &msg, const CSerialObject &obj)
void SetScope(CScope &scope)
void SetTSE(CSeq_entry_Handle seh)
void ValidateSeqFeat(const CSeq_feat &feat)
void ValidateSeqGraph(const CSeq_graph &graph)
void ValidateSeqGraphContext(const CSeq_graph &graph, const CBioseq_set &set)
const CSeq_entry_Handle & GetTSEH()
void PostErr(EDiagSev sv, EErrType et, const string &msg, const CSerialObject &obj)
Definition: validatorp.cpp:359
void IncrementMisplacedFeatureCount()
bool IsValidateAlignments() const
const CTSE_Handle & GetTSE_Handle()
bool IsSmallGenomeSet() const
void IncrementSmallGenomeSetMisplacedCount()
const CSeq_entry & GetTSE() const
bool ShouldSubdivide() const
CBioseq_Handle GetBioseqHandleFromLocation(CScope *scope, const CSeq_loc &loc, const CTSE_Handle &tse)
Definition: set.hpp:45
Include a standard set of the NCBI C++ Toolkit most basic headers.
const TSeqPos kInvalidSeqPos
Define special value for invalid sequence position.
Definition: ncbimisc.hpp:878
#define ZERO_GI
Definition: ncbimisc.hpp:1088
@ eDiag_Error
Error message.
Definition: ncbidiag.hpp:653
@ eDiag_Critical
Critical error message.
Definition: ncbidiag.hpp:654
virtual bool Equals(const CSerialObject &object, ESerialRecursionMode how=eRecursive) const
Check if both objects contain the same values.
@ e_YES
SeqIds compared, and are equivalent.
Definition: Seq_id.hpp:583
sequence::ECompare Compare(const CSeq_loc &loc1, const CSeq_loc &loc2, CScope *scope)
Returns the sequence::ECompare containment relationship between CSeq_locs.
CBioseq_Handle GetBioseqHandleFromTSE(const CSeq_id &id, const CTSE_Handle &tse)
Get bioseq handle for sequence within one TSE.
Definition: scope.cpp:253
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry, default priority is higher than for defaults or loaders. Add object to the scope with p...
Definition: scope.cpp:522
TSeqPos GetSequenceLength(const CSeq_id &id, TGetFlags flags=0)
Get sequence length. Return kInvalidSeqPos if sequence is not found.
Definition: scope.cpp:769
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
Definition: scope.cpp:504
CBioseq_set_Handle GetBioseq_setHandle(const CBioseq_set &seqset, EMissing action=eMissing_Default)
Definition: scope.cpp:176
CSeq_annot_Handle GetSeq_annotHandle(const CSeq_annot &annot, EMissing action=eMissing_Default)
Definition: scope.cpp:192
@ fDoNotRecalculate
avoid time-consuming recalculation of missing data
Definition: scope.hpp:438
CConstRef< CBioseq > GetCompleteBioseq(void) const
Get the complete bioseq.
TClass GetClass(void) const
CBioseq_set_Handle GetParentBioseq_set(void) const
Return a handle for the parent Bioseq-set, or null handle.
CSeq_entry_Handle GetParentEntry(void) const
Get parent Seq-entry handle.
TSeqPos GetBioseqLength(void) const
TSet GetSet(void) const
CConstRef< CSeq_annot > GetCompleteSeq_annot(void) const
Complete and return const reference to the current seq-annot.
CConstRef< CBioseq_set > GetCompleteBioseq_set(void) const
Return the complete bioseq-set object.
CBioseq_set_Handle GetParentBioseq_set(void) const
Return a handle for the parent Bioseq-set, or null handle.
CSeq_entry_Handle GetParentEntry(void) const
Get parent Seq-entry handle.
CConstRef< CSeq_entry > GetCompleteSeq_entry(void) const
Complete and get const reference to the seq-entry.
bool IsSet(void) const
CSeq_entry_Handle GetParentEntry(void) const
Get parent Seq-entry handle.
bool IsSeq(void) const
TObjectType * GetPointer(void) const THROWS_NONE
Get pointer.
Definition: ncbiobj.hpp:1684
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static string UIntToString(unsigned int value, TNumToStringFlags flags=0, int base=10)
Convert UInt to string.
Definition: ncbistr.hpp:5109
static const char label[]
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
@ e_Tpe
Third Party Annot/Seq EMBL.
Definition: Seq_id_.hpp:111
@ e_Tpd
Third Party Annot/Seq DDBJ.
Definition: Seq_id_.hpp:112
@ e_Ddbj
DDBJ.
Definition: Seq_id_.hpp:107
@ eClass_parts
parts for 2 or 3
@ eClass_nuc_prot
nuc acid and coded proteins
Definition: Bioseq_set_.hpp:99
@ eClass_gen_prod_set
genomic products, chrom+mRNA+protein
@ eClass_segset
segmented sequence + parts
TRepr GetRepr(void) const
Get the Repr member data.
Definition: Seq_inst_.hpp:565
const Tdata & Get(void) const
Get the member data.
const TGraph & GetGraph(void) const
Get the variant data.
Definition: Seq_annot_.hpp:661
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
const TId & GetId(void) const
Get the Id member data.
Definition: Bioseq_.hpp:290
const TDesc & GetDesc(void) const
Get the Desc member data.
Definition: Seq_annot_.hpp:852
bool IsSetDesc(void) const
Used only for stand-alone Seq-annots. Check if a value has been assigned to Desc data member.
Definition: Seq_annot_.hpp:840
const TAlign & GetAlign(void) const
Get the variant data.
Definition: Seq_annot_.hpp:641
const TFtable & GetFtable(void) const
Get the variant data.
Definition: Seq_annot_.hpp:621
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_annot_.hpp:873
bool IsSetId(void) const
Equivalent identifiers. Check if a value has been assigned to Id data member.
Definition: Bioseq_.hpp:278
@ eRepr_seg
segmented sequence
Definition: Seq_inst_.hpp:95
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
The Object manager core.
bool s_IsBioseqInSet(CBioseq_Handle bsh, const CBioseq_set &set)
bool s_HasOneIntervalInSet(const CSeq_loc &loc, const CBioseq_set &set, CScope &scope, const CSeq_entry &tse)
static bool x_IsEmblOrDdbjOnSet(const CBioseq_set_Handle &set)
Modified on Sun Apr 14 05:29:07 2024 by modify_doxy.py rev. 669887