NCBI C++ ToolKit
gene_qual_normalization.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: gene_qual_normalization.cpp 95784 2021-12-22 19:48:06Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Colleen Bollin
27  *
28  * File Description:
29  * Code for moving gene qualifiers from gene xrefs on child features to
30  * the actual gene
31  *
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <serial/serialbase.hpp>
38 
40 #include <objmgr/util/sequence.hpp>
41 #include <objmgr/seqdesc_ci.hpp>
43 
44 
47 
48 
49 
51 {
52  // we're only going to move the qualifiers if the gene has a
53  // locus_tag and no locus
54  if (!gene.GetData().IsGene() ||
55  gene.GetData().GetGene().IsSetLocus() ||
56  !gene.GetData().GetGene().IsSetLocus_tag() ||
57  !cds.IsSetXref()) {
58  return false;
59  }
60 
61  CTempString locus;
62  CTempString locus_tag;
63  for (auto it : cds.GetXref()) {
64  if (it->IsSetData() && it->GetData().IsGene()) {
65  if (it->GetData().GetGene().IsSetLocus()) {
66  if (NStr::IsBlank(locus)) {
67  locus = it->GetData().GetGene().GetLocus();
68  } else {
69  // already found a locus value, quit
70  return false;
71  }
72  }
73  if (it->GetData().GetGene().IsSetLocus_tag()) {
74  if (NStr::IsBlank(locus_tag)) {
75  locus_tag = it->GetData().GetGene().GetLocus_tag();
76  } else {
77  // already found a locus-tag value, quit
78  return false;
79  }
80  }
81  }
82  }
83  if (NStr::IsBlank(locus)) {
84  // no locus that we can use
85  return false;
86  }
87 
88  if (!NStr::IsBlank(locus_tag) &&
89  gene.GetData().GetGene().IsSetLocus_tag() &&
90  !NStr::Equal(locus_tag, gene.GetData().GetGene().GetLocus_tag())) {
91  return false;
92  }
93 
94  return true;
95 }
96 
97 
99 {
100  if (!gene.GetData().IsGene() || gene.GetData().GetGene().IsSetLocus()
101  || !cds.IsSetXref()) {
102  return false;
103  }
104  CTempString locus;
105  CTempString locus_tag;
106  CRef<CSeqFeatXref> gene_xref;
107  for (auto it : cds.SetXref()) {
108  if (it->IsSetData() && it->GetData().IsGene()) {
109  if (it->GetData().GetGene().IsSetLocus()) {
110  if (NStr::IsBlank(locus)) {
111  locus = it->GetData().GetGene().GetLocus();
112  gene_xref = it;
113  } else {
114  // already found a locus value, quit
115  return false;
116  }
117  }
118  if (it->GetData().GetGene().IsSetLocus_tag()) {
119  if (NStr::IsBlank(locus_tag)) {
120  locus_tag = it->GetData().GetGene().GetLocus_tag();
121  } else {
122  // already found a locus-tag value, quit
123  return false;
124  }
125  }
126  }
127  }
128  if (NStr::IsBlank(locus)) {
129  // no locus that we can use
130  return false;
131  }
132 
133  if (!NStr::IsBlank(locus_tag) &&
134  gene.GetData().GetGene().IsSetLocus_tag() &&
135  !NStr::Equal(locus_tag, gene.GetData().GetGene().GetLocus_tag())) {
136  return false;
137  }
138 
139  if (!NStr::IsBlank(locus)) {
140  gene.SetData().SetGene().SetLocus(locus);
141  if (gene.GetData().GetGene().IsSetLocus_tag() &&
142  !NStr::IsBlank(gene.GetData().GetGene().GetLocus_tag())) {
143  // gene xrefs should point to locus_tag instead of locus
144  gene_xref->SetData().SetGene().ResetLocus();
145  gene_xref->SetData().SetGene().SetLocus_tag(gene.GetData().GetGene().GetLocus_tag());
146  }
147  return true;
148  } else {
149  return false;
150  }
151 }
152 
153 
154 vector<CCleanup::TFeatGenePair> CCleanup::GetNormalizableGeneQualPairs(CBioseq_Handle bsh)
155 {
156  vector<CCleanup::TFeatGenePair> rval;
157 
158  // only for nucleotide sequences
159  if (bsh.IsAa()) {
160  return rval;
161  }
162  // only for prokaryotes
164  if (!src || !src->IsSource() || !src->GetSource().IsSetLineage() ||
165  (NStr::Find(src->GetSource().GetLineage(), "Bacteria; ") == NPOS &&
166  NStr::Find(src->GetSource().GetLineage(), "Archaea") == NPOS)) {
167  return rval;
168  }
169 
170  typedef pair<CSeq_feat_Handle, bool> TFeatOkPair;
171  typedef map<CSeq_feat_Handle, TFeatOkPair > TGeneCDSMap;
172  TGeneCDSMap gene_cds;
173 
174  CFeat_CI f(bsh);
175  CRef<feature::CFeatTree> tr(new feature::CFeatTree(f));
176  tr->SetIgnoreMissingGeneXref();
177  while (f) {
178  if (f->GetData().IsCdregion()) {
179  CMappedFeat gene = tr->GetBestGene(*f);
180  if (gene) {
181  TGeneCDSMap::iterator smit = gene_cds.find(gene);
182  if (smit == gene_cds.end()) {
183  // not found before
184  bool ok_to_map = IsMappablePair(f->GetOriginalFeature(), gene.GetOriginalFeature());
185  gene_cds[gene] = { f->GetSeq_feat_Handle(), ok_to_map };
186  } else {
187  // same gene, two different coding regions
188  gene_cds[gene].second = false;
189  }
190  }
191  }
192  ++f;
193  }
194 
195  for (auto copy_pair : gene_cds) {
196  if (copy_pair.second.second) {
197  rval.push_back({ copy_pair.second.first, copy_pair.first });
198  }
199  }
200  return rval;
201 }
202 
204 {
205  vector<CCleanup::TFeatGenePair> cds_gene_pairs = GetNormalizableGeneQualPairs(bsh);
206  bool any_change = false;
207  for (auto copy_pair : cds_gene_pairs) {
208  CRef<CSeq_feat> new_cds(new CSeq_feat());
209  new_cds->Assign(*(copy_pair.first.GetOriginalSeq_feat()));
210  CRef<CSeq_feat> new_gene(new CSeq_feat());
211  new_gene->Assign(*(copy_pair.second.GetOriginalSeq_feat()));
212  if (NormalizeGeneQuals(*new_cds, *new_gene)) {
213  CSeq_feat_EditHandle gene_edit = CSeq_feat_EditHandle(copy_pair.second);
214  gene_edit.Replace(*new_gene);
215  CSeq_feat_EditHandle cds_edit = CSeq_feat_EditHandle(copy_pair.first);
216  cds_edit.Replace(*new_cds);
217  any_change = true;
218  }
219  }
220 
221  return any_change;
222 }
223 
225 {
226  bool any_change = false;
228  while (bi) {
229  any_change |= NormalizeGeneQuals(*bi);
230  ++bi;
231  }
232  return any_change;
233 }
234 
User-defined methods of the data storage class.
const string & GetLineage(void) const
Definition: BioSource.cpp:360
bool IsSetLineage(void) const
Definition: BioSource.cpp:355
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
CBioseq_Handle –.
static vector< TFeatGenePair > GetNormalizableGeneQualPairs(CBioseq_Handle bsh)
static bool NormalizeGeneQuals(CSeq_feat &cds, CSeq_feat &gene)
CFeat_CI –.
Definition: feat_ci.hpp:64
CMappedFeat –.
Definition: mapped_feat.hpp:59
CRef –.
Definition: ncbiobj.hpp:618
CSeq_entry_Handle –.
CSeq_feat_EditHandle –.
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CSeqdesc_CI –.
Definition: seqdesc_ci.hpp:65
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
Definition: map.hpp:338
Include a standard set of the NCBI C++ Toolkit most basic headers.
#define bool
Definition: bool.h:34
bool IsMappablePair(const CSeq_feat &cds, const CSeq_feat &gene)
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
bool IsAa(void) const
void Replace(const CSeq_feat &new_feat) const
Replace the feature with new Seq-feat object.
const CSeq_feat & GetOriginalFeature(void) const
Get original feature with unmapped location/product.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
#define NPOS
Definition: ncbistr.hpp:133
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2891
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5384
bool IsSetLocus_tag(void) const
systematic gene name (e.g., MI0001, ORF0069) Check if a value has been assigned to Locus_tag data mem...
Definition: Gene_ref_.hpp:781
bool IsSetLocus(void) const
Official gene symbol Check if a value has been assigned to Locus data member.
Definition: Gene_ref_.hpp:493
const TLocus_tag & GetLocus_tag(void) const
Get the Locus_tag member data.
Definition: Gene_ref_.hpp:793
TXref & SetXref(void)
Assign a value to Xref data member.
Definition: Seq_feat_.hpp:1314
bool IsSetXref(void) const
cite other relevant features Check if a value has been assigned to Xref data member.
Definition: Seq_feat_.hpp:1296
bool IsGene(void) const
Check if variant Gene is selected.
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
const TGene & GetGene(void) const
Get the variant data.
const TSource & GetSource(void) const
Get the variant data.
Definition: Seqdesc_.cpp:566
bool IsSource(void) const
Check if variant Source is selected.
Definition: Seqdesc_.hpp:1190
@ e_Source
source of materials, includes Org-ref
Definition: Seqdesc_.hpp:133
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
The Object manager core.
#define const
Definition: zconf.h:232
Modified on Tue Apr 23 07:39:52 2024 by modify_doxy.py rev. 669887