NCBI C++ ToolKit
weight.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: weight.cpp 100589 2023-08-14 14:23:37Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Aaron Ucko
27 *
28 * File Description:
29 * Weights for protein sequences
30 */
31 
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistd.hpp>
35 
37 #include <objmgr/bioseq_handle.hpp>
38 #include <objmgr/feat_ci.hpp>
39 #include <objmgr/seq_vector.hpp>
40 #include <objmgr/seq_vector_ci.hpp>
42 
43 #include <objects/seq/Bioseq.hpp>
44 #include <objects/seq/MolInfo.hpp>
45 #include <objects/seq/Seq_inst.hpp>
46 
51 
53 
54 #include <objmgr/util/weight.hpp>
55 #include <objmgr/util/sequence.hpp>
56 
59 
60 
// Per-residue atom counts, one table per element, indexed by the NCBIeaa
// letter's offset from 'A' (so index 0 is 'A' and index 25 is 'Z').
// The ambiguity letters reuse concrete rows: B carries the same counts as D,
// J the same as I and L, and Z the same as E.  X is zero in every table.
// Each table is laid out as two rows of thirteen letters: A-M, then N-Z.
static const int kNumC[26] = {
    /* A-M */  3,  4,  3,  4,  5,  9,  2,  6,  6,  6,  6,  6,  5,
    /* N-Z */  4, 12,  5,  5,  6,  3,  4,  3,  5, 11,  0,  9,  5
};
static const int kNumH[26] = {
    /* A-M */  5,  5,  5,  5,  7,  9,  3,  7, 11, 11, 12, 11,  9,
    /* N-Z */  6, 19,  7,  8, 12,  5,  7,  5,  9, 10,  0,  9,  7
};
static const int kNumN[26] = {
    /* A-M */  1,  1,  1,  1,  1,  1,  1,  3,  1,  1,  2,  1,  1,
    /* N-Z */  2,  3,  1,  2,  4,  1,  1,  1,  1,  2,  0,  1,  1
};
static const int kNumO[26] = {
    /* A-M */  1,  3,  1,  3,  3,  1,  1,  1,  1,  1,  1,  1,  1,
    /* N-Z */  2,  2,  1,  2,  1,  2,  2,  1,  1,  1,  0,  2,  3
};
// Sulfur appears only in C and M.
static const int kNumS[26] = {
    /* A-M */  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,
    /* N-Z */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
};
// Selenium appears only in U.
static const int kNumSe[26] = {
    /* A-M */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* N-Z */  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0
};
75 
76 
77 template <class Iterator>
78 double s_GetProteinWeight(Iterator start, Iterator end)
79 {
80  // Start with water (H2O)
81  size_t c = 0, h = 2, n = 0, o = 1, s = 0, se = 0;
82 
83  for ( ; start != end; ++start) {
84  unsigned char ch = *start;
85  int res = 0;
86  if ( ch >= 'a' && ch <= 'z' ) {
87  res = ch - 'a';
88  } else if ( ch >= 'A' && ch <= 'Z' ) {
89  res = ch - 'A';
90  } else if ( ch != '-' && ch != '*' ) {
91  NCBI_THROW(CObjmgrUtilException, eBadResidue,
92  "GetProteinWeight: bad residue");
93  }
94  c += kNumC [res];
95  h += kNumH [res];
96  n += kNumN [res];
97  o += kNumO [res];
98  s += kNumS [res];
99  se += kNumSe[res];
100  }
101 
102  return 12.01115 * c +
103  1.0079 * h +
104  14.0067 * n +
105  15.9994 * o +
106  32.064 * s +
107  78.96 * se;
108 }
109 
110 
/// Compute the molecular weight of the protein described by a protein feature.
///
/// @param feat      Feature to weigh; its data must be of type e_Prot.
/// @param scope     Scope used to build the sequence vector and to look up
///                  the bioseq handle for the location.
/// @param location  Optional location to use instead of feat.GetLocation();
///                  may be null.
/// @param opts      Option flags; fGetProteinWeight_ForceInitialMetTrim makes
///                  a leading 'M' be skipped regardless of completeness.
/// @return          The value of s_GetProteinWeight() over the sequence,
///                  starting after the leading 'M' when it is skipped.
///
/// NOTE(review): the embedded line numbers in this listing jump in several
/// places (e.g. 115->117, 122->124, 127->129, 131->134), so some statements
/// -- apparently including the throw, the declaration of `comp` and the
/// switch case labels -- are not visible here.  Confirm against the
/// repository copy before relying on the comments below.
111 double GetProteinWeight(const CSeq_feat& feat, CScope& scope,
112  const CSeq_loc* location,
113  TGetProteinWeight opts )
114 {
 /// reject anything that is not a protein feature
115  if (feat.GetData().Which() != CSeqFeatData::e_Prot) {
117  "molecular weight only valid for protein features");
118  }
119 
 /// an explicit location overrides the feature's own location
120  const CSeq_loc& loc =
121  (location ? *location : feat.GetLocation());
122  CSeqVector v(loc, scope);
124 
125  CSeqVector_CI vit(v);
126 
127  /// find out if the molecule is complete
129  const CProt_ref& prot = feat.GetData().GetProt();
130 
131  switch (prot.GetProcessed()) {
134  /// follow the molecule's setting
135  break;
139  /// trust the location as-is
141  break;
142  default:
143  ;
144  }
145 
 /// completeness still undecided: compare the location with the full bioseq
146  if (comp == CMolInfo::eCompleteness_unknown) {
147  /// assess based on the molecule
148  CBioseq_Handle bsh = scope.GetBioseqHandle(loc);
 /// location does not start at 0, or is shorter than the bioseq
149  if (loc.GetTotalRange().GetFrom() > 0 ||
150  loc.GetTotalRange().GetLength() < bsh.GetBioseqLength()) {
151  /// we don't want to clip
153  } else {
155 
156  if (prot.GetProcessed() == CProt_ref::eProcessed_not_set) {
157  /// look for a signal peptide; if there is one, consider
158  /// ourselves partial
159  CFeat_CI feat_it(bsh, CSeqFeatData::e_Prot);
160  for ( ; feat_it; ++feat_it) {
161  switch (feat_it->GetData().GetProt().GetProcessed()) {
165  break;
166 
167  default:
168  break;
169  }
170  }
171  }
172 
173  /**
174  /// NB: the C toolkit has not yet implemented this; commented out
175  /// for now to maintain compatibility
176  CConstRef<CMolInfo> molinfo(sequence::GetMolInfo(bsh));
177  if (molinfo) {
178  comp = molinfo->GetCompleteness();
179  LOG_POST(Error << "comp = " << comp);
180  }
181  **/
182  }
183  }
184 
 /// caller asked for the initial 'M' to be trimmed unconditionally
185  if( (opts & fGetProteinWeight_ForceInitialMetTrim) != 0 ) {
 /// only skip when more than one character is left in the iterator's buffer
186  if ( vit.GetBufferSize() > 1 && *vit == 'M') {
187  ++vit;
188  }
189  } else {
190  switch (comp) {
195  /// molecule is incomplete at the start; any 'M' here should be trusted
196  break;
197 
198  default:
199  /// for complete molecules, we skip the leading 'M' since this is
200  /// cleaved as a post-translational modification
201  if ( vit.GetBufferSize() > 1 && *vit == 'M') {
202  ++vit;
203  }
204  break;
205  }
206  }
207 
 /// weigh everything from the (possibly advanced) iterator to the end
208  return s_GetProteinWeight(vit, v.end());
209 }
210 
211 
/// Compute the molecular weight of a protein bioseq, or of part of it.
///
/// @param handle    Bioseq to weigh.
/// @param location  Optional sub-location; when null the whole sequence
///                  vector of the handle is used.
/// @param opts      Option flags; fGetProteinWeight_ForceInitialMetTrim makes
///                  a leading 'M' be skipped regardless of completeness.
/// @return          The value of s_GetProteinWeight() over the sequence,
///                  starting after the leading 'M' when it is skipped.
///
/// NOTE(review): the embedded line numbers in this listing jump (e.g.
/// 217->219, 222->224, 227->229, 247->252), so the declaration and
/// assignments of `comp` and the switch case labels are not visible here.
/// NOTE(review): unlike the CSeq_feat overload, `*vit` is dereferenced below
/// without a GetBufferSize() check -- confirm an empty sequence cannot reach
/// this point.
212 double GetProteinWeight(const CBioseq_Handle& handle, const CSeq_loc* location,
213  TGetProteinWeight opts )
214 {
 /// build the vector from the sub-location if one was given
215  CSeqVector v = (location
216  ? CSeqVector(*location, handle.GetScope())
217  : handle.GetSeqVector());
219 
220  CSeqVector_CI vit(v);
221 
222  /// find out if the molecule is complete
 /// a location that does not start at 0, or is shorter than the bioseq
224  if (location &&
225  (location->GetTotalRange().GetFrom() > 0 ||
226  location->GetTotalRange().GetLength() < handle.GetBioseqLength())) {
227  /// we don't want to clip
229  } else {
231  /**
232  /// NB: the C toolkit has not yet implemented this; commented out
233  /// for now to maintain compatibility
234  CConstRef<CMolInfo> molinfo(sequence::GetMolInfo(handle));
235  if (molinfo) {
236  comp = molinfo->GetCompleteness();
237  LOG_POST(Error << "comp = " << comp);
238  }
239  **/
240  }
241 
 /// caller asked for the initial 'M' to be trimmed unconditionally
242  if( (opts & fGetProteinWeight_ForceInitialMetTrim) != 0 ) {
243  if (*vit == 'M') {
244  ++vit;
245  }
246  } else {
247  switch (comp) {
252  /// molecule is incomplete at the start; any 'M' here should be trusted
253  break;
254 
255  default:
256  /// for complete molecules, we skip the leading 'M' since this is
257  /// cleaved as a post-translational modification
258  if (*vit == 'M') {
259  ++vit;
260  }
261  break;
262  }
263  }
264 
 /// weigh everything from the (possibly advanced) iterator to the end
265  return s_GetProteinWeight(vit, v.end());
266 }
267 
268 
269 double GetProteinWeight(const string& ncbieaa_sequence)
270 {
271  return s_GetProteinWeight(ncbieaa_sequence.begin(),
272  ncbieaa_sequence.end());
273 }
274 
275 
/// Compute weights for the relevant parts of a protein bioseq.
///
/// Scans the features selected by `sel` for mature-peptide and signal-peptide
/// markers, taken from a Prot feature's `processed` value, a Region feature's
/// name, or a Site feature's signal-peptide value.  Every mature location
/// found is weighed.  If none is found, one location is synthesised:
/// everything after the first non-whole signal peptide, else everything after
/// a leading 'M', else the whole sequence.
///
/// @param handle   Protein bioseq; must have molecule type eMol_aa.
/// @param weights  Output map; cleared first, then given one entry per
///                 location whose weight could be computed.
/// @throws CObjmgrUtilException (eBadSequenceType) if handle is not a protein.
///
/// NOTE(review): the embedded line numbers in this listing jump (e.g.
/// 289->295, 298->300, 306->308, 316->318, 335->339), so the selector setup,
/// the outer case labels and the declarations of `v` and `whole` are not
/// visible here.
/// NOTE(review): the error text below says "GetMolecularWeights" although
/// the function is named GetProteinWeights -- confirm whether that is
/// intentional.
276 void GetProteinWeights(const CBioseq_Handle& handle, TWeights& weights)
277 {
278  if (handle.GetBioseqMolType() != CSeq_inst::eMol_aa) {
279  NCBI_THROW(CObjmgrUtilException, eBadSequenceType,
280  "GetMolecularWeights requires a protein!");
281  }
282  weights.clear();
283 
 // mature-peptide locations found so far, and the first usable signal peptide
284  set<CConstRef<CSeq_loc> > locations;
285  CConstRef<CSeq_loc> signal;
286 
287  // Look for explicit markers: ideally cleavage products (mature
288  // peptides), but possibly just signal peptides
289  SAnnotSelector sel;
295  for (CFeat_CI feat(handle, sel); feat; ++feat) {
296  bool is_mature = false, is_signal = false;
297  const CSeqFeatData& data = feat->GetData();
298  switch (data.Which()) {
 // Prot data: classify by its `processed` value
300  switch (data.GetProt().GetProcessed()) {
301  case CProt_ref::eProcessed_mature: is_mature = true; break;
302  case CProt_ref::eProcessed_signal_peptide: is_signal = true; break;
303  default: break;
304  }
305  break;
306 
 // Region data: classify by case-insensitive region name
308  if (!NStr::CompareNocase(data.GetRegion(), "mature chain")
309  || !NStr::CompareNocase(data.GetRegion(),
310  "processed active peptide")) {
311  is_mature = true;
312  } else if (!NStr::CompareNocase(data.GetRegion(), "signal")) {
313  is_signal = true;
314  }
315  break;
316 
 // Site data: only a signal-peptide site is of interest
318  if (data.GetSite() == CSeqFeatData::eSite_signal_peptide) {
319  is_signal = true;
320  }
321  break;
322 
323  default:
324  break;
325  }
326 
 // keep every mature location, but only the first non-whole signal peptide
327  if (is_mature) {
328  locations.insert(CConstRef<CSeq_loc>(&feat->GetLocation()));
329  } else if (is_signal && signal.Empty()
330  && !feat->GetLocation().IsWhole() ) {
331  signal = &feat->GetLocation();
332  }
333  }
334 
 // no mature peptides annotated: synthesise a single location to weigh
335  if (locations.empty()) {
339  if ( signal.NotEmpty() ) {
340  // Expects to see at beginning; is this assumption safe?
341  CSeq_interval& interval = whole->SetInt();
 // start right after the end of the signal peptide
342  interval.SetFrom(signal->GetTotalRange().GetTo() + 1);
343  interval.SetTo(v.size() - 1);
344  interval.SetId(const_cast<CSeq_id&>(*handle.GetSeqId()));
 // NOTE(review): v[0] is read without a visible emptiness check -- confirm
345  } else if (v[0] == 'M') { // Treat initial methionine as start codon
346  CSeq_interval& interval = whole->SetInt();
347  interval.SetFrom(1);
348  interval.SetTo(v.size() - 1);
349  interval.SetId(const_cast<CSeq_id&>(*handle.GetSeqId()));
350  }
351  else {
352  whole->SetWhole(const_cast<CSeq_id&>(*handle.GetSeqId()));
353  }
354  locations.insert(CConstRef<CSeq_loc>(whole));
355  }
356 
 // weigh each location; ones that fail with CObjmgrUtilException are dropped
357  ITERATE(set<CConstRef<CSeq_loc> >, it, locations) {
358  try {
359  // Split up to ensure that we call [] only if
360  // GetProteinWeight succeeds.
361  double weight = GetProteinWeight(handle, *it);
362  weights[*it] = weight;
363  } catch (CObjmgrUtilException&) {
364  // Silently elide
365  }
366  }
367 }
368 
369 
#define static
CBioseq_Handle –.
CFeat_CI –.
Definition: feat_ci.hpp:64
Exceptions for objmgr/util library.
CScope –.
Definition: scope.hpp:92
CSeqVector –.
Definition: seq_vector.hpp:65
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
void clear()
Definition: map.hpp:169
Definition: map.hpp:338
Definition: set.hpp:45
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
bool empty() const
Definition: set.hpp:133
Include a standard set of the NCBI C++ Toolkit most basic headers.
static const char location[]
Definition: config.c:97
char data[12]
Definition: iconv.c:80
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
@ eUnknown
Definition: app_popup.hpp:72
TRange GetTotalRange(void) const
Definition: Seq_loc.hpp:913
void GetProteinWeights(const CBioseq_Handle &handle, TWeights &weights)
Automatically picks reasonable ranges: in decreasing priority order,.
Definition: weight.cpp:276
double GetProteinWeight(const CSeq_feat &feat, CScope &scope, const CSeq_loc *location, TGetProteinWeight opts)
Definition: weight.cpp:111
int TGetProteinWeight
Definition: weight.hpp:58
@ fGetProteinWeight_ForceInitialMetTrim
Definition: weight.hpp:56
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
const CSeqFeatData & GetData(void) const
TSeqPos GetBioseqLength(void) const
CConstRef< CSeq_id > GetSeqId(void) const
Get id which can be used to access this bioseq handle Throws an exception if none is available.
CScope & GetScope(void) const
Get scope this handle belongs to.
TMol GetBioseqMolType(void) const
Get some values from core:
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
SAnnotSelector & IncludeFeatSubtype(TFeatSubtype subtype)
Include feature subtype in the search.
SAnnotSelector & SetResolveTSE(void)
SetResolveTSE() is equivalent to SetResolveMethod(eResolve_TSE).
SAnnotSelector & IncludeFeatType(TFeatType type)
Include feature type in the search.
TSeqPos GetBufferSize(void) const
Get number of chars from current position to the current buffer end.
SAnnotSelector & SetOverlapIntervals(void)
Check overlapping of individual intervals.
TSeqPos size(void) const
Definition: seq_vector.hpp:291
void SetCoding(TCoding coding)
const_iterator end(void) const
Definition: seq_vector.hpp:305
bool Empty(void) const THROWS_NONE
Check if CConstRef is empty – not pointing to any object which means having a null value.
Definition: ncbiobj.hpp:1385
bool NotEmpty(void) const THROWS_NONE
Check if CConstRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:1392
position_type GetLength(void) const
Definition: range.hpp:158
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
TProcessed GetProcessed(void) const
Get the Processed member data.
Definition: Prot_ref_.hpp:538
@ eProcessed_signal_peptide
Definition: Prot_ref_.hpp:99
@ eProcessed_transit_peptide
Definition: Prot_ref_.hpp:100
E_Choice Which(void) const
Which variant is currently selected.
const TLocation & GetLocation(void) const
Get the Location member data.
Definition: Seq_feat_.hpp:1117
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_feat_.hpp:925
const TProt & GetProt(void) const
Get the variant data.
@ e_Region
named region (globin locus)
void SetTo(TTo value)
Assign a value to To data member.
void SetId(TId &value)
Assign a value to Id data member.
void SetFrom(TFrom value)
Assign a value to From data member.
@ eCompleteness_unknown
Definition: MolInfo_.hpp:155
@ eCompleteness_complete
complete biological entity
Definition: MolInfo_.hpp:156
@ eCompleteness_no_left
missing 5' or NH3 end
Definition: MolInfo_.hpp:158
@ eCompleteness_partial
partial but no details given
Definition: MolInfo_.hpp:157
@ eCompleteness_no_ends
missing both ends
Definition: MolInfo_.hpp:160
@ e_Ncbieaa
extended ASCII 1 letter aa codes
Definition: Seq_data_.hpp:111
where boath are integers</td > n< td ></td > n</tr > n< tr > n< td > tse</td > n< td > optional</td > n< td > String</td > n< td class=\"description\"> TSE option controls what blob is whole
n font weight
yy_size_t n
The Object manager core.
SAnnotSelector –.
static const int kNumO[26]
Definition: weight.cpp:69
static const int kNumC[26]
Definition: weight.cpp:63
static const int kNumS[26]
Definition: weight.cpp:71
static const int kNumN[26]
Definition: weight.cpp:67
double s_GetProteinWeight(Iterator start, Iterator end)
Definition: weight.cpp:78
static const int kNumH[26]
Definition: weight.cpp:65
static const int kNumSe[26]
Definition: weight.cpp:73
#define const
Definition: zconf.h:232
Modified on Fri Sep 20 14:57:02 2024 by modify_doxy.py rev. 669887