NCBI C++ ToolKit
util.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: util.cpp 97646 2022-08-08 15:05:05Z dicuccio $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Christiam Camacho
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <algo/sequence/util.hpp>
34 #include <objects/seq/seq__.hpp>
36 #include <objmgr/seq_vector.hpp>
38 
39 #include <math.h>
40 
43 
45 SeqLocToBioseq(const objects::CSeq_loc& loc, objects::CScope& scope)
46 {
47  CRef<CBioseq> bioseq;
48  if ( !loc.GetId() ) {
49  return bioseq;
50  }
51 
52  // Build a Seq-entry for the query Seq-loc
53  CBioseq_Handle handle = scope.GetBioseqHandle(*loc.GetId());
54  if( !handle ){
55  return bioseq;
56  }
57 
58  bioseq.Reset( new CBioseq() );
59 
60  // add an ID for our sequence
61  CRef<CSeq_id> id(new CSeq_id());
62  id->Assign(*handle.GetSeqId());
63  bioseq->SetId().push_back(id);
64 
65  // a title
66  CRef<CSeqdesc> title( new CSeqdesc() );
67  string title_str;
68  id -> GetLabel(&title_str );
69  title_str += ": ";
70  loc.GetLabel( &title_str );
71  title->SetTitle( title_str );
72  bioseq->SetDescr().Set().push_back( title );
73 
74  ///
75  /// create the seq-inst
76  /// we can play some games here
77  ///
78  CSeq_inst& inst = bioseq->SetInst();
79 
80  if (handle.IsAa()) {
82  } else {
84  }
85 
86  bool process_whole = false;
87  if (loc.IsWhole()) {
88  process_whole = true;
89  } else if (loc.IsInt()) {
90  TSeqRange range = loc.GetTotalRange();
91  if (range.GetFrom() == 0 && range.GetTo() == handle.GetBioseqLength() - 1) {
92  /// it's whole
93  process_whole = true;
94  }
95  }
96 
97  /// BLAST now handles delta-seqs correctly, so we can submit this
98  /// as a delta-seq
99  if (process_whole) {
100  /// just encode the whole sequence
101  CSeqVector vec(loc, scope, CBioseq_Handle::eCoding_Iupac);
102  string seq_string;
103  vec.GetSeqData(0, vec.size(), seq_string);
104 
106  inst.SetLength(seq_string.size());
107  if (vec.IsProtein()) {
109  inst.SetSeq_data().SetIupacaa().Set().swap(seq_string);
110  } else {
112  inst.SetSeq_data().SetIupacna().Set().swap(seq_string);
114  }
115  } else {
117  inst.SetLength(handle.GetBioseqLength());
118  CDelta_ext& ext = inst.SetExt().SetDelta();
119 
120  ///
121  /// create a delta sequence
122  ///
123 
124  //const CSeq_id& id = sequence::GetId(loc, &scope);
125  //ENa_strand strand = sequence::GetStrand(loc, &scope);
126  TSeqRange range = loc.GetTotalRange();
127 
128  /// first segment: gap out to initial start of seq-loc
129  if (range.GetFrom() != 0) {
130  ext.AddLiteral(range.GetFrom());
131  }
132 
133  CSeq_loc_CI loc_iter(loc);
134  if (loc_iter) {
135  TSeqRange prev = loc_iter.GetRange();
136  ENa_strand strand = loc_iter.GetStrand();
137 
138  do {
139  /// encode a literal for the included bases
140  CRef<CSeq_loc> sl =
141  handle.GetRangeSeq_loc(prev.GetFrom(), prev.GetTo(), strand);
142 
143  CSeqVector vec(*sl, scope, CBioseq_Handle::eCoding_Iupac, strand);
144  string seq_string;
145  vec.GetSeqData(0, vec.size(), seq_string);
146 
147  ext.AddLiteral(seq_string,
149 
150  /// skip to the next segment
151  /// we may need to include a gap
152  ++loc_iter;
153  if (loc_iter) {
154  TSeqRange next = loc_iter.GetRange();
155  ext.AddLiteral(next.GetFrom() - prev.GetTo());
156  prev = next;
157  strand = loc_iter.GetStrand();
158  }
159  }
160  while (loc_iter);
161 
162  /// gap at the end
163  if (prev.GetTo() < handle.GetBioseqLength() - 1) {
164  ext.AddLiteral(handle.GetBioseqLength() - prev.GetTo() - 1);
165  }
166  }
167  }
168 
169  return bioseq;
170 }
171 
172 
173 //////////////////////////////////////////////////////////////////////////////
174 ///
175 /// Sequence Entropy Calculation
176 ///
177 /// This is a straightforward computation of Shannon entropy for all tetramers
178 /// observed in a given sequence. This was tuned to work specifically with DNA
179 /// sequence. While it should also apply for proteins, the denominator is much
180 /// different in that case (20^4 instead of 4^4).
181 ///
182 ///
183 /// The function has been heavily optimized to use caching of partial results.
184 /// For large sequence sets, this caching makes a huge difference. The
185 /// notional normative function we are trying to reproduce is included inline
186 /// below for reference.
187 ///
188 /// Note that a separate implementation is provided here for protein sequences
189 /// The impetus for this is that the alphabet size for proteins is different,
190 /// and in general, we do no have so many computations to perform (fre millions
191 /// vs. billions for short reads). Also, the optimized class assumes a
192 /// constant overall sequence size when computing the denominator values.
193 ///
194 
195 #if 0
196 ///
197 /// This is the normative copy of the function
198 /// Note that this is specific to nucleotides - the code:
199 ///
200 /// double denom = pow(4, word_size);
201 ///
202 /// assumes that the span of the alphabet is 4 characters (A, C, G, T)
203 ///
204 double ComputeNormalizedEntropy(const CTempString& sequence,
205  size_t word_size)
206 {
207  typedef map<CTempString, double> TCounts;
208  TCounts counts;
209  double total = sequence.size() - word_size;
210  for (size_t i = word_size; i < sequence.size(); ++i) {
211  CTempString t(sequence, i - word_size, word_size);
212  TCounts::iterator it =
213  counts.insert(TCounts::value_type(t, 0)).first;
214  it->second += 1;
215  }
216 
217  NON_CONST_ITERATE (TCounts, it, counts) {
218  it->second /= total;
219  }
220 
221  double entropy = 0;
222  ITERATE (TCounts, it, counts) {
223  entropy += it->second * log(it->second);
224  }
225  double denom = pow(4, word_size);
226  denom = min(denom, total);
227  entropy = -entropy / log(denom);
228  return max<double>(0.0, entropy);
229 }
230 #endif
231 
232 
234  size_t word_size)
235 {
236  typedef map<CTempString, double> TCounts;
237  TCounts counts;
238  double total = double(sequence.size() - word_size + 1);
239  for (size_t i = word_size; i <= sequence.size(); ++i) {
240  CTempString t(sequence, i - word_size, word_size);
241  TCounts::iterator it =
242  counts.insert(TCounts::value_type(t, 0)).first;
243  it->second += 1;
244  }
245 
246  //cerr << "sum:" << endl;
247  NON_CONST_ITERATE (TCounts, it, counts) {
248  /**
249  cerr << " " << it->first
250  << " = " << it->second
251  << " = " << it->second / total
252  << " = " << it->second * log10(it->second) << endl;
253  **/
254  it->second /= total;
255  }
256 
257  double entropy = 0;
258  ITERATE (TCounts, it, counts) {
259  entropy += it->second * log10(it->second);
260  }
261  // NOTE:
262  // 20 = count of amino acids
263  double denom = pow(20, word_size);
264  denom = min(denom, total);
265  //cerr << "entropy: " << entropy << " = " << -entropy / log(denom) << endl;
266  entropy = -entropy / log(denom);
267  return max<double>(0.0, entropy);
268 }
269 
270 
271 
272 CEntropyCalculator::CEntropyCalculator(size_t sequence_size, size_t word_size)
273  : m_WordSize(word_size)
274  , m_NumWords(sequence_size - word_size)
275  , m_Denom(log(min((double)m_NumWords,
276  pow(4.0, (int)word_size))))
277 {
278  if (word_size > sequence_size) {
280  "entropy is undefined when the sequence size is "
281  "smaller than the word size");
282  }
283 
284  m_Words.resize(m_NumWords);
285  m_EntropyValues.resize(m_NumWords+1, -1.);
286 
287  /// Special case for count 0; don't calculate, to avoid calling log(0)
288  m_EntropyValues[0] = 0;
289 }
290 
292 {
293  NCBI_ASSERT(sequence.size() - m_WordSize == m_NumWords,
294  "Sequence of wrong length");
295 
296  for (size_t i = 0; i < m_NumWords; ++i) {
297  m_Words[i].assign(sequence, i, m_WordSize);
298  }
299  sort(m_Words.begin(), m_Words.end());
300 
301  double entropy = 0;
302  for (size_t i = 0, count = 1; i < m_NumWords; ++i, ++count) {
303  if (i == m_NumWords-1 || m_Words[i] != m_Words[i+1]) {
304  entropy += x_Entropy(count);
305  count = 0;
306  }
307  }
308  return max<double>(0.0, entropy);
309 }
310 
311 vector<double> CEntropyCalculator::
313 {
314  NCBI_ASSERT(sequence.size() - m_WordSize >= m_NumWords,
315  "Sequence too short");
316 
317  size_t window = m_NumWords + m_WordSize;
318  TCounts counts;
319 
320  for (size_t i = 0; i < m_NumWords; ++i) {
321  CTempString t(sequence, i, m_WordSize);
322  TCounts::iterator it =
323  counts.insert(TCounts::value_type(t, 0)).first;
324  ++it->second;
325  }
326 
327  /// The first half-window all has the same entropy, calculated over the
328  /// beginning of the sequence
329  vector<double> results(window/2+1, x_Entropy(counts));
330 
331  for (size_t pos = 0; pos < sequence.size()-window; ++pos) {
332  CTempString removed_word(sequence, pos, m_WordSize);
333  --counts[removed_word];
334  CTempString added_word(sequence, pos+m_NumWords, m_WordSize);
335  TCounts::iterator it =
336  counts.insert(TCounts::value_type(added_word, 0)).first;
337  ++it->second;
338  results.push_back(x_Entropy(counts));
339  }
340 
341  /// last half-window again has the same entropy
342  results.resize(sequence.size(), x_Entropy(counts));
343  return results;
344 }
345 
346 double CEntropyCalculator::x_Entropy(size_t count)
347 {
348  double &entropy_value = m_EntropyValues[count];
349  if (entropy_value < 0) {
350  /// Lazy evaluation of entropy value
351  double fraction = (double)count / m_NumWords;
352  entropy_value = -fraction * log(fraction) / m_Denom;
353  }
354  return entropy_value;
355 }
356 
358 {
359  double entropy = 0;
360  ITERATE (TCounts, word_it, counts) {
361  entropy += x_Entropy(word_it->second);
362  }
363  return max<double>(0.0, entropy);
364 }
365 
366 double ComputeNormalizedEntropy(const CTempString& sequence,
367  size_t word_size)
368 {
369  return CEntropyCalculator(sequence.size(), word_size)
370  .ComputeEntropy(sequence);
371 }
372 
373 
USING_SCOPE(objects)
double ComputeNormalizedEntropy(const CTempString &sequence, size_t word_size)
Compute the normalized Shannon entropy for a sequence of IUPACna bases.
Definition: util.cpp:366
double ComputeNormalizedProteinEntropy(const CTempString &sequence, size_t word_size)
Sequence Entropy Calculation.
Definition: util.cpp:233
CRef< objects::CBioseq > SeqLocToBioseq(const objects::CSeq_loc &loc, objects::CScope &scope)
Definition: util.cpp:45
CBioseq_Handle –.
CDelta_seq & AddLiteral(TSeqPos len)
add a literal segment at the end this variant adds a gap literal
Definition: Delta_ext.cpp:69
double m_Denom
Definition: util.hpp:59
vector< CTempString > m_Words
Definition: util.hpp:57
double ComputeEntropy(const CTempString &sequence)
Definition: util.cpp:291
vector< double > m_EntropyValues
Definition: util.hpp:58
double x_Entropy(size_t count)
Definition: util.cpp:346
size_t m_WordSize
Definition: util.hpp:55
CEntropyCalculator(size_t sequence_size, size_t word_size)
Definition: util.cpp:272
size_t m_NumWords
Definition: util.hpp:56
vector< double > ComputeSlidingWindowEntropy(const CTempString &sequence)
Definition: util.cpp:312
CSeqVector –.
Definition: seq_vector.hpp:65
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
Definition: Seq_loc.hpp:453
static TSeqPos Pack(CSeq_data *in_seq, TSeqPos uLength=ncbi::numeric_limits< TSeqPos >::max())
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
container_type::iterator iterator
Definition: map.hpp:54
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
container_type::value_type value_type
Definition: map.hpp:52
Definition: map.hpp:338
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define NCBI_ASSERT(expr, mess)
Definition: ncbidbg.hpp:130
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
string GetLabel(const CSeq_id &id)
TRange GetRange(void) const
Get the range.
Definition: Seq_loc.hpp:1042
ENa_strand GetStrand(void) const
Definition: Seq_loc.hpp:1056
TSeqPos GetBioseqLength(void) const
bool IsAa(void) const
CConstRef< CSeq_id > GetSeqId(void) const
Get id which can be used to access this bioseq handle Throws an exception if none is available.
CRef< CSeq_loc > GetRangeSeq_loc(TSeqPos start, TSeqPos stop, ENa_strand strand=eNa_strand_unknown) const
Return CSeq_loc referencing the given range and strand on the bioseq If start == 0,...
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
TSeqPos size(void) const
Definition: seq_vector.hpp:291
bool IsProtein(void) const
Definition: seq_vector.hpp:350
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
size_type size(void) const
Return the length of the represented array.
Definition: tempstr.hpp:327
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
TTitle & SetTitle(void)
Select the variant.
Definition: Seqdesc_.hpp:1039
void SetExt(TExt &value)
Assign a value to Ext data member.
Definition: Seq_inst_.cpp:147
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
void SetRepr(TRepr value)
Assign a value to Repr data member.
Definition: Seq_inst_.hpp:574
void SetLength(TLength value)
Assign a value to Length data member.
Definition: Seq_inst_.hpp:668
void SetSeq_data(TSeq_data &value)
Assign a value to Seq_data data member.
Definition: Seq_inst_.cpp:130
void SetMol(TMol value)
Assign a value to Mol data member.
Definition: Seq_inst_.hpp:621
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
range(_Ty, _Ty) -> range< _Ty >
constexpr auto sort(_Init &&init)
double value_type
The numeric datatype used by the parser.
Definition: muParserDef.h:228
EIPRangeType t
Definition: ncbi_localip.c:101
T log10(T x_)
T min(T x_, T y_)
Modified on Sat Jun 22 10:38:36 2024 by modify_doxy.py rev. 669887