NCBI C++ ToolKit
util.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef ALGO_SEQUENCE___UTIL__HPP
2 #define ALGO_SEQUENCE___UTIL__HPP
3 
4 /* $Id: util.hpp 83344 2018-08-13 19:25:38Z dicuccio $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Authors: Christiam Camacho
30  *
31  * File Description:
32  *   Sequence entropy calculation utilities and Seq-loc to Bioseq conversion.
33  */
34 
35 #include <corelib/ncbiobj.hpp>
36 #include <objmgr/scope.hpp>
37 
39 
41 class CBioseq;
42 class CSeq_loc;
44 
46 {
47 public:
48  CEntropyCalculator(size_t sequence_size, size_t word_size);
49 
50  double ComputeEntropy(const CTempString& sequence);
51 
52  vector<double> ComputeSlidingWindowEntropy(const CTempString& sequence);
53 
54 private:
55  size_t m_WordSize;
56  size_t m_NumWords;
57  vector<CTempString> m_Words;
58  vector<double> m_EntropyValues;
59  double m_Denom;
60 
62 
63  double x_Entropy(size_t count);
64  double x_Entropy(const TCounts &counts);
65 };
66 
67 double ComputeNormalizedProteinEntropy(const CTempString& sequence,
68  size_t word_size);
69 
70 // Create a virtual Bioseq for masking the original sequence with gaps.
71 // The new sequence will have Seq-id 'new_id'.
72 // Its Seq-inst object will be of type delta, and will hold a reference to
73 // the original sequence ('original_id') and gaps in place of masked ranges.
75 CRef<objects::CBioseq> SeqLocToBioseq(const objects::CSeq_loc& loc,
76  objects::CScope& scope);
77 
78 /// Compute the normalized Shannon entropy for a sequence of IUPACna bases
79 ///
80 /// The algorithm is as described in:
81 ///
82 /// http://en.wikipedia.org/wiki/Shannon_entropy
83 ///
84 /// This follows the scheme for normalized representation, and assumes that the
85 /// input contains IUPACna bases. Ambiguities are not fully handled.
86 /// The return value is a real-valued number in the range 0 to 1. Typical
87 /// interpretations of values are:
88 /// - Less than 0.5 implies that the sequence has low complexity
89 /// - 0.0 implies that the sequence is a homopolymer
90 
91 double ComputeNormalizedEntropy(const CTempString& sequence,
92  size_t word_size);
93 
95 
96 #endif // ALGO_SEQUENCE___UTIL__HPP
double m_Denom
Definition: util.hpp:59
vector< CTempString > m_Words
Definition: util.hpp:57
vector< double > m_EntropyValues
Definition: util.hpp:58
size_t m_WordSize
Definition: util.hpp:55
size_t m_NumWords
Definition: util.hpp:56
map< CTempString, size_t > TCounts
Definition: util.hpp:61
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
Definition: map.hpp:338
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define NCBI_XALGOSEQ_EXPORT
Definition: ncbi_export.h:1017
double ComputeNormalizedEntropy(const CTempString &sequence, size_t word_size)
Compute the normalized Shannon entropy for a sequence of IUPACna bases.
Definition: util.cpp:366
double ComputeNormalizedProteinEntropy(const CTempString &sequence, size_t word_size)
Sequence Entropy Calculation.
Definition: util.cpp:233
CRef< objects::CBioseq > SeqLocToBioseq(const objects::CSeq_loc &loc, objects::CScope &scope)
Definition: util.cpp:45
Portable reference counted smart and weak pointers using CWeakRef, CRef, CObject and CObjectEx.
Modified on Sun May 19 04:44:33 2024 by modify_doxy.py rev. 669887