63 bioseq->
SetId().push_back(
id);
70 loc.GetLabel( &title_str );
72 bioseq->
SetDescr().Set().push_back( title );
86 bool process_whole =
false;
89 }
else if (loc.IsInt()) {
109 inst.
SetSeq_data().SetIupacaa().Set().swap(seq_string);
112 inst.
SetSeq_data().SetIupacna().Set().swap(seq_string);
129 if (
range.GetFrom() != 0) {
209 double total = sequence.
size() - word_size;
210 for (
size_t i = word_size;
i < sequence.
size(); ++
i) {
212 TCounts::iterator it =
222 ITERATE (TCounts, it, counts) {
223 entropy += it->second *
log(it->second);
225 double denom = pow(4, word_size);
226 denom =
min(denom, total);
227 entropy = -entropy /
log(denom);
228 return max<double>(0.0, entropy);
238 double total = double(sequence.
size() - word_size + 1);
239 for (
size_t i = word_size;
i <= sequence.
size(); ++
i) {
241 TCounts::iterator it =
258 ITERATE (TCounts, it, counts) {
259 entropy += it->second *
log10(it->second);
263 double denom = pow(20, word_size);
264 denom =
min(denom, total);
266 entropy = -entropy /
log(denom);
267 return max<double>(0.0, entropy);
273 : m_WordSize(word_size)
274 , m_NumWords(sequence_size - word_size)
275 , m_Denom(
log(
min((double)m_NumWords,
276 pow(4.0, (
int)word_size))))
278 if (word_size > sequence_size) {
280 "entropy is undefined when the sequence size is "
281 "smaller than the word size");
294 "Sequence of wrong length");
302 for (
size_t i = 0, count = 1;
i <
m_NumWords; ++
i, ++count) {
308 return max<double>(0.0, entropy);
315 "Sequence too short");
329 vector<double> results(window/2+1,
x_Entropy(counts));
331 for (
size_t pos = 0; pos < sequence.
size()-window; ++pos) {
333 --counts[removed_word];
349 if (entropy_value < 0) {
352 entropy_value = -fraction *
log(fraction) /
m_Denom;
354 return entropy_value;
363 return max<double>(0.0, entropy);
double ComputeNormalizedEntropy(const CTempString &sequence, size_t word_size)
Compute the normalized Shannon entropy for a sequence of IUPACna bases.
double ComputeNormalizedProteinEntropy(const CTempString &sequence, size_t word_size)
Sequence Entropy Calculation.
CRef< objects::CBioseq > SeqLocToBioseq(const objects::CSeq_loc &loc, objects::CScope &scope)
CDelta_seq & AddLiteral(TSeqPos len)
Add a literal segment at the end; this variant adds a gap literal.
vector< CTempString > m_Words
double ComputeEntropy(const CTempString &sequence)
vector< double > m_EntropyValues
double x_Entropy(size_t count)
CEntropyCalculator(size_t sequence_size, size_t word_size)
vector< double > ComputeSlidingWindowEntropy(const CTempString &sequence)
Seq-loc iterator class – iterates all intervals from a seq-loc in the correct order.
static TSeqPos Pack(CSeq_data *in_seq, TSeqPos uLength=ncbi::numeric_limits< TSeqPos >::max())
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
container_type::iterator iterator
iterator_bool insert(const value_type &val)
container_type::value_type value_type
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
#define NCBI_ASSERT(expr, mess)
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
string GetLabel(const CSeq_id &id)
TRange GetRange(void) const
Get the range.
ENa_strand GetStrand(void) const
TSeqPos GetBioseqLength(void) const
CConstRef< CSeq_id > GetSeqId(void) const
Get an id which can be used to access this bioseq handle. Throws an exception if none is available.
CRef< CSeq_loc > GetRangeSeq_loc(TSeqPos start, TSeqPos stop, ENa_strand strand=eNa_strand_unknown) const
Return a CSeq_loc referencing the given range and strand on the bioseq. If start == 0,...
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
bool IsProtein(void) const
void Reset(void)
Reset reference object.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
size_type size(void) const
Return the length of the represented array.
ENa_strand
strand of nucleic acid
TId & SetId(void)
Assign a value to Id data member.
TTitle & SetTitle(void)
Select the variant.
void SetExt(TExt &value)
Assign a value to Ext data member.
void SetInst(TInst &value)
Assign a value to Inst data member.
void SetDescr(TDescr &value)
Assign a value to Descr data member.
void SetRepr(TRepr value)
Assign a value to Repr data member.
void SetLength(TLength value)
Assign a value to Length data member.
void SetSeq_data(TSeq_data &value)
Assign a value to Seq_data data member.
void SetMol(TMol value)
Assign a value to Mol data member.
@ eRepr_delta
sequence made by changes (delta) to others
@ eRepr_raw
continuous sequence
@ eMol_na
just a nucleic acid
unsigned int
A callback function used to compare two keys in a database.
range(_Ty, _Ty) -> range< _Ty >
constexpr auto sort(_Init &&init)
double value_type
The numeric datatype used by the parser.