46 #ifdef NCBI_COMPILER_MSVC
47 # define isfinite _finite
48 #elif defined(NCBI_OS_SOLARIS) && !defined(__builtin_isfinite) \
49 && (!defined(__GNUC__) || (__GNUC__ == 4 && __GNUC_MINOR < 5))
51 # define isfinite finite
59 : m_NumElements(num_elements), m_Diagnol(0.0)
61 if (num_elements > 0) {
62 m_Distances.resize(num_elements * num_elements - num_elements, -1.0);
68 m_NumElements = num_elements;
69 if (num_elements > 0) {
70 m_Distances.resize(num_elements * num_elements - num_elements);
76 if (
i >= m_NumElements || j >= m_NumElements ||
i < 0 || j < 0) {
78 "Distance matrix index out of bounds");
89 int index = (
i*
i -
i) / 2 + j;
90 _ASSERT(index < (
int)m_Distances.size());
92 return m_Distances[index];
97 if (
i >= m_NumElements || j >= m_NumElements ||
i < 0 || j < 0) {
99 "Distance matrix index out of bounds");
104 "Distance matrix diagnol elements cannot be set");
111 int index = (
i*
i -
i) / 2 + j;
112 _ASSERT(index < (
int)m_Distances.size());
114 return m_Distances[index];
142 for (
i=0;
i < ids.size();
i++) {
157 +
" not found in alignment");
165 "Tree was not constructed");
180 seqalign->
SetSegs().SetDenseg(*denseg);
196 double val = mat(
i, j);
197 if (!isfinite(
val) ||
val < 0.0) {
212 vector<string> sequences(num_seqs);
218 const string& query_seq = sequences[query_idx];
220 bitvector.
set(query_idx);
226 for (
int i=0;
i < num_seqs;
i++) {
229 if (
i == query_idx) {
242 links.push_back(
SLink(query_idx,
i, dist));
247 sequences[
i].erase();
257 list<SLink>::const_iterator it1 = links.begin();
260 for (; it1 != links.end() && it1->index1 == query_idx; ++it1) {
263 if (!bitvector.
test(it1->index2)) {
266 const string& seqi = sequences[it1->index2];
269 list<SLink>::const_iterator it2(it1);
273 for (; it2 != links.end() && it2->index1 == query_idx
274 && bitvector.
test(it1->index2); ++it2) {
277 if (!bitvector.
test(it2->index2)) {
280 const string& seqj = sequences[it2->index2];
289 links.push_back(
SLink(it2->index2, it1->index2, dist));
300 bitvector[score_1 > score_2 ? it2->index2 : it1->index2]
311 for (; en.
valid(); ++en) {
312 included.push_back((
int)*en);
318 ITERATE (list<SLink>, it, links) {
320 if (bitvector.
test(it->index1) && bitvector.
test(it->index2)) {
324 int index1 = (it->index1 == 0 ? 0 :
325 bitvector.
count_range(0, it->index1 - 1, rs_index));
327 int index2 = (it->index2 == 0 ? 0 :
328 bitvector.
count_range(0, it->index2 - 1, rs_index));
335 "divergence matrix contains invalid or inifinite values");
338 return included.size() > 1;
342 const CAlnVec& align_data_source,
343 vector<string>& seqids)
350 string seq_id_str =
"";
351 (*seq_id).GetLabel(&seq_id_str);
353 seqids.push_back(seq_id_str);
362 ITERATE (vector<int>, it, used_indices) {
363 for (; index < *it; index++) {
384 vector<string> new_labels;
385 ITERATE (vector<int>, it, used_indices) {
386 new_labels.push_back(
m_Labels[*it]);
440 "Invalid distance calculation method");
452 "distance matrix contains invalid or infinite values");
482 "Invalid tree reconstruction method");
487 "Tree was not created");
496 int num_children = 0;
497 double cumulative_length = 0.0;
501 cumulative_length += (*it)->GetValue().GetDist();
505 double new_len = cumulative_length / (double)num_children;
509 (*it)->GetValue().SetDist(new_len);
531 double dist = node->
GetValue().GetDist();
532 if (!isfinite(dist) || dist < 0.0) {
544 " is not the same as number of sequences");
549 "divergence must be positive");
555 "divergence must be smaller than 0.85 if Kimura distance is"
559 vector<int> used_inds;
570 - (
int)used_inds.size())
571 +
" sequences were discarded due to"
572 " divergence that exceeds maximum allowed.");
579 m_Messages.push_back(
"Sequence dissimilarity exceeds maximum"
603 if(seq_align_set.
Get().size() == 1) {
606 else if(seq_align_set.
Get().size() > 1) {
616 mix.
Merge(merge_flags);
623 "Empty seqalign provided");
642 const char kGap =
'-';
644 seg_info.resize(aln.size());
648 size_t query_start = 0;
649 size_t query_end =
query.length() - 1;
650 while (query_start <
query.length() &&
query[query_start] == kGap) {
653 while (query_end > 0 &&
query[query_end] == kGap) {
658 for (
size_t i=0;
i < aln.size();
i++) {
659 const string& sequence = aln[
i];
660 vector<TRange>& segs = seg_info[
i];
662 _ASSERT(query_end < sequence.length());
666 for (
size_t k=query_start;k <= query_end;k++) {
667 if (sequence[k] != kGap) {
672 const int kMinSplit =
max(seq_len / 20, 4);
674 while (p < sequence.length()) {
677 while (p < sequence.length() && (p < query_start ||
678 sequence[p] == kGap)) {
686 while (p < sequence.length() && gaps < kMinSplit && p <= query_end) {
689 while (p < sequence.length() && p <= query_end &&
690 sequence[p] != kGap) {
699 while (p < sequence.length() && p <= query_end &&
700 sequence[p] == kGap) {
709 if (seg.
GetLength() > 4 || segs.empty()) {
static CRef< CScope > m_Scope
TDim GetNumRows(void) const
const CDense_seg & GetDenseg(void) const
void Add(const CDense_seg &ds, TAddFlags flags=0)
void Merge(TMergeFlags flags=0)
const CSeq_align & GetSeqAlign(void) const
const CBioseq_Handle & GetBioseqHandle(TNumrow row) const
void SetEndChar(TResidue gap_char)
void SetGapChar(TResidue gap_char)
string & GetWholeAlnSeqString(TNumrow row, string &buffer, TSeqPosList *insert_aln_starts=0, TSeqPosList *insert_starts=0, TSeqPosList *insert_lens=0, unsigned int scrn_width=0, TSeqPosList *scrn_lefts=0, TSeqPosList *scrn_rights=0) const
int CalculateScore(TNumrow row1, TNumrow row2) const
CRef< CDense_seg > ExtractRows(const vector< TDim > &rows) const
Extract specified rows of the alignment, in specified order.
void Assign(const CSerialObject &obj, ESerialRecursionMode how=eRecursive)
overloaded Assign()
static TTree * RerootTree(TTree *tree, TTree *node=NULL)
Reroot tree, new root is placed in the middle of the edge specified by node.
static TTree * NjTree(const TMatrix &dist_mat, const vector< string > &labels=vector< string >())
Compute a tree by neighbor joining; as per Hillis et al.
static double FractionIdentity(const string &seq1, const string &seq2)
Calculate pairwise fraction identity based on positions where both sequences have a base/residue.
static void GrishinGeneralDist(const TMatrix &frac_diff, TMatrix &result)
Grishin's distance for protein sequences 1 - p = ln(1 + 2d) / 2d.
static void GrishinDist(const TMatrix &frac_diff, TMatrix &result)
Grishin's distance for protein sequences: 1 - p = (1 - e^(2*d)) / (2 * d) approximated with d = p(2 +...
static void PoissonDist(const TMatrix &frac_diff, TMatrix &result)
Simple distance calculation for protein sequences: d = -ln(1 - p).
static TTree * FastMeTree(const TMatrix &dist_mat, const vector< string > &labels=vector< string >(), EFastMePar btype=eOls, EFastMePar wtype=eOls, EFastMePar ntype=eBalanced)
Compute a tree using the fast minimum evolution algorithm.
static void KimuraDist(const TMatrix &frac_diff, TMatrix &result)
Kimura's distance for protein sequences: d = -ln(1 - p - 0.2p^2).
static void JukesCantorDist(const TMatrix &frac_diff, TMatrix &result)
Jukes-Cantor distance calculation for DNA sequences: d = -3/4 ln(1 - (4/3)p).
void Resize(size_t i, size_t j, T val=T())
resize this matrix, filling the empty cells with a known value
size_t GetRows() const
get the number of rows in this matrix
Guide tree calc exceptions.
Distance matrix (square, symmetric with zeros on diagnol)
CDistMatrix(int num_elements=0)
Constructor.
const double & operator()(int i, int j) const
Get distance value.
bool Empty(void) const
Is matrix size zero.
void Resize(int num_elements)
Resize matrix.
vector< double > m_Distances
Storage for distance values.
int GetNumElements(void) const
Get number of rows/columns.
CDistMatrix m_DivergenceMatrix
Matrix of percent identities based distances.
CRef< CSeq_align > GetSeqAlign(void) const
Get seq_align that corresponds to current tree.
CRef< CBioTreeContainer > GetSerialTree(void) const
Get serial tree.
vector< string > m_RemovedSeqIds
Sequences that are not included in the tree.
bool m_CalcSegInfo
Calculate segment positions.
vector< int > m_RemovedSeqIndeces
Indeces of sequences that are not included in the tree.
bool CalcBioTree(void)
Compute bio tree for the current alignment in a black box manner.
CDistMethods::TMatrix m_FullDistMatrix
Full distance matrix, this data structure is required by CDistMethods functions.
TPhyTreeNode * m_Tree
Computed tree.
void x_CalcDistMatrix(void)
Compute distance as evolutionary corrected dissimilarity.
const vector< CRef< CSeq_id > > & GetSeqIds(void) const
Get seq-ids of sequences used in tree construction.
void x_Init(void)
Initialize class attributes.
@ eFastME
Fast Minumum Evolution.
int m_QueryIdx
Index of query sequence in the input alignment.
bool x_CalcDivergenceMatrix(vector< int > &used_indices)
Compute divergence matrix and find sequences to exclude from tree reconstruction.
CPhyTreeCalc(const CSeq_align &seq_aln, CRef< CScope > scope)
Create CPhyTreeCalc from Seq-align.
void x_CalcAlnSegInfo(const vector< string > &aln, TSegInfo &seg_info)
Calculate the alignment segemnts summary.
EDistMethod m_DistMethod
Method of calculating evolutionary distance correction.
ETreeMethod m_TreeMethod
Method of calculating tree.
vector< string > m_Messages
Error/warning messages.
CDistMatrix m_DistMatrix
Matrix of evolutionary distance.
CRef< CScope > m_Scope
Scope.
double m_MaxDivergence
Maximum allowed divergence between sequences.
vector< string > m_Labels
Labels for tree leaves.
void SetQuery(int index)
Set query sequence by index in alignment.
TSegInfo m_SegInfo
Positions of segments in mulitple or query anchored alignment.
CRef< CAlnVec > m_AlignDataSource
Alignment data source.
void x_InitAlignDS(const CSeq_align &seq_aln)
Initialize alignment data source.
void x_CorrectBranchLengths(TPhyTreeNode *node)
Change non-finite and negative tree branch lengths to zeros.
vector< vector< TRange > > TSegInfo
void x_CreateValidAlign(const vector< int > &used_indices)
Create alignment only for sequences included in tree computation.
void x_ComputeTree(bool correct=true)
Compute phylogenetic tree.
definition of a Culling tree
Constant iterator designed to enumerate "ON" bits counted_enumerator keeps bitcount,...
bool valid() const noexcept
Checks if iterator is still valid.
bool test(size_type n) const noexcept
returns true if bit n is set and false is bit n is 0.
bvector< Alloc > & set(size_type n, bool val=true)
Sets bit n if val is true, clears bit n if val is false.
size_type count_range(size_type left, size_type right, const rs_index_type &rs_idx) const noexcept
Returns count of 1 bits in the given range [left..right] Uses rank-select index to accelerate the sea...
enumerator first() const
Returns enumerator pointing on the first non-zero bit.
void build_rs_index(rs_index_type *rs_idx, bvector< Alloc > *bv_blocks=0) const
compute running total of all blocks in bit vector (rank-select index)
size_type count() const noexcept
population count (count of ON bits)
Rank-Select acceleration index.
CRef< objects::CBioTreeContainer > MakeBioTreeContainer(const TPhyTreeNode *tree)
Conversion from TPhyTreeNode to CBioTreeContainer.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
const string AsFastaString(void) const
CConstRef< CSeq_id > GetSeqId(void) const
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
@ eGetId_Best
return the "best" gi (uses FindBestScore(), with CSeq_id::CalculateScore() as the score function
bool IsSameBioseq(const CSeq_id_Handle &id1, const CSeq_id_Handle &id2, EGetBioseqFlag get_flag)
Check if two seq-ids are resolved to the same Bioseq.
@ eGetBioseq_All
Search bioseq, load if not loaded yet.
void Reset(void)
Reset reference object.
position_type GetLength(void) const
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
TNodeList::iterator TNodeList_I
TNodeList_CI SubNodeBegin(void) const
Return first const iterator on subnode list.
TNodeList::const_iterator TNodeList_CI
bool IsLeaf() const
Report whether this is a leaf node.
TNodeList_CI SubNodeEnd(void) const
Return last const iterator on subnode list.
const TValue & GetValue(void) const
Return node's value.
const TDenseg & GetDenseg(void) const
Get the variant data.
void SetSegs(TSegs &value)
Assign a value to Segs data member.
void SetDim(TDim value)
Assign a value to Dim data member.
void SetType(TType value)
Assign a value to Type data member.
vector< CRef< CSeq_id > > TIds
TDim GetDim(void) const
Get the Dim member data.
const TIds & GetIds(void) const
Get the Ids member data.
const Tdata & Get(void) const
Get the member data.
const TSegs & GetSegs(void) const
Get the Segs member data.
list< CRef< CSeq_align > > TAlign
Compressed bitset (entry point to bm.h)
Floating-point support routines.
The NCBI C++/STL use hints.
bool s_ValidateMatrix(const CPhyTreeCalc::CDistMatrix &mat)
static void s_RecordSeqId(int index, const CAlnVec &align_data_source, vector< string > &seqids)
Structure for storing divergences between sequences as list of links.