NCBI C++ ToolKit
dist.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's offical duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================*/
25 
26 /*****************************************************************************
27 
28 File name: dist.cpp
29 
30 Author: Jason Papadopoulos
31 
32 Contents: Implementation of CDistances
33 
34 ******************************************************************************/
35 
36 #include <ncbi_pch.hpp>
37 #include <algo/cobalt/dist.hpp>
38 
39 /// @file dist.cpp
40 /// Implementation of CDistances
41 
43 BEGIN_SCOPE(cobalt)
44 
45 /// Update the extent of a sequence that figures into its self
46 /// score, based upon the sequence ranges of a pairwise alignment
47 /// @param seq1 The current sequence range [in/out]
48 /// @param align1 The range of the current alignment on sequence 1
49 /// @param seq1_length Length of sequence 1
50 /// @param align2 The range of the current alignment on sequence 2
51 /// @param seq2_length Length of sequence 2
52 ///
53 static void
55  TRange align1,
56  int seq1_length,
57  TRange align2,
58  int seq2_length)
59 {
60  // extend the alignment bounds to the left and right
61  // until one sequence runs out.
62 
63  TOffset first = min(seq1.GetFrom(), align1.GetFrom() - align2.GetFrom());
64  first = max(first, 0);
65 
66  TOffset second = max(seq1.GetTo(),
67  align1.GetTo() + (seq2_length - 1 - align2.GetTo()));
68  second = min(second, seq1_length - 1);
69 
70  seq1.Set(first, second);
71 }
72 
73 void
74 CDistances::x_GetSelfScores(vector<CSequence>& query_data,
75  CHitList& hitlist,
76  SNCBIFullScoreMatrix& score_matrix,
77  vector<double>& self_score,
78  Blast_KarlinBlk& karlin_blk)
79 {
80  // Fill in the self scores for each sequence
81 
82  int num_queries = query_data.size();
83  vector<TRange> seq_ranges(num_queries);
84 
85  // initialize the range of each sequence to
86  // an impossible value
87 
88  for (int i = 0; i < num_queries; i++) {
89  self_score[i] = 0.0;
90  seq_ranges[i].SetEmpty();
91  }
92 
93  // for each alignment between two sequences, extend
94  // the range of the alignment to left and right until
95  // one sequence or the other runs out
96 
97  for (int i = 0; i < hitlist.Size(); i++) {
98  CHit *hit = hitlist.GetHit(i);
99  int seq1 = hit->m_SeqIndex1;
100  int seq2 = hit->m_SeqIndex2;
101  TRange& range1 = seq_ranges[seq1];
102  TRange& range2 = seq_ranges[seq2];
103 
104  x_UpdateRanges(range1,
105  hit->m_SeqRange1, query_data[seq1].GetLength(),
106  hit->m_SeqRange2, query_data[seq2].GetLength());
107  x_UpdateRanges(range2,
108  hit->m_SeqRange2, query_data[seq2].GetLength(),
109  hit->m_SeqRange1, query_data[seq1].GetLength());
110  }
111 
112  // for each sequence
113 
114  for (int i = 0; i < num_queries; i++) {
115 
116  int score = 0;
117  TRange& range = seq_ranges[i];
118 
119  // If there are no hits to limit the self-score computation,
120  // sum the entire query sequence
121 
122  if (range.Empty())
123  range.Set(0, query_data[i].GetLength() - 1);
124 
125  // otherwise, sum the portion of the sequence needed to
126  // envelop all other sequences that align to it. If all
127  // sequences are about the same size then this is likely to
128  // just sum all sequences in their entirety. However, if there
129  // is a big disparity in sequence lengths, only the sequence
130  // ranges that an aligner would concentrate on will contribute
131  // to the self score for that sequence
132 
133  for (int j = range.GetFrom(); j <= range.GetTo(); j++) {
134  unsigned char c = query_data[i].GetLetter(j);
135  score += score_matrix.s[c][c];
136  }
137  self_score[i] = karlin_blk.Lambda * score - karlin_blk.logK;
138  }
139 
140 #if 0
141  //---------------------
142  printf("Self scores:\n");
143  for (int i = 0; i < num_queries; i++) {
144  printf("query %d(%d): range %d-%d score %lf\n", i,
145  query_data[i].GetLength(),
146  seq_ranges[i].GetFrom(), seq_ranges[i].GetTo(),
147  self_score[i]);
148  }
149  printf("\n\n");
150  //---------------------
151 #endif
152 }
153 
154 
155 void
156 CDistances::ComputeMatrix(vector<CSequence>& query_data,
157  CHitList& hitlist,
158  SNCBIFullScoreMatrix& score_matrix,
159  Blast_KarlinBlk& karlin_blk)
160 {
161  int num_queries = query_data.size();
162  vector<double> self_score(num_queries);
163 
164  // Get the self score of each sequence
165 
166  x_GetSelfScores(query_data, hitlist, score_matrix,
167  self_score, karlin_blk);
168 
169  // All sequences start out equally far from each other
170 
171  m_Matrix.Resize(num_queries, num_queries);
172  m_Matrix.Set(1.0);
173 
174  // Every pairwise alignment pulls the two sequences involved
175  // in the alignment closer together. This formulation of
176  // pairwise distances amounts to calculating the average
177  // per-letter distance between the two sequences. See
178  //
179  // Clarke et al, "Inferring Genome Trees by Using a Filter
180  // to Eliminate Phylogenetically Discordant Sequences and
181  // a Distance Matrix Based on Mean Normalized BLASTP Scores",
182  // Jour. Bacteriology Apr. 2002 pp 2072-2080
183 
184  for (int i = 0; i < hitlist.Size(); i++) {
185  CHit *hit = hitlist.GetHit(i);
186  int j = hit->m_SeqIndex1;
187  int k = hit->m_SeqIndex2;
188  double align_score = karlin_blk.Lambda *
189  hit->m_Score - karlin_blk.logK;
190 
191  _ASSERT(j < k);
192  m_Matrix(j,k) -= 0.5 * align_score *
193  (1.0 / self_score[j] + 1.0 / self_score[k]);
194  }
195 
196  // Force distance matrix to be symmetric
197 
198  for (int i = 0; i < num_queries; i++) {
199  m_Matrix(i, i) = 0.0;
200  for (int j = 0; j < i; j++) {
201  // clamp to zero any distances that are negative
202  // or too small
203  if (fabs(m_Matrix(j, i)) < 1e-6)
204  m_Matrix(j, i) = 0;
205 
206  if (m_Matrix(j, i) < 0)
207  m_Matrix(j, i) = 0;
208 
209  m_Matrix(i, j) = m_Matrix(j, i);
210  }
211  }
212 }
213 
214 END_SCOPE(cobalt)
int TOffset
Basic data type for offsets into a sequence.
Definition: base.hpp:49
CDistMethods::TMatrix m_Matrix
Current distance matrix.
Definition: dist.hpp:106
void x_GetSelfScores(vector< CSequence > &query_data, CHitList &hitlist, SNCBIFullScoreMatrix &score_matrix, vector< double > &self_score, Blast_KarlinBlk &karlin_blk)
Compute the self-scores of the input sequences.
Definition: dist.cpp:74
void ComputeMatrix(vector< CSequence > &query_data, CHitList &hitlist, SNCBIFullScoreMatrix &score_matrix, Blast_KarlinBlk &karlin_blk)
Recompute the distance matrix using new parameters.
Definition: dist.cpp:156
An ordered collection of CHit objects.
Definition: hitlist.hpp:50
int Size() const
Retrieve number of hits in list.
Definition: hitlist.hpp:75
CHit * GetHit(int index)
Retrieve a hit from the hitlist.
Definition: hitlist.hpp:93
A generalized representation of a pairwise alignment.
Definition: hit.hpp:86
int m_Score
Score of alignment.
Definition: hit.hpp:104
int m_SeqIndex1
Numerical identifier for first sequence in alignment.
Definition: hit.hpp:97
int m_SeqIndex2
Numerical identifier for second sequence in alignment.
Definition: hit.hpp:101
TRange m_SeqRange1
The range of offsets on the first sequence.
Definition: hit.hpp:107
TRange m_SeqRange2
The range of offsets on the second sequence.
Definition: hit.hpp:110
void Resize(size_t i, size_t j, T val=T())
resize this matrix, filling the empty cells with a known value
Definition: matrix.hpp:390
void Set(T val)
set all values in the matrix to a known value
Definition: matrix.hpp:417
static void x_UpdateRanges(TRange &seq1, TRange align1, int seq1_length, TRange align2, int seq2_length)
Update the extent of a sequence that figures into its self score, based upon the sequence ranges of a...
Definition: dist.cpp:54
Interface for CDistances class.
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
position_type GetLength(void) const
Definition: range.hpp:158
TThisType & Set(position_type from, position_type to)
Definition: range.hpp:188
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
int i
range(_Ty, _Ty) -> range< _Ty >
#define fabs(v)
Definition: ncbi_dispd.c:46
T max(T x_, T y_)
T min(T x_, T y_)
Structure to hold the Karlin-Altschul parameters.
Definition: blast_stat.h:66
double Lambda
Lambda value used in statistics.
Definition: blast_stat.h:67
double logK
natural log of K value used in statistics
Definition: blast_stat.h:69
TNCBIScore s[128][128]
Definition: raw_scoremat.h:87
#define _ASSERT
Modified on Wed Apr 17 13:08:33 2024 by modify_doxy.py rev. 669887