NCBI C++ ToolKit
conservation_colorer.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: conservation_colorer.cpp 33815 2007-05-04 17:18:18Z kazimird $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Paul Thiessen
27 *
28 * File Description:
29 * Classes to color alignment blocks by sequence conservation
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbistd.hpp>
36 #include <corelib/ncbi_limits.h>
38 
40 
42 #include "conservation_colorer.hpp"
43 #include "cn3d_tools.hpp"
44 #include "cn3d_pssm.hpp"
45 
46 #include <math.h>
47 
49 
50 
51 BEGIN_SCOPE(Cn3D)
52 
// NOTE(review): the declaration line and the body of this definition are absent from
// this listing (only the braces on listing lines 54 and 56 survive). Presumably this is
// ScreenResidueCharacter(char), which the functions below call - confirm against the
// original file before relying on it.
54 {
56 }
57 
58 int GetBLOSUM62Score(char a, char b)
59 {
60  static SNCBIFullScoreMatrix Blosum62Matrix;
61  static bool unpacked = false;
62 
63  if (!unpacked) {
64  NCBISM_Unpack(&NCBISM_Blosum62, &Blosum62Matrix);
65  unpacked = true;
66  }
67 
68  return Blosum62Matrix.s[(int)ScreenResidueCharacter(a)][(int)ScreenResidueCharacter(b)];
69 }
70 
// Constructor initializer list and (empty) body; the signature line is absent from this
// listing. Stores the parent alignment pointer, starts with zero profile columns, and
// marks both the basic and the fit color sets as not current, so neither Calculate*
// routine below will take its "already current" early-return path on first use.
72  alignment(parent), nColumns(0), basicColorsCurrent(false), fitColorsCurrent(false)
73 {
74 }
75 
// Body of the block-registration routine (the signature line is absent from this
// listing; presumably ConservationColorer::AddBlock, going by the error message below).
// Rejects a block that does not belong to the associated alignment; otherwise appends
// the block's columns to the running profile, one profile column per block column.
77 {
78  // sanity check
79  if (!block->IsFrom(alignment)) {
80  ERRORMSG("ConservationColorer::AddBlock : block is not from the associated alignment");
81  return;
82  }
83 
    // one profile index slot per column of this block
84  blocks[block].resize(block->width);
85 
86  // map block column position to profile position
    // (indices are handed out consecutively, starting at the current column total)
87  for (unsigned int i=0; i<block->width; ++i) blocks[block][i] = nColumns + i;
88  nColumns += block->width;
89 
    // NOTE(review): listing line 90 is missing here; presumably it marks the cached
    // colors as stale - confirm against the original file.
91 }
92 
// Body of the basic conservation-color calculation (the signature line is absent from
// this listing; presumably ConservationColorer::CalculateBasicConservationColors, going
// by the trace message below).
//
// For every profile column of every registered block this rebuilds the residue-count
// profile and derives four per-column measures from it: identity, variety,
// BLOSUM62-weighted variety, and information content. The measures are then turned into
// a 0..1 scale value per column and used to pick colors.
//
// NOTE(review): listing lines 189-190, 208 and 216 are missing below, so the statements
// that size and fill the weighted-variety and information-content color vectors are not
// visible here - confirm against the original file.
94 {
    // nothing to do when the colors are already current or no block has been added
95  if (basicColorsCurrent || blocks.size() == 0) return;
96 
97  TRACEMSG("calculating basic conservation colors");
98 
99  int nRows = alignment->NRows();
100 
101  ColumnProfile::iterator p, pe, p2;
102  int row, profileColumn;
103  alignmentProfile.resize(nColumns);
104 
    // per-column scratch values, indexed by profile column and zero-initialized
105  typedef vector < int > IntVector;
106  IntVector varieties(nColumns, 0), weightedVarieties(nColumns, 0);
107  identities.resize(nColumns);
108  int minVariety=0, maxVariety=0, minWeightedVariety=0, maxWeightedVariety=0;
109 
110  typedef vector < float > FloatVector;
111  FloatVector informationContents(nColumns, 0.0f);
112  float totalInformationContent = 0.0f;
113 
114  BlockMap::const_iterator b, be = blocks.end();
115  for (b=blocks.begin(); b!=be; ++b) {
116 
117  for (unsigned int blockColumn=0; blockColumn<b->first->width; ++blockColumn) {
    // b->second maps this block's column to its profile column (filled in when the
    // block was registered)
118  profileColumn = b->second[blockColumn];
119  ColumnProfile& profile = alignmentProfile[profileColumn];
120 
121  // create profile for this column
    // (screened residue character -> number of rows showing it in this column)
122  profile.clear();
123  for (row=0; row<nRows; ++row) {
124  char ch = ScreenResidueCharacter(b->first->GetCharacterAt(blockColumn, row));
125  if ((p=profile.find(ch)) != profile.end())
126  ++(p->second);
127  else
128  profile[ch] = 1;
129  }
130  pe = profile.end();
131 
132  // identity for this column
    // (true only when every row shows the same residue and that residue is not 'X')
133  if (profile.size() == 1 && profile.begin()->first != 'X')
134  identities[profileColumn] = true;
135  else
136  identities[profileColumn] = false;
137 
138  // variety for this column
    // (number of distinct residue characters, with 'X' counted once per occurrence)
139  int& variety = varieties[profileColumn];
140  variety = profile.size();
141  if (profile.find('X') != profile.end())
142  variety += profile['X'] - 1; // each 'X' counts as one variety
    // the very first column processed seeds the running min/max
143  if (blockColumn == 0 && b == blocks.begin()) {
144  minVariety = maxVariety = variety;
145  } else {
146  if (variety < minVariety) minVariety = variety;
147  else if (variety > maxVariety) maxVariety = variety;
148  }
149 
150  // weighted variety for this column
    // Sum of BLOSUM62 scores over all unordered pairs of rows in this column:
    // count*(count-1)/2 same-residue pairs for each residue type, plus countA*countB
    // pairs for each pair of distinct residue types (p2 starts just after p, so each
    // distinct pair is visited once).
151  int& weightedVariety = weightedVarieties[profileColumn];
152  for (p=profile.begin(); p!=pe; ++p) {
153  weightedVariety +=
154  (p->second * (p->second - 1) / 2) * GetBLOSUM62Score(p->first, p->first);
155  p2 = p;
156  for (++p2; p2!=pe; ++p2)
157  weightedVariety +=
158  p->second * p2->second * GetBLOSUM62Score(p->first, p2->first);
159  }
160  if (blockColumn == 0 && b == blocks.begin()) {
161  minWeightedVariety = maxWeightedVariety = weightedVariety;
162  } else {
163  if (weightedVariety < minWeightedVariety) minWeightedVariety = weightedVariety;
164  else if (weightedVariety > maxWeightedVariety) maxWeightedVariety = weightedVariety;
165  }
166 
167  // information content (calculated in bits -> logs of base 2) for this column
    // Sum over residues of obsFreq * log2(obsFreq / expFreq), where obsFreq is the
    // fraction of rows showing the residue and expFreq comes from
    // GetStandardProbability(). A residue contributes nothing when its expected
    // frequency or its frequency ratio is not above the threshold.
168  float &columnInfo = informationContents[profileColumn];
169  for (p=profile.begin(); p!=pe; ++p) {
170  static const float ln2 = log(2.0f), threshhold = 0.0001f;
171  float residueScore = 0.0f, expFreq = GetStandardProbability(p->first);
172  if (expFreq > threshhold) {
173  float obsFreq = 1.0f * p->second / nRows,
174  freqRatio = obsFreq / expFreq;
175  if (freqRatio > threshhold) {
176  residueScore = obsFreq * ((float) log(freqRatio)) / ln2;
177  columnInfo += residueScore; // information content
178  }
179  }
180  }
181  totalInformationContent += columnInfo;
182  }
183  }
184 
185  INFOMSG("Total information content of aligned blocks: " << totalInformationContent << " bits");
186 
187  // now assign colors
188  varietyColors.resize(nColumns);
191 
192  double scale;
193  for (profileColumn=0; profileColumn<nColumns; ++profileColumn) {
194 
195  // variety
    // scale is 1.0 for the column(s) with the lowest variety and 0.0 for the highest;
    // when all columns have equal variety every column gets 1.0
196  if (maxVariety == minVariety)
197  scale = 1.0;
198  else
199  scale = 1.0 - 1.0 * (varieties[profileColumn] - minVariety) / (maxVariety - minVariety);
200  varietyColors[profileColumn] = GlobalColors()->Get(Colors::eConservationMap, scale);
201 
202  // weighted variety
    // scale is 0.0 at the lowest summed pair score and 1.0 at the highest
203  if (maxWeightedVariety == minWeightedVariety)
204  scale = 1.0;
205  else
206  scale = 1.0 * (weightedVarieties[profileColumn] - minWeightedVariety) /
207  (maxWeightedVariety - minWeightedVariety);
209 
210  // information content, based on absolute scale
    // (linear map of [minInform, maxInform] onto [0, 1], clamped at both ends)
211  static const float minInform = 0.10f, maxInform = 6.24f;
212  scale = (informationContents[profileColumn] - minInform) / (maxInform - minInform);
213  if (scale < 0.0) scale = 0.0;
214  else if (scale > 1.0) scale = 1.0;
215  scale = sqrt(scale); // apply non-linearity so that lower values are better distinguished
217  }
218 
    // basic colors are now up to date; fit colors are flagged for recalculation
219  basicColorsCurrent = true;
220  fitColorsCurrent = false;
221 }
222 
// Body of the fit-color calculation (the signature line is absent from this listing;
// presumably ConservationColorer::CalculateFitConservationColors, going by the trace
// message below).
//
// Steps visible here:
//   1. per-column, per-residue fit scores taken from the alignment's PSSM;
//   2. per-block, per-row "block fit" = average residue fit across the block;
//   3. per-block Z-scores of block fit across rows;
//   4. per-row Z-scores of block fit across blocks (only when there are >= 2 blocks);
//   5. conversion of each score set to a 0..1 scale for color lookup.
//
// NOTE(review): listing lines 241, 258, 356, 363, 380, 388 and 407 are missing below.
// These presumably hold the BlockRowScores typedef, the first GetPSSMScore argument,
// and the statements that size/fill the block color containers - confirm against the
// original file.
224 {
    // nothing to do when the fit colors are already current or no block has been added
225  if (fitColorsCurrent || blocks.size() == 0) return;
226 
227  CalculateBasicConservationColors(); // also includes profile
228 
229  TRACEMSG("calculating fit conservation colors");
230 
231  int nRows = alignment->NRows();
232 
233  ColumnProfile::iterator p, pe;
234  int row, profileColumn;
235 
    // fitScores[profileColumn][residue] -> PSSM score of that residue at that column
236  typedef map < char, int > CharIntMap;
237  vector < CharIntMap > fitScores(nColumns);
238  int minFit=0, maxFit=0;
239 
240  typedef vector < float > FloatVector;
    // three score tables keyed by block, each holding one value per row
242  BlockRowScores blockFitScores, blockZFitScores, blockRowFitScores;
243  float minBlockFit=0.0f, maxBlockFit=0.0f, minBlockZFit=0.0f, maxBlockZFit=0.0f, minBlockRowFit=0.0f, maxBlockRowFit=0.0f;
244 
245  BlockMap::const_iterator b, be = blocks.end();
246  for (b=blocks.begin(); b!=be; ++b) {
247  blockFitScores[b->first].resize(nRows, 0.0f);
248 
249  for (unsigned int blockColumn=0; blockColumn<b->first->width; ++blockColumn) {
250  profileColumn = b->second[blockColumn];
251  ColumnProfile& profile = alignmentProfile[profileColumn];
252  pe = profile.end();
253 
254  // fit scores
    // one PSSM lookup per distinct residue seen in this column; the position argument
    // is the first row's (row 0) range start plus the column offset within the block
255  for (p=profile.begin(); p!=pe; ++p) {
256  int& fit = fitScores[profileColumn][p->first];
257  fit = alignment->GetPSSM().GetPSSMScore(
259  b->first->GetRangeOfRow(0)->from + blockColumn);
    // the very first score computed seeds the global min/max
260  if (blockColumn == 0 && b == blocks.begin() && p == profile.begin()) {
261  minFit = maxFit = fit;
262  } else {
263  if (fit < minFit) minFit = fit;
264  else if (fit > maxFit) maxFit = fit;
265  }
266  }
267 
268  // add up residue fit scores to get block fit scores
269  for (row=0; row<nRows; ++row) {
270  char ch = ScreenResidueCharacter(b->first->GetCharacterAt(blockColumn, row));
271  blockFitScores[b->first][row] += fitScores[profileColumn][ch];
272  }
273  }
274 
275  // find average/min/max block fit
    // (min/max are tracked across all blocks and rows; 'average' is per block)
276  float average = 0.0f;
277  for (row=0; row<nRows; ++row) {
278  float& score = blockFitScores[b->first][row];
279  score /= b->first->width; // average fit score across the block for this row
280  average += score;
281  if (row == 0 && b == blocks.begin()) {
282  minBlockFit = maxBlockFit = score;
283  } else {
284  if (score < minBlockFit) minBlockFit = score;
285  else if (score > maxBlockFit) maxBlockFit = score;
286  }
287  }
288  average /= nRows;
289 
290  // calculate block Z scores from block fit scores
    // (needs at least two rows because the variance is divided by nRows - 1)
291  if (nRows >= 2) {
292  // calculate standard deviation of block fit score over all rows of this block
293  float stdDev = 0.0f;
294  for (row=0; row<nRows; ++row)
295  stdDev += (blockFitScores[b->first][row] - average) *
296  (blockFitScores[b->first][row] - average);
297  stdDev /= nRows - 1;
298  stdDev = sqrt(stdDev);
    // when the deviation is (near) zero no entry is created for this block in
    // blockZFitScores; the coloring pass below tests for that with find()
299  if (stdDev > 1e-10) {
300  // calculate Z scores for each row
301  blockZFitScores[b->first].resize(nRows);
302  for (row=0; row<nRows; ++row)
303  blockZFitScores[b->first][row] = (blockFitScores[b->first][row] - average) / stdDev;
304  }
305  }
306  }
307 
308  // calculate row fit scores based on Z-scores for each block across a given row
309  if (blocks.size() >= 2) {
    // kMin_Float marks "no score for this row"; it is overwritten below only for rows
    // whose standard deviation across blocks is above the cutoff
310  for (b=blocks.begin(); b!=be; ++b)
311  blockRowFitScores[b->first].resize(nRows, kMin_Float);
312 
313  // calculate row average, standard deviation, and Z-scores
314  for (row=0; row<nRows; ++row) {
315  float average = 0.0f;
316  for (b=blocks.begin(); b!=be; ++b)
317  average += blockFitScores[b->first][row];
318  average /= blocks.size();
319  float stdDev = 0.0f;
320  for (b=blocks.begin(); b!=be; ++b)
321  stdDev += (blockFitScores[b->first][row] - average) *
322  (blockFitScores[b->first][row] - average);
323  stdDev /= blocks.size() - 1;
324  stdDev = sqrt(stdDev);
325  if (stdDev > 1e-10) {
326  for (b=blocks.begin(); b!=be; ++b)
327  blockRowFitScores[b->first][row] = (blockFitScores[b->first][row] - average) / stdDev;
328  }
329  }
330  }
331 
332  // now assign colors
333  double scale;
    // NOTE(review): fitColors is sized to nRows * nColumns, but the loop below only
    // indexes entries [0, nColumns) - confirm the intended size.
334  fitColors.resize(nRows * nColumns);
335  for (profileColumn=0; profileColumn<nColumns; ++profileColumn) {
336  // fit
    // (one color per residue seen in the column, scaled between the global min/max fit)
337  CharIntMap::const_iterator c, ce = fitScores[profileColumn].end();
338  for (c=fitScores[profileColumn].begin(); c!=ce; ++c) {
339  if (maxFit == minFit)
340  scale = 1.0;
341  else
342  scale = 1.0 * (c->second - minFit) / (maxFit - minFit);
343  fitColors[profileColumn][c->first] = GlobalColors()->Get(Colors::eConservationMap, scale);
344  }
345  }
346 
347  // block fit
    // (scaled between the min/max block fit found over all blocks and rows; the
    // statement that stores the color is on missing listing line 356)
348  blockFitColors.clear();
349  for (b=blocks.begin(); b!=be; ++b) {
350  blockFitColors[b->first].resize(nRows);
351  for (row=0; row<nRows; ++row) {
352  if (maxBlockFit == minBlockFit)
353  scale = 1.0;
354  else
355  scale = 1.0 * (blockFitScores[b->first][row] - minBlockFit) / (maxBlockFit - minBlockFit);
357  }
358  }
359 
360  // block Z fit
    // (min/max are recomputed for each block, so the scale is relative within a block)
361  blockZFitColors.clear();
362  for (b=blocks.begin(); b!=be; ++b) {
364  if (blockZFitScores.find(b->first) != blockZFitScores.end()) { // if this column has scores
365  for (row=0; row<nRows; ++row) { // normalize colors per column
366  float zScore = blockZFitScores[b->first][row];
367  if (row == 0) {
368  minBlockZFit = maxBlockZFit = zScore;
369  } else {
370  if (zScore < minBlockZFit) minBlockZFit = zScore;
371  else if (zScore > maxBlockZFit) maxBlockZFit = zScore;
372  }
373  }
374  for (row=0; row<nRows; ++row) {
375  if (maxBlockZFit == minBlockZFit)
376  scale = 1.0;
377  else
378  scale = 1.0 * (blockZFitScores[b->first][row] - minBlockZFit) /
379  (maxBlockZFit - minBlockZFit);
381  }
382  }
383  }
384 
385  // block row fit
386  blockRowFitColors.clear();
    // NOTE(review): the body of this for-loop is on missing listing line 388; as shown
    // here the following 'if' would read as the loop body, which is presumably not what
    // the original file does - confirm.
387  for (b=blocks.begin(); b!=be; ++b)
389  if (blocks.size() >= 2) {
390  for (row=0; row<nRows; ++row) {
    // the first block's entry stands in for the whole row: the Z-score pass above
    // either fills a row for every block or leaves the sentinel in all of them
391  if (blockRowFitScores.begin()->second[row] != kMin_Float) { // if this row has fit scores
392  for (b=blocks.begin(); b!=be; ++b) { // normalize colors per row
393  float zScore = blockRowFitScores[b->first][row];
394  if (b == blocks.begin()) {
395  minBlockRowFit = maxBlockRowFit = zScore;
396  } else {
397  if (zScore < minBlockRowFit) minBlockRowFit = zScore;
398  else if (zScore > maxBlockRowFit) maxBlockRowFit = zScore;
399  }
400  }
401  for (b=blocks.begin(); b!=be; ++b) {
402  if (maxBlockRowFit == minBlockRowFit)
403  scale = 1.0;
404  else
405  scale = 1.0 * (blockRowFitScores[b->first][row] - minBlockRowFit) /
406  (maxBlockRowFit - minBlockRowFit);
408  }
409  }
410  }
411  }
412 
413  fitColorsCurrent = true;
414 }
415 
// Parameter list and body of the profile lookup (the line carrying the function name is
// absent from this listing; presumably ConservationColorer::GetProfileIndexAndResidue).
// Writes to *profileIndex the profile column assigned to blockColumn of the given block,
// and to *residue the screened residue character found at (blockColumn, row).
417  const UngappedAlignedBlock *block, int blockColumn, int row,
418  int *profileIndex, char *residue) const
419 {
    // NOTE(review): the iterator returned by find() is dereferenced without being
    // compared to blocks.end(), and blockColumn is not range-checked here; callers must
    // pass a block that is present in 'blocks' and a column below its width.
420  BlockMap::const_iterator b = blocks.find(block);
421  *profileIndex = b->second[blockColumn];
422  *residue = ScreenResidueCharacter(b->first->GetCharacterAt(blockColumn, row));
423 }
424 
426 {
427  nColumns = 0;
428  blocks.clear();
429  FreeColors();
430 }
431 
433 {
434  alignmentProfile.clear();
435  identities.clear();
436  varietyColors.clear();
437  weightedVarietyColors.clear();
438  informationContentColors.clear();
439  fitColors.clear();
440  blockFitColors.clear();
441  blockZFitColors.clear();
442  blockRowFitColors.clear();
444 }
445 
446 END_SCOPE(Cn3D)
const BLAST_Matrix * GetPSSM(void) const
bool IsFrom(const BlockMultipleAlignment *alignment) const
unsigned int width
@ eConservationMap
const Vector & Get(eColor which) const
void AddBlock(const UngappedAlignedBlock *block)
ColumnColors informationContentColors
ConservationColorer(const BlockMultipleAlignment *parent)
BlockFitColors blockRowFitColors
void GetProfileIndexAndResidue(const UngappedAlignedBlock *block, int blockColumn, int row, int *profileIndex, char *residue) const
ColumnColors weightedVarietyColors
AlignmentProfile alignmentProfile
std::map< char, int > ColumnProfile
BlockFitColors blockZFitColors
void CalculateFitConservationColors(void)
void CalculateBasicConservationColors(void)
std::vector< bool > identities
const BlockMultipleAlignment * alignment
const Colors * GlobalColors(void)
Definition: cn3d_colors.cpp:52
static int nRows
Definition: cn3d_png.cpp:115
#define TRACEMSG(stream)
Definition: cn3d_tools.hpp:83
#define INFOMSG(stream)
Definition: cn3d_tools.hpp:84
#define ERRORMSG(stream)
Definition: cn3d_tools.hpp:86
char ScreenResidueCharacter(char ch)
int GetBLOSUM62Score(char a, char b)
Include a standard set of the NCBI C++ Toolkit most basic headers.
#define false
Definition: bool.h:36
#define kMin_Float
Definition: ncbi_limits.h:204
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
const TYPE & Get(const CNamedParameterList *param)
unsigned int a
Definition: ncbi_localip.c:102
const SNCBIPackedScoreMatrix NCBISM_Blosum62
Definition: sm_blosum62.c:92
void NCBISM_Unpack(const SNCBIPackedScoreMatrix *psm, SNCBIFullScoreMatrix *fsm)
Expand a packed score matrix into an unpacked one, which callers can proceed to index directly by sta...
Definition: raw_scoremat.c:81
#define row(bind, expected)
Definition: string_bind.c:73
TNCBIScore s[128][128]
Definition: raw_scoremat.h:87
double GetStandardProbability(char ch)
Definition: su_pssm.cpp:271
unsigned char LookupNCBIStdaaNumberFromCharacter(char r)
Definition: su_pssm.cpp:125
char LookupCharacterFromNCBIStdaaNumber(unsigned char n)
Definition: su_pssm.cpp:142
Modified on Sun May 19 04:46:32 2024 by modify_doxy.py rev. 669887