NCBI C++ ToolKit
cuFlexiDm.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: cuFlexiDm.cpp 44918 2010-02-25 17:50:40Z lanczyck $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Charlie Liu
27 *
28 * File Description:
29 *
30 * Concrete distance matrix class.
31 * Distance is computed based on pure percent pairwise AA identity in
32 * aligned blocks, with or without a correction for multiple AA
33 *
34 */
35 
36 #include <ncbi_pch.hpp>
37 #include <math.h>
38 
39 #include <objects/seq/Bioseq.hpp>
42 
44 
45 // (BEGIN_NCBI_SCOPE must be followed by END_NCBI_SCOPE later in this file)
47 BEGIN_SCOPE(cd_utils)
48 
49 ResidueMatrix::ResidueMatrix(unsigned numRows)
50  : m_rows(numRows, RowContent()), m_numRows(numRows)
51 {
52 }
53 
55 {
56  vector<char> residues;
57  residues.assign(m_numRows, '-');
58  crp.getResiduesByRow(residues, false);
59  for (unsigned row = 0; row < residues.size(); row++)
60  m_rows[row].push_back(ResidueCell(residues[row], crp.isAligned(row)));
61 }
62 
63 bool ResidueMatrix::getAlignedPair(unsigned row1, unsigned row2, pair< string, string >& seqPair)
64 {
65  /*if (row1 > m_rows.size() || row2 > m_rows.size())
66  return false;*/
67  RowContent& rc1 = m_rows[row1];
68  RowContent& rc2 = m_rows[row2];
69  seqPair.first.reserve(rc1.size());
70  seqPair.second.reserve(rc2.size());
71  //assert(rc1.size() == rc2.size());
72  for (unsigned int i = 0; i < rc1.size(); i++)
73  {
74  if (rc1[i].aligned && rc2[i].aligned)
75  {
76  seqPair.first += rc1[i].residue;
77  seqPair.second += rc2[i].residue;
78  }
79  }
80  return true;
81 }
82 
83 // FlexiDm class
84 
85 // This class simply uses the number of AA identities in the specified region
86 // to define the distance between two sequences:
87 //
88 // d[i][j] = 1 - (n_matched/n_tested); d is in [0, 1]
89 
90 const double FlexiDm::MAX_DISTANCE = 100.0;
92 
94 }
95 
96 FlexiDm::FlexiDm(EScoreMatrixType type, int uniformLength) : DistanceMatrix(), m_uniformLength(uniformLength) {
98 }
99 
102  m_useAligned = true;
103  m_nTermExt = nExt;
104  m_cTermExt = cExt;
106  if (m_nTermExt != 0 || m_cTermExt != 0) {
107  m_useAligned = false;
108  }
109 }
110 
111 
113 
114  bool result;
115  if (m_aligns) {
116  GetPercentIdentities(pFunc);
117  result = true;
118  } else {
119  result = false;
120  }
121  return result;
122 }
123 
125 {
126  int nrows = m_aligns->GetNumRows();
127  //LOG_POST("Start building Distance Matrix");
128  //LOG_POST("Start building ResidueProfiles with "<<nrows<<" rows.");
129  ResidueProfiles* rp = new ResidueProfiles();
130  string mseq = m_aligns->GetSequenceForRow(0);
131  for (int i = 1; i < nrows; i++)
132  {
133  string sseq = m_aligns->GetSequenceForRow(i);
135  rp->addOneRow(bmp, mseq, sseq);
136  }
137  //LOG_POST("Done building ResidueProfiles. Start building ResidueMatrix");
138  ResidueMatrix * rm = new ResidueMatrix(nrows);
139  rp->traverseColumnsOnMaster(*rm);
140  //LOG_POST("Done building ResidueMatrix. Starting making Distance Matrix");
141  delete rp;
142  int Identity, TotalAligned;
143  int count = 0;
144  int total = (int)((double)nrows * (((double)nrows-1)/2));
145 
146  // for each row in the alignment
147  for (int j=0; j<nrows; j++)
148  {
149  m_Array[j][j] = 0.0;
150  ResidueMatrix::RowContent& rc1 = rm->getRow(j);
151  // for each other row in the alignment
152  for (int k=j+1; k<nrows; k++)
153  {
154  Identity = 0;
155  TotalAligned = 0;
156  ResidueMatrix::RowContent& rc2 = rm->getRow(k);
157  for (unsigned int i = 0; i < rc1.size(); i++)
158  {
159  if (rc1[i].aligned && rc2[i].aligned)
160  {
161  TotalAligned++;
162  if (rc1[i].residue == rc2[i].residue)
163  Identity++;
164  }
165  }
166 
167  // testing a modification to the algorithm to normalize the identity count to a
168  // single value, although need to deal w/ the possibility that some alignments may be
169  // longer than the value specified. E.g., if have two pending rows which overlap
170  // by only one or two identical residues, is that really 100% identity????!!!
171  if (m_uniformLength > 0 && Identity <= m_uniformLength) TotalAligned = m_uniformLength;
172 
173  m_Array[j][k] = GetDistance(Identity, TotalAligned);
174  m_Array[k][j] = m_Array[j][k];
175  }
176  count += nrows - (j+1);
177  pFunc(count, total);
178  }
179  //LOG_POST("Done building DistanceMatrix");
180  assert(count == total);
181  delete rm;
182 // cout << "Total number rows: " << nrows << " Alignment length: " << alignLen << endl;
183 }
184 
185 
186 double FlexiDm::GetDistance(int nIdentities, int alignLen)
187 {
188  if (alignLen == 0) {
189  return 1.0;
190  } else {
191  return 1.0 - (TMatType(nIdentities) / TMatType (alignLen));
192  }
193 }
194 
195 
196 END_SCOPE(cd_utils)
double ** m_Array
Definition: cuMatrix.hpp:47
string GetSequenceForRow(int row)
CRef< CSeq_align > getSeqAlign(int row) const
void getResiduesByRow(vector< char > &residues, bool byNcbiStd=true) const
bool isAligned(char residue, int row) const
ScoreMatrix * m_scoreMatrix
Definition: cuDistmat.hpp:156
EDistMethod m_dMethod
Definition: cuDistmat.hpp:157
double TMatType
Definition: cuDistmat.hpp:91
AlignmentCollection * m_aligns
Definition: cuDistmat.hpp:159
static const EDistMethod DIST_METHOD
Definition: cuFlexiDm.hpp:76
~FlexiDm()
Definition: cuFlexiDm.cpp:93
bool ComputeMatrix(pProgressFunction pFunc)
Definition: cuFlexiDm.cpp:112
FlexiDm(EScoreMatrixType type=GLOBAL_DEFAULT_SCORE_MATRIX, int uniformLength=-1)
Definition: cuFlexiDm.cpp:96
static const double MAX_DISTANCE
Definition: cuFlexiDm.hpp:75
static double GetDistance(int identities, int alignment_length)
Definition: cuFlexiDm.cpp:186
int m_uniformLength
Definition: cuFlexiDm.hpp:102
void initDMIdentities(EScoreMatrixType type, int nExt=0, int cExt=0)
Definition: cuFlexiDm.cpp:100
void GetPercentIdentities(pProgressFunction pFunc)
Definition: cuFlexiDm.cpp:124
bool getAlignedPair(unsigned row1, unsigned row2, pair< string, string > &seqPair)
Definition: cuFlexiDm.cpp:63
void read(ColumnResidueProfile &crp)
Definition: cuFlexiDm.cpp:54
vector< RowContent > m_rows
Definition: cuFlexiDm.hpp:64
RowContent & getRow(int row)
Definition: cuFlexiDm.hpp:61
vector< ResidueCell > RowContent
Definition: cuFlexiDm.hpp:60
void traverseColumnsOnMaster(ColumnReader &cr)
void addOneRow(BlockModelPair &bmp, const string &mSeq, const string &sSeq)
EDistMethod
Definition: cuDistmat.hpp:60
@ ePercentIdentityRelaxed
Definition: cuDistmat.hpp:68
void(* pProgressFunction)(int Num, int Total)
Definition: cuDistmat.hpp:47
EScoreMatrixType
thread_local unique_ptr< FtaMsgPost > bmp
Definition: ftaerr.cpp:120
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
#define count
#define assert(x)
Definition: srv_diag.hpp:58
#define row(bind, expected)
Definition: string_bind.c:73
Definition: type.c:6
else result
Definition: token2.c:20
Modified on Fri Sep 20 14:57:10 2024 by modify_doxy.py rev. 669887