NCBI C++ ToolKit
cn3d_pssm.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: cn3d_pssm.cpp 44550 2010-01-21 20:28:38Z thiessen $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Paul Thiessen
27 *
28 * File Description:
29 * new C++ PSSM construction
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbistd.hpp>
36 #include <corelib/ncbistre.hpp>
37 #include <serial/serial.hpp>
38 #include <serial/objostrasn.hpp>
39 
41 
44 
46 
47 #include "cn3d_pssm.hpp"
49 #include "sequence_set.hpp"
50 #include "cn3d_tools.hpp"
51 
54 USING_SCOPE(blast);
55 
56 
57 BEGIN_SCOPE(Cn3D)
58 
59 //#define DEBUG_PSSM 1 // for testing/debugging PSSM data
60 
// Convenience wrapper around NCBI_THROW: raises a CException with error code eUnknown,
// forwarding `stream` as the third (message) argument of NCBI_THROW.
61 #define PTHROW(stream) NCBI_THROW(CException, eUnknown, stream)
62 
64 {
 // Builds the PSSM for the alignment `bma`: a temporary CD named "fake" is filled in, handed to
 // cd_utils::PssmMaker, the resulting query is tagged with a local "consensus" id, and the scores
 // are unpacked via UnpackMatrix(). Any exception raised along the way is caught at the bottom and
 // reported through ERRORMSG instead of being propagated to the caller.
 // NOTE(review): `seq` and `seqAlign` are used below but their declarations are not visible in this
 // listing - confirm against the full source before touching this block.
65 #ifdef DEBUG_PSSM
 // debug builds: open pssm.txt without the append flag first; the dump further down reopens it
 // with IOS_BASE::app
66  {{
67  CNcbiOfstream ofs("pssm.txt", IOS_BASE::out);
68  }}
69 #endif
70 
71  try {
72  TRACEMSG("Creating PSSM...");
73 
74  // construct a "fake" CD to pass to PssmMaker
75  cd_utils::CCdCore c;
76  c.SetName("fake");
77 
78  // construct Seq-entry from sequences in current alignment
81  c.SetSequences().SetSet().SetSeq_set().push_back(seq);
82 
83  // construct Seq-annot from rows in the alignment
84  c.SetSeqannot().push_back(CRef<CSeq_annot>(new CSeq_annot));
87 
88  // fill out Seq-entry and Seq-annot based on BMA (row order is irrelevant here)
 // the loop starts at row 1 when the alignment has more than one row, and at row 0 otherwise
89  for (unsigned int i=((bma->NRows() > 1) ? 1 : 0); i<bma->NRows(); ++i) {
90  seq.Reset(new CSeq_entry);
92  c.SetSequences().SetSet().SetSeq_set().push_back(seq);
94  c.SetSeqannot().front()->SetData().SetAlign().push_back(seqAlign);
95  }
96 
97  // use PssmMaker to create PSSM using consensus
98  cd_utils::PssmMaker pm(&c, true, true);
99  cd_utils::PssmMakerOptions options; // comes with defaults
100  options.requestFrequencyRatios = true; // necessary for psi-blast
101 // options.scalingFactor = 100; // do *NOT* use SF other than 1 for psi-blast
102  pm.setOptions(options);
103  pssm = pm.make();
104 
105  // blast functions require a master (query) sequence to be present; give it a recognizable id
106  if (!pssm->GetPssm().IsSetQuery() || !pssm->GetPssm().GetQuery().IsSeq())
107  PTHROW("PssmWithParameters from cd_utils::PssmMaker() doesn't contain the master/query sequence");
108  CRef < CSeq_id > id(new CSeq_id);
109  id->SetLocal().SetStr("consensus");
 // the new id goes to the front of the query's id list
110  pssm->SetPssm().SetQuery().SetSeq().SetId().push_front(id);
111 
112  // for efficient score lookup
113  UnpackMatrix(pm);
114 
115 #ifdef DEBUG_PSSM
 // debug builds: append the ASN.1 text form of the finished PSSM to pssm.txt
116  CNcbiOfstream ofs("pssm.txt", IOS_BASE::out | IOS_BASE::app);
117  if (ofs) {
118  CObjectOStreamAsn oosa(ofs, false);
119  oosa << *pssm;
120 
121 /*
122  if (pssm->GetPssm().IsSetIntermediateData() && pssm->GetPssm().GetIntermediateData().IsSetResFreqsPerPos()) {
123  vector < int >
124  freqs(pssm->GetPssm().GetIntermediateData().GetResFreqsPerPos().size()),
125  nNonGap(pssm->GetPssm().GetNumColumns(), 0);
126  unsigned int i;
127  CPssmIntermediateData::TResFreqsPerPos::const_iterator
128  l = pssm->GetPssm().GetIntermediateData().GetResFreqsPerPos().begin();
129  for (i=0; i<pssm->GetPssm().GetIntermediateData().GetResFreqsPerPos().size(); ++i, ++l)
130  freqs[i] = *l;
131  int freq, n;
132  ofs << "observed frequencies:\n";
133  for (unsigned int c=0; c<pssm->GetPssm().GetNumColumns(); ++c) {
134  ofs << "column " << (c+1) << ": ";
135  n = 0;
136  for (unsigned int r=0; r<pssm->GetPssm().GetNumRows(); ++r) {
137  if (pssm->GetPssm().GetByRow())
138  freq = freqs[r * pssm->GetPssm().GetNumColumns() + c];
139  else
140  freq = freqs[c * pssm->GetPssm().GetNumRows() + r];
141  if (freq > 0) {
142  ofs << LookupCharacterFromNCBIStdaaNumber(r) << '(' << freq << ") ";
143  n += freq;
144  if (r != 0)
145  nNonGap[c] += freq;
146  }
147  }
148  ofs << "total: " << n << " non-gap: " << nNonGap[c] << '\n';
149  }
150 
151  if (pssm->GetPssm().IsSetIntermediateData() && pssm->GetPssm().GetIntermediateData().IsSetWeightedResFreqsPerPos()) {
152  vector < double > wfreqs(pssm->GetPssm().GetIntermediateData().GetWeightedResFreqsPerPos().size());
153  CPssmIntermediateData::TWeightedResFreqsPerPos::const_iterator
154  m = pssm->GetPssm().GetIntermediateData().GetWeightedResFreqsPerPos().begin();
155  for (i=0; i<pssm->GetPssm().GetIntermediateData().GetWeightedResFreqsPerPos().size(); ++i, ++m)
156  wfreqs[i] = *m;
157  double wfreq, s;
158  ofs << "weighted frequencies:\n";
159  for (unsigned int c=0; c<pssm->GetPssm().GetNumColumns(); ++c) {
160  ofs << "column " << (c+1) << ": ";
161  s = 0.0;
162  for (unsigned int r=0; r<pssm->GetPssm().GetNumRows(); ++r) {
163  if (pssm->GetPssm().GetByRow())
164  wfreq = wfreqs[r * pssm->GetPssm().GetNumColumns() + c];
165  else
166  wfreq = wfreqs[c * pssm->GetPssm().GetNumRows() + r];
167  if (wfreq != 0.0) {
168  ofs << LookupCharacterFromNCBIStdaaNumber(r) << '(' << wfreq << ") ";
169  s += wfreq;
170  }
171  }
172  ofs << "sum: " << s << '\n';
173  }
174  }
175 
176  if (pssm->GetPssm().IsSetIntermediateData() && pssm->GetPssm().GetIntermediateData().IsSetFreqRatios()) {
177  vector < double > ratios(pssm->GetPssm().GetIntermediateData().GetFreqRatios().size());
178  CPssmIntermediateData::TFreqRatios::const_iterator
179  n = pssm->GetPssm().GetIntermediateData().GetFreqRatios().begin();
180  for (i=0; i<pssm->GetPssm().GetIntermediateData().GetFreqRatios().size(); ++i, ++n)
181  ratios[i] = *n;
182  double ratio, s;
183  ofs << "frequency ratios:\n";
184  for (unsigned int c=0; c<pssm->GetPssm().GetNumColumns(); ++c) {
185  ofs << "column " << (c+1) << ": ";
186  s = 0.0;
187  for (unsigned int r=0; r<pssm->GetPssm().GetNumRows(); ++r) {
188  if (pssm->GetPssm().GetByRow())
189  ratio = ratios[r * pssm->GetPssm().GetNumColumns() + c];
190  else
191  ratio = ratios[c * pssm->GetPssm().GetNumRows() + r];
192  if (ratio != 0.0) {
193  ofs << LookupCharacterFromNCBIStdaaNumber(r) << '(' << ratio << ") ";
194  s += ratio;
195  }
196  }
197  ofs << "sum: " << s << '\n';
198  }
199  }
200  }
201 */
202  }
203 #endif
204 
 // failures are logged, not rethrown
 // NOTE(review): after a failure, pssm and the lookup tables may be left unset or partially
 // filled - confirm that callers cope with that
205  } catch (exception& e) {
206  ERRORMSG("PSSMWrapper::PSSMWrapper() failed with exception: " << e.what());
207  } catch (...) {
208  ERRORMSG("PSSMWrapper::PSSMWrapper() failed with unknown exception");
209  }
210 }
211 
212 void PSSMWrapper::UnpackMatrix(ncbi::cd_utils::PssmMaker& pm)
213 {
214  if (!pssm->GetPssm().IsSetFinalData())
215  PTHROW("UnpackMatrix() - pssm must have finalData");
216  unsigned int nScores = pssm->GetPssm().GetNumRows() * pssm->GetPssm().GetNumColumns();
217  if (pssm->GetPssm().GetNumRows() != 28 || pssm->GetPssm().GetFinalData().GetScores().size() != nScores)
218  PTHROW("UnpackMatrix() - bad matrix size");
219 
220  scalingFactor = pssm->GetPssm().GetFinalData().GetScalingFactor();
221 
222  // allocate matrix
223  unsigned int i;
224  scaledMatrix.resize(pssm->GetPssm().GetNumColumns());
225  for (i=0; (int)i<pssm->GetPssm().GetNumColumns(); ++i)
226  scaledMatrix[i].resize(28);
227 
228  // convert matrix
229  unsigned int r = 0, c = 0;
230  CPssmFinalData::TScores::const_iterator s = pssm->GetPssm().GetFinalData().GetScores().begin();
231  for (i=0; i<nScores; ++i, ++s) {
232 
233  scaledMatrix[c][r] = *s;
234 
235  // adjust for matrix layout in pssm
236  if (pssm->GetPssm().GetByRow()) {
237  ++c;
238  if ((int)c == pssm->GetPssm().GetNumColumns()) {
239  ++r;
240  c = 0;
241  }
242  } else {
243  ++r;
244  if ((int)r == pssm->GetPssm().GetNumRows()) {
245  ++c;
246  r = 0;
247  }
248  }
249  }
250 
251  // map multiple's master <-> consensus position
252  if ((int)pm.getConsensus().size() != pssm->GetPssm().GetNumColumns())
253  PTHROW("Consensus sequence does not match PSSM size");
254  TRACEMSG("master length: " << multiple->GetMaster()->Length() << ", consensus length: " << pm.getConsensus().size());
255  cd_utils::BlockModelPair bmp(pm.getGuideAlignment()); // consensus is dependent
256  consensus2master.resize(pm.getConsensus().size());
257  for (i=0; i<pm.getConsensus().size(); ++i)
258  consensus2master[i] = bmp.mapToMaster(i);
259  bmp.reverse(); // so that master is consensus, dependent is multiple's master
261  for (i=0; i<multiple->GetMaster()->Length(); ++i)
262  master2consensus[i] = bmp.mapToMaster(i);
263 }
264 
265 void PSSMWrapper::OutputPSSM(ncbi::CNcbiOstream& os, const string& title) const
266 {
267  // create a copy of the pssm, massaged a bit so that it'll work correctly with psi-blast, rps-blast
269  copy.Assign(*pssm);
270  if (!copy.GetPssm().IsSetQuery() || !copy.GetPssm().GetQuery().IsSeq()) {
271  ERRORMSG("PssmWithParameters from cd_utils::PssmMaker() doesn't contain the master/query sequence");
272  return;
273  }
274 
275  CBioseq::TId keep;
276  CBioseq::TId::iterator i, ie = copy.SetPssm().SetQuery().SetSeq().SetId().end();
277  for (i=copy.SetPssm().SetQuery().SetSeq().SetId().begin(); i!=ie; ++i) {
278  if ((*i)->IsLocal() && (*i)->GetLocal().IsStr())
279  (*i)->SetLocal().SetStr(title);
280  if (!(*i)->IsGeneral() || (*i)->GetGeneral().GetDb() != "Cdd")
281  keep.push_back(*i);
282  }
283  copy.SetPssm().SetQuery().SetSeq().SetId() = keep;
284 
285  CSeq_descr::Tdata::iterator d, de = copy.SetPssm().SetQuery().SetSeq().SetDescr().Set().end();
286  for (d=copy.SetPssm().SetQuery().SetSeq().SetDescr().Set().begin(); d!=de; ++d) {
287  if ((*d)->IsTitle()) {
288  (*d)->SetTitle(title);
289  break;
290  }
291  }
292  if (d == de) {
293  CRef < CSeqdesc > descr(new CSeqdesc);
294  descr->SetTitle(title);
295  copy.SetPssm().SetQuery().SetSeq().SetDescr().Set().push_front(descr);
296  }
297 
298  // do not put scores in output pssm, only freq ratios
299  copy.SetPssm().ResetFinalData();
300  if (!copy.GetPssm().IsSetIntermediateData() || !copy.GetPssm().GetIntermediateData().IsSetFreqRatios())
301  ERRORMSG("PSSM is missing frequency ratios");
302 
303  CObjectOStreamAsn osa(os, false);
304  osa << copy;
305 }
306 
// Rounds Num to the nearest integer, with halves going away from zero
// (0.5 -> 1, -0.5 -> -1), by shifting half a unit and truncating.
static inline int Round(double Num)
{
    const double shifted = (Num >= 0) ? (Num + 0.5) : (Num - 0.5);
    return static_cast<int>(shifted);
}
314 
315 int PSSMWrapper::GetPSSMScore(unsigned char ncbistdaa, unsigned int realMasterIndex) const
316 {
317  if (ncbistdaa >= 28 || realMasterIndex > multiple->GetMaster()->Length()) {
318  ERRORMSG("PSSMWrapper::GetPSSMScore() - invalid parameters");
319  return kMin_Int;
320  }
321 
322  // maps to a position in the consensus/pssm
323  int consensusIndex = master2consensus[realMasterIndex];
324  if (consensusIndex >= 0) {
325  double scaledScore;
326  switch (ncbistdaa) {
327  case 2: // B -> average D/N
328  scaledScore = ((double) (scaledMatrix[consensusIndex][4] + scaledMatrix[consensusIndex][13])) / 2;
329  break;
330  case 23: // Z -> average E/Q
331  scaledScore = ((double) (scaledMatrix[consensusIndex][5] + scaledMatrix[consensusIndex][15])) / 2;
332  break;
333  case 24: // U -> C
334  scaledScore = scaledMatrix[consensusIndex][3];
335  break;
336  case 26: // O -> K
337  scaledScore = scaledMatrix[consensusIndex][10];
338  break;
339  case 27: // J -> average I/L
340  scaledScore = ((double) (scaledMatrix[consensusIndex][9] + scaledMatrix[consensusIndex][11])) / 2;
341  break;
342  default:
343  scaledScore = scaledMatrix[consensusIndex][ncbistdaa];
344  }
345  return Round(scaledScore / scalingFactor);
346  }
347 
348  // use simple blosum62 score if outside the consensus/pssm
350 }
351 
352 END_SCOPE(Cn3D)
const Sequence * GetMaster(void) const
std::vector< const UngappedAlignedBlock * > UngappedAlignedBlockList
const Sequence * GetSequenceOfRow(unsigned int row) const
void GetUngappedAlignedBlocks(UngappedAlignedBlockList *blocks) const
CObjectOStreamAsn –.
Definition: objostrasn.hpp:53
Definition: Seq_entry.hpp:56
std::vector< Column > scaledMatrix
Definition: cn3d_pssm.hpp:62
std::vector< int > master2consensus
Definition: cn3d_pssm.hpp:64
ncbi::CRef< ncbi::objects::CPssmWithParameters > pssm
Definition: cn3d_pssm.hpp:58
int scalingFactor
Definition: cn3d_pssm.hpp:63
PSSMWrapper(const BlockMultipleAlignment *bma)
Definition: cn3d_pssm.cpp:63
void UnpackMatrix(ncbi::cd_utils::PssmMaker &pm)
Definition: cn3d_pssm.cpp:212
std::vector< int > consensus2master
Definition: cn3d_pssm.hpp:64
int GetPSSMScore(unsigned char ncbistdaa, unsigned int realMasterIndex) const
Definition: cn3d_pssm.cpp:315
const BlockMultipleAlignment * multiple
Definition: cn3d_pssm.hpp:57
void OutputPSSM(ncbi::CNcbiOstream &os, const std::string &title) const
Definition: cn3d_pssm.cpp:265
unsigned int Length(void) const
string sequenceString
Definition: cav_seqset.hpp:93
CConstRef< objects::CBioseq > bioseqASN
Definition: cav_seqset.hpp:90
USING_SCOPE(objects)
static int Round(double Num)
Definition: cn3d_pssm.cpp:307
#define PTHROW(stream)
Definition: cn3d_pssm.cpp:61
USING_NCBI_SCOPE
Definition: cn3d_pssm.cpp:52
#define TRACEMSG(stream)
Definition: cn3d_tools.hpp:83
#define ERRORMSG(stream)
Definition: cn3d_tools.hpp:86
Include a standard set of the NCBI C++ Toolkit most basic headers.
std::ofstream out("events_result.xml")
main entry point for tests
thread_local unique_ptr< FtaMsgPost > bmp
Definition: ftaerr.cpp:120
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
TObjectType & GetObject(void) const
Get object.
Definition: ncbiobj.hpp:1697
#define kMin_Int
Definition: ncbi_limits.h:183
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
Definition: ncbistre.hpp:500
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
TTitle & SetTitle(void)
Select the variant.
Definition: Seqdesc_.hpp:1039
list< CRef< CSeq_id > > TId
Definition: Bioseq_.hpp:94
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
const struct ncbi::grid::netcache::search::fields::SIZE size
void resize(vector< SMethodDef > &container)
NCBI C++ stream class wrappers for triggering between "new" and "old" C++ stream libraries.
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
static DP_BlockInfo * blocks
static int GetBLOSUM62Score(char a, char b)
ncbi::objects::CSeq_align * CreatePairwiseSeqAlignFromMultipleRow(const BlockMultipleAlignment *multiple, const BlockMultipleAlignment::UngappedAlignedBlockList &blocks, unsigned int slaveRow)
char LookupCharacterFromNCBIStdaaNumber(unsigned char n)
Definition: su_pssm.cpp:142
Modified on Sat Dec 02 09:23:07 2023 by modify_doxy.py rev. 669887