NCBI C++ ToolKit
cuPssmMaker.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: cuPssmMaker.cpp 56855 2013-01-11 17:06:38Z lanczyck $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Charlie Liu
27  *
28  * File Description:
29  *
30  * Make PSSM from a CD
31  *
32  * ===========================================================================
33  */
34 
35 #include <ncbi_pch.hpp>
36 #include <corelib/ncbistr.hpp>
45 #include <objects/cdd/Cdd_id.hpp>
46 
48 BEGIN_SCOPE(cd_utils)
49 
50 static void
51 printMsa(const char* filename, const PSIMsa* msa, vector<string>& seqIds)
52 {
53  Uint4 i, j;
54  FILE* fp = NULL;
55 
56  ASSERT(msa);
57  ASSERT(filename);
58 
59  fp = fopen(filename, "w");
60  int startRow = msa->dimensions->num_seqs + 1 - seqIds.size();
61  //if startRow == 1, this means row 0 is the consensus and should be ignored
62  ASSERT(startRow >= 0);
63  for (i = startRow; i < msa->dimensions->num_seqs + 1; i++) {
64  fprintf(fp, ">%s\n", seqIds[i-startRow].c_str());
65  for (j = 0; j < msa->dimensions->query_length; j++) {
66  if (msa->data[i][j].is_aligned) {
67  fprintf(fp, "%c", ColumnResidueProfile::getEaaCode(msa->data[i][j].letter));
68  } else {
69  fprintf(fp, ".");
70  }
71  }
72  fprintf(fp, "\n");
73  }
74  fclose(fp);
75 }
76 
77 
79 : pseudoCount(-1),
80  scalingFactor(1.0),
81  matrixName("BLOSUM62"),
82  requestInformationContent(false),
83  requestResidueFrequencies(false),
84  requestWeightedResidueFrequencies(false),
85  requestFrequencyRatios(false),
86  requestNumIndepObs(false),
87  gaplessColumnWeights(false),
88  unalignedSegThreshold(-1),
89  inclusionThreshold(0.5),
90  reuseUid(false)
91 {
92 };
93 
95 {
98 }
99 
100 //consensus is not in profiles
101 //row 0 of profiles is the master
103  : m_profiles(profiles),m_options(0), m_useConsensus(useConsensus), m_diagRequest(),
104  m_currentCol(0)
105 {
107  if (m_useConsensus)
108  {
111  m_query = new unsigned char[m_msaDimensions.query_length];
113  }
114  else
115  {
117  string trunctMaster;
119  m_query = new unsigned char[m_msaDimensions.query_length];
120  memcpy(m_query, trunctMaster.data(), m_msaDimensions.query_length);
121  }
123  //m_options.inclusion_ethresh = PSI_INCLUSION_ETHRESH;
124  //determine pseudo count with information content
125  if ( config.pseudoCount > 0 ) {
126  m_options->pseudo_count = config.pseudoCount;
127  }
128  else
129  {
130  double SumAInf = profiles.calcInformationContent(m_useConsensus);
131  int iPseudo = 1;
132  if (SumAInf > 84 ) iPseudo = 10;
133  else if (SumAInf > 55 ) iPseudo = 7;
134  else if (SumAInf > 43 ) iPseudo = 5;
135  else if (SumAInf > 41.5) iPseudo = 4;
136  else if (SumAInf > 40 ) iPseudo = 3;
137  else if (SumAInf > 39 ) iPseudo = 2;
138  else iPseudo = 1;
139  m_options->pseudo_count = iPseudo;
140  }
142  //m_options.pseudo_count = PSI_PSEUDO_COUNT_CONST;
143  //m_options.use_best_alignment = false;
144  //m_options.nsg_ignore_consensus = m_useConsensus;
145  //m_options.nsg_identity_threshold = 1.0;
146  m_diagRequest.frequency_ratios = config.requestFrequencyRatios;
147  m_diagRequest.gapless_column_weights = config.gaplessColumnWeights;
148  m_diagRequest.information_content = config.requestInformationContent;
149  m_diagRequest.residue_frequencies = config.requestResidueFrequencies;
150  m_diagRequest.weighted_residue_frequencies = config.requestWeightedResidueFrequencies;
151  m_diagRequest.independent_observations = config.requestNumIndepObs;
152  m_matrixName = config.matrixName;
153  m_options->impala_scaling_factor = config.scalingFactor;
154 }
155 
157 {
158 
159  crp.getIndexByConsensus();
160  int startingRow = 0;
161  if (m_useConsensus)
162  startingRow = 1;
163  vector<char> residuesOnColumn;
164  char gap = ColumnResidueProfile::getNcbiStdCode('-');
165  residuesOnColumn.assign(m_profiles.getNumRows(), gap);
166  crp.getResiduesByRow(residuesOnColumn);
167  for (int row = 0; row < m_profiles.getNumRows(); row++)
168  {
169  m_msa->data[row+startingRow][m_currentCol].letter = residuesOnColumn[row];
170  m_msa->data[row+startingRow][m_currentCol].is_aligned = true;
171  }
172  m_currentCol++;
173 }
174 
176 {
177  PSIMsaFree(m_msa);
179 }
180 
182 {
183  if (m_useConsensus)
184  {
185  //add consensus
186  for (unsigned int i = 0; i < m_msaDimensions.query_length; i++)
187  {
188  m_msa->data[0][i].letter = m_query[i];
189  m_msa->data[0][i].is_aligned = true;
190  }
192  }
193  else
195 
196  //printMsa("msaBefore.txt", m_msa);
198  //move the row with the most aligned residues to row 1
199  //because row 1 will be used to filter out most identical sequences
200  //moveUpLongestRow();
201  // LOG_POST("num_seq="<<m_msaDimensions.num_seqs<<"query_len="<<m_msaDimensions.query_length);
202 
203  //printMsa("msa.txt", m_msa);
204 }
205 
207 {
208  char gap = ColumnResidueProfile::getNcbiStdCode('-');
209  //row 0 is the query; does not have gaps
210  //so we start at row 1
211  for (unsigned int row = 1; row <= m_msaDimensions.num_seqs; row++)
212  {
213  unsigned int i = 0;
214  for (; i < m_msaDimensions.query_length; i++)
215  {
216  if (m_msa->data[row][i].letter == gap)
217  m_msa->data[row][i].is_aligned = false;
218  else
219  break;
220  }
221  for (unsigned int j = m_msaDimensions.query_length - 1; j > i; j--)
222  {
223  if (m_msa->data[row][j].letter == gap)
224  m_msa->data[row][j].is_aligned = false;
225  else
226  break;
227  }
228  }
229 }
230 
232 {
233  int longestRow = 1;
234  int maxLen = countResiduesInRow(longestRow);
235 
236  for (int row = 2; row <= (int) m_msaDimensions.num_seqs; row++)
237  {
238  int len = countResiduesInRow(row);
239  if (len > maxLen)
240  {
241  maxLen = len;
242  longestRow = row;
243  }
244  }
245  if (longestRow != 1)
246  {
248  copyRow(m_msa->data[1], tmp);
249  copyRow(m_msa->data[longestRow], m_msa->data[1]);
250  copyRow(tmp, m_msa->data[longestRow]);
251  free(tmp);
252  }
253 }
254 
256 {
257  for (unsigned int i = 0; i < m_msaDimensions.query_length; i++)
258  {
259  dest[i].is_aligned = src[i].is_aligned;
260  dest[i].letter = src[i].letter;
261  //memcpy(&(dest[i]), &(src[i]), sizeof(PSIMsaCell));
262  }
263 }
264 
266 {
267  int count = 0;
268  for (unsigned int i = 0; i < m_msaDimensions.query_length; i++)
269  {
270  if (m_msa->data[row][i].is_aligned)
271  count++;
272  }
273  return count;
274 }
275 
276  /// Get the query sequence used as master for the multiple sequence
277  /// alignment in ncbistdaa encoding.
278 unsigned char* CdPssmInput::GetQuery()
279 {
280  return m_query;
281 }
282 
283  /// Get the query's length
285 {
287 }
288 
289  /// Obtain the multiple sequence alignment structure
291 {
292  return m_msa;
293 }
294 
295  /// Obtain the options for the PSSM engine
297 {
298  return m_options;
299 }
300 
301  /// Obtain the options for the PSSM engine
303 {
304  return m_options;
305 }
306 
308 {
309  return m_matrixName.c_str();
310 }
311 
312  /// Obtain the diagnostics data that is requested from the PSSM engine
313  /// Its results will be populated in the PssmWithParameters ASN.1 object
315 {
316  return &m_diagRequest;
317 }
318 
319 //------------------------- PssmMaker ---------------------
320 PssmMaker::PssmMaker(CCdCore* cd, bool useConsensus, bool addQueryToPssm)
321  : m_conMaker(0), m_useConsensus(useConsensus), m_addQuery(addQueryToPssm),
322  m_masterSeqEntry(), m_trunctMaster(), m_cd(cd), m_pssmInput(0)
323  //m_identityFilterThreshold(0.94)
324 {
325  CRef< CSeq_id > seqId;
326  cd->GetSeqIDFromAlignment(0, seqId);
327  if (!IsConsensus(seqId))
329  else //if consensus is master
330  {
331  //use master because it is already a consensus
332  //note this override the input useConsensus
333  m_useConsensus = false;
334  vector<int> seqIndice;
335  cd->FindConsensusInSequenceList(&seqIndice);
336  if (seqIndice.size() > 0)
337  cd->GetSeqEntryForIndex(seqIndice[0], m_masterSeqEntry);
338  }
339 }
340 
342 {
343  m_config = option;
344 }
345 
347 {
348  if (m_pssmInput)
349  delete m_pssmInput;
350  if (m_conMaker)
351  delete m_conMaker;
352 }
353 
355 {
358  {
360  }
362  if (!m_useConsensus)
363  for(unsigned int i = 0 ; i < m_pssmInput->GetQueryLength(); i++)
364  m_trunctMaster.push_back(m_pssmInput->GetQuery()[i]);
365  CPssmEngine pssmEngine(m_pssmInput);
367  /*
368  if (m_identityFilterThreshold > 0)
369  pssmInput.SetOptions()->nsg_identity_threshold = m_identityFilterThreshold;
370  */
372  try {
373  pssmRef = pssmEngine.Run();
374  }catch (...)
375  {
376  pssmRef.Reset();
377  };
378  if (pssmRef.Empty())
379  {
380  pssmRef = makeDefaultPssm();
381  }
382  if (m_addQuery)
383  {
385  if(m_useConsensus)
387  else
388  {
389  query = new CSeq_entry;
390  query->Assign(*m_masterSeqEntry);
392  }
393  modifyQuery(query);
394  pssmRef->SetPssm().SetQuery(*query);
395  }
396  m_pssmMade = pssmRef;
397  return pssmRef;
398 }
399 
401 {
402  EScoreMatrixType emt;
403  bool found = false;
404  int i = 0;
405  for (i = eBlosum45; i <= ePam250 ; i++)
406  {
409  {
410  found = true;
411  break;
412  }
413  }
414  if (found )
415  emt = (EScoreMatrixType)i;
416  else
417  emt = eBlosum62;
418  ScoreMatrix sm(emt);
419  string consensus;
420  if(m_useConsensus)
421  consensus = m_conMaker->getConsensus();
422  else
423  {
425  }
427  CPssm& pssm = pssmPara->SetPssm();
428  pssm.SetNumColumns(consensus.size());
429  int numRows = ColumnResidueProfile::m_residues.size();
430  pssm.SetNumRows(numRows);
431  list< double >* freqs = 0;
433  {
434  freqs = &(pssm.SetIntermediateData().SetFreqRatios());
435  }
436  list< int > & scores = pssm.SetFinalData().SetScores();
437  for (unsigned int col = 0; col < consensus.size(); col++)
438  {
439  char c1 = consensus.at(col);
440  for (char row = 0; row < numRows; row++)
441  {
443  int score = m_config.scalingFactor * sm.GetScore(c1, c2);
444  scores.push_back(score);
445  if (freqs)
446  freqs->push_back(0.0);
447  }
448  }
449  pssm.SetFinalData().SetLambda(0.267);
450  pssm.SetFinalData().SetKappa(0.0447);
451  pssm.SetFinalData().SetH(0.140);
452  if (m_config.scalingFactor > 1)
453  pssm.SetFinalData().SetScalingFactor((int)m_config.scalingFactor);
454  return pssmPara;
455 }
456 
458 {
459  static const string commaSpace(", ");
460  static const string periodSpaceSpace(". ");
461  CBioseq& bioseq = query->SetSeq();
462  bioseq.ResetId();
463  list< CRef< CSeq_id > > & ids = bioseq.SetId();
464  CRef< CSeq_id > seqId(new CSeq_id);
465  CDbtag& dbtag = seqId->SetGeneral();
466  //dbtag.SetDb("CDD");
467  CObject_id& obj = dbtag.SetTag();
468  list< CRef< CCdd_id > >& cdids = m_cd->SetId().Set();
469  int uid = -1;
470  list< CRef< CCdd_id > >::iterator cit = cdids.begin();
471  for (; cit != cdids.end(); cit++)
472  {
473  if ((*cit)->IsUid())
474  {
475  uid = (*cit)->GetUid();
476  break;
477  }
478  }
479  if (cit != cdids.end() && m_config.reuseUid)
480  {
481  obj.SetId(uid);
482  dbtag.SetDb("CDD");
483  }
484  else
485  {
486  obj.SetStr(m_cd->GetAccession());
487  dbtag.SetDb("Cdd");
488  }
489  ids.push_back(seqId);
490  //add a decr field
491  list< CRef< CSeqdesc > >& descList = bioseq.SetDescr().Set();
492  CRef< CSeqdesc > desc(new CSeqdesc);
493 
494  // Get the CD title.
495  // Chop leading/trailing spaces, and terminating '.' characters.
496  string cdTitle(m_cd->GetTitle());
498  if (cdTitle.length() > 0) {
499  while (NStr::EndsWith(cdTitle, '.')) {
500  cdTitle = cdTitle.substr(0, cdTitle.length() - 1);
501  }
502  }
503 
504  string seqDescTitle(m_cd->GetAccession());
505  seqDescTitle += commaSpace;
506  seqDescTitle += m_cd->GetName();
507 
508  // Prepend the title to any comment. Do this here in case there are no comments.
509  if (cdTitle.length() > 0) {
510  seqDescTitle += commaSpace + cdTitle + periodSpaceSpace;
511  }
512 
513  list< CRef< CCdd_descr > >& cddescList = m_cd->SetDescription().Set();
514  list< CRef< CCdd_descr > >::iterator lit = cddescList.begin();
515 
516  for (; lit != cddescList.end(); lit++)
517  {
518  if ((*lit)->IsComment())
519  {
520  if (cdTitle.length() == 0) {
521  seqDescTitle += commaSpace;
522  }
523  seqDescTitle += (*lit)->GetComment();
524  if (!NStr::EndsWith(seqDescTitle, '.')) {
525  seqDescTitle += '.';
526  }
527 
528  // only take the first comment
529  break;
530  }
531  }
532  desc->SetTitle(seqDescTitle);
533  list< CRef< CSeqdesc > >::iterator it = descList.begin();
534  for(; it != descList.end(); it++)
535  if ( (*it)->IsTitle() ) {
536  descList.erase(it);
537  break;
538  }
539  descList.push_back(desc);
540 }
541 
543 {
545  CRef< CSeq_id > seqId = *(m_pssmMade->SetPssm().SetQuery().SetSeq().SetId().begin());
546  bmp.getSlave().setSeqId(seqId);
547  return bmp;
548 }
549 
550 const string& PssmMaker::getConsensus()
551 {
552  return m_conMaker->getConsensus();
553 }
554 
555 //seqId in seqEntry is kept.
556 //seqInst is replaced with trunct master.
558 {
559  if (m_useConsensus)
560  return false;
561  CBioseq& bioseq = seqEntry->SetSeq();
562  CSeq_inst& seqInst = bioseq.SetInst();
563  seqInst.SetLength(m_trunctMaster.size());
564  seqInst.ResetSeq_data();
565  string eaa;
567  seqInst.SetSeq_data(*(new CSeq_data(eaa, CSeq_data::e_Ncbieaa)));
568  //CSeq_data& seqData = seqInst.SetSeq_data();
569  //seqData.SetNcbieaa(*(new CSeq_data::Ncbistdaa(m_trunctMaster)));
570  return true;
571 }
572 
573 void PssmMaker::printAlignment(string& fileName)
574 {
575  vector<string> seqIdStr;
576  const vector< CRef< CSeq_id > >& seqIds = m_conMaker->getResidueProfiles().getSeqIdsByRow();
577  if (!IsConsensus(seqIds[0]))
578  seqIdStr.push_back(seqIds[0]->AsFastaString());
579  for (unsigned int i = 1; i < seqIds.size(); i++)
580  {
581  seqIdStr.push_back(seqIds[i]->AsFastaString());
582  }
583 
584  printMsa(fileName.c_str(), m_pssmInput->GetData(), seqIdStr);
585 }
586 
587 void PssmMaker::printAlignmentByColumn(string& fileName)
588 {
589  if (fileName.length() == 0) {
590  return;
591  }
592 
593  Uint4 i, j;
594  unsigned int nRows, nCols;
595  vector<string> seqIdStr;
596  const vector< CRef< CSeq_id > >& seqIds = m_conMaker->getResidueProfiles().getSeqIdsByRow();
597  const PSIMsa& msa = *m_pssmInput->GetData();
598 
599  FILE* fp = fopen(fileName.c_str(), "w");
600 
601  nCols = msa.dimensions->query_length;
602  nRows = msa.dimensions->num_seqs;
603 
604  if (!IsConsensus(seqIds[0]))
605  seqIdStr.push_back(seqIds[0]->AsFastaString());
606  for (unsigned int k = 1; k < seqIds.size(); k++)
607  {
608  seqIdStr.push_back(seqIds[k]->AsFastaString());
609  }
610  for (i = 0; i < seqIdStr.size(); ++i) {
611  fprintf(fp, "row %d: %s\n", i,seqIdStr[i].c_str());
612  }
613 
614 // int startRow = 0; //msa->dimensions->num_seqs + 1 - seqIds.size();
615  //if startRow == 1, this means row 0 is the consensus and should be ignored
616 // ASSERT(startRow >= 0);
617  static const string dash("-");
618  fprintf(fp, "Query length: %d; Number of rows: %d\n", nCols, nRows);
619  for (j = 0; j < nCols; j++) {
620  fprintf(fp, ">column %d\n", j+1);
621  for (i = 0; i < nRows + 1; i++) {
622  if (msa.data[i][j].is_aligned) {
623  fprintf(fp, "%c", ColumnResidueProfile::getEaaCode(msa.data[i][j].letter));
624  } else {
625  fprintf(fp, "-");
626  }
627  }
628  fprintf(fp, "\n");
629  }
630  fclose(fp);
631 }
632 
634 {
635  unsigned int i, j, nRows, nCols;
636  string colResidues;
637 
638  const PSIMsa& msa = *m_pssmInput->GetData();
639  nCols = msa.dimensions->query_length;
640  nRows = msa.dimensions->num_seqs;
641 
642  columnMap.clear();
643 
644  for (j = 0; j < nCols; j++) {
645  colResidues.clear();
646  colResidues.assign(nRows + 1, '-');
647  for (i = 0; i < nRows + 1; i++) {
648  if (msa.data[i][j].is_aligned) {
649  colResidues[i] = ColumnResidueProfile::getEaaCode(msa.data[i][j].letter);
650  } else {
651  colResidues[i] = '-';
652  }
653  }
654  columnMap[j] = colResidues;
655  }
656 }
657 
659 {
660  cd_utils::PssmMaker pm(ccd,true,true); // 2rd param is useConsensus. generally "true".
661  cd_utils::PssmMakerOptions config;
662  config.requestFrequencyRatios = false;
663  pm.setOptions(config);
664  CRef<CPssmWithParameters> pssm = pm.make();
665  const BlockModelPair& guide = pm.getGuideAlignment();
666  int max = 0;
667  int maxRow = 0;
668  PssmScorer ps(pssm);
669  CRef<CBioseq> bioseq;
670  for (int i = 0; i < ccd->GetNumRows(); i++)
671  {
672  ccd->GetBioseqForRow(i, bioseq);
674  if (i==0) //score the master
675  bmp.getSlave() = bmp.getMaster();
676  bmp.remaster(guide);
677  int score = ps.score(bmp,bioseq);
678  if (score > max)
679  {
680  max = score;
681  maxRow = i;
682  }
683  }
684  return maxRow;
685 }
686 
687 
688 END_SCOPE(cd_utils)
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
PSIBlastOptions * PSIBlastOptionsFree(PSIBlastOptions *psi_options)
Deallocate PSI BLAST options.
Int2 PSIBlastOptionsNew(PSIBlastOptions **psi_options)
Initialize default options for PSI BLAST.
PSIMsa * PSIMsaFree(PSIMsa *msa)
Deallocates the PSIMsa structure.
Definition: blast_psi.c:513
PSIMsa * PSIMsaNew(const PSIMsaDimensions *dimensions)
Allocates and initializes the multiple sequence alignment data structure for use as input to the PSSM...
Definition: blast_psi.c:462
int GetNumRows() const
Definition: cuCdCore.cpp:215
bool GetSeqAlign(int Row, CRef< CSeq_align > &seqAlign)
Definition: cuCdCore.cpp:1419
bool GetSeqIDFromAlignment(int RowIndex, CRef< CSeq_id > &SeqID) const
Definition: cuCdCore.cpp:815
string GetTitle() const
Definition: cuCdCore.cpp:1989
bool FindConsensusInSequenceList(vector< int > *indices=NULL) const
Definition: cuCdCore.cpp:1536
bool GetSeqEntryForRow(int rowId, CRef< CSeq_entry > &seqEntry) const
Definition: cuCdCore.cpp:529
bool GetBioseqForRow(int rowId, CRef< CBioseq > &bioseq)
Definition: cuCdCore.cpp:561
string GetAccession(int &Version) const
Definition: cuCdCore.cpp:81
bool GetSeqEntryForIndex(int seqIndex, CRef< CSeq_entry > &seqEntry) const
Definition: cuCdCore.cpp:679
Definition: Dbtag.hpp:53
Computes a PSSM as specified in PSI-BLAST.
Definition: Pssm.hpp:55
Definition: Seq_entry.hpp:56
const char * GetMatrixName()
Obtain the name of the underlying matrix to use when building the PSSM.
int countResiduesInRow(int row)
unsigned int GetQueryLength()
Get the query's length.
void copyRow(PSIMsaCell *src, PSIMsaCell *dest)
CdPssmInput(ResidueProfiles &profiles, PssmMakerOptions &config, bool useConsensus)
string m_matrixName
void unalignLeadingTrailingGaps()
PSIDiagnosticsRequest m_diagRequest
unsigned char * m_query
ResidueProfiles & m_profiles
void moveUpLongestRow()
void Process()
Algorithm to produce multiple sequence alignment structure should be implemented in this method.
void read(ColumnResidueProfile &crp)
PSIMsa * GetData()
Obtain the multiple sequence alignment structure.
PSIMsaDimensions m_msaDimensions
Multiple sequence alignment dimensions.
const PSIDiagnosticsRequest * GetDiagnosticsRequest()
Obtain the diagnostics data that is requested from the PSSM engine Its results will be populated in t...
PSIMsa * m_msa
PSIBlastOptions * SetOptions()
Obtain the options for the PSSM engine.
PSIBlastOptions * m_options
unsigned char * GetQuery()
Get the query sequence used as master for the multiple sequence alignment in ncbistdaa encoding.
bool m_useConsensus
const PSIBlastOptions * GetOptions()
Obtain the options for the PSSM engine.
static const string m_residues
int getIndexByConsensus() const
static char getEaaCode(char stdCode)
void getResiduesByRow(vector< char > &residues, bool byNcbiStd=true) const
static unsigned char getNcbiStdCode(char eaa)
const string & getConsensus()
CRef< CSeq_entry > getConsensusSeqEntry()
ResidueProfiles & getResidueProfiles()
const BlockModelPair & getGuideAlignment() const
void skipUnalignedSeg(int threshold)
const string & getConsensus()
void setOptions(const PssmMakerOptions &option)
CRef< CPssmWithParameters > m_pssmMade
bool m_addQuery
CCdCore * m_cd
PssmMaker(CCdCore *cd, bool useConsensus=true, bool addQueryToPssm=true)
short m_pseudoCount
ConsensusMaker * m_conMaker
CdPssmInput * m_pssmInput
vector< char > m_trunctMaster
bool getTrunctMaster(CRef< CSeq_entry > &seqEntry)
CRef< CPssmWithParameters > makeDefaultPssm()
CRef< CPssmWithParameters > make()
PssmMakerOptions m_config
void printAlignment(string &fileName)
void getPssmColumnResidues(map< unsigned int, string > &columnMap)
bool m_useConsensus
void modifyQuery(CRef< CSeq_entry > query)
const BlockModelPair & getGuideAlignment()
void printAlignmentByColumn(string &fileName)
CRef< CSeq_entry > m_masterSeqEntry
int score(const CRef< CSeq_align > align, const CRef< CBioseq > bioseq)
const vector< CRef< CSeq_id > > getSeqIdsByRow() const
void traverseColumnsOnMaster(ColumnReader &cr)
double calcInformationContent(bool byConsensus=true)
const string getConsensus(bool inNcbieaa=true)
int countColumnsOnMaster(string &seq)
void traverseColumnsOnConsensus(ColumnReader &cr)
int getNumRows() const
int GetScore(char i, char j)
void clear()
Definition: map.hpp:169
Definition: map.hpp:338
static int nRows
Definition: cn3d_png.cpp:115
The NCBI C++ standard methods for dealing with std::string.
struct config config
int findHighestScoringRowByPssm(CCdCore *ccd)
static void printMsa(const char *filename, const PSIMsa *msa, vector< string > &seqIds)
Definition: cuPssmMaker.cpp:51
string GetScoringMatrixName(EScoreMatrixType type)
EScoreMatrixType
@ ePam250
@ eBlosum62
@ eBlosum45
void NcbistdaaToNcbieaaString(const vector< char > &vec, string *str)
bool IsConsensus(const CRef< CSeq_id > &seqId)
Definition: cuSequence.cpp:405
static const char fp[]
Definition: des.c:87
thread_local unique_ptr< FtaMsgPost > bmp
Definition: ftaerr.cpp:120
#define false
Definition: bool.h:36
static char tmp[3200]
Definition: utf8.c:42
#define option
CRef< objects::CPssmWithParameters > Run()
Runs the PSSM engine to compute the PSSM.
#define NULL
Definition: ncbistd.hpp:225
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static int CompareNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive compare of a substring with another string.
Definition: ncbistr.cpp:219
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5432
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string (in-place)
Definition: ncbistr.cpp:3192
void SetId(TId &value)
Assign a value to Id data member.
Definition: Cdd_.cpp:84
void SetDescription(TDescription &value)
Assign a value to Description data member.
Definition: Cdd_.cpp:94
const TName & GetName(void) const
Get the Name member data.
Definition: Cdd_.hpp:1124
void SetTag(TTag &value)
Assign a value to Tag data member.
Definition: Dbtag_.cpp:66
TStr & SetStr(void)
Select the variant.
Definition: Object_id_.hpp:304
void SetDb(const TDb &value)
Assign a value to Db data member.
Definition: Dbtag_.hpp:229
TId & SetId(void)
Select the variant.
Definition: Object_id_.hpp:277
void SetPssm(TPssm &value)
Assign a value to Pssm data member.
void SetIntermediateData(TIntermediateData &value)
Assign a value to IntermediateData data member.
Definition: Pssm_.cpp:99
void SetFinalData(TFinalData &value)
Assign a value to FinalData data member.
Definition: Pssm_.cpp:116
void SetNumColumns(TNumColumns value)
Assign a value to NumColumns data member.
Definition: Pssm_.hpp:666
void SetNumRows(TNumRows value)
Assign a value to NumRows data member.
Definition: Pssm_.hpp:619
TGeneral & SetGeneral(void)
Select the variant.
Definition: Seq_id_.cpp:375
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
void ResetId(void)
Reset Id data member.
Definition: Bioseq_.cpp:54
TTitle & SetTitle(void)
Select the variant.
Definition: Seqdesc_.hpp:1039
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
void SetLength(TLength value)
Assign a value to Length data member.
Definition: Seq_inst_.hpp:668
void SetSeq_data(TSeq_data &value)
Assign a value to Seq_data data member.
Definition: Seq_inst_.cpp:130
void ResetSeq_data(void)
Reset Seq_data data member.
Definition: Seq_inst_.cpp:125
@ e_Ncbieaa
extended ASCII 1 letter aa codes
Definition: Seq_data_.hpp:111
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
int len
#define ASSERT
macro for assert.
Definition: ncbi_std.h:107
T max(T x_, T y_)
#define count
Declares the CPSIBlastOptionsHandle class.
C++ API for the PSI-BLAST PSSM engine.
#define row(bind, expected)
Definition: string_bind.c:73
Options used in protein BLAST only (PSI, PHI, RPS and translated BLAST) Some of these possibly should...
Boolean nsg_compatibility_mode
Compatibility option for the NCBI's structure group (note nsg_ prefix, stands for NCBI's structure gr...
double impala_scaling_factor
Scaling factor as used in IMPALA to do the matrix rescaling.
Int4 pseudo_count
Pseudocount constant.
Structure to allow requesting various diagnostics data to be collected by PSSM engine.
Definition: blast_psi.h:181
Boolean information_content
request information content
Definition: blast_psi.h:182
Boolean frequency_ratios
request frequency ratios
Definition: blast_psi.h:187
Boolean independent_observations
request number of independent observations
Definition: blast_psi.h:194
Boolean weighted_residue_frequencies
request observed weighted residue frequencies
Definition: blast_psi.h:185
Boolean gapless_column_weights
request gapless column weights
Definition: blast_psi.h:188
Boolean residue_frequencies
request observed residue frequencies
Definition: blast_psi.h:183
Structure to describe the characteristics of a position in the multiple sequence alignment data struc...
Definition: blast_psi.h:49
Boolean is_aligned
Is this letter part of the alignment?
Definition: blast_psi.h:52
Uint1 letter
Preferred letter at this position, in ncbistdaa encoding.
Definition: blast_psi.h:50
Uint4 num_seqs
Number of distinct sequences aligned with the query (does not include the query)
Definition: blast_psi.h:59
Uint4 query_length
Length of the query.
Definition: blast_psi.h:58
Multiple sequence alignment (msa) data structure containing the raw data needed by the PSSM engine to...
Definition: blast_psi.h:75
PSIMsaCell ** data
actual data, dimensions are (dimensions->num_seqs+1) by (dimensions->query_length)
Definition: blast_psi.h:77
PSIMsaDimensions * dimensions
dimensions of the msa
Definition: blast_psi.h:76
bool IsRequestingIntermediateData()
Definition: cuPssmMaker.cpp:94
double inclusionThreshold
Definition: cuPssmMaker.hpp:76
bool requestFrequencyRatios
request frequency ratios
Definition: cuPssmMaker.hpp:71
bool requestWeightedResidueFrequencies
request observed weighted residue frequencies
Definition: cuPssmMaker.hpp:70
bool requestResidueFrequencies
request observed residue frequencies
Definition: cuPssmMaker.hpp:69
bool requestInformationContent
request information content
Definition: cuPssmMaker.hpp:68
bool requestNumIndepObs
request number of independent observations per position
Definition: cuPssmMaker.hpp:72
static string query
void free(voidpf ptr)
voidp calloc(uInt items, uInt size)
Modified on Wed Sep 04 14:59:15 2024 by modify_doxy.py rev. 669887