NCBI C++ ToolKit
cuResidueProfile.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: cuResidueProfile.cpp 45770 2010-05-17 14:44:31Z lanczyck $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  */
24 #include <ncbi_pch.hpp>
28 #include <math.h>
29 
31 BEGIN_SCOPE(cd_utils)
32 
33 // define ColumnResidueProfile
34 const string ColumnResidueProfile::m_residues = "-ABCDEFGHIKLMNPQRSTVWXYZU*OJ"; //ncbieaa
35 
36 unsigned char ColumnResidueProfile::getNcbiStdCode(char eaa)
37 {
38  unsigned char ret = m_residues.find(eaa);
39  if(ret < m_residues.size())
40  return ret;
41  else
42  return m_residues.find('X');
43 }
44 
46 :m_masterIn(false), m_residueRowsMap(), m_residueTypeCount(0), m_indexByConsensus(-1)
47  //m_backgroundResFreq()
48 {
49 }
50 
52 {
53  for (unsigned int i = 0; i < m_residuesByRow.size(); i++)
54  delete m_residuesByRow[i];
55 }
56 
57 void ColumnResidueProfile::addOccurence(char residue, int row, bool aligned)
58 {
59  //this deals with columns on the master which can be added more than once
60  if (row == 0)
61  {
62  if (m_masterIn)
63  return;
64  else
65  m_masterIn = true;
66  }
67  if (m_residueRowsMap.count(residue) == 0) //first time to see this residue
69  if ((int)m_residuesByRow.size() < row)
70  {
71  m_residuesByRow.resize(row, 0);
72  }
73  assert(m_residuesByRow.size() == row);
76  m_residuesByRow.push_back(it);
77  //m_rows.insert(row);
78 }
79 
80 /*
81 bool ColumnResidueProfile::hasRow(int row) const
82 {
83  return m_rows.find(row) != m_rows.end();
84 }*/
85  /*
86 ColumnResidueProfile::ResidueRowsMap::iterator* ColumnResidueProfile::findRow(int row)
87 {
88  if (row > (m_residuesByRow.size()-1))
89  return 0;
90  else
91  return m_residuesByRow[row];
92 }
93  */
94 
95 /*ColumnResidueProfile::ResidueRowsMap::const_iterator* ColumnResidueProfile::findRow(int row)const
96 {
97  return m_residuesByRow[row];
98 }*/
99 
101 {
102  return m_residueRowsMap.size();
103 }
104 
106 {
107  unsigned int max = 0;
109  for (unsigned int i = 1; i < m_residues.size(); i++)
110  {
111  if ((int)m_residueRowsMap.count(m_residues[i]) > count)
112  {
113  max = i;
115  }
116  }
117  return m_residues[max];
118 }
119 
120 double ColumnResidueProfile::calculateColumnWeight(char residue, bool countGap, int numRows)const
121 {
122  if (m_residueRowsMap.count(residue) == 0)
123  return 0;
124  else
125  {
126  if (!countGap)
127  {
128  double denom = (double)(m_residueTypeCount * (m_residueRowsMap.count(residue)));
129  double w = 1.0/denom;
130  return w;
131  }
132  else
133  {
134  int numGap = numRows - getSumCount();
135  double denom = 0.0;
136  if (numGap > 0)
137  denom = (double)((m_residueTypeCount+1) * (m_residueRowsMap.count(residue)));
138  else
139  denom = (double)(m_residueTypeCount * (m_residueRowsMap.count(residue)));
140  double w = 1.0/denom;
141  return w;
142  }
143  }
144 }
145 
146 //return the total weights for this column, which should be 1
147 double ColumnResidueProfile::sumUpColumnWeightsByRow(vector<double>& rowWeights, bool countGap, int numRows) const
148 {
149  double total = 0.0;
151  set<int> rowsUsed;
152  for (; cit != m_residueRowsMap.end(); cit++)
153  {
154  double colResWeight = calculateColumnWeight(cit->first, countGap, numRows);
155  rowWeights[cit->second.first] += colResWeight;
156  total += colResWeight;
157  rowsUsed.insert(cit->second.first);
158  }
159  if (countGap && ((int)rowsUsed.size() < numRows))
160  {
161  double gapWeight = (1.0 - total) / (numRows - rowsUsed.size());
162  for (int row = 0; row < numRows; row++)
163  {
164  if(rowsUsed.find(row) != rowsUsed.end())
165  rowWeights[row] += gapWeight;
166  }
167  }
168  return 1.0;
169 }
170 
171 double ColumnResidueProfile::reweightColumnByRowWeights(const vector<double>& rowWeights, char& heaviestResidue)const
172 {
173  double totalWeight = 0;
174  double maxResWeight = 0;
175  double resWeight = 0;
176  for (unsigned int i = 0; i < m_residues.size(); i++)
177  {
178  pair <ResidueRowsMap::const_iterator, ResidueRowsMap::const_iterator> range =
180  resWeight = 0;
181  for (ResidueRowsMap::const_iterator cit = range.first; cit != range.second; cit++)
182  {
183  int row = cit->second.first;
184  resWeight += rowWeights[row];
185  }
186  if (resWeight > maxResWeight)
187  {
188  heaviestResidue = m_residues[i];
189  maxResWeight = resWeight;
190  }
191  totalWeight += resWeight;
192  }
193  return totalWeight;
194 }
195 
196 //residues will be in Ncbistd by default
197 void ColumnResidueProfile::getResiduesByRow(vector<char>& residues, bool byNcbiStd)const
198 {
200  for (; rit != m_residueRowsMap.end(); rit++)
201  {
202  if (byNcbiStd)
203  residues[rit->second.first] = getNcbiStdCode(rit->first);
204  else
205  residues[rit->second.first] = rit->first;
206  }
207 }
208 
209 //residues will be in Ncbistd
211 {
213  if (rit)
214  {
215  return getNcbiStdCode((*rit)->first);
216  }
217  else
218  return getNcbiStdCode('-');
219 }
220 
221 
222 bool ColumnResidueProfile::isAligned(char residue, int row)const
223 {
224  pair <ResidueRowsMap::const_iterator, ResidueRowsMap::const_iterator> range =
226  for (ResidueRowsMap::const_iterator cit = range.first; cit != range.second; cit++)
227  {
228  if (cit->second.first == row)
229  return cit->second.second;
230  }
231  return false;
232 }
233 
235 {
237  if (cit)
238  return (*cit)->second.second;
239  else
240  return false;
241 }
242 
244 {
246  cit != m_residueRowsMap.end(); cit++)
247  {
248  if (!(cit->second.second))
249  return false;
250  }
251  return true;
252 }
253 
256 
258 {
259  //-------------------------------------------------------------------------
260  //
261  // residue frequencies, SWISS-PROT, Release 40.20, 06-Jun-2002:
262  // Ala (A) 7.67 Gln (Q) 3.93 Leu (L) 9.56 Ser (S) 7.04
263  // Arg (R) 5.21 Glu (E) 6.48 Lys (K) 5.95 Thr (T) 5.55
264  // Asn (N) 4.32 Gly (G) 6.88 Met (M) 2.37 Trp (W) 1.20
265  // Asp (D) 5.25 His (H) 2.25 Phe (F) 4.08 Tyr (Y) 3.14
266  // Cys (C) 1.62 Ile (I) 5.85 Pro (P) 4.89 Val (V) 6.63
267  //
268  //-------------------------------------------------------------------------
269  // these are the only residues used for calculating information content
270  m_backgroundResFreq['A'] = 7.67/100.0; // Ala
271  m_backgroundResFreq['R'] = 5.21/100.0; // Arg
272  m_backgroundResFreq['N'] = 4.32/100.0; // Asn
273  m_backgroundResFreq['D'] = 5.25/100.0; // Asp
274  m_backgroundResFreq['C'] = 1.62/100.0; // Cys
275  m_backgroundResFreq['Q'] = 3.93/100.0; // Gln
276  m_backgroundResFreq['E'] = 6.48/100.0; // Glu
277  m_backgroundResFreq['G'] = 6.88/100.0; // Gly
278  m_backgroundResFreq['H'] = 2.25/100.0; // His
279  m_backgroundResFreq['I'] = 5.85/100.0; // Ile
280  m_backgroundResFreq['L'] = 9.56/100.0; // Leu
281  m_backgroundResFreq['K'] = 5.95/100.0; // Lys
282  m_backgroundResFreq['M'] = 2.37/100.0; // Met
283  m_backgroundResFreq['F'] = 4.08/100.0; // Phe
284  m_backgroundResFreq['P'] = 4.89/100.0; // Pro
285  m_backgroundResFreq['S'] = 7.04/100.0; // Ser
286  m_backgroundResFreq['T'] = 5.55/100.0; // Thr
287  m_backgroundResFreq['W'] = 1.20/100.0; // Trp
288  m_backgroundResFreq['Y'] = 3.14/100.0; // Tyr
289  m_backgroundResFreq['V'] = 6.63/100.0; // Val
290 }
291 
293 {
294  if (m_backgroundResFreqArray == 0)
296 
298  /*
299  if (m_backgroundResFreq.size() == 0)
300  useDefaultBackgroundResFreq();
301  if (m_backgroundResFreq.find(res) == m_backgroundResFreq.end())
302  return 0.0;
303  else
304  return m_backgroundResFreq[res];
305  */
306 }
307 
309 {
310  double info = 0;
311  double freqThreshold = 0.0001f;
312  double total = (double) m_residueRowsMap.size();
313  static const double ln2 = log(2.0f);
314  for (unsigned int i = 0; i < m_residues.size(); i++)
315  {
316 
317  int count = m_residueRowsMap.count(m_residues[i]);
318  if (count > 0)
319  {
320  //double standardFreq = GetStandardProbability(m_residues[i]);
321  double standardFreq = getBackgroundResFreq(m_residues[i]);
322  if ( standardFreq > freqThreshold)
323  {
324  double freq = double(count)/total;
325  double freqRatio = freq/standardFreq;
326  if (freqRatio > freqThreshold)
327  info += freq * (log(freqRatio))/ln2;
328  }
329  }
330  }
331  return info;
332 }
333 
334 //---------------------------ColumnAddress--------------------------------
335 
336 ColumnAddress::ColumnAddress(int posOnMaster, int aGap)
337 : mPos(posOnMaster), gap(aGap)
338 {
339 }
340 
342 : mPos(0), gap(0)
343 {
344 }
345 
347 {
348 }
349 
351 {
352  if (mPos == rhs.mPos)
353  return gap < rhs.gap;
354  else
355  return mPos < rhs.mPos;
356 }
357 
358 
359 //-------------------------ResidueProfiles----------------------------------------
360 
362 : m_frequencyThreshold(0.5),
363  m_totalRows(1), //starting at 1 because 0 is reserved for master
364  m_profiles(), m_rowWeights(),
365  m_consensus(), m_guideAlignment()
366 {
367 }
368 
370 {
371 }
372 
373 void ResidueProfiles::addOneRow(BlockModelPair& bmp, const string& mSeq, const string& sSeq)
374 {
375  //master row = 0
376  int masterRow = 0;
377  const vector<Block>& mBlocks = bmp.getMaster().getBlocks();
378  const vector<Block>& sBlocks = bmp.getSlave().getBlocks();
379  //keep seqIds
380  if (m_seqIds.size() == 0)
381  m_seqIds.push_back(bmp.getMaster().getSeqId());
382  m_seqIds.push_back(bmp.getSlave().getSeqId());
383  for (unsigned int bn = 0; bn < mBlocks.size(); bn++)
384  {
385  for (int i = 0; i < mBlocks[bn].getLen(); i++)
386  {
387  int mPos = mBlocks[bn].getStart() + i;
388  int sPos = sBlocks[bn].getStart() + i;
389  ColumnAddress col(mPos);
390  m_profiles[col].addOccurence(mSeq[mPos], masterRow, true);
391  m_profiles[col].addOccurence(sSeq[sPos], m_totalRows, true);
392  }
393  //add the unaligned region to the c-term unless it is the last block
394  if (bn != (mBlocks.size() -1))
395  {
396  int mPos = mBlocks[bn].getEnd();
397  int sPos = sBlocks[bn].getEnd();
398  //on the slave
399  int sGapLen = bmp.getSlave().getGapToCTerminal(bn);
400  int mGapLen = bmp.getMaster().getGapToCTerminal(bn);
401  //assume the input alignment is degapped, one of above must be 0
402  //assert((sGapLen == 0) || (mGapLen == 0));
403  //add cols on master
404  for (int gap =1; gap <= mGapLen; gap++)
405  {
406  ColumnAddress col(mPos + gap);
407  m_profiles[col].addOccurence(mSeq[mPos+gap], masterRow, false);
408 
409  }
410  //add cols on slave
411  //split the gap in the middle
412  int midOnMaster = mGapLen/2 + mGapLen%2;
413  int mid = sGapLen/2 + sGapLen%2;
414  for (int gap =1; gap <= sGapLen; gap++)
415  {
416  ColumnAddress col;
417  if ( gap <= mid)
418  {
419  if (gap <= midOnMaster)
420  {
421  col.gap = 0;
422  col.mPos = mPos + gap;
423  }
424  else // gap > midOnMaster
425  {
426  col.mPos = mPos + midOnMaster;
427  col.gap = gap - midOnMaster;
428  }
429  }
430  else // (gap > mid) //attach to the next block
431  {
432  int mPosNext = mBlocks[bn+1].getStart();
433  int sPosNext = sBlocks[bn+1].getStart();
434  int delta = sPosNext - (sPos + gap);
435  if (delta <= (mGapLen/2))
436  {
437  col.mPos = mPosNext - delta;
438  col.gap = 0;
439  }
440  else
441  {
442  col.mPos = mPosNext - (mGapLen/2);
443  col.gap = (mGapLen/2) - delta;
444  }
445  }
446  m_profiles[col].addOccurence(sSeq[sPos+gap], m_totalRows, false);
447  }
448  }
449  }
450  m_totalRows++;
451 }
452 
453 
455 {
456  bool countGap = false;
457  //iterate through all column Profile to add up the row weights
458  m_rowWeights.assign(m_totalRows, 0.0);
459  double weightsSum = 0;
460  int colUsed = 0;
462 
463  //get highest ungapped count
464  int highestCount = 0;
465  for (; cit != m_profiles.end(); cit++)
466  {
467  int aCount = cit->second.getSumCount();
468  if (aCount > highestCount)
469  highestCount = aCount;
470  }
471  cit = m_profiles.begin();
472  for (; cit != m_profiles.end(); cit++)
473  {
474  const ColumnResidueProfile& colProfile = cit->second;
475  if (!countGap)
476  {
477  //only use columns that has all rows to calculate the row weight
478  if (colProfile.getSumCount() >= highestCount)
479  {
480  weightsSum += colProfile.sumUpColumnWeightsByRow(m_rowWeights, countGap, m_totalRows);
481  colUsed++;
482  }
483  }
484  //count gap
485  //but ignore columns with identical residues
486  else if (colProfile.getResidueTypeCount() > 1 )
487  {
488  weightsSum += colProfile.sumUpColumnWeightsByRow(m_rowWeights, countGap, m_totalRows);
489  colUsed++;
490  }
491  }
492  //debug
493  //check weightsSum should round up to colUsed
494  //printf("Column with all rows:%d. Total row Weight: %.2f\n", colUsed, weightsSum);
495  //if a row did not get a weight, give it a default
496  double defaultWeight = 0.0;
497  if (colUsed != 0)
498  defaultWeight = 1.0f * double(colUsed)/double(m_totalRows);
499  else
500  defaultWeight = 1.0f * double(m_profiles.size())/double(m_totalRows);
501  int noWeightRows = 0;
502  /*
503  double minW = 0.0, maxW = 0.0;
504  for ( int i = 0; i < m_rowWeights.size(); i++)
505  {
506  if (m_rowWeights[i] > maxW)
507  maxW = m_rowWeights[i];
508  if (m_rowWeights[i] != 0.00)
509  {
510  if (minW == 0.0)
511  minW = m_rowWeights[i];
512  else if (m_rowWeights[i] < minW)
513  minW = m_rowWeights[i];
514  }
515 
516  }*/
517  /*
518  printf("default weight=%.4f; minW=%.4f;maxW=%.4f;colUsed=%d; wieghtSum=%.4f, totalRow=%d\n", defaultWeight,
519  minW, maxW, colUsed, weightsSum, m_totalRows);*/
520  for ( unsigned int i = 0; i < m_rowWeights.size(); i++)
521  {
522  //normalize the weight by weightsSum
523  if (m_rowWeights[i] == 0.0)
524  {
525  m_rowWeights[i] = defaultWeight;
526  weightsSum += m_rowWeights[i];
527  noWeightRows++;
528  }
529  }
530  for ( unsigned int i = 0; i < m_rowWeights.size(); i++)
531  {
532  m_rowWeights[i] = m_rowWeights[i]/weightsSum;
533  }
534  //printf("number of no weight columns=%d\n", noWeightRows);
535  //debug
536  /*
537  for ( int i = 0; i < m_rowWeights.size(); i++)
538  {
539  printf("Row: %d | Weight: %.3f\n", i, m_rowWeights[i]);
540  }*/
541 }
542 
544 {
545  vector<Block>& blocksOnMaster = m_guideAlignment.getMaster().getBlocks();
546  vector<Block>& blocksOnConsensus = m_guideAlignment.getSlave().getBlocks();
547  blocksOnMaster.clear();
548  blocksOnConsensus.clear();
549  m_consensus.erase();
551 
552  bool inBlock = false;
553  int startM = 0, endM = 0;
554  int startC = 0;
555  int blockId = 0;
556  double threshold = m_frequencyThreshold;
557 
559  for (; cit != m_profiles.end(); cit++)
560  {
561  const ColumnAddress& col = cit->first;
562 
563  char res = 0;
564  double weight = (cit->second).reweightColumnByRowWeights(m_rowWeights, res);
565 
566  bool qualifiedForConsensus = (weight >= threshold && res );
567  bool qualifiedForGuide = qualifiedForConsensus && ((cit->second).isAligned(0)); //is aligned on master
568  //bool qualifiedForGuide = qualifiedForConsensus && (col.gap == 0); //not a gap on the master
569 
570  if (!inBlock)
571  {
572  if (qualifiedForGuide)
573  {
574  startM = col.mPos;
575  endM = startM;
576  startC = m_consensus.size();
577  //m_consensus += res;
578  inBlock = true;
579  }
580  }
581  else
582  {
583  if (qualifiedForGuide)
584  {
585  assert(col.mPos > endM);
586  if (col.mPos == (endM + 1)) //continue on the previous block
587  {
588  endM++;
589  }
590  else
591  {
592  //save the last block
593  blocksOnMaster.push_back(Block(startM, endM - startM + 1, blockId));
594  blocksOnConsensus.push_back(Block(startC, endM - startM + 1, blockId));
595  //start a new block
596  blockId++;
597  startM = col.mPos;
598  endM = startM;
599  startC = m_consensus.size();
600  }
601  }
602  else //ending this block
603  {
604  inBlock = false;
605  blocksOnMaster.push_back(Block(startM, endM - startM + 1, blockId));
606  blocksOnConsensus.push_back(Block(startC, endM - startM + 1, blockId));
607  blockId++;
608  }
609  }
610  if (qualifiedForConsensus)
611  {
612  cit->second.setIndexByConsensus(m_consensus.size());
613  m_consensus += res;
614  }
615  else
616  {
617  int consIndex = (int) m_consensus.size() - 1;
618  ++m_numUnqualAfterConsIndex[consIndex];
619  }
620  }
621  if (inBlock) //block goes to the end of the sequence
622  {
623  blocksOnMaster.push_back(Block(startM, endM - startM + 1, blockId));
624  blocksOnConsensus.push_back(Block(startC, endM - startM + 1, blockId));
625  }
627  return m_consensus;
628 }
629 
630 unsigned int ResidueProfiles::GetNumUnqualAfterIndex(int index) const
631 {
632  unsigned int result = 0;
634  if (cit != m_numUnqualAfterConsIndex.end()) {
635  result = cit->second;
636  }
637  return result;
638 }
639 
641 {
643 }
644 
645 const string ResidueProfiles::getConsensus(bool inNcbieaa)
646 {
647  if (inNcbieaa)
648  return m_consensus;
649  else
650  {
651  string ncbistd;
652  for (unsigned int i = 0; i < m_consensus.size(); i++)
653  {
655  }
656  return ncbistd;
657  }
658 }
659 
661 {
662  return m_guideAlignment;
663 }
664 
666 {
667  return m_guideAlignment;
668 }
669 
671 {
674  seq.assign(mcc.getSeq());
675  return mcc.getCount();
676 }
677 
679 {
681  for (; pit != m_profiles.end(); pit++)
682  {
683  cr.read(pit->second);
684  }
685 }
686 
688 {
690  for (; pit != m_profiles.end(); pit++)
691  {
692  if (pit->first.gap == 0)
693  {
694  int mPos = pit->first.mPos;
696  cr.read(pit->second);
697  }
698  }
699 }
700 
702 {
704  for (; pit != m_profiles.end(); pit++)
705  {
706  if (pit->second.getIndexByConsensus() >= 0)
707  cr.read(pit->second);
708  }
709 }
710 
712 {
714  for (; pit != m_profiles.end(); pit++)
715  {
716  if (pit->second.isAllRowsAligned())
717  cr.read(pit->second);
718  }
719 }
720 
722 {
723  double info = 0;
724 
726  for (; cit != m_profiles.end(); cit++)
727  {
728  ColumnResidueProfile& colProfile = cit->second;
729  //if (colProfile.isAllRowsAligned())
730  //if (colProfile.getSumCount() == m_totalRows)
731  bool useCol = false;
732  if ( byConsensus ) {
733  useCol = colProfile.getIndexByConsensus() >= 0;
734  }
735  else //by master
736  useCol = (cit->first.gap == 0);
737 
738  if (useCol)
739  {
740  info += colProfile.calcInformationContent();
741  }
742  }
743  return info;
744 }
745 
747 {
748  string consensus;
749  if (m_consensus.size() == 0) //master is the consensus
750  {
752  countColumnsOnMaster(consensus);
753  }
754  else
755  {
757  consensus = getConsensus(false);
758  }
759  //in Ncbistd
760  ucr.setIndexSequence(consensus);
761 }
762 
764 {
765  vector<Block>& blocksOnMaster = m_guideAlignment.getMaster().getBlocks();
766  vector<Block>& blocksOnConsensus = m_guideAlignment.getSlave().getBlocks();
767  blocksOnMaster.clear();
768  blocksOnConsensus.clear();
769  string curConsensus = m_consensus;
770  m_consensus.erase();
771 
774 
775  bool inBlock = false;
776  int startM = 0, endM = 0;
777  int startC = 0;
778  int blockId = 0;
780  while (cit != m_profiles.end() )
781  {
782  ColumnResidueProfile& colProfile = cit->second;
783  if (colProfile.getIndexByConsensus() < 0)
784  {
785  cit++;
786  continue;
787  }
788  const ColumnAddress& col = cit->first;
789  int conIndex = colProfile.getIndexByConsensus();
790  bool qualifiedForConsensus = (m_colsToSkipOnConsensus.find(conIndex) == m_colsToSkipOnConsensus.end());
791  bool qualifiedForGuide = qualifiedForConsensus && ((cit->second).isAligned(0)); //is aligned on master
792  if (!inBlock)
793  {
794  if (qualifiedForGuide)
795  {
796  startM = col.mPos;
797  endM = startM;
798  startC = m_consensus.size();
799  //m_consensus += res;
800  inBlock = true;
801  }
802  }
803  else
804  {
805  if (qualifiedForGuide)
806  {
807  assert(col.mPos > endM);
808  if (col.mPos == (endM + 1)) //continue on the previous block
809  {
810  endM++;
811  }
812  else
813  {
814  //save the last block
815  blocksOnMaster.push_back(Block(startM, endM - startM + 1, blockId));
816  blocksOnConsensus.push_back(Block(startC, endM - startM + 1, blockId));
817  //start a new block
818  blockId++;
819  startM = col.mPos;
820  endM = startM;
821  startC = m_consensus.size();
822  }
823  }
824  else //ending this block
825  {
826  inBlock = false;
827  blocksOnMaster.push_back(Block(startM, endM - startM + 1, blockId));
828  blocksOnConsensus.push_back(Block(startC, endM - startM + 1, blockId));
829  blockId++;
830  }
831  }
832  int unqualMapIndex = (int) m_consensus.size();
833  if (qualifiedForConsensus)
834  {
835  colProfile.setIndexByConsensus(m_consensus.size());
836  if (curMap.find(conIndex) != curMap.end()) {
837  m_numUnqualAfterConsIndex[unqualMapIndex] += curMap[conIndex];
838  }
839  m_consensus += curConsensus[conIndex];
840  }
841  else
842  {
843  // use unqualMapIndex - 1 here because need to give conIndex's entries to the last valid
844  // column in the rebuilt consensus.
845  if (curMap.find(conIndex) != curMap.end()) {
846  m_numUnqualAfterConsIndex[unqualMapIndex - 1] += curMap[conIndex];
847  }
848  ++m_numUnqualAfterConsIndex[unqualMapIndex - 1];
849 
850  colProfile.setIndexByConsensus(-2); //use -2 to indicate skiped consensus
851  }
852  cit++;
853  }
854  if (inBlock) //block goes to the end of the sequence
855  {
856  blocksOnMaster.push_back(Block(startM, endM - startM + 1, blockId));
857  blocksOnConsensus.push_back(Block(startC, endM - startM + 1, blockId));
858  }
860 }
861 
863 {
864  vector<UnalignedSegReader::Seg> segs;
865  ucr.getLongUnalignedSegs(len, segs);
866  if (segs.size() == 0)
867  return false;
868  if (m_consensus.size() == 0) //master is the consensus
869  {
871  }
872  else
873  {
875  }
876  return true;
877 }
878 
879 void ResidueProfiles::segsToSet(vector<UnalignedSegReader::Seg>& segs,set<int>& cols)
880 {
881  for(unsigned int i = 0; i < segs.size(); i++)
882  {
883  for(int k = segs[i].first; k <= segs[i].second; k++)
884  cols.insert(k);
885  }
886 }
887 
889  : m_totalUnaligned(0), m_pos(0)
890 {
891  m_maxSeg.first = -1;
892  m_maxSeg.second = -1;
893  m_curSeg.first = -1;
894  m_curSeg.second = -1;
895 }
896 
898 {
899  m_indexSeq = seq;
900 }
901 
903 {
904  return m_indexSeq;
905 }
906 
908 {
909  return m_maxSeg;
910 }
911 
913 {
914  return getLen(m_maxSeg);
915 }
916 
918 {
919  if (getLenOfLongestSeg() > threshold)
920  {
922  }
923  else
924  return m_indexSeq;
925 }
926 
927 int UnalignedSegReader::getLongUnalignedSegs(int length, vector<Seg>& segs)
928 {
929  for(unsigned int i = 0; i < m_unalignedSegs.size(); i++)
930  if (getLen(m_unalignedSegs[i]) >= length)
931  segs.push_back(m_unalignedSegs[i]);
932  return segs.size();
933 }
934 
935 /*
936 string UnalignedSegReader::subtractLongSeg(int length)
937 {
938  vector<Seg> segs;
939  getLongUnalignedSegs(length, segs);
940  string result = m_indexSeq;;
941  for (int i = 0; i < segs.size(); i++)
942  {
943  if (getLen(segs[i])> length)
944  result = subtractSeg(segs[i], result);
945  }
946  return result;
947 }*/
948 
950 {
951  return seg.second - seg.first + 1 ;
952 }
953 
955 {
956  string head = in.substr(0, seg.first);
957  string tail = in.substr(seg.second + 1, in.size() - (seg.second + 1));
958  return head + tail;
959 }
960 
962 {
963  if (crp.isAllRowsAligned()) //aligned
964  {
965  if (m_curSeg.first >= 0) //was in a unaligned region
966  {
968  m_unalignedSegs.push_back(m_curSeg);
969  if (m_maxSeg.first < 0)
970  {
971  m_maxSeg = m_curSeg;
972  }
973  else if ( getLen(m_curSeg) > getLen(m_maxSeg))
974  {
975  m_maxSeg = m_curSeg;
976  }
977  }
978  //-1 indicates that we are in an aligned region
979  m_curSeg.first = -1;
980  m_curSeg.second = -1;
981  }
982  else //unaligned
983  {
984  if (m_curSeg.first < 0) //see a new unaligned seg
985  {
986  m_curSeg.first = m_pos;
987  m_curSeg.second = m_curSeg.first;
988  }
989  else //continue an existing unaligned seg
990  {
991  m_curSeg.second++;
992  }
993  }
994  m_pos++;
995 }
996 
997 END_SCOPE(cd_utils)
Various auxiliary BLAST utility functions.
double * BLAST_GetStandardAaProbabilities(void)
Get the standard amino acid probabilities.
Definition: blast_util.c:1323
BlockModel & getMaster()
Definition: cuBlock.cpp:925
BlockModel & getSlave()
Definition: cuBlock.cpp:935
bool isValid() const
Definition: cuBlock.cpp:1005
vector< Block > & getBlocks()
Definition: cuBlock.hpp:97
bool operator<(const ColumnAddress &rhs) const
virtual void read(ColumnResidueProfile &crp)=0
double calculateColumnWeight(char residue, bool countGap, int numRows) const
double reweightColumnByRowWeights(const vector< double > &rowWeights, char &heaviestResidue) const
static const string m_residues
void addOccurence(char residue, int row, bool aligned)
unsigned char getResidueByRow(int row)
pair< int, bool > RowStatusPair
ResidueRowsMap::iterator * findRow(int row)
int getIndexByConsensus() const
double getBackgroundResFreq(char res)
void setIndexByConsensus(int col)
vector< ResidueRowsMap::iterator * > m_residuesByRow
double sumUpColumnWeightsByRow(vector< double > &rowWeights, bool countGap, int numRows) const
void getResiduesByRow(vector< char > &residues, bool byNcbiStd=true) const
static map< char, double > m_backgroundResFreq
ResidueRowsMap m_residueRowsMap
char getMostFrequentResidue(int &count) const
static unsigned char getNcbiStdCode(char eaa)
static void useDefaultBackgroundResFreq()
int getResidueTypeCount() const
bool isAligned(char residue, int row) const
vector< double > m_rowWeights
const string & makeConsensus()
set< int > m_colsToSkipOnConsensus
UnqualForConsMap::const_iterator UnqualForConsCit
PosProfileMap m_profiles
void traverseColumnsOnMaster(ColumnReader &cr)
BlockModelPair m_guideAlignment
UnqualForConsMap m_numUnqualAfterConsIndex
double calcInformationContent(bool byConsensus=true)
const string getConsensus(bool inNcbieaa=true)
set< int > m_colsToSkipOnMaster
void traverseAlignedColumns(ColumnReader &cr)
void countUnalignedConsensus(UnalignedSegReader &ucr)
const BlockModelPair & getGuideAlignment() const
bool HasUnqualAfterIndex(int index) const
int countColumnsOnMaster(string &seq)
unsigned int GetNumUnqualAfterIndex(int index) const
void addOneRow(BlockModelPair &bmp, const string &mSeq, const string &sSeq)
void traverseAllColumns(ColumnReader &cr)
void traverseColumnsOnConsensus(ColumnReader &cr)
vector< CRef< CSeq_id > > m_seqIds
bool skipUnalignedSeg(UnalignedSegReader &ucr, int len)
void segsToSet(vector< UnalignedSegReader::Seg > &segs, set< int > &cols)
void read(ColumnResidueProfile &crp)
int getLongUnalignedSegs(int length, vector< Seg > &segs)
string subtractSeg(Seg seg, string &in)
vector< Seg > m_unalignedSegs
pair< int, int > Seg
string subtractLongestSeg(int threshold)
void setIndexSequence(string &seq)
size_type size() const
Definition: map.hpp:148
const_iterator begin() const
Definition: map.hpp:151
const_iterator end() const
Definition: map.hpp:152
void clear()
Definition: map.hpp:169
const_iterator find(const key_type &key) const
Definition: map.hpp:153
const_iterator_pair equal_range(const key_type &key) const
Definition: map.hpp:296
size_type size() const
Definition: map.hpp:288
const_iterator end() const
Definition: map.hpp:292
iterator insert(const value_type &val)
Definition: map.hpp:305
const_iterator begin() const
Definition: map.hpp:291
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
size_type size() const
Definition: set.hpp:132
const_iterator find(const key_type &key) const
Definition: set.hpp:137
const_iterator end() const
Definition: set.hpp:136
#define head
Definition: ct_nlmzip_i.h:138
double * m_backgroundResFreqArray
thread_local unique_ptr< FtaMsgPost > bmp
Definition: ftaerr.cpp:120
#define false
Definition: bool.h:36
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
n font weight
int i
int len
static MDB_envinfo info
Definition: mdb_load.c:37
range(_Ty, _Ty) -> range< _Ty >
T max(T x_, T y_)
std::istream & in(std::istream &in_, double &x_)
Int4 delta(size_t dimension_, const Int4 *score_)
#define count
#define assert(x)
Definition: srv_diag.hpp:58
#define row(bind, expected)
Definition: string_bind.c:73
else result
Definition: token2.c:20
Modified on Fri Sep 20 14:58:21 2024 by modify_doxy.py rev. 669887