NCBI C++ ToolKit
cuResidueProfile.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: cuResidueProfile.hpp 45770 2010-05-17 14:44:31Z lanczyck $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Charlie
27  *
28  * File Description:
29  *
30  * Make consensus and remaster with it
31  *
32  * ===========================================================================
33  */
34 
35 #ifndef CU_RESIDUE_PROFILE_HPP
36 #define CU_RESIDUE_PROFILE_HPP
37 
40 
42 BEGIN_SCOPE(cd_utils)
43 
44 //residue profile on one column
46  {
47  public:
48  static const string m_residues;
49  static unsigned char getNcbiStdCode(char eaa);
50  static char getEaaCode(char stdCode) {return m_residues[stdCode];}
51 
52  ColumnResidueProfile(); //init occurrence to 0
54 
55  void addOccurence(char residue, int row, bool aligned);
56  double calculateColumnWeight(char residue, bool countGap, int numRows)const;
57  //return the total weights for this column, which should be 1
58  double sumUpColumnWeightsByRow(vector<double>& rowWeights, bool countGap, int numRows) const;
59  double reweightColumnByRowWeights(const vector<double>& rowWeights, char& heaviestResidue)const;
60  int getSumCount() const;
61  char getMostFrequentResidue(int& count) const ;
62  //bool hasRow(int row) const;
63  void getResiduesByRow(vector<char>& residues, bool byNcbiStd=true)const;
64  //residues will be in Ncbistd
65  unsigned char getResidueByRow(int row);
66  bool isAligned(char residue, int row)const;
67  bool isAligned(int row);
68  bool isAllRowsAligned()const;
69  void setIndexByConsensus(int col) {m_indexByConsensus = col;};
70  int getIndexByConsensus()const {return m_indexByConsensus;};
71  int getResidueTypeCount()const { return m_residueTypeCount;}
72  typedef pair<int, bool>RowStatusPair;
74 
75  double calcInformationContent();
76 
77  private:
78 
80  return (row > (int)(m_residuesByRow.size()-1)) ? 0 : m_residuesByRow[row];
81  };
82  // inline ResidueRowsMap::const_iterator* findRow(int row)const;
83  //set<int> m_rows;
84  bool m_masterIn;
86  //to speed up findRow
87  vector<ResidueRowsMap::iterator*> m_residuesByRow;
89  static void useDefaultBackgroundResFreq();
90  double getBackgroundResFreq(char res);
93  };
94 
96  {
97  public:
98  ColumnAddress(int posOnMaster, int gap=0);
99  ColumnAddress();
100  ~ColumnAddress();
101  bool operator<(const ColumnAddress& rhs) const;
102 
103  int mPos;
104  int gap;
105  };
106 
107  //interface
109  {
110  public:
111  virtual void read(ColumnResidueProfile& crp) = 0;
112  virtual ~ColumnReader() {};
113  };
114 
116 {
117 public:
118  MasterColumnCounter():m_count(0){};
119  virtual ~MasterColumnCounter() {};
120  virtual void read(ColumnResidueProfile& crp) {m_count++; m_seq += crp.getResidueByRow(0); }
121  int getCount()const {return m_count;}
122  string& getSeq() {return m_seq;}
123 private:
124  int m_count;
125  string m_seq;
126 };
127 
128 //forward decl
130 {
131 public:
132  typedef pair<int,int> Seg;
133 
135 
136  void setIndexSequence(string& seq);
137  string getIndexSequence();
138  void read(ColumnResidueProfile& crp);
139  Seg getLongestSeg();
140  int getLenOfLongestSeg();
141  int getTotalUnaligned() {return m_totalUnaligned;}
142  int getTotal() {return m_pos;}
143  int getLongUnalignedSegs(int length, vector<Seg>& segs);
144  string subtractLongestSeg(int threshold);
145  //string subtractLongSeg(int length);
146  string subtractSeg(Seg seg, string& in);
147 private:
150  vector<Seg> m_unalignedSegs;
152  int m_pos;
153  string m_indexSeq;
154 
155  int getLen(Seg seg);
156 };
157 
159  {
160  public:
161 
162  ResidueProfiles();
163  ~ResidueProfiles();
164  void setInclusionThreshold(double th){m_frequencyThreshold = th;};
165  void addOneRow(BlockModelPair& bmp, const string& mSeq, const string& sSeq);
166  void calculateRowWeights();
167  const string& makeConsensus();
168  //inNcbieaa=false, return string in ncbistdaa
169  const string getConsensus(bool inNcbieaa=true) ;
170  const BlockModelPair& getGuideAlignment() const;
171  BlockModelPair& getGuideAlignment();
172  int countColumnsOnMaster(string& seq);
173 
174  void countUnalignedConsensus(UnalignedSegReader& ucr );
175  bool skipUnalignedSeg(UnalignedSegReader& ucr, int len);
176  void adjustConsensusAndGuide();
177 
178  void traverseAllColumns(ColumnReader& cr);
179  void traverseColumnsOnMaster(ColumnReader& cr);
180  void traverseColumnsOnConsensus(ColumnReader& cr);
181  void traverseAlignedColumns(ColumnReader& cr);
182  int getNumRows()const {return m_totalRows;}
183  double calcInformationContent(bool byConsensus=true);
184  const vector< CRef< CSeq_id> > getSeqIdsByRow() const { return m_seqIds;}
185 
186  // Keep track of how many candidate columns failed the weight check against m_frequencyThreshold.
187  // This is useful to know when trying to decide if two adjacent consensus positions are able
188  // to be included in the same block. If there were failures of the weight check between
189  // those two positions, that means a block can't safely span those two consensus positions
190  // because a) at least one row in the alignment has residues in the failed columns and b) this
191  // requires a block break to avoid erroneously including those skipped residues (which do not
192  // map to any column in the consensus) in the block.
196  unsigned int GetNumUnqualAfterIndex(int index) const;
197  bool HasUnqualAfterIndex(int index) const;
198 
199  private:
202  //double m_rowWeightsSum;
203  //col address vs. ColResidueProfile
206  vector<double> m_rowWeights;
207  vector< CRef< CSeq_id> > m_seqIds;
208 
211 
212  // Number of candidate columns that did not pass m_frequencyThreshold (map value)
213  // between consensus index i (map key) and i+1. Filled in during 'makeConsensus';
214  // adjusted if 'adjustConsensusAndGuide' called after 'makeConsensus'.
215  // (index -1 == before first consensus residue)
217 
218  void segsToSet(vector<UnalignedSegReader::Seg>& segs,set<int>& cols);
219 
220  //results
221  string m_consensus;
223  };
224 
225 
226 END_SCOPE(cd_utils)
228 
229 #endif
virtual void read(ColumnResidueProfile &crp)=0
virtual ~ColumnReader()
static const string m_residues
unsigned char getResidueByRow(int row)
multimap< char, RowStatusPair > ResidueRowsMap
pair< int, bool > RowStatusPair
ResidueRowsMap::iterator * findRow(int row)
int getIndexByConsensus() const
static char getEaaCode(char stdCode)
void setIndexByConsensus(int col)
vector< ResidueRowsMap::iterator * > m_residuesByRow
static map< char, double > m_backgroundResFreq
ResidueRowsMap m_residueRowsMap
int getResidueTypeCount() const
virtual void read(ColumnResidueProfile &crp)
const vector< CRef< CSeq_id > > getSeqIdsByRow() const
void setInclusionThreshold(double th)
vector< double > m_rowWeights
set< int > m_colsToSkipOnConsensus
UnqualForConsMap::const_iterator UnqualForConsCit
PosProfileMap m_profiles
BlockModelPair m_guideAlignment
UnqualForConsMap m_numUnqualAfterConsIndex
set< int > m_colsToSkipOnMaster
map< int, unsigned int > UnqualForConsMap
UnqualForConsMap::iterator UnqualForConsIt
map< ColumnAddress, ColumnResidueProfile > PosProfileMap
vector< CRef< CSeq_id > > m_seqIds
int getNumRows() const
vector< Seg > m_unalignedSegs
pair< int, int > Seg
container_type::const_iterator const_iterator
Definition: map.hpp:53
bool operator<(const CEquivRange &A, const CEquivRange &B)
thread_local unique_ptr< FtaMsgPost > bmp
Definition: ftaerr.cpp:120
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define NCBI_CDUTILS_EXPORT
Definition: ncbi_export.h:376
<!DOCTYPE HTML >< html > n< header > n< title > PubSeq Gateway Help Page</title > n< style > n th
int len
std::istream & in(std::istream &in_, double &x_)
#define count
#define row(bind, expected)
Definition: string_bind.c:73
Modified on Fri Sep 20 14:58:32 2024 by modify_doxy.py rev. 669887