NCBI C++ ToolKit
pcsf.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef PCSF__HPP
2 #define PCSF__HPP
3 
4 /*
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Authors: Alexandre Souvorov
30  *
31  * File Description:
32  *
33  */
34 
36 
38 BEGIN_SCOPE(gnomon)
39 
// Scalar type of a single PhyloCSF score.
using TPhyloCSFScore = float;
// Sequence of PhyloCSF scores; CompactFullScores() below indexes it by position.
using TFVec = std::vector<TPhyloCSFScore>;
43  typedef pair<TSignedSeqPos, TPhyloCSFScore> TElement;
44  void Read(CNcbiIstream& from, size_t len) {
45  m_scores.resize(len);
46  if(!from.read(reinterpret_cast<char*>(m_scores.data()), len*sizeof(TElement))) {
47  cerr << "Error in PhyloCSF compact read\n";
48  exit(1);
49  }
50  }
51  void Write(CNcbiOstream& out) const {
52  if(!out.write(reinterpret_cast<const char*>(m_scores.data()), Size()*sizeof(TElement))) {
53  cerr << "Error in PhyloCSF compact write\n";
54  exit(1);
55  }
56  }
58  auto rslt = lower_bound(m_scores.begin(), m_scores.end(), p, [](const TElement& e, TSignedSeqPos i) { return e.first < i; });
59  if(rslt == m_scores.end() || rslt->first != p)
60  return 0;
61  else
62  return rslt->second;
63  }
64  size_t LowerBound(TSignedSeqPos p) const { return lower_bound(m_scores.begin(), m_scores.end(), p, [](const TElement& e, TSignedSeqPos i) { return e.first < i; })-m_scores.begin(); }
65  size_t UpperBound(TSignedSeqPos p) const { return upper_bound(m_scores.begin(), m_scores.end(), p, [](TSignedSeqPos i, const TElement& e) { return i < e.first; })-m_scores.begin(); }
66  size_t Size() const { return m_scores.size(); }
67 
68  vector<TElement> m_scores;
69 };
70 
72  TPhyloCSFScore Score(int s, TSignedSeqPos codon_left) const {
73  if(m_map != nullptr) {
74  codon_left = m_map->MapEditedToOrig(codon_left);
75  if(codon_left < 0)
76  return 0;
77  }
78  return (*m_scoresp)[s].Score(codon_left+m_shift);
79  }
80  TSignedSeqRange CompactRange(int s, TSignedSeqRange edited_range) const { // returns range of compact indices included in edited_range
81  TSignedSeqRange orig_range = edited_range;
82  if(m_map != nullptr) {
83  edited_range = m_map->ShrinkToRealPointsOnEdited(edited_range);
84  if(edited_range.Empty())
86  orig_range = m_map->MapRangeEditedToOrig(edited_range, false);
87  }
88  if(orig_range.Empty())
90  size_t left = (*m_scoresp)[s].LowerBound(orig_range.GetFrom()+m_shift); // first >= position
91  if(left == (*m_scoresp)[s].Size())
93  // result is not empty
94  size_t right = (*m_scoresp)[s].UpperBound(orig_range.GetTo()+m_shift)-1; // last <= position
95 
96  return TSignedSeqRange((TSignedSeqPos)left, (TSignedSeqPos)right);
97  }
98 
100  const CAlignMap* m_map = nullptr;
101  double m_factor = 0;
103 };
104 
106 public:
107  void Read(CNcbiIstream& from) {
109  //read index
110  map<string, tuple<size_t, TSignedSeqPos, TSignedSeqPos>> index; // shift in file, number of + elements, number of - elements
111  size_t data_length;
112  if(!from.read(reinterpret_cast<char*>(&data_length), sizeof data_length)) {
113  cerr << "Error in PhyloCSF read\n";
114  exit(1);
115  }
116 
117  from.seekg(data_length, ios_base::cur); // skip to index
118  int slen;
119  while(from.read(reinterpret_cast<char*>(&slen), sizeof slen)) { // acc string size
120  vector<char> buf(slen);
121  from.read(buf.data(), slen);
122  string contig_acc(buf.begin(), buf.end());
123  size_t shift;
124  from.read(reinterpret_cast<char*>(&shift), sizeof(size_t)); // data shift in file
125  TSignedSeqPos plus_len;
126  from.read(reinterpret_cast<char*>(&plus_len), sizeof(TSignedSeqPos)); // + strand number of elements
127  TSignedSeqPos minus_len;
128  from.read(reinterpret_cast<char*>(&minus_len), sizeof(TSignedSeqPos)); // - strand number of elements
129  if(!from) {
130  cerr << "Error in PhyloCSF index read\n";
131  exit(1);
132  }
133  index[contig_acc] = make_tuple(shift, plus_len, minus_len);
134  }
135 
136  //read data
137  from.clear();
138  for(auto& ind : index) {
139  auto& contig_acc = ind.first;
140  auto shift = get<0>(ind.second);
141  auto plus_len = get<1>(ind.second);
142  auto minus_len = get<2>(ind.second);
143  from.seekg(shift,ios_base::beg);
144  m_contig_scores[contig_acc][0].Read(from, plus_len);
145  m_contig_scores[contig_acc][1].Read(from, minus_len);
146  }
147  }
148  void Write(CNcbiOstream& out) const {
149  size_t data_length = 0;
150  for(auto& scr : m_contig_scores)
151  data_length += (scr.second[0].Size()+scr.second[1].Size());
152  data_length *= sizeof(SPhyloCSFCompactScore::TElement);
153  //write data length
154  out.write(reinterpret_cast<const char*>(&data_length), sizeof data_length);
155  //write scores
156  for(auto& scr : m_contig_scores) {
157  for(int strand = 0; strand < 2; ++strand)
158  scr.second[strand].Write(out);
159  }
160  //write index
161  size_t shift = sizeof data_length;
162  for(auto& scr : m_contig_scores) {
163  auto& contig_acc = scr.first;
164  //write contig name as length+string
165  int slen = (int)contig_acc.size();
166  out.write(reinterpret_cast<const char*>(&slen), sizeof slen);
167  out.write(contig_acc.c_str(), slen);
168  //write shift
169  out.write(reinterpret_cast<const char*>(&shift), sizeof shift);
170  for(int strand = 0; strand < 2; ++strand) {
171  TSignedSeqPos len = (TSignedSeqPos)scr.second[strand].Size();
172  out.write(reinterpret_cast<const char*>(&len), sizeof len);
173  shift += len*sizeof(SPhyloCSFCompactScore::TElement);
174  }
175  }
176  if(!out) {
177  cerr << "Error in PhyloCSF write\n";
178  exit(1);
179  }
180  }
181  SPhyloCSFSlice* CreateSliceForContig(const string& contig_acc) const {
182  SPhyloCSFSlice* p = nullptr;
183  auto rslt = m_contig_scores.find(contig_acc);
184  if(rslt != m_contig_scores.end()) {
185  p = new SPhyloCSFSlice;
186  p->m_scoresp = &rslt->second;
187  }
188  return p;
189  }
190  void CompactFullScores(const map<string, array<TFVec, 2>>& scores) {
192  for(auto& cs : scores) {
193  auto& contig_acc = cs.first;
194  for(int strand = 0; strand < 2; ++strand) {
195  for(TSignedSeqPos i = 0; i < (TSignedSeqPos)cs.second[strand].size(); ++i) {
196  auto scr = cs.second[strand][i];
197  if(scr > 0)
198  m_contig_scores[contig_acc][strand].m_scores.emplace_back(i, scr);
199  }
200  }
201  }
202  }
203 
204 private:
206 };
207 
208 END_SCOPE(gnomon)
210 
211 #endif // PCSF__HPP
TSignedSeqRange ShrinkToRealPointsOnEdited(TSignedSeqRange edited_range) const
Definition: gnomon_seq.cpp:698
TSignedSeqRange MapRangeEditedToOrig(TSignedSeqRange edited_range, bool withextras=true) const
Definition: gnomon_seq.cpp:966
TSignedSeqPos MapEditedToOrig(TSignedSeqPos edited_pos) const
Definition: gnomon_seq.cpp:903
void CompactFullScores(const map< string, array< TFVec, 2 >> &scores)
Definition: pcsf.hpp:190
void Read(CNcbiIstream &from)
Definition: pcsf.hpp:107
SPhyloCSFSlice * CreateSliceForContig(const string &contig_acc) const
Definition: pcsf.hpp:181
map< string, array< SPhyloCSFCompactScore, 2 > > m_contig_scores
Definition: pcsf.hpp:205
void Write(CNcbiOstream &out) const
Definition: pcsf.hpp:148
const_iterator end() const
Definition: map.hpp:152
void clear()
Definition: map.hpp:169
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: map.hpp:338
std::ofstream out("events_result.xml")
main entry point for tests
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
static TThisType GetEmpty(void)
Definition: range.hpp:306
bool Empty(void) const
Definition: range.hpp:148
CRange< TSignedSeqPos > TSignedSeqRange
Definition: range.hpp:420
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
exit(2)
char * buf
int i
int len
float TPhyloCSFScore
Definition: pcsf.hpp:40
vector< TPhyloCSFScore > TFVec
Definition: pcsf.hpp:41
size_t UpperBound(TSignedSeqPos p) const
Definition: pcsf.hpp:65
void Write(CNcbiOstream &out) const
Definition: pcsf.hpp:51
void Read(CNcbiIstream &from, size_t len)
Definition: pcsf.hpp:44
pair< TSignedSeqPos, TPhyloCSFScore > TElement
Definition: pcsf.hpp:43
size_t Size() const
Definition: pcsf.hpp:66
size_t LowerBound(TSignedSeqPos p) const
Definition: pcsf.hpp:64
TPhyloCSFScore Score(TSignedSeqPos p) const
Definition: pcsf.hpp:57
vector< TElement > m_scores
Definition: pcsf.hpp:68
TSignedSeqRange CompactRange(int s, TSignedSeqRange edited_range) const
Definition: pcsf.hpp:80
double m_factor
Definition: pcsf.hpp:101
TSignedSeqPos m_shift
Definition: pcsf.hpp:102
const CAlignMap * m_map
Definition: pcsf.hpp:100
TPhyloCSFScore Score(int s, TSignedSeqPos codon_left) const
Definition: pcsf.hpp:72
const array< SPhyloCSFCompactScore, 2 > * m_scoresp
Definition: pcsf.hpp:99
Modified on Wed Sep 04 15:01:23 2024 by modify_doxy.py rev. 669887