NCBI C++ ToolKit
aln_stats.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef OBJTOOLS_ALNMGR___ALN_STATS__HPP
2 #define OBJTOOLS_ALNMGR___ALN_STATS__HPP
3 /* $Id: aln_stats.hpp 78930 2017-07-31 13:06:45Z dicuccio $
4 * ===========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 *
28 * Authors: Kamen Todorov, NCBI
29 *
30 * File Description:
31 * Seq-align statistics
32 *
33 * ===========================================================================
34 */
35 
36 
37 #include <corelib/ncbistd.hpp>
38 #include <corelib/ncbiobj.hpp>
39 
43 
44 
46 
47 
48 /// Helper class which collects seq-align statistics: seq-ids participating in
49 /// alignments and rows, potential anchors etc. The class is used to create
50 /// anchored alignments.
51 /// @sa TAlnStats
52 /// @sa TScopeAlnStats
53 /// @sa CreateAnchoredAlnFromAln
54 /// @sa CreateAnchoredAlnVec
55 template <class _TAlnIdVec>
56 class CAlnStats : public CObject
57 {
58 public:
59  /// Container with one entry per seq-align using the same indexing as
60  /// m_AlnVec. Each element is a vector of ids referenced by the seq-align.
61  /// See CAlnIdMap for an example of the container implementation.
62  /// @sa CAlnIdMap
63  typedef _TAlnIdVec TAlnIdVec;
64 
65  /// Vector of original seq-aligns.
66  typedef typename _TAlnIdVec::TAlnVec TAlnVec;
67 
68  /// Vector of ids used in all alignments. Some ids may be included several
69  /// times in case of self-aligned sequences.
70  typedef typename _TAlnIdVec::TIdVec TIdVec;
71 
72  /// Vector of indexes in TIdVec.
73  typedef vector<size_t> TIdxVec;
74 
75  /// Each id mapped to a vector of indexes in TIdVec.
77 
78  typedef int TDim;
79 
80  /// Vector, describing how a single id participates in all alignments from
81  /// TAlnVec. Each element contains row index for the id or -1 if the id is
82  /// not present in the alignment.
83  typedef vector<TDim> TRowVec;
84 
85  /// One entry per id (in sync with TIdVec). Each entry describes how a
86  /// single id participates in all alignments (see TRowVec).
87  typedef vector<TRowVec> TRowVecVec;
88 
89  /// Bitmap where each bit indicates an alignment from TAlnVec.
91 
92  /// One entry per id (in sync with TIdVec). Each entry is a bitmap - one
93  /// bit per alignment, indicating if the id is participating in this
94  /// alignment.
95  typedef vector<TBitVec> TBitVecVec;
96 
97  /// Constructor
98  /// @param aln_id_vec
99  /// An instance of CAlnIdMap<> containing the alignments to be indexed.
100  /// @sa CAlnIdMap
101  CAlnStats(const TAlnIdVec& aln_id_vec) :
102  m_AlnIdVec(aln_id_vec),
103  m_AlnVec(aln_id_vec.GetAlnVec()),
105  m_CanBeAnchored(-1)
106  {
107  _ASSERT(m_AlnVec.size() == m_AlnIdVec.size());
108 
109  for (size_t aln_i = 0; aln_i < m_AlnCount; ++aln_i) {
110  for (size_t row_i = 0; row_i < m_AlnIdVec[aln_i].size(); ++row_i) {
111 
112  const TAlnSeqIdIRef& id = m_AlnIdVec[aln_i][row_i];
113  _ASSERT( !id.Empty() );
115  if (it == m_IdMap.end() || *id < *it->first) { // id encountered for a first time, insert it
116  it = m_IdMap.insert(it,
117  TIdMap::value_type(id, TIdxVec()));
118  it->second.push_back(x_AddId(id, aln_i, row_i));
119  }
120  else { // id exists already
121  TIdxVec& idx_vec = it->second;
122  TIdxVec::iterator idx_it = idx_vec.begin();
123  while (idx_it != idx_vec.end()) {
124  if ( !m_BitVecVec[*idx_it][bm::id_t(aln_i)] ) {
125  // create a mapping b/n the id and the alignment
126  m_BitVecVec[*idx_it][bm::id_t(aln_i)] = true;
127  _ASSERT(m_RowVecVec[*idx_it][aln_i] == -1);
128  m_RowVecVec[*idx_it][aln_i] = int(row_i);
129  break;
130  }
131  ++idx_it;
132  }
133  if (idx_it == idx_vec.end()) {
134  // create an extra identical id for this
135  // alignment. (the sequence is aligned to
136  // itself)
137  idx_vec.push_back(x_AddId(id, aln_i, row_i));
138  }
139  }
140  }
141  }
143  }
144 
145  /// How many alignments do we have?
146  size_t GetAlnCount(void) const
147  {
148  return m_AlnCount;
149  }
150 
151  /// Access the underlying vector of alignments
152  const TAlnVec& GetAlnVec(void) const
153  {
154  return m_AlnVec;
155  }
156 
157  /// What is the dimension of an alignment?
158  TDim GetDimForAln(size_t aln_idx) const
159  {
160  _ASSERT(aln_idx < GetAlnCount());
161  return TDim(m_AlnIdVec[aln_idx].size());
162  }
163 
164  /// Access the vector of seq-ids of a particular alignment
165  const TIdVec& GetSeqIdsForAln(size_t aln_idx) const
166  {
167  _ASSERT(aln_idx < GetAlnCount());
168  return m_AlnIdVec[aln_idx];
169  }
170 
171  /// Access the vector of seq-ids of a particular alignment
172  const TIdVec& GetSeqIdsForAln(const CSeq_align& aln) const
173  {
174  return m_AlnIdVec[aln];
175  }
176 
177  /// Get a set of ids that are aligned to a particular id
178  const TIdVec& GetAlignedIds(const TAlnSeqIdIRef& id) const
179  {
181  if (aln_it != m_AlignedIdsMap.end()) {
182  // get from cache
183  return aln_it->second;
184  }
185  else {
187  if (it == m_IdMap.end()) {
188  NCBI_THROW(CAlnException, eInvalidRequest,
189  "Seq-id not present in map");
190  }
191  else {
192  // create in cache
193  TIdVec& aligned_ids_vec = m_AlignedIdsMap[id];
194 
195  // temp, to keep track of already found aligned ids
196  TBitVec id_bit_vec;
197  id_bit_vec.resize(bm::bvector<>::size_type(m_IdVec.size()));
198 
199  const size_t& id_idx = it->second[0];
200  for (size_t aln_i = 0; aln_i < m_AlnCount; ++aln_i) {
201  // if query paricipates in alignment
202  if (m_BitVecVec[id_idx][bm::id_t(aln_i)]) {
203  // find all participating subjects for this alignment
204  for (size_t aligned_id_idx = 0;
205  aligned_id_idx < m_BitVecVec.size();
206  ++aligned_id_idx) {
207  // if an aligned subject
208  if (aligned_id_idx != id_idx &&
209  m_BitVecVec[aligned_id_idx][bm::id_t(aln_i)]) {
210  if ( !id_bit_vec[bm::id_t(aligned_id_idx)] ) {
211  // add only if not already added
212  id_bit_vec[bm::id_t(aligned_id_idx)] = true;
213  aligned_ids_vec.push_back
214  (m_IdVec[aligned_id_idx]);
215  }
216  }
217  }
218  }
219  }
220  return aligned_ids_vec;
221  }
222  }
223  }
224 
225  /// Get vector describing ids usage in each alignment.
226  /// @sa TRowVecVec
227  const TRowVecVec& GetRowVecVec(void) const
228  {
229  return m_RowVecVec;
230  }
231 
232  /// Get map of ids to there indexes in TIdVec.
233  /// @sa TIdMap
234  const TIdMap& GetIdMap(void) const
235  {
236  return m_IdMap;
237  }
238 
239  /// Get vector of all ids from all alignments.
240  /// @sa TIdVec
241  const TIdVec& GetIdVec(void) const
242  {
243  return m_IdVec;
244  }
245 
246  /// Canonical Query-Anchored: all alignments have 2 or 3 rows and
247  /// exactly 2 sequences (A and B), A is present on all alignments
248  /// on row 1, B on rows 2 (and possibly 3). B can be present on 2
249  /// rows only if they represent different strands.
250  bool IsCanonicalQueryAnchored(void) const
251  {
252  // Is the first sequence present in all aligns?
253  if (m_BitVecVec[0].count() != m_AlnCount) {
254  return false;
255  }
256  switch (m_IdVec.size()) {
257  case 2:
258  // two rows: canonical
259  return true;
260  break;
261  case 3:
262  // three rows: A, B and B?
263  if (*m_IdVec[1] == *m_IdVec[2]) {
264  return true;
265  }
266  else {
267  return false;
268  }
269  break;
270  default:
271  break;
272  }
273  return false;
274  }
275 
276  /// Canonical Multiple: Single alignment with multiple sequences.
277  bool IsCanonicalMultiple(void) const
278  {
279  return GetAlnCount() == 1 && ! IsCanonicalQueryAnchored();
280  }
281 
282  /// Check if there are any ids which can be used as anchors for the
283  /// whole set of alignments.
284  bool CanBeAnchored(void) const
285  {
286  if (m_CanBeAnchored < 0) {
288  }
289  return m_CanBeAnchored;
290  }
291 
292  /// Get map of potential anchor ids.
293  /// NOTE: each is is mapped to vector of indexes in IdVec, not AnchorIdVec.
294  const TIdMap& GetAnchorIdMap(void) const
295  {
296  if (m_CanBeAnchored < 0) {
298  }
299  return m_AnchorIdMap;
300  }
301 
302  /// Get vector of potential anchor ids.
303  const TIdVec& GetAnchorIdVec(void) const
304  {
305  if (m_CanBeAnchored < 0) {
307  }
308  return m_AnchorIdVec;
309  }
310 
311  /// Get vector of id indexes (from IdVec) for potential anchors.
312  const TIdxVec& GetAnchorIdxVec(void) const
313  {
314  if (m_CanBeAnchored < 0) {
316  }
317  return m_AnchorIdxVec;
318  }
319 
320 private:
321  size_t x_AddId(const TAlnSeqIdIRef& id, size_t aln_i, size_t row_i)
322  {
323  m_IdVec.push_back(id);
324  {
325  m_BitVecVec.push_back(TBitVec());
326  TBitVec& bit_vec = m_BitVecVec.back();
328  bit_vec[bm::id_t(aln_i)] = true;
329  _ASSERT(m_IdVec.size() == m_BitVecVec.size());
330  }
331  {
332  m_RowVecVec.push_back(TRowVec());
333  TRowVec& rows = m_RowVecVec.back();
334  rows.resize(m_AlnCount, -1);
335  rows[aln_i] = int(row_i);
336  _ASSERT(m_IdVec.size() == m_RowVecVec.size());
337  }
338  return m_IdVec.size() - 1;
339  }
340 
341  void x_IdentifyPotentialAnchors(void) const
342  {
343  _ASSERT(m_IdVec.size() == m_BitVecVec.size());
345  _ASSERT(m_AnchorIdxVec.empty());
346  _ASSERT(m_AnchorIdVec.empty());
348  for (size_t id_idx = 0; id_idx < m_BitVecVec.size(); ++id_idx) {
349  if (m_BitVecVec[id_idx].count() == m_AlnCount) {
350  // insert into the anchor idx vec:
351  m_AnchorIdxVec.push_back(id_idx);
352 
353  const TAlnSeqIdIRef& id = m_IdVec[id_idx];
354 
355  // insert in the anchor vec
356  m_AnchorIdVec.push_back(id);
357 
358  // insert in the anchor map
360  if (it == m_AnchorIdMap.end() || *id < *it->first) { // id encountered for a first time, insert it
361  it = m_AnchorIdMap.insert(it,
362  TIdMap::value_type(id, TIdxVec()));
363  }
364  it->second.push_back(id_idx);
365  }
366  }
367  m_CanBeAnchored = (m_AnchorIdxVec.empty() ? 0 : 1);
368  }
369 
370  const TAlnIdVec& m_AlnIdVec; // Vectors of ids for each alignment
371  const TAlnVec& m_AlnVec; // Vector of alignments
372  size_t m_AlnCount; // Number of alignments
373  TIdVec m_IdVec; // Vector of ids
374  TIdMap m_IdMap; // List of m_IdVec indexes for each id
375  TBitVecVec m_BitVecVec; // For each id list alignments which
376  // contain this id
377  TRowVecVec m_RowVecVec; // For each id list row index in each
378  // alignment (-1 if not present).
379 
381  mutable TAlignedIdsMap m_AlignedIdsMap; // cache for GetAlignedIds
382  mutable TIdxVec m_AnchorIdxVec; // vector of indexes (as
383  // represented in m_RowVecVec
384  // and m_BitVecVec) of the ids
385  // of potential anchors
386  mutable TIdMap m_AnchorIdMap; // Maps a the id of each
387  // potential anchor to its
388  // index(es) in m_RowVecVec
389  mutable TIdVec m_AnchorIdVec; // A vector of potential anchors
390  mutable int m_CanBeAnchored; // If there's at least one
391  // potential anchor
392 };
393 
394 
395 /// Default implementations for alignment stats.
396 /// @sa TAlnIdMap
397 /// @sa TScopeAlnIdMap
400 
401 
403 
404 #endif // OBJTOOLS_ALNMGR___ALN_STATS__HPP
CAlnStats< TAlnIdMap > TAlnStats
Default implementations for alignment stats.
Definition: aln_stats.hpp:398
CAlnStats< TScopeAlnIdMap > TScopeAlnStats
Definition: aln_stats.hpp:399
Helper class which collects seq-align statistics: seq-ids participating in alignments and rows,...
Definition: aln_stats.hpp:57
TIdVec m_AnchorIdVec
Definition: aln_stats.hpp:389
TDim GetDimForAln(size_t aln_idx) const
What is the dimension of an alignment?
Definition: aln_stats.hpp:158
_TAlnIdVec::TAlnVec TAlnVec
Vector of original seq-aligns.
Definition: aln_stats.hpp:66
_TAlnIdVec::TIdVec TIdVec
Vector of ids used in all alignments.
Definition: aln_stats.hpp:70
const TIdxVec & GetAnchorIdxVec(void) const
Get vector of id indexes (from IdVec) for potential anchors.
Definition: aln_stats.hpp:312
void x_IdentifyPotentialAnchors(void) const
Definition: aln_stats.hpp:341
TRowVecVec m_RowVecVec
Definition: aln_stats.hpp:377
const TIdVec & GetIdVec(void) const
Get vector of all ids from all alignments.
Definition: aln_stats.hpp:241
const TIdVec & GetSeqIdsForAln(const CSeq_align &aln) const
Access the vector of seq-ids of a particular alignment.
Definition: aln_stats.hpp:172
vector< TRowVec > TRowVecVec
One entry per id (in sync with TIdVec).
Definition: aln_stats.hpp:87
const TIdMap & GetAnchorIdMap(void) const
Get map of potential anchor ids.
Definition: aln_stats.hpp:294
TBitVecVec m_BitVecVec
Definition: aln_stats.hpp:375
int m_CanBeAnchored
Definition: aln_stats.hpp:390
const TAlnIdVec & m_AlnIdVec
Definition: aln_stats.hpp:370
bool IsCanonicalQueryAnchored(void) const
Canonical Query-Anchored: all alignments have 2 or 3 rows and exactly 2 sequences (A and B),...
Definition: aln_stats.hpp:250
vector< TDim > TRowVec
Vector, describing how a single id participates in all alignments from TAlnVec.
Definition: aln_stats.hpp:83
bool IsCanonicalMultiple(void) const
Canonical Multiple: Single alignment with multiple sequences.
Definition: aln_stats.hpp:277
const TIdVec & GetAnchorIdVec(void) const
Get vector of potential anchor ids.
Definition: aln_stats.hpp:303
size_t x_AddId(const TAlnSeqIdIRef &id, size_t aln_i, size_t row_i)
Definition: aln_stats.hpp:321
vector< TBitVec > TBitVecVec
One entry per id (in sync with TIdVec).
Definition: aln_stats.hpp:95
CAlnStats(const TAlnIdVec &aln_id_vec)
Constructor.
Definition: aln_stats.hpp:101
size_t GetAlnCount(void) const
How many alignments do we have?
Definition: aln_stats.hpp:146
TIdxVec m_AnchorIdxVec
Definition: aln_stats.hpp:382
TIdVec m_IdVec
Definition: aln_stats.hpp:373
TAlignedIdsMap m_AlignedIdsMap
Definition: aln_stats.hpp:381
bool CanBeAnchored(void) const
Check if there are any ids which can be used as anchors for the whole set of alignments.
Definition: aln_stats.hpp:284
TIdMap m_IdMap
Definition: aln_stats.hpp:374
size_t m_AlnCount
Definition: aln_stats.hpp:372
TIdMap m_AnchorIdMap
Definition: aln_stats.hpp:386
vector< size_t > TIdxVec
Vector of indexes in TIdVec.
Definition: aln_stats.hpp:73
const TRowVecVec & GetRowVecVec(void) const
Get vector describing ids usage in each alignment.
Definition: aln_stats.hpp:227
const TIdMap & GetIdMap(void) const
Get map of ids to there indexes in TIdVec.
Definition: aln_stats.hpp:234
const TAlnVec & m_AlnVec
Definition: aln_stats.hpp:371
map< TAlnSeqIdIRef, TIdVec > TAlignedIdsMap
Definition: aln_stats.hpp:380
bm::bvector TBitVec
Bitmap where each bit indicates an alignment from TAlnVec.
Definition: aln_stats.hpp:90
const TIdVec & GetSeqIdsForAln(size_t aln_idx) const
Access the vector of seq-ids of a particular alignment.
Definition: aln_stats.hpp:165
_TAlnIdVec TAlnIdVec
Container with one entry per seq-align using the same indexing as m_AlnVec.
Definition: aln_stats.hpp:63
const TIdVec & GetAlignedIds(const TAlnSeqIdIRef &id) const
Get a set of ids that are aligned to a particular id.
Definition: aln_stats.hpp:178
map< TAlnSeqIdIRef, TIdxVec, SAlnSeqIdIRefComp > TIdMap
Each id mapped to a vector of indexes in TIdVec.
Definition: aln_stats.hpp:76
const TAlnVec & GetAlnVec(void) const
Access the underlying vector of alignments.
Definition: aln_stats.hpp:152
CObject –.
Definition: ncbiobj.hpp:180
void resize(size_type new_size)
Change size of the bvector.
Definition: bm.h:2463
bvector_size_type size_type
Definition: bm.h:121
const_iterator end() const
Definition: map.hpp:152
const_iterator lower_bound(const key_type &key) const
Definition: map.hpp:154
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
bool empty() const
Definition: map.hpp:149
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Include a standard set of the NCBI C++ Toolkit most basic headers.
bool Empty(const CNcbiOstrstream &src)
Definition: fileutil.cpp:523
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
unsigned int id_t
Definition: bmconst.h:38
const struct ncbi::grid::netcache::search::fields::SIZE size
Compressed bitset (entry point to bm.h)
Portable reference counted smart and weak pointers using CWeakRef, CRef, CObject and CObjectEx.
#define count
#define _ASSERT
Modified on Fri Sep 20 14:57:33 2024 by modify_doxy.py rev. 669887