NCBI C++ ToolKit
align_compare.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef GPIPE_COMMON___ALIGN_COMPARE__HPP
2 #define GPIPE_COMMON___ALIGN_COMPARE__HPP
3 
4 /* $Id: align_compare.hpp 97941 2022-09-09 16:48:11Z mozese2 $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Authors: Eyal Mozes
30  *
31  * File Description:
32  *
33  */
34 
36 
38 
39 #include <util/range_coll.hpp>
40 
42 BEGIN_objects_SCOPE
43  class CUser_object;
44 END_objects_SCOPE
45 
47 
48 
50 {
51 public:
52  /////////////////////////////////////////////////////////////////////////////
53  //
54  // Retrieve sets of intervals for alignments
55  //
56 
58 
/// Comparison mode. This is the type of the constructor's `mode` argument,
/// which defaults to e_Interval and is stored in m_Mode.
/// NOTE(review): the meaning of each enumerator is not visible in this header;
/// presumably each selects which alignment feature is compared -- confirm
/// against the implementation.
59  enum EMode { e_Interval, e_Exon, e_Span, e_Intron, e_Full };
60 
/// Match classification values.
/// NOTE(review): not referenced by any other declaration visible in this
/// listing; presumably describes how well an alignment matched its
/// counterpart(s) in the other set -- confirm against the implementation.
61  enum EMatchLevel {e_Equiv, e_Overlap, e_OverlapBetter, e_OverlapWorse, e_NoMatch};
62 
/// Type of the constructor's `row` argument, which defaults to e_Both and is
/// stored in m_Row.
/// NOTE(review): presumably e_Query / e_Subject restrict the comparison to a
/// single alignment row -- confirm against the implementation.
63  enum ERowComparison {e_Query, e_Subject, e_Both};
64 
65  /// Optional list of disambiguating scores; alignments can only be compared
66  /// if they are equal in these scores. The scores are divided into two groups:
67  /// - scores in the first list are required to appear in all alignments, and
68  /// input is required to be sorted by these scores.
69  /// - scores in the second set are optional.
70  /// -- If two alignments on both sides both have the score set, they must
71  /// have equal value to be compared.
72  /// -- Alignments without the score, or with a 0 value, may be compared to
73  /// alignments that do have the score set.
/// NOTE(review): presumably .first holds the values of the required scores and
/// .second the values of the optional scores, in the same order as the
/// corresponding score names -- confirm.
74  typedef pair< vector<int>, vector<int> > TDisambiguatingScoreValues;
75 
/// Pair of score-name lists. This is the type of the m_DisambiguitingScores
/// member, which is initialized from the constructor's `scores` argument.
/// NOTE(review): presumably .first = required score names and .second =
/// optional score names, mirroring TDisambiguatingScoreValues -- confirm.
76  typedef pair< vector<string>, vector<string> > TDisambiguatingScoreList;
77 
81 
82  //////////////////////////////////////////////////////////////////////////////
83  //
84  // struct defining all information needed to store a single alignment
85  //
86 
/// Record describing one alignment.
/// NOTE(review): this listing shows only part of the struct; the embedded
/// source line numbers skip (88 -> 103 -> 113 -> 118), so further members are
/// declared on lines that are not visible here.
87  struct SAlignment
88  {
90 
93 
96 
99 
101 
/// Numeric quality-score values for this alignment.
/// NOTE(review): presumably ordered like the score names held in
/// CAlignCompare::m_QualityScores -- confirm.
103  vector<double> quality_scores;
107 
112 
/// Raw const pointers to other SAlignment records.
/// NOTE(review): presumably the alignments this one was matched with, and
/// presumably non-owning -- confirm ownership and lifetime.
113  vector<const SAlignment *> matched_alignments;
115 
117 
/// Construct a record for one alignment.
/// @param s         NOTE(review): presumably identifies the originating set
///                  (1 or 2) -- confirm.
/// @param al        Reference to the Seq-align.
/// @param compare   The CAlignCompare object passed in by reference.
/// @param is_slice  Defaults to false. NOTE(review): presumably true for
///                  records produced by Slice() -- confirm.
118  SAlignment(int s, const CRef<CSeq_align> &al,
119  CAlignCompare &compare, bool is_slice = false);
120 
/// Compare this record's group with that of `o`.
/// NOTE(review): the return convention (presumably a <0 / 0 / >0 ordering) and
/// the effect of `strict_only` are not visible in this header -- confirm.
121  int CompareGroup(const SAlignment &o, bool strict_only) const;
122 
/// NOTE(review): declaration only; presumably records this alignment's
/// boundaries in the owning CAlignCompare's boundaries map -- confirm.
123  void PopulateBoundariesMap() const;
124 
/// @param row  Alignment row to operate on.
/// @return List of newly created SAlignment objects held by AutoPtr.
/// NOTE(review): presumably splits this alignment at the boundaries recorded
/// by PopulateBoundariesMap() -- confirm.
125  list< AutoPtr<SAlignment> > BreakOnBoundaries(int row) const;
126 
/// @param row   Alignment row the range refers to.
/// @param from  Start position of the range.
/// @param to    End position of the range.
/// @return An SAlignment held by AutoPtr.
/// NOTE(review): whether `to` is inclusive is not visible here -- confirm.
127  AutoPtr<SAlignment> Slice(int row, TSeqPos from, TSeqPos to) const;
128  };
129 
131  IAlignSource &set2,
132  EMode mode = e_Interval,
133  bool strict = false,
134  bool ignore_not_present = false,
135  ERowComparison row = e_Both,
137  const vector<string> &quality_scores = vector<string>(),
138  const set<string> &score_set = set<string>(),
139  bool score_set_as_blacklist = false,
140  double real_score_tolerance = 0,
141  const set<string> &ext_set = set<string>(),
142  bool ext_set_as_blacklist = false,
143  const set<string> distributive_scores = set<string>())
144  : m_Set1(set1)
145  , m_Set2(set2)
146  , m_Mode(mode)
147  , m_Strict(strict)
148  , m_IgnoreNotPresent(ignore_not_present)
149  , m_Row(row)
150  , m_DisambiguitingScores(scores)
151  , m_QualityScores(quality_scores)
152  , m_ScoreSet(score_set)
153  , m_ScoreSetAsBlacklist(score_set_as_blacklist)
154  , m_RealScoreTolerance(real_score_tolerance)
155  , m_ExtSet(ext_set)
156  , m_ExtSetAsBlacklist(ext_set_as_blacklist)
157  , m_DistributiveScores(distributive_scores)
158  , m_CountSet1(0)
159  , m_CountSet2(0)
160  , m_CountSplitSet1(0)
161  , m_CountSplitSet2(0)
162  , m_CountEquivSet1(0)
163  , m_CountEquivSet2(0)
164  , m_CountOverlapSet1(0)
165  , m_CountOverlapSet2(0)
166  , m_CountOnlySet1(0)
167  , m_CountOnlySet2(0)
168  , m_CountEquivGroups(0)
169  , m_CountOverlapGroups(0)
170  , m_CountBasesSet1(0)
171  , m_CountBasesSet2(0)
172  , m_CountBasesEquivSet1(0)
173  , m_CountBasesEquivSet2(0)
174  , m_CountBasesOverlapSet1(0)
175  , m_CountBasesOverlapSet2(0)
176  , m_CountBasesOnlySet1(0)
177  , m_CountBasesOnlySet2(0)
178  {
179  }
180 
/// Report whether there is nothing left to process.
/// @return true only when both m_NextSet1Group and m_NextSet2Group are empty
///         and both alignment sources (m_Set1, m_Set2) report EndOfData().
181  bool EndOfData() const
182  {
// The list checks come first; because of short-circuit evaluation the
// sources are queried only when both next-group lists are empty.
183  return m_NextSet1Group.empty() && m_NextSet2Group.empty() &&
184  m_Set1.EndOfData() && m_Set2.EndOfData();
185  }
186 
/// NOTE(review): declaration only; presumably fills the boundaries map
/// (m_BoundariesMap) from the input alignments -- confirm against the
/// implementation.
187  void PopulateBoundariesMap();
188 
/// Produce the next group of alignments.
/// @return Raw const pointers to SAlignment records.
/// NOTE(review): presumably the pointers refer to records owned by this object
/// and callers should check EndOfData() before calling -- confirm lifetime and
/// calling protocol against the implementation.
189  vector<const SAlignment *> NextGroup();
190 
// Read-only accessors for the statistics counters. Each one returns, by value,
// the current content of the like-named m_Count* member; the constructor
// initializes every one of those members to 0.
// NOTE(review): where the counters are incremented is not visible in this
// header; presumably "Set1"/"Set2" refer to the two input alignment sources and
// "Bases" variants count bases rather than alignments -- confirm.
191  size_t CountSet1() const { return m_CountSet1; }
192  size_t CountSet2() const { return m_CountSet2; }
193  size_t CountSplitSet1() const { return m_CountSplitSet1; }
194  size_t CountSplitSet2() const { return m_CountSplitSet2; }
195  size_t CountEquivSet1() const { return m_CountEquivSet1; }
196  size_t CountEquivSet2() const { return m_CountEquivSet2; }
197  size_t CountOverlapSet1() const { return m_CountOverlapSet1; }
198  size_t CountOverlapSet2() const { return m_CountOverlapSet2; }
199  size_t CountOnlySet1() const { return m_CountOnlySet1; }
200  size_t CountOnlySet2() const { return m_CountOnlySet2; }
201  size_t CountEquivGroups() const { return m_CountEquivGroups; }
202  size_t CountOverlapGroups() const { return m_CountOverlapGroups; }
203  size_t CountBasesSet1() const { return m_CountBasesSet1; }
204  size_t CountBasesSet2() const { return m_CountBasesSet2; }
205  size_t CountBasesEquivSet1() const { return m_CountBasesEquivSet1; }
206  size_t CountBasesEquivSet2() const { return m_CountBasesEquivSet2; }
207  size_t CountBasesOverlapSet1() const { return m_CountBasesOverlapSet1; }
208  size_t CountBasesOverlapSet2() const { return m_CountBasesOverlapSet2; }
209  size_t CountBasesOnlySet1() const { return m_CountBasesOnlySet1; }
210  size_t CountBasesOnlySet2() const { return m_CountBasesOnlySet2; }
211 
212 private:
213  friend struct SAlignment;
214 
/// Copied from the constructor's `strict` argument (default false).
218  bool m_Strict;
/// Copied from the constructor's `quality_scores` argument (default empty).
222  vector<string> m_QualityScores;
229 
/// Counters initialized to 0 by the constructor and exposed read-only through
/// CountSet1() and CountSet2() respectively.
230  size_t m_CountSet1;
231  size_t m_CountSet2;
250 
/// Per-set lists of SAlignment records held through AutoPtr. EndOfData()
/// requires both m_Next* lists to be empty before it reports true.
/// NOTE(review): presumably m_Current* hold the group being processed and
/// m_Next* hold alignments already read for the following group -- confirm.
251  list< AutoPtr<SAlignment> > m_CurrentSet1Group;
252  list< AutoPtr<SAlignment> > m_CurrentSet2Group;
253  list< AutoPtr<SAlignment> > m_NextSet1Group;
254  list< AutoPtr<SAlignment> > m_NextSet2Group;
255 
257 
258  /// Determine whether the next group of alignments should be taken from set 1 or 2.
259  /// If the next group from both sets are on the same query and subject, return 3;
260  /// otherwise return 1 or 2.
/// @return 1 or 2 to indicate the set to take the next group from, or 3 when
///         the next groups of both sets share the same query and subject.
261  int x_DetermineNextGroupSet();
262 
263  /// Get next alignment from the correct set
/// @param set            Which set to read from. NOTE(review): presumably 1 or
///                       2, matching x_DetermineNextGroupSet() -- confirm.
/// @param update_counts  Defaults to true. NOTE(review): presumably controls
///                       whether the statistics counters are updated -- confirm.
/// @return The alignment record, held by AutoPtr.
264  AutoPtr<SAlignment> x_NextAlignment(int set, bool update_counts = true);
265 
/// NOTE(review): declaration only; presumably fills the current group list of
/// the given set (1 or 2) -- confirm against the implementation.
266  void x_GetCurrentGroup(int set);
267 
/// NOTE(review): declaration only; presumably splits the alignments of the
/// given group where they overlap on the given row -- confirm the meaning of
/// `group` and `row` against the implementation.
268  void x_SplitOnOverlaps(int group, int row);
269 };
270 
272 
273 
274 #endif // GPIPE_COMMON___ALIGN_COMPARE__HPP
USING_SCOPE(objects)
AutoPtr –.
Definition: ncbimisc.hpp:401
size_t CountSet2() const
size_t m_CountEquivSet2
size_t CountBasesEquivSet2() const
size_t CountEquivSet1() const
ERowComparison m_Row
size_t CountOverlapSet1() const
size_t m_CountOverlapGroups
size_t CountOverlapSet2() const
size_t CountBasesOverlapSet2() const
map< string, double > TRealScoreSet
map< TSeqRange, TSeqRange > TAlignmentSpans
list< AutoPtr< SAlignment > > m_NextSet2Group
list< AutoPtr< SAlignment > > m_NextSet1Group
size_t CountSplitSet1() const
size_t m_CountBasesOverlapSet2
bool EndOfData() const
size_t CountOnlySet2() const
list< AutoPtr< SAlignment > > m_CurrentSet2Group
size_t CountSet1() const
IAlignSource & m_Set1
size_t m_CountBasesOnlySet2
pair< vector< string >, vector< string > > TDisambiguatingScoreList
size_t m_CountBasesEquivSet1
size_t CountBasesOverlapSet1() const
map< CSeq_id_Handle, set< TSeqPos > > m_BoundariesMap
size_t CountOnlySet1() const
size_t CountEquivSet2() const
size_t m_CountBasesEquivSet2
size_t m_CountOverlapSet1
size_t CountEquivGroups() const
double m_RealScoreTolerance
size_t CountBasesOnlySet2() const
set< string > m_ExtSet
size_t CountBasesOnlySet1() const
IAlignSource & m_Set2
bool m_ScoreSetAsBlacklist
size_t m_CountBasesSet1
size_t CountBasesSet1() const
TDisambiguatingScoreList m_DisambiguitingScores
CAlignCompare(IAlignSource &set1, IAlignSource &set2, EMode mode=e_Interval, bool strict=false, bool ignore_not_present=false, ERowComparison row=e_Both, const TDisambiguatingScoreList &scores=TDisambiguatingScoreList(), const vector< string > &quality_scores=vector< string >(), const set< string > &score_set=set< string >(), bool score_set_as_blacklist=false, double real_score_tolerance=0, const set< string > &ext_set=set< string >(), bool ext_set_as_blacklist=false, const set< string > distributive_scores=set< string >())
size_t CountBasesEquivSet1() const
size_t m_CountBasesOverlapSet1
size_t CountOverlapGroups() const
size_t m_CountEquivSet1
size_t m_CountBasesSet2
list< AutoPtr< SAlignment > > m_CurrentSet1Group
pair< vector< int >, vector< int > > TDisambiguatingScoreValues
Optional list of disambiguating scores; alignments can only be compared if they are equal in these sc...
set< string > m_ScoreSet
size_t m_CountBasesOnlySet1
size_t m_CountSplitSet2
vector< string > m_QualityScores
size_t m_CountOnlySet2
set< string > m_DistributiveScores
size_t m_CountEquivGroups
map< string, CRef< CUser_object > > TExtSet
size_t CountSplitSet2() const
size_t m_CountOverlapSet2
size_t m_CountSplitSet1
size_t m_CountOnlySet1
size_t CountBasesSet2() const
map< string, int > TIntegerScoreSet
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define NCBI_XALGOALIGN_EXPORT
Definition: ncbi_export.h:985
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
mdb_mode_t mode
Definition: lmdb++.h:38
#define row(bind, expected)
Definition: string_bind.c:73
CRef< CSeq_align > align
TDisambiguatingScoreValues scores
vector< const SAlignment * > matched_alignments
vector< double > quality_scores
CAlignCompare & compare_object
TIntegerScoreSet integer_scores
CRangeCollection< TSeqPos > query_mismatches
CRangeCollection< TSeqPos > subject_mismatches
Modified on Fri Sep 20 14:58:13 2024 by modify_doxy.py rev. 669887