NCBI C++ ToolKit
hit.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef ALGO_COBALT___HIT__HPP
2 #define ALGO_COBALT___HIT__HPP
3 
4 /* $Id: hit.hpp 33815 2007-05-04 17:18:18Z kazimird $
5 * ===========================================================================
6 *
7 * PUBLIC DOMAIN NOTICE
8 * National Center for Biotechnology Information
9 *
10 * This software/database is a "United States Government Work" under the
11 * terms of the United States Copyright Act. It was written as part of
12 * the author's official duties as a United States Government employee and
13 * thus cannot be copyrighted. This software/database is freely available
14 * to the public for use. The National Library of Medicine and the U.S.
15 * Government have not placed any restriction on its use or reproduction.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================*/
28 
29 /*****************************************************************************
30 
31 File name: hit.hpp
32 
33 Author: Jason Papadopoulos
34 
35 Contents: Interface for CHit class
36 
37 ******************************************************************************/
38 
39 /// @file hit.hpp
40 /// Interface for CHit class, used to encapsulate
41 /// operations involving pairwise alignments.
42 /// <pre>
43 /// Given an alignment with its traceback, this class can
44 /// compute subsets of the alignment from subsets of the traceback.
45 /// This can sometimes be tricky; for example, given an alignment
46 /// described by
47 ///
48 ///                       11111
49 /// traceback   012345678901234
50 ///
51 /// query   10  AAAAA---AAAAAAA  21
52 /// subject 50  AAAAAAAAA--AAAA  62
53 ///
54 /// the traceback is in the range 0 to 14, the query range is 10 to 21,
55 /// and the subject range is 50 to 62. For a subject range of [50,54] the
56 /// query range is [10,14]. However, a subject range of [50,56] still has
57 /// a query range of [10,14]. Some other examples:
58 ///
59 ///   start with       of         compute          of
60 ///   -------------    -------    -------------    -----
61 ///   query range      [12,15]    subject range    [52,58]
62 ///   query range      [15,16]    subject range    [58,59]
63 ///   subject range    [55,56]    query range      [15,14]  <- inverted!
64 ///
65 /// In general, query ranges and subject ranges are assumed to exclude
66 /// gaps at either end of the range. For scoring purposes, the traceback
67 /// range is what's specified, as this is unambiguous.
68 /// </pre>
69 
70 
76 
77 #include <algo/cobalt/base.hpp>
78 #include <algo/cobalt/seq.hpp>
80 
82 BEGIN_SCOPE(cobalt)
83 
84 /// A generalized representation of a pairwise alignment
86 {
87 public:
88  /// Not always used, but useful to avoid
89  /// extremely small hits
90  static const int kMinHitSize = 2;
91 
92  /// Hits can be grouped hierarchically
93  typedef vector<CHit *> TSubHit;
94 
95  /// Numerical identifier for first sequence
96  /// in alignment
98 
99  /// Numerical identifier for second sequence
100  /// in alignment
102 
103  /// Score of alignment
104  int m_Score;
105 
106  /// The range of offsets on the first sequence
108 
109  /// The range of offsets on the second sequence
111 
112  /// Create an empty alignment
113  /// @param seq1_index Numerical identifier for first sequence [in]
114  /// @param seq2_index Numerical identifier for second sequence [in]
115  ///
116  CHit(int seq1_index, int seq2_index)
117  : m_SeqIndex1(seq1_index), m_SeqIndex2(seq2_index),
118  m_Score(0), m_SeqRange1(0,0), m_SeqRange2(0,0) {}
119 
120  /// Create an alignment from a BLAST hit
121  /// @param seq1_index Numerical identifier for first sequence [in]
122  /// @param seq2_index Numerical identifier for second sequence [in]
123  /// @param hsp A single pairwise alignment from a blast hit [in]
124  ///
125  CHit(int seq1_index, int seq2_index, BlastHSP *hsp)
126  : m_SeqIndex1(seq1_index), m_SeqIndex2(seq2_index),
127  m_Score(hsp->score),
128  m_SeqRange1(hsp->query.offset, hsp->query.end - 1),
129  m_SeqRange2(hsp->subject.offset, hsp->subject.end - 1),
130  m_EditScript(hsp->gap_info) { VerifyHit(); }
131 
132  /// Create an alignment from a Dense_seg
133  /// @param seq1_index Numerical identifier for first sequence [in]
134  /// @param seq2_index Numerical identifier for second sequence [in]
135  /// @param score The score of the pairwise alignment [in]
136  /// @param denseg Dense_seg representing a single pairwise alignment
137  /// from a blast hit [in]
138  ///
139  CHit(int seq1_index, int seq2_index, int score,
140  const objects::CDense_seg& denseg);
141 
142  /// Create an alignment from a Dense_diag
143  /// @param seq1_index Numerical identifier for first sequence [in]
144  /// @param seq2_index Numerical identifier for second sequence [in]
145  /// @param score The score of the pairwise alignment [in]
146  /// @param dendiag Dense_diag representing a single ungapped
147  /// pairwise alignment from a blast hit [in]
148  ///
149  CHit(int seq1_index, int seq2_index, int score,
150  const objects::CDense_diag& dendiag);
151 
152  /// Create an alignment with all specified parameters
153  /// @param seq1_index Numerical identifier for first sequence [in]
154  /// @param seq2_index Numerical identifier for second sequence [in]
155  /// @param seq_range1 Offsets on the first sequence [in]
156  /// @param seq_range2 Offsets on the second sequence [in]
157  /// @param score The score of the alignment [in]
158  /// @param edit_script Traceback for the alignment (may be empty) [in]
159  ///
160  CHit(int seq1_index, int seq2_index,
161  TRange seq_range1, TRange seq_range2,
162  int score, CEditScript edit_script)
163  : m_SeqIndex1(seq1_index), m_SeqIndex2(seq2_index),
164  m_Score(score),
165  m_SeqRange1(seq_range1), m_SeqRange2(seq_range2),
166  m_EditScript(edit_script) { VerifyHit(); }
167 
168  /// Destructor
169  ///
170  ~CHit()
171  {
172  // delete sub-hits
173  for (int i = 0; i < (int)m_SubHit.size(); i++)
174  delete m_SubHit[i];
175  }
176 
177  /// Add a hit to a CHit's list of subhits
178  /// @param hit The hit to add [in]
179  ///
180  void InsertSubHit(CHit *hit) { m_SubHit.push_back(hit); }
181 
182  /// Retrieve a list of subhits
183  /// @return The list of subhits
184  ///
185  TSubHit& GetSubHit() { return m_SubHit; }
186 
187  /// Retrieve the traceback associated with a CHit
188  /// @return The traceback
189  ///
190  CEditScript& GetEditScript() { return m_EditScript; }
191 
192  /// Query if a CHit has a hierarchy of subhits available
193  /// @return true if subhits are available
194  ///
195  bool HasSubHits() { return !(m_SubHit.empty()); }
196 
197  /// Sum the score of all subhits, and make the sequence ranges
198  /// the union of the ranges of all subhits. Traceback is ignored
199  ///
200  void AddUpSubHits();
201 
202  /// Produce an independent copy of a CHit
203  /// @return Pointer to the copy
204  ///
205  CHit * Clone();
206 
207  /// Retrieve the seq1 range corresponding to a
208  /// specified seq2 range. Assumes traceback is valid
209  /// @param seq_range2 The target range on the
210  /// second sequence [in]
211  /// @param seq_range1 The corresponding range
212  /// on the first sequence [out]
213  /// @param new_seq_range2 If seq_range2 starts or ends in a
214  /// gap on seq_range1, the range is shortened
215  /// to exclude the gap and seq_range2 is
216  /// cropped to compensate [out]
217  /// @param traceback_range The range of traceback operations
218  /// corresponding to seq_range1 and
219  /// new_seq_range2 [out]
220  ///
221  void GetRangeFromSeq2(TRange seq_range2,
222  TRange& seq_range1,
223  TRange& new_seq_range2,
224  TRange& traceback_range);
225 
226  /// Retrieve the seq2 range corresponding to a
227  /// specified seq1 range. Assumes traceback is valid
228  /// @param seq_range1 The target range on the
229  /// first sequence [in]
230  /// @param new_seq_range1 If seq_range1 starts or ends in a
231  /// gap on seq_range2, the range is shortened
232  /// to exclude the gap and seq_range1 is
233  /// cropped to compensate [out]
234  /// @param seq_range2 The corresponding range
235  /// on the second sequence [out]
236  /// @param traceback_range The range of traceback operations
237  /// corresponding to new_seq_range1 and
238  /// seq_range2 [out]
239  ///
240  void GetRangeFromSeq1(TRange seq_range1,
241  TRange& new_seq_range1,
242  TRange& seq_range2,
243  TRange& traceback_range);
244 
245  /// Perform basic integrity checks on a CHit
246  ///
247  void VerifyHit();
248 
249  /// If pairs of subhits have overlapping ranges, either delete
250  /// one or change one so that the overlap is avoided. Only the
251  /// sequence 1 range is checked for overlap; in practice, the hits
252  /// refer to block alignments derived from RPS blast results,
253  /// and sequence 2 is an RPS database sequence. It is sequence 1
254  /// that matters for later processing
255  /// @param seq1 The sequence data corresponding to the
256  /// first sequence [in]
257  /// @param seq2_pssm The PSSM for the second sequence [in]
258  /// @param gap_open Penalty for opening a gap [in]
259  /// @param gap_extend Penalty for extending a gap [in]
260  ///
261  void ResolveSubHitConflicts(CSequence& seq1,
262  int **seq2_pssm,
263  CNWAligner::TScore gap_open,
264  CNWAligner::TScore gap_extend);
265 
266 private:
267  CEditScript m_EditScript; ///< Traceback for this alignment
268  vector<CHit *> m_SubHit; ///< Subhits for this alignment
269 };
270 
271 
272 END_SCOPE(cobalt)
274 
275 #endif // ALGO_COBALT___HIT__HPP
User-defined methods of the data storage class.
Definitions used by all COBALT aligner components.
Structures and API used for saving BLAST hits.
Interface for the traceback from blast hits.
Definition: traceback.hpp:55
A generalized representation of a pairwise alignment.
Definition: hit.hpp:86
TSubHit & GetSubHit()
Retrieve a list of subhits.
Definition: hit.hpp:185
CHit(int seq1_index, int seq2_index, int score, const objects::CDense_seg &denseg)
Create an alignment from a Dense_seg.
vector< CHit * > m_SubHit
Subhits for this alignment.
Definition: hit.hpp:268
CHit(int seq1_index, int seq2_index, TRange seq_range1, TRange seq_range2, int score, CEditScript edit_script)
Create an alignment with all specified parameters.
Definition: hit.hpp:160
CEditScript m_EditScript
Traceback for this alignment.
Definition: hit.hpp:267
void InsertSubHit(CHit *hit)
Add a hit to a CHit's list of subhits.
Definition: hit.hpp:180
CHit(int seq1_index, int seq2_index, BlastHSP *hsp)
Create an alignment from a BLAST hit.
Definition: hit.hpp:125
CHit(int seq1_index, int seq2_index, int score, const objects::CDense_diag &dendiag)
Create an alignment from a Dense_diag.
int m_Score
Score of alignment.
Definition: hit.hpp:104
CEditScript & GetEditScript()
Retrieve the traceback associated with a CHit.
Definition: hit.hpp:190
int m_SeqIndex1
Numerical identifier for first sequence in alignment.
Definition: hit.hpp:97
int m_SeqIndex2
Numerical identifier for second sequence in alignment.
Definition: hit.hpp:101
TRange m_SeqRange1
The range of offsets on the first sequence.
Definition: hit.hpp:107
CHit(int seq1_index, int seq2_index)
Create an empty alignment.
Definition: hit.hpp:116
TRange m_SeqRange2
The range of offsets on the second sequence.
Definition: hit.hpp:110
~CHit()
Destructor.
Definition: hit.hpp:170
bool HasSubHits()
Query if a CHit has a hierarchy of subhits available.
Definition: hit.hpp:195
vector< CHit * > TSubHit
Hits can be grouped hierarchically.
Definition: hit.hpp:93
Class for representing protein sequences.
Definition: seq.hpp:54
int offset
Definition: replacements.h:160
Definitions of structures used for saving traceback information.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
#define NCBI_COBALT_EXPORT
Definition: ncbi_export.h:977
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
Interface for CSequence class.
Structure holding all information about an HSP.
Definition: blast_hits.h:126
static string subject
static string query
Interface for CEditScript class.
Modified on Fri Sep 20 14:58:18 2024 by modify_doxy.py rev. 669887