NCBI C++ ToolKit
hit.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's offical duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================*/
25 
26 /*****************************************************************************
27 
28 File name: hit.cpp
29 
30 Author: Jason Papadopoulos
31 
32 Contents: implementation of CHit class
33 
34 ******************************************************************************/
35 
36 #include <ncbi_pch.hpp>
37 #include <algo/cobalt/hit.hpp>
38 
39 #include <algorithm>
40 
41 /// @file hit.cpp
42 /// Implementation of CHit class
43 
45 BEGIN_SCOPE(cobalt)
47 
48 CHit::CHit(int seq1_index, int seq2_index, int score,
49  const CDense_seg& denseg)
50  : m_SeqIndex1(seq1_index), m_SeqIndex2(seq2_index),
51  m_Score(score), m_EditScript(denseg)
52 {
53  _ASSERT(denseg.GetDim() == 2);
54 
55  CDense_seg::TNumseg num_seg = denseg.GetNumseg();
56  const CDense_seg::TStarts& starts = denseg.GetStarts();
57  const CDense_seg::TLens& lens = denseg.GetLens();
58  int len1 = 0, len2 = 0;
59 
60  for (CDense_seg::TNumseg i = 0; i < num_seg; i++) {
61  if (starts[2*i] >= 0)
62  len1 += lens[i];
63  if (starts[2*i+1] >= 0)
64  len2 += lens[i];
65  }
66  m_SeqRange1 = TRange(starts[0], starts[0] + len1 - 1);
67  m_SeqRange2 = TRange(starts[1], starts[1] + len2 - 1);
68  VerifyHit();
69 }
70 
71 
72 CHit::CHit(int seq1_index, int seq2_index, int score,
73  const CDense_diag& dendiag)
74  : m_SeqIndex1(seq1_index), m_SeqIndex2(seq2_index),
75  m_Score(score), m_EditScript(dendiag)
76 {
77  _ASSERT(dendiag.GetDim() == 2);
78 
79  const CDense_diag::TStarts& starts = dendiag.GetStarts();
80  const CDense_diag::TLen len = dendiag.GetLen();
81  m_SeqRange1 = TRange(starts[0], starts[0] + len - 1);
82  m_SeqRange2 = TRange(starts[1], starts[1] + len - 1);
83  VerifyHit();
84 }
85 
86 
88 {
90 
91  // Make the ranges of the hit into the bounding box
92  // on the ranges of all the subhits. Make its score
93  // the sum of the scores of all the subhits
94 
95  m_SeqRange1 = m_SubHit[0]->m_SeqRange1;
96  m_SeqRange2 = m_SubHit[0]->m_SeqRange2;
97  m_Score = m_SubHit[0]->m_Score;
98 
99  for (int i = 1; i < (int)m_SubHit.size(); i++) {
100  CHit *hit = m_SubHit[i];
103  m_Score += hit->m_Score;
104  }
105 }
106 
107 
108 void
110  TRange& seq_range1,
111  TRange& new_seq_range2,
112  TRange& tback_range)
113 {
114  _ASSERT(m_SeqRange2.Contains(seq_range2));
115 
116  TOffsetPair start_off(m_SeqRange1.GetFrom(),
117  m_SeqRange2.GetFrom());
118  TOffsetPair new_off;
119  TOffset new_tback;
120  TOffset target_off;
121 
122  // Find the left endpoint of the range
123 
124  target_off = seq_range2.GetFrom();
125  m_EditScript.FindOffsetFromSeq2(start_off, new_off, target_off,
126  new_tback, true);
127 
128  seq_range1.SetFrom(new_off.first);
129  new_seq_range2.SetFrom(new_off.second);
130  tback_range.SetFrom(new_tback);
131 
132  // Find the right endpoint of the range
133 
134  target_off = seq_range2.GetTo();
135  m_EditScript.FindOffsetFromSeq2(start_off, new_off, target_off,
136  new_tback, false);
137 
138  seq_range1.SetTo(new_off.first);
139  new_seq_range2.SetTo(new_off.second);
140  tback_range.SetTo(new_tback);
141 }
142 
143 
144 void
146  TRange& new_seq_range1,
147  TRange& seq_range2,
148  TRange& tback_range)
149 {
150  _ASSERT(m_SeqRange1.Contains(seq_range1));
151 
152  TOffsetPair start_off(m_SeqRange1.GetFrom(),
153  m_SeqRange2.GetFrom());
154  TOffsetPair new_off;
155  TOffset new_tback;
156  TOffset target_off;
157 
158  // Find the left endpoint of the range
159 
160  target_off = seq_range1.GetFrom();
161  m_EditScript.FindOffsetFromSeq1(start_off, new_off, target_off,
162  new_tback, true);
163 
164  new_seq_range1.SetFrom(new_off.first);
165  seq_range2.SetFrom(new_off.second);
166  tback_range.SetFrom(new_tback);
167 
168  // Find the right endpoint of the range
169 
170  target_off = seq_range1.GetTo();
171  m_EditScript.FindOffsetFromSeq1(start_off, new_off, target_off,
172  new_tback, false);
173 
174  new_seq_range1.SetTo(new_off.first);
175  seq_range2.SetTo(new_off.second);
176  tback_range.SetTo(new_tback);
177 }
178 
179 void
181 {
182  // verify query and subject ranges are nonempty
183 
186 
187  // verify the traceback matches up to the sequence ranges
188 
190 }
191 
192 
193 /// callback used for sorting HSP lists
195 
196  /// functor that implements hit comparison
197  /// @param a First hit
198  /// @param b Second hit
199  /// @return true if a and b are sorted in order of
200  /// increasing sequence 1 start offset, with
201  /// sequence1 end offset used as a tiebreaker
202  ///
203  bool operator()(CHit * const& a, CHit * const& b) const {
204  if (a->m_SeqRange1.GetFrom() < b->m_SeqRange1.GetFrom())
205  return true;
206  if (a->m_SeqRange1.GetFrom() > b->m_SeqRange1.GetFrom())
207  return false;
208 
209  return (a->m_SeqRange1.GetTo() < b->m_SeqRange1.GetTo());
210  }
211 };
212 
213 
214 /// Delete any hits from a list that are contained within
215 /// higher-scoring hits. Only overlap on sequence 1 is considered.
216 /// In practice, the hits refer to block alignments derived from
217 /// RPS blast results, and sequence 2 is an RPS database sequence.
218 /// It is sequence 1 that matters for later processing
219 /// @param subhits The list of hits to process [in/modified]
220 ///
221 static void
223 {
224  // order the hits by start offset of sequence 1
225  sort(subhits.begin(), subhits.end(), compare_hit_seq1_start());
226  int num_subhits = subhits.size();
227 
228  for (int i = 0; i < num_subhits - 1; i++) {
229 
230  // skip hits that have already been deleted
231 
232  CHit *hit1 = subhits[i];
233  if (hit1 == 0)
234  continue;
235 
236  // for all hits past hit i
237 
238  for (int j = i + 1; j < num_subhits; j++) {
239 
240  // skip hits that have already been deleted
241 
242  CHit *hit2 = subhits[j];
243  if (hit2 == 0)
244  continue;
245 
246  // stop looking when hits that do not overlap at all
247  // on sequence 1 are encountered
248 
249  TRange& range1(hit1->m_SeqRange1);
250  TRange& range2(hit2->m_SeqRange1);
251  if (range1.StrictlyBelow(range2) ||
252  range2.StrictlyBelow(range1)) {
253  break;
254  }
255 
256  // if the sequence 1 ranges overlap, delete the
257  // lower-scoring hit
258 
259  if (range1.Contains(range2) || range2.Contains(range1)) {
260  if (hit1->m_Score > hit2->m_Score) {
261  delete subhits[j]; subhits[j] = 0;
262  continue;
263  }
264  else {
265  delete subhits[i]; subhits[i] = 0;
266  break;
267  }
268  }
269  }
270  }
271 
272  // compress the list of hits to remove null pointers
273 
274  int j = 0;
275  for (int i = 0; i < num_subhits; i++) {
276  if (subhits[i] != 0)
277  subhits[j++] = subhits[i];
278  }
279  if (j < num_subhits)
280  subhits.resize(j);
281 }
282 
283 
284 void
286  int **seq2_pssm,
287  CNWAligner::TScore gap_open,
288  CNWAligner::TScore gap_extend)
289 {
290  if (m_SubHit.size() < 2)
291  return;
292 
293  // first remove subhits that are completely contained
294 
296 
297  int num_subhits = m_SubHit.size();
298 
299  // if there are still any conflicts, they are only between
300  // adjacent pairs of hits. For all such pairs...
301 
302  for (int i = 0; i < num_subhits - 1; i++) {
303  CHit *hit1 = m_SubHit[i];
304  CHit *hit2 = m_SubHit[i + 1];
305 
306  TRange& seq1range1(hit1->m_SeqRange1);
307  TRange& seq1range2(hit1->m_SeqRange2);
308  TRange& seq2range1(hit2->m_SeqRange1);
309  TRange& seq2range2(hit2->m_SeqRange2);
310 
311  // ignore pairs of hits that will never overlap
312  // on sequence 1
313 
314  if (seq1range1.StrictlyBelow(seq2range1))
315  continue;
316 
317  // there's an overlap. First assume the the first hit is
318  // shortened, and calculate the score of the resulting alignment
319 
320  TRange tback_range1;
321  TRange new_q_range1(seq1range1.GetFrom(), seq2range1.GetFrom() - 1);
322  TRange new_s_range1;
323  hit1->GetRangeFromSeq1(new_q_range1, new_q_range1,
324  new_s_range1, tback_range1);
325 
326  int score1 = hit1->GetEditScript().GetScore(tback_range1,
327  TOffsetPair(seq1range1.GetFrom(),
328  seq1range2.GetFrom()),
329  seq1, seq2_pssm, gap_open, gap_extend);
330 
331  // repeat the process assuming the second hit is shortened
332 
333  TRange tback_range2;
334  TRange new_q_range2(seq1range1.GetTo() + 1, seq2range1.GetTo());
335  TRange new_s_range2;
336  hit2->GetRangeFromSeq1(new_q_range2, new_q_range2,
337  new_s_range2, tback_range2);
338 
339  int score2 = hit2->GetEditScript().GetScore(tback_range2,
340  TOffsetPair(seq2range1.GetFrom(),
341  seq2range2.GetFrom()),
342  seq1, seq2_pssm, gap_open, gap_extend);
343 
344 #if 0
345  //------------------------------------------
346  printf("fixup query %d db %d ", hit1->m_SeqIndex1, hit1->m_SeqIndex2);
347  printf("hit1: qoff %d-%d dboff %d-%d score %d ",
348  hit1->m_SeqRange1.GetFrom(), hit1->m_SeqRange1.GetTo(),
349  hit1->m_SeqRange2.GetFrom(), hit1->m_SeqRange2.GetTo(),
350  hit1->m_Score);
351  printf(" vs hsp2: %d-%d %d-%d score %d ",
352  hit2->m_SeqRange1.GetFrom(), hit2->m_SeqRange1.GetTo(),
353  hit2->m_SeqRange2.GetFrom(), hit2->m_SeqRange2.GetTo(),
354  hit2->m_Score);
355  printf("\n");
356  //------------------------------------------
357 #endif
358 
359  // keep the combination that scores the highest
360 
361  if (score1 + hit2->m_Score > hit1->m_Score + score2) {
362  m_SubHit[i+1] = new CHit(hit2->m_SeqIndex1, hit2->m_SeqIndex2,
363  new_q_range2, new_s_range2, score2,
364  hit2->GetEditScript().MakeEditScript(tback_range2));
365  delete hit2;
366  }
367  else {
368  m_SubHit[i] = new CHit(hit1->m_SeqIndex1, hit1->m_SeqIndex2,
369  new_q_range1, new_s_range1, score1,
370  hit1->GetEditScript().MakeEditScript(tback_range1));
371  delete hit1;
372  }
373  }
374 }
375 
376 
377 CHit *
379 {
380  CHit *new_hit = new CHit(m_SeqIndex1, m_SeqIndex2,
383  if (HasSubHits()) {
385  new_hit->InsertSubHit((*itr)->Clone());
386  }
387  }
388  return new_hit;
389 }
390 
391 END_SCOPE(cobalt)
Interface for CHit class, used to encapsulate operations involving pairwise alignments.
int TOffset
Basic data type for offsets into a sequence.
Definition: base.hpp:49
CLocalRange< TOffset > TRange
define for the fundamental building block of sequence ranges
Definition: base.hpp:115
pair< TOffset, TOffset > TOffsetPair
Basic type specifying a range on a sequence.
Definition: base.hpp:52
void VerifyScript(TRange seq1_range, TRange seq2_range)
Validate that the alignment described by the CEditScript has the same size for each sequence as the i...
Definition: traceback.cpp:475
int GetScore(TRange tback_range, TOffsetPair start_offsets, CSequence &seq1, int **seq2_pssm, int gap_open, int gap_extend)
Compute the score associated with (a portion of) an alignment Assumes that seq1 is a sequence and tha...
Definition: traceback.cpp:343
void FindOffsetFromSeq2(TOffsetPair start_offsets, TOffsetPair &new_offsets, TOffset seq2_target, TOffset &new_tback, bool go_past_seq1_gap)
Given a subject offset, find the corresponding query offset.
Definition: traceback.cpp:196
void FindOffsetFromSeq1(TOffsetPair start_offsets, TOffsetPair &new_offsets, TOffset seq1_target, TOffset &new_tback, bool go_past_seq2_gap)
Given a query offset, find the corresponding subject offset.
Definition: traceback.cpp:270
CEditScript MakeEditScript(TRange tback_range)
Return an edit script corresponding to a subset of the complete traceback available.
Definition: traceback.cpp:153
A generalized representation of a pairwise alignment.
Definition: hit.hpp:86
void GetRangeFromSeq1(TRange seq_range1, TRange &new_seq_range1, TRange &seq_range2, TRange &traceback_range)
Retrieve the seq2 range corresponding to a specified seq1 range.
Definition: hit.cpp:145
TSubHit & GetSubHit()
Retrieve a list of subhits.
Definition: hit.hpp:185
vector< CHit * > m_SubHit
Subhits for this alignment.
Definition: hit.hpp:268
void VerifyHit()
Perform basic integrity checks on a CHit.
Definition: hit.cpp:180
void ResolveSubHitConflicts(CSequence &seq1, int **seq2_pssm, CNWAligner::TScore gap_open, CNWAligner::TScore gap_extend)
If pairs of subhits have overlapping ranges, either delete one or change one so that the overlap is a...
Definition: hit.cpp:285
CEditScript m_EditScript
Traceback for this alignment.
Definition: hit.hpp:267
CHit * Clone()
Produce an independent copy of a CHit.
Definition: hit.cpp:378
void InsertSubHit(CHit *hit)
Add a to a CHit's list of subhits.
Definition: hit.hpp:180
void GetRangeFromSeq2(TRange seq_range2, TRange &seq_range1, TRange &new_seq_range2, TRange &traceback_range)
Retrieve the seq1 range corresponding to a specified seq2 range.
Definition: hit.cpp:109
void AddUpSubHits()
Sum the score of all subhits, and make the sequence ranges the union of the ranges of all subhits.
Definition: hit.cpp:87
int m_Score
Score of alignment.
Definition: hit.hpp:104
CEditScript & GetEditScript()
Retrieve the traceback associated with a CHit.
Definition: hit.hpp:190
int m_SeqIndex1
Numerical identifier for first sequence in alignment.
Definition: hit.hpp:97
int m_SeqIndex2
Numerical identifier for second sequence in alignment.
Definition: hit.hpp:101
TRange m_SeqRange1
The range of offsets on the first sequence.
Definition: hit.hpp:107
TRange m_SeqRange2
The range of offsets on the second sequence.
Definition: hit.hpp:110
bool HasSubHits()
Query if a CHit has a hierarchy of subhits available.
Definition: hit.hpp:195
vector< CHit * > TSubHit
Hits can be grouped hierarchically.
Definition: hit.hpp:93
bool StrictlyBelow(const TThisType &r)
Test whether 'this' represents a sequence range strictly less than a given sequence range.
Definition: base.hpp:100
bool Contains(const TThisType &r)
Test whether 'this' completely envelops a given sequence range.
Definition: base.hpp:85
Class for representing protein sequences.
Definition: seq.hpp:54
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
TThisType & CombineWith(const TThisType &r)
Definition: range.hpp:345
bool Empty(void) const
Definition: range.hpp:148
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
void SetFrom(TFrom value)
Assign a value to From data member.
Definition: Range_.hpp:231
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
void SetTo(TTo value)
Assign a value to To data member.
Definition: Range_.hpp:278
CHit(void)
Definition: Hit.hpp:86
vector< TSeqPos > TLens
Definition: Dense_seg_.hpp:108
TLen GetLen(void) const
Get the Len member data.
vector< TSignedSeqPos > TStarts
Definition: Dense_seg_.hpp:107
vector< TSeqPos > TStarts
Definition: Dense_diag_.hpp:94
TDim GetDim(void) const
Get the Dim member data.
const TStarts & GetStarts(void) const
Get the Starts member data.
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
USING_SCOPE(objects)
static void x_RemoveEnvelopedSubHits(CHit::TSubHit &subhits)
Delete any hits from a list that are contained within higher-scoring hits.
Definition: hit.cpp:222
int i
int len
constexpr auto sort(_Init &&init)
unsigned int a
Definition: ncbi_localip.c:102
callback used for sorting HSP lists
Definition: hit.cpp:194
bool operator()(CHit *const &a, CHit *const &b) const
functor that implements hit comparison
Definition: hit.cpp:203
#define _ASSERT
#define const
Definition: zconf.h:232
Modified on Wed Apr 17 13:10:04 2024 by modify_doxy.py rev. 669887