NCBI C++ ToolKit
hitlist.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's offical duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================*/
25 
26 /*****************************************************************************
27 
28 File name: hitlist.cpp
29 
30 Author: Jason Papadopoulos
31 
32 Contents: Implementation of CHitList class
33 
34 ******************************************************************************/
35 
36 #include <ncbi_pch.hpp>
37 #include <algo/cobalt/hitlist.hpp>
38 
39 #include <algorithm>
40 
41 
42 /// @file hitlist.cpp
43 /// Implementation of CHitList class
44 
46 BEGIN_SCOPE(cobalt)
47 
48 /// Callback used to sort hits into canonical order
50 
51 
52  /// Sort by seq1 index, then seq2 index, then
53  /// seq1 range, then seq2 range, then score
54  /// @param aa First hit [in]
55  /// @param bb Second hit [in]
56  /// @param true if aa and bb are in the correct order
57  ///
59  CHitList::TListEntry const& bb) const {
60  CHit *a = aa.second;
61  CHit *b = bb.second;
62 
63  if (a->m_SeqIndex1 < b->m_SeqIndex1)
64  return true;
65  if (a->m_SeqIndex1 > b->m_SeqIndex1)
66  return false;
67 
68  if (a->m_SeqIndex2 < b->m_SeqIndex2)
69  return true;
70  if (a->m_SeqIndex2 > b->m_SeqIndex2)
71  return false;
72 
73  if (a->m_SeqRange1.GetFrom() < b->m_SeqRange1.GetFrom())
74  return true;
75  if (a->m_SeqRange1.GetFrom() > b->m_SeqRange1.GetFrom())
76  return false;
77 
78  if (a->m_SeqRange1.GetTo() < b->m_SeqRange1.GetTo())
79  return true;
80  if (a->m_SeqRange1.GetTo() > b->m_SeqRange1.GetTo())
81  return false;
82 
83  if (a->m_SeqRange2.GetFrom() < b->m_SeqRange2.GetFrom())
84  return true;
85  if (a->m_SeqRange2.GetFrom() > b->m_SeqRange2.GetFrom())
86  return false;
87 
88  if (a->m_SeqRange2.GetTo() < b->m_SeqRange2.GetTo())
89  return true;
90  if (a->m_SeqRange2.GetTo() > b->m_SeqRange2.GetTo())
91  return false;
92 
93  return (a->m_Score < b->m_Score);
94  }
95 };
96 
97 void
99 {
100  if (Empty())
101  return;
102 
103  // change all the hits so that the first sequence index
104  // is the one that is the smallest
105 
106  for (int i = 0; i < Size(); i++) {
107  CHit *hit = GetHit(i);
108  if (hit->m_SeqIndex1 < hit->m_SeqIndex2)
109  continue;
110 
111  _ASSERT(hit->m_SeqIndex1 != hit->m_SeqIndex2);
112  swap(hit->m_SeqIndex1, hit->m_SeqIndex2);
113  swap(hit->m_SeqRange1, hit->m_SeqRange2);
115  if (hit->HasSubHits()) {
116  NON_CONST_ITERATE(CHit::TSubHit, subitr, hit->GetSubHit()) {
117  CHit *subhit = *subitr;
118  swap(subhit->m_SeqIndex1, subhit->m_SeqIndex2);
119  swap(subhit->m_SeqRange1, subhit->m_SeqRange2);
120  subhit->GetEditScript().ReverseEditScript();
121  }
122  }
123  }
124 
125  sort(m_List.begin(), m_List.end(), compare_hit_redundant());
126 
127  // delete duplicate hits
128 
129  int i = 0;
130  int num_hits = Size();
131  while(i < num_hits) {
132  int j;
133  for (j = i + 1; j < num_hits; j++) {
134  if (GetHit(j)->m_SeqIndex1 != GetHit(i)->m_SeqIndex1 ||
135  GetHit(j)->m_SeqIndex2 != GetHit(i)->m_SeqIndex2 ||
136  GetHit(j)->m_SeqRange1 != GetHit(i)->m_SeqRange1 ||
137  GetHit(j)->m_SeqRange2 != GetHit(i)->m_SeqRange2 ||
138  GetHit(j)->m_Score != GetHit(i)->m_Score ||
139  GetHit(j)->HasSubHits() || GetHit(i)->HasSubHits()) {
140  break;
141  }
142  SetKeepHit(j, false);
143  }
144  i = j;
145  }
146 
147  // remove the duplicates
148 
150 }
151 
152 
153 /// Given two hits, representing alignments of different
154 /// sequences A and B to the same common sequence, produce
155 /// another hit that represents the alignment directly between
156 /// A and B.
157 /// @param hit1 The first hit [in]
158 /// @param hit2 The second hit [in]
159 /// @return The hit containing the alignment between A and B
160 ///
161 static CHit *
162 x_MatchSubHits(CHit *hit1, CHit *hit2)
163 {
164 
165  // calculate the range on the common sequence where
166  // the A and B sequences overlap.
167 
168  TRange& srange1 = hit1->m_SeqRange2;
169  TRange& srange2 = hit2->m_SeqRange2;
170  TRange s_range(srange1.IntersectionWith(srange2));
171 
172  // ignore an overlap that is too small
173 
174  if (s_range.GetLength() <= CHit::kMinHitSize)
175  return 0;
176 
177  // Now calculate the A and B ranges associated with s_range
178 
179  TRange q_range1, q_range2;
180  TRange tmp_s_range;
181  TRange tmp_tback_range;
182 
183  hit1->GetRangeFromSeq2(s_range, q_range1, tmp_s_range, tmp_tback_range);
184  hit2->GetRangeFromSeq2(s_range, q_range2, tmp_s_range, tmp_tback_range);
185 
186  // Throw away the alignment if the A and B ranges are too
187  // small, or if the difference in A and B lengths is too large
188 
189  if (q_range1.GetLength() <= CHit::kMinHitSize ||
190  q_range2.GetLength() <= CHit::kMinHitSize)
191  return 0;
192 
193  // join the two alignment halves together into a
194  // single alignment. Don't bother recalculating the
195  // score; if there was any overlap at all the scores
196  // should be similar. Also ignore the traceback
197 
198  return new CHit(hit1->m_SeqIndex1, hit2->m_SeqIndex1,
199  q_range1, q_range2,
200  min(hit1->m_Score, hit2->m_Score),
201  CEditScript());
202 }
203 
204 
205 /// Callback to sort a list of hits in order of increasing
206 /// sequence 2 index
208 
209  /// Sort two hits by increasing sequence 2 index
210  /// @param a The first hit
211  /// @param b The second hit
212  /// @return true if 'a' and 'b' are already correctly sorted
213  ///
215  CHitList::TListEntry const& b) const {
216  return (a.second->m_SeqIndex2 < b.second->m_SeqIndex2);
217  }
218 };
219 
220 void
222 {
223  int num_hits = Size();
224 
225  // At least two hits are needed to generate any matches
226  if (num_hits < 2)
227  return;
228 
229  sort(m_List.begin(), m_List.end(), compare_hit_seq2_idx());
230 
231  for (int i = 0; i < num_hits - 1; i++) {
232 
233  // for each hit
234 
235  CHit *hit1 = GetHit(i);
236  _ASSERT(hit1->HasSubHits());
237 
238  // for each succeeding hit
239 
240  for (int j = i + 1; j < num_hits; j++) {
241 
242  CHit *hit2 = GetHit(j);
243  _ASSERT(hit2->HasSubHits());
244 
245  // if hit1 and hit2 do not align to the
246  // same common sequence, later hits will not
247  // align either
248 
249  if (hit1->m_SeqIndex2 != hit2->m_SeqIndex2)
250  break;
251 
252  // sequence 1 must be different for the two hits
253 
254  if (hit1->m_SeqIndex1 == hit2->m_SeqIndex1)
255  continue;
256 
257  CHit *new_hit = 0;
258 
259  // for each subhit of hit1
260 
261  NON_CONST_ITERATE(CHit::TSubHit, itr1, hit1->GetSubHit()) {
262 
263  // for each subhit of hit 2
264 
265  NON_CONST_ITERATE(CHit::TSubHit, itr2, hit2->GetSubHit()) {
266 
267  // create an alignment between the two subhits if
268  // their sequence 2 ranges overlap
269 
270  CHit *new_subhit = x_MatchSubHits(*itr1, *itr2);
271  if (new_subhit != 0) {
272  if (new_hit == 0) {
273  // create the first subhit of a new hit
274 
275  new_hit = new CHit(hit1->m_SeqIndex1,
276  hit2->m_SeqIndex1);
277  new_hit->InsertSubHit(new_subhit);
278  continue;
279  }
280  CHit::TSubHit& subhits = new_hit->GetSubHit();
281  if (new_subhit->m_SeqRange1.GetFrom() -
282  subhits.back()->m_SeqRange1.GetTo() !=
283  new_subhit->m_SeqRange2.GetFrom() -
284  subhits.back()->m_SeqRange2.GetTo()) {
285 
286  // the current generated subhit does not
287  // start the same distance away from the
288  // previous subhit on bother sequences.
289  // Merge the new subhit into the previous one
290 #if 0
291  //---------------------------------------
292  printf("collapse query %d %d-%d / %d-%d "
293  "query %d %d-%d %d-%d\n",
294  hit1->m_SeqIndex1,
295  subhits.back()->m_SeqRange1.GetFrom(),
296  subhits.back()->m_SeqRange1.GetTo(),
297  new_subhit->m_SeqRange1.GetFrom(),
298  new_subhit->m_SeqRange1.GetTo(),
299  hit2->m_SeqIndex1,
300  subhits.back()->m_SeqRange2.GetFrom(),
301  subhits.back()->m_SeqRange2.GetTo(),
302  new_subhit->m_SeqRange2.GetFrom(),
303  new_subhit->m_SeqRange2.GetTo());
304  //---------------------------------------
305 #endif
306  subhits.back()->m_SeqRange1.SetTo(
307  new_subhit->m_SeqRange1.GetTo());
308  subhits.back()->m_SeqRange2.SetTo(
309  new_subhit->m_SeqRange2.GetTo());
310  subhits.back()->m_Score += new_subhit->m_Score;
311  delete new_subhit;
312  }
313  else {
314  // just add the new subhit
315 
316  new_hit->InsertSubHit(new_subhit);
317  }
318  }
319  }
320  }
321 
322  // if any subhits were generated, add the resulting
323  // hit to the output list
324  if (new_hit != 0) {
325  new_hit->AddUpSubHits();
326  matched_list.AddToHitList(new_hit);
327  }
328  }
329  }
330 }
331 
332 
333 /// callback used to sort hits in order of decreasing score
335 public:
336  /// Compare hits by score
337  /// @param a First hit
338  /// @param b Second hit
339  /// @return true if hits are already in order of
340  /// descending score
341  ///
343  CHitList::TListEntry const& b) const {
344  return (a.second->m_Score > b.second->m_Score);
345  }
346 };
347 
348 void
350 {
351  sort(m_List.begin(), m_List.end(), compare_hit_score());
352 }
353 
354 /// callback used to sort hits in order of increasing
355 /// sequence offset
357 
358  /// Compare hits by sequence offset
359  /// @param aa First hit
360  /// @param bb Second hit
361  /// @return true if hits are already in order of
362  /// increasing offset on sequence 1, with the
363  /// offset on sequence 2 used as a tiebreaker
364  ///
366  CHitList::TListEntry const& bb) const {
367  CHit *a = aa.second;
368  CHit *b = bb.second;
369  if (a->m_SeqRange1.GetFrom() < b->m_SeqRange1.GetFrom())
370  return true;
371  if (a->m_SeqRange1.GetFrom() > b->m_SeqRange1.GetFrom())
372  return false;
373 
374  return (a->m_SeqRange2.GetFrom() < b->m_SeqRange2.GetFrom());
375  }
376 };
377 
378 void
380 {
381  sort(m_List.begin(), m_List.end(), compare_hit_start());
382 }
383 
384 void
386 {
387  for (int i = 0; i < hitlist.Size(); i++) {
388  AddToHitList(hitlist.GetHit(i));
389  m_List.back().second = m_List.back().second->Clone();
390  }
391 }
392 
393 END_SCOPE(cobalt)
Interface for the traceback from blast hits.
Definition: traceback.hpp:55
void ReverseEditScript()
Reverse an edit script; insertions become deletions and vice versa.
Definition: traceback.hpp:90
An ordered collection of CHit objects.
Definition: hitlist.hpp:50
void Append(CHitList &hitlist)
Append one hitlist to another.
Definition: hitlist.cpp:385
int Size() const
Retrieve number of hits in list.
Definition: hitlist.hpp:75
void MakeCanonical()
Sort the list in a canonical form: first swap the indices and ranges on all hits and subhits so that ...
Definition: hitlist.cpp:98
void SetKeepHit(int index, bool keep)
Set whether a hit in the hitlist will be scheduled for deletion.
Definition: hitlist.hpp:126
bool Empty()
Determine whether a list contains no hits.
Definition: hitlist.hpp:79
pair< bool, CHit * > TListEntry
add a reminder of whether a hit should be kept
Definition: hitlist.hpp:54
CHit * GetHit(int index)
Retrieve a hit from the hitlist.
Definition: hitlist.hpp:93
void PurgeUnwantedHits()
Delete all hits scheduled to be deleted.
Definition: hitlist.hpp:134
void SortByStartOffset()
Sort the hits in the hitlist by increasing sequence1 index, then by increasing sequence1 start offset...
Definition: hitlist.cpp:379
void SortByScore()
Sort the hits in the hitlist in order of decreasing score.
Definition: hitlist.cpp:349
void MatchOverlappingSubHits(CHitList &matched_list)
For each pair of hits with the same sequence2, produce a list of hits between sequence1 of the first ...
Definition: hitlist.cpp:221
TList m_List
current list of hits
Definition: hitlist.hpp:199
void AddToHitList(CHit *hit)
Append a hit to the hitlist.
Definition: hitlist.hpp:84
A generalized representation of a pairwise alignment.
Definition: hit.hpp:86
TSubHit & GetSubHit()
Retrieve a list of subhits.
Definition: hit.hpp:185
void InsertSubHit(CHit *hit)
Add a to a CHit's list of subhits.
Definition: hit.hpp:180
void GetRangeFromSeq2(TRange seq_range2, TRange &seq_range1, TRange &new_seq_range2, TRange &traceback_range)
Retrieve the seq1 range corresponding to a specified seq2 range.
Definition: hit.cpp:109
void AddUpSubHits()
Sum the score of all subhits, and make the sequence ranges the union of the ranges of all subhits.
Definition: hit.cpp:87
int m_Score
Score of alignment.
Definition: hit.hpp:104
CEditScript & GetEditScript()
Retrieve the traceback associated with a CHit.
Definition: hit.hpp:190
int m_SeqIndex1
Numerical identifier for first sequence in alignment.
Definition: hit.hpp:97
int m_SeqIndex2
Numerical identifier for second sequence in alignment.
Definition: hit.hpp:101
TRange m_SeqRange1
The range of offsets on the first sequence.
Definition: hit.hpp:107
static const int kMinHitSize
Not always used, but useful to avoid extremely small hits.
Definition: hit.hpp:90
TRange m_SeqRange2
The range of offsets on the second sequence.
Definition: hit.hpp:110
bool HasSubHits()
Query if a CHit has a hierarchy of subhits available.
Definition: hit.hpp:195
vector< CHit * > TSubHit
Hits can be grouped hierarchically.
Definition: hit.hpp:93
callback used to sort hits in order of decreasing score
Definition: hitlist.cpp:334
bool operator()(CHitList::TListEntry const &a, CHitList::TListEntry const &b) const
Compare hits by score.
Definition: hitlist.cpp:342
static ulg bb
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
position_type GetLength(void) const
Definition: range.hpp:158
TThisType IntersectionWith(const TThisType &r) const
Definition: range.hpp:312
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
static CHit * x_MatchSubHits(CHit *hit1, CHit *hit2)
Given two hits, representing alignments of different sequences A and B to the same common sequence,...
Definition: hitlist.cpp:162
Interface for CHitList class.
int i
constexpr auto sort(_Init &&init)
unsigned int a
Definition: ncbi_localip.c:102
T min(T x_, T y_)
Callback used to sort hits into canonical order.
Definition: hitlist.cpp:49
bool operator()(CHitList::TListEntry const &aa, CHitList::TListEntry const &bb) const
Sort by seq1 index, then seq2 index, then seq1 range, then seq2 range, then score.
Definition: hitlist.cpp:58
Callback to sort a list of hits in order of increasing sequence 2 index.
Definition: hitlist.cpp:207
bool operator()(CHitList::TListEntry const &a, CHitList::TListEntry const &b) const
Sort two hits by increasing sequence 2 index.
Definition: hitlist.cpp:214
callback used to sort hits in order of increasing sequence offset
Definition: hitlist.cpp:356
bool operator()(CHitList::TListEntry const &aa, CHitList::TListEntry const &bb) const
Compare hits by sequence offset.
Definition: hitlist.cpp:365
#define _ASSERT
Modified on Wed Jun 19 17:01:03 2024 by modify_doxy.py rev. 669887