NCBI C++ ToolKit
seqalign_cmp.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seqalign_cmp.cpp 50550 2011-07-22 13:48:03Z camacho $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Christiam Camacho
27  *
28  */
29 
30 /** @file seqalign_cmp.cpp
31  * API to compare CSeq-aligns produced by BLAST
32  */
33 
34 #include <ncbi_pch.hpp>
35 #include "seqalign_cmp.hpp"
36 #ifdef NCBI_OS_IRIX
37 #include <math.h>
38 #else
39 #include <cmath>
40 #endif
41 
42 // Object includes
43 #include <serial/serial.hpp>
44 #include <serial/iterator.hpp>
46 
49 BEGIN_SCOPE(blast)
50 BEGIN_SCOPE(qa)
51 
52 //#define VERBOSE_DEBUG
53 
// BEGIN: Debugging functions

/// Writes the elements of a container to a stream as "{ a, b, c }", or as
/// "{}" when the container is empty.
/// The body is only compiled when VERBOSE_DEBUG is defined; otherwise the
/// function does nothing.
/// @param out stream that receives the formatted elements [in|out]
/// @param c container whose elements can be written with operator<< [in]
template <class Container>
void s_PrintContainer(std::ostream& out, const Container& c)
{
#if defined(VERBOSE_DEBUG)
    if ( c.empty() ) {
        out << "{}";
        return;
    }

    typename Container::const_iterator itr = c.begin();
    out << "{ " << *itr;
    // The first element has just been written, so the loop resumes at the
    // second one; starting again at begin() would print the first one twice.
    for (++itr; itr != c.end(); ++itr) {
        out << ", " << *itr;
    }
    out << " }";
#endif
}
72 
73 
#if defined(VERBOSE_DEBUG)
/// Dumps a neutral SeqAlign to a stream: one "name = value" line per scalar
/// field, followed by the contents of its starts and lengths vectors.
/// Only compiled when VERBOSE_DEBUG is defined.
/// @param out stream that receives the dump [in|out]
/// @param alignment alignment to describe [in]
static void
s_PrintNeutralSeqAlign(ofstream& out, const SeqAlign& alignment)
{
    // Scalar fields, one per line
    out << "SeqAlign::score = " << alignment.score << endl;
    out << "SeqAlign::num_ident = " << alignment.num_ident << endl;
    out << "SeqAlign::evalue = " << alignment.evalue << endl;
    out << "SeqAlign::bit_score = " << alignment.bit_score << endl;
    out << "SeqAlign::match = " << alignment.match << endl;
    out << "SeqAlign::query_strand = " << alignment.query_strand << endl;
    out << "SeqAlign::subject_strand = " << alignment.subject_strand << endl;
    out << "SeqAlign::GetNumSegments() = " << alignment.GetNumSegments()
        << endl;

    // Starting offsets: header line, then the elements on the next line
    out << "SeqAlign::starts[" << alignment.starts.size() << "] = " << endl;
    s_PrintContainer(out, alignment.starts);
    out << endl;

    // Segment lengths, laid out the same way
    out << "SeqAlign::lengths[" << alignment.lengths.size() << "] = " << endl;
    s_PrintContainer(out, alignment.lengths);
    out << endl;
}
#endif
95 
96 static void
97 s_PrintTSeqAlignSet(const string& fname, const TSeqAlignSet& neutral_seqaligns)
98 {
99 #if defined(VERBOSE_DEBUG)
100  ofstream out(fname.c_str());
101  if (!out) {
102  throw runtime_error("Failed to open " + fname);
103  }
104 
105  int index = 0;
106  ITERATE(TSeqAlignSet, alignment, neutral_seqaligns) {
107  out << "SeqAlign # " << ++index << endl;
108  s_PrintNeutralSeqAlign(out, *alignment);
109  }
110 #endif
111 }
112 /// END: Debugging functions
113 
115  const TSeqAlignSet& test,
116  const CSeqAlignCmpOpts& opts)
117 : m_Ref(ref), m_Test(test), m_Opts(opts)
118 {
119 }
120 
121 bool
122 CSeqAlignCmp::Run(string* errors)
123 {
124  bool retval = true;
125 
126  s_PrintTSeqAlignSet("old.neutral.txt", m_Ref);
127  s_PrintTSeqAlignSet("new.neutral.txt", m_Test);
128 
129  // FIXME: add HSP matching logic here (i.e.: go through all SeqAligns in
130  // reference set and match them with SeqAligns in test set. The criteria
131  // used in seqaligndiff was that the least number of diffs between a given
132  // pair of SeqAligns would indicate a match).
133 
134  if (m_Ref.size() != m_Test.size()) {
135  if (errors) {
136  (*errors) += "Different number of alignments:\n";
137  (*errors) += NStr::SizetToString(m_Ref.size()) + " vs. ";
138  (*errors) += NStr::SizetToString(m_Test.size()) + "\n";
139  }
140  retval = false;
141  }
142 
143  // Temporary fix to deal with uneven number of alignments
144  const TSeqAlignSet::size_type kMaxSize = min(m_Ref.size(), m_Test.size());
145  for (TSeqAlignSet::size_type i = 0; i < kMaxSize; i++) {
146  if (x_CompareOneAlign(&m_Ref[i], &m_Test[i], i+1, errors) > 0) {
147  retval = false;
148  }
149  }
150  return retval;
151 }
152 
/// Interface class to hold values and to determine whether some difference
/// in these values should be reported or not
template <class T>
class CValueHolder {
public:
    /// Stores the values to compare.
    /// @param field_name name of the field being compared; only a reference
    /// is kept, so the string must outlive this object [in]
    /// @param reference_value value found in the reference [in]
    /// @param test_value value found in the test [in]
    /// @param invalid_value value returned by GetInvalidValue() [in]
    /// @param max_diff_value value returned by GetMaximumAcceptableDiff() [in]
    CValueHolder(string& field_name,
                 T reference_value,
                 T test_value,
                 T invalid_value,
                 T max_diff_value)
    : m_FieldName(field_name), m_Ref(reference_value), m_Test(test_value),
      m_Invalid(invalid_value), m_MaxDiff(max_diff_value) {}

    virtual ~CValueHolder() {}

    /// @return true if the difference between the reference and test values
    /// should be reported; the criterion is defined by each derived class
    virtual bool ReportDiffs() const = 0;

    string& GetFieldName() const { return m_FieldName; }
    T GetReference() const { return m_Ref; }
    T GetTest() const { return m_Test; }
    T GetInvalidValue() const { return m_Invalid; }
    T GetMaximumAcceptableDiff() const { return m_MaxDiff; }

protected:
    string& m_FieldName;    ///< Name of the field (not owned)
    T m_Ref;                ///< Value in the reference
    T m_Test;               ///< Value in the test
    T m_Invalid;            ///< Value returned by GetInvalidValue()
    T m_MaxDiff;            ///< Value returned by GetMaximumAcceptableDiff()
};
182 
183 class CIntValueHolder : public CValueHolder<int> {
184 public:
185  CIntValueHolder(string& field_name,
186  int reference_value,
187  int test_value,
188  int max_diff_value = 0,
189  int invalid_value = kInvalidIntValue)
190  : CValueHolder<int>(field_name, reference_value, test_value,
191  invalid_value, max_diff_value),
192  m_Diff(std::abs(reference_value - test_value))
193  {}
194 
195  virtual bool ReportDiffs() const {
196  return (m_Diff > GetMaximumAcceptableDiff());
197  }
198 
199 private:
200  int m_Diff;
201 };
202 
203 class CDoubleValueHolder : public CValueHolder<double> {
204 public:
205  CDoubleValueHolder(string& field_name,
206  double reference_value,
207  double test_value,
208  double max_diff_value = 0.0,
209  double invalid_value = kInvalidDoubleValue)
210  : CValueHolder<double>(field_name, reference_value, test_value,
211  invalid_value, max_diff_value),
212  m_Diff(std::fabs(reference_value - test_value) / reference_value)
213  {}
214 
215  virtual bool ReportDiffs() const {
216  return (m_Diff > GetMaximumAcceptableDiff());
217  };
218 
219 private:
220  double m_Diff;
221 };
222 
223 /// Template wrapper around NStr::XToString functions, where X is a data type
224 template <class T>
225 string s_ToString(T value) { return "<unknown type>"; }
226 template <>
227 string s_ToString(int value) { return NStr::IntToString(value); }
228 template <>
229 string s_ToString(double value) {
231 }
232 
233 /** Compare values in the CValueHolder object.
234  * @param aln_num Number which identifies this alignment, used in conjunction
235  * with errors string to produce human readable output [in]
236  * @param errors string to which errors will be appended
237  * @return false if values differ, true otherwise
238  */
239 template <class T>
240 bool s_CompareValues(const CValueHolder<T>& value_holder,
241  int aln_num = 0,
242  string* errors = NULL)
243 {
244  if (value_holder.GetReference() == value_holder.GetInvalidValue() &&
245  value_holder.GetTest() != value_holder.GetInvalidValue()) {
246  if (errors) {
247  (*errors) += "align " + s_ToString(aln_num) + ": ";
248  (*errors) += value_holder.GetFieldName() + " present\n";
249  }
250  return false;
251  } else if (value_holder.GetReference() != value_holder.GetInvalidValue() &&
252  value_holder.GetTest() == value_holder.GetInvalidValue()) {
253  if (errors) {
254  (*errors) += "align " + s_ToString(aln_num) + ": ";
255  (*errors) += value_holder.GetFieldName() + " absent\n";
256  }
257  return false;
258  } else if (value_holder.GetReference() != value_holder.GetInvalidValue() &&
259  value_holder.GetTest() != value_holder.GetInvalidValue()) {
260  if (value_holder.ReportDiffs()) {
261  if (errors) {
262  (*errors) += "align " + s_ToString(aln_num) + ": ";
263  (*errors) += "different " + value_holder.GetFieldName() + ", ";
264  (*errors) += s_ToString(value_holder.GetReference());
265  (*errors) += " vs. ";
266  (*errors) += s_ToString(value_holder.GetTest()) + "\n";
267  }
268  return false;
269  }
270  }
271  return true;
272 }
273 
274 bool
276 {
277  if (reference != kInvalidDoubleValue && test != kInvalidDoubleValue &&
278  ((reference < m_Opts.GetMaxEvalue() && test < m_Opts.GetMaxEvalue()) ||
279  (reference > m_Opts.GetMinEvalue() && test > m_Opts.GetMinEvalue()))) {
280  return true;
281  } else {
282  return false;
283  }
284 }
285 
286 
287 /** Returns a pair containing the length of the aligned region in the query and
288  * the length of the aligned region in the subject
289  * @param starts starting offsets vector. Even entries represent query offsets,
290  * odd entries represent subject offsets. [in]
291  * @param lengths represents the lengths of the aligned regions in the query
292  * and subject. [in]
293  */
294 static pair<int, int>
295 s_GetAlignmentLengths(const vector<int>& starts,
296  const vector<TSeqPos>& lengths)
297 {
298  int query_length = 0;
299  int subject_length = 0;
300 
301  _ASSERT(lengths.size()*SeqAlign::kNumDimensions == starts.size());
302 
303  for (vector<TSeqPos>::size_type i = 0; i < lengths.size(); i++) {
304  if (starts[SeqAlign::kNumDimensions*i] > 0) {
305  query_length += lengths[i];
306  }
307  if (starts[SeqAlign::kNumDimensions*i+1] > 0) {
308  subject_length += lengths[i];
309  }
310  }
311 
312  return make_pair(query_length, subject_length);
313 }
314 
315 int
317  const SeqAlign* test,
318  int index,
319  string* errors,
320  bool allow_fuzziness)
321 {
322  int retval = 0;
323 
324  if ( !x_MeetsEvalueRequirements(ref->evalue, test->evalue) ) {
325  return retval;
326  }
327 
328  // Compare evalues
329  {
330  string field("evalue");
331  double max_diff = allow_fuzziness ? m_Opts.GetMaxEvalueDiff() : 0.0;
332  CDoubleValueHolder vh(field, ref->evalue, test->evalue, max_diff);
333  if ( !s_CompareValues(vh, index, errors) ) {
334  retval++;
335  }
336  }
337 
338  // Compare bit scores
339  {
340  string field("bit score");
341  double max_diff = allow_fuzziness ? m_Opts.GetMaxEvalueDiff() : 0.0;
342  CDoubleValueHolder vh(field, ref->bit_score, test->bit_score,
343  max_diff);
344  if ( !s_CompareValues(vh, index, errors) ) {
345  retval++;
346  }
347  }
348 
349  // Compare raw scores (no fuzziness allowed)
350  {
351  string field("raw score");
352  CIntValueHolder vh(field, ref->score, test->score);
353  if ( !s_CompareValues(vh, index, errors) ) {
354  retval++;
355  }
356  }
357 
358  // Compare number of identities (no fuzziness allowed)
359  {
360  string field("num identities");
361  CIntValueHolder vh(field, ref->num_ident, test->num_ident);
362  if ( !s_CompareValues(vh, index, errors) ) {
363  retval++;
364  }
365  }
366 
367  // Compare number of segments (no fuzziness allowed)
368  {
369  string field("number of segments");
370  CIntValueHolder vh(field, ref->GetNumSegments(),
371  test->GetNumSegments());
372  if ( !s_CompareValues(vh, index, errors) ) {
373  retval++;
374  }
375  }
376 
377  /* When comparing the alignment, do not go segment by segment, instead
378  * compute the total length, start offset, and strand */
379 
380  // Compare start of query in aligment
381  {
382  string field("total query align start");
383  int max_diff = allow_fuzziness ? m_Opts.GetMaxOffsetDiff() : 0;
384  CIntValueHolder vh(field, ref->starts[0], test->starts[0], max_diff);
385  if ( !s_CompareValues(vh, index, errors) ) {
386  retval++;
387  }
388  }
389 
390  // Compare start of subject in aligment
391  {
392  string field("total subject align start");
393  int max_diff = allow_fuzziness ? m_Opts.GetMaxOffsetDiff() : 0;
394  CIntValueHolder vh(field, ref->starts[1], test->starts[1], max_diff);
395  if ( !s_CompareValues(vh, index, errors) ) {
396  retval++;
397  }
398  }
399 
400  pair<int, int> ref_lengths =
402  pair<int, int> test_lengths =
403  s_GetAlignmentLengths(test->starts, test->lengths);
404 
405  // Compare the length of aligned region in the query
406  {
407  string field("total query align length");
408  int max_diff = allow_fuzziness ? m_Opts.GetMaxLengthDiff() : 0;
409  CIntValueHolder vh(field, ref_lengths.first, test_lengths.first,
410  max_diff);
411  if ( !s_CompareValues(vh, index, errors) ) {
412  retval++;
413  }
414  }
415 
416  // Compare the length of the aligned region in the subject
417  {
418  string field("total subject align length");
419  int max_diff = allow_fuzziness ? m_Opts.GetMaxLengthDiff() : 0;
420  CIntValueHolder vh(field, ref_lengths.second, test_lengths.second,
421  max_diff);
422  if ( !s_CompareValues(vh, index, errors) ) {
423  retval++;
424  }
425  }
426 
427  // Compare strand of query in aligment (no fuzziness allowed)
428  {
429  string field("query strand");
430  CIntValueHolder vh(field, ref->query_strand, test->query_strand);
431  if ( !s_CompareValues(vh, index, errors) ) {
432  retval++;
433  }
434  }
435 
436  // Compare strand of subject in aligment (no fuzziness allowed)
437  {
438  string field("subject strand");
439  CIntValueHolder vh(field, ref->subject_strand, test->subject_strand);
440  if ( !s_CompareValues(vh, index, errors) ) {
441  retval++;
442  }
443  }
444 
445  return retval;
446 }
447 
448 END_SCOPE(qa)
449 END_SCOPE(blast)
451 
virtual bool ReportDiffs() const
CDoubleValueHolder(string &field_name, double reference_value, double test_value, double max_diff_value=0.0, double invalid_value=kInvalidDoubleValue)
CIntValueHolder(string &field_name, int reference_value, int test_value, int max_diff_value=0, int invalid_value=kInvalidIntValue)
virtual bool ReportDiffs() const
Configuration options for CSeqAlignCmp class.
int GetMaxOffsetDiff() const
double GetMinEvalue() const
int GetMaxLengthDiff() const
double GetMaxEvalueDiff() const
double GetMaxEvalue() const
const TSeqAlignSet & m_Test
The sequence alignment to be used as test (compared with reference)
const TSeqAlignSet & m_Ref
The sequence alignment to be used as reference (assumed correct)
bool x_MeetsEvalueRequirements(double reference, double test)
If reference and test alignments do not fall into the evalue range specified, don't perform the compa...
int x_CompareOneAlign(const SeqAlign *ref, const SeqAlign *test, int index, string *errors=NULL, bool allow_fuzziness=true)
Compare alignment ref with alignment test (which correspond to entry index in the TMatchedAlignments.
CSeqAlignCmp(const TSeqAlignSet &ref, const TSeqAlignSet &test, const CSeqAlignCmpOpts &options)
Parametrized constructor.
const CSeqAlignCmpOpts & m_Opts
Our configuration options.
bool Run(string *errors=NULL)
Main function for this object, compare the input Seq-aligns.
Interface class to hold values and to determine whether some difference in these values should be rep...
string & m_FieldName
CValueHolder(string &field_name, T reference_value, T test_value, T invalid_value, T max_diff_value)
T GetTest() const
T GetInvalidValue() const
virtual bool ReportDiffs() const =0
virtual ~CValueHolder()
T GetMaximumAcceptableDiff() const
string & GetFieldName() const
T GetReference() const
char value[7]
Definition: config.c:431
#define T(s)
Definition: common.h:230
std::ofstream out("events_result.xml")
main entry point for tests
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NULL
Definition: ncbistd.hpp:225
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static string SizetToString(size_t value, TNumToStringFlags flags=0, int base=10)
Convert size_t to string.
Definition: ncbistr.cpp:2751
static string DoubleToString(double value, int precision=-1, TNumToStringFlags flags=0)
Convert double to string.
Definition: ncbistr.hpp:5186
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5083
@ fDoubleScientific
DoubleToString*(): Use scientific format for double conversions.
Definition: ncbistr.hpp:256
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
Magic spell ;-) needed for some weird compilers... very empiric.
#define fabs(v)
Definition: ncbi_dispd.c:46
#define abs(a)
Definition: ncbi_heapmgr.c:130
const int kInvalidIntValue
std::vector< SeqAlign > TSeqAlignSet
Vector of neutral sequence alignments.
const double kInvalidDoubleValue
T min(T x_, T y_)
USING_SCOPE(objects)
static pair< int, int > s_GetAlignmentLengths(const vector< int > &starts, const vector< TSeqPos > &lengths)
Returns a pair containing the length of the aligned region in the query and the length of the aligned...
bool s_CompareValues(const CValueHolder< T > &value_holder, int aln_num=0, string *errors=NULL)
Compare values in the CValueHolder object.
string s_ToString(T value)
Template wrapper around NStr::XToString functions, where X is a data type.
static void s_PrintTSeqAlignSet(const string &fname, const TSeqAlignSet &neutral_seqaligns)
void s_PrintContainer(ofstream &out, const Container &c)
BEGIN: Debugging functions.
API to compare CSeq-aligns produced by BLAST.
Neutral sequence alignment (for representing an HSP in BLAST)
int GetNumSegments() const
Return the number of segments in the HSP.
std::vector< TSeqPos > lengths
Lengths of aligned segments.
double bit_score
HSP bit score.
int num_ident
Number of identical residues.
double evalue
HSP evalue.
std::vector< int > starts
Query/Subject starting offsets.
int score
HSP score.
int query_strand
Strand of the query sequence.
int subject_strand
Strand of the subject sequence.
int test(int srctype, const void *srcdata, int srclen, int dsttype, int dstlen)
Definition: t0019.c:43
#define _ASSERT
#define const
Definition: zconf.h:230
Modified on Sat Dec 09 04:45:37 2023 by modify_doxy.py rev. 669887