NCBI C++ ToolKit
blast_tabular.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blast_tabular.cpp 84945 2018-12-31 20:16:49Z dicuccio $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Yuri Kapustin
27 *
28 * File Description:
29 *
30 */
31 
32 #include <ncbi_pch.hpp>
40 #include <corelib/ncbiutil.hpp>
41 
42 #include <numeric>
43 
46 
47 //////////////////////////////////////////////////////////////////////////////
48 // explicit specializations
49 
50 
/// Build a tabular hit from a Seq-align.
///
/// Walks the alignment segments to derive the alignment length, the number of
/// gap segments and the total gapped length, then fills score, raw score,
/// identity, length, gaps, mismatches and e-value. Named scores stored in the
/// Seq-align ("bit_score", "score", "num_ident", "e_value") are used when
/// GetNamedScore() finds them; otherwise the estimates coded below are used.
///
/// @param seq_align   source alignment; only Dense-seg and Std-seg segments
///                    are handled, anything else throws
///                    CAlgoAlignUtilException (eInternal).
/// @param save_xcript forwarded unchanged to the TParent constructor.
///
/// NOTE(review): in this listing the opening line of several throw statements
/// is missing; only their message strings remain.
51 CBlastTabular::CBlastTabular(const CSeq_align& seq_align, bool save_xcript):
52  TParent(seq_align, save_xcript)
53 {
54  const CSeq_align::TSegs & seq_align_segs (seq_align.GetSegs());
55 
  // spaces  : accumulated length of gapped segments
  // gaps    : number of gapped segments
  // aln_len : accumulated length of all segments
56  TCoord spaces (0), gaps (0), aln_len (0);
57  if(seq_align_segs.IsDenseg()) {
58 
59  /// Dense-segs imply same scale on query and subject
60 
61  const CDense_seg & ds (seq_align_segs.GetDenseg());
62  const CDense_seg::TLens & lens (ds.GetLens());
63  const CDense_seg::TStarts & starts (ds.GetStarts());
64  for(size_t i (0), dim (lens.size()); i < dim; ++i) {
  // A start of -1 on either row marks the segment as a gap.
  // NOTE(review): indexing by 2*i assumes exactly two rows per segment;
  // the dense-seg dimension is not checked here.
65  if(starts[2*i] == -1 || starts[2*i+1] == -1) {
66  ++gaps;
67  spaces += lens[i];
68  }
69  aln_len += lens[i];
70  }
71  }
72  else if (seq_align_segs.IsStd()) {
73 
74  const CSeq_align::TSegs::TStd & stdsegs (seq_align_segs.GetStd());
75 
76  /// Std-seg allow different coordinate scales
77  /// such as in prot-nuc or nuc-prot alignments.
78  /// We assume that the scale ratio is const for all segments.
79 
80  /// Find the coordinate scale ratios
81  size_t scale [2];
82  {{
  // len[k] sums the row-k range lengths over segments where both rows
  // are non-empty.
83  TSeqPos len [2] = {0, 0};
84  if(stdsegs.empty()) {
86  "CBlastTabular(): Cannot init off of an empty seq-align.");
87  }
88 
89  ITERATE(CSeq_align::TSegs::TStd, ii, stdsegs) {
90 
91  const CStd_seg & seg (**ii);
92  const CStd_seg::TLoc & locs (seg.GetLoc());
93  if (locs.size() != 2) {
95  "Unexpected std-seg alignment");
96  }
97 
98  const TSeqPos r0 (locs[0]->GetTotalRange().GetLength());
99  const TSeqPos r1 (locs[1]->GetTotalRange().GetLength());
100  if(r0 > 0 && r1 > 0) {
101  len[0] += r0;
102  len[1] += r1;
103  }
104  }
105 
  // ratio stays 0 when either total is zero, which selects the first
  // branch below (scale = {1, 3}).
106  int ratio = 0;
107  if (len[0] && len[1]) {
108  // ideally 6*(1/3, 1, or 3), i.e. 2, 6, or 18
109  ratio = (6*len[0])/len[1];
110  }
111 
  // Thresholds 4 and 12 separate the three ideal values 2, 6 and 18.
112  if (ratio < 4) {
113  scale[0] = 1;
114  scale[1] = 3;
115  } else if (ratio < 12) {
116  scale[0] = scale[1] = 1;
117  } else {
118  scale[0] = 3;
119  scale[1] = 1;
120  }
121  }}
122 
123  /// Parse the segments to collect basic alignment stats
124 
  // prev[k] holds the last coordinate seen on row k; TSeqPos(-1) means
  // no segment has been seen yet.
125  TSeqPos prev [2] = { TSeqPos(-1), TSeqPos(-1) };
126  ITERATE(CSeq_align::TSegs::TStd, ii, stdsegs) {
127 
128  const CStd_seg & seg (**ii);
129 
130  const CStd_seg::TLoc & locs (seg.GetLoc());
131  TSeqPos delta [2] = {0, 0};
132  sx_MineSegment(0, locs, delta, prev);
133  sx_MineSegment(1, locs, delta, prev);
134 
  // A zero delta on one row means that row is gapped in this segment;
  // the other row's length, divided by its scale, is counted as gap.
135  if(delta[0] == 0) {
136  if(delta[1] == 0) {
138  "CBlastTabular(): Empty std-segs not expected.");
139  }
140  else {
141  const TSeqPos increment (delta[1] / scale[1]);
142  aln_len += increment;
143  spaces += increment;
144  ++gaps;
145  }
146  }
147  else if (delta[1] == 0) {
148  const TSeqPos increment (delta[0] / scale[0]);
149  aln_len += increment;
150  spaces += increment;
151  ++gaps;
152  }
153  else {
  // Both rows present: the length is measured on row 0.
154  aln_len += delta[0] / scale[0];
155  }
156  }
157  }
158  else {
159  NCBI_THROW(CAlgoAlignUtilException, eInternal, "Unsupported seq-align type");
160  }
161 
162  /// Assign the scores
163 
164 
  // Bit score: taken from the alignment, else estimated as 2 * aln_len.
165  double score;
166  if(seq_align.GetNamedScore("bit_score", score) == false) {
167  score = 2.f * aln_len;
168  }
169  SetScore(float(score));
170 
  // The return value of GetNamedScore("score", ...) is ignored here;
  // presumably raw_score keeps the truncated bit score when no "score"
  // entry exists - TODO confirm against CSeq_align::GetNamedScore.
171  int raw_score = (int)score;//wrong, but better than 0
172  seq_align.GetNamedScore("score", raw_score);
173  SetRawScore((TCoord)raw_score);
174 
175  double matches;
176  if(seq_align.GetNamedScore("num_ident", matches) == false) {
177  matches = aln_len - spaces; // upper estimate
178  }
  // NOTE(review): aln_len can still be 0 here (e.g. a dense-seg with no
  // segments), in which case this floating-point division is by zero.
179  SetIdentity(float(matches / aln_len));
180 
181  SetLength(aln_len);
182  SetGaps(gaps);
  // NOTE(review): TCoord looks to be unsigned (TSeqPos); if "num_ident"
  // exceeds aln_len - spaces this subtraction would wrap - confirm.
183  SetMismatches(aln_len - spaces - TCoord(matches));
184 
  // E-value defaults to 0 when the alignment does not carry one.
185  double evalue;
186  if(seq_align.GetNamedScore("e_value", evalue) == false) {
187  evalue = 0;
188  }
189  SetEValue(evalue);
190 }
191 
192 
/// Measure one row of a std-seg and check it continues the previous segment.
///
/// @param where index of the row inside locs to inspect.
/// @param locs  the two locations of the current std-seg.
/// @param delta out: delta[where] receives the interval length
///              (1 + distance between its ends), or 0 when the location is
///              not an interval.
/// @param prev  in/out: prev[where] is the last coordinate of the previous
///              segment on this row (TSeqPos(-1) is treated as "none yet");
///              it is updated to this interval's stop coordinate.
///
/// When the interval does not start right after prev[where], the
/// discontiguity branch at the end is taken.
/// NOTE(review): the opening line of that throw statement is missing from
/// this listing; only its message string remains.
193 void CBlastTabular::sx_MineSegment(size_t where, const CStd_seg::TLoc & locs,
194  TSeqPos * delta, TSeqPos * prev)
195 {
196  const CSeq_loc & row_loc (*locs[where]);
  // NOTE(review): this freshly allocated object is never used; the
  // declaration inside the if-branch below shadows it.
197  CConstRef<CSeq_interval> row_interval (new CSeq_interval);
198 
199  if(row_loc.IsInt()) {
200  const CSeq_interval & row_interval (row_loc.GetInt());
201  bool disc_seg_found (false);
202  if(row_loc.GetStrand() == eNa_strand_minus) {
  // Minus strand: the segment runs from GetTo() down to GetFrom(),
  // so the next segment is expected to start one below prev.
203  const TSeqPos row_stop (row_interval.GetFrom());
204  const TSeqPos row_start (row_interval.GetTo());
205  if(prev[where] != TSeqPos(-1) && prev[where] != row_start + 1) {
206  disc_seg_found = true;
207  }
208  delta[where] = 1 + row_start - row_stop;
209  prev[where] = row_stop;
210  }
211  else {
  // Any other strand: the segment runs from GetFrom() up to GetTo().
212  const TSeqPos row_start (row_interval.GetFrom());
213  const TSeqPos row_stop (row_interval.GetTo());
214  if(prev[where] != TSeqPos(-1) && prev[where] + 1 != row_start) {
215  disc_seg_found = true;
216  }
217  delta[where] = 1 + row_stop - row_start;
218  prev[where] = row_stop;
219  }
220 
221  if(disc_seg_found) {
223  "CBlastTabular(): discontiguous std-segs not expected");
224  }
225  }
226  else {
  // Non-interval location: reported as zero length; prev is untouched.
227  delta[where] = 0;
228  }
229 }
230 
231 
232 
233 CBlastTabular::CBlastTabular(const TId& idquery, TCoord qstart, bool qstrand,
234  const TId& idsubj, TCoord sstart, bool sstrand,
235  const string& xcript):
236  TParent(idquery, qstart, qstrand, idsubj, sstart, sstrand, xcript)
237 {
238  m_Length = xcript.size();
239  m_Mismatches = m_Gaps = 0;
240  bool diag (true);
241  size_t matches (0);
242  for(size_t i = 0; i < m_Length; ++i) {
243  switch(xcript[i]) {
244  case 'R': ++m_Mismatches; diag = true; break;
245  case 'M': ++matches; diag = true; break;
246  case 'I': case 'D': if(diag) {diag = false; ++m_Gaps; } break;
247  }
248  }
249 
250  SetIdentity(double(matches) / m_Length);
251 
252  m_EValue = 0;
253  m_Score = 2 * matches;
254  m_RawScore = 2 * matches;
255 }
256 
257 namespace {
258  class CLocalSeqIdCreator {
259  public:
260  CRef<CSeq_id> operator()(const string& strid)
261  {
262  CRef<CSeq_id> seqid;
263  seqid.Reset(new CSeq_id);
264  seqid->SetLocal().SetStr(strid);
265  return seqid;
266  }
267  };
268 
269  class CLastFastaId {
270  public:
271  CRef<CSeq_id> operator()(const string& strid)
272  {
273  CRef<CSeq_id> seqid;
274  CBioseq::TId ids;
275  CSeq_id::ParseFastaIds(ids, strid);
276  if(ids.size()) {
277  seqid = ids.back();
278  } else {
279  seqid.Reset(NULL);
280  }
281  return seqid;
282  }
283  };
284 
285  class CBestSeqIdExtractor {
286  public:
287  CBestSeqIdExtractor(CBlastTabular::SCORE_FUNC score_func) :
288  m_score_func(score_func) {}
289  CRef<CSeq_id> operator()(const string& strid)
290  {
291  CRef<CSeq_id> seqid;
292  CBioseq::TId ids;
293  CSeq_id::ParseFastaIds(ids, strid);
294  if(ids.size()) {
295  seqid = FindBestChoice(ids, m_score_func);
296  } else {
297  seqid.Reset(NULL);
298  }
299  return seqid;
300  }
301  protected:
302  CBlastTabular::SCORE_FUNC m_score_func;
303  };
304 }
305 
306 CBlastTabular::CBlastTabular(const char* m8, bool force_local_ids)
307 {
308  if (force_local_ids)
309  x_Deserialize(m8, CLocalSeqIdCreator());
310  else
311  x_Deserialize(m8, CLastFastaId());
312 }
313 
314 CBlastTabular::CBlastTabular(const char* m8, SCORE_FUNC score_func)
315 {
316  x_Deserialize(m8, CBestSeqIdExtractor(score_func));
317 }
318 
/// Parse the two leading id tokens of an m8 line.
///
/// The first two whitespace-delimited tokens are converted to Seq-ids with
/// seq_id_extractor and stored in m_Id.first / m_Id.second. If either id
/// ends up null, the error branch below is taken (format error naming the
/// offending line).
///
/// @param m8               NUL-terminated m8 line.
/// @param seq_id_extractor callable taking a const string& and returning a
///                         CRef<CSeq_id>; a null result means "not recognized".
///
/// NOTE(review): two source lines are missing from this listing - the
/// opening of the throw statement and the statement just before the closing
/// brace.
319 template <class F>
320 void CBlastTabular::x_Deserialize(const char* m8, F seq_id_extractor)
321 {
322  const char* p0 = m8, *p = p0;
323  for(; *p && isspace((unsigned char)(*p)); ++p); // skip spaces
324  for(p0 = p; *p && !isspace((unsigned char)(*p)); ++p); // get token
  // The token is only accepted when more text follows it; a line that
  // ends right after the token leaves the id unset.
325  if(*p) {
326  const string id1 (p0, p - p0);
327  m_Id.first = seq_id_extractor(id1);
328  }
329 
330  for(; *p && isspace((unsigned char)(*p)); ++p); // skip spaces
331  for(p0 = p; *p && !isspace((unsigned char)(*p)); ++p); // get token
332  if(*p) {
333 
334  const string id2 (p0, p - p0);
335  m_Id.second = seq_id_extractor(id2);
336  }
337 
338  if(m_Id.first.IsNull() || m_Id.second.IsNull()) {
340  eFormat,
341  "Unable to recognize sequence IDs in "
342  + string(m8));
343  }
344 
  // Advance p to the first character after the whitespace that follows
  // the second id.
345  for(; *p && isspace((unsigned char)(*p)); ++p); // skip spaces
346 
348 }
349 
350 
351 //////////////////////////////////////////////////////////////////////////////
352 // getters and setters
353 
354 
356 {
  // Overwrite the stored length with the supplied value.
  // NOTE(review): the signature line of this definition is missing from
  // this listing.
357  m_Length = length;
358 }
359 
360 
361 
363 {
  // Return the stored length.
  // NOTE(review): the signature line of this definition is missing from
  // this listing.
364  return m_Length;
365 }
366 
367 
368 
370 {
  // Overwrite the stored mismatch count with the supplied value.
  // NOTE(review): the signature line of this definition is missing from
  // this listing.
371  m_Mismatches = mismatches;
372 }
373 
374 
375 
377 {
  // Return the stored mismatch count.
  // NOTE(review): the signature line of this definition is missing from
  // this listing.
378  return m_Mismatches;
379 }
380 
381 
382 
384 {
  // Overwrite the stored gap count with the supplied value.
  // NOTE(review): the signature line of this definition is missing from
  // this listing.
385  m_Gaps = gaps;
386 }
387 
388 
389 
391 {
  // Return the stored gap count.
  // NOTE(review): the signature line of this definition is missing from
  // this listing.
392  return m_Gaps;
393 }
394 
395 
396 
398 {
  // Overwrite the stored raw score with the supplied value.
  // NOTE(review): the signature line of this definition is missing from
  // this listing.
399  m_RawScore = score;
400 }
401 
402 
403 
405 {
  // Return the stored raw score.
  // NOTE(review): the signature line of this definition is missing from
  // this listing.
406  return m_RawScore;
407 }
408 
409 
410 
411 void CBlastTabular::SetEValue(double EValue)
412 {
413  m_EValue = EValue;
414 }
415 
416 
417 
418 double CBlastTabular::GetEValue(void) const
419 {
420  return m_EValue;
421 }
422 
423 
424 
425 void CBlastTabular::SetScore(float score)
426 {
427  m_Score = score;
428 }
429 
430 
431 
432 float CBlastTabular::GetScore(void) const
433 {
434  return m_Score;
435 }
436 
437 
438 void CBlastTabular::SetIdentity(float identity)
439 {
440  m_Identity = identity;
441 }
442 
443 
444 
445 float CBlastTabular::GetIdentity(void) const
446 {
447  return m_Identity;
448 }
449 
450 
451 /////////////////////////////////////////////////////////////////////////////
452 // tabular serialization / deserialization
453 
454 
456 {
  // Writes the numeric columns to os, tab-separated, in this order:
  // identity * 100, length, mismatches, gaps, query start, query stop,
  // subject start, subject stop, e-value, score.
  // Each of the four coordinates is written with 1 added to the stored
  // value. The transcript is appended as a final column only when it is
  // non-empty. No trailing newline is written.
  // NOTE(review): the signature line of this definition is missing from
  // this listing.
457  os << 100.0 * GetIdentity() << '\t' << GetLength() << '\t'
458  << GetMismatches() << '\t' << GetGaps() << '\t'
459  << TParent::GetQueryStart() + 1 << '\t'
460  << TParent::GetQueryStop() + 1 << '\t'
461  << TParent::GetSubjStart() + 1 << '\t'
462  << TParent::GetSubjStop() + 1 << '\t'
463  << GetEValue() << '\t' << GetScore();
464  if(m_Transcript.size() > 0) {
465  os << '\t' << m_Transcript;
466  }
467 }
468 
469 
470 
472 {
  // Reads the numeric columns of an m8 line from the text at m8:
  // identity (percent), length, mismatches, gaps, four coordinates,
  // e-value and score, optionally followed by a transcript.
  // NOTE(review): the signature line of this definition and the opening
  // lines of both throw statements are missing from this listing.
473  CNcbiIstrstream iss (m8);
474  double identity100, evalue, score;
475  TCoord a, b, c, d;
476  iss >> identity100 >> m_Length >> m_Mismatches >> m_Gaps
477  >> a >> b >> c >> d >> evalue >> score;
478 
479  if(iss.fail() == false) {
480 
  // Identity is stored as a fraction, not a percentage.
481  m_Identity = float(identity100 / 100.0);
482  m_EValue = evalue;
483  m_Score = float(score);
484  m_RawScore = (TCoord)score;//wrong, but better than 0
485 
  // All four coordinates must be non-zero; they are stored minus one.
486  if(a > 0 && b > 0 && c > 0 && d > 0) {
487 
488  SetQueryStart(a - 1);
489  SetQueryStop(b - 1);
490  SetSubjStart(c - 1);
491  SetSubjStop(d - 1);
492  }
493  else {
495  "Coordinates in m8 string are expected to be one-based: "
496  + string(m8));
497  }
498 
  // The transcript column is optional: it is read only if the stream is
  // still good after the numeric fields, otherwise it stays empty.
499  m_Transcript.resize(0);
500  if(iss.good())
501  iss >> m_Transcript;
502  }
503  else {
504 
506  "Failed to init from m8 string: "
507  + string(m8));
508  }
509 }
510 
511 
/// Move one end of the hit and rescale the tabular statistics to match.
///
/// The coordinate change itself is delegated to TParent::Modify(). The
/// statistics are then multiplied by a factor kq:
///   - with a transcript: decoded transcript length after / before;
///   - without one: query span after / before.
///
/// @param where   which end to modify; passed through to TParent::Modify().
/// @param new_pos new coordinate; passed through to TParent::Modify().
///
/// NOTE(review): in this listing one statement is missing from each scaling
/// branch (right after kq is computed), and the opening of the throw inside
/// the conditional section is missing too.
512 void CBlastTabular::Modify(Uint1 where, TCoord new_pos)
513 {
514  const size_t trlen = GetTranscript().size();
515  if(trlen > 0) {
516 
517 
518 #ifdef ALGO_ALIGN_UTIL_BT_MM_INCLUDED
519 
520  // This is accurate but assumes that mismatches are included
521  // in the transcript, not just matches coding for generic diags.
522  // So keep it commented out for a while.
523 
  // NOTE(review): if this macro were defined, the code after #endif
  // would still run and call TParent::Modify() a second time.
524  const TCoord matches_old = TCoord(trlen * GetIdentity());
525  TParent::Modify(where, new_pos);
526  const TTranscript& tr_new = GetTranscript();
527 
  // Scan of a run-length style transcript: a letter ('M' or 'R')
  // followed by optional decimal digits giving the run count; a letter
  // with no digits counts as 1.
528  TCoord gaps = 0;
529  TCoord matches = 0;
530  TCoord mismatches = 0;
531  bool m = false, mm = false;
532  TCoord mcnt = 0, mmcnt = 0;
533  ITERATE(TTranscript, ii, tr_new) {
534 
535  char c = *ii;
536  if(c == 'D' || c == 'I') {
537  ++gaps;
538  if(m) {
539  matches += mcnt == 0? 1: mcnt;
540  m = false;
541  }
542  if(mm) {
  // NOTE(review): tests mcnt but adds mmcnt; the matching code
  // after the loop tests mmcnt - this looks like a typo.
543  mismatches += mcnt == 0? 1: mmcnt;
544  mm = false;
545  }
546  }
547  else if (c == 'M') {
548  m = true;
549  mcnt = 0;
550  }
551  else if (c == 'R') {
552  mm = true;
553  mmcnt = 0;
554  }
555  else if('0' <= c && c <= '9') {
556  if(m) {
557  mcnt = 10*mcnt + c - '0';
558  }
559  if(mm) {
560  mmcnt = 10*mmcnt + c - '0';
561  }
562  }
563  else {
565  "Unexpected transcript symbol.");
566  }
567  }
  // Flush the run that was still open when the transcript ended.
568  if(m) {
569  matches += mcnt == 0? 1: mcnt;
570  }
571  if(mm) {
572  mismatches += mmcnt == 0? 1: mmcnt;
573  }
574  SetMismatches(mismatches);
575  SetGaps(gaps);
  // NOTE(review): 'indels' is not declared anywhere in this function.
576  SetLength(matches + mismatches + indels);
577  SetScore(GetScore() * matches / double(matches_old));
578  SetRawScore( ( GetRawScore() * matches ) / matches_old);
579 #endif
580 
  // Scale by the change in decoded transcript length.
581  const TCoord trlen_old = s_RunLengthDecode(GetTranscript()).size();
582  TParent::Modify(where, new_pos);
583  const TCoord trlen_new = s_RunLengthDecode(GetTranscript()).size();
584  const double kq = double(trlen_new) / trlen_old;
586  SetGaps(TCoord(kq*GetGaps()));
587  SetLength(trlen_new);
588  SetScore(kq*GetScore());
589  SetRawScore((TCoord)(kq*GetRawScore()));
590  }
591  else {
  // No transcript: scale by the change in query span.
  // NOTE(review): a zero query_span_old would make kq a division by
  // zero - confirm whether GetQuerySpan() can return 0.
592  const TCoord query_span_old = GetQuerySpan();
593  TParent::Modify(where, new_pos);
594  const TCoord query_span_new = GetQuerySpan();
595  const double kq = double(query_span_new) / query_span_old;
597  SetGaps(TCoord(kq*GetGaps()));
598  SetLength(TCoord(kq*GetLength()));
599  SetScore(kq*GetScore());
600  SetRawScore((TCoord)(kq*GetRawScore()));
601  }
602 }
603 
605 
USING_SCOPE(objects)
TSeqPos TCoord
TCoord GetQueryStart(void) const
TCoord GetSubjStop(void) const
TTranscript m_Transcript
std::pair< TId, TId > m_Id
static string s_RunLengthDecode(const string &in)
virtual void Modify(Uint1 point, TCoord new_pos)
TCoord GetSubjStart(void) const
string TTranscript
void SetSubjStart(TCoord pos)
TCoord GetQueryStop(void) const
void SetQueryStart(TCoord pos)
void SetSubjStop(TCoord pos)
TCoord GetQuerySpan(void) const
const TTranscript & GetTranscript(void) const
void SetQueryStop(TCoord pos)
TParent::TCoord TCoord
void SetMismatches(TCoord mismatches)
void SetLength(TCoord length)
float GetScore(void) const
void SetRawScore(TCoord score)
void SetEValue(double evalue)
void SetScore(float score)
virtual void Modify(Uint1 where, TCoord new_pos)
TCoord GetLength(void) const
TCoord GetGaps(void) const
virtual void x_PartialSerialize(CNcbiOstream &os) const
int(* SCORE_FUNC)(const CRef< objects::CSeq_id > &id)
Construct CBlastTabular from m8 line, use score_func to select seq-id from FASTA-style ids.
void SetIdentity(float identity)
float GetIdentity(void) const
virtual void x_PartialDeserialize(const char *m8)
TCoord GetMismatches(void) const
void x_Deserialize(const char *m8, F seq_id_extractor)
double GetEValue(void) const
static void sx_MineSegment(size_t where, const objects::CStd_seg::TLoc &locs, TSeqPos *delta, TSeqPos *prev)
TCoord GetRawScore(void) const
void SetGaps(TCoord gaps)
CConstRef –.
Definition: ncbiobj.hpp:1266
bool GetNamedScore(const string &id, int &score) const
Get score.
Definition: Seq_align.cpp:563
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
static SIZE_TYPE ParseFastaIds(CBioseq::TId &ids, const CTempString &s, bool allow_partial_failure=false)
Parse an entire set of |-delimited FASTA-style IDs, appending the results to IDS.
Definition: Seq_id.cpp:2603
ENa_strand GetStrand(void) const
Get the location's strand.
Definition: Seq_loc.cpp:882
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
C::value_type FindBestChoice(const C &container, F score_func)
Find the best choice (lowest score) for values in a container.
Definition: ncbiutil.hpp:250
TStr & SetStr(void)
Select the variant.
Definition: Object_id_.hpp:304
vector< CRef< CSeq_loc > > TLoc
Definition: Std_seg_.hpp:93
list< CRef< CStd_seg > > TStd
Definition: Seq_align_.hpp:196
vector< TSeqPos > TLens
Definition: Dense_seg_.hpp:108
vector< TSignedSeqPos > TStarts
Definition: Dense_seg_.hpp:107
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
TFrom GetFrom(void) const
Get the From member data.
TLocal & SetLocal(void)
Select the variant.
Definition: Seq_id_.cpp:199
TTo GetTo(void) const
Get the To member data.
bool IsInt(void) const
Check if variant Int is selected.
Definition: Seq_loc_.hpp:528
const TInt & GetInt(void) const
Get the variant data.
Definition: Seq_loc_.cpp:194
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
list< CRef< CSeq_id > > TId
Definition: Bioseq_.hpp:94
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
int len
unsigned int a
Definition: ncbi_localip.c:102
#define F(x)
Make a parametrized function appear to have only one variable.
Definition: ncbi_math.c:342
int isspace(Uchar c)
Definition: ncbictype.hpp:69
Useful/utility classes and methods.
Int4 delta(size_t dimension_, const Int4 *score_)
Modified on Sat Apr 13 11:46:47 2024 by modify_doxy.py rev. 669887