NCBI C++ ToolKit
glb_align.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: glb_align.cpp 100392 2023-07-27 15:57:51Z souvorov $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Alexandre Souvorov
27  *
28  * File Description:
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
35 #include <sstream>
36 
38 BEGIN_SCOPE(gnomon)
39 
40 using namespace std;
41 
42 CCigar::CCigar(string& cigar_string, int qfrom, int sfrom) {
43  m_qfrom = qfrom;
44  m_sfrom = sfrom;
45 
46  m_qto = m_qfrom-1;
47  m_sto = m_sfrom-1;
48  istringstream istr(cigar_string);
49  int len;
50  char c;
51  int clip_pos = 0;
52  while(istr >> len >> c) {
53  if(c == 'S') {
54  if(m_elements.empty()) { //new query start
55  m_qfrom += len;
56  m_qto = m_qfrom-1;
57  }
58  } else if(c == 'N') {
59  if(m_elements.empty()) { // new subject start
60  m_sfrom += len;
61  m_sto = m_sfrom-1;
62  } else {
63  cigar_string = cigar_string.substr(clip_pos); // put back intron for next exon cigar
64  return;
65  }
66  } else {
67  PushBack(SElement(len, c));
68  }
69  clip_pos = (int)istr.tellg();
70  }
71  cigar_string.clear();
72 }
73 
74 void CCigar::PushFront(const SElement& el) {
75  if(el.m_type == 'M') {
76  m_qfrom -= el.m_len;
77  m_sfrom -= el.m_len;
78  } else if(el.m_type == 'D')
79  m_sfrom -= el.m_len;
80  else
81  m_qfrom -= el.m_len;
82 
83  if(m_elements.empty() || m_elements.front().m_type != el.m_type)
84  m_elements.push_front(el);
85  else
86  m_elements.front().m_len += el.m_len;
87 }
88 
89 void CCigar::PushBack(const SElement& el) {
90  if(el.m_type == 'M') {
91  m_qto += el.m_len;
92  m_sto += el.m_len;
93  } else if(el.m_type == 'D')
94  m_sto += el.m_len;
95  else
96  m_qto += el.m_len;
97 
98  if(m_elements.empty() || m_elements.back().m_type != el.m_type)
99  m_elements.push_back(el);
100  else
101  m_elements.back().m_len += el.m_len;
102 }
103 
104 string CCigar::CigarString(int qstart, int qlen) const {
105  string cigar;
106  ITERATE(list<SElement>, i, m_elements)
107  cigar += NStr::IntToString(i->m_len)+i->m_type;
108 
109  int missingstart = qstart+m_qfrom;
110  if(missingstart > 0)
111  cigar = NStr::IntToString(missingstart)+"S"+cigar;
112  int missingend = qlen-1-m_qto-qstart;
113  if(missingend > 0)
114  cigar += NStr::IntToString(missingend)+"S";
115 
116  return cigar;
117 }
118 
119 TInDels CCigar::GetInDels(int sstart, const char* const query, const char* subject) const {
120  TInDels indels;
121 
122  int qpos = m_qfrom;
123  int spos = m_sfrom;
124  ITERATE(list<SElement>, i, m_elements) {
125  if(i->m_type == 'M') {
126  bool is_match = *(query+qpos) == *(subject+spos);
127  int len = 0;
128  for(int l = 0; l < i->m_len; ++l) {
129  if((*(query+qpos) == *(subject+spos)) == is_match) {
130  ++len;
131  } else {
132  if(!is_match) {
133  CInDelInfo indl(spos-len+sstart, len, CInDelInfo::eMism, string(query+qpos-len, len));
134  indels.push_back(indl);
135  }
136  is_match = !is_match;
137  len = 1;
138  }
139  ++qpos;
140  ++spos;
141  }
142  if(!is_match) {
143  CInDelInfo indl(spos-len+sstart, len, CInDelInfo::eMism, string(query+qpos-len, len));
144  indels.push_back(indl);
145  }
146  } else if(i->m_type == 'D') {
147  CInDelInfo indl(spos+sstart, i->m_len, CInDelInfo::eIns);
148  indels.push_back(indl);
149  spos += i->m_len;
150  } else {
151  CInDelInfo indl(spos+sstart, i->m_len, CInDelInfo::eDel, string(query+qpos, i->m_len));
152  indels.push_back(indl);
153  qpos += i->m_len;
154  }
155  }
156 
157  return indels;
158 }
159 
160 string CCigar::DetailedCigarString(int qstart, int qlen, const char* query, const char* subject) const {
161  string cigar;
162  query += m_qfrom;
163  subject += m_sfrom;
164  ITERATE(list<SElement>, i, m_elements) {
165  if(i->m_type == 'M') {
166  bool is_match = *query == *subject;
167  int len = 0;
168  for(int l = 0; l < i->m_len; ++l) {
169  if((*query == *subject) == is_match) {
170  ++len;
171  } else {
172  cigar += NStr::IntToString(len)+ (is_match ? "=" : "X");
173  is_match = !is_match;
174  len = 1;
175  }
176  ++query;
177  ++subject;
178  }
179  cigar += NStr::IntToString(len)+ (is_match ? "=" : "X");
180  } else if(i->m_type == 'D') {
181  cigar += NStr::IntToString(i->m_len)+i->m_type;
182  subject += i->m_len;
183  } else {
184  cigar += NStr::IntToString(i->m_len)+i->m_type;
185  query += i->m_len;
186  }
187  }
188 
189  int missingstart = qstart+m_qfrom;
190  if(missingstart > 0)
191  cigar = NStr::IntToString(missingstart)+"S"+cigar;
192  int missingend = qlen-1-m_qto-qstart;
193  if(missingend > 0)
194  cigar += NStr::IntToString(missingend)+"S";
195 
196  return cigar;
197 }
198 
199 TCharAlign CCigar::ToAlign(const char* query, const char* subject) const {
200  TCharAlign align;
201  query += m_qfrom;
202  subject += m_sfrom;
203  ITERATE(list<SElement>, i, m_elements) {
204  if(i->m_type == 'M') {
205  align.first.insert(align.first.end(), query, query+i->m_len);
206  query += i->m_len;
207  align.second.insert(align.second.end(), subject, subject+i->m_len);
208  subject += i->m_len;
209  } else if(i->m_type == 'D') {
210  align.first.insert(align.first.end(), i->m_len, '-');
211  align.second.insert(align.second.end(), subject, subject+i->m_len);
212  subject += i->m_len;
213  } else {
214  align.first.insert(align.first.end(), query, query+i->m_len);
215  query += i->m_len;
216  align.second.insert(align.second.end(), i->m_len, '-');
217  }
218  }
219 
220  return align;
221 }
222 
223 int CCigar::Matches(const char* query, const char* subject) const {
224  int matches = 0;
225  query += m_qfrom;
226  subject += m_sfrom;
227  ITERATE(list<SElement>, i, m_elements) {
228  if(i->m_type == 'M') {
229  for(int l = 0; l < i->m_len; ++l) {
230  if(*query == *subject)
231  ++matches;
232  ++query;
233  ++subject;
234  }
235  } else if(i->m_type == 'D') {
236  subject += i->m_len;
237  } else {
238  query += i->m_len;
239  }
240  }
241 
242  return matches;
243 }
244 
245 int CCigar::Distance(const char* query, const char* subject) const {
246  int dist = 0;
247  query += m_qfrom;
248  subject += m_sfrom;
249  ITERATE(list<SElement>, i, m_elements) {
250  if(i->m_type == 'M') {
251  for(int l = 0; l < i->m_len; ++l) {
252  if(*query != *subject)
253  ++dist;
254  ++query;
255  ++subject;
256  }
257  } else if(i->m_type == 'D') {
258  subject += i->m_len;
259  dist += i->m_len;
260  } else {
261  query += i->m_len;
262  dist += i->m_len;
263  }
264  }
265 
266  return dist;
267 }
268 
269 int CCigar::Score(const char* query, const char* subject, int gopen, int gapextend, const char delta[256][256]) const {
270  int score = 0;
271 
272  query += m_qfrom;
273  subject += m_sfrom;
274  ITERATE(list<SElement>, i, m_elements) {
275  if(i->m_type == 'M') {
276  for(int l = 0; l < i->m_len; ++l) {
277  score += delta[(int)*query][(int)*subject];
278  ++query;
279  ++subject;
280  }
281  } else if(i->m_type == 'D') {
282  subject += i->m_len;
283  score -= gopen+gapextend*i->m_len;
284  } else {
285  query += i->m_len;
286  score -= gopen+gapextend*i->m_len;
287  }
288  }
289 
290  return score;
291 }
292 
293 CCigar GlbAlign(const char* a, int na, const char* b, int nb, int rho, int sigma, const char delta[256][256]) {
294  // rho - new gap penalty (one base gap rho+sigma)
295  // sigma - extension penalty
296 
297  int* s = new int[nb+1]; // best scores in current a-raw
298  int* sm = new int[nb+1]; // best scores in previous a-raw
299  int* gapb = new int[nb+1]; // best score with b-gap
300  char* mtrx = new char[(na+1)*(nb+1)]; // backtracking info (Astart/Bstart gap start, Agap/Bgap best score has gap and should be backtracked to Asrt/Bsart)
301 
302  int rs = rho+sigma;
303  int bignegative = numeric_limits<int>::min()/2;
304 
305  sm[0] = 0;
306  sm[1] = -rs; // scores for -------------- (the best scores for i == -1)
307  for(int i = 2; i <= nb; ++i) // BBBBBBBBBBBBBB
308  sm[i] = sm[i-1]-sigma;
309  s[0] = -rs; // score for A (the best score for j == -1 and i == 0)
310  // -
311  for(int i = 0; i <= nb; ++i)
312  gapb[i] = bignegative;
313 
314  enum{Agap = 1, Bgap = 2, Astart = 4, Bstart = 8};
315 
316  mtrx[0] = 0;
317  for(int i = 1; i <= nb; ++i) { // ---------------
318  mtrx[i] = Agap; // BBBBBBBBBBBBBBB
319  }
320  mtrx[1] |= Astart;
321 
322  char* m = mtrx+nb;
323  for(int i = 0; i < na; ++i) {
324  *(++m) = Bstart|Bgap; //AAAAAAAAAAAAAAA
325  //---------------
326  int gapa = bignegative;
327  int ai = a[i];
328  const char* matrix = delta[ai];
329  int* sp = s;
330  for(int j = 0; j < nb; ) {
331  *(++m) = 0;
332  int ss = sm[j]+matrix[(int)b[j]];
333 
334  gapa -= sigma;
335  if(*sp-rs > gapa) { // for j == 0 this will open AAAAAAAAAAA- which could be used if mismatch is very expensive
336  gapa = *sp-rs; // -----------B
337  *m |= Astart;
338  }
339 
340  int& gapbj = gapb[++j];
341  gapbj -= sigma;
342  if(sm[j]-rs > gapbj) { // for i == 0 this will open BBBBBBBBBBB- which could be used if mismatch is very expensive
343  gapbj = sm[j]-rs; // -----------A
344  *m |= Bstart;
345  }
346 
347  if(gapa > gapbj) {
348  if(ss > gapa) {
349  *(++sp) = ss;
350  } else {
351  *(++sp) = gapa;
352  *m |= Agap;
353  }
354  } else {
355  if(ss > gapbj) {
356  *(++sp) = ss;
357  } else {
358  *(++sp) = gapbj;
359  *m |= Bgap;
360  }
361  }
362  }
363  swap(sm,s);
364  *s = *sm-sigma;
365  }
366 
367  int ia = na-1;
368  int ib = nb-1;
369  m = mtrx+(na+1)*(nb+1)-1;
370  CCigar track(ia, ib);
371  while(ia >= 0 || ib >= 0) {
372  if(*m&Agap) {
373  int len = 1;
374  while(!(*m&Astart)) {
375  ++len;
376  --m;
377  }
378  --m;
379  ib -= len;
380  track.PushFront(CCigar::SElement(len,'D'));
381  } else if(*m&Bgap) {
382  int len = 1;
383  while(!(*m&Bstart)) {
384  ++len;
385  m -= nb+1;
386  }
387  m -= nb+1;
388  ia -= len;
389  track.PushFront(CCigar::SElement(len,'I'));
390  } else {
391  track.PushFront(CCigar::SElement(1,'M'));
392  --ia;
393  --ib;
394  m -= nb+2;
395  }
396  }
397 
398  delete[] s;
399  delete[] sm;
400  delete[] gapb;
401  delete[] mtrx;
402 
403  return track;
404 }
405 
406 
407 CCigar LclAlign(const char* a, int na, const char* b, int nb, int rho, int sigma, const char delta[256][256]) {
408  // rho - new gap penalty (one base gap rho+sigma)
409  // sigma - extension penalty
410 
411  int* s = new int[nb+1]; // best scores in current a-raw
412  int* sm = new int[nb+1]; // best scores in previous a-raw
413  int* gapb = new int[nb+1]; // best score with b-gap
414  char* mtrx = new char[(na+1)*(nb+1)]; // backtracking info (Astart/Bstart gap start, Agap/Bgap best score has gap and should be backtracked to Asrt/Bsart; Zero stop bactracking)
415 
416  int rs = rho+sigma;
417 
418  enum{Agap = 1, Bgap = 2, Astart = 4, Bstart = 8, Zero = 16};
419 
420  for(int i = 0; i <= nb; ++i) {
421  sm[i] = 0;
422  mtrx[i] = Zero;
423  gapb[i] = 0;
424  }
425  s[0] = 0;
426 
427  int max_score = 0;
428  char* max_ptr = mtrx;
429  char* m = mtrx+nb;
430 
431  for(int i = 0; i < na; ++i) {
432  *(++m) = Zero;
433  int gapa = 0;
434  int ai = a[i];
435  const char* matrix = delta[ai];
436  int* sp = s;
437  for(int j = 0; j < nb; ) {
438  *(++m) = 0;
439  int ss = sm[j]+matrix[(int)b[j]];
440 
441  gapa -= sigma;
442  if(*sp-rs > gapa) {
443  gapa = *sp-rs;
444  *m |= Astart;
445  }
446 
447  int& gapbj = gapb[++j];
448  gapbj -= sigma;
449  if(sm[j]-rs > gapbj) {
450  gapbj = sm[j]-rs;
451  *m |= Bstart;
452  }
453 
454  if(gapa > gapbj) {
455  if(ss > gapa) {
456  *(++sp) = ss;
457  if(ss > max_score) {
458  max_score = ss;
459  max_ptr = m;
460  }
461  } else {
462  *(++sp) = gapa;
463  *m |= Agap;
464  }
465  } else {
466  if(ss > gapbj) {
467  *(++sp) = ss;
468  if(ss > max_score) {
469  max_score = ss;
470  max_ptr = m;
471  }
472  } else {
473  *(++sp) = gapbj;
474  *m |= Bgap;
475  }
476  }
477  if(*sp <= 0) {
478  *sp = 0;
479  *m |= Zero;
480  }
481  }
482  swap(sm,s);
483  }
484 
485  int ia = (int)(max_ptr-mtrx)/(nb+1)-1;
486  int ib = (max_ptr-mtrx)%(nb+1)-1;
487  CCigar track(ia, ib);
488  m = max_ptr;
489  while((ia >= 0 || ib >= 0) && !(*m&Zero)) {
490  if(*m&Agap) {
491  int len = 1;
492  while(!(*m&Astart)) {
493  ++len;
494  --m;
495  }
496  --m;
497  ib -= len;
498  track.PushFront(CCigar::SElement(len,'D'));
499  } else if(*m&Bgap) {
500  int len = 1;
501  while(!(*m&Bstart)) {
502  ++len;
503  m -= nb+1;
504  }
505  m -= nb+1;
506  ia -= len;
507  track.PushFront(CCigar::SElement(len,'I'));
508  } else {
509  track.PushFront(CCigar::SElement(1,'M'));
510  --ia;
511  --ib;
512  m -= nb+2;
513  }
514  }
515 
516  delete[] s;
517  delete[] sm;
518  delete[] gapb;
519  delete[] mtrx;
520 
521  return track;
522 }
523 
524 
525 CCigar LclAlign(const char* a, int na, const char* b, int nb, int rho, int sigma, bool pinleft, bool pinright, const char delta[256][256]) {
526  // rho - new gap penalty (one base gap rho+sigma)
527  // sigma - extension penalty
528 
529  int* s = new int[nb+1]; // best scores in current a-raw
530  int* sm = new int[nb+1]; // best scores in previous a-raw
531  int* gapb = new int[nb+1]; // best score with b-gap
532  char* mtrx = new char[(na+1)*(nb+1)]; // backtracking info (Astart/Bstart gap start, Agap/Bgap best score has gap and should be backtracked to Asrt/Bsart; Zero stop bactracking)
533 
534  int rs = rho+sigma;
535  int bignegative = numeric_limits<int>::min()/2;
536 
537  enum{Agap = 1, Bgap = 2, Astart = 4, Bstart = 8, Zero = 16};
538 
539  sm[0] = 0;
540  mtrx[0] = 0;
541  gapb[0] = bignegative; // not used
542  if(pinleft) {
543  if(nb > 0) {
544  sm[1] = -rs;
545  mtrx[1] = Astart|Agap;
546  gapb[1] = bignegative;
547  for(int i = 2; i <= nb; ++i) {
548  sm[i] = sm[i-1]-sigma;
549  mtrx[i] = Agap;
550  gapb[i] = bignegative;
551  }
552  }
553  s[0] = -rs;
554  } else {
555  for(int i = 1; i <= nb; ++i) {
556  sm[i] = 0;
557  mtrx[i] = Zero;
558  gapb[i] = bignegative;
559  }
560  s[0] = 0;
561  }
562 
563  int max_score = 0;
564  char* max_ptr = mtrx;
565 
566  char* m = mtrx+nb;
567  for(int i = 0; i < na; ++i) {
568  *(++m) = pinleft ? Bstart|Bgap : Zero;
569  int gapa = bignegative;
570  int ai = a[i];
571 
572  const char* matrix = delta[ai];
573  int* sp = s;
574  for(int j = 0; j < nb; ) {
575  *(++m) = 0;
576  int ss = sm[j]+matrix[(int)b[j]];
577 
578  gapa -= sigma;
579  if(*sp-rs > gapa) {
580  gapa = *sp-rs;
581  *m |= Astart;
582  }
583 
584  int& gapbj = gapb[++j];
585  gapbj -= sigma;
586  if(sm[j]-rs > gapbj) {
587  gapbj = sm[j]-rs;
588  *m |= Bstart;
589  }
590 
591  if(gapa > gapbj) {
592  if(ss > gapa) {
593  *(++sp) = ss;
594  if(ss > max_score) {
595  max_score = ss;
596  max_ptr = m;
597  }
598  } else {
599  *(++sp) = gapa;
600  *m |= Agap;
601  }
602  } else {
603  if(ss > gapbj) {
604  *(++sp) = ss;
605  if(ss > max_score) {
606  max_score = ss;
607  max_ptr = m;
608  }
609  } else {
610  *(++sp) = gapbj;
611  *m |= Bgap;
612  }
613  }
614  if(*sp <= 0 && !pinleft) {
615  *sp = 0;
616  *m |= Zero;
617  }
618  }
619  swap(sm,s);
620  if(pinleft)
621  *s = *sm-sigma;
622  }
623 
624  int maxa, maxb;
625  if(pinright) {
626  maxa = na-1;
627  maxb = nb-1;
628  max_score = sm[nb];
629  } else {
630  maxa = (int)(max_ptr-mtrx)/(nb+1)-1;
631  maxb = (max_ptr-mtrx)%(nb+1)-1;
632  m = max_ptr;
633  }
634  int ia = maxa;
635  int ib = maxb;
636  CCigar track(ia, ib);
637  while((ia >= 0 || ib >= 0) && !(*m&Zero)) {
638  if(*m&Agap) {
639  int len = 1;
640  while(!(*m&Astart)) {
641  ++len;
642  --m;
643  }
644  --m;
645  ib -= len;
646  track.PushFront(CCigar::SElement(len,'D'));
647  } else if(*m&Bgap) {
648  int len = 1;
649  while(!(*m&Bstart)) {
650  ++len;
651  m -= nb+1;
652  }
653  m -= nb+1;
654  ia -= len;
655  track.PushFront(CCigar::SElement(len,'I'));
656  } else {
657  track.PushFront(CCigar::SElement(1,'M'));
658  --ia;
659  --ib;
660  m -= nb+2;
661  }
662  }
663 
664  delete[] s;
665  delete[] sm;
666  delete[] gapb;
667  delete[] mtrx;
668 
669  return track;
670 }
671 
672 CCigar VariBandAlign(const char* a, int na, const char* b, int nb, int rho, int sigma, const char delta[256][256], const TSignedSeqRange* blimits) {
673  // rho - new gap penalty (one base gap rho+sigma)
674  // sigma - extension penalty
675 
676  int* s = new int[nb+1]; // best scores in current a-raw
677  int* sm = new int[nb+1]; // best scores in previous a-raw
678  int* gapb = new int[nb+1]; // best score with b-gap
679  char* mtrx = new char[(na+1)*(nb+1)]; // backtracking info (Astart/Bstart gap start, Agap/Bgap best score has gap and should be backtracked to Asrt/Bsart; Zero stop bactracking)
680 
681  int rs = rho+sigma;
682 
683  enum{Agap = 1, Bgap = 2, Astart = 4, Bstart = 8, Zero = 16};
684 
685  for(int i = 0; i <= nb; ++i) {
686  s[i] = 0;
687  sm[i] = 0;
688  gapb[i] = 0;
689  mtrx[i] = Zero;
690  }
691 
692  int max_score = 0;
693  char* max_ptr = mtrx;
694  char* m = mtrx+nb;
695 
696  const TSignedSeqRange* last = blimits+na;
697  while(true) {
698  int ai = *a++;
699  const char* matrix = delta[ai];
700 
701  int bleft = blimits->GetFrom();
702  int bright = blimits->GetTo();
703  m += bleft;
704  *(++m) = Zero;
705  int gapa = 0;
706  int* sp = s+bleft;
707  *sp = 0;
708  for(int j = bleft; j <= bright; ) {
709  *(++m) = 0;
710  int ss = sm[j]+matrix[(int)b[j]];
711 
712  gapa -= sigma;
713  if(*sp-rs > gapa) {
714  gapa = *sp-rs;
715  *m |= Astart;
716  }
717 
718  int& gapbj = gapb[++j];
719  gapbj -= sigma;
720  if(sm[j]-rs > gapbj) {
721  gapbj = sm[j]-rs;
722  *m |= Bstart;
723  }
724 
725  if(gapa > gapbj) {
726  if(ss > gapa) {
727  *(++sp) = ss;
728  if(ss > max_score) {
729  max_score = ss;
730  max_ptr = m;
731  }
732  } else {
733  *(++sp) = gapa;
734  *m |= Agap;
735  }
736  } else {
737  if(ss > gapbj) {
738  *(++sp) = ss;
739  if(ss > max_score) {
740  max_score = ss;
741  max_ptr = m;
742  }
743  } else {
744  *(++sp) = gapbj;
745  *m |= Bgap;
746  }
747  }
748  if(*sp <= 0) {
749  *sp = 0;
750  *m |= Zero;
751  }
752  }
753  if(++blimits == last)
754  break;
755 
756  swap(sm,s);
757 
758  int next = blimits->GetTo();
759  for( ; bright < next-1; ++bright)
760  *(++m) = Zero;
761 
762  m += nb-bright-1;
763  }
764 
765  int ia = (int)(max_ptr-mtrx)/(nb+1)-1;
766  int ib = (max_ptr-mtrx)%(nb+1)-1;
767  CCigar track(ia, ib);
768  m = max_ptr;
769  while((ia >= 0 || ib >= 0) && !(*m&Zero)) {
770  if(*m&Agap) {
771  int len = 1;
772  while(!(*m&Astart)) {
773  ++len;
774  --m;
775  }
776  --m;
777  ib -= len;
778  track.PushFront(CCigar::SElement(len,'D'));
779  } else if(*m&Bgap) {
780  int len = 1;
781  while(!(*m&Bstart)) {
782  ++len;
783  m -= nb+1;
784  }
785  m -= nb+1;
786  ia -= len;
787  track.PushFront(CCigar::SElement(len,'I'));
788  } else {
789  track.PushFront(CCigar::SElement(1,'M'));
790  --ia;
791  --ib;
792  m -= nb+2;
793  }
794  }
795 
796  delete[] s;
797  delete[] sm;
798  delete[] gapb;
799  delete[] mtrx;
800 
801  return track;
802 }
803 
804 
805 SMatrix::SMatrix(int match, int mismatch) { // matrix for DNA
806 
807  for(int i = 0; i < 256; ++i) {
808  int c = toupper(i);
809  for(int j = 0; j < 256; ++j) {
810  if(c != 'N' && c == toupper(j)) matrix[i][j] = match;
811  else matrix[i][j] = -mismatch;
812  }
813  }
814 }
815 
816 SMatrix::SMatrix() { // matrix for proteins
817 
818  string aa("ARNDCQEGHILKMFPSTWYVBZX*");
819  int scores[] = {
820  4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-2,-1, 0,-4,
821 -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-1, 0,-1,-4,
822 -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3, 3, 0,-1,-4,
823 -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3, 4, 1,-1,-4,
824  0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-3,-3,-2,-4,
825 -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2, 0, 3,-1,-4,
826 -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2, 1, 4,-1,-4,
827  0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-1,-2,-1,-4,
828 -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3, 0, 0,-1,-4,
829 -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-3,-3,-1,-4,
830 -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-3,-1,-4,
831 -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2, 0, 1,-1,-4,
832 -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-3,-1,-1,-4,
833 -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-3,-3,-1,-4,
834 -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-2,-1,-2,-4,
835  1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2, 0, 0, 0,-4,
836  0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-1,-1, 0,-4,
837 -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-3,-2,-4,
838 -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-3,-2,-1,-4,
839  0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-3,-2,-1,-4,
840 -2,-1, 3, 4,-3, 0, 1,-1, 0,-3,-4, 0,-3,-3,-2, 0,-1,-4,-3,-3, 4, 1,-1,-4,
841 -1, 0, 0, 1,-3, 3, 4,-2, 0,-3,-3, 1,-1,-3,-1, 0,-1,-3,-2,-2, 1, 4,-1,-4,
842  0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-1,-1,-1,-4,
843 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1
844  };
845 
846  for(int i = 0; i < 256; ++i) {
847  for(int j = 0; j < 256; ++j) {
848  matrix[i][j] = 0;
849  }
850  }
851 
852  int num = (int)aa.size();
853  for(int i = 0; i < num; ++i) {
854  char c = aa[i];
855  for(int j = 0; j < num; ++j) {
856  int score = scores[num*j+i];
857  char d = aa[j];
858  matrix[(int)c][(int)d] = score;
859  matrix[(int)tolower(c)][(int)tolower(d)] = score;
860  matrix[(int)c][(int)tolower(d)] = score;
861  matrix[(int)tolower(c)][(int)d] = score;
862  }
863  }
864 }
865 
866 double Entropy(const string& seq) {
867  int length = (int)seq.size();
868  if(length == 0)
869  return 0;
870  double tA = 1.e-8;
871  double tC = 1.e-8;
872  double tG = 1.e-8;
873  double tT = 1.e-8;
874  ITERATE(string, i, seq) {
875  switch(*i) {
876  case 'A': tA += 1; break;
877  case 'C': tC += 1; break;
878  case 'G': tG += 1; break;
879  case 'T': tT += 1; break;
880  default: break;
881  }
882  }
883  double entropy = -(tA*log(tA/length)+tC*log(tC/length)+tG*log(tG/length)+tT*log(tT/length))/(length*log(4.));
884 
885  return entropy;
886 }
887 
888 
889 
890 END_SCOPE(gnomon)
892 
893 
894 
void PushFront(const SElement &el)
Definition: glb_align.cpp:74
TCharAlign ToAlign(const char *query, const char *subject) const
Definition: glb_align.cpp:199
void PushBack(const SElement &el)
Definition: glb_align.cpp:89
int Distance(const char *query, const char *subject) const
Definition: glb_align.cpp:245
CCigar(int qto=-1, int sto=-1)
Definition: glb_align.hpp:46
TInDels GetInDels(int sstart, const char *const query, const char *subject) const
Definition: glb_align.cpp:119
string CigarString(int qstart, int qlen) const
Definition: glb_align.cpp:104
int Score(const char *query, const char *subject, int gopen, int gapextend, const char delta[256][256]) const
Definition: glb_align.cpp:269
string DetailedCigarString(int qstart, int qlen, const char *query, const char *subject) const
Definition: glb_align.cpp:160
int Matches(const char *query, const char *subject) const
Definition: glb_align.cpp:223
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
double Entropy(const string &seq)
Definition: glb_align.cpp:866
CCigar LclAlign(const char *a, int na, const char *b, int nb, int rho, int sigma, const char delta[256][256])
Definition: glb_align.cpp:407
CCigar VariBandAlign(const char *a, int na, const char *b, int nb, int rho, int sigma, const char delta[256][256], const TSignedSeqRange *blimits)
Definition: glb_align.cpp:672
CCigar GlbAlign(const char *a, int na, const char *b, int nb, int rho, int sigma, const char delta[256][256])
Definition: glb_align.cpp:293
pair< string, string > TCharAlign
Definition: glb_align.hpp:42
vector< CInDelInfo > TInDels
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
Definition: ncbimisc.hpp:1508
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5078
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
int len
Magic spell ;-) needed for some weird compilers... very empiric.
unsigned int a
Definition: ncbi_localip.c:102
int tolower(Uchar c)
Definition: ncbictype.hpp:72
int toupper(Uchar c)
Definition: ncbictype.hpp:73
T min(T x_, T y_)
Int4 delta(size_t dimension_, const Int4 *score_)
static int match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket, PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb)
Definition: pcre2_match.c:594
static SLJIT_INLINE sljit_ins l(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
static string subject
static string query
Modified on Fri Sep 20 14:57:45 2024 by modify_doxy.py rev. 669887