NCBI C++ ToolKit
nw_aligner.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: nw_aligner.cpp 102917 2024-08-06 16:17:30Z badrazat $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Yuri Kapustin, Boris Kiryutin, Alexander Souvorov
27  *
28  * File Description: CNWAligner implementation
29  *
30  * ===========================================================================
31  *
32  */
33 
34 
35 #include <ncbi_pch.hpp>
36 
37 #include "nw_aligner_threads.hpp"
38 #include "messages.hpp"
39 
40 #include <corelib/ncbi_system.hpp>
45 
46 #include <objmgr/scope.hpp>
47 #include <objmgr/seq_vector.hpp>
48 
49 #include <algorithm>
50 
51 
54 
55 
57  : m_Wm(GetDefaultWm()),
58  m_Wms(GetDefaultWms()),
59  m_Wg(GetDefaultWg()),
60  m_Ws(GetDefaultWs()),
61  m_esf_L1(false), m_esf_R1(false), m_esf_L2(false), m_esf_R2(false),
62  m_SmithWaterman(false),
63  m_GapPreference(eLater),
64  m_abc(g_nwaligner_nucleotides),
65  m_ScoreMatrixInvalid(true),
66  m_prg_callback(0),
67  m_terminate(false),
68  m_Seq1Vec(),
69  m_Seq1(0), m_SeqLen1(0),
70  m_Seq2Vec(),
71  m_Seq2(0), m_SeqLen2(0),
72  m_PositivesAsMatches(false),
73  m_score(kInfMinus),
74  m_mt(false),
75  m_maxthreads(1),
76  m_MaxMem(GetDefaultSpaceLimit())
77 {
78  SetScoreMatrix(0);
79 }
80 
81 
82 CNWAligner::CNWAligner( const char* seq1, size_t len1,
83  const char* seq2, size_t len2,
84  const SNCBIPackedScoreMatrix* scoremat )
85 
86  : m_Wm(GetDefaultWm()),
87  m_Wms(GetDefaultWms()),
88  m_Wg(GetDefaultWg()),
89  m_Ws(GetDefaultWs()),
90  m_esf_L1(false), m_esf_R1(false), m_esf_L2(false), m_esf_R2(false),
91  m_SmithWaterman(false),
92  m_GapPreference(eLater),
93  m_abc(g_nwaligner_nucleotides),
94  m_ScoreMatrixInvalid(true),
95  m_prg_callback(0),
96  m_terminate(false),
97  m_Seq1Vec(&seq1[0], &seq1[0]+len1),
98  m_Seq1(&m_Seq1Vec[0]), m_SeqLen1(len1),
99  m_Seq2Vec(&seq2[0], &seq2[0]+len2),
100  m_Seq2(&m_Seq2Vec[0]), m_SeqLen2(len2),
101  m_PositivesAsMatches(false),
102  m_score(kInfMinus),
103  m_mt(false),
104  m_maxthreads(1),
105  m_MaxMem(GetDefaultSpaceLimit())
106 {
107  SetScoreMatrix(scoremat);
108  SetSequences(seq1, len1, seq2, len2);
109 }
110 
111 
112 CNWAligner::CNWAligner(const string& seq1,
113  const string& seq2,
114  const SNCBIPackedScoreMatrix* scoremat)
115  : m_Wm(GetDefaultWm()),
116  m_Wms(GetDefaultWms()),
117  m_Wg(GetDefaultWg()),
118  m_Ws(GetDefaultWs()),
119  m_esf_L1(false), m_esf_R1(false), m_esf_L2(false), m_esf_R2(false),
120  m_SmithWaterman(false),
121  m_GapPreference(eLater),
122  m_abc(g_nwaligner_nucleotides),
123  m_ScoreMatrixInvalid(true),
124  m_prg_callback(0),
125  m_terminate(false),
126  m_Seq1Vec(seq1.begin(), seq1.end()),
127  m_Seq1(&m_Seq1Vec[0]), m_SeqLen1(seq1.size()),
128  m_Seq2Vec(seq2.begin(), seq2.end()),
129  m_Seq2(&m_Seq2Vec[0]), m_SeqLen2(seq2.size()),
130  m_score(kInfMinus),
131  m_mt(false),
132  m_maxthreads(1),
133  m_MaxMem(GetDefaultSpaceLimit())
134 {
135  SetScoreMatrix(scoremat);
136  SetSequences(seq1, seq2);
137 };
138 
139 
140 void CNWAligner::SetSequences(const char* seq1, size_t len1,
141  const char* seq2, size_t len2,
142  bool verify)
143 {
144  if(!seq1 || !seq2) {
145  NCBI_THROW(CAlgoAlignException, eBadParameter,
147  }
148 
149  if(verify) {
150  size_t iErrPos1 = x_CheckSequence(seq1, len1);
151  if(iErrPos1 < len1) {
152  CNcbiOstrstream oss;
153  oss << "The first sequence is inconsistent with the current "
154  << "scoring matrix type. "
155  << "Position = " << iErrPos1
156  << " Symbol = '" << seq1[iErrPos1] << "'";
157 
158  string message = CNcbiOstrstreamToString(oss);
159  NCBI_THROW(CAlgoAlignException, eInvalidCharacter, message);
160  }
161 
162  size_t iErrPos2 = x_CheckSequence(seq2, len2);
163  if(iErrPos2 < len2) {
164  CNcbiOstrstream oss;
165  oss << "The second sequence is inconsistent with the current "
166  << "scoring matrix type. "
167  << "Position = " << iErrPos2
168  << " Symbol = '" << seq2[iErrPos2] << "'";
169 
170  string message = CNcbiOstrstreamToString(oss);
171  NCBI_THROW(CAlgoAlignException, eInvalidCharacter, message);
172  }
173  }
174  m_Seq1Vec.assign(&seq1[0], &seq1[0]+len1);
175  m_Seq2Vec.assign(&seq2[0], &seq2[0]+len2);
176  m_Seq1 = &m_Seq1Vec[0];
177  m_SeqLen1 = len1;
178  m_Seq2 = &m_Seq2Vec[0];
179  m_SeqLen2 = len2;
180  m_Transcript.clear();
181 }
182 
183 
184 void CNWAligner::SetSequences(const string& seq1,
185  const string& seq2,
186  bool verify)
187 {
188  SetSequences(seq1.data(), seq1.size(), seq2.data(), seq2.size(), verify);
189 }
190 
191 
192 void CNWAligner::SetEndSpaceFree(bool Left1, bool Right1,
193  bool Left2, bool Right2)
194 {
195  m_esf_L1 = Left1;
196  m_esf_R1 = Right1;
197  m_esf_L2 = Left2;
198  m_esf_R2 = Right2;
199 }
200 
202 {
204  if (SW) {
205  /// Smith-Waterman necessarily implies that all four ends are free
206  m_esf_L1 = m_esf_R1 = m_esf_L2 = m_esf_R2 = true;
207  }
208 }
209 
211 {
212  m_GapPreference = p;
213 }
214 
215 // evaluate score for each possible alignment;
216 // fill out backtrace matrix
217 // bit coding (four bits per value): D E Ec Fc
218 // D: 1 if diagonal; 0 - otherwise
219 // E: 1 if space in 1st sequence; 0 if space in 2nd sequence
220 // Ec: 1 if gap in 1st sequence was extended; 0 if it is was opened
221 // Fc: 1 if gap in 2nd sequence was extended; 0 if it is was opened
222 //
223 
224 const unsigned char kMaskFc = 0x01;
225 const unsigned char kMaskEc = 0x02;
226 const unsigned char kMaskE = 0x04;
227 const unsigned char kMaskD = 0x08;
228 
230 {
231 
232 
233  //check data integrity
234 
235  if( m_SmithWaterman && ( data->m_offset1 || m_SeqLen1 != data->m_len1 ||
236  data->m_offset2 || m_SeqLen2 != data->m_len2 ) ) {
237  NCBI_THROW(CAlgoAlignException, eBadParameter,
238  "Smith-Waterman not compatible with offsets provided");
239  }
240 
241  if( m_SmithWaterman && ( !data->m_esf_L1 || !data->m_esf_R1 ||
242  !data->m_esf_L2 || !data->m_esf_R2 ) ) {
243  NCBI_THROW(CAlgoAlignException, eBadParameter,
244  "Smith-Waterman not compatible with end gap penalties");
245  }
246 
247  const size_t N1 = data->m_len1 + 1;
248  const size_t N2 = data->m_len2 + 1;
249 
250  vector<TScore> stl_rowV (N2), stl_rowF(N2);
251 
252  const TNCBIScore (* sm) [NCBI_FSM_DIM] = m_ScoreMatrix.s;
253 
254  if(m_prg_callback) {
255  m_prg_info.m_iter_total = N1*N2;
258  return 0;
259  }
260  }
261 
262  bool bFreeGapLeft1 = data->m_esf_L1 && data->m_offset1 == 0;
263  bool bFreeGapRight1 = data->m_esf_R1 &&
264  m_SeqLen1 == data->m_offset1 + data->m_len1;
265 
266  bool bFreeGapLeft2 = data->m_esf_L2 && data->m_offset2 == 0;
267  bool bFreeGapRight2 = data->m_esf_R2 &&
268  m_SeqLen2 == data->m_offset2 + data->m_len2;
269 
270  TScore wgleft1 = bFreeGapLeft1? 0: m_Wg;
271  TScore wsleft1 = bFreeGapLeft1? 0: m_Ws;
272  TScore wg1 = m_Wg, ws1 = m_Ws;
273 
274  // index calculation: [i,j] = i*n2 + j
275  CBacktraceMatrix4 backtrace_matrix (N1 * N2);
276  backtrace_matrix.SetAt(0, 0);
277 
278  // first row
279  // note that stl_rowF[0] is not used in the main cycle,
280  size_t k;
281  stl_rowV[0] = wgleft1;
282  for (k = 1; k < N2; ++k) {
283  stl_rowV[k] = stl_rowV[k-1] + wsleft1;
284  stl_rowF[k] = kInfMinus;
285  backtrace_matrix.SetAt(k, kMaskE | kMaskEc);
286  }
287  backtrace_matrix.Purge(k);
288  stl_rowV[0] = 0;
289 
290  if(m_prg_callback) {
293  }
294 
295  // gap penalties
296  TScore wgleft2 (bFreeGapLeft2? 0: m_Wg);
297  TScore wsleft2 (bFreeGapLeft2? 0: m_Ws);
298 
299  const char * seq1 = m_Seq1 + data->m_offset1;
300  const char * seq1_end = seq1 + data->m_len1;
301 
302  TScore V0 = wgleft2;
303  TScore V = 0;//best score in the current cell. Will be equal to the NW score at the end
304  TScore best_V = 0;//best score in the whole matrix aka score for SW
305 
306  --k;
307 
308  for(; seq1 != seq1_end && !m_terminate; ++seq1) {
309 
310  backtrace_matrix.SetAt(++k, kMaskFc);
311 
312  if( seq1 + 1 == seq1_end && bFreeGapRight1) {
313  wg1 = ws1 = 0;
314  }
315 
316  unsigned char tracer;
317  const TNCBIScore * row_sc = sm[(size_t)*seq1];
318 
319  const char * seq2 = m_Seq2 + data->m_offset2;
320  const char * seq2_end = seq2 + data->m_len2;
321  TScore wg2 = m_Wg, ws2 = m_Ws;
322 
323  //best ending with gap in seq1 open seq1 X- or extended seq1 X--
324  // seq2 XX seq2 XXX
325  TScore E = kInfMinus;
326  //best ending with gap in seq2
327  TScore F;
328  //total best with
329  //best ending with match
330  TScore G;
331  //just temporary
332  TScore n0;
333  //total best
334  TScore * rowV = &stl_rowV[0];//previos row
335  V = V0 += wsleft2; //current row
336  //best ending with match
337  TScore * rowF = &stl_rowF[0];
338 
339  for (; seq2 != seq2_end;) {
340 
341  G = *rowV + row_sc[(size_t)*seq2++];
342  *rowV = V;
343 
344  n0 = V + wg1;
345  if(E >= n0) {
346  E += ws1; // continue the gap
347  tracer = kMaskEc;
348  }
349  else {
350  E = n0 + ws1; // open a new gap
351  tracer = 0;
352  }
353 
354  if( bFreeGapRight2 && seq2 == seq2_end ) {
355  wg2 = ws2 = 0;
356  }
357 
358  F = *++rowF;
359  n0 = *++rowV + wg2;
360  if(F >= n0) {
361  F += ws2;
362  tracer |= kMaskFc;
363  }
364  else {
365  F = n0 + ws2;
366  }
367  *rowF = F;
368 
369  //best score
370  if( G < F || ( G == F && m_GapPreference == eLater) ) {
371  if( E <= F ) {
372  V = F;
373  } else {
374  V = E;
375  tracer |= kMaskE;
376  }
377  } else if( E > G || ( E == G && m_GapPreference == eLater) ) {
378  V = E;
379  tracer |= kMaskE;
380  } else {
381  V = G;
382  tracer |= kMaskD;
383  }
384 
385  if (m_SmithWaterman && V < 0 ) {
386  V = 0;
387  }
388 
389  backtrace_matrix.SetAt(++k, tracer);
390 
391  if (V > best_V) {
392  best_V = V;
393  backtrace_matrix.SetBestPos(k);
394  }
395  }
396  *rowV = V;
397 
398  if(m_prg_callback) {
401  break;
402  }
403  }
404  }
405 
406  backtrace_matrix.Purge(++k);
407  backtrace_matrix.SetBestScore(best_V);
408 
409  /*
410  //print the matrix out
411  {{
412  cout<<endl;
413  int kk, ind1, ind2, width = 4;
414  cout<<setw(width)<<" ";
415  cout<<setw(width)<<"-";
416  for(ind2 = 0; ind2 < N2-1; ++ind2) {
417  cout<<setw(width)<<*(m_Seq2 + data->m_offset2 + ind2);
418  }
419  cout<<endl;
420  for(kk = 0,ind1 = 0; ind1 < N1; ++ind1) {
421  if(ind1) {
422  cout<<setw(width)<<(m_Seq1 + data->m_offset1)[ind1-1];
423  } else {
424  cout<<setw(width)<<"-";
425  }
426  for(ind2 = 0; ind2 < N2; ++ind2,++kk) {
427  string tstr;
428  unsigned char Key (backtrace_matrix[kk]);
429  if( Key & kMaskD ) tstr += "D";
430  else if ( Key & kMaskE ) tstr += "E";
431  else tstr += "F";
432  if( Key & kMaskEc ) tstr += "-";
433  if( Key & kMaskFc ) tstr += "|";
434  cout<<setw(width)<<tstr;
435  }
436  cout<<endl<<endl;
437  }
438  cout<<endl;
439  }}
440  //end of print the matrix out
441  */
442 
443  if(!m_terminate) {
444  x_SWDoBackTrace(backtrace_matrix, data);
445  //check back trace
446  TTranscript rv (data->m_transcript.size());
447  copy(data->m_transcript.rbegin(), data->m_transcript.rend(), rv.begin());
448  if(m_SmithWaterman) {
449  if( best_V != ScoreFromTranscript(rv, data->m_offset1, data->m_offset2) ) {
450  NCBI_THROW(CAlgoAlignException, eInternal,
451  "CNWAligner: error in back trace");
452  }
453  } else {
454  if( V != ScoreFromTranscript(rv, data->m_offset1, data->m_offset2) ) {
455  NCBI_THROW(CAlgoAlignException, eInternal,
456  "CNWAligner: error in back trace");
457  }
458  }
459  }
460 
461  if(m_SmithWaterman) {
462  return best_V;
463  }
464  return V;
465 }
466 
467 
469  const CSeq_id &id2, bool trim_end_gaps)
470 {
471  CSeq_loc loc1, loc2;
472  loc1.SetWhole().Assign(id1);
473  loc2.SetWhole().Assign(id2);
474  return Run(scope, loc1, loc2, trim_end_gaps);
475 }
476 
477 CRef<CSeq_align> CNWAligner::Run(CScope &scope, const CSeq_loc &loc1,
478  const CSeq_loc &loc2, bool trim_end_gaps)
479 {
480  if ((!loc1.IsInt() && !loc1.IsWhole()) ||
481  (!loc1.IsInt() && !loc1.IsWhole()))
482  {
484  "Only whole and interval locations supported");
485  }
486  CSeqVector vec1(loc1, scope, CBioseq_Handle::eCoding_Iupac);
487  string seq1;
488  vec1.GetSeqData(0, vec1.size(), seq1);
489  CSeqVector vec2(loc2, scope, CBioseq_Handle::eCoding_Iupac);
490  string seq2;
491  vec2.GetSeqData(0, vec2.size(), seq2);
492  SetSequences(seq1,seq2);
493  Run();
494  CRef<CSeq_align> align(new CSeq_align);
495  align->SetType(CSeq_align::eType_partial);
496  align->SetSegs().SetDenseg(*GetDense_seg(
497  loc1.GetStart(eExtreme_Biological), loc1.GetStrand(), *loc1.GetId(),
498  loc2.GetStart(eExtreme_Biological), loc2.GetStrand(), *loc2.GetId(),
499  trim_end_gaps));
500  return align;
501 }
502 
504 {
506 
507  NCBI_THROW(CAlgoAlignException, eInvalidMatrix,
508  "CNWAligner::SetScoreMatrix(NULL) must be called "
509  "after changing match/mismatch scores "
510  "to make sure that the new parameters are engaged.");
511  }
512 
513  if(!m_Seq1 || !m_Seq2) {
514  NCBI_THROW(CAlgoAlignException, eNoSeqData,
516  }
517 
518  if(!x_CheckMemoryLimit()) {
520  }
521 
522  if (m_SmithWaterman && !m_guides.empty()) {
523  NCBI_THROW(CAlgoAlignException, eBadParameter,
524  "Smith-Waterman not compatible with provided pattern");
525  }
526 
527  m_score = x_Run();
528 
529  return m_score;
530 }
531 
532 
534 {
535  try {
536 
537  m_terminate = false;
538 
539  if(m_guides.size() == 0) {
540 
543  m_score = x_Align(&data);
544  m_Transcript = data.m_transcript;
545  }
546  // run the algorithm for every segment between hits
547  else if(m_mt && m_maxthreads > 1) {
548 
549  size_t guides_dim = m_guides.size() / 4;
550 
551  // setup inputs
552  typedef vector<SAlignInOut> TDataVector;
553  TDataVector vdata;
554  vector<size_t> seed_dims;
555  {{
556  vdata.reserve(guides_dim + 1);
557  seed_dims.reserve(guides_dim + 1);
558  size_t q1 = m_SeqLen1, q0, s1 = m_SeqLen2, s0;
559  for(size_t istart = 4*guides_dim, i = istart; i != 0; i -= 4) {
560 
561  q0 = m_guides[i - 3] + 1;
562  s0 = m_guides[i - 1] + 1;
563  size_t dim_query = q1 - q0, dim_subj = s1 - s0;
564 
565  bool esf_L1 = false, esf_R1 = false,
566  esf_L2 = false, esf_R2 = false;
567  if(i == istart) {
568  esf_R1 = m_esf_R1;
569  esf_R2 = m_esf_R2;
570  }
571 
572  SAlignInOut data (q0, dim_query, esf_L1, esf_R1,
573  s0, dim_subj, esf_L2, esf_R2);
574 
575  vdata.push_back(data);
576  seed_dims.push_back(m_guides[i-3] - m_guides[i-4] + 1);
577 
578  q1 = m_guides[i - 4];
579  s1 = m_guides[i - 2];
580  }
581  SAlignInOut data(0, q1, m_esf_L1, false,
582  0, s1, m_esf_L2, false);
583  vdata.push_back(data);
584  }}
585 
586  // rearrange vdata so that the largest chunks come first
587  typedef vector<SAlignInOut*> TDataPtrVector;
588  TDataPtrVector vdata_p (vdata.size());
589  {{
590  TDataPtrVector::iterator jj = vdata_p.begin();
591  NON_CONST_ITERATE(TDataVector, ii, vdata) {
592  *jj++ = &(*ii);
593  }
594  stable_sort(vdata_p.begin(), vdata_p.end(),
596  }}
597 
598  // align over the segments
599  {{
600  m_Transcript.clear();
601  size_t idim = vdata.size();
602 
603  typedef vector<CNWAlignerThread_Align*> TThreadVector;
604  TThreadVector threads;
605  threads.reserve(idim);
606 
607  ITERATE(TDataPtrVector, ii, vdata_p) {
608 
609  SAlignInOut& data = **ii;
610 
611  if(static_cast<unsigned int>(data.GetSpace()) >= 10000000 &&
612  NW_RequestNewThread(static_cast<unsigned int>(m_maxthreads))) {
613 
614  CNWAlignerThread_Align* thread =
615  new CNWAlignerThread_Align(this, &data);
616  threads.push_back(thread);
617  thread->Run();
618  }
619  else {
620  x_Align(&data);
621  }
622  }
623 
624  unique_ptr<CException> e;
625  ITERATE(TThreadVector, ii, threads) {
626 
627  if(e.get() == 0) {
628  CException* pe = 0;
629  (*ii)->Join(reinterpret_cast<void**>(&pe));
630  if(pe) {
631  e.reset(new CException (*pe));
632  }
633  }
634  else {
635  (*ii)->Join(0);
636  }
637  }
638  if(e.get()) {
639  throw *e;
640  }
641 
642  for(size_t idata = 0; idata < idim; ++idata) {
643 
644  SAlignInOut& data = vdata[idata];
645  copy(data.m_transcript.begin(), data.m_transcript.end(),
646  back_inserter(m_Transcript));
647  if(idata + 1 < idim) {
648  for(size_t k = 0; k < seed_dims[idata]; ++k) {
649  m_Transcript.push_back(eTS_Match);
650  }
651  }
652  }
653 
654  m_score = ScoreFromTranscript(GetTranscript(false), 0, 0);
655  }}
656  }
657  else {
658 
659  m_Transcript.clear();
660  size_t guides_dim = m_guides.size() / 4;
661  size_t q1 = m_SeqLen1, q0, s1 = m_SeqLen2, s0;
662  for(size_t istart = 4*guides_dim, i = istart; i != 0; i -= 4) {
663 
664  q0 = m_guides[i - 3] + 1;
665  s0 = m_guides[i - 1] + 1;
666  size_t dim_query = q1 - q0, dim_subj = s1 - s0;
667 
668  bool esf_L1 = false, esf_R1 = false,
669  esf_L2 = false, esf_R2 = false;
670  if(i == istart) {
671  esf_R1 = m_esf_R1;
672  esf_R2 = m_esf_R2;
673  }
674 
675  SAlignInOut data (q0, dim_query, esf_L1, esf_R1,
676  s0, dim_subj, esf_L2, esf_R2);
677 
678  x_Align(&data);
679  copy(data.m_transcript.begin(), data.m_transcript.end(),
680  back_inserter(m_Transcript));
681 
682  size_t dim_hit = m_guides[i - 3] - m_guides[i - 4] + 1;
683  for(size_t k = 0; k < dim_hit; ++k) {
684  m_Transcript.push_back(eTS_Match);
685  }
686  q1 = m_guides[i - 4];
687  s1 = m_guides[i - 2];
688  }
689  SAlignInOut data(0, q1, m_esf_L1, false,
690  0, s1, m_esf_L2, false);
691  x_Align(&data);
692  copy(data.m_transcript.begin(), data.m_transcript.end(),
693  back_inserter(m_Transcript));
694 
695  m_score = ScoreFromTranscript(GetTranscript(false), 0, 0);
696  }
697  }
698 
699  catch(std::bad_alloc&) {
701  }
702 
703  return m_score;
704 }
705 
706 
708 const
709 {
710  const unsigned char c1 = m_Seq1[i1];
711  const unsigned char c2 = m_Seq2[i2];
712 
715  ts = m_ScoreMatrix.s[c1][c2] > 0? eTS_Match: eTS_Replace;
716  }
717  else {
718  // N vs N should be mismatch for nucleotides,
719  // and X vs X in protein. Check with matrix
720  ts = (toupper(c1) == toupper(c2) && m_ScoreMatrix.s[c1][c2] > 0)? eTS_Match: eTS_Replace;
721  }
722  return ts;
723 }
724 
725 // perform backtrace step, NW only
727  SAlignInOut* data)
728 {
729  const size_t N1 (data->m_len1 + 1);
730  const size_t N2 (data->m_len2 + 1);
731 
732  data->m_transcript.clear();
733  data->m_transcript.reserve(N1 + N2);
734 
735  size_t k (N1*N2 - 1);
736  size_t i1 (data->m_offset1 + data->m_len1 - 1);
737  size_t i2 (data->m_offset2 + data->m_len2 - 1);
738  while (k != 0) {
739 
740  unsigned char Key (backtrace[k]);
741 
742  if (Key & kMaskD) {
743 
744  data->m_transcript.push_back(x_GetDiagTS(i1--, i2--));
745  k -= N2 + 1;
746  }
747  else if (Key & kMaskE) {
748 
749  data->m_transcript.push_back(eTS_Insert);
750  --k;
751  --i2;
752  while(k > 0 && (Key & kMaskEc)) {
753 
754  data->m_transcript.push_back(eTS_Insert);
755  Key = backtrace[k--];
756  --i2;
757  }
758  }
759  else {
760 
761  data->m_transcript.push_back(eTS_Delete);
762  k -= N2;
763  --i1;
764  while(k > 0 && (Key & kMaskFc)) {
765 
766  data->m_transcript.push_back(eTS_Delete);
767  Key = backtrace[k];
768  k -= N2;
769  --i1;
770  }
771  }
772  }
773 }
774 
775 
776 
777 // perform backtrace step, NW + SW
779  SAlignInOut* data)
780 {
781  const size_t N1 (data->m_len1 + 1);
782  const size_t N2 (data->m_len2 + 1);
783  const TNCBIScore (* sm) [NCBI_FSM_DIM] = m_ScoreMatrix.s;
784 
785  data->m_transcript.clear();
786  data->m_transcript.reserve(N1 + N2);
787 
788  size_t k (N1*N2 - 1);
789  size_t i1 (data->m_offset1 + data->m_len1 - 1);
790  size_t i2 (data->m_offset2 + data->m_len2 - 1);
791  if (m_SmithWaterman) {
792  size_t sw_k = backtrace.BestPos();
793  data->FillEdgeGaps(k - sw_k, true);
794  i1 -= (k - sw_k) / (data->m_len2+1);
795  i2 -= (k - sw_k) % (data->m_len2+1);
796  k = backtrace.BestPos();
797  }
798 
799  //score for SmithWaterman. Stop when score == 0
800  TNCBIScore score = backtrace.BestScore();
801 
802  while (k > 0 && ( !m_SmithWaterman || score > 0 ) ) {
803 
804  unsigned char Key (backtrace[k]);
805 
806  if (Key & kMaskD) {
807  score -= sm[(size_t)(m_Seq1[i1])][(size_t)(m_Seq2[i2])];
808  data->m_transcript.push_back(x_GetDiagTS(i1--, i2--));
809  k -= N2 + 1;
810  }
811  else if (Key & kMaskE) {
812  score -= m_Wg + m_Ws;
813  data->m_transcript.push_back(eTS_Insert);
814  --k;
815  --i2;
816  while(k > 0 && (Key & kMaskEc)) {
817  score -= m_Ws;
818  data->m_transcript.push_back(eTS_Insert);
819  Key = backtrace[k--];
820  --i2;
821  }
822  }
823  else {
824  score -= m_Wg + m_Ws;
825  data->m_transcript.push_back(eTS_Delete);
826  k -= N2;
827  --i1;
828  while(k > 0 && (Key & kMaskFc)) {
829  score -= m_Ws;
830  data->m_transcript.push_back(eTS_Delete);
831  Key = backtrace[k];
832  k -= N2;
833  --i1;
834  }
835  }
836  }
837  if( m_SmithWaterman && score != 0 ) {
838  NCBI_THROW(CAlgoAlignException, eInternal,
839  "negative score in Smith-Waterman back trace");
840  }
841  data->FillEdgeGaps(k, false);
842 }
843 
844 
845 void CNWAligner::SetPattern(const vector<size_t>& guides)
846 {
847  size_t dim = guides.size();
848  const char* err = 0;
849  if(dim % 4 == 0) {
850  for(size_t i = 0; i < dim; i += 4) {
851 
852  if( guides[i] > guides[i+1] || guides[i+2] > guides[i+3] ) {
853  err = "Pattern hits must be specified in plus strand";
854  break;
855  }
856 
857  if(i > 4) {
858  if(guides[i] <= guides[i-3] || guides[i+2] <= guides[i-2]){
859  err = "Pattern hits coordinates must be sorted";
860  break;
861  }
862  }
863 
864  size_t dim1 = guides[i + 1] - guides[i];
865  size_t dim2 = guides[i + 3] - guides[i + 2];
866  if( dim1 != dim2) {
867  err = "Pattern hits must have equal length on both sequences";
868  break;
869  }
870 
871  if(guides[i+1] >= m_SeqLen1 || guides[i+3] >= m_SeqLen2) {
872  err = "One or several pattern hits are out of range";
873  break;
874  }
875  }
876  }
877  else {
878  err = "Pattern must have a dimension multiple of four";
879  }
880 
881  if(err) {
882  NCBI_THROW(CAlgoAlignException, eBadParameter, err);
883  }
884  else {
885  m_guides = guides;
886  }
887 }
888 
889 
890 void CNWAligner::GetEndSpaceFree(bool* L1, bool* R1, bool* L2, bool* R2) const
891 {
892  if(L1) *L1 = m_esf_L1;
893  if(R1) *R1 = m_esf_R1;
894  if(L2) *L2 = m_esf_L2;
895  if(R2) *R2 = m_esf_R2;
896 }
897 
899 {
900  return m_SmithWaterman;
901 }
902 
904 {
905  return m_GapPreference;
906 }
907 
908 // return raw transcript
910 {
911  TTranscript rv (m_Transcript.size());
912  if(reversed) {
913  copy(m_Transcript.begin(), m_Transcript.end(), rv.begin());
914  }
915  else {
916  copy(m_Transcript.rbegin(), m_Transcript.rend(), rv.begin());
917  }
918  return rv;
919 }
920 
921 
922 // alignment 'restore' - set raw transcript
923 void CNWAligner::SetTranscript(const TTranscript& transcript)
924 {
925  m_Transcript = transcript;
926  m_score = ScoreFromTranscript(transcript);
927 }
928 
929 
930 // Return transcript as a readable string
932 {
933  const Int8 dim (m_Transcript.size());
934  string s;
935  s.resize(dim);
936  size_t i1 (0), i2 (0), i (0);
937 
938  for (Int8 k (dim - 1); k >= 0; --k) {
939 
941  char c (0);
942  switch ( c0 ) {
943 
944  case eTS_Match: {
945 
946  if(m_Seq1 && m_Seq2) {
947  ETranscriptSymbol ts = x_GetDiagTS(i1++, i2++);
948  c = ts == eTS_Match? 'M': 'R';
949  }
950  else {
951  c = 'M';
952  }
953  }
954  break;
955 
956  case eTS_Replace: {
957 
958  if(m_Seq1 && m_Seq2) {
959  ETranscriptSymbol ts = x_GetDiagTS(i1++, i2++);
960  c = ts == eTS_Match? 'M': 'R';
961  }
962  else {
963  c = 'R';
964  }
965  }
966  break;
967 
968  case eTS_Insert: {
969  c = 'I';
970  ++i2;
971  }
972  break;
973 
974  case eTS_SlackInsert: {
975  c = 'i';
976  ++i2;
977  }
978  break;
979 
980  case eTS_SlackDelete: {
981  c = 'd';
982  ++i1;
983  }
984  break;
985 
986  case eTS_Delete: {
987  c = 'D';
988  ++i1;
989  }
990  break;
991 
992  case eTS_Intron: {
993  c = '+';
994  ++i2;
995  }
996  break;
997 
998  default: {
999  NCBI_THROW(CAlgoAlignException, eInternal,
1001  }
1002  }
1003 
1004  s[i++] = c;
1005  }
1006 
1007  if(i < s.size()) {
1008  s.resize(i + 1);
1009  }
1010 
1011  return s;
1012 }
1013 
1014 
1015 void CNWAligner::SetProgressCallback(FProgressCallback prg_callback, void* data)
1016 {
1017  m_prg_callback = prg_callback;
1019 }
1020 
1021 
1023 {
1024  m_Wms = val;
1025  m_ScoreMatrixInvalid = true;
1026 }
1027 
1028 
1030 {
1031  m_Wm = val;
1032  m_ScoreMatrixInvalid = true;
1033 }
1034 
1035 
1037 {
1038  if(psm) {
1039 
1040  m_abc = psm->symbols;
1042  }
1043  else { // assume IUPACna
1044 
1045  m_abc = g_nwaligner_nucleotides;
1046  const size_t dim = strlen(m_abc);
1047  vector<TNCBIScore> iupacna (dim*dim, m_Wms);
1048  iupacna[0] = iupacna[dim+1] = iupacna[2*(dim+1)] =
1049  iupacna[3*(dim+1)] = m_Wm;
1050  SNCBIPackedScoreMatrix iupacna_psm;
1051  iupacna_psm.symbols = g_nwaligner_nucleotides;
1052  iupacna_psm.scores = &iupacna.front();
1053  iupacna_psm.defscore = m_Wms;
1054  NCBISM_Unpack(&iupacna_psm, &m_ScoreMatrix);
1055  }
1056  m_ScoreMatrixInvalid = false;
1057 }
1058 
1059 
1060 // Check that all characters in sequence are valid for
1061 // the current sequence type.
1062 // Return an index to the first invalid character
1063 // or len if all characters are valid.
1064 size_t CNWAligner::x_CheckSequence(const char* seq, size_t len) const
1065 {
1066  char Flags [256];
1067  memset(Flags, 0, sizeof Flags);
1068  const size_t abc_size = strlen(m_abc);
1069 
1070  size_t k;
1071  for(k = 0; k < abc_size; ++k) {
1072  Flags[unsigned(toupper((unsigned char) m_abc[k]))] = 1;
1073  Flags[unsigned(tolower((unsigned char) m_abc[k]))] = 1;
1074  Flags[unsigned(k)] = 1;
1075  }
1076 
1077  for(k = 0; k < len; ++k) {
1078  if(Flags[unsigned(seq[k])] == 0)
1079  break;
1080  }
1081 
1082  return k;
1083 }
1084 
1085 
1087 {
1088  if(m_Transcript.size()) {
1089  return m_score;
1090  }
1091  else {
1092  NCBI_THROW(CAlgoAlignException, eNoSeqData,
1094  }
1095 }
1096 
1097 
1099 {
1100  const size_t elem_size (GetElemSize());
1101  const size_t gdim (m_guides.size());
1102  double mem (0);
1103 
1104  if(gdim) {
1105 
1106  size_t dim1 (m_guides[0]), dim2 (m_guides[2]);
1107  mem = double(dim1) * dim2 * elem_size;
1108  if(mem <= m_MaxMem) {
1109 
1110  for(size_t i (4); i < gdim; i += 4) {
1111 
1112  dim1 = m_guides[i] - m_guides[i-3] + 1;
1113  dim2 = m_guides[i + 2] - m_guides[i-1] + 1;
1114  mem = double(dim1) * dim2 * elem_size;
1115  if(mem > m_MaxMem) {
1116  break;
1117  }
1118  }
1119 
1120  if(mem <= m_MaxMem) {
1121  dim1 = m_SeqLen1 - m_guides[gdim-3];
1122  dim2 = m_SeqLen2 - m_guides[gdim-1];
1123  mem = double(dim1) * dim2 * elem_size;
1124  }
1125  }
1126  }
1127  else {
1128  mem = double(m_SeqLen1 + 1) * (m_SeqLen2 + 1) * elem_size;
1129  }
1130 
1131  return mem < m_MaxMem;
1132 }
1133 
1134 
1136 {
1137  m_maxthreads = (m_mt = enable)? CSystemInfo::GetCpuCount(): 1;
1138 }
1139 
1140 
1142  const TTranscript& transcript,
1143  size_t start1, size_t start2) const
1144 {
1145  bool nucl_mode;
1146  if(start1 == kMax_UInt && start2 == kMax_UInt) {
1147  nucl_mode = true;
1148  }
1149  else if(start1 != kMax_UInt && start2 != kMax_UInt) {
1150  nucl_mode = false;
1151  }
1152  else {
1153  NCBI_THROW(CAlgoAlignException, eInternal,
1155  }
1156 
1157  size_t dim (transcript.size());
1158  if(dim == 0) {
1159  return 0;
1160  }
1161 
1162  TScore score (0);
1163 
1164  const char* p1 (m_Seq1 + start1);
1165  const char* p2 (m_Seq2 + start2);
1166 
1167  int state1(0); // 0 = normal, 1 = gap;
1168  int state2(0); // 0 = normal, 1 = gap;
1169 
1170  size_t i (0);
1171 
1172  const TNCBIScore (*sm) [NCBI_FSM_DIM] = m_ScoreMatrix.s;
1173 
1174  ///SmithWaterman alterations
1175  if( IsSmithWaterman() ) {
1176  //cut beginning gaps
1177  for(; i<dim; ++i) {
1178  if( transcript[i] == eTS_Insert ) {
1179  ++p2;
1180  } else if( transcript[i] == eTS_Delete ) {
1181  ++p1;
1182  } else if( transcript[i] == eTS_Match ||
1183  transcript[i] == eTS_Replace ) {
1184  break;
1185  } else {
1187  }
1188  }
1189  if( i == dim ) {//alignment consists of gaps only
1190  return score;
1191  }
1192  //cut trailing gaps
1193  //for(size_t endi = dim-1; endi >=0; --endi) { // never stops CXX-11373
1194  for(size_t endi = dim; endi-- > 0; ) {
1195  if( transcript[endi] == eTS_Match || transcript[endi] == eTS_Replace ) {
1196  dim = endi + 1;
1197  break;
1198  } else if( transcript[endi] != eTS_Insert && transcript[endi] != eTS_Delete ) {
1200  }
1201  }
1202  }
1203 
1204  for(; i < dim; ++i) {
1205 
1206  ETranscriptSymbol ts (transcript[i]);
1207 
1208  switch(ts) {
1209 
1210  case eTS_Match:
1211  case eTS_Replace: {
1212  if(nucl_mode) {
1213  score += (ts == eTS_Match)? m_Wm: m_Wms;
1214  }
1215  else {
1216  unsigned char c1 = *p1;
1217  unsigned char c2 = *p2;
1218  score += sm[c1][c2];
1219  ++p1; ++p2;
1220  }
1221  state1 = state2 = 0;
1222  }
1223  break;
1224 
1225  case eTS_Insert: {
1226 
1227  if(state1 != 1) score += m_Wg;
1228  state1 = 1; state2 = 0;
1229  score += m_Ws;
1230  ++p2;
1231  }
1232  break;
1233 
1234  case eTS_Delete: {
1235 
1236  if(state2 != 1) score += m_Wg;
1237  state1 = 0; state2 = 1;
1238  score += m_Ws;
1239  ++p1;
1240  }
1241  break;
1242 
1243  default:
1245  }
1246  }
1247 
1248  if( IsSmithWaterman() ) {//end gap scores are already excluded
1249  return score;
1250  }
1251 
1252  if(m_esf_L1) {
1253  size_t g = 0;
1254  for(size_t i = 0; i < dim; ++i) {
1255  if(transcript[i] == eTS_Insert) ++g; else break;
1256  }
1257  if(g > 0) {
1258  score -= (m_Wg + g*m_Ws);
1259  }
1260  }
1261 
1262  if(m_esf_L2) {
1263  size_t g = 0;
1264  for(size_t i = 0; i < dim; ++i) {
1265  if(transcript[i] == eTS_Delete) ++g; else break;
1266  }
1267  if(g > 0) {
1268  score -= (m_Wg + g*m_Ws);
1269  }
1270  }
1271 
1272  if(m_esf_R1) {
1273  size_t g = 0;
1274  for(Int8 i = dim - 1; i >= 0; --i) {
1275  if(transcript[i] == eTS_Insert) ++g; else break;
1276  }
1277  if(g > 0) {
1278  score -= (m_Wg + g*m_Ws);
1279  }
1280  }
1281 
1282  if(m_esf_R2) {
1283  size_t g = 0;
1284  for(Int8 i = dim - 1; i >= 0; --i) {
1285  if(transcript[i] == eTS_Delete) ++g; else break;
1286  }
1287  if(g > 0) {
1288  score -= (m_Wg + g*m_Ws);
1289  }
1290  }
1291 
1292  return score;
1293 }
1294 
1295 
1296 size_t CNWAligner::GetLeftSeg(size_t* q0, size_t* q1,
1297  size_t* s0, size_t* s1,
1298  size_t min_size) const
1299 {
1300  size_t trdim = m_Transcript.size();
1301  size_t cur = 0, maxseg = 0;
1302  const char* p1 = m_Seq1;
1303  const char* p2 = m_Seq2;
1304  size_t i0 = 0, j0 = 0, imax = i0, jmax = j0;
1305 
1306  for(Int8 k = trdim - 1; k >= 0; --k) {
1307 
1308  switch(m_Transcript[k]) {
1309 
1310  case eTS_Insert: {
1311  ++p2;
1312  if(cur > maxseg) {
1313  maxseg = cur;
1314  imax = i0;
1315  jmax = j0;
1316  if(maxseg >= min_size) goto ret_point;
1317  }
1318  cur = 0;
1319  }
1320  break;
1321 
1322  case eTS_Delete: {
1323  ++p1;
1324  if(cur > maxseg) {
1325  maxseg = cur;
1326  imax = i0;
1327  jmax = j0;
1328  if(maxseg >= min_size) goto ret_point;
1329  }
1330  cur = 0;
1331  }
1332  break;
1333 
1334  case eTS_Match:
1335  case eTS_Replace: {
1336  if(*p1 == *p2) {
1337  if(cur == 0) {
1338  i0 = p1 - m_Seq1;
1339  j0 = p2 - m_Seq2;
1340  }
1341  ++cur;
1342  }
1343  else {
1344  if(cur > maxseg) {
1345  maxseg = cur;
1346  imax = i0;
1347  jmax = j0;
1348  if(maxseg >= min_size) goto ret_point;
1349  }
1350  cur = 0;
1351  }
1352  ++p1;
1353  ++p2;
1354  }
1355  break;
1356 
1357  default: {
1358  NCBI_THROW( CAlgoAlignException, eInternal,
1360  }
1361  }
1362  }
1363 
1364  if(cur > maxseg) {
1365  maxseg = cur;
1366  imax = i0;
1367  jmax = j0;
1368  }
1369 
1370  ret_point:
1371 
1372  *q0 = imax; *s0 = jmax;
1373  *q1 = *q0 + maxseg - 1;
1374  *s1 = *s0 + maxseg - 1;
1375 
1376  return maxseg;
1377 }
1378 
1379 /////////////////////////////////////////////////
1380 /// naive pattern generator (a la Rabin-Karp)
1381 
1383  nwaln_mrnaseg(size_t i1, size_t i2, unsigned char fp0):
1384  a(i1), b(i2), fp(fp0) {}
1385  size_t a, b;
1386  unsigned char fp;
1387 };
1388 
1390  nwaln_mrnaguide(size_t i1, size_t i2, size_t i3, size_t i4):
1391  q0(i1), q1(i2), s0(i3), s1(i4) {}
1392  size_t q0, q1, s0, s1;
1393 };
1394 
1395 
1397  const char* beg, const char* end, size_t& err_index)
1398 {
1399  if(beg >= end) {
1400  return 0xFF;
1401  }
1402 
1403  unsigned char fp = 0, code;
1404  for(const char* p = beg; p < end; ++p) {
1405  switch(*p) {
1406  case 'A': code = 0; break;
1407  case 'G': code = 0x01; break;
1408  case 'T': code = 0x02; break;
1409  case 'C': code = 0x03; break;
1410  default: err_index = p - beg; return 0x40; // incorrect char
1411  }
1412  fp = 0x3F & ((fp << 2) | code);
1413  }
1414 
1415  return fp;
1416 }
1417 
1418 
1420  const char* beg, const char* end,
1421  unsigned char fingerprint, size_t size,
1422  size_t& err_index)
1423 {
1424 
1425  if(beg + size > end) {
1426  err_index = 0;
1427  return 0;
1428  }
1429 
1430  const char* p0 = beg;
1431 
1432  size_t err_idx = 0; --p0;
1433  unsigned char fp = 0x40;
1434  while(fp == 0x40 && p0 < end) {
1435  p0 += err_idx + 1;
1436  fp = x_CalcFingerPrint64(p0, p0 + size, err_idx);
1437  }
1438 
1439  if(p0 >= end) {
1440  return end; // not found
1441  }
1442 
1443  unsigned char code;
1444  while(fp != fingerprint && ++p0 < end) {
1445 
1446  switch(*(p0 + size - 1)) {
1447  case 'A': code = 0; break;
1448  case 'G': code = 0x01; break;
1449  case 'T': code = 0x02; break;
1450  case 'C': code = 0x03; break;
1451  default: err_index = p0 + size - 1 - beg;
1452  return 0;
1453  }
1454 
1455  fp = 0x3F & ((fp << 2) | code );
1456  }
1457 
1458  return p0;
1459 }
1460 
1461 
1462 size_t CNWAligner::MakePattern(const size_t guide_size,
1463  const size_t guide_core)
1464 {
1465  if(guide_core > guide_size) {
1467  eBadParameter,
1469  }
1470 
1471  vector<nwaln_mrnaseg> segs;
1472 
1473  size_t err_idx(0);
1474  for(size_t i = 0; i + guide_size <= m_SeqLen1; ) {
1475  const char* beg = m_Seq1 + i;
1476  const char* end = m_Seq1 + i + guide_size;
1477  unsigned char fp = x_CalcFingerPrint64(beg, end, err_idx);
1478  if(fp != 0x40) {
1479  segs.push_back(nwaln_mrnaseg(i, i + guide_size - 1, fp));
1480  i += guide_size;
1481  }
1482  else {
1483  i += err_idx + 1;
1484  }
1485  }
1486 
1487  vector<nwaln_mrnaguide> guides;
1488  size_t idx = 0;
1489  const char* beg = m_Seq2 + idx;
1490  const char* end = m_Seq2 + m_SeqLen2;
1491  for(size_t i = 0, seg_count = segs.size();
1492  beg + guide_size <= end && i < seg_count; ++i) {
1493 
1494  const char* p = 0;
1495  const char* beg0 = beg;
1496  while( p == 0 && beg + guide_size <= end ) {
1497 
1498  p = x_FindFingerPrint64( beg, end, segs[i].fp,
1499  guide_size, err_idx );
1500  if(p == 0) { // incorrect char
1501  beg += err_idx + 1;
1502  }
1503  else if (p < end) {// fingerprints match but check actual sequences
1504  const char* seq1 = m_Seq1 + segs[i].a;
1505  const char* seq2 = p;
1506  size_t k;
1507  for(k = 0; k < guide_size; ++k) {
1508  if(seq1[k] != seq2[k]) break;
1509  }
1510  if(k == guide_size) { // real match
1511  size_t i1 = segs[i].a;
1512  size_t i2 = segs[i].b;
1513  size_t i3 = seq2 - m_Seq2;
1514  size_t i4 = i3 + guide_size - 1;
1515  size_t guides_dim = guides.size();
1516  if( guides_dim == 0 ||
1517  i1 - 1 > guides[guides_dim - 1].q1 ||
1518  i3 - 1 > guides[guides_dim - 1].s1 ) {
1519  guides.push_back(nwaln_mrnaguide(i1, i2, i3, i4));
1520  }
1521  else { // expand the last guide
1522  guides[guides_dim - 1].q1 = i2;
1523  guides[guides_dim - 1].s1 = i4;
1524  }
1525  beg0 = p + guide_size;
1526  }
1527  else { // spurious match
1528  beg = p + 1;
1529  p = 0;
1530  }
1531  }
1532  }
1533  beg = beg0; // restore start pos in genomic sequence
1534  }
1535 
1536  // initialize m_guides
1537  size_t guides_dim = guides.size();
1538  m_guides.clear();
1539  m_guides.resize(4*guides_dim);
1540  const size_t offs = guide_core/2 - 1;
1541  for(size_t k = 0; k < guides_dim; ++k) {
1542  size_t q0 = (guides[k].q0 + guides[k].q1) / 2;
1543  size_t s0 = (guides[k].s0 + guides[k].s1) / 2;
1544  m_guides[4*k] = q0 - offs;
1545  m_guides[4*k + 1] = q0 + offs;
1546  m_guides[4*k + 2] = s0 - offs;
1547  m_guides[4*k + 3] = s0 + offs;
1548  }
1549 
1550  return m_guides.size();
1551 }
1552 
1553 //////////////////////////////////////////////
1554 /////////////////////////////////////////////
1555 
1556 size_t CNWAligner::GetRightSeg(size_t* q0, size_t* q1,
1557  size_t* s0, size_t* s1,
1558  size_t min_size) const
1559 {
1560  size_t trdim = m_Transcript.size();
1561  size_t cur = 0, maxseg = 0;
1562  const char* seq1_end = m_Seq1 + m_SeqLen1;
1563  const char* seq2_end = m_Seq2 + m_SeqLen2;
1564  const char* p1 = seq1_end - 1;
1565  const char* p2 = seq2_end - 1;
1566  size_t i0 = m_SeqLen1 - 1, j0 = m_SeqLen2 - 1,
1567  imax = i0, jmax = j0;
1568 
1569  for(size_t k = 0; k < trdim; ++k) {
1570 
1571  switch(m_Transcript[k]) {
1572 
1573  case eTS_Insert: {
1574  --p2;
1575  if(cur > maxseg) {
1576  maxseg = cur;
1577  imax = i0;
1578  jmax = j0;
1579  if(maxseg >= min_size) goto ret_point;
1580  }
1581  cur = 0;
1582  }
1583  break;
1584 
1585  case eTS_Delete: {
1586  --p1;
1587  if(cur > maxseg) {
1588  maxseg = cur;
1589  imax = i0;
1590  jmax = j0;
1591  if(maxseg >= min_size) goto ret_point;
1592  }
1593  cur = 0;
1594  }
1595  break;
1596 
1597  case eTS_Match:
1598  case eTS_Replace: {
1599  if(*p1 == *p2) {
1600  if(cur == 0) {
1601  i0 = p1 - m_Seq1;
1602  j0 = p2 - m_Seq2;
1603  }
1604  ++cur;
1605  }
1606  else {
1607  if(cur > maxseg) {
1608  maxseg = cur;
1609  imax = i0;
1610  jmax = j0;
1611  if(maxseg >= min_size) goto ret_point;
1612  }
1613  cur = 0;
1614  }
1615  --p1;
1616  --p2;
1617  }
1618  break;
1619 
1620  default: {
1621  NCBI_THROW( CAlgoAlignException, eInternal,
1623  }
1624  }
1625  }
1626 
1627  if(cur > maxseg) {
1628  maxseg = cur;
1629  imax = i0;
1630  jmax = j0;
1631  }
1632 
1633  ret_point:
1634 
1635  *q1 = imax; *s1 = jmax;
1636  *q0 = imax - maxseg + 1;
1637  *s0 = jmax - maxseg + 1;
1638 
1639  return maxseg;
1640 }
1641 
1642 
1643 size_t CNWAligner::GetLongestSeg(size_t* q0, size_t* q1,
1644  size_t* s0, size_t* s1) const
1645 {
1646  size_t trdim = m_Transcript.size();
1647  size_t cur = 0, maxseg = 0;
1648  const char* p1 = m_Seq1;
1649  const char* p2 = m_Seq2;
1650  size_t i0 = 0, j0 = 0, imax = i0, jmax = j0;
1651 
1652  for(Int8 k = trdim-1; k >= 0; --k) {
1653 
1654  switch(m_Transcript[k]) {
1655 
1656  case eTS_Insert: {
1657  ++p2;
1658  if(cur > maxseg) {
1659  maxseg = cur;
1660  imax = i0;
1661  jmax = j0;
1662  }
1663  cur = 0;
1664  }
1665  break;
1666 
1667  case eTS_Delete: {
1668  ++p1;
1669  if(cur > maxseg) {
1670  maxseg = cur;
1671  imax = i0;
1672  jmax = j0;
1673  }
1674  cur = 0;
1675  }
1676  break;
1677 
1678  case eTS_Match:
1679  case eTS_Replace: {
1680  if(*p1 == *p2) {
1681  if(cur == 0) {
1682  i0 = p1 - m_Seq1;
1683  j0 = p2 - m_Seq2;
1684  }
1685  ++cur;
1686  }
1687  else {
1688  if(cur > maxseg) {
1689  maxseg = cur;
1690  imax = i0;
1691  jmax = j0;
1692  }
1693  cur = 0;
1694  }
1695  ++p1;
1696  ++p2;
1697  }
1698  break;
1699 
1700  default: {
1701  NCBI_THROW( CAlgoAlignException, eInternal,
1703  }
1704  }
1705  }
1706 
1707  if(cur > maxseg) {
1708  maxseg = cur;
1709  imax = i0;
1710  jmax = j0;
1711  }
1712 
1713  *q0 = imax; *s0 = jmax;
1714  *q1 = *q0 + maxseg - 1;
1715  *s1 = *s0 + maxseg - 1;
1716 
1717  return maxseg;
1718 }
1719 
1720 
1722  ENa_strand query_strand,
1723  TSeqPos subj_start,
1724  ENa_strand subj_strand,
1725  bool trim_end_gaps) const
1726 {
1728  if(trim_end_gaps) {
1730  } else {
1732  }
1733 
1734  CNWFormatter fmt(*this);
1735 
1736  return fmt.AsDenseSeg(query_start, query_strand,
1737  subj_start, subj_strand, flags);
1738 }
1739 
1740 
1742  ENa_strand query_strand,
1743  const CSeq_id& query_id,
1744  TSeqPos subj_start,
1745  ENa_strand subj_strand,
1746  const CSeq_id& subj_id,
1747  bool trim_end_gaps) const
1748 {
1750  if(trim_end_gaps) {
1752  } else {
1754  }
1755 
1756  CNWFormatter fmt(*this);
1757 
1758  CConstRef<CSeq_id> id0(&query_id);
1759  CConstRef<CSeq_id> id1(&subj_id);
1760  fmt.SetSeqIds(id0, id1);
1761 
1762  return fmt.AsDenseSeg(query_start, query_strand,
1763  subj_start, subj_strand, flags);
1764 }
1765 
@ eExtreme_Biological
5' and 3'
Definition: Na_strand.hpp:62
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
Definition: ncbistre.hpp:802
CScope –.
Definition: scope.hpp:92
CSeqVector –.
Definition: seq_vector.hpp:65
static unsigned int GetCpuCount(void)
Return number of active CPUs/cores (never less than 1).
static uch flags
static const char fp[]
Definition: des.c:87
#define true
Definition: bool.h:35
#define false
Definition: bool.h:36
#define G(x, y, z)
Definition: md4.c:179
char data[12]
Definition: iconv.c:80
size_t MakePattern(const size_t hit_size=100, const size_t core_size=28)
size_t GetRightSeg(size_t *q0, size_t *q1, size_t *s0, size_t *s1, size_t min_size) const
void SetAt(size_t i, Uint1 v)
Definition: nw_aligner.hpp:346
virtual bool x_CheckMemoryLimit(void)
bool m_terminate
Definition: nw_aligner.hpp:291
void SetProgressCallback(FProgressCallback prg_callback, void *data)
CNWAligner(void)
Definition: nw_aligner.cpp:56
virtual TScore x_Align(SAlignInOut *data)
Definition: nw_aligner.cpp:229
virtual TScore ScoreFromTranscript(const TTranscript &transcript, size_t start1=kMax_UInt, size_t start2=kMax_UInt) const
virtual ETranscriptSymbol x_GetDiagTS(size_t i1, size_t i2) const
Definition: nw_aligner.cpp:707
EGapPreference GetGapPreference() const
Definition: nw_aligner.cpp:903
const char * m_Seq1
Definition: nw_aligner.hpp:295
vector< size_t > m_guides
Definition: nw_aligner.hpp:317
bool m_SmithWaterman
Definition: nw_aligner.hpp:275
TScore m_Ws
Definition: nw_aligner.hpp:271
virtual void SetSequences(const char *seq1, size_t len1, const char *seq2, size_t len2, bool verify=true)
Definition: nw_aligner.cpp:140
friend class CNWAlignerThread_Align
Definition: nw_aligner.hpp:400
TTranscript GetTranscript(bool reversed=true) const
Definition: nw_aligner.cpp:909
SNCBIFullScoreMatrix m_ScoreMatrix
Definition: nw_aligner.hpp:281
const char * x_FindFingerPrint64(const char *beg, const char *end, unsigned char fingerprint, size_t size, size_t &err_index)
bool IsSmithWaterman() const
Definition: nw_aligner.cpp:898
vector< char > m_Seq1Vec
Definition: nw_aligner.hpp:294
void SetBestScore(TNCBIScore score)
Definition: nw_aligner.hpp:362
unsigned char x_CalcFingerPrint64(const char *beg, const char *end, size_t &err_index)
TScore GetScore(void) const
size_t GetLongestSeg(size_t *q0, size_t *q1, size_t *s0, size_t *s1) const
size_t x_CheckSequence(const char *seq, size_t len) const
bool m_ScoreMatrixInvalid
Definition: nw_aligner.hpp:282
TScore m_Wg
Definition: nw_aligner.hpp:270
FProgressCallback m_prg_callback
Definition: nw_aligner.hpp:285
string GetTranscriptString(void) const
Definition: nw_aligner.cpp:931
void x_SWDoBackTrace(const CBacktraceMatrix4 &backtrace, SAlignInOut *data)
Definition: nw_aligner.cpp:778
TNCBIScore BestScore() const
Definition: nw_aligner.hpp:365
virtual TScore Run(void)
Definition: nw_aligner.cpp:503
virtual TScore x_Run(void)
Definition: nw_aligner.cpp:533
void EnableMultipleThreads(bool enable=true)
size_t m_SeqLen1
Definition: nw_aligner.hpp:296
const char * m_Seq2
Definition: nw_aligner.hpp:298
vector< ETranscriptSymbol > TTranscript
Definition: nw_aligner.hpp:199
void SetSmithWaterman(bool SW)
Definition: nw_aligner.cpp:201
const char * m_abc
Definition: nw_aligner.hpp:280
bool m_PositivesAsMatches
Definition: nw_aligner.hpp:315
size_t m_maxthreads
Definition: nw_aligner.hpp:321
void GetEndSpaceFree(bool *L1, bool *R1, bool *L2, bool *R2) const
Definition: nw_aligner.cpp:890
TScore m_Wms
Definition: nw_aligner.hpp:269
void SetTranscript(const TTranscript &transcript)
Definition: nw_aligner.cpp:923
TTranscript m_Transcript
Definition: nw_aligner.hpp:314
void SetScoreMatrix(const SNCBIPackedScoreMatrix *scoremat)
EGapPreference m_GapPreference
Definition: nw_aligner.hpp:277
void SetWms(TScore value)
CRef< objects::CDense_seg > GetDense_seg(TSeqPos query_start, objects::ENa_strand query_strand, TSeqPos subj_start, objects::ENa_strand subj_strand, bool trim_end_gaps=false) const
virtual size_t GetElemSize(void) const
Definition: nw_aligner.hpp:229
void SetWm(TScore value)
size_t GetLeftSeg(size_t *q0, size_t *q1, size_t *s0, size_t *s1, size_t min_size) const
void SetPattern(const vector< size_t > &pattern)
Definition: nw_aligner.cpp:845
TScore m_score
Definition: nw_aligner.hpp:316
size_t m_MaxMem
Definition: nw_aligner.hpp:324
void x_DoBackTrace(const CBacktraceMatrix4 &backtrace, SAlignInOut *data)
Definition: nw_aligner.cpp:726
void SetGapPreference(EGapPreference p)
Control preference for where to place a gap if there is a choice; default is eLater,...
Definition: nw_aligner.cpp:210
TScore m_Wm
Definition: nw_aligner.hpp:268
vector< char > m_Seq2Vec
Definition: nw_aligner.hpp:297
void SetEndSpaceFree(bool Left1, bool Right1, bool Left2, bool Right2)
Definition: nw_aligner.cpp:192
size_t m_SeqLen2
Definition: nw_aligner.hpp:299
static bool PSpace(const SAlignInOut *p1, const SAlignInOut *p2)
Definition: nw_aligner.hpp:445
SProgressInfo m_prg_info
Definition: nw_aligner.hpp:288
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void SetWhole(TWhole &v)
Definition: Seq_loc.hpp:982
ENa_strand GetStrand(void) const
Get the location's strand.
Definition: Seq_loc.cpp:882
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
Definition: Seq_loc.cpp:915
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
#define kMax_UInt
Definition: ncbi_limits.h:185
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
bool Run(TRunMode flags=fRunDefault)
Run the thread.
Definition: ncbithr.cpp:724
@ eType_partial
mapping pieces together
Definition: Seq_align_.hpp:103
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
bool IsWhole(void) const
Check if variant Whole is selected.
Definition: Seq_loc_.hpp:522
bool IsInt(void) const
Check if variant Int is selected.
Definition: Seq_loc_.hpp:528
int i
int len
const struct ncbi::grid::netcache::search::fields::SIZE size
#define verify(expr)
Definition: ncbi_assert.h:51
#define F(x)
Make a parametrized function appear to have only one variable.
Definition: ncbi_math.c:342
int tolower(Uchar c)
Definition: ncbictype.hpp:72
int toupper(Uchar c)
Definition: ncbictype.hpp:73
const double E
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
USING_SCOPE(objects)
const unsigned char kMaskFc
Definition: nw_aligner.cpp:224
const unsigned char kMaskEc
Definition: nw_aligner.cpp:225
const unsigned char kMaskE
Definition: nw_aligner.cpp:226
const unsigned char kMaskD
Definition: nw_aligner.cpp:227
bool NW_RequestNewThread(const unsigned int max_threads)
#define NCBI_FSM_DIM
Recommended approach: unpack and index directly.
Definition: raw_scoremat.h:85
void NCBISM_Unpack(const SNCBIPackedScoreMatrix *psm, SNCBIFullScoreMatrix *fsm)
Expand a packed score matrix into an unpacked one, which callers can proceed to index directly by sta...
Definition: raw_scoremat.c:81
int TNCBIScore
data types
Definition: raw_scoremat.h:45
#define SW
const char g_msg_InconsistentArguments[]
Definition: messages.hpp:27
const char g_msg_HitSpaceLimit[]
Definition: messages.hpp:34
const char g_msg_InvalidTranscriptSymbol[]
Definition: messages.hpp:10
const char g_msg_NoAlignment[]
Definition: messages.hpp:38
const char g_msg_OutOfSpace[]
Definition: messages.hpp:32
const char g_msg_NullParameter[]
Definition: messages.hpp:23
const char g_msg_DataNotAvailable[]
Definition: messages.hpp:29
TNCBIScore s[128][128]
Definition: raw_scoremat.h:87
const TNCBIScore * scores
strlen(symbols) x strlen(symbols)
Definition: raw_scoremat.h:48
TNCBIScore defscore
score for unknown residues
Definition: raw_scoremat.h:49
const char * symbols
order of residues
Definition: raw_scoremat.h:47
Definition: inftrees.h:24
nwaln_mrnaguide(size_t i1, size_t i2, size_t i3, size_t i4)
naive pattern generator (a la Rabin-Karp)
nwaln_mrnaseg(size_t i1, size_t i2, unsigned char fp0)
unsigned char fp
int g(Seg_Gsm *spe, Seq_Mtf *psm, Thd_Gsm *tdg)
Definition: thrddgri.c:44
Modified on Fri Sep 20 14:57:07 2024 by modify_doxy.py rev. 669887