57 : m_Wm(GetDefaultWm()),
58 m_Wms(GetDefaultWms()),
62 m_SmithWaterman(
false),
63 m_GapPreference(eLater),
64 m_abc(g_nwaligner_nucleotides),
65 m_ScoreMatrixInvalid(
true),
69 m_Seq1(0), m_SeqLen1(0),
71 m_Seq2(0), m_SeqLen2(0),
72 m_PositivesAsMatches(
false),
76 m_MaxMem(GetDefaultSpaceLimit())
83 const char* seq2,
size_t len2,
86 : m_Wm(GetDefaultWm()),
87 m_Wms(GetDefaultWms()),
91 m_SmithWaterman(
false),
92 m_GapPreference(eLater),
93 m_abc(g_nwaligner_nucleotides),
94 m_ScoreMatrixInvalid(
true),
97 m_Seq1Vec(&seq1[0], &seq1[0]+len1),
98 m_Seq1(&m_Seq1Vec[0]), m_SeqLen1(len1),
99 m_Seq2Vec(&seq2[0], &seq2[0]+len2),
100 m_Seq2(&m_Seq2Vec[0]), m_SeqLen2(len2),
101 m_PositivesAsMatches(
false),
105 m_MaxMem(GetDefaultSpaceLimit())
115 : m_Wm(GetDefaultWm()),
116 m_Wms(GetDefaultWms()),
117 m_Wg(GetDefaultWg()),
118 m_Ws(GetDefaultWs()),
120 m_SmithWaterman(
false),
121 m_GapPreference(eLater),
122 m_abc(g_nwaligner_nucleotides),
123 m_ScoreMatrixInvalid(
true),
126 m_Seq1Vec(seq1.begin(), seq1.end()),
127 m_Seq1(&m_Seq1Vec[0]), m_SeqLen1(seq1.
size()),
128 m_Seq2Vec(seq2.begin(), seq2.end()),
129 m_Seq2(&m_Seq2Vec[0]), m_SeqLen2(seq2.
size()),
133 m_MaxMem(GetDefaultSpaceLimit())
141 const char* seq2,
size_t len2,
151 if(iErrPos1 < len1) {
153 oss <<
"The first sequence is inconsistent with the current "
154 <<
"scoring matrix type. "
155 <<
"Position = " << iErrPos1
156 <<
" Symbol = '" << seq1[iErrPos1] <<
"'";
163 if(iErrPos2 < len2) {
165 oss <<
"The second sequence is inconsistent with the current "
166 <<
"scoring matrix type. "
167 <<
"Position = " << iErrPos2
168 <<
" Symbol = '" << seq2[iErrPos2] <<
"'";
174 m_Seq1Vec.assign(&seq1[0], &seq1[0]+len1);
175 m_Seq2Vec.assign(&seq2[0], &seq2[0]+len2);
193 bool Left2,
bool Right2)
238 "Smith-Waterman not compatible with offsets provided");
242 !
data->m_esf_L2 || !
data->m_esf_R2 ) ) {
244 "Smith-Waterman not compatible with end gap penalties");
247 const size_t N1 =
data->m_len1 + 1;
248 const size_t N2 =
data->m_len2 + 1;
250 vector<TScore> stl_rowV (N2), stl_rowF(N2);
262 bool bFreeGapLeft1 =
data->m_esf_L1 &&
data->m_offset1 == 0;
263 bool bFreeGapRight1 =
data->m_esf_R1 &&
266 bool bFreeGapLeft2 =
data->m_esf_L2 &&
data->m_offset2 == 0;
267 bool bFreeGapRight2 =
data->m_esf_R2 &&
276 backtrace_matrix.
SetAt(0, 0);
281 stl_rowV[0] = wgleft1;
282 for (k = 1; k < N2; ++k) {
283 stl_rowV[k] = stl_rowV[k-1] + wsleft1;
284 stl_rowF[k] = kInfMinus;
287 backtrace_matrix.
Purge(k);
300 const char * seq1_end = seq1 +
data->m_len1;
312 if( seq1 + 1 == seq1_end && bFreeGapRight1) {
316 unsigned char tracer;
317 const TNCBIScore * row_sc = sm[(size_t)*seq1];
320 const char * seq2_end = seq2 +
data->m_len2;
334 TScore * rowV = &stl_rowV[0];
337 TScore * rowF = &stl_rowF[0];
339 for (; seq2 != seq2_end;) {
341 G = *rowV + row_sc[(size_t)*seq2++];
354 if( bFreeGapRight2 && seq2 == seq2_end ) {
389 backtrace_matrix.
SetAt(++k, tracer);
406 backtrace_matrix.
Purge(++k);
447 copy(
data->m_transcript.rbegin(),
data->m_transcript.rend(), rv.begin());
451 "CNWAligner: error in back trace");
456 "CNWAligner: error in back trace");
469 const CSeq_id &id2,
bool trim_end_gaps)
474 return Run(scope, loc1, loc2, trim_end_gaps);
478 const CSeq_loc &loc2,
bool trim_end_gaps)
484 "Only whole and interval locations supported");
488 vec1.GetSeqData(0, vec1.size(), seq1);
491 vec2.GetSeqData(0, vec2.size(), seq2);
508 "CNWAligner::SetScoreMatrix(NULL) must be called "
509 "after changing match/mismatch scores "
510 "to make sure that the new parameters are engaged.");
524 "Smith-Waterman not compatible with provided pattern");
549 size_t guides_dim =
m_guides.size() / 4;
552 typedef vector<SAlignInOut> TDataVector;
554 vector<size_t> seed_dims;
556 vdata.reserve(guides_dim + 1);
557 seed_dims.reserve(guides_dim + 1);
559 for(
size_t istart = 4*guides_dim,
i = istart;
i != 0;
i -= 4) {
563 size_t dim_query = q1 - q0, dim_subj = s1 - s0;
565 bool esf_L1 =
false, esf_R1 =
false,
566 esf_L2 =
false, esf_R2 =
false;
573 s0, dim_subj, esf_L2, esf_R2);
575 vdata.push_back(
data);
583 vdata.push_back(
data);
587 typedef vector<SAlignInOut*> TDataPtrVector;
588 TDataPtrVector vdata_p (vdata.size());
590 TDataPtrVector::iterator jj = vdata_p.begin();
594 stable_sort(vdata_p.begin(), vdata_p.end(),
601 size_t idim = vdata.size();
603 typedef vector<CNWAlignerThread_Align*> TThreadVector;
604 TThreadVector threads;
605 threads.reserve(idim);
607 ITERATE(TDataPtrVector, ii, vdata_p) {
611 if(
static_cast<unsigned int>(
data.GetSpace()) >= 10000000 &&
616 threads.push_back(thread);
624 unique_ptr<CException> e;
625 ITERATE(TThreadVector, ii, threads) {
629 (*ii)->Join(
reinterpret_cast<void**
>(&pe));
642 for(
size_t idata = 0; idata < idim; ++idata) {
645 copy(
data.m_transcript.begin(),
data.m_transcript.end(),
647 if(idata + 1 < idim) {
648 for(
size_t k = 0; k < seed_dims[idata]; ++k) {
660 size_t guides_dim =
m_guides.size() / 4;
662 for(
size_t istart = 4*guides_dim,
i = istart;
i != 0;
i -= 4) {
666 size_t dim_query = q1 - q0, dim_subj = s1 - s0;
668 bool esf_L1 =
false, esf_R1 =
false,
669 esf_L2 =
false, esf_R2 =
false;
676 s0, dim_subj, esf_L2, esf_R2);
679 copy(
data.m_transcript.begin(),
data.m_transcript.end(),
683 for(
size_t k = 0; k < dim_hit; ++k) {
692 copy(
data.m_transcript.begin(),
data.m_transcript.end(),
699 catch(std::bad_alloc&) {
710 const unsigned char c1 =
m_Seq1[i1];
711 const unsigned char c2 =
m_Seq2[i2];
729 const size_t N1 (
data->m_len1 + 1);
730 const size_t N2 (
data->m_len2 + 1);
732 data->m_transcript.clear();
733 data->m_transcript.reserve(N1 + N2);
735 size_t k (N1*N2 - 1);
736 size_t i1 (
data->m_offset1 +
data->m_len1 - 1);
737 size_t i2 (
data->m_offset2 +
data->m_len2 - 1);
740 unsigned char Key (backtrace[k]);
752 while(k > 0 && (Key &
kMaskEc)) {
755 Key = backtrace[k--];
764 while(k > 0 && (Key &
kMaskFc)) {
781 const size_t N1 (
data->m_len1 + 1);
782 const size_t N2 (
data->m_len2 + 1);
785 data->m_transcript.clear();
786 data->m_transcript.reserve(N1 + N2);
788 size_t k (N1*N2 - 1);
789 size_t i1 (
data->m_offset1 +
data->m_len1 - 1);
790 size_t i2 (
data->m_offset2 +
data->m_len2 - 1);
792 size_t sw_k = backtrace.
BestPos();
793 data->FillEdgeGaps(k - sw_k,
true);
794 i1 -= (k - sw_k) / (
data->m_len2+1);
795 i2 -= (k - sw_k) % (
data->m_len2+1);
804 unsigned char Key (backtrace[k]);
807 score -= sm[(size_t)(
m_Seq1[i1])][(size_t)(
m_Seq2[i2])];
816 while(k > 0 && (Key &
kMaskEc)) {
819 Key = backtrace[k--];
828 while(k > 0 && (Key &
kMaskFc)) {
839 "negative score in Smith-Waterman back trace");
841 data->FillEdgeGaps(k,
false);
847 size_t dim = guides.size();
850 for(
size_t i = 0;
i < dim;
i += 4) {
852 if( guides[
i] > guides[
i+1] || guides[
i+2] > guides[
i+3] ) {
853 err =
"Pattern hits must be specified in plus strand";
858 if(guides[
i] <= guides[
i-3] || guides[
i+2] <= guides[
i-2]){
859 err =
"Pattern hits coordinates must be sorted";
864 size_t dim1 = guides[
i + 1] - guides[
i];
865 size_t dim2 = guides[
i + 3] - guides[
i + 2];
867 err =
"Pattern hits must have equal length on both sequences";
872 err =
"One or several pattern hits are out of range";
878 err =
"Pattern must have a dimension multiple of four";
936 size_t i1 (0), i2 (0),
i (0);
938 for (
Int8 k (dim - 1); k >= 0; --k) {
1045 m_abc = g_nwaligner_nucleotides;
1046 const size_t dim = strlen(
m_abc);
1047 vector<TNCBIScore> iupacna (dim*dim,
m_Wms);
1048 iupacna[0] = iupacna[dim+1] = iupacna[2*(dim+1)] =
1049 iupacna[3*(dim+1)] =
m_Wm;
1051 iupacna_psm.
symbols = g_nwaligner_nucleotides;
1052 iupacna_psm.
scores = &iupacna.front();
1067 memset(Flags, 0,
sizeof Flags);
1068 const size_t abc_size = strlen(
m_abc);
1071 for(k = 0; k < abc_size; ++k) {
1072 Flags[unsigned(
toupper((
unsigned char)
m_abc[k]))] = 1;
1073 Flags[unsigned(
tolower((
unsigned char)
m_abc[k]))] = 1;
1074 Flags[unsigned(k)] = 1;
1077 for(k = 0; k <
len; ++k) {
1078 if(Flags[
unsigned(seq[k])] == 0)
1101 const size_t gdim (
m_guides.size());
1107 mem = double(dim1) * dim2 * elem_size;
1110 for(
size_t i (4);
i < gdim;
i += 4) {
1114 mem = double(dim1) * dim2 * elem_size;
1123 mem = double(dim1) * dim2 * elem_size;
1143 size_t start1,
size_t start2)
const
1157 size_t dim (transcript.size());
1164 const char* p1 (
m_Seq1 + start1);
1165 const char* p2 (
m_Seq2 + start2);
1194 for(
size_t endi = dim; endi-- > 0; ) {
1204 for(;
i < dim; ++
i) {
1216 unsigned char c1 = *p1;
1217 unsigned char c2 = *p2;
1218 score += sm[c1][c2];
1221 state1 = state2 = 0;
1227 if(state1 != 1) score +=
m_Wg;
1228 state1 = 1; state2 = 0;
1236 if(state2 != 1) score +=
m_Wg;
1237 state1 = 0; state2 = 1;
1254 for(
size_t i = 0;
i < dim; ++
i) {
1264 for(
size_t i = 0;
i < dim; ++
i) {
1274 for(
Int8 i = dim - 1;
i >= 0; --
i) {
1284 for(
Int8 i = dim - 1;
i >= 0; --
i) {
1297 size_t* s0,
size_t* s1,
1298 size_t min_size)
const
1301 size_t cur = 0, maxseg = 0;
1304 size_t i0 = 0, j0 = 0, imax = i0, jmax = j0;
1306 for(
Int8 k = trdim - 1; k >= 0; --k) {
1316 if(maxseg >= min_size)
goto ret_point;
1328 if(maxseg >= min_size)
goto ret_point;
1348 if(maxseg >= min_size)
goto ret_point;
1372 *q0 = imax; *s0 = jmax;
1373 *q1 = *q0 + maxseg - 1;
1374 *s1 = *s0 + maxseg - 1;
1384 a(i1),
b(i2),
fp(fp0) {}
1397 const char* beg,
const char* end,
size_t& err_index)
1403 unsigned char fp = 0,
code;
1404 for(
const char* p = beg; p < end; ++p) {
1406 case 'A':
code = 0;
break;
1407 case 'G':
code = 0x01;
break;
1408 case 'T':
code = 0x02;
break;
1409 case 'C':
code = 0x03;
break;
1410 default: err_index = p - beg;
return 0x40;
1420 const char* beg,
const char* end,
1421 unsigned char fingerprint,
size_t size,
1425 if(beg +
size > end) {
1430 const char* p0 = beg;
1432 size_t err_idx = 0; --p0;
1433 unsigned char fp = 0x40;
1434 while(
fp == 0x40 && p0 < end) {
1444 while(
fp != fingerprint && ++p0 < end) {
1446 switch(*(p0 +
size - 1)) {
1447 case 'A':
code = 0;
break;
1448 case 'G':
code = 0x01;
break;
1449 case 'T':
code = 0x02;
break;
1450 case 'C':
code = 0x03;
break;
1451 default: err_index = p0 +
size - 1 - beg;
1463 const size_t guide_core)
1465 if(guide_core > guide_size) {
1471 vector<nwaln_mrnaseg> segs;
1474 for(
size_t i = 0;
i + guide_size <=
m_SeqLen1; ) {
1476 const char* end =
m_Seq1 +
i + guide_size;
1487 vector<nwaln_mrnaguide> guides;
1489 const char* beg =
m_Seq2 + idx;
1491 for(
size_t i = 0, seg_count = segs.size();
1492 beg + guide_size <= end &&
i < seg_count; ++
i) {
1495 const char* beg0 = beg;
1496 while( p == 0 && beg + guide_size <= end ) {
1499 guide_size, err_idx );
1504 const char* seq1 =
m_Seq1 + segs[
i].a;
1505 const char* seq2 = p;
1507 for(k = 0; k < guide_size; ++k) {
1508 if(seq1[k] != seq2[k])
break;
1510 if(k == guide_size) {
1511 size_t i1 = segs[
i].a;
1512 size_t i2 = segs[
i].b;
1513 size_t i3 = seq2 -
m_Seq2;
1514 size_t i4 = i3 + guide_size - 1;
1515 size_t guides_dim = guides.size();
1516 if( guides_dim == 0 ||
1517 i1 - 1 > guides[guides_dim - 1].q1 ||
1518 i3 - 1 > guides[guides_dim - 1].s1 ) {
1522 guides[guides_dim - 1].q1 = i2;
1523 guides[guides_dim - 1].s1 = i4;
1525 beg0 = p + guide_size;
1537 size_t guides_dim = guides.size();
1540 const size_t offs = guide_core/2 - 1;
1541 for(
size_t k = 0; k < guides_dim; ++k) {
1542 size_t q0 = (guides[k].q0 + guides[k].q1) / 2;
1543 size_t s0 = (guides[k].s0 + guides[k].s1) / 2;
1557 size_t* s0,
size_t* s1,
1558 size_t min_size)
const
1561 size_t cur = 0, maxseg = 0;
1564 const char* p1 = seq1_end - 1;
1565 const char* p2 = seq2_end - 1;
1567 imax = i0, jmax = j0;
1569 for(
size_t k = 0; k < trdim; ++k) {
1579 if(maxseg >= min_size)
goto ret_point;
1591 if(maxseg >= min_size)
goto ret_point;
1611 if(maxseg >= min_size)
goto ret_point;
1635 *q1 = imax; *s1 = jmax;
1636 *q0 = imax - maxseg + 1;
1637 *s0 = jmax - maxseg + 1;
1644 size_t* s0,
size_t* s1)
const
1647 size_t cur = 0, maxseg = 0;
1650 size_t i0 = 0, j0 = 0, imax = i0, jmax = j0;
1652 for(
Int8 k = trdim-1; k >= 0; --k) {
1713 *q0 = imax; *s0 = jmax;
1714 *q1 = *q0 + maxseg - 1;
1715 *s1 = *s0 + maxseg - 1;
1725 bool trim_end_gaps)
const
1736 return fmt.AsDenseSeg(query_start, query_strand,
1737 subj_start, subj_strand,
flags);
1747 bool trim_end_gaps)
const
1760 fmt.SetSeqIds(id0, id1);
1762 return fmt.AsDenseSeg(query_start, query_strand,
1763 subj_start, subj_strand,
flags);
@ eExtreme_Biological
5' and 3'
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string Sample usage:
static unsigned int GetCpuCount(void)
Return number of active CPUs/cores (never less than 1).
void SetBestPos(size_t k)
size_t MakePattern(const size_t hit_size=100, const size_t core_size=28)
size_t GetRightSeg(size_t *q0, size_t *q1, size_t *s0, size_t *s1, size_t min_size) const
void SetAt(size_t i, Uint1 v)
virtual bool x_CheckMemoryLimit(void)
void SetProgressCallback(FProgressCallback prg_callback, void *data)
virtual TScore x_Align(SAlignInOut *data)
virtual TScore ScoreFromTranscript(const TTranscript &transcript, size_t start1=kMax_UInt, size_t start2=kMax_UInt) const
virtual ETranscriptSymbol x_GetDiagTS(size_t i1, size_t i2) const
EGapPreference GetGapPreference() const
vector< size_t > m_guides
virtual void SetSequences(const char *seq1, size_t len1, const char *seq2, size_t len2, bool verify=true)
friend class CNWAlignerThread_Align
TTranscript GetTranscript(bool reversed=true) const
SNCBIFullScoreMatrix m_ScoreMatrix
const char * x_FindFingerPrint64(const char *beg, const char *end, unsigned char fingerprint, size_t size, size_t &err_index)
bool IsSmithWaterman() const
void SetBestScore(TNCBIScore score)
unsigned char x_CalcFingerPrint64(const char *beg, const char *end, size_t &err_index)
TScore GetScore(void) const
size_t GetLongestSeg(size_t *q0, size_t *q1, size_t *s0, size_t *s1) const
size_t x_CheckSequence(const char *seq, size_t len) const
bool m_ScoreMatrixInvalid
FProgressCallback m_prg_callback
string GetTranscriptString(void) const
void x_SWDoBackTrace(const CBacktraceMatrix4 &backtrace, SAlignInOut *data)
TNCBIScore BestScore() const
virtual TScore x_Run(void)
void EnableMultipleThreads(bool enable=true)
vector< ETranscriptSymbol > TTranscript
void SetSmithWaterman(bool SW)
bool m_PositivesAsMatches
void GetEndSpaceFree(bool *L1, bool *R1, bool *L2, bool *R2) const
void SetTranscript(const TTranscript &transcript)
void SetScoreMatrix(const SNCBIPackedScoreMatrix *scoremat)
EGapPreference m_GapPreference
void SetWms(TScore value)
CRef< objects::CDense_seg > GetDense_seg(TSeqPos query_start, objects::ENa_strand query_strand, TSeqPos subj_start, objects::ENa_strand subj_strand, bool trim_end_gaps=false) const
virtual size_t GetElemSize(void) const
size_t GetLeftSeg(size_t *q0, size_t *q1, size_t *s0, size_t *s1, size_t min_size) const
void SetPattern(const vector< size_t > &pattern)
void x_DoBackTrace(const CBacktraceMatrix4 &backtrace, SAlignInOut *data)
void SetGapPreference(EGapPreference p)
Control preference for where to place a gap if there is a choice; default is eLater,...
void SetEndSpaceFree(bool Left1, bool Right1, bool Left2, bool Right2)
static bool PSpace(const SAlignInOut *p1, const SAlignInOut *p2)
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
ENa_strand GetStrand(void) const
Get the location's strand.
TSeqPos GetStart(ESeqLocExtremes ext) const
Return start and stop positions of the seq-loc.
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
int64_t Int8
8-byte (64-bit) signed integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
bool Run(TRunMode flags=fRunDefault)
Run the thread.
@ eType_partial
mapping pieces together
ENa_strand
strand of nucleic acid
bool IsWhole(void) const
Check if variant Whole is selected.
bool IsInt(void) const
Check if variant Int is selected.
const struct ncbi::grid::netcache::search::fields::SIZE size
#define F(x)
Make a parametrized function appear to have only one variable.
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
const unsigned char kMaskFc
const unsigned char kMaskEc
const unsigned char kMaskE
const unsigned char kMaskD
bool NW_RequestNewThread(const unsigned int max_threads)
#define NCBI_FSM_DIM
Recommended approach: unpack and index directly.
void NCBISM_Unpack(const SNCBIPackedScoreMatrix *psm, SNCBIFullScoreMatrix *fsm)
Expand a packed score matrix into an unpacked one, which callers can proceed to index directly by sta...
const char g_msg_InconsistentArguments[]
const char g_msg_HitSpaceLimit[]
const char g_msg_InvalidTranscriptSymbol[]
const char g_msg_NoAlignment[]
const char g_msg_OutOfSpace[]
const char g_msg_NullParameter[]
const char g_msg_DataNotAvailable[]
const TNCBIScore * scores
strlen(symbols) x strlen(symbols)
TNCBIScore defscore
score for unknown residues
const char * symbols
order of residues
nwaln_mrnaguide(size_t i1, size_t i2, size_t i3, size_t i4)
naive pattern generator (a la Rabin-Karp)
nwaln_mrnaseg(size_t i1, size_t i2, unsigned char fp0)
int g(Seg_Gsm *spe, Seq_Mtf *psm, Thd_Gsm *tdg)