54 const char id_not_set[] =
"ID_not_set";
91 score->SetId().SetStr(
"global_score");
93 seqalign->
SetScore().push_back(score);
100 ITERATE(
string, ii, transcript) {
109 Int8 endi = transcript.size() - 1;
113 for( ;ind < endi; ++ind) {
118 length = endi + 1 - ind;
122 length = transcript.size();
130 idty = double(matches) / length;
133 score->SetId().SetStr(
"identity");
134 score->SetValue().SetReal(idty);
135 seqalign->
SetScore().push_back(score);
139 subj_start, subj_strand,
flags);
142 seqalign->
SetSegs().SetDenseg(*rds);
144 seqalign->
SetSegs().SetDendiag();
160 if(transcript.size() == 0) {
171 vector<CNWAligner::ETranscriptSymbol>::const_iterator
172 ib = stranscript.begin(),
173 ie = stranscript.end();
174 while( ( ie != ib ) &&
187 subj_start, subj_strand,
195 ids.push_back(id_query);
199 ids.push_back(id_subj);
227 const size_t min_query_size = 4;
228 if(
int(m_box[1] - m_box[0] + 1) <
int(min_query_size)) {
234 int len_total = (
int)m_details.size();
236 string::iterator irs0 = m_details.begin(),
237 irs1 = m_details.end(), irs;
239 for(irs = irs0; irs != irs1; ++irs) {
246 string::reverse_iterator rirs0 = m_details.rbegin(),
247 rirs1 = m_details.rend(), rirs = rirs0;
248 int cnt = 0, max_cnt = 20;
251 for( ; ( rirs != rirs1 ) && (
cnt != max_cnt) ; ++rirs, ++
cnt) {
260 int i0_max = 0, i1_max = 0;
261 string::iterator irs_max;
268 const double dropoff_diff = .19;
271 for(irs = irs0; irs != irs1; ++irs) {
305 match_total -=
match;
316 while(i0_max > 0 && i1_max > 0) {
317 if(
toupper(seq1[m_box[0]+i0_max-1]) !=
'N' && seq1[m_box[0]+i0_max-1] == seq2[m_box[2]+i1_max-1] ) {
328 if(i0_max == 0 && i1_max == 0)
return;
331 if(m_box[1] - m_box[0] + 1 - i0_max >= min_query_size )
336 const size_t L = irs_max - irs0 + 1;
337 m_details.erase(0, L);
338 m_details.insert(m_details.begin(),
head,
'M');
342 if(m_annot.size() > 2 && m_annot[2] ==
'<') {
343 int j1 =
int(m_box[2]) - 2;
344 char c1 = j1 >= 0? seq2[j1]:
' ';
346 int j2 =
int(m_box[2]) - 1;
347 char c2 = j2 >= 0? seq2[j2]:
' ';
360 const size_t min_query_size = 4;
362 int i0 =
int(m_box[1] - m_box[0] + 1), i0_max = i0;
363 if(i0 <
int(min_query_size)) {
369 int i1 =
int(m_box[3] - m_box[2] + 1), i1_max = i1;
378 string::reverse_iterator irs0 = m_details.rbegin(),
379 irs1 = m_details.rend(), irs = irs0, irs_max = irs0;
381 for( ; irs != irs1; ++irs) {
401 if(irs > irs0 && *(irs-1)!=
'I') s += wg;
408 if(irs > irs0 && *(irs-1)!=
'D') s += wg;
425 while(i0_max > 0 && i1_max > 0) {
426 if(
toupper (seq1[m_box[0]+i0_max-1]) !=
'N' && seq1[m_box[0]+i0_max-1] == seq2[m_box[2]+i1_max-1]) {
435 if(i0_max == 0 && i1_max == 0)
return;
438 if(m_box[1] - m_box[0] + 1 - i0_max >= min_query_size )
443 const size_t L = m_details.size() - (irs_max - irs0 + 1);
444 m_details.erase(0, L);
445 m_details.insert(m_details.begin(),
head,
'M');
449 if(m_annot.size() > 2 && m_annot[2] ==
'<') {
450 int j1 =
int(m_box[2]) - 2;
451 char c1 = j1 >= 0? seq2[j1]:
' ';
453 int j2 =
int(m_box[2]) - 1;
454 char c2 = j2 >= 0? seq2[j2]:
' ';
464 size_t gap_count = 0;
465 ITERATE(
string, irs, m_details) {
482 for(
size_t i = m_box[0];
i<=m_box[1]; ++
i) {
485 size_t gap_len = GapLength();
487 if( m_len * 70 <= 100 * (
i->second + gap_len) ) {
499 const size_t min_query_size = 4;
501 if(m_box[1] - m_box[0] + 1 < min_query_size) {
507 int len_total = (
int)m_details.size();
509 string::iterator irs0 = m_details.begin(),
510 irs1 = m_details.end(), irs;
512 for(irs = irs0; irs != irs1; ++irs) {
519 int cnt = 0, max_cnt = 20;
521 for( irs = irs0; ( irs != irs1 ) && (
cnt != max_cnt) ; ++irs, ++
cnt) {
530 const double dropoff_diff = .19;
532 int i0 =
int(m_box[1] - m_box[0] + 1), i0_max = i0;
533 int i1 =
int(m_box[3] - m_box[2] + 1), i1_max = i1;
536 string::reverse_iterator rirs0 = m_details.rbegin(),
537 rirs1 = m_details.rend(), rirs = rirs0, rirs_max;
540 for( ; rirs != rirs1; ++rirs) {
574 match_total -=
match;
581 int dimq =
int(m_box[1] - m_box[0] + 1);
582 int dims =
int(m_box[3] - m_box[2] + 1);
588 while(i0_max < dimq && i1_max < dims ) {
589 if(
toupper(seq1[m_box[0]+i0_max]) !=
'N' && seq1[m_box[0]+i0_max] == seq2[m_box[2]+i1_max]) {
598 if( i0_max >= dimq && i1_max >= dims )
return;
601 if(i0_max - 1 >=
int(min_query_size) ) {
603 m_box[1] = m_box[0] + i0_max - 1;
604 m_box[3] = m_box[2] + i1_max - 1;
606 m_details.resize(m_details.size() - (rirs_max - rirs0 + 1));
607 m_details.insert(m_details.end(), tail,
'M');
611 const size_t adim = m_annot.size();
612 if(adim > 2 && m_annot[adim - 3] ==
'>') {
615 const char c3 (m_box[3] + 1 < len2? seq2[m_box[3] + 1]:
' ');
616 const char c4 (m_box[3] + 2 < len2? seq2[m_box[3] + 2]:
' ');
617 m_annot[adim-2] = c3;
618 m_annot[adim-1] = c4;
631 const size_t min_query_size = 4;
633 if(m_box[1] - m_box[0] + 1 < min_query_size) {
639 int i0 = -1, i0_max = i0;
640 int i1 = -1, i1_max = i1;
649 string::iterator irs0 = m_details.begin(),
650 irs1 = m_details.end(), irs = irs0, irs_max = irs0;
652 for( ; irs != irs1; ++irs) {
672 if(irs > irs0 && *(irs-1) !=
'I') s += wg;
679 if(irs > irs0 && *(irs-1) !=
'D') s += wg;
692 int dimq =
int(m_box[1] - m_box[0] + 1);
693 int dims =
int(m_box[3] - m_box[2] + 1);
699 while(i0_max < dimq - 1 && i1_max < dims - 1) {
700 if(
toupper(seq1[m_box[0]+i0_max+1]) !=
'N' && seq1[m_box[0]+i0_max+1] == seq2[m_box[2]+i1_max+1] ) {
712 if(i0_max >= dimq - 1 && i1_max >= dims - 1)
return;
715 if(i0_max >=
int(min_query_size) ) {
717 m_box[1] = m_box[0] + i0_max;
718 m_box[3] = m_box[2] + i1_max;
720 m_details.resize(irs_max - irs0 + 1);
721 m_details.insert(m_details.end(), tail,
'M');
725 const size_t adim = m_annot.size();
726 if(adim > 2 && m_annot[adim - 3] ==
'>') {
729 const char c3 (m_box[3] + 1 < len2? seq2[m_box[3] + 1]:
' ');
730 const char c4 (m_box[3] + 2 < len2? seq2[m_box[3] + 2]:
' ');
731 m_annot[adim-2] = c3;
732 m_annot[adim-1] = c4;
742 Int8 mind0 = m_box[1] + 1;
744 Int8 gind = m_box[3] + 1;
745 for(; mind < (
int)mrna.size() && gind < (
int)genomic.size(); ++gind, ++mind) {
746 if(
toupper(mrna[mind]) ==
'N' || mrna[mind] != genomic[gind] )
break;
755 int mind0 = (
int)m_box[0] - 1;
757 int gind = (
int)m_box[2] - 1;
758 for(; mind >= 0 && gind >= 0; --mind, --gind) {
759 if(
toupper(mrna[mind]) ==
'N' || mrna[mind] != genomic[gind] )
break;
770 m_details.append(ext_len,
'M');
773 const size_t ann_dim = m_annot.size();
774 if(ann_dim > 2 && m_annot[ann_dim - 3] ==
'>') {
775 m_annot[ann_dim - 2] = (m_box[3] + 1) < genomic.size() ? genomic[m_box[3] + 1] :
' ';
776 m_annot[ann_dim - 1] = (m_box[3] + 2) < genomic.size() ? genomic[m_box[3] + 2] :
' ';
787 m_details.insert(m_details.begin(), ext_len,
'M');
790 if( ( m_annot.size() > 2 ) && ( m_annot[2] ==
'<' ) ) {
791 m_annot[1] = m_box[2] >= 1 ? genomic[m_box[2] - 1] :
' ';
792 m_annot[0] = m_box[2] >= 2 ? genomic[m_box[2] - 2] :
' ';
801 m_len = m_details.size();
803 string::const_iterator ib = m_details.begin(), ie = m_details.end();
805 for(string::const_iterator ii = ib; ii != ie; ++ii) {
806 if(*ii ==
'M') ++count;
808 m_idty = double(count) / m_len;
810 const size_t xcript_dim (m_details.size());
812 for(
size_t i (0);
i < xcript_dim; ++
i) {
816 m_score = float(paligner->CNWAligner::ScoreFromTranscript(transcript)) /
823 const size_t adim = m_annot.size();
825 (adim > 2 && m_annot[adim - 3] ==
'>')? (m_annot.c_str() + adim - 2): 0;
831 const size_t adim = m_annot.size();
832 return (adim > 3 && m_annot[2] ==
'<')? m_annot.c_str(): 0;
837 const char* acceptor,
840 if(!donor || !acceptor)
return false;
845 if(acceptor[0] ==
'A') {
846 if(donor[0] ==
'G' && acceptor[1] ==
'G') {
847 rv = donor[1] ==
'T' || donor[1] ==
'C';
850 rv = donor[0] ==
'A' && donor[1] ==
'T' && acceptor[1] ==
'C';
858 rv = donor[0] ==
'G' && donor[1] ==
'T'
859 && acceptor[0] ==
'A' && acceptor[1] ==
'G';
869 copy(v.begin(), v.end(), psegments->begin());
875 if(transcript.size() == 0) {
879 vector<SSegment>& segments(*psegments);
882 bool esfL1, esfR1, esfL2, esfR2;
887 const char* p1 (start1);
888 const char* p2 (start2);
889 Int8 tr_idx_hi0 (transcript.size() - 1), tr_idx_hi (tr_idx_hi0);
890 Int8 tr_idx_lo0 (0), tr_idx_lo (tr_idx_lo0);
930 vector<char> trans_ex (tr_idx_hi - tr_idx_lo + 1);
932 for(
int tr_idx (tr_idx_hi); tr_idx >= tr_idx_lo; ) {
934 const char * p1_beg (p1), * p1_x (0);
935 const char * p2_beg (p2);
936 size_t matches (0), exon_aln_size (0), exon_aln_size_x(0);
939 vector<char>::iterator ii_ex (trans_ex.begin()), ii_ex_x;
940 size_t cons_dels (0);
941 const size_t max_cons_dels (25);
948 if(cons_dels > max_cons_dels) {
954 if(
toupper(*p1) !=
'N' && *p1 == *p2) {
968 exon_aln_size_x = exon_aln_size;
985 if(cons_dels > max_cons_dels) {
987 swap(ii_ex, ii_ex_x);
988 swap(exon_aln_size, exon_aln_size_x);
989 swap(tr_idx, tr_idx_x);
992 if(exon_aln_size > 0) {
998 s.
m_idty = float(matches) / exon_aln_size;
999 s.
m_len = exon_aln_size;
1001 size_t beg1 (p1_beg - start1), end1 (p1 - start1 - 1);
1002 size_t beg2 (p2_beg - start2), end2 (p2 - start2 - 1);
1009 char c1 ((p2_beg >= start2 + 2)? *(p2_beg - 2):
' ');
1010 char c2 ((p2_beg >= start2 + 1)? *(p2_beg - 1):
' ');
1011 char c3 ((p2 < start2 + len2)? *(p2):
' ');
1012 char c4 ((p2 < start2 + len2 - 1)? *(p2+1):
' ');
1017 const string s_exontag (
"<exon>");
1018 copy(s_exontag.begin(), s_exontag.end(), s.
m_annot.begin() + 2);
1021 s.
m_details.resize(ii_ex - trans_ex.begin());
1026 if(cons_dels > max_cons_dels) {
1033 s.
m_len = exon_aln_size_x - exon_aln_size;
1035 size_t beg1 (p1 - start1), end1 (p1_x - start1 - 1);
1036 size_t beg2 (0), end2 (0);
1048 swap(ii_ex, ii_ex_x);
1049 swap(exon_aln_size, exon_aln_size_x);
1050 swap(tr_idx, tr_idx_x);
1074 if(transcript.size() == 0) {
1080 const string strid_query =
m_Seq1Id->GetSeqIdString(
true);
1081 const string strid_subj =
m_Seq2Id->GetSeqIdString(
true);
1087 ss << '>
' << strid_query << '\t' << strid_subj << endl;
1089 vector<char> v1, v2;
1090 unsigned i1 (0), i2 (0);
1091 size_t aln_size (x_ApplyTranscript(&v1, &v2));
1092 for (size_t i = 0; i < aln_size; ) {
1094 ss << i << '\t' << i1 << ':
' << i2 << endl;
1096 for (size_t jPos = 0; i < aln_size && jPos < line_width; ++i, ++jPos) {
1097 char c1 (v1[i0 + jPos]);
1099 if(c1 != '-
' && c1 != 'x
' && c1 != '+
') ++i1;
1103 string marker_line(line_width, ' ');
1105 for (size_t jPos = 0; i < aln_size && jPos < line_width; ++i, ++jPos) {
1106 char c1 (v1[i0 + jPos]);
1107 char c2 (v2[i0 + jPos]);
1109 if(c2 != '-
' && c2 != '+
' && c2 != 'x
')
1111 if( c2 != '-
' && c1 != '-
' && c1 != '+
' && c1 != 'x
' && ( toupper(c2) != toupper(c1) || m_aligner->GetScoreMatrix().s[(size_t)c1][(size_t)c2] <= 0 ))
1112 marker_line[jPos] = '^
';
1114 ss << endl << marker_line << endl;
1119 case eFormatType2: {
1121 ss << '>
' << strid_query << '\t' << strid_subj << endl;
1123 vector<char> v1, v2;
1124 unsigned i1 (0), i2 (0);
1125 size_t aln_size (x_ApplyTranscript(&v1, &v2));
1126 for (size_t i = 0; i < aln_size; ) {
1127 ss << i << '\t' << i1 << ':
' << i2 << endl;
1129 for (size_t jPos = 0; i < aln_size && jPos < line_width; ++i, ++jPos) {
1130 char c (v1[i0 + jPos]);
1132 if(c != '-
' && c != '+
' && c != 'x
') ++i1;
1136 string line2 (line_width, ' ');
1137 string line3 (line_width, ' ');
1139 for (size_t jPos = 0; i < aln_size && jPos < line_width; ++i, ++jPos) {
1140 char c1 (v1[i0 + jPos]);
1141 char c2 (v2[i0 + jPos]);
1142 if(c2 != '-
' && c2 != '+
' && c2 != 'x
') i2++;
1143 if( toupper(c2) == toupper(c1) && m_aligner-> GetScoreMatrix().s[(size_t)c1][(size_t)c2] > 0 ) line2[jPos] = '|
';
1146 ss << line2 << endl << line3 << endl << endl;
1153 CRef<CSeq_align> sa = AsSeqAlign();
1154 CObjectOStreamAsn asn_stream (ss);
1156 asn_stream << Separator;
1160 case eFormatDenseSeg: {
1162 CRef<CDense_seg> ds = AsDenseSeg();
1163 CObjectOStreamAsn asn_stream (ss);
1165 asn_stream << Separator;
1169 case eFormatFastA: {
1170 vector<char> v1, v2;
1171 size_t aln_size (x_ApplyTranscript(&v1, &v2));
1173 ss << '>
' << strid_query << endl;
1174 const vector<char>* pv = &v1;
1175 for(size_t i = 0; i < aln_size; ) {
1176 for(size_t j = 0; j < line_width && i < aln_size; ++j, ++i) {
1182 ss << '>
' << strid_subj << endl;
1184 for(size_t i = 0; i < aln_size; ) {
1185 for(size_t j = 0; j < line_width && i < aln_size; ++j, ++i) {
1193 case eFormatExonTable:
1194 case eFormatExonTableEx: {
1198 typedef deque<SSegment> TSegments;
1200 MakeSegments(&segments);
1201 ITERATE(TSegments, ii, segments) {
1203 ss << strid_query << '\t' << strid_subj << '\t';
1204 ss << ii->m_idty << '\t' << ii->m_len << '\t';
1205 copy(ii->m_box, ii->m_box + 4,
1206 ostream_iterator<size_t>(ss,"\t"));
1207 ss << '\t' << ii->m_annot;
1208 if(type == eFormatExonTableEx) {
1209 ss << '\t' << ii->m_details;
1217 NCBI_THROW(CAlgoAlignException, eBadParameter, "Incorrect format specified");
1220 *output = CNcbiOstrstreamToString(ss);
1225 // Transform source sequences according to the transcript.
1226 // Cut flanking gaps for Smith-Waterman alignments.
1227 // Write the results to v1 and v2 leaving source sequences intact.
1228 // Return alignment size.
1229 size_t CNWFormatter::x_ApplyTranscript(vector<char>* pv1, vector<char>* pv2)
1232 const CNWAligner::TTranscript transcript = m_aligner->GetTranscript();
1234 vector<char>& v1 (*pv1);
1235 vector<char>& v2 (*pv2);
1240 if(transcript.size() == 0) {
1245 vector<CNWAligner::ETranscriptSymbol>::const_reverse_iterator
1246 ib = transcript.rbegin(),
1247 ie = transcript.rend(),
1250 if( m_aligner->IsSmithWaterman() ) {
1252 while( ( ie != ib ) &&
1253 ( *ie == CNWAligner::eTS_Insert || *ie == CNWAligner::eTS_Delete || *ie == CNWAligner::eTS_Intron ||
1254 *ie == CNWAligner::eTS_SlackInsert || *ie == CNWAligner::eTS_SlackDelete ) ) {
1259 const char* iv1 (m_aligner->GetSeq1());
1260 const char* iv2 (m_aligner->GetSeq2());
1262 bool sw_ini_gap = false;
1263 if( m_aligner->IsSmithWaterman() ) {
1267 for (ii = ib; ii != ie; ii++) {
1269 CNWAligner::ETranscriptSymbol ts (*ii);
1273 case CNWAligner::eTS_Insert:
1278 case CNWAligner::eTS_SlackInsert:
1283 case CNWAligner::eTS_Delete:
1289 case CNWAligner::eTS_SlackDelete:
1294 case CNWAligner::eTS_Match:
1295 case CNWAligner::eTS_Replace:
1301 case CNWAligner::eTS_Intron:
User-defined methods of the data storage class.
void TrimEndGaps()
Trim leading/trailing gaps if possible.
void FromTranscript(TSeqPos query_start, ENa_strand query_strand, TSeqPos subj_start, ENa_strand subj_strand, const string &transcript)
Initialize from pairwise alignment transcript (a string representation produced by CNWAligner)
container_type::iterator iterator
const_iterator begin() const
const_iterator end() const
const char * GetSeq2(void) const
TTranscript GetTranscript(bool reversed=true) const
const char * GetSeq1(void) const
bool IsSmithWaterman() const
size_t GetSeqLen2(void) const
TScore GetScore(void) const
string GetTranscriptString(void) const
vector< ETranscriptSymbol > TTranscript
void GetEndSpaceFree(bool *L1, bool *R1, bool *L2, bool *R2) const
unsigned int TSeqPos
Type for sequence locations and lengths.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
void swap(NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair1, NCBI_NS_NCBI::pair_base_member< T1, T2 > &pair2)
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
void Reset(void)
Reset reference object.
TObjectType * GetNonNullPointer(void) const
Get pointer value and throw a null pointer exception if pointer is null.
int64_t Int8
8-byte (64-bit) signed integer
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
TStr & SetStr(void)
Select the variant.
TScore & SetScore(void)
Assign a value to Score data member.
void SetSegs(TSegs &value)
Assign a value to Segs data member.
void SetDim(TDim value)
Assign a value to Dim data member.
void SetType(TType value)
Assign a value to Type data member.
vector< CRef< CSeq_id > > TIds
TIds & SetIds(void)
Assign a value to Ids data member.
@ eType_partial
mapping pieces together
ENa_strand
strand of nucleic acid
TLocal & SetLocal(void)
Select the variant.
unsigned int
A callback function used to compare two keys in a database.
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
static int match(register const pcre_uchar *eptr, register const pcre_uchar *ecode, const pcre_uchar *mstart, int offset_top, match_data *md, eptrblock *eptrb, unsigned int rdepth)
static SQLCHAR output[256]
const char g_msg_NoAlignment[]