#include <ncbi_pch.hpp>
#include <corelib/ncbistd.hpp>
#include <objects/seqfeat/Genetic_code_table.hpp>
#include <objmgr/util/sequence.hpp>
#include <objmgr/seq_vector.hpp>
48 const char CProteinAlignText::GAP_CHAR='-'; // used in dna and protein text
49 const char CProteinAlignText::SPACE_CHAR=' '; // translation and protein
50 const char CProteinAlignText::INTRON_CHAR='.'; // protein
54 // used in match text
59 const char CProteinAlignText::MATCH_CHAR='|';
60 const char CProteinAlignText::POSIT_CHAR='+';
63 void CProteinAlignText::AddSpliceText(CSeqVector_CI& genomic_ci, int& nuc_prev, char match)
64 {
65  AddDNAText(genomic_ci,nuc_prev,2);
67  m_match.append((SIZE_TYPE)2,match);
68  m_protein.append((SIZE_TYPE)2,INTRON_CHAR);
69 }
71 void CProteinAlignText::AddDNAText(CSeqVector_CI& genomic_ci, int& nuc_prev, int len)
72 {
73  string buf;
74  genomic_ci.GetSeqData(buf,len);
75  nuc_prev +=len;
76  m_dna.append(buf);
77 }
79 void CProteinAlignText::AddProtText(CSeqVector_CI& protein_ci, int& prot_prev, int len)
80 {
81  m_protein.reserve(m_protein.size()+len);
83  int phase = (prot_prev+1)%3;
85  if (phase!=0) {
86  size_t prev_not_intron_pos = m_protein.find_last_not_of(INTRON_OR_GAP,m_protein.size()-1);
87  char aa = m_protein[prev_not_intron_pos];
88  _ASSERT( aa != SPACE_CHAR );
89  int added_len = min(3-phase,len);
90  if (prev_not_intron_pos == m_protein.size()-1 && phase+added_len==3 && (phase==1 || m_protein[prev_not_intron_pos-1]==aa)) {
91  m_protein.append(added_len,SPACE_CHAR);
92  m_protein[m_protein.size()-3] = SPACE_CHAR;
93  m_protein[m_protein.size()-2] = toupper(aa);
94  } else {
95  m_protein.append(added_len,aa);
96  }
97  len -= added_len;
98  prot_prev += added_len;
99  }
101  if (len > 0) {
102  string buf;
103  protein_ci.GetSeqData(buf,(len+2)/3);
104  const char* p = buf.c_str();
106  while (len >= 3) {
107  m_protein.push_back(SPACE_CHAR);
108  m_protein.push_back(*p++);
109  m_protein.push_back(SPACE_CHAR);
110  len -=3;
111  prot_prev += 3;
112  }
113  if (len > 0) {
114  m_protein.append(len,tolower(*p));
115  }
116  prot_prev += len;
117  }
118 }
120 // translate last len bases in m_dna
121 // plus spliced codon in prev exon if at the start of exon
122 void CProteinAlignText::TranslateDNA(int phase, size_t len, bool is_insertion)
123 {
124  _ASSERT( m_translation.size()+len ==m_dna.size() );
125  _ASSERT( phase==0 || m_dna.size()>0 );
127  m_translation.reserve(m_translation.size()+len);
128  size_t start_pos = m_dna.size()-len;
129  const char INTRON[] = {INTRON_CHAR,0};
130  if (phase != 0) {
131  size_t prev_exon_pos = 0;
132  if (phase+len >=3 &&
133  ((prev_exon_pos=m_protein.find_last_not_of(is_insertion?INTRON:INTRON_OR_GAP,start_pos-1))!=start_pos-1 ||
134  m_dna[start_pos]==GAP_CHAR) &&
135  m_match[prev_exon_pos]!=BAD_PIECE_CHAR) {
136  string codon = m_dna.substr(prev_exon_pos-phase+1,phase)+m_dna.substr(start_pos,3-phase);
137  char aa = (codon[0]!=GAP_CHAR && codon[1]!=GAP_CHAR) ? TranslateTriplet(*m_trans_table, codon) : SPACE_CHAR;
138  for( size_t i = prev_exon_pos-phase+1; i<=prev_exon_pos;++i) {
139  m_translation[i] = tolower(aa);
140  m_match[i] = MatchChar(i);
141  }
142  m_translation.append((SIZE_TYPE)(3-phase),m_dna[start_pos]!=GAP_CHAR?tolower(aa):SPACE_CHAR);
143  } else {
144  m_translation.append(min(len,(SIZE_TYPE)(3-phase)),SPACE_CHAR);
145  }
146  start_pos += min(len,(SIZE_TYPE)(3-phase));
147  }
149  if (m_dna[start_pos]!=GAP_CHAR) {
150  char aa[] = " ";
151  for ( ; start_pos+3 <= m_dna.size(); start_pos += 3) {
152  aa[1] = TranslateTriplet(*m_trans_table, m_dna.substr(start_pos,3));
153  m_translation += aa;
154  }
155  }
157  if (start_pos < m_dna.size()) {
158  m_translation.append(m_dna.size()-start_pos,SPACE_CHAR);
159  }
161  _ASSERT( m_translation.size()==m_dna.size() );
162 }
165 {
166  char m = SPACE_CHAR;
167  if (m_translation[i] != SPACE_CHAR && m_protein[i] != SPACE_CHAR) {
168  if(toupper(m_protein[i]) != 'X') {
169  if (m_translation[i] == m_protein[i]) {
170  m = MATCH_CHAR;
171  } else if(m_matrix.s[toupper(m_protein[i])]
172  [toupper(m_translation[i])] > 0)
173  {
174  m = POSIT_CHAR;
175  }
176  }
177  }
178  return m;
179 }
181 void CProteinAlignText::MatchText(size_t len, bool is_match)
182 {
183  _ASSERT( m_translation.size() == m_protein.size() );
184  _ASSERT( m_translation.size() == m_match.size()+len );
186  m_match.reserve(m_match.size()+len);
188  for (size_t i = m_translation.size()-len; i < m_translation.size(); ++i) {
189  m_match.push_back((is_match && islower(m_protein[i]))?MATCH_CHAR:MatchChar(i));
190  }
191 }
194  const string& triplet)
195 {
196  return table.GetCodonResidue(
197  table.SetCodonState(triplet[0], triplet[1], triplet[2]));
198 }
201  bool prev_3_prime_splice, bool cur_5_prime_splice,
202  CSeqVector_CI& genomic_ci, CSeqVector_CI& protein_ci,
203  int& nuc_prev, int& prot_prev,
204  int nuc_cur_start, int prot_cur_start)
205 {
206  _ASSERT( m_dna.size() == m_translation.size() );
207  _ASSERT( m_match.size() == m_protein.size() );
208  _ASSERT( m_dna.size() == m_protein.size() );
210  int prot_hole_len = prot_cur_start - prot_prev -1;
211  int nuc_hole_len = nuc_cur_start - nuc_prev -1;
213  bool can_show_splices = prot_hole_len < nuc_hole_len -4;
214  if (can_show_splices && prev_3_prime_splice) {
215  AddSpliceText(genomic_ci,nuc_prev, BAD_PIECE_CHAR);
216  nuc_hole_len = nuc_cur_start - nuc_prev -1;
217  }
218  if (can_show_splices && cur_5_prime_splice) {
219  nuc_cur_start -= 2;
220  nuc_hole_len = nuc_cur_start - nuc_prev -1;
221  }
223  SIZE_TYPE hole_len = max(prot_hole_len,nuc_hole_len);
224  _ASSERT( prot_hole_len>0 || nuc_hole_len>0 );
225  int left_gap = 0;
227  left_gap = (prot_hole_len-nuc_hole_len)/2;
228  if (left_gap>0)
229  m_dna.append((SIZE_TYPE)left_gap,GAP_CHAR);
230  if (nuc_hole_len>0)
231  AddDNAText(genomic_ci,nuc_prev,nuc_hole_len);
232  if (prot_hole_len>nuc_hole_len)
233  m_dna.append((SIZE_TYPE)(prot_hole_len-nuc_hole_len-left_gap),GAP_CHAR);
235  m_translation.append(hole_len,SPACE_CHAR);
236  m_match.append(hole_len,BAD_PIECE_CHAR);
238  left_gap = (nuc_hole_len-prot_hole_len)/2;
239  if (left_gap>0)
240  m_protein.append((SIZE_TYPE)left_gap,GAP_CHAR);
241  if (prot_hole_len>0)
242  AddProtText(protein_ci,prot_prev,prot_hole_len);
243  if (prot_hole_len<nuc_hole_len)
244  m_protein.append((SIZE_TYPE)(nuc_hole_len-prot_hole_len-left_gap),
245  GAP_CHAR);
247  if (can_show_splices && cur_5_prime_splice) {
248  AddSpliceText(genomic_ci,nuc_prev, BAD_PIECE_CHAR);
249  }
250  _ASSERT( m_dna.size() == m_translation.size() );
251  _ASSERT( m_match.size() == m_protein.size() );
252  _ASSERT( m_dna.size() == m_protein.size() );
253 }
256 {
257 }
259 CProteinAlignText::CProteinAlignText(objects::CScope& scope, const objects::CSeq_align& seqalign,
260  const string& matrix_name)
261 {
262  const CSpliced_seg& sps = seqalign.GetSegs().GetSpliced();
264  ENa_strand strand = sps.GetGenomic_strand();
266  const CSeq_id& protid = sps.GetProduct_id();
267  int prot_len = sps.GetProduct_length()*3;
268  CSeqVector protein_seqvec(scope.GetBioseqHandle(protid), CBioseq_Handle::eCoding_Iupac);
269  CSeqVector_CI protein_ci(protein_seqvec);
271  CRef<CSeq_loc> genomic_seqloc = GetGenomicBounds(scope, seqalign);
272  CSeqVector genomic_seqvec(*genomic_seqloc, scope, CBioseq_Handle::eCoding_Iupac);
273  CSeqVector_CI genomic_ci(genomic_seqvec);
275  int gcode = 1;
276  try {
277  const CSeq_id* sid = genomic_seqloc->GetId();
278  CBioseq_Handle hp = scope.GetBioseqHandle(*sid);
279  gcode = sequence::GetOrg_ref(hp).GetGcode();
280  } catch (...) {}
284  const SNCBIPackedScoreMatrix* packed_mtx =
285  NCBISM_GetStandardMatrix(matrix_name.c_str());
286  if (packed_mtx == NULL)
287  NCBI_THROW(CException, eUnknown, "unknown scoring matrix: "+matrix_name);
288  NCBISM_Unpack(packed_mtx, &m_matrix);
290  int nuc_from = genomic_seqloc->GetTotalRange().GetFrom();
291  int nuc_to = genomic_seqloc->GetTotalRange().GetTo();
292  int nuc_prev = -1;
293  int prot_prev = -1;
294  bool prev_3_prime_splice = false;
295  int prev_genomic_ins = 0;
296  ITERATE(CSpliced_seg::TExons, e_it, sps.GetExons()) {
297  const CSpliced_exon& exon = **e_it;
298  int prot_cur_start = exon.GetProduct_start().AsSeqPos();
299 #ifdef _DEBUG
300  int prot_cur_end = exon.GetProduct_end().AsSeqPos();
301 #endif
302  int nuc_cur_start = exon.GetGenomic_start();
303  int nuc_cur_end = exon.GetGenomic_end();
304  if (strand==eNa_strand_plus) {
305  nuc_cur_start -= nuc_from;
306  nuc_cur_end -= nuc_from;
307  } else {
308  swap(nuc_cur_start,nuc_cur_end);
309  nuc_cur_start = nuc_to - nuc_cur_start;
310  nuc_cur_end = nuc_to - nuc_cur_end;
311  }
312  bool cur_5_prime_splice = exon.CanGetAcceptor_before_exon() && exon.GetAcceptor_before_exon().CanGetBases() && exon.GetAcceptor_before_exon().GetBases().size()==2;
313  bool hole_before =
314  prot_prev+1 != prot_cur_start || !( (prev_3_prime_splice && cur_5_prime_splice) || (prot_cur_start==0 && nuc_cur_start==0) );
316  if (hole_before) {
317  AddHoleText(prev_3_prime_splice, cur_5_prime_splice,
318  genomic_ci, protein_ci,
319  nuc_prev, prot_prev,
320  nuc_cur_start, prot_cur_start);
321  prev_genomic_ins = 0;
322  } else { //intron
323  int intron_len = nuc_cur_start - nuc_prev -1;
324  AddDNAText(genomic_ci, nuc_prev, intron_len);
325  m_translation.append(intron_len,SPACE_CHAR);
326  m_match.append(intron_len,MISMATCH_CHAR);
327  m_protein.append(intron_len,INTRON_CHAR);
328  }
330  _ASSERT( m_dna.size() == m_translation.size() );
331  _ASSERT( m_match.size() == m_protein.size() );
332  _ASSERT( m_dna.size() == m_protein.size() );
334  prev_3_prime_splice = exon.CanGetDonor_after_exon() && exon.GetDonor_after_exon().CanGetBases() && exon.GetDonor_after_exon().GetBases().size()==2;
336  ITERATE(CSpliced_exon::TParts, p_it, exon.GetParts()) {
337  const CSpliced_exon_chunk& chunk = **p_it;
338  if (!chunk.IsGenomic_ins())
339  prev_genomic_ins = 0;
340  if (chunk.IsDiag() || chunk.IsMatch() || chunk.IsMismatch()) {
341  int len = 0;
342  if (chunk.IsDiag()) {
343  len = chunk.GetDiag();
344  } else if (chunk.IsMatch()) {
345  len = chunk.GetMatch();
346  } else if (chunk.IsMismatch()) {
347  len = chunk.GetMismatch();
348  }
349  AddDNAText(genomic_ci,nuc_prev,len);
350  TranslateDNA((prot_prev+1)%3,len,false);
351  AddProtText(protein_ci,prot_prev,len);
352  if (chunk.IsMismatch()) {
353  m_match.append(len,MISMATCH_CHAR);
354  } else
355  MatchText(len, chunk.IsMatch());
356  } else if (chunk.IsProduct_ins()) {
357  int len = chunk.GetProduct_ins();
358  m_dna.append(len,GAP_CHAR);
359  TranslateDNA((prot_prev+1)%3,len,false);
360  m_match.append(len,MISMATCH_CHAR);
361  AddProtText(protein_ci,prot_prev,len);
362  } else if (chunk.IsGenomic_ins()) {
363  unsigned len = chunk.GetGenomic_ins();
364  AddDNAText(genomic_ci,nuc_prev,len);
365  if (0<=prot_prev && prot_prev<prot_len-1 && (prot_prev+1)%3==0)
366  TranslateDNA(prev_genomic_ins,len,true);
367  else
368  m_translation.append(len,SPACE_CHAR);
369  prev_genomic_ins = (prev_genomic_ins+len)%3;
370  m_match.append(len,MISMATCH_CHAR);
371  m_protein.append(len,GAP_CHAR);
372  }
373  _ASSERT(prot_prev <= prot_cur_end);
374  }
375  _ASSERT(prot_prev == prot_cur_end);
376  _ASSERT(nuc_prev == nuc_cur_end);
378  _ASSERT( m_dna.size() == m_translation.size() );
379  _ASSERT( m_match.size() == m_protein.size() );
380  _ASSERT( m_dna.size() == m_protein.size() );
381  }
383  int nuc_cur_start = nuc_to - nuc_from +1;
384  int prot_cur_start = prot_len;
385  if (prot_prev+1 != prot_cur_start || nuc_prev+1 != nuc_cur_start) {
386  bool cur_5_prime_splice = false;
387  AddHoleText(prev_3_prime_splice, cur_5_prime_splice,
388  genomic_ci, protein_ci,
389  nuc_prev, prot_prev,
390  nuc_cur_start, prot_cur_start);
391  }
392 }
395  const CSeq_align& seqalign)
396 {
397  CRef<CSeq_loc> genomic(new CSeq_loc);
399  const CSpliced_seg& sps = seqalign.GetSegs().GetSpliced();
400  const CSeq_id& nucid = sps.GetGenomic_id();
402  if (seqalign.CanGetBounds()) {
403  ITERATE(CSeq_align::TBounds, b,seqalign.GetBounds()) {
404  if ((*b)->GetId() != NULL && (*b)->GetId()->Match(nucid)) {
406  TSeqPos len = sequence::GetLength(nucid, &scope);
408  genomic->Assign(**b);
409  if (genomic->IsWhole()) {
410  // change to Interval, because Whole doesn't allow strand change - it's always unknown.
411  genomic->SetInt().SetFrom(0);
412  genomic->SetInt().SetTo(len-1);
413  }
414  genomic->SetStrand(sps.GetGenomic_strand());
416  if (genomic->GetStop(eExtreme_Positional) >= len) {
417  genomic->SetInt().SetFrom(genomic->GetStart(eExtreme_Positional));
418  genomic->SetInt().SetTo(len-1);
419  }
421  return genomic;
422  }
423  }
424  }
426  if (sps.GetExons().empty()) {
427  genomic->SetNull();
428  } else {
429  genomic->SetPacked_int().AddInterval(nucid,sps.GetExons().front()->GetGenomic_start(),sps.GetExons().front()->GetGenomic_end(),sps.GetGenomic_strand());
430  genomic->SetPacked_int().AddInterval(nucid,sps.GetExons().back()->GetGenomic_start(),sps.GetExons().back()->GetGenomic_end(),sps.GetGenomic_strand());
433  }
435  return genomic;
436 }
