NCBI C++ ToolKit
Info.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*
2 * $Id: Info.cpp 102944 2024-08-08 15:04:59Z badrazat $
3 *
4 * =========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannt warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * =========================================================================
27 *
28 * Author: Boris Kiryutin
29 *
30 * =========================================================================
31 */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 
38 
39 #include "Info.hpp"
40 #include "Ali.hpp"
41 #include "nucprot.hpp"
42 #include "NSeq.hpp"
43 #include "PSeq.hpp"
44 #include "AliSeqAlign.hpp"
45 
50 #include <objmgr/util/sequence.hpp>
51 #include <objmgr/seq_vector.hpp>
53 
56 
57 const char GAP_CHAR='-'; // used in dna and protein text
58 const char INTRON_CHAR='.'; // protein
59 _DEBUG_ARG(const char SPACE_CHAR=' ';) // translation and protein; to be used only in _ASSERT and other _DEBUG-dependent constructs.
60 const char INTRON_OR_GAP[] = {INTRON_CHAR,GAP_CHAR,0};
61 
62 // used in match text
63 const char BAD_PIECE_CHAR='X';
64 const char MISMATCH_CHAR=' ';
66 const char MATCH_CHAR='|';
67 const char POSIT_CHAR='+';
68 
69 BEGIN_SCOPE(prosplign)
70 
72 public:
73  CProSplignTrimmer(const CProteinAlignText& alignment_text);
74 
75  /// checks if alignment ends should be restored beyond 'beg' or 'end'
76  ///returns new flanking coord or 'beg'/'end' if no restoring )
77  size_t RestoreFivePrime(size_t beg) const;
78  size_t RestoreThreePrime(size_t end) const;
79 
80  ///trim flanks with positives dropoff over a cutoff, iterative
81  ///flank 'good pieces' should not be dropped completely
82  ///applied inside an exon only
83  int CutFromLeft(CNPiece pc, const CProSplignOutputOptionsExt& options) const; //returns new pc.beg
84  int CutFromRight(CNPiece pc, const CProSplignOutputOptionsExt& options) const; //returns new pc.end
85 
86 
87 private:
89 
90  string m_posit;// has POSIT_CHAR in all positive alignment positions, example:
91 
92 // dna : GATGAAACAGCACTAGTGACAGGTAAA
93 // translation: D E T A L V T G K
94 // match : | | + | | |
95 // protein : D E Q S F --- T G K
96 
97 // m_posit : ++++++ +++ +++++++++
98 };
99 
100 
102 {
105 }
106 
107 list<CNPiece> FindGoodParts(const CProteinAlignText& alignment_text, CProSplignOutputOptionsExt m_options, const CProSplignScaledScoring& scoring, const CSubstMatrix& matrix)
108 {
109 
110  const string& nuc = alignment_text.GetDNA();
111  const string& outp = alignment_text.GetProtein();
112  const string& orig_match = alignment_text.GetMatch();
113  list<CNPiece> m_AliPiece;
114  //init
115  if(m_options.IsPassThrough()) {
116  string::size_type n1 = outp.find_first_not_of(GAP_CHAR);
117  string::size_type n2 = outp.find_last_not_of(GAP_CHAR);
118  if (n1 <= n2)
119  m_AliPiece.push_back(CNPiece(n1, n2+1, 0, 0));
120  return m_AliPiece;
121  }
122 
123  string match = orig_match;
124  for (size_t i = 1; i < match.size()-1; ++i) {
125  if (isupper(outp[i])) {
126  _ASSERT( outp[i-1]==SPACE_CHAR && outp[i+1]==SPACE_CHAR );
127  match[i-1] = match[i+1] = match[i];
128  ++i;
129  }
130  }
131 
132  CProSplignTrimmer trim(alignment_text);
133 
134  m_AliPiece = FindGoodParts(CNPiece(0, match.size(), 0, 0), match, outp, m_options);
135  for(list<CNPiece>::iterator it = m_AliPiece.begin(); it != m_AliPiece.end(); ) {
136  list<CNPiece> tmp = ExcludeBadExons(*it, match, outp, m_options);
137  m_AliPiece.splice(it, tmp);
138  it = m_AliPiece.erase(it);
139  }
140  for(list<CNPiece>::iterator it = m_AliPiece.begin(); it != m_AliPiece.end(); ) {
141  list<CNPiece> tmp = FindGoodParts(*it, match, outp, m_options);
142  m_AliPiece.splice(it, tmp);
143  it = m_AliPiece.erase(it);
144  }
145 
146  //trim flanks with positives dropoff over a cutoff, iterative
147  //flank 'good pieces' should not be dropped completely
148  if( !m_AliPiece.empty() ) {
149  m_AliPiece.front().beg = trim.CutFromLeft(m_AliPiece.front(), m_options);
150  m_AliPiece.back().end = trim.CutFromRight(m_AliPiece.back(), m_options);
151  }
152 
153  // 'fill holes' mode. keep everything between the first 'good piece' and the last one.
154  if( !m_AliPiece.empty() && m_options.GetFillHoles() ) {
155  string::size_type beg = m_AliPiece.front().beg;
156  string::size_type end = m_AliPiece.back().end;
157  m_AliPiece.clear();
158  m_AliPiece.push_back(CNPiece(beg, end, 0, 0));
159  }
160 
161  ///stich small holes
162  if( !m_AliPiece.empty() && m_options.GetMinHoleLen() > 0 ) {
163  for(list<CNPiece>::iterator it = m_AliPiece.begin(); it != m_AliPiece.end(); ) {
164  list<CNPiece>::iterator sit = it;
165  ++sit;
166  if(sit == m_AliPiece.end()) break;
167  //count unaligned portion
168  int hbeg = it->end;
169  int hend = sit->beg;
170  int nuc_cnt = 0, prot_cnt = 0;
171  for(int pos = hbeg; pos < hend; ++pos) {
172  if( nuc[pos] != GAP_CHAR && nuc[pos] != INTRON_CHAR ) ++nuc_cnt;
173  if( outp[pos] != GAP_CHAR && outp[pos] != INTRON_CHAR ) ++prot_cnt;
174  }
175  if( nuc_cnt < m_options.GetMinHoleLen() && prot_cnt < m_options.GetMinHoleLen() ) {//stich
176  sit->beg = it->beg;
177  it = m_AliPiece.erase(it);
178  } else {
179  ++it;
180  }
181  }
182  }
183 
184  //trim short tails with negative score, iterative
185  if( !m_AliPiece.empty() ) {
186  bool keep_trimming = true;
187  while( keep_trimming ) {
188  CNPiece& pc = *m_AliPiece.rbegin();
189  keep_trimming = TrimNegativeTail( pc, alignment_text, scoring, matrix);
190  //cut to match
191  int n = pc.end - 1;
192  for(; n >= pc.beg; --n) {
193  if(match[n] == POSIT_CHAR || match[n] == MATCH_CHAR) break;
194  }
195  pc.end = n+1;
196  //the whole piece is trimmed, delete
197  if(pc.beg >= pc.end) {
198  m_AliPiece.pop_back();
199  }
200  }
201  }
202 
203  //remove trailing NNs
204  if( !m_AliPiece.empty() && m_options.GetCutNs() ) {
205  for(list<CNPiece>::iterator it = m_AliPiece.begin(); it != m_AliPiece.end(); ) {
206  int pos = it->end - 1;
207  for(; pos >= it->beg && nuc[pos] == 'N' ; --pos);
208  if(pos < it->beg) {
209  it = m_AliPiece.erase(it);
210  } else {
211  it->end = pos+1;
212  ++it;
213  }
214  }
215  }
216 
217  //trim partial codons
218  if( !m_AliPiece.empty() && m_options.GetCutFlankPartialCodons() ) {
219  for(list<CNPiece>::iterator it = m_AliPiece.begin(); it != m_AliPiece.end(); ) {
220  int pos = it->end - 1;
221  if(isupper(outp[pos])) {//case where N was trimmed
222  //cout<<it->beg<<", pos: "<<pos<<", poschar: "<<<<", pos: "<<endl;
223  _ASSERT( pos > it->beg
224  && (match[pos] == POSIT_CHAR || match[pos] == MATCH_CHAR)
225  && (match[pos-1] == POSIT_CHAR || match[pos-1] == MATCH_CHAR) );
226  pos -= 2;
227  if(pos < it->beg) {
228  it = m_AliPiece.erase(it);
229  } else {
230  it->end = pos+1;
231  ++it;
232  }
233  } else {
234  ++it;
235  }
236  }
237  for(list<CNPiece>::iterator it = m_AliPiece.begin(); it != m_AliPiece.end(); ) {
238  //partial codons at exon end
239  int pos = it->end - 1;
240  for(; pos >= it->beg && ( islower(outp[pos]) || outp[pos] == INTRON_CHAR ) ; --pos);
241  if(pos < it->beg) {
242  it = m_AliPiece.erase(it);
243  } else {
244  it->end = pos+1;
245  ++it;
246  }
247  }
248  for(list<CNPiece>::iterator it = m_AliPiece.begin(); it != m_AliPiece.end(); ) {
249  //partial codons at exon end
250  int pos = it->beg;
251  for(; pos < it->end && ( islower(outp[pos]) || outp[pos] == INTRON_CHAR ) ; ++pos);
252  if(pos == it->end) {
253  it = m_AliPiece.erase(it);
254  } else {
255  it->beg = pos;
256  ++it;
257  }
258  }
259 
260  }//end trim partial codons
261 
262 
263  //restore short flanks if it makes the alignment complete
264  if( !m_AliPiece.empty() ) {
265  m_AliPiece.front().beg = trim.RestoreFivePrime(m_AliPiece.front().beg);
266  m_AliPiece.back().end = trim.RestoreThreePrime(m_AliPiece.back().end);
267  }
268 
269 
270  return m_AliPiece;
271 }
272 
273 list<CNPiece> FindGoodParts(const CNPiece pc, const string& match_all_pos, const string& protein, CProSplignOutputOptionsExt m_options)
274 {
275  list<CNPiece> m_AliPiece;
276  const string& match = match_all_pos;
277 
278  string::size_type n = match.find_first_not_of(BAD_OR_MISMATCH, pc.beg);
279  if(n == string::npos || n >= (unsigned)pc.end) return m_AliPiece; //no matches
280  bool ism = true;
281  bool isintr = false;
282  string::size_type beg = n;
283  int efflen = 0;
284  if(match[n] == MATCH_CHAR && protein.find_first_not_of(GAP_CHAR) == n && protein[n+1] == 'M')
285  efflen += m_options.GetStartBonus();
286  for(; n<(unsigned)pc.end; ++n) {
287  if(match[n] == POSIT_CHAR || match[n] == MATCH_CHAR) {
288  if(!ism) {
289  ism = true;
290  m_AliPiece.push_back(CNPiece(beg, n, 0, efflen));
291  beg = n;
292  efflen = 0;
293  }
294  } else {
295  if(ism) {
296  ism = false;
297  m_AliPiece.push_back(CNPiece(beg, n, efflen, efflen));
298  beg = n;
299  efflen = 0;
300  }
301  }
302  if(protein[n] != INTRON_CHAR) {//inside exon
303  efflen ++;
304  isintr = false;
305  }
306  else {//intron
307  if(!isintr) {
308  efflen += m_options.splice_cost;
309  isintr = true;
310  }
311  }
312  }
313  if(ism) {//endpoint is a match
314  m_AliPiece.push_back(CNPiece(beg, n, efflen, efflen));
315  }
316  //join pieces
317  list<CNPiece>::iterator itb, ite, itc;
318  list<CNPiece>::size_type pnum = m_AliPiece.size() + 1;
319  while(pnum > m_AliPiece.size()) {
320  pnum = m_AliPiece.size();
321  //forward
322  for(itb = m_AliPiece.begin(); ; ) {
323  ite = itb;
324  int slen = 0, spos = 0;
325  itc = itb;
326  ++itc;
327  while(itc != m_AliPiece.end()) {
328  if(m_options.Bad(itc)) break;//really bad
329  slen += itc->efflen;
330  spos += itc->posit;
331  if(m_options.Dropof(slen, spos, itb)) break;
332  ++itc;//points to 'good'
333  if(m_options.Perc(itc, slen, spos, itb)) {
334  if(m_options.BackCheck(itb, itc)) ite = itc;//join!
335  }
336  slen += itc->efflen;
337  spos += itc->posit;
338  ++itc;//points to ' bad' or end
339  }
340  if(ite != itb) {
341  m_options.Join(itb, ite);
342  m_AliPiece.erase(itb, ite);
343  itb = ite;
344  }
345  ++itb;
346  if(itb == m_AliPiece.end()) break;
347  ++itb;
348  }
349  //backward
350  itb = m_AliPiece.end();
351  --itb;
352  while(itb != m_AliPiece.begin()) {
353  ite = itb;
354  int slen = 0, spos = 0;
355  itc = itb;
356  while(itc != m_AliPiece.begin()) {
357  --itc;//points to ' bad'
358  if(m_options.Bad(itc)) break;//really bad
359  slen += itc->efflen;
360  spos += itc->posit;
361  if(m_options.Dropof(slen, spos, itb)) break;
362  --itc;//points to 'good'
363  if(m_options.Perc(itc, slen, spos, itb)) {
364  if(m_options.ForwCheck(itc, itb)) ite = itc;//join!
365  }
366  slen += itc->efflen;
367  spos += itc->posit;
368  }
369  if(ite != itb) {
370  m_options.Join(ite, itb);
371  m_AliPiece.erase(ite, itb);
372  }
373  if(itb == m_AliPiece.begin()) break;
374  --itb;
375  --itb;
376  }
377  }
378  //throw out bad pieces
379  for(list<CNPiece>::iterator it = m_AliPiece.begin(); it != m_AliPiece.end(); ) {
380  if(it->posit == 0) it = m_AliPiece.erase(it);
381  else if (it->efflen < m_options.GetMinGoodLen()) it = m_AliPiece.erase(it);
382  else ++it;
383  }
384  return m_AliPiece;
385 }
386 
387 
388 list<CNPiece> ExcludeBadExons(const CNPiece pc, const string& match_all_pos, const string& protein, CProSplignOutputOptionsExt m_options)
389 {
390  const string& match = match_all_pos;
391  list<CNPiece> alip;
392  vector<pair<int, int> > exons; //exons inside pc, format [exon_beg, exon_end)
393 
394  bool in_exon = false;
395  for(int n=pc.beg; n<pc.end; ) {
396  if( ( protein[n] != INTRON_CHAR) && !in_exon ) {
397  in_exon = true;
398  exons.push_back(make_pair(n, 0));//mark beg. of exon
399  }
400  ++n;
401  if( ( (protein[n] == INTRON_CHAR) || (n == pc.end) ) && in_exon ) {
402  in_exon = false;
403  exons.back().second = n;//mark end. of exon
404  }
405  }
406 
407  int cur_beg = pc.beg;
408  for(vector<pair<int, int> >::iterator eit = exons.begin(); eit != exons.end(); ++eit) {
409  int pos = 0;
410  int id = 0;
411  int len = eit->second - eit->first;
412  for(int i = eit->first; i < eit->second; ++i) {
413  if(match[i] == POSIT_CHAR) ++pos;
414  else if(match[i] == MATCH_CHAR) {
415  ++id;
416  ++pos;
417  }
418  }
419  if(( 100*pos < len*m_options.GetMinExonPos() ) || ( 100*id < m_options.GetMinExonId() * len)) {//throw out bad exon
420  //find prev match
421  int n;
422  for(n = eit->first - 1; n > cur_beg; --n) {
423  if(match[n] == POSIT_CHAR || match[n] == MATCH_CHAR) break;
424  }
425  ++n;
426  if(n > cur_beg) alip.push_back(CNPiece(cur_beg, n, 0, 0));
427  //find next match
428  for(n = eit->second; n < pc.end; ++n) {
429  if(match[n] == POSIT_CHAR || match[n] == MATCH_CHAR) break;
430  }
431  cur_beg = n;
432  }
433  }
434  if(cur_beg < pc.end) alip.push_back(CNPiece(cur_beg, pc.end, 0, 0));
435  return alip;
436 }
437 
438 bool TrimNegativeTail(CNPiece& pc, const CProteinAlignText& alignment_text, const CProSplignScaledScoring& scoring, const CSubstMatrix& matrix)
439 {
440  int score = 0;
441  int state = 0;//0 - diag, 1 - gap in nuc, 2, gap in prot
442  const string& nuc = alignment_text.GetDNA();
443  const string& prot = alignment_text.GetProtein();
444  const string& tran = alignment_text.GetTranslation();
445  int cnuc = 0, cprot = 0, cmax = 18;
446  int n;
447  for(n = pc.end - 1; n >= pc.beg; --n) {
448  //close gaps
449  if( ( state == 1 && nuc[n] != GAP_CHAR ) ||
450  ( state == 2 && prot[n] != GAP_CHAR ) ) {
451  score -= scoring.sm_Ig;
452  state = 0;
453  }
454  if(score < 0) {//trim at the beg. of the gap
455  pc.end = n+1;
456  return true;
457  }
458  if( prot[n] == INTRON_CHAR ) {//stop at intron
459  return false;
460  }
461  //score the current alignment position
462  if( prot[n] == GAP_CHAR ) {
463  score -= scoring.sm_Ine;
464  state = 2;
465  ++cnuc;
466  } else if ( nuc[n] == GAP_CHAR ) {
467  score -= scoring.sm_Ine;
468  state = 1;
469  ++cprot;
470  } else {
471  ++cnuc;
472  ++cprot;
473  //no gap, no intron
474  if( islower(tran[n]) ) {
475  if( !islower(prot[n]) ) {
476  NCBI_THROW(CProSplignException, eFormat, "CProteinAlignText format error");
477  }
478  score += matrix.ScaledScore(toupper(prot[n]), toupper(tran[n]))/3;
479  } else if( isupper(prot[n]) && isupper(tran[n]) ) {
480  score += matrix.ScaledScore(prot[n], tran[n]);
481  }
482  }
483  if(score < 0) {//trim
484  pc.end = n;
485  return true;
486  }
487  if(cnuc >= cmax && cprot >= cmax) break;
488  }
489  if(state == 1 || state == 2) {//close gap and check the score
490  score -= scoring.sm_Ig;
491  if(score < 0) {//trim
492  pc.end = n+1;
493  return true;
494  }
495  }
496  //no negative tail found
497  return false;
498 }
499 
500 
501 //CNPiece implementation
502 
503 CNPiece::CNPiece(string::size_type obeg, string::size_type oend, int oposit, int oefflen)
504  {
505  beg = (int)obeg;
506  end = (int)oend;
507  posit = oposit;
508  efflen = oefflen;
509  }
510 
511 bool CProSplignOutputOptionsExt::Bad(list<prosplign::CNPiece>::iterator it)
512 {
513  if(it->efflen > GetMaxBadLen()) return true;
514  return false;
515 }
516 
517 bool CProSplignOutputOptionsExt::Dropof(int efflen, int posit, list<prosplign::CNPiece>::iterator it)
518 {
519  if((GetTotalPositives()-drop)*(efflen+it->efflen) > 100*(posit+it->posit)) return true;
520  return false;
521 }
522 
523 bool CProSplignOutputOptionsExt::Perc(list<prosplign::CNPiece>::iterator add, int efflen, int posit, list<prosplign::CNPiece>::iterator cur)
524 {
525  if(Dropof(efflen, posit, add)) return false;
526  if(GetTotalPositives()*(efflen+cur->efflen+add->efflen) > 100*(posit+cur->posit+add->posit)) return false;
527  return true;
528 }
529 
530 void CProSplignOutputOptionsExt::Join(list<prosplign::CNPiece>::iterator it, list<prosplign::CNPiece>::iterator last)
531 {
532  int posit = last->posit;
533  int efflen = last->efflen;
534  for(list<prosplign::CNPiece>::iterator it1 = it; it1 != last; ++it1) {
535  posit += it1->posit;
536  efflen += it1->efflen;
537  }
538  last->posit = posit;
539  last->efflen = efflen;
540  last->beg = it->beg;
541 }
542 
543 bool CProSplignOutputOptionsExt::ForwCheck(list<prosplign::CNPiece>::iterator it1, list<prosplign::CNPiece>::iterator it2)
544 {
545  int efflen = it1->efflen;
546  int pos = it1->posit;
547  for(;it1!= it2;){
548  it1++;
549  if(Dropof(efflen, pos, it1)) return false;
550  efflen += it1->efflen;
551  pos += it1->posit;
552  it1++;
553  efflen += it1->efflen;
554  pos += it1->posit;
555  }
556  return true;
557 }
558 
559 bool CProSplignOutputOptionsExt::BackCheck(list<prosplign::CNPiece>::iterator it1, list<prosplign::CNPiece>::iterator it2)
560 {
561  int efflen = it2->efflen;
562  int pos = it2->posit;
563  for(;it1!= it2;){
564  it2--;
565  if(Dropof(efflen, pos, it2)) return false;
566  efflen += it2->efflen;
567  pos += it2->posit;
568  it2--;
569  efflen += it2->efflen;
570  pos += it2->posit;
571  }
572  return true;
573 }
574 
575 END_SCOPE(prosplign)
576 
577 namespace {
578 int GetCompNum(const CSeq_align& sa)
579 {
580  if (sa.CanGetExt()) {
581  ITERATE(CSeq_align::TExt, e, sa.GetExt()) {
582  const CUser_object& uo = **e;
583  if (uo.CanGetData()) {
584  ITERATE(CUser_object::TData, field, uo.GetData()) {
585  if ((*field)->CanGetLabel() && (*field)->GetLabel().IsStr() && (*field)->GetLabel().GetStr()=="CompartmentId") {
586  return (*field)->GetData().GetInt();
587  }
588  }
589  }
590  }
591  }
592  return -1;
593 }
594 }
595 
596 void CProSplignText::Output(const CSeq_align& seqalign, CScope& scope, ostream& out, int width, const string& matrix_name)
597 {
598  int compartment_id = GetCompNum(seqalign);
599  string contig_name = seqalign.GetSegs().GetSpliced().GetGenomic_id().GetSeqIdString(true);
600  string prot_id = seqalign.GetSegs().GetSpliced().GetProduct_id().GetSeqIdString(true);
601  TSeqRange bounds = CProteinAlignText::GetGenomicBounds(scope, seqalign)->GetTotalRange();
602  int nuc_from = bounds.GetFrom()+1;
603  int nuc_to = bounds.GetTo()+1;
604  bool is_plus_strand = seqalign.GetSegs().GetSpliced().GetGenomic_strand()==eNa_strand_plus;
605 
606  out<<endl<<"************************************************************************"<<endl;
607  out<<"************************************************************************"<<endl;
608  out<<"************************************************************************"<<endl;
609  out<<compartment_id<<"\t"<<contig_name<<"\t"<<prot_id<<"\t"<<nuc_from<<"\t"<<nuc_to<<"\t";
610  out<<(is_plus_strand?'+':'-')<<endl;
611 
612  CProteinAlignText align_text(scope, seqalign, matrix_name);
613  const string& dna = align_text.GetDNA();
614  const string& translation = align_text.GetTranslation();
615  string match = align_text.GetMatch();
616  const string& protein = align_text.GetProtein();
617  string good_parts;
618  good_parts.append(match.size(),'*');
619  for (size_t i = 0; i < match.size(); ++i) {
620  if (match[i] == BAD_PIECE_CHAR)
621  match[i] = good_parts[i] = ' ';
622  }
623 
624  int npos1 = is_plus_strand?nuc_from:nuc_to;
625 
626  int prot_beg_pos = static_cast<int>(protein.find_first_not_of(GAP_CHAR));
627  int prot_end_pos = static_cast<int>(protein.find_last_not_of(GAP_CHAR));
628 
629  for(int i=0;i<prot_end_pos; i+=width) {
630  int apos = i+width-1;
631  if (apos >= (int)dna.length()) {
632  apos = (int)dna.length() - 1;
633  width = apos-i+1;
634  }
635 
636 #ifdef NCBI_COMPILER_WORKSHOP
637  int gaps = 0;
638  count(dna.begin()+i, dna.begin()+(i+width), GAP_CHAR, gaps);
639  int real_bases = width-gaps;
640 #else
641  int real_bases = static_cast<int>(width-count(dna.begin()+i, dna.begin()+(i+width), GAP_CHAR));
642 #endif
643 
644  int npos2 = is_plus_strand?npos1+real_bases-1:npos1-(real_bases-1);
645 
646  //throw out head and tail lines with a complete row gap
647  if (apos > prot_beg_pos) {
648  out.setf(IOS_BASE::left, IOS_BASE::adjustfield);
649  if (real_bases>0) {
650  out<<setw(12)<<npos1<<dna.substr(i, width)<<" "<<npos2<<endl;
651  } else { //complete row is a nucleotide gap
652  out<<setw(12)<<"-"<<dna.substr(i, width)<<" "<<"-"<<endl;
653  }
654  out<<setw(12)<<" "<<translation.substr(i, width)<<endl;
655  out<<setw(12)<<" "<<match.substr(i, width)<<endl;
656  out<<setw(12)<<" "<<protein.substr(i, width)<<endl;
657  out<<setw(12)<<" "<<good_parts.substr(i, width)<<endl;
658  }
659 
660  npos1 = is_plus_strand?npos2+1:npos2-1;
661  }
662 }
663 
664 USING_SCOPE(prosplign);
665 
666 namespace {
667 
668 struct CAliChunk {
669  CAliChunk(TSeqPos ali_pos, TSeqPos nuc_pos, TSeqPos prot_pos, CSpliced_seg::TExons::iterator exon_iter, CSpliced_exon::TParts::iterator chunk_iter) :
670  m_nuc_pos(nuc_pos), m_prot_pos(prot_pos), m_exon_iter(exon_iter), m_chunk_iter(chunk_iter), m_bad(false)
671  {
672  const CSpliced_exon_chunk& chunk = **chunk_iter;
673 
674  m_nuc_len = 0;
675  m_prot_len = 0;
676  if (chunk.IsDiag() || chunk.IsMatch() || chunk.IsMismatch()) {
677  int len = 0;
678  if (chunk.IsDiag()) {
679  len = chunk.GetDiag();
680  } else if (chunk.IsMatch()) {
681  len = chunk.GetMatch();
682  } else if (chunk.IsMismatch()) {
683  len = chunk.GetMismatch();
684  }
685 
686  m_nuc_len = len;
687  m_prot_len = len;
688  } else if (chunk.IsProduct_ins()) {
689  m_prot_len = chunk.GetProduct_ins();
690  } else if (chunk.IsGenomic_ins()) {
691  m_nuc_len = chunk.GetGenomic_ins();
692  }
693 
694  m_ali_range = TSeqRange(ali_pos, ali_pos + max(m_nuc_len,m_prot_len)-1);
695  }
696 
697  TSeqRange m_ali_range;
698  TSeqPos m_nuc_pos;
699  TSeqPos m_prot_pos;
700  int m_nuc_len;
701  int m_prot_len;
702  CSpliced_seg::TExons::iterator m_exon_iter;
703  CSpliced_exon::TParts::iterator m_chunk_iter;
704  bool m_bad;
705 };
706 
707 typedef list<CAliChunk> TAliChunkCollection;
708 typedef TAliChunkCollection::iterator TAliChunkIterator;
709 
710 
711 TAliChunkCollection ExtractChunks(CScope& scope, CSeq_align& seq_align)
712 {
713  CSpliced_seg& sps = seq_align.SetSegs().SetSpliced();
714  ENa_strand strand = sps.GetGenomic_strand();
715  TSeqRange bounds = CProteinAlignText::GetGenomicBounds(scope, seq_align)->GetTotalRange();
716  int nuc_from = bounds.GetFrom();
717  int nuc_to = bounds.GetTo();
718  int prot_from = 0;
719 
720  int alignment_pos = 0;
721 
722  TAliChunkCollection chunks;
723 
725  CSpliced_exon& exon = **e_it;
726  int prot_cur_start = exon.GetProduct_start().AsSeqPos();
727  int nuc_cur_start = exon.GetGenomic_start();
728  int nuc_cur_end = exon.GetGenomic_end();
729 
730  if (strand == eNa_strand_plus) {
731  alignment_pos += max(prot_cur_start-prot_from, nuc_cur_start-nuc_from);
732  nuc_from = nuc_cur_start;
733  } else {
734  alignment_pos += max(prot_cur_start-prot_from, nuc_to-nuc_cur_end);
735  nuc_to = nuc_cur_end;
736  }
737  prot_from = prot_cur_start;
738 
740  CAliChunk chunk(alignment_pos, strand == eNa_strand_plus?nuc_from:nuc_to, prot_from, e_it, p_it);
741  alignment_pos = chunk.m_ali_range.GetTo()+1;
742  prot_from += chunk.m_prot_len;
743  if (strand == eNa_strand_plus) {
744  nuc_from += chunk.m_nuc_len;
745  } else {
746  nuc_to -= chunk.m_nuc_len;
747  }
748 
749  chunks.push_back(chunk);
750  }
751 
752  _ASSERT( int(exon.GetProduct_end().AsSeqPos()+1) == prot_from );
753  _ASSERT( (strand==eNa_strand_plus && nuc_cur_end+1==nuc_from) || (strand!=eNa_strand_plus && nuc_cur_start-1==nuc_to) );
754  }
755  return chunks;
756 }
757 
758 list<TSeqRange> InvertPartList(const list<CNPiece>& good_parts, TSeqRange total_range)
759 {
760  list<TSeqRange> bad_parts;
761 
762  int tail_beg = total_range.GetFrom();
763  int tail_end = total_range.GetTo();
764  ITERATE(list<CNPiece>, i, good_parts) {
765  if (tail_beg < i->beg)
766  bad_parts.push_back(TSeqRange(tail_beg,i->beg-1));
767  tail_beg = i->end;
768  }
769  if (tail_beg <= tail_end)
770  bad_parts.push_back(TSeqRange(tail_beg,tail_end));
771 
772  return bad_parts;
773 }
774 
775 #ifdef _DEBUG
776 void TestExonLength(const CSpliced_exon& exon)
777 {
778  int nuc_len = 0;
779  int prot_len = 0;
780  ITERATE (CSpliced_exon::TParts, p_it, exon.GetParts()) {
781  const CSpliced_exon_chunk& chunk = **p_it;
782 
783  if (chunk.IsDiag() || chunk.IsMatch() || chunk.IsMismatch()) {
784  int len = 0;
785  if (chunk.IsDiag()) {
786  len = chunk.GetDiag();
787  } else if (chunk.IsMatch()) {
788  len = chunk.GetMatch();
789  } else if (chunk.IsMismatch()) {
790  len = chunk.GetMismatch();
791  }
792 
793  nuc_len += len;
794  prot_len += len;
795  } else if (chunk.IsProduct_ins()) {
796  prot_len += chunk.GetProduct_ins();
797  } else if (chunk.IsGenomic_ins()) {
798  nuc_len += chunk.GetGenomic_ins();
799  }
800  }
801  int prot_cur_start = exon.GetProduct_start().AsSeqPos();
802  int prot_cur_end = exon.GetProduct_end().AsSeqPos();
803  int nuc_cur_end = exon.GetGenomic_end();
804  int nuc_cur_start = exon.GetGenomic_start();
805 
806  _ASSERT( nuc_cur_end-nuc_cur_start+1 == nuc_len );
807  _ASSERT( prot_cur_end-prot_cur_start+1 == prot_len );
808 }
809 #endif
810 
811 
812 void SplitChunk(TAliChunkCollection& chunks, TAliChunkIterator iter, TSeqPos start_of_second_chunk, bool genomic_plus)
813 {
814  _DEBUG_CODE( TestExonLength(**iter->m_exon_iter); );
815  _ASSERT( iter->m_ali_range.GetFrom() < start_of_second_chunk );
816  _ASSERT( start_of_second_chunk <= iter->m_ali_range.GetTo());
817  _ASSERT( iter->m_nuc_len == iter->m_prot_len ); // not an insertion
818 
820  new_chunk->Assign(**iter->m_chunk_iter);
821  int first_len = start_of_second_chunk - iter->m_ali_range.GetFrom();
822  int second_len = iter->m_ali_range.GetTo() - start_of_second_chunk+1;
823 
824  TAliChunkIterator first_iter = chunks.insert(iter, *iter);
825 
826  if (genomic_plus) {
827  iter->m_nuc_pos += first_len;
828  } else {
829  iter->m_nuc_pos -= first_len;
830  }
831  iter->m_prot_pos += first_len;
832 
833  if (new_chunk->IsDiag()) {
834  new_chunk->SetDiag(first_len);
835  (*iter->m_chunk_iter)->SetDiag(second_len);
836  } else if (new_chunk->IsMatch()) {
837  new_chunk->SetMatch(first_len);
838  (*iter->m_chunk_iter)->SetMatch(second_len);
839  } else if (new_chunk->IsMismatch()) {
840  new_chunk->SetMismatch(first_len);
841  (*iter->m_chunk_iter)->SetMismatch(second_len);
842  }
843 
844  first_iter->m_ali_range.SetTo(start_of_second_chunk-1);
845  iter->m_ali_range.SetFrom(start_of_second_chunk);
846 
847  first_iter->m_nuc_len = first_iter->m_prot_len = first_len;
848  iter->m_nuc_len = iter->m_prot_len = second_len;
849 
850  first_iter->m_chunk_iter = (*iter->m_exon_iter)->SetParts().insert(iter->m_chunk_iter, new_chunk);
851 
852  _DEBUG_CODE( TestExonLength(**iter->m_exon_iter); );
853 }
854 
855 void DropExon(CSpliced_seg::TExons& exons, CSpliced_seg::TExons::iterator& exon_iter)
856 {
857  exons.erase(exon_iter);
858  exon_iter = exons.end();
859 }
860 
861 void DropExonHead(TAliChunkIterator chunk_iter, bool genomic_plus)
862 {
863  CSpliced_exon* cur_exon = *chunk_iter->m_exon_iter;
864 
865  _DEBUG_CODE( TestExonLength(*cur_exon); );
866 #ifdef _DEBUG
867  size_t chunks_count = cur_exon->GetParts().size();
868 #endif
869 
870  cur_exon->SetParts().erase(cur_exon->SetParts().begin(), chunk_iter->m_chunk_iter);
871 
872  if (genomic_plus) {
873  cur_exon->SetGenomic_start(chunk_iter->m_nuc_pos);
874  } else {
875  cur_exon->SetGenomic_end(chunk_iter->m_nuc_pos);
876  }
877  cur_exon->SetProduct_start(*NultriposToProduct_pos(chunk_iter->m_prot_pos));
878  cur_exon->SetPartial(true);
879  if (cur_exon->IsSetAcceptor_before_exon())
880  cur_exon->ResetAcceptor_before_exon();
881 
882  _ASSERT( 0 < cur_exon->GetParts().size() );
883  _ASSERT( cur_exon->GetParts().size() < chunks_count );
884 
885  _DEBUG_CODE( TestExonLength(*cur_exon); );
886 }
887 
888 void SplitExon(CSpliced_seg::TExons& exons, TAliChunkIterator chunk_iter, bool genomic_plus)
889 {
890  CSpliced_exon* cur_exon = *chunk_iter->m_exon_iter;
891 
892  _DEBUG_CODE( TestExonLength(*cur_exon); );
893 #ifdef _DEBUG
894  size_t chunks_count = cur_exon->GetParts().size();
895 #endif
896 
897  CRef< CSpliced_exon > new_exon( new CSpliced_exon );
898  new_exon->Assign(*cur_exon);
899 
900  CSpliced_exon::TParts::iterator new_exon_chunk = new_exon->SetParts().begin();
901  ITERATE(CSpliced_exon::TParts, old_exon_chunk, cur_exon->GetParts()) {
902  if (old_exon_chunk==chunk_iter->m_chunk_iter)
903  break;
904  ++new_exon_chunk;
905  }
906 
907  new_exon->SetParts().erase(new_exon_chunk, new_exon->SetParts().end());
908 
909  if (genomic_plus) {
910  new_exon->SetGenomic_end(chunk_iter->m_nuc_pos-1);
911  } else {
912  new_exon->SetGenomic_start(chunk_iter->m_nuc_pos+1);
913  }
914  new_exon->SetProduct_end(*NultriposToProduct_pos(chunk_iter->m_prot_pos-1));
915  new_exon->SetPartial(true);
916  if (new_exon->IsSetDonor_after_exon())
917  new_exon->ResetDonor_after_exon();
918 
919  _ASSERT( new_exon->GetGenomic_start() <= new_exon->GetGenomic_end() );
920  _ASSERT( new_exon->GetProduct_start().AsSeqPos() <= new_exon->GetProduct_end().AsSeqPos() );
921 
922  exons.insert(chunk_iter->m_exon_iter, new_exon);
923 
924  DropExonHead(chunk_iter, genomic_plus);
925 
926  _ASSERT( 0 < new_exon->GetParts().size() && 0 < cur_exon->GetParts().size() );
927  _ASSERT( new_exon->GetParts().size()+cur_exon->GetParts().size() == chunks_count );
928 
929  _DEBUG_CODE( TestExonLength(*new_exon); );
930  _DEBUG_CODE( TestExonLength(*cur_exon); );
931 }
932 
933 }
934 
935 void prosplign::SetScores(objects::CSeq_align& seq_align, objects::CScope& scope, const string& matrix_name) {
936  CProteinAlignText pro_text(scope, seq_align, matrix_name);
937  const string& prot = pro_text.GetProtein();
938  const string& dna = pro_text.GetDNA();
939  const string& match = pro_text.GetMatch();
940  // const string& trans = pro_text.GetTranslation();
941  int pos = 0, ident = 0, len = 0, neg = 0, pgap = 0, ngap = 0;
942  for(string::size_type i=0;i<match.size(); ++i) {
943  if( (prot[i] != '.') && (match[i] != 'X') ) {//skip introns and bad parts
944  ++len;
945  if(prot[i] == '-') {
946  ++pgap;
947  } else if(dna[i] == '-') {
948  ++ngap;
949  } else if(isalpha(prot[i])) {
950  bool triple = isupper(prot[i]);
951  switch(match[i]) {
952  case '|':
953  if(triple) ident +=3;
954  else ++ident;
955  case '+':
956  if(triple) pos +=3;
957  else ++pos;
958  break;
959  default://mismatch
960  if(triple) neg +=3;
961  else ++neg;
962  break;
963  }
964  }
965  }
966  }
967  seq_align.SetNamedScore("num_ident", ident);
968  seq_align.SetNamedScore("num_positives", pos);
969  seq_align.SetNamedScore("num_negatives", neg);
970  seq_align.SetNamedScore("product_gap_length", pgap);
971  seq_align.SetNamedScore("genomic_gap_length", ngap);
972  seq_align.SetNamedScore("align_length", len);
973  //internal protein gap length for full alignment quality measure
974  int ipgap = 0;
975  int ibeg, iend;
976  for(ibeg = 0; ibeg<(int)(prot.size()) && ( (prot[ibeg] == '.') || (match[ibeg] == 'X') || (prot[ibeg] == '-' ) ); ++ibeg) {}
977  for(iend = prot.size() - 1; iend >=0 && ( (prot[iend] == '.') || (match[iend] == 'X') || (prot[iend] == '-' ) ); --iend) {}
978  for(int i=ibeg;i<=iend; ++i) {
979  if( (prot[i] != '.') && (match[i] != 'X') ) {//skip introns and bad parts
980  if(prot[i] == '-') {
981  ++ipgap;
982  }
983  }
984  }
985  seq_align.SetNamedScore("product_internal_gap_length", ipgap);
986 }
987 
988 void prosplign::RefineAlignment(CScope& scope, CSeq_align& seq_align, const list<CNPiece>& good_parts)
989 {
990  CSpliced_seg& sps = seq_align.SetSegs().SetSpliced();
991  TAliChunkCollection chunks = ExtractChunks(scope, seq_align);
992 
993  if (chunks.empty())
994  return;
995 
996  bool genomic_plus = sps.GetGenomic_strand()==eNa_strand_plus;
997 
998  list<TSeqRange> bad_parts = InvertPartList(good_parts, TSeqRange(chunks.front().m_ali_range.GetFrom(),chunks.back().m_ali_range.GetTo()));
999 
1000  TAliChunkIterator chunk_iter = chunks.begin();
1001 
1002  ITERATE(list<TSeqRange>, bad_part, bad_parts) {
1003  while (chunk_iter != chunks.end() && chunk_iter->m_ali_range.GetTo() < bad_part->GetFrom()) {
1004  ++chunk_iter;
1005  }
1006 
1007  if (chunk_iter == chunks.end())
1008  break;
1009  if (bad_part->GetTo() < chunk_iter->m_ali_range.GetFrom())
1010  continue;
1011 
1012  if (chunk_iter->m_ali_range.GetFrom() < bad_part->GetFrom())
1013  SplitChunk(chunks, chunk_iter, bad_part->GetFrom(), genomic_plus);
1014 
1015  while (chunk_iter != chunks.end() && chunk_iter->m_ali_range.GetTo() <= bad_part->GetTo())
1016  chunk_iter++->m_bad = true;
1017 
1018  if (chunk_iter != chunks.end() && chunk_iter->m_ali_range.GetFrom() <= bad_part->GetTo()) {
1019  chunk_iter->m_bad = true;
1020  SplitChunk(chunks, chunk_iter, bad_part->GetTo()+1, genomic_plus);
1021  chunk_iter->m_bad = false;
1022  }
1023  }
1024 
1025  CSpliced_seg::TExons::iterator prev_exon_iter = sps.SetExons().end();
1026 
1027  NON_CONST_ITERATE(TAliChunkCollection, chunk_it, chunks) {
1028  while(chunk_it != chunks.end() && !chunk_it->m_bad) {
1029  prev_exon_iter = chunk_it->m_exon_iter;
1030  ++chunk_it;
1031  }
1032  if (chunk_it == chunks.end())
1033  break;
1034  if (prev_exon_iter != chunk_it->m_exon_iter) {
1035  if ((*chunk_it->m_exon_iter)->IsSetAcceptor_before_exon())
1036  (*chunk_it->m_exon_iter)->ResetAcceptor_before_exon();
1037  } else {
1038  SplitExon(sps.SetExons(),chunk_it, genomic_plus);
1039  }
1040 
1041  prev_exon_iter = chunk_it->m_exon_iter;
1042  TAliChunkIterator next_chunk_iter = chunk_it;
1043  ++next_chunk_iter;
1044  while (next_chunk_iter != chunks.end() && next_chunk_iter->m_bad && next_chunk_iter->m_exon_iter==prev_exon_iter) {
1045  chunk_it = next_chunk_iter++;
1046  }
1047 
1048  if (next_chunk_iter == chunks.end() || next_chunk_iter->m_exon_iter!=prev_exon_iter) {
1049  DropExon(sps.SetExons(), prev_exon_iter);
1050  } else {
1051  DropExonHead(next_chunk_iter, genomic_plus);
1052  }
1053  }
1054 
1055 #ifdef _DEBUG
1056  ITERATE (CSpliced_seg::TExons, e_it, sps.GetExons()) {
1057  TestExonLength(**e_it);
1058  }
1059 
1060 #endif
1061 }
1062 
1063 
1064 /// CProSplignTrimmer implementation
1065 
1067  : m_alignment_text(alignment_text) {
1068  const string& outp = alignment_text.GetProtein();
1069  const string& match = alignment_text.GetMatch();
1070  m_posit = match;
1071  for (size_t i = 1; i < match.size()-1; ++i) {
1072  if(isupper(outp[i])) {
1073  _ASSERT( outp[i-1]==SPACE_CHAR && outp[i+1]==SPACE_CHAR );
1074  if( match[i] == MATCH_CHAR || match[i] == POSIT_CHAR ) {
1075  m_posit[i-1] = m_posit[i+1] = m_posit[i] = POSIT_CHAR;
1076  ++i;
1077  }
1078  } else if(islower(outp[i])) {
1079  if( match[i] == MATCH_CHAR || match[i] == POSIT_CHAR ) {
1080  m_posit[i] = POSIT_CHAR;
1081  }
1082  }
1083  }
1084 }
1085 
1086 
1087 // check if alignment beginning should be restored
1088 size_t CProSplignTrimmer::RestoreFivePrime(size_t beg) const {
1089  const string& prot_row = m_alignment_text.GetProtein();
1090  const string& dna_row = m_alignment_text.GetDNA();
1091  size_t pbeg = prot_row.find_first_not_of(INTRON_OR_GAP);
1092  if(pbeg == string::npos) return beg;
1093  if( pbeg >= beg ) return beg;//nothing to restore
1094  int ali_len = (int)(beg - pbeg);
1095  if( m_posit[pbeg] != POSIT_CHAR ) return beg;//won't start from mismatch/gap
1096  if( ali_len > 36 ) return beg;// flank is not short enough to restore
1097  int posit_cnt = 0;
1098  int gap_cnt = 0;//number of gaps, not length
1099  int mismatch_cnt = 0;
1100  int in_gap = 0; // 0 - not in gap, -1 - gap in prot, 1 - gap in dna
1101  for(size_t i = pbeg; i < beg; ++i) {
1102  _ASSERT( m_posit[i] != POSIT_CHAR || ( prot_row[i] != GAP_CHAR && dna_row[i] != GAP_CHAR ) );
1103  if( prot_row[i] == INTRON_CHAR ) return beg;
1104  if( prot_row[i] == GAP_CHAR ) {
1105  if( in_gap != -1 ) {
1106  in_gap = -1;
1107  ++gap_cnt;
1108  }
1109  } else if( dna_row[i] == GAP_CHAR ) {
1110  if( in_gap != 1 ) {
1111  in_gap = 1;
1112  ++gap_cnt;
1113  }
1114  } else {//not in gap
1115  in_gap = 0;
1116  if ( m_posit[i] == POSIT_CHAR ) {
1117  ++posit_cnt;
1118  } else {
1119  ++mismatch_cnt;
1120  }
1121  }
1122  }
1123  if( gap_cnt == 0 && mismatch_cnt < 10) return pbeg;//three mismatches and no gap, restore
1124  if( gap_cnt < 3 && 100 * posit_cnt >= 60 * ali_len ) return pbeg;//match 60% or more
1125  if( gap_cnt < 2 && 100 * posit_cnt >= 50 * ali_len ) return pbeg;//might reconsider later on
1126  //do not extend
1127  return beg;
1128 }
1129 
1130 
1131 // check if alignment end should be restored
1132 size_t CProSplignTrimmer::RestoreThreePrime(size_t end) const {
1133  const string& prot_row = m_alignment_text.GetProtein();
1134  const string& dna_row = m_alignment_text.GetDNA();
1135  const string& tran_row = m_alignment_text.GetTranslation();
1136  size_t pend = prot_row.find_last_not_of(INTRON_OR_GAP);
1137  if(pend == string::npos) return end;
1138  if( m_posit[pend] != POSIT_CHAR ) return end;//won't end with mismatch/gap
1139  ++pend;
1140  if( end >= pend ) return end;//nothing to restore
1141  int ali_len = (int)(pend-end);
1142  if( ali_len > 36 ) return end;// flank is not short enough to restore
1143  int posit_cnt = 0;
1144  int gap_cnt = 0;//number of gaps, not length
1145  int mismatch_cnt = 0;
1146  int in_gap = 0; // 0 - not in gap, -1 - gap in prot, 1 - gap in dna
1147  for(size_t i = end; i<pend; ++i) {
1148  _ASSERT( m_posit[i] != POSIT_CHAR || ( prot_row[i] != GAP_CHAR && dna_row[i] != GAP_CHAR ) );
1149  if( prot_row[i] == INTRON_CHAR ) return end; //don't extend over introns
1150  if( tran_row[i] == '*' ) return end;//don't extend over stop
1151  if( prot_row[i] == GAP_CHAR ) {
1152  if( in_gap != -1 ) {
1153  in_gap = -1;
1154  ++gap_cnt;
1155  }
1156  } else if( dna_row[i] == GAP_CHAR ) {
1157  if( in_gap != 1 ) {
1158  in_gap = 1;
1159  ++gap_cnt;
1160  }
1161  } else {//not in gap
1162  in_gap = 0;
1163  if ( m_posit[i] == POSIT_CHAR ) {
1164  ++posit_cnt;
1165  } else {
1166  ++mismatch_cnt;
1167  }
1168  }
1169  }
1170  if( gap_cnt == 0 && mismatch_cnt < 10) return pend;//three mismatches and no gap, restore
1171  if( gap_cnt < 3 && 100 * posit_cnt >= 60 * ali_len ) return pend;//match 60% or more
1172  if( gap_cnt < 2 && 100 * posit_cnt >= 50 * ali_len ) return pend;//might reconsider later on
1173  //do not extend
1174  return end;
1175 }
1176 
1177 ///trim left flank with positives dropoff over a cutoff, iterative
1178 ///'pc' should not be dropped completely
1179 ///returns new pc.beg
1180 ///applied inside an exon only
1182  if( !options.GetCutFlanksWithPositDrop() ) { // do not trim
1183  return pc.beg;
1184  }
1185 
1186  const string& dna = m_alignment_text.GetDNA();
1187  const string& prot = m_alignment_text.GetProtein();
1188 
1189  const double dropoff = options.GetCutFlanksWithPositDropoff()/(double)100;
1190  const int window_size = options.GetCutFlanksWithPositWindow();
1191  const int max_cut_len = options.GetCutFlanksWithPositMaxLen();
1192  bool keep_trimming = true;
1193 
1194  //trim left flank
1195 
1196  while ( keep_trimming ) {
1197  _ASSERT( pc.beg < pc.end );
1198  int begpos = pc.beg;
1199  int endpos = pc.end;
1200  //cut point
1201  double cur_max_drop = 0;
1202  int cur_cut = begpos;
1203  //current point
1204  int cur_pos = begpos;
1205  int cur_end = begpos + window_size;
1206  int rposit = 0; // number of positives in the window [cur_pos, cur_end)
1207 
1208  /// initial style, real posit count
1209  ///int cur_len = 0;//flank length
1210  ///int lposit = 0; // number of positives to the left of cur_pos;
1211 
1212  ///pseudo counts for the left flank
1213  int ps_len = 0;
1214  int ps_pos = 0;
1215  int ps_dna_gap_len = 0; // length of current dna gap
1216  int ps_prot_gap_len = 0; // length of current prot gap
1217  // penalty for gap extention is 1
1218  // penalty for gap opening = reward for match = penalty for mismath
1219  int ps_len_increment = options.GetCutFlanksWithPositGapRatio();
1220 
1221  //count posit in initial window
1222  if( cur_end >= endpos ) return pc.beg; //nothing to trim if piece length is window size or less
1223  //initial window
1224  for( int pos = cur_pos; pos < cur_end; ++pos ) {
1225  if( prot[pos] == INTRON_CHAR ) { //hit at intron, no cut
1226  return pc.beg;
1227  }
1228  if(m_posit[pos] == POSIT_CHAR) {
1229  ++rposit;
1230  }
1231  }
1232 
1233  //find cutting point
1234  do { //step
1235 
1236  // time to stop?
1237  if( prot[cur_end] == INTRON_CHAR ) { //hit an intron, stop
1238  break;
1239  }
1240  if( max_cut_len < cur_pos - begpos + 1 ) { // maximum length to cut reached
1241  break;
1242  }
1243 
1244  //in window
1245  if( m_posit[cur_pos] == POSIT_CHAR ) {
1246  _ASSERT( rposit > 0 );
1247  --rposit;
1248  }
1249  if( m_posit[cur_end] == POSIT_CHAR ) {
1250  ++rposit;
1251  }
1252 
1253  //in flank
1254  if( m_posit[cur_pos] == POSIT_CHAR ) { // match/positive
1255  _ASSERT( prot[cur_pos] != GAP_CHAR );
1256  _ASSERT( dna[cur_pos] != GAP_CHAR );
1257  ps_len += ps_len_increment;
1258  ps_pos += ps_len_increment;
1259  ps_prot_gap_len = 0;
1260  ps_dna_gap_len = 0;
1261  } else if( dna[cur_pos] == GAP_CHAR ) { //gap in dna
1262  _ASSERT( prot[cur_pos] != GAP_CHAR );
1263  if( ps_dna_gap_len < 3 ) {
1264  ps_len += ps_len_increment;
1265  } else { //count as extention
1266  ++ps_len;
1267  }
1268  ++ps_dna_gap_len;
1269  ps_prot_gap_len = 0;
1270  } else if( prot[cur_pos] == GAP_CHAR ) { // gap in protein
1271  if( ps_prot_gap_len < 3 ) {
1272  ps_len += ps_len_increment;
1273  } else { //count as extention
1274  ++ps_len;
1275  }
1276  ++ps_prot_gap_len;
1277  ps_dna_gap_len = 0;
1278  } else { // mismatch, no gaps
1279  ps_len += ps_len_increment;
1280  ps_prot_gap_len = 0;
1281  ps_dna_gap_len = 0;
1282  }
1283  ++cur_pos;
1284  ++cur_end;
1285 
1286  //check
1287  double posit_drop = rposit/(double)window_size - ps_pos/(double)ps_len;
1288  if( posit_drop >= dropoff && ( posit_drop > cur_max_drop || cur_cut == begpos ) ) {
1289  cur_max_drop = posit_drop;
1290  cur_cut = cur_pos;
1291  }
1292  } while( cur_end < endpos );
1293  // cut ?
1294  if( cur_cut == begpos ) {
1295  keep_trimming = false;
1296  } else {//trim
1297 
1298  //handle weird cases
1299 
1300  //cut to positive
1301  _ASSERT( cur_cut > begpos);
1302  for( ; cur_cut < endpos; ++cur_cut ) {
1303  if(m_posit[cur_cut] == POSIT_CHAR) {
1304  break;
1305  }
1306  }
1307  if( cur_cut >= endpos ) return pc.beg; // we don't want to cut the whole piece
1308 
1309  //add positives back
1310  // cur_cut is a positive after above and cur_cut < endpos
1311  _ASSERT(cur_cut < endpos);
1312  _ASSERT(cur_cut > begpos);
1313  _ASSERT(m_posit[cur_cut] == POSIT_CHAR);
1314  for( ; cur_cut >= begpos; --cur_cut) {
1315  if(m_posit[cur_cut] != POSIT_CHAR) {
1316  ++cur_cut;
1317  break;
1318  }
1319  }
1320  if( cur_cut <= begpos ) return pc.beg; // the whole piece is back, no cut
1321 
1322  //trim
1323  pc.beg = cur_cut;
1324  }
1325  }
1326  return pc.beg;
1327 }
1328 
1329 ///trim right flank with positives dropoff over a cutoff, iterative
1330 ///'pc' should not be dropped completely
1331 ///returns new pc.end
1332 ///applied inside an exon only
1334  if( !options.GetCutFlanksWithPositDrop() ) { // do not trim
1335  return pc.end;
1336  }
1337 
1338  const string& dna = m_alignment_text.GetDNA();
1339  const string& prot = m_alignment_text.GetProtein();
1340 
1341  const double dropoff = options.GetCutFlanksWithPositDropoff()/(double)100;
1342  const int window_size = options.GetCutFlanksWithPositWindow();
1343  const int max_cut_len = options.GetCutFlanksWithPositMaxLen();
1344  bool keep_trimming = true;
1345 
1346  while ( keep_trimming ) {
1347  _ASSERT( pc.beg < pc.end );
1348  int begpos = pc.beg;
1349  int endpos = pc.end;
1350  //cut point
1351  double cur_max_drop = 0;
1352  int cur_cut = endpos;
1353  //current numbers
1354  int win_end = endpos;
1355  if( begpos + window_size > win_end) return pc.end; //window goes out of the 'good piece'
1356  int win_beg = win_end - window_size;
1357  int wposit = 0; // number of positives in the window [win_beg, win_end)
1358 
1359  ///real counts
1360  //int cur_len = 0;//flank length
1361  //int fposit = 0; // flank number of positives (startins with win_end);
1362 
1363  ///pseudo counts for the the flank
1364  int ps_len = 0;
1365  int ps_pos = 0;
1366  int ps_dna_gap_len = 0; // length of current dna gap
1367  int ps_prot_gap_len = 0; // length of current prot gap
1368  // penalty for gap extention is 1
1369  // penalty for gap opening = reward for match = penalty for mismath
1370  int ps_len_increment = options.GetCutFlanksWithPositGapRatio();
1371 
1372  //count posit in initial window
1373  for( int pos = win_beg; pos < win_end; ++pos ) {
1374  if( prot[pos] == INTRON_CHAR ) { //hit at intron, no cut
1375  return pc.end;
1376  }
1377  if(m_posit[pos] == POSIT_CHAR) {
1378  ++wposit;
1379  }
1380  }
1381 
1382  //find cutting point
1383  while( win_beg > begpos ) {
1384 
1385  //step
1386 
1387  //in window
1388  --win_end;
1389  --win_beg;
1390 
1391  //time to stop?
1392  if( max_cut_len < endpos - win_end ) {// maximum length to cut exceeded
1393  break;
1394  }
1395 
1396  if( prot[win_beg] == INTRON_CHAR ) { //hit at intron, stop
1397  break;
1398  }
1399 
1400  if( m_posit[win_end] == POSIT_CHAR ) {
1401  _ASSERT(wposit > 0);
1402  --wposit;
1403  }
1404  if( m_posit[win_beg] == POSIT_CHAR ) {
1405  ++wposit;
1406  }
1407 
1408  //in flank
1409  int cur_pos = win_end;
1410  if( m_posit[cur_pos] == POSIT_CHAR ) { // match/positive
1411  _ASSERT( prot[cur_pos] != GAP_CHAR );
1412  _ASSERT( dna[cur_pos] != GAP_CHAR );
1413  ps_len += ps_len_increment;
1414  ps_pos += ps_len_increment;
1415  ps_prot_gap_len = 0;
1416  ps_dna_gap_len = 0;
1417  } else if( dna[cur_pos] == GAP_CHAR ) { //gap in dna
1418  _ASSERT( prot[cur_pos] != GAP_CHAR );
1419  if( ps_dna_gap_len < 3 ) {
1420  ps_len += ps_len_increment;
1421  } else { //count as extention
1422  ++ps_len;
1423  }
1424  ++ps_dna_gap_len;
1425  ps_prot_gap_len = 0;
1426  } else if( prot[cur_pos] == GAP_CHAR ) { // gap in protein
1427  if( ps_prot_gap_len < 3 ) {
1428  ps_len += ps_len_increment;
1429  } else { //count as extention
1430  ++ps_len;
1431  }
1432  ++ps_prot_gap_len;
1433  ps_dna_gap_len = 0;
1434  } else { // mismatch, no gaps
1435  ps_len += ps_len_increment;
1436  ps_prot_gap_len = 0;
1437  ps_dna_gap_len = 0;
1438  }
1439 
1440  //check
1441  double posit_drop = wposit/(double)window_size - ps_pos/(double)ps_len;
1442  if( posit_drop >= dropoff && ( posit_drop > cur_max_drop || cur_cut == endpos ) ) {
1443  cur_max_drop = posit_drop;
1444  cur_cut = win_end;
1445  }
1446  }
1447  // cut ?
1448  if( cur_cut == endpos ) {
1449  keep_trimming = false;
1450  } else {//trim
1451 
1452  //handle weird cases
1453 
1454  //cut to positive
1455  for( --cur_cut; cur_cut >= begpos; --cur_cut ) {
1456  if(m_posit[cur_cut] == POSIT_CHAR) {
1457  ++cur_cut;
1458  break;
1459  }
1460  }
1461  if( cur_cut <= begpos ) return pc.end; // don't want to cut the whole piece
1462 
1463  //add positives back
1464  // we are on a positive here and cur_cut < endpos and cur_cut > begpos
1465  _ASSERT(cur_cut > begpos);
1466  _ASSERT(cur_cut < endpos);
1467  _ASSERT(m_posit[cur_cut-1] == POSIT_CHAR);
1468  for( ; cur_cut < endpos; ++cur_cut ) {
1469  if(m_posit[cur_cut] != POSIT_CHAR) {
1470  break;
1471  }
1472  }
1473  if(cur_cut >= endpos) return pc.end;//nothing to cut
1474 
1475  //cut
1476  pc.end = cur_cut;
1477  }
1478  }
1479  return pc.end;
1480 }
1481 
1482 /// end of CProSplignTrimmer implementation
1483 
1485 
CRef< CProduct_pos > NultriposToProduct_pos(int nultripos)
Convert linear coordinate into (amin,frame)
Definition: AliSeqAlign.cpp:77
list< CNPiece > ExcludeBadExons(const CNPiece pc, const string &match_all_pos, const string &protein, CProSplignOutputOptionsExt m_options)
Definition: Info.cpp:388
const char BAD_OR_MISMATCH[]
Definition: Info.cpp:65
const char MATCH_CHAR
Definition: Info.cpp:66
list< CNPiece > FindGoodParts(const CProteinAlignText &alignment_text, CProSplignOutputOptionsExt m_options, const CProSplignScaledScoring &scoring, const CSubstMatrix &matrix)
Definition: Info.cpp:107
const char BAD_PIECE_CHAR
Definition: Info.cpp:63
USING_SCOPE(ncbi::objects)
bool TrimNegativeTail(CNPiece &pc, const CProteinAlignText &alignment_text, const CProSplignScaledScoring &scoring, const CSubstMatrix &matrix)
Definition: Info.cpp:438
const char MISMATCH_CHAR
Definition: Info.cpp:64
const char INTRON_CHAR
Definition: Info.cpp:58
const char GAP_CHAR
Definition: Info.cpp:57
const char SPACE_CHAR
Definition: Info.cpp:59
const char POSIT_CHAR
Definition: Info.cpp:67
const char INTRON_OR_GAP[]
Definition: Info.cpp:60
void RefineAlignment(objects::CScope &scope, objects::CSeq_align &seq_align, const list< CNPiece > &good_parts)
void SetScores(objects::CSeq_align &seq_align, objects::CScope &scope, const string &matrix_name="BLOSUM62")
Definition: Info.hpp:50
int posit
Definition: Info.hpp:53
int efflen
Definition: Info.hpp:53
int end
Definition: Info.hpp:52
CNPiece(string::size_type obeg, string::size_type oend, int oposit, int oefflen)
Definition: Info.cpp:503
int beg
Definition: Info.hpp:52
Extended output filtering parameters deprecated, used in older programs.
Definition: Info.hpp:60
CProSplignOutputOptionsExt(const CProSplignOutputOptions &options)
Definition: Info.cpp:101
bool Perc(list< prosplign::CNPiece >::iterator it, int efflen, int posit, list< prosplign::CNPiece >::iterator last)
Definition: Info.cpp:523
bool ForwCheck(list< prosplign::CNPiece >::iterator it1, list< prosplign::CNPiece >::iterator it2)
Definition: Info.cpp:543
void Join(list< prosplign::CNPiece >::iterator it, list< prosplign::CNPiece >::iterator last)
Definition: Info.cpp:530
bool Bad(list< prosplign::CNPiece >::iterator it)
Definition: Info.cpp:511
bool BackCheck(list< prosplign::CNPiece >::iterator it1, list< prosplign::CNPiece >::iterator it2)
Definition: Info.cpp:559
bool Dropof(int efflen, int posit, list< prosplign::CNPiece >::iterator it)
Definition: Info.cpp:517
Output filtering parameters.
Definition: prosplign.hpp:156
bool GetCutFlankPartialCodons() const
Definition: prosplign.cpp:613
int GetTotalPositives() const
Definition: prosplign.cpp:683
int GetCutFlanksWithPositGapRatio() const
Definition: prosplign.cpp:603
int GetMaxBadLen() const
Definition: prosplign.cpp:693
int GetCutFlanksWithPositWindow() const
Definition: prosplign.cpp:583
int GetCutFlanksWithPositMaxLen() const
Definition: prosplign.cpp:593
bool GetFillHoles() const
Definition: prosplign.cpp:623
bool GetCutFlanksWithPositDrop() const
Definition: prosplign.cpp:563
int GetMinHoleLen() const
Definition: prosplign.cpp:633
int GetMinFlankingExonLen() const
Definition: prosplign.cpp:715
int GetMinGoodLen() const
Definition: prosplign.cpp:725
int GetFlankPositives() const
Definition: prosplign.cpp:673
int GetCutFlanksWithPositDropoff() const
Definition: prosplign.cpp:573
int GetMinExonPos() const
Definition: prosplign.cpp:663
int GetStartBonus() const
Definition: prosplign.cpp:736
bool IsPassThrough() const
Definition: prosplign.cpp:553
int GetMinExonId() const
Definition: prosplign.cpp:653
static void Output(const objects::CSeq_align &seqalign, objects::CScope &scope, ostream &out, int width, const string &matrix_name="BLOSUM62")
Outputs formatted text.
Definition: Info.cpp:596
size_t RestoreThreePrime(size_t end) const
Definition: Info.cpp:1132
CProSplignTrimmer(const CProteinAlignText &alignment_text)
CProSplignTrimmer implementation.
Definition: Info.cpp:1066
size_t RestoreFivePrime(size_t beg) const
checks if alignment ends should be restored beyond 'beg' or 'end' returns new flanking coord or 'beg'...
Definition: Info.cpp:1088
int CutFromRight(CNPiece pc, const CProSplignOutputOptionsExt &options) const
trim right flank with positives dropoff over a cutoff, iterative 'pc' should not be dropped completel...
Definition: Info.cpp:1333
int CutFromLeft(CNPiece pc, const CProSplignOutputOptionsExt &options) const
trim flanks with positives dropoff over a cutoff, iterative flank 'good pieces' should not be dropped...
Definition: Info.cpp:1181
const CProteinAlignText & m_alignment_text
Definition: Info.cpp:88
string m_posit
Definition: Info.cpp:90
TSeqPos AsSeqPos() const
Definition: Product_pos.cpp:56
Text representation of ProSplign alignment.
Definition: alntext.hpp:60
const string & GetDNA() const
Definition: alntext.hpp:77
const string & GetMatch() const
Definition: alntext.hpp:79
const string & GetProtein() const
Definition: alntext.hpp:80
static CRef< objects::CSeq_loc > GetGenomicBounds(objects::CScope &scope, const objects::CSeq_align &seqalign)
Definition: alntext.cpp:394
const string & GetTranslation() const
Definition: alntext.hpp:78
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
CSpliced_exon_chunk –.
Substitution Matrix for Scoring Amino-Acid Alignments.
Definition: nucprot.hpp:123
int ScaledScore(char amin1, char amin2) const
Definition: nucprot.hpp:131
static const char * bounds[]
Include a standard set of the NCBI C++ Toolkit most basic headers.
static ulg window_size
std::ofstream out("events_result.xml")
main entry point for tests
#define false
Definition: bool.h:36
static DLIST_TYPE *DLIST_NAME() last(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:51
static char tmp[3200]
Definition: utf8.c:42
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define _DEBUG_CODE(code)
Definition: ncbidbg.hpp:136
#define _DEBUG_ARG(arg)
Definition: ncbidbg.hpp:134
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
Definition: Seq_id.cpp:2145
CRange< TSeqPos > TSeqRange
typedefs for sequence ranges
Definition: range.hpp:419
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
bool CanGetData(void) const
Check if it is safe to call GetData method.
const TData & GetData(void) const
Get the Data member data.
vector< CRef< CUser_field > > TData
const TGenomic_id & GetGenomic_id(void) const
Get the Genomic_id member data.
void SetProduct_start(TProduct_start &value)
Assign a value to Product_start data member.
TMatch GetMatch(void) const
Get the variant data.
const TProduct_id & GetProduct_id(void) const
Get the Product_id member data.
TGenomic_start GetGenomic_start(void) const
Get the Genomic_start member data.
bool IsMismatch(void) const
Check if variant Mismatch is selected.
void SetSegs(TSegs &value)
Assign a value to Segs data member.
Definition: Seq_align_.cpp:310
bool IsSetAcceptor_before_exon(void) const
splice sites Check if a value has been assigned to Acceptor_before_exon data member.
TExons & SetExons(void)
Assign a value to Exons data member.
TDiag GetDiag(void) const
Get the variant data.
TMismatch GetMismatch(void) const
Get the variant data.
TGenomic_strand GetGenomic_strand(void) const
Get the Genomic_strand member data.
list< CRef< CUser_object > > TExt
Definition: Seq_align_.hpp:402
void SetGenomic_start(TGenomic_start value)
Assign a value to Genomic_start data member.
const TParts & GetParts(void) const
Get the Parts member data.
const TProduct_start & GetProduct_start(void) const
Get the Product_start member data.
const TProduct_end & GetProduct_end(void) const
Get the Product_end member data.
const TSpliced & GetSpliced(void) const
Get the variant data.
Definition: Seq_align_.cpp:219
bool IsGenomic_ins(void) const
Check if variant Genomic_ins is selected.
bool IsMatch(void) const
Check if variant Match is selected.
TGenomic_ins GetGenomic_ins(void) const
Get the variant data.
void SetPartial(TPartial value)
Assign a value to Partial data member.
bool CanGetExt(void) const
Check if it is safe to call GetExt method.
Definition: Seq_align_.hpp:995
list< CRef< CSpliced_exon > > TExons
const TExons & GetExons(void) const
Get the Exons member data.
void ResetAcceptor_before_exon(void)
Reset Acceptor_before_exon data member.
TParts & SetParts(void)
Assign a value to Parts data member.
bool IsDiag(void) const
Check if variant Diag is selected.
void SetGenomic_end(TGenomic_end value)
Assign a value to Genomic_end data member.
const TExt & GetExt(void) const
Get the Ext member data.
list< CRef< CSpliced_exon_chunk > > TParts
TGenomic_end GetGenomic_end(void) const
Get the Genomic_end member data.
bool IsProduct_ins(void) const
Check if variant Product_ins is selected.
TProduct_ins GetProduct_ins(void) const
Get the variant data.
const TSegs & GetSegs(void) const
Get the Segs member data.
Definition: Seq_align_.hpp:921
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
yy_size_t n
int len
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int toupper(Uchar c)
Definition: ncbictype.hpp:73
int islower(Uchar c)
Definition: ncbictype.hpp:66
int isupper(Uchar c)
Definition: ncbictype.hpp:70
T max(T x_, T y_)
static int match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, uint16_t top_bracket, PCRE2_SIZE frame_size, pcre2_match_data *match_data, match_block *mb)
Definition: pcre2_match.c:594
#define count
#define _ASSERT
Modified on Fri Sep 20 14:57:48 2024 by modify_doxy.py rev. 669887