NCBI C++ ToolKit
alnvec.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: alnvec.cpp 100369 2023-07-25 17:29:42Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Kamen Todorov, NCBI
27 *
28 * File Description:
29 * Access to the actual aligned residues
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
36 
37 // Objects includes
38 #include <objects/seq/Bioseq.hpp>
39 #include <objects/seq/IUPACna.hpp>
41 #include <objects/seq/Seqdesc.hpp>
42 #include <objects/seq/Seq_inst.hpp>
49 
50 // Object Manager includes
51 #include <objmgr/scope.hpp>
52 #include <objmgr/seq_vector.hpp>
53 
55 
57 BEGIN_objects_SCOPE // namespace ncbi::objects::
58 
59 CAlnVec::CAlnVec(const CDense_seg& ds, CScope& scope)
60  : CAlnMap(ds),
61  m_Scope(&scope),
62  m_set_GapChar(false),
63  m_set_EndChar(false),
64  m_NaCoding(CSeq_data::e_not_set),
65  m_AaCoding(CSeq_data::e_not_set)
66 {
67 }
68 
69 
70 CAlnVec::CAlnVec(const CDense_seg& ds, TNumrow anchor, CScope& scope)
71  : CAlnMap(ds, anchor),
72  m_Scope(&scope),
73  m_set_GapChar(false),
74  m_set_EndChar(false),
75  m_NaCoding(CSeq_data::e_not_set),
76  m_AaCoding(CSeq_data::e_not_set)
77 {
78 }
79 
80 
82 {
83 }
84 
85 
87 {
89 
90  if (i != m_BioseqHandlesCache.end()) {
91  return i->second;
92  } else {
93  CBioseq_Handle bioseq_handle =
95  if (bioseq_handle) {
96  return m_BioseqHandlesCache[row] = bioseq_handle;
97  } else {
98  string errstr = string("CAlnVec::GetBioseqHandle(): ")
99  + "Seq-id cannot be resolved: "
101 
102  NCBI_THROW(CAlnException, eInvalidSeqId, errstr);
103  }
104  }
105 }
106 
107 
109 {
111  CRef<CSeqVector> seq_vec;
112  if (iter != m_SeqVectorCache.end()) {
113  seq_vec = iter->second;
114  }
115  else {
117  CSeqVector vec = h.GetSeqVector
122  seq_vec.Reset(new CSeqVector(vec));
123  m_SeqVectorCache[row] = seq_vec;
124  }
125  if ( seq_vec->IsNucleotide() ) {
127  seq_vec->SetCoding(m_NaCoding);
128  }
129  else {
130  seq_vec->SetIupacCoding();
131  }
132  }
133  else if ( seq_vec->IsProtein() ) {
135  seq_vec->SetCoding(m_AaCoding);
136  }
137  else {
138  seq_vec->SetIupacCoding();
139  }
140  }
141  return *seq_vec;
142 }
143 
144 
146  TNumrow row,
147  const TSignedRange& aln_rng) const
148 {
149  string buff;
150  buffer.erase();
151 
152  CSeqVector& seq_vec = x_GetSeqVector(row);
153  TSeqPos seq_vec_size = seq_vec.size();
154 
155  // get the chunks which are aligned to seq on anchor
156  CRef<CAlnMap::CAlnChunkVec> chunk_vec =
158 
159  // for each chunk
160  for (int i=0; i<chunk_vec->size(); i++) {
161  CConstRef<CAlnMap::CAlnChunk> chunk = (*chunk_vec)[i];
162 
163  if (chunk->GetType() & fSeq) {
164  // add the sequence string
165  if (IsPositiveStrand(row)) {
166  seq_vec.GetSeqData(chunk->GetRange().GetFrom(),
167  chunk->GetRange().GetTo() + 1,
168  buff);
169  } else {
170  seq_vec.GetSeqData(seq_vec_size - chunk->GetRange().GetTo() - 1,
171  seq_vec_size - chunk->GetRange().GetFrom(),
172  buff);
173  }
174  if (GetWidth(row) == 3) {
175  TranslateNAToAA(buff, buff, GetGenCode(row));
176  }
177  buffer += buff;
178  } else {
179  // add appropriate number of gap/end chars
180  const int n = chunk->GetAlnRange().GetLength();
181  char* ch_buff = new char[n+1];
182  char fill_ch;
183  if (chunk->GetType() & fNoSeqOnLeft ||
184  chunk->GetType() & fNoSeqOnRight) {
185  fill_ch = GetEndChar();
186  } else {
187  fill_ch = GetGapChar(row);
188  }
189  memset(ch_buff, fill_ch, n);
190  ch_buff[n] = 0;
191  buffer += ch_buff;
192  delete[] ch_buff;
193  }
194  }
195  return buffer;
196 }
197 
198 
200  string& buffer,
201  TSeqPosList * insert_aln_starts,
202  TSeqPosList * insert_starts,
203  TSeqPosList * insert_lens,
204  unsigned int scrn_width,
205  TSeqPosList * scrn_lefts,
206  TSeqPosList * scrn_rights) const
207 {
208  TSeqPos aln_pos = 0,
209  len = 0,
210  curr_pos = 0,
211  anchor_pos = 0,
212  scrn_pos = 0,
213  prev_len = 0,
214  ttl_len = 0;
215  TSignedSeqPos start = -1,
216  stop = -1,
217  scrn_lft_seq_pos = -1,
218  scrn_rgt_seq_pos = -1,
219  prev_aln_pos = -1,
220  prev_start = -1;
221  TNumseg seg;
222  int pos, nscrns, delta;
223 
224  TSeqPos aln_len = GetAlnStop() + 1;
225 
226  bool anchored = IsSetAnchor();
227  bool plus = IsPositiveStrand(row);
228  int width = GetWidth(row);
229 
230  scrn_width *= width;
231 
232  const bool record_inserts = insert_starts && insert_lens;
233  const bool record_coords = scrn_width && scrn_lefts && scrn_rights;
234 
235  // allocate space for the row
236  buffer.clear();
237  buffer.reserve(aln_len);
238  string buff;
239 
240  const TNumseg& left_seg = x_GetSeqLeftSeg(row);
241  const TNumseg& right_seg = x_GetSeqRightSeg(row);
242 
243  // loop through all segments
244  for (seg = 0, pos = row, aln_pos = 0, anchor_pos = m_Anchor;
245  seg < m_NumSegs;
246  ++seg, pos += m_NumRows, anchor_pos += m_NumRows) {
247 
248  const TSeqPos& seg_len = m_Lens[seg];
249  start = m_Starts[pos];
250  len = seg_len * width;
251 
252  if (anchored && m_Starts[anchor_pos] < 0) {
253  if (start >= 0) {
254  // record the insert if requested
255  if (record_inserts) {
256  if (prev_aln_pos == (TSignedSeqPos)(aln_pos / width) &&
257  start == (TSignedSeqPos)(plus ? prev_start + prev_len :
258  prev_start - len)) {
259  // consolidate the adjacent inserts
260  ttl_len += len;
261  insert_lens->pop_back();
262  insert_lens->push_back(ttl_len);
263  if (!plus) {
264  insert_starts->pop_back();
265  insert_starts->push_back(start);
266  }
267  } else {
268  prev_aln_pos = aln_pos / width;
269  ttl_len = len;
270  insert_starts->push_back(start);
271  insert_aln_starts->push_back(prev_aln_pos);
272  insert_lens->push_back(len);
273  }
274  prev_start = start;
275  prev_len = len;
276  }
277  }
278  } else {
279  if (start >= 0) {
280  stop = start + len - 1;
281 
282  // add regular sequence to buffer
283  GetSeqString(buff, row, start, stop);
284  TSeqPos buf_len = min<TSeqPos>(TSeqPos(buff.size()), seg_len);
285  buffer += buff;
286  if (buf_len < seg_len) {
287  // Not enough chars in the sequence, add gap
288  buf_len = seg_len - buf_len;
289  char fill_ch;
290 
291  if (seg < left_seg || seg > right_seg) {
292  fill_ch = GetEndChar();
293  } else {
294  fill_ch = GetGapChar(row);
295  }
296 
297  for (size_t i = 0; i < buf_len; ++i) {
298  buffer += fill_ch;
299  }
300  }
301 
302  // take care of coords if necessary
303  if (record_coords) {
304  if (scrn_lft_seq_pos < 0) {
305  scrn_lft_seq_pos = plus ? start : stop;
306  if (scrn_rgt_seq_pos < 0) {
307  scrn_rgt_seq_pos = scrn_lft_seq_pos;
308  }
309  }
310  // previous scrns
311  nscrns = (aln_pos - scrn_pos) / scrn_width;
312  for (int i = 0; i < nscrns; i++) {
313  scrn_lefts->push_back(scrn_lft_seq_pos);
314  scrn_rights->push_back(scrn_rgt_seq_pos);
315  if (i == 0) {
316  scrn_lft_seq_pos = plus ? start : stop;
317  }
318  scrn_pos += scrn_width;
319  }
320  if (nscrns > 0) {
321  scrn_lft_seq_pos = plus ? start : stop;
322  }
323  // current scrns
324  nscrns = (aln_pos + len - scrn_pos) / scrn_width;
325  curr_pos = aln_pos;
326  for (int i = 0; i < nscrns; i++) {
327  delta = (plus ?
328  scrn_width - (curr_pos - scrn_pos) :
329  curr_pos - scrn_pos - scrn_width);
330 
331  scrn_lefts->push_back(scrn_lft_seq_pos);
332  if (plus ?
333  scrn_lft_seq_pos < start :
334  scrn_lft_seq_pos > stop) {
335  scrn_lft_seq_pos = (plus ? start : stop) +
336  delta;
337  scrn_rgt_seq_pos = scrn_lft_seq_pos +
338  (plus ? -1 : 1);
339  } else {
340  scrn_rgt_seq_pos = scrn_lft_seq_pos + (plus ? -1 : 1)
341  + delta;
342  scrn_lft_seq_pos += delta;
343  }
344  if (seg == left_seg &&
345  scrn_lft_seq_pos == scrn_rgt_seq_pos) {
346  if (plus) {
347  scrn_rgt_seq_pos--;
348  } else {
349  scrn_rgt_seq_pos++;
350  }
351  }
352  scrn_rights->push_back(scrn_rgt_seq_pos);
353  curr_pos = scrn_pos += scrn_width;
354  }
355  if (aln_pos + len <= scrn_pos) {
356  scrn_lft_seq_pos = -1; // reset
357  }
358  scrn_rgt_seq_pos = plus ? stop : start;
359  }
360  } else {
361  // add appropriate number of gap/end chars
362 
363  char fill_ch;
364 
365  if (seg < left_seg || seg > right_seg) {
366  fill_ch = GetEndChar();
367  } else {
368  fill_ch = GetGapChar(row);
369  }
370 
371  for (size_t i = 0; i < seg_len; ++i) {
372  buffer += fill_ch;
373  }
374  }
375  aln_pos += len;
376  }
377 
378  }
379 
380  // take care of the remaining coords if necessary
381  if (record_coords) {
382  // previous scrns
383  TSeqPos pos_diff = aln_pos - scrn_pos;
384  if (pos_diff > 0) {
385  nscrns = pos_diff / scrn_width;
386  if (pos_diff % scrn_width) {
387  nscrns++;
388  }
389  for (int i = 0; i < nscrns; i++) {
390  scrn_lefts->push_back(scrn_lft_seq_pos);
391  scrn_rights->push_back(scrn_rgt_seq_pos);
392  if (i == 0) {
393  scrn_lft_seq_pos = scrn_rgt_seq_pos;
394  }
395  scrn_pos += scrn_width;
396  }
397  }
398  }
399  return buffer;
400 }
401 
402 
403 //
404 // CreateConsensus()
405 //
406 // compute a consensus sequence given a particular alignment
407 // the rules for a consensus are:
408 // - a segment is consensus gap if > 50% of the sequences are gap at this
409 // segment. 50% exactly is counted as sequence
410 // - for a segment counted as sequence, for each position, the most
411 // frequently occurring base is counted as consensus. in the case of
412 // a tie, the consensus is considered muddied, and the consensus is
413 // so marked
414 //
416 CAlnVec::CreateConsensus(int& consensus_row,
417  CBioseq& consensus_seq,
418  const CSeq_id& consensus_id,
419  vector<string>* consens) const
420 {
421  consensus_seq.Reset();
422  if ( !m_DS || m_NumRows < 1) {
423  return CRef<CDense_seg>();
424  }
425 
426  bool isNucleotide = GetBioseqHandle(0).IsNucleotide();
427 
428  size_t i;
429  size_t j;
430 
431  // If the caller did not pass in consensus values, compute them now
432  vector<string> c;
433  if (consens == NULL) {
434  c.resize(m_NumSegs);
435  CreateConsensus(c);
436  consens = &c;
437  }
438 
439  //
440  // now, create a new CDense_seg
441  // we create a new CBioseq for our data and
442  // copy the contents of the CDense_seg
443  //
444  string data;
445  TSignedSeqPos total_bases = 0;
446 
447  CRef<CDense_seg> new_ds(new CDense_seg());
448  new_ds->SetDim(m_NumRows + 1);
449  new_ds->SetNumseg(m_NumSegs);
450  new_ds->SetLens() = m_Lens;
451  new_ds->SetStarts().reserve(m_Starts.size() + m_NumSegs);
452  if ( !m_Strands.empty() ) {
453  new_ds->SetStrands().reserve(m_Strands.size() +
454  m_NumSegs);
455  }
456 
457  for (i = 0; i < consens->size(); ++i) {
458  // copy the old entries
459  for (j = 0; j < (size_t)m_NumRows; ++j) {
460  size_t idx = i * m_NumRows + j;
461  new_ds->SetStarts().push_back(m_Starts[idx]);
462  if ( !m_Strands.empty() ) {
463  new_ds->SetStrands().push_back(m_Strands[idx]);
464  }
465  }
466 
467  // add our new entry
468  // this places the consensus as the last sequence
469  // it should preferably be the first, but this would mean adjusting
470  // the bioseq handle and seqvector caches, and all row numbers would
471  // shift
472  if ((*consens)[i].length() != 0) {
473  new_ds->SetStarts().push_back(total_bases);
474  } else {
475  new_ds->SetStarts().push_back(-1);
476  }
477 
478  if ( !m_Strands.empty() ) {
479  new_ds->SetStrands().push_back(eNa_strand_unknown);
480  }
481 
482  total_bases += TSignedSeqPos((*consens)[i].length());
483  data += (*consens)[i];
484  }
485 
486  // copy our IDs
487  for (i = 0; i < m_Ids.size(); ++i) {
488  new_ds->SetIds().push_back(m_Ids[i]);
489  }
490 
491  // now, we construct a new Bioseq
492  {{
493 
494  // sequence ID
495  CRef<CSeq_id> id(new CSeq_id());
496  id->Assign(consensus_id);
497  consensus_seq.SetId().push_back(id);
498 
499  new_ds->SetIds().push_back(id);
500 
501  // add a description for this sequence
502  CSeq_descr& desc = consensus_seq.SetDescr();
503  CRef<CSeqdesc> d(new CSeqdesc);
504  desc.Set().push_back(d);
505  d->SetComment("This is a generated consensus sequence");
506 
507  // the main one: Seq-inst
508  CSeq_inst& inst = consensus_seq.SetInst();
510  inst.SetMol(isNucleotide ? CSeq_inst::eMol_na : CSeq_inst::eMol_aa);
511  inst.SetLength(CSeq_inst::TLength(data.length()));
512 
513  CSeq_data& seq_data = inst.SetSeq_data();
514  if (isNucleotide) {
515  CIUPACna& na = seq_data.SetIupacna();
516  na = CIUPACna(data);
517  } else {
518  CIUPACaa& aa = seq_data.SetIupacaa();
519  aa = CIUPACaa(data);
520  }
521  }}
522 
523  consensus_row = int(new_ds->GetIds().size()) - 1;
524  return new_ds;
525 }
526 
527 void CAlnVec::TransposeSequences(vector<string>& segs)
528 {
529  char* buf = NULL;
530  size_t cols = 0;
531  size_t rows = segs.size();
532  size_t gap_rows = 0;
533  for (size_t row = 0; row < rows; ++row) {
534  const string& s = segs[row];
535  if (s.empty()) {
536  ++gap_rows;
537  continue;
538  }
539  if (cols == 0) {
540  cols = s.size();
541  buf = new char[(rows+1)*(cols+1)];
542  }
543  const char* src = s.c_str();
544  char* dst = buf+(row-gap_rows);
545  while ((*dst = *src++)) {
546  dst += rows+1;
547  }
548  }
549  segs.clear();
550  for (size_t col = 0; col < cols; ++col) {
551  char* col_buf = buf + col*(rows+1);
552  *(col_buf+(rows-gap_rows)) = 0;
553  segs.push_back(string(col_buf));
554  }
555  delete[] buf;
556 }
557 
558 void CAlnVec::CollectNucleotideFrequences(const string& col, int base_count[], int numBases)
559 {
560  // first, we record which bases occur and how often
561  // this is computed in NCBI4na notation
562  fill_n(base_count, numBases, 0);
563 
564  const char* i = col.c_str();
565  unsigned char c;
566  while ((c = *i++)) {
567  switch(c) {
568  case 'A':
569  ++base_count[0];
570  break;
571  case 'C':
572  ++base_count[1];
573  break;
574  case 'M':
575  ++base_count[1];
576  ++base_count[0];
577  break;
578  case 'G':
579  ++base_count[2];
580  break;
581  case'R':
582  ++base_count[2];
583  ++base_count[0];
584  break;
585  case 'S':
586  ++base_count[2];
587  ++base_count[1];
588  break;
589  case 'V':
590  ++base_count[2];
591  ++base_count[1];
592  ++base_count[0];
593  break;
594  case 'T':
595  ++base_count[3];
596  break;
597  case 'W':
598  ++base_count[3];
599  ++base_count[0];
600  break;
601  case 'Y':
602  ++base_count[3];
603  ++base_count[1];
604  break;
605  case 'H':
606  ++base_count[3];
607  ++base_count[1];
608  ++base_count[0];
609  break;
610  case 'K':
611  ++base_count[3];
612  ++base_count[2];
613  break;
614  case 'D':
615  ++base_count[3];
616  ++base_count[2];
617  ++base_count[0];
618  break;
619  case 'B':
620  ++base_count[3];
621  ++base_count[2];
622  ++base_count[1];
623  break;
624  case 'N':
625  ++base_count[3];
626  ++base_count[2];
627  ++base_count[1];
628  ++base_count[0];
629  break;
630  default:
631  break;
632  }
633  }
634 }
635 
636 void CAlnVec::CollectProteinFrequences(const string& col, int base_count[], int numBases)
637 {
638  // first, we record which bases occur and how often
639  // this is computed in NCBI4na notation
640  fill_n(base_count, numBases, 0);
641 
642  const char* i = col.c_str();
643  char c;
644  while ((c = *i++)) {
645  int pos = c-'A';
646  if (0<=pos && pos < numBases)
647  ++base_count[ pos ];
648  }
649 }
650 
// Compute one consensus string per alignment segment.
//
// consens is indexed by segment number and is written through consens[j]
// for j in [0, m_NumSegs); the caller has to size it beforehand.  Segments
// that are judged to be consensus gaps are left untouched.
//
// Whether nucleotide or protein rules apply is decided from row 0 only.
// NOTE(review): mixed nucleotide/protein alignments would all be treated
// like row 0 - confirm this is intended.
651 void CAlnVec::CreateConsensus(vector<string>& consens) const
652 {
653  bool isNucleotide = GetBioseqHandle(0).IsNucleotide();
654 
// 4 counters (A, C, G, T) for nucleotides, 26 (letters 'A'..'Z') for protein.
655  const int numBases = isNucleotide ? 4 : 26;
656 
657  int base_count[26]; // must be a compile-time constant for some compilers
658 
659  // determine what the number of segments required for a gapped consensus
660  // segment is. this must be rounded to be at least 50%.
// The threshold is m_NumRows minus half of it (integer division), i.e. half
// of the rows rounded up.
// NOTE(review): with a single row the threshold is 1, so a segment where
// that row is a gap is not skipped below; the transposed segs vector is then
// empty while the per-position loop still indexes it - confirm that input
// cannot occur.
661  int gap_seg_thresh = m_NumRows - m_NumRows / 2;
662 
663  for (size_t j = 0; j < (size_t)m_NumSegs; ++j) {
664  // evaluate for gap / no gap
// A row counts as gapped in this segment when its start is exactly -1.
665  int gap_count = 0;
666  for (size_t i = 0; i < (size_t)m_NumRows; ++i) {
667  if (m_Starts[ j*m_NumRows + i ] == -1) {
668  ++gap_count;
669  }
670  }
671 
672  // check to make sure that this seg is not a consensus
673  // gap seg
674  if ( gap_count > gap_seg_thresh )
675  continue;
676 
677  // the base threshold for being considered unique is at least
678  // 70% of the available sequences
// The "+ 5" rounds 70% of the non-gapped row count to the nearest integer.
679  int base_thresh =
680  ((m_NumRows - gap_count) * 7 + 5) / 10;
681 
682  {
683  // we will build a segment with enough bases to match
684  consens[j].resize(m_Lens[j]);
685 
686  // retrieve all sequences for this segment
// After TransposeSequences, segs[i] holds the residues found at position i
// of the segment across the non-gapped rows.
687  vector<string> segs(m_NumRows);
688  RetrieveSegmentSequences(j, segs);
689  TransposeSequences(segs);
690 
// NOTE(review): the declaration of TRevMap is not visible in this listing.
// The code below relies on begin() referring to the highest count, so it is
// presumably a multimap keyed by count in descending order - confirm.
692 
693  //
694  // evaluate for a consensus
695  //
// NOTE(review): indexing segs[i] for every i below m_Lens[j] assumes the
// transposition produced at least m_Lens[j] columns - confirm.
696  for (size_t i = 0; i < m_Lens[j]; ++i) {
697  if (isNucleotide) {
698  CollectNucleotideFrequences(segs[i], base_count, numBases);
699  } else {
700  CollectProteinFrequences(segs[i], base_count, numBases);
701  }
702 
703 
704  // we create a sorted list (in descending order) of
705  // frequencies of appearance to base
706  // the frequency is "global" for this position: that is,
707  // if 40% of the sequences are gapped, the highest frequency
708  // any base can have is 0.6
709  TRevMap rev_map;
710 
// The mapped value is a single-bit mask (1 << k) for nucleotides and the
// letter offset k (0 = 'A') for protein.
711  for (int k = 0; k < numBases; ++k) {
712  // this gets around a potentially tricky idiosyncrasy
713  // in some implementations of multimap. depending on
714  // the library, the key may be const (or not)
715  TRevMap::value_type p(base_count[k], isNucleotide ? (1<<k) : k);
716  rev_map.insert(p);
717  }
718 
719  // now, the first element here contains the best frequency
720  // we scan for the appropriate bases
// Unambiguous case: the top count is not shared with any other residue and
// reaches base_thresh.
721  if (rev_map.count(rev_map.begin()->first) == 1 &&
722  rev_map.begin()->first >= base_thresh) {
723  consens[j][i] = isNucleotide ?
724  ToIupac(rev_map.begin()->second) :
725  (rev_map.begin()->second+'A');
726  } else {
727  // now we need to make some guesses based on IUPACna
728  // notation
// Walk the entries from the front, accumulating counts until base_thresh is
// reached; entries tied with the previous one are always taken as well.
// count ends up as the number of entries consumed.
729  int count;
730  unsigned char c = 0x00;
731  int freq = 0;
732  TRevMap::iterator curr = rev_map.begin();
733  TRevMap::iterator prev = rev_map.begin();
734  for (count = 0;
735  curr != rev_map.end() &&
736  (freq < base_thresh || prev->first == curr->first);
737  ++curr, ++count) {
738  prev = curr;
739  freq += curr->first;
740  if (isNucleotide) {
// Nucleotides: OR the single-bit masks together into one ambiguity mask.
741  c |= curr->second;
742  } else {
// Protein: the first residue is taken as is; a second one collapses the
// pairs N/D, Q/E and I/L to 'B', 'Z' and 'J' respectively, and anything
// else becomes 'X'.
743  unsigned char cur_char = curr->second+'A';
744  switch (c) {
745  case 0x00:
746  c = cur_char;
747  break;
748  case 'N': case 'D':
749  c = (cur_char == 'N' || cur_char == 'D') ? 'B' : 'X';
750  break;
751  case 'Q': case 'E':
752  c = (cur_char == 'Q' || cur_char == 'E') ? 'Z' : 'X';
753  break;
754  case 'I': case 'L':
755  c = (cur_char == 'I' || cur_char == 'L') ? 'J' : 'X';
756  break;
757  default:
758  c = 'X';
759  }
760  }
761  }
762 
763  //
764  // catchall
765  //
// More than two contributing residues is reported as fully ambiguous.
766  if (count > 2) {
767  consens[j][i] = isNucleotide ? 'N' : 'X';
768  } else {
769  consens[j][i] = isNucleotide ? ToIupac(c) : c;
770  }
771  }
772  }
773  }
774  }
775 }
776 
777 void CAlnVec::RetrieveSegmentSequences(size_t segment, vector<string>& segs) const
778 {
779  size_t segment_row_index = segment*m_NumRows;
780  for (size_t i = 0; i < (size_t)m_NumRows; ++i, ++segment_row_index) {
781  TSignedSeqPos start = m_Starts[ segment_row_index ];
782  if (start != -1) {
783  TSeqPos stop = start + m_Lens[segment];
784 
785  string& s = segs[i];
786 
787  if (IsPositiveStrand(TNumrow(i))) {
788  x_GetSeqVector(TNumrow(i)).GetSeqData(start, stop, s);
789  } else {
790  CSeqVector & seq_vec = x_GetSeqVector(TNumrow(i));
791  TSeqPos size = seq_vec.size();
792  seq_vec.GetSeqData(size - stop, size - start, s);
793  }
794  }
795  else {
796  segs[i].clear();
797  }
798  }
799 }
800 
802  const CSeq_id& consensus_id) const
803 {
804  CRef<CBioseq> bioseq(new CBioseq);
805  CRef<CDense_seg> ds = CreateConsensus(consensus_row,
806  *bioseq, consensus_id);
807 
808  // add bioseq to the scope
809  CRef<CSeq_entry> entry(new CSeq_entry());
810  entry->SetSeq(*bioseq);
811  GetScope().AddTopLevelSeqEntry(*entry);
812 
813  return ds;
814 }
815 
816 
818 {
819  CSeq_id id("lcl|consensus");
820  return CreateConsensus(consensus_row, id);
821 }
822 
823 
825 
// Score two residue strings against each other.
//
//   s1, s2                    residue strings to compare
//   s1_is_prot, s2_is_prot    whether the respective string is protein
//   gen_code1, gen_code2      genetic codes handed to TranslateNAToAA in the
//                             mixed protein/nucleotide case
//
// Returns the accumulated score: protein vs protein sums entries of
// s_FullScoreMatrix, nucleotide vs nucleotide adds 1 per identical position
// and subtracts 3 per differing one.
// Throws CAlnException (eInvalidRequest) when the length checks fail.
826 int CAlnVec::CalculateScore(const string& s1, const string& s2,
827  bool s1_is_prot, bool s2_is_prot,
828  int gen_code1, int gen_code2)
829 {
830  // check the lengths
// Same kind of residues: the lengths must be equal.  Otherwise each length
// is scaled by 1 (protein) or 3 (nucleotide) and the products must match.
// NOTE(review): for mixed input this multiplies the nucleotide length by 3,
// so a protein of length p against 3*p nucleotides compares p with 9*p and
// throws; the factors look swapped - confirm.
831  if (s1_is_prot == s2_is_prot && s1.length() != s2.length()) {
832  NCBI_THROW(CAlnException, eInvalidRequest,
833  "CAlnVec::CalculateScore(): "
834  "Strings should have equal lenghts.");
835  } else if (s1.length() * (s1_is_prot ? 1 : 3) !=
836  s2.length() * (s2_is_prot ? 1 : 3)) {
837  NCBI_THROW(CAlnException, eInvalidRequest,
838  "CAlnVec::CalculateScore(): "
839  "Strings lengths do not match.");
840  }
841 
842  int score = 0;
843 
// Walk both strings as unsigned bytes so that they can be used directly as
// matrix indices.
844  const unsigned char * res1 = (unsigned char *) s1.c_str();
845  const unsigned char * res2 = (unsigned char *) s2.c_str();
846  const unsigned char * end1 = res1 + s1.length();
847  const unsigned char * end2 = res2 + s2.length();
848 
// One-time setup guard for the score matrix.
// NOTE(review): the statement that follows the flag assignment is not
// visible in this listing; presumably it fills s_FullScoreMatrix - confirm.
// NOTE(review): the guard sits only in the protein/protein branch, while the
// mixed branch further down reads s_FullScoreMatrix as well - confirm the
// matrix is initialized on that path.
849  static bool s_FullScoreMatrixInitialized = false;
850  if (s1_is_prot && s2_is_prot) {
851  if ( !s_FullScoreMatrixInitialized ) {
852  s_FullScoreMatrixInitialized = true;
854  }
855 
856  // use BLOSUM62 matrix
857  for ( ; res1 != end1; res1++, res2++) {
858  _ASSERT(*res1 < NCBI_FSM_DIM);
859  _ASSERT(*res2 < NCBI_FSM_DIM);
860  score += s_FullScoreMatrix.s[*res1][*res2];
861  }
862  } else if ( !s1_is_prot && !s2_is_prot ) {
863  // use match score/mismatch penalty
864  for ( ; res1 != end1; res1++, res2++) {
865  if (*res1 == *res2) {
866  score += 1;
867  } else {
868  score -= 3;
869  }
870  }
871  } else {
// Mixed case: the nucleotide string is translated into t.
// NOTE(review): t is never read afterwards; both loops below keep walking
// res1/res2, which still point into the untranslated s1/s2 - confirm whether
// the nucleotide-side pointer was meant to iterate over t.
872  string t;
873  if (s1_is_prot) {
874  TranslateNAToAA(s2, t, gen_code2);
875  for ( ; res1 != end1; res1++, res2++) {
876  _ASSERT(*res1 < NCBI_FSM_DIM);
877  _ASSERT(*res2 < NCBI_FSM_DIM);
878  score += s_FullScoreMatrix.s[*res1][*res2];
879  }
880  } else {
881  TranslateNAToAA(s1, t, gen_code1);
882  for ( ; res2 != end2; res1++, res2++) {
883  _ASSERT(*res1 < NCBI_FSM_DIM);
884  _ASSERT(*res2 < NCBI_FSM_DIM);
885  score += s_FullScoreMatrix.s[*res1][*res2];
886  }
887  }
888  }
889  return score;
890 }
891 
892 
893 void CAlnVec::TranslateNAToAA(const string& na,
894  string& aa,
895  int gencode)
896 {
897  if (na.size() % 3) {
898  NCBI_THROW(CAlnException, eTranslateFailure,
899  "CAlnVec::TranslateNAToAA(): "
900  "NA size expected to be divisible by 3");
901  }
902 
903  const CTrans_table& tbl = CGen_code_table::GetTransTable(gencode);
904 
905  size_t na_size = na.size();
906 
907  if (&aa != &na) {
908  aa.resize(na_size / 3);
909  }
910 
911  int state = 0;
912  size_t aa_i = 0;
913  for (size_t na_i = 0; na_i < na_size; ) {
914  for (size_t i = 0; i < 3; i++) {
915  state = tbl.NextCodonState(state, na[na_i++]);
916  }
917  aa[aa_i++] = tbl.GetCodonResidue(state);
918  }
919 
920  if (&aa == &na) {
921  aa.resize(aa_i);
922  }
923 }
924 
925 
927 {
928  TNumrow numrows = m_NumRows;
929  TNumrow index1 = row1, index2 = row2;
930  TSignedSeqPos start1, start2;
931  string buff1, buff2;
932  bool isAA1, isAA2;
933  int score = 0;
934  TSeqPos len;
935 
936  isAA1 = GetBioseqHandle(row1).GetBioseqCore()
938 
939  isAA2 = GetBioseqHandle(row2).GetBioseqCore()
941 
942  CSeqVector& seq_vec1 = x_GetSeqVector(row1);
943  TSeqPos size1 = seq_vec1.size();
944  CSeqVector & seq_vec2 = x_GetSeqVector(row2);
945  TSeqPos size2 = seq_vec2.size();
946 
947  for (TNumseg seg = 0; seg < m_NumSegs; seg++) {
948  start1 = m_Starts[index1];
949  start2 = m_Starts[index2];
950 
951  if (start1 >=0 && start2 >= 0) {
952  len = m_Lens[seg];
953 
954  if (IsPositiveStrand(row1)) {
955  seq_vec1.GetSeqData(start1,
956  start1 + len,
957  buff1);
958  } else {
959  seq_vec1.GetSeqData(size1 - (start1 + len),
960  size1 - start1,
961  buff1);
962  }
963  if (IsPositiveStrand(row2)) {
964  seq_vec2.GetSeqData(start2,
965  start2 + len,
966  buff2);
967  } else {
968  seq_vec2.GetSeqData(size2 - (start2 + len),
969  size2 - start2,
970  buff2);
971  }
972  score += CalculateScore(buff1, buff2, isAA1, isAA2,
973  GetGenCode(row1), GetGenCode(row2));
974  }
975 
976  index1 += numrows;
977  index2 += numrows;
978  }
979  return score;
980 }
981 
982 
984  TSeqPos aln_pos,
985  TResidueCount * residue_count,
986  bool gaps_in_count) const
987 {
988  buffer.resize(GetNumRows(), GetEndChar());
989  if (aln_pos > GetAlnStop()) {
990  aln_pos = GetAlnStop(); // out-of-range adjustment
991  }
992  TNumseg seg = GetSeg(aln_pos);
993  TSeqPos delta = aln_pos - GetAlnStart(seg);
994  TSeqPos len = GetLen(seg);
995 
996  TSignedSeqPos pos;
997 
998  for (TNumrow row = 0; row < m_NumRows; row++) {
999  pos = GetStart(row, seg);
1000  if (pos >= 0) {
1001  // it's a sequence residue
1002 
1003  bool plus = IsPositiveStrand(row);
1004  if (plus) {
1005  pos += delta;
1006  } else {
1007  pos += len - 1 - delta;
1008  }
1009 
1010  CSeqVector& seq_vec = x_GetSeqVector(row);
1011  if (GetWidth(row) == 3) {
1012  string na_buff, aa_buff;
1013  if (plus) {
1014  seq_vec.GetSeqData(pos, pos + 3, na_buff);
1015  } else {
1016  TSeqPos size = seq_vec.size();
1017  seq_vec.GetSeqData(size - pos - 3, size - pos, na_buff);
1018  }
1019  TranslateNAToAA(na_buff, aa_buff, GetGenCode(row));
1020  buffer[row] = aa_buff[0];
1021  } else {
1022  buffer[row] = seq_vec[plus ? pos : seq_vec.size() - pos - 1];
1023  }
1024 
1025  if (residue_count) {
1026  (*residue_count)[FromIupac(buffer[row])]++;
1027  }
1028 
1029  } else {
1030  // it's a gap or endchar
1031 
1032  if (GetEndChar() != (buffer[row] = GetGapChar(row))) {
1033  // need to check the where the segment is
1034  // only if endchar != gap
1035  // this saves a check if there're the same
1036  TSegTypeFlags type = GetSegType(row, seg);
1037  if (type & fNoSeqOnLeft || type & fNoSeqOnRight) {
1038  buffer[row] = GetEndChar();
1039  }
1040  }
1041 
1042  if (gaps_in_count && residue_count) {
1043  (*residue_count)[FromIupac(buffer[row])]++;
1044  }
1045  }
1046  } // for row
1047 
1048  return buffer;
1049 }
1050 
1052 {
1053  string column;
1054  column.resize(m_NumRows);
1055 
1056  TResidueCount residue_cnt;
1057  residue_cnt.resize(16, 0);
1058 
1059  GetColumnVector(column, aln_pos, &residue_cnt);
1060 
1061  int max = 0, total = 0;
1062  ITERATE (TResidueCount, i_res, residue_cnt) {
1063  if (*i_res > max) {
1064  max = *i_res;
1065  }
1066  total += *i_res;
1067  }
1068  if (total) {
1069  return 100 * max / total;
1070  }
1071  else {
1072  return 0;
1073  }
1074 }
1075 
1076 
1077 END_objects_SCOPE // namespace ncbi::objects::
static CRef< CScope > m_Scope
User-defined methods of the data storage class.
static SNCBIFullScoreMatrix s_FullScoreMatrix
Definition: alnvec.cpp:824
TDim TNumrow
Definition: alnmap.hpp:69
bool IsSetAnchor(void) const
Definition: alnmap.hpp:524
list< TSeqPos > TSeqPosList
Definition: alnmap.hpp:73
int GetWidth(TNumrow row) const
Definition: alnmap.hpp:560
const TNumseg & x_GetSeqLeftSeg(TNumrow row) const
Definition: alnmap.cpp:716
const CDense_seg::TStarts & m_Starts
Definition: alnmap.hpp:373
TSegTypeFlags GetSegType(TNumrow row, TNumseg seg, int offset=0) const
Definition: alnmap.hpp:503
TNumseg m_NumSegs
Definition: alnmap.hpp:371
TSignedSeqPos GetStart(TNumrow row, TNumseg seg, int offset=0) const
Definition: alnmap.hpp:614
const CSeq_id & GetSeqId(TNumrow row) const
Definition: alnmap.hpp:645
bool IsPositiveStrand(TNumrow row) const
Definition: alnmap.hpp:600
const CDense_seg::TIds & m_Ids
Definition: alnmap.hpp:372
TNumseg GetSeg(TSeqPos aln_pos) const
Definition: alnmap.cpp:373
@ fNoSeqOnRight
Definition: alnmap.hpp:57
@ fNoSeqOnLeft
Definition: alnmap.hpp:58
@ fSeq
Definition: alnmap.hpp:52
TDim GetNumRows(void) const
Definition: alnmap.hpp:517
TNumrow m_Anchor
Definition: alnmap.hpp:378
const CDense_seg::TStrands & m_Strands
Definition: alnmap.hpp:375
CConstRef< CDense_seg > m_DS
Definition: alnmap.hpp:369
const TNumseg & x_GetSeqRightSeg(TNumrow row) const
Definition: alnmap.cpp:736
CRef< CAlnChunkVec > GetAlnChunks(TNumrow row, const TSignedRange &range, TGetChunkFlags flags=fAlnSegsOnly) const
Definition: alnmap.cpp:1002
unsigned int TSegTypeFlags
Definition: alnmap.hpp:50
TSeqPos GetAlnStart(void) const
Definition: alnmap.hpp:177
TSeqPos GetLen(TNumseg seg, int offset=0) const
Definition: alnmap.hpp:621
CDense_seg::TNumseg TNumseg
Definition: alnmap.hpp:72
TNumrow m_NumRows
Definition: alnmap.hpp:370
TSeqPos GetAlnStop(void) const
Definition: alnmap.hpp:495
@ fSkipInserts
Definition: alnmap.hpp:94
@ fSkipUnalignedGaps
Definition: alnmap.hpp:91
const CDense_seg::TLens & m_Lens
Definition: alnmap.hpp:374
static void CollectNucleotideFrequences(const string &col, int base_count[], int numBases)
Definition: alnvec.cpp:558
CAlnVec(const CDense_seg &ds, CScope &scope)
Definition: alnvec.cpp:59
const CBioseq_Handle & GetBioseqHandle(TNumrow row) const
Definition: alnvec.cpp:86
TResidue GetGapChar(TNumrow row) const
Definition: alnvec.hpp:358
string & GetSeqString(string &buffer, TNumrow row, TSeqPos seq_from, TSeqPos seq_to) const
Definition: alnvec.hpp:288
TResidue GetEndChar() const
Definition: alnvec.hpp:387
static void TranslateNAToAA(const string &na, string &aa, int gen_code=kDefaultGenCode)
Definition: alnvec.cpp:893
int GetGenCode(TNumrow row) const
Definition: alnvec.hpp:425
~CAlnVec(void)
Definition: alnvec.cpp:81
string & GetColumnVector(string &buffer, TSeqPos aln_pos, TResidueCount *residue_count=0, bool gaps_in_count=false) const
Definition: alnvec.cpp:983
string & GetWholeAlnSeqString(TNumrow row, string &buffer, TSeqPosList *insert_aln_starts=0, TSeqPosList *insert_starts=0, TSeqPosList *insert_lens=0, unsigned int scrn_width=0, TSeqPosList *scrn_lefts=0, TSeqPosList *scrn_rights=0) const
Definition: alnvec.cpp:199
TCoding m_AaCoding
Definition: alnvec.hpp:197
CSeqVector & x_GetSeqVector(TNumrow row) const
Definition: alnvec.cpp:108
CScope & GetScope(void) const
Definition: alnvec.hpp:247
static void CollectProteinFrequences(const string &col, int base_count[], int numBases)
Definition: alnvec.cpp:636
static unsigned char ToIupac(unsigned char c)
Definition: alnvec.hpp:468
CRef< CDense_seg > CreateConsensus(int &consensus_row) const
Definition: alnvec.cpp:817
string & GetAlnSeqString(string &buffer, TNumrow row, const CAlnMap::TSignedRange &aln_rng) const
Definition: alnvec.cpp:145
int CalculateScore(TNumrow row1, TNumrow row2) const
Definition: alnvec.cpp:926
void RetrieveSegmentSequences(size_t segment, vector< string > &segs) const
Definition: alnvec.cpp:777
TBioseqHandleCache m_BioseqHandlesCache
Definition: alnvec.hpp:183
vector< int > TResidueCount
Definition: alnvec.hpp:55
static unsigned char FromIupac(unsigned char c)
Definition: alnvec.hpp:444
TSeqVectorCache m_SeqVectorCache
Definition: alnvec.hpp:184
int CalculatePercentIdentity(TSeqPos aln_pos) const
Definition: alnvec.cpp:1051
static void TransposeSequences(vector< string > &segs)
Definition: alnvec.cpp:527
TCoding m_NaCoding
Definition: alnvec.hpp:196
CBioseq_Handle –.
CConstRef –.
Definition: ncbiobj.hpp:1266
static const CTrans_table & GetTransTable(int id)
CIUPACaa –.
Definition: IUPACaa.hpp:66
CIUPACna –.
Definition: IUPACna.hpp:66
CScope –.
Definition: scope.hpp:92
CSeqVector –.
Definition: seq_vector.hpp:65
Seq_descr.hpp: User-defined methods of the data storage class.
Definition: Seq_descr.hpp:55
Definition: Seq_entry.hpp:56
char GetCodonResidue(int state) const
static int NextCodonState(int state, unsigned char ch)
const_iterator end() const
Definition: map.hpp:152
const_iterator find(const key_type &key) const
Definition: map.hpp:153
#define false
Definition: bool.h:36
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
static const char * column
Definition: stats.c:23
char data[12]
Definition: iconv.c:80
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
string
Definition: cgiapp.hpp:690
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const string AsFastaString(void) const
Definition: Seq_id.cpp:2266
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry; default priority is higher than for defaults or loaders. Add object to the scope with p...
Definition: scope.cpp:522
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
bool IsNucleotide(void) const
TBioseqCore GetBioseqCore(void) const
Get bioseq core structure.
CSeqVector GetSeqVector(EVectorCoding coding, ENa_strand strand=eNa_strand_plus) const
Get sequence: Iupacna or Iupacaa if use_iupac_coding is true.
@ eStrand_Plus
Plus strand.
@ eStrand_Minus
Minus strand.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
TSeqPos size(void) const
Definition: seq_vector.hpp:291
bool IsProtein(void) const
Definition: seq_vector.hpp:350
void SetCoding(TCoding coding)
void SetIupacCoding(void)
Set coding to either Iupacaa or Iupacna depending on molecule type.
bool IsNucleotide(void) const
Definition: seq_vector.hpp:357
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
TLens & SetLens(void)
Assign a value to Lens data member.
Definition: Dense_seg_.hpp:561
void SetDim(TDim value)
Assign a value to Dim data member.
Definition: Dense_seg_.hpp:427
TStarts & SetStarts(void)
Assign a value to Starts data member.
Definition: Dense_seg_.hpp:536
TStrands & SetStrands(void)
Assign a value to Strands data member.
Definition: Dense_seg_.hpp:586
void SetNumseg(TNumseg value)
Assign a value to Numseg data member.
Definition: Dense_seg_.hpp:474
const TIds & GetIds(void) const
Get the Ids member data.
Definition: Dense_seg_.hpp:505
TIds & SetIds(void)
Assign a value to Ids data member.
Definition: Dense_seg_.hpp:511
@ eNa_strand_unknown
Definition: Na_strand_.hpp:65
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
TId & SetId(void)
Assign a value to Id data member.
Definition: Bioseq_.hpp:296
const TInst & GetInst(void) const
Get the Inst member data.
Definition: Bioseq_.hpp:336
TIupacna & SetIupacna(void)
Select the variant.
Definition: Seq_data_.hpp:517
TMol GetMol(void) const
Get the Mol member data.
Definition: Seq_inst_.hpp:612
TComment & SetComment(void)
Select the variant.
Definition: Seqdesc_.hpp:1065
void SetInst(TInst &value)
Assign a value to Inst data member.
Definition: Bioseq_.cpp:86
virtual void Reset(void)
Reset the whole object.
Definition: Bioseq_.cpp:97
void SetDescr(TDescr &value)
Assign a value to Descr data member.
Definition: Bioseq_.cpp:65
void SetRepr(TRepr value)
Assign a value to Repr data member.
Definition: Seq_inst_.hpp:574
Tdata & Set(void)
Assign a value to data member.
Definition: Seq_descr_.hpp:172
void SetLength(TLength value)
Assign a value to Length data member.
Definition: Seq_inst_.hpp:668
void SetSeq_data(TSeq_data &value)
Assign a value to Seq_data data member.
Definition: Seq_inst_.cpp:130
TIupacaa & SetIupacaa(void)
Select the variant.
Definition: Seq_data_.hpp:537
void SetMol(TMol value)
Assign a value to Mol data member.
Definition: Seq_inst_.hpp:621
TSeqPos TLength
Definition: Seq_inst_.hpp:147
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ e_not_set
No variant selected.
Definition: Seq_data_.hpp:103
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
@ e_not_set
char * buf
int i
yy_size_t n
int len
double value_type
The numeric datatype used by the parser.
Definition: muParserDef.h:228
const struct ncbi::grid::netcache::search::fields::SIZE size
EIPRangeType t
Definition: ncbi_localip.c:101
T max(T x_, T y_)
T plus(T x_)
Int4 delta(size_t dimension_, const Int4 *score_)
#define count
static uint8_t * buffer
Definition: pcre2test.c:1016
const SNCBIPackedScoreMatrix NCBISM_Blosum62
Definition: sm_blosum62.c:92
#define NCBI_FSM_DIM
Recommended approach: unpack and index directly.
Definition: raw_scoremat.h:85
void NCBISM_Unpack(const SNCBIPackedScoreMatrix *psm, SNCBIFullScoreMatrix *fsm)
Expand a packed score matrix into an unpacked one, which callers can proceed to index directly by sta...
Definition: raw_scoremat.c:81
#define row(bind, expected)
Definition: string_bind.c:73
TNCBIScore s[128][128]
Definition: raw_scoremat.h:87
Definition: type.c:6
#define _ASSERT
Modified on Fri Sep 20 14:57:11 2024 by modify_doxy.py rev. 669887