NCBI C++ ToolKit
seq.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================*/
25 
26 /*****************************************************************************
27 
28 File name: seq.cpp
29 
30 Author: Jason Papadopoulos
31 
32 Contents: implementation of CSequence class
33 
34 ******************************************************************************/
35 
36 #include <ncbi_pch.hpp>
37 #include <objmgr/seq_vector.hpp>
39 #include <algo/cobalt/seq.hpp>
40 
41 /// @file seq.cpp
42 /// Implementation of CSequence class
43 
45 BEGIN_SCOPE(cobalt)
46 
47 unsigned char
49 {
50  _ASSERT(pos >= 0 && pos < GetLength());
51 
52  int val;
53  switch (m_Sequence[pos]) {
54  case 0: val = '-'; break;
55  case 1: val = 'A'; break;
56  case 2: val = 'B'; break;
57  case 3: val = 'C'; break;
58  case 4: val = 'D'; break;
59  case 5: val = 'E'; break;
60  case 6: val = 'F'; break;
61  case 7: val = 'G'; break;
62  case 8: val = 'H'; break;
63  case 9: val = 'I'; break;
64  case 10: val = 'K'; break;
65  case 11: val = 'L'; break;
66  case 12: val = 'M'; break;
67  case 13: val = 'N'; break;
68  case 14: val = 'P'; break;
69  case 15: val = 'Q'; break;
70  case 16: val = 'R'; break;
71  case 17: val = 'S'; break;
72  case 18: val = 'T'; break;
73  case 19: val = 'V'; break;
74  case 20: val = 'W'; break;
75  case 21: val = 'X'; break;
76  case 22: val = 'Y'; break;
77  case 23: val = 'Z'; break;
78  case 24: val = 'U'; break;
79  case 25: val = '*'; break;
80  default: val = '?'; break;
81  }
82 
83  return val;
84 }
85 
86 void
87 CSequence::Reset(const objects::CSeq_loc& seq_in, objects::CScope& scope)
88 {
89  if (!seq_in.IsWhole() && !seq_in.IsInt()) {
90  NCBI_THROW(CMultiAlignerException, eInvalidInput,
91  "Unsupported SeqLoc encountered");
92  }
93 
94  objects::CSeqVector sv(seq_in, scope);
95 
96  if (!sv.IsProtein()) {
97  NCBI_THROW(CMultiAlignerException, eInvalidInput,
98  "Nucleotide sequences cannot be aligned");
99  }
100 
101  // make a local copy of the sequence data
102 
103  int seq_length = sv.size();
104  m_Sequence.resize(seq_length);
105  for (int i = 0; i < seq_length; i++) {
106  m_Sequence[i] = sv[i];
107  }
108 
109  // residue frequencies start off empty
110 
111  m_Freqs.Resize(seq_length, kAlphabetSize);
112  m_Freqs.Set(0.0);
113 }
114 
115 
116 void
117 CSequence::Reset(int length)
118 {
119  m_Sequence.resize(length);
120  for (int i=0;i < length;i++) {
121  m_Sequence[i] = kGapChar;
122  }
123 }
124 
125 
126 CSequence::CSequence(const objects::CSeq_loc& sl, objects::CScope& scope)
127 {
128  Reset(sl, scope);
129 }
130 
131 
132 void
135 {
136  int new_size = transcript.size();
137 
138  // no gaps means nothing needs updating
139 
140  if (new_size == GetLength()) {
141  return;
142  }
143 
144  vector<unsigned char> new_seq(new_size);
145  TFreqMatrix new_freq(new_size, kAlphabetSize, 0.0);
146 
147  // expand the sequence data and the profile columns
148  // to incorporate new gaps
149 
150  for (int i = 0, j = 0; i < new_size; i++) {
151  if (transcript[i] == gap_choice) {
152  new_seq[i] = kGapChar;
153  }
154  else {
155  new_seq[i] = m_Sequence[j];
156  for (int k = 0; k < kAlphabetSize; k++)
157  new_freq(i, k) = m_Freqs(j, k);
158  _ASSERT(j < GetLength());
159  j++;
160  }
161  }
162 
163  // replace class data
164 
165  m_Sequence.swap(new_seq);
166  m_Freqs.Swap(new_freq);
167 }
168 
169 void
170 CSequence::InsertGaps(const vector<Uint4>& gap_locations, bool consider_gaps)
171 {
172  Uint4 new_size = (Uint4)(GetLength() + gap_locations.size());
173  bool is_freqs_set = m_Freqs.GetRows() > 0;
174 
175  // no gaps means nothing needs updating
176 
177  if (new_size == (Uint4)GetLength()) {
178  return;
179  }
180 
181  vector<unsigned char> new_seq(new_size);
182  TFreqMatrix new_freq;
183  if (is_freqs_set) {
184  new_freq.Resize(new_size, kAlphabetSize, 0.0);
185  }
186 
187  // expand the sequence data and the profile columns
188  // to incorporate new gaps
189 
190  Uint4 location = 0, gap_ind = 0;
191  for (size_t i = 0, j = 0; i < new_size; i++) {
192  if (gap_ind < gap_locations.size()
193  && location == gap_locations[gap_ind]) {
194  new_seq[i] = kGapChar;
195  gap_ind++;
196 
197  if (consider_gaps) {
198  location++;
199  }
200  }
201  else if (j < m_Sequence.size()) {
202  new_seq[i] = m_Sequence[j];
203  if (is_freqs_set) {
204  for (int k = 0; k < kAlphabetSize; k++)
205  new_freq(i, k) = m_Freqs(j, k);
206  }
207 
208  if (m_Sequence[j] != kGapChar || consider_gaps) {
209  location++;
210  }
211 
212  j++;
213  }
214  else {
215  // Gaps at the end of the sequence
216  new_seq[i] = kGapChar;
217  gap_ind++;
218  }
219 
220  }
221  _ASSERT(gap_ind == gap_locations.size());
222 
223  // replace class data
224 
225  m_Sequence.swap(new_seq);
226  if (is_freqs_set) {
227  m_Freqs.Swap(new_freq);
228  }
229 }
230 
231 
232 void CSequence::CompressSequences(vector<CSequence>& seq,
233  vector<int> index_list)
234 {
235  int align_length = seq[index_list[0]].GetLength();
236  int num_seqs = index_list.size();
237  int new_length = 0;
238 
239  // for each alignment column
240 
241  for (int i = 0; i < align_length; i++) {
242  int j;
243  for (j = 0; j < num_seqs; j++) {
244  if (seq[index_list[j]].m_Sequence[i] != kGapChar)
245  break;
246  }
247 
248  // if the specified list of sequences do not all
249  // have a gap character in column i, keep the column
250 
251  if (j < num_seqs) {
252  for (j = 0; j < num_seqs; j++) {
253  seq[index_list[j]].m_Sequence[new_length] =
254  seq[index_list[j]].m_Sequence[i];
255  for (int k = 0; k < kAlphabetSize; k++) {
256  seq[index_list[j]].m_Freqs(new_length, k) =
257  seq[index_list[j]].m_Freqs(i, k);
258  }
259  }
260  new_length++;
261  }
262  }
263 
264  // if the length changed, shorten m_Sequence and m_Freqs
265 
266  if (new_length != align_length) {
267  for (int i = 0; i < num_seqs; i++) {
268  seq[index_list[i]].m_Sequence.resize(new_length);
269  seq[index_list[i]].m_Freqs.Resize(new_length, kAlphabetSize);
270  }
271  }
272 }
273 
274 void CSequence::CreateMsa(const objects::CSeq_align& seq_align,
275  objects::CScope& scope,
276  vector<CSequence>& msa)
277 {
278  const objects::CDense_seg& denseg = seq_align.GetSegs().GetDenseg();
279  const objects::CDense_seg::TStarts& starts = denseg.GetStarts();
280  const objects::CDense_seg::TLens& lens = denseg.GetLens();
281 
282  int num_seqs = denseg.GetDim();
283  int seq_length = 0;
284  ITERATE (objects::CDense_seg::TLens, it, lens) {
285  seq_length += *it;
286  }
287 
288  // reserve memory for MSA
289  msa.resize(num_seqs);
290  NON_CONST_ITERATE (vector<CSequence>, it, msa) {
291  it->m_Sequence.resize(seq_length);
292  it->m_Freqs.Resize(seq_length, kAlphabetSize);
293  it->m_Freqs.Set(0.0);
294  }
295 
296  // get sequences
297  vector< CRef<objects::CSeqVector> > seq_vectors;
298  seq_vectors.reserve(num_seqs);
299  ITERATE (objects::CDense_seg::TIds, it, denseg.GetIds()) {
300  seq_vectors.push_back(CRef<objects::CSeqVector>(
301  new objects::CSeqVector(
302  scope.GetBioseqHandle(**it))));
303  }
304 
305  // convert Seq_align to strings of residues and gaps
306  // start column in MSA
307  int from = 0;
308  size_t seg_index = 0;
309 
310  // for each alignment segment
311  while (seg_index < lens.size()) {
312 
313  TSeqPos seg_len = lens[seg_index];
314 
315  _ASSERT(from + seg_len - 1 < (int)seq_length);
316 
317  // for each sequence start position
318  for (int i=0;i < num_seqs;i++) {
319 
320  // if gap in sequence then put gaps
321  if (starts[seg_index * num_seqs + i] < 0) {
322  for (TSeqPos k=0;k < seg_len;k++) {
323  msa[i].m_Sequence[from + k] = kGapChar;
324  }
325  }
326  else {
327 
328  // else copy residues from seq_vector
329  for (TSeqPos k=0;k < seg_len;k++) {
330  msa[i].m_Sequence[from + k] =
331  (*seq_vectors[i])[starts[seg_index * num_seqs + i] + k];
332  }
333  }
334  }
335 
336  // move to the next segment
337  from += seg_len;
338  seg_index++;
339  }
340 
341 }
342 
343 END_SCOPE(cobalt)
static const int kAlphabetSize
The aligner internally works only with the ncbistdaa alphabet.
Definition: base.hpp:119
void Resize(size_t i, size_t j, T val=T())
resize this matrix, filling the empty cells with a known value
Definition: matrix.hpp:390
void Swap(CNcbiMatrix< T > &M)
swap two matrices efficiently
Definition: matrix.hpp:501
void Set(T val)
set all values in the matrix to a known value
Definition: matrix.hpp:417
size_t GetRows() const
get the number of rows in this matrix
Definition: matrix.hpp:298
int GetLength() const
Get the length of the current sequence.
Definition: seq.hpp:125
static void CompressSequences(vector< CSequence > &seq, vector< int > index_list)
Given a collection of sequences, remove all sequence positions where a subset of the sequences all co...
Definition: seq.cpp:232
TFreqMatrix m_Freqs
Position-specific frequency profile corresponding to sequence.
Definition: seq.hpp:171
void PropagateGaps(const CNWAligner::TTranscript &transcript, CNWAligner::ETranscriptSymbol gap_choice)
Given an edit script, insert gaps into a sequence.
Definition: seq.cpp:133
void Reset(const objects::CSeq_loc &seq, objects::CScope &scope)
Replace the sequence represented by a CSequence object.
Definition: seq.cpp:87
CSequence()
Default constructor: build an empty sequence.
Definition: seq.hpp:67
static const unsigned char kGapChar
The ncbistdaa code for a gap.
Definition: seq.hpp:58
vector< unsigned char > m_Sequence
The sequence (ncbistdaa format)
Definition: seq.hpp:170
void InsertGaps(const vector< Uint4 > &gap_locations, bool consider_gaps=false)
Insert gaps into a sequence.
Definition: seq.cpp:170
static void CreateMsa(const objects::CSeq_align &seq_align, objects::CScope &scope, vector< CSequence > &msa)
Create a vector of CSequence objects that represents the alignment in given Seq_align.
Definition: seq.cpp:274
unsigned char GetPrintableLetter(int pos) const
Access the sequence letter at a specified position, and return an ASCII representation of that letter...
Definition: seq.cpp:48
static const char location[]
Definition: config.c:97
vector< ETranscriptSymbol > TTranscript
Definition: nw_aligner.hpp:199
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
map< TSeqRange, string > TStarts
int i
Interface for CSequence class.
#define _ASSERT
Modified on Sat Jul 13 13:37:32 2024 by modify_doxy.py rev. 669887