NCBI C++ ToolKit
alnvec.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef OBJECTS_ALNMGR___ALNVEC__HPP
2 #define OBJECTS_ALNMGR___ALNVEC__HPP
3 
4 /* $Id: alnvec.hpp 80072 2017-11-02 15:21:07Z falkrb $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author: Kamen Todorov, NCBI
30  *
31  * File Description:
32  * Access to the actual aligned residues
33  *
34  */
35 
36 
38 #include <objmgr/seq_vector.hpp>
39 
41 BEGIN_objects_SCOPE
42 
43 
44 // forward declarations
45 class CScope;
46 
48 {
49  typedef CAlnMap Tparent;
52 
53 public:
55  typedef vector<int> TResidueCount;
56 
57  // constructor
58  CAlnVec(const CDense_seg& ds, CScope& scope);
59  CAlnVec(const CDense_seg& ds, TNumrow anchor, CScope& scope);
60 
61  // destructor
62  ~CAlnVec(void);
63 
64  CScope& GetScope(void) const;
65 
66 
67  // GetSeqString methods:
68 
69  // raw seq string (in seq coords)
70  string& GetSeqString (string& buffer,
71  TNumrow row,
72  TSeqPos seq_from, TSeqPos seq_to) const;
73  string& GetSeqString (string& buffer,
74  TNumrow row,
75  const CAlnMap::TRange& seq_rng) const;
76  string& GetSegSeqString(string& buffer,
77  TNumrow row,
78  TNumseg seg, TNumseg offset = 0) const;
79 
80  // alignment (seq + gaps) string (in aln coords)
81  string& GetAlnSeqString(string& buffer,
82  TNumrow row,
83  const CAlnMap::TSignedRange& aln_rng) const;
84 
85  // creates a vertical string of residues for a given aln pos
86  // NB: buffer will be resized to GetNumRows()
87  // optionally, returns a distribution of residues
88  // optionally, counts the gaps in this distribution
89  string& GetColumnVector(string& buffer,
90  TSeqPos aln_pos,
91  TResidueCount * residue_count = 0,
92  bool gaps_in_count = false) const;
93 
94  // get the seq string for the whole alignment (seq + gaps)
95  // optionally, get the inserts and screen limit coords
96  string& GetWholeAlnSeqString(TNumrow row,
97  string& buffer,
98  TSeqPosList * insert_aln_starts = 0,
99  TSeqPosList * insert_starts = 0,
100  TSeqPosList * insert_lens = 0,
101  unsigned int scrn_width = 0,
102  TSeqPosList * scrn_lefts = 0,
103  TSeqPosList * scrn_rights = 0) const;
104 
105  const CBioseq_Handle& GetBioseqHandle(TNumrow row) const;
106  TResidue GetResidue (TNumrow row, TSeqPos aln_pos) const;
107 
108  // Sequence coding. If not set (default), Iupac[na/aa] coding is used.
109  // If set to a value conflicting with the sequence type
111  TCoding GetNaCoding(void) const { return m_NaCoding; }
112  TCoding GetAaCoding(void) const { return m_AaCoding; }
113  void SetNaCoding(TCoding coding) { m_NaCoding = coding; }
114  void SetAaCoding(TCoding coding) { m_AaCoding = coding; }
115 
116  // gap character could be explicitely set otherwise taken from seqvector
117  void SetGapChar(TResidue gap_char);
118  void UnsetGapChar();
119  bool IsSetGapChar() const;
120  TResidue GetGapChar(TNumrow row) const;
121 
122  // end character is ' ' by default
123  void SetEndChar(TResidue gap_char);
124  void UnsetEndChar();
125  bool IsSetEndChar() const;
126  TResidue GetEndChar() const;
127 
128  // genetic code
129  enum EConstants {
130  kDefaultGenCode = 1
131  };
132  void SetGenCode(int gen_code,
133  TNumrow row = -1);
134  void UnsetGenCode();
135  bool IsSetGenCode() const;
136  int GetGenCode(TNumrow row) const;
137 
138  // Functions for obtaining a consensus sequence
139  // These versions add the consensus Bioseq to the scope.
140  CRef<CDense_seg> CreateConsensus(int& consensus_row) const;
141  CRef<CDense_seg> CreateConsensus(int& consensus_row,
142  const CSeq_id& consensus_id) const;
143  // This version returns the consensus Bioseq (in a parameter)
144  // without adding it to the scope.
145  CRef<CDense_seg> CreateConsensus(int& consensus_row,
146  CBioseq& consensus_seq,
147  const CSeq_id& consensus_id,
148  vector<string>* consens = NULL) const;
149 
150  // utilities
151  int CalculateScore (TNumrow row1, TNumrow row2) const;
152  int CalculatePercentIdentity(TSeqPos aln_pos) const;
153 
154  // static utilities
155  static void TranslateNAToAA(const string& na, string& aa,
156  int gen_code = kDefaultGenCode);
157  // gen_code per
158  // http://www.ncbi.nlm.nih.gov/collab/FT/#7.5.5
159 
160  static int CalculateScore (const string& s1, const string& s2,
161  bool s1_is_prot, bool s2_is_prot,
162  int gen_code1 = kDefaultGenCode,
163  int gen_code2 = kDefaultGenCode);
164 
165  // temporaries for conversion (see note below)
166  static unsigned char FromIupac(unsigned char c);
167  static unsigned char ToIupac (unsigned char c);
168 
169  static void TransposeSequences(vector<string>& segs);
170  static void CollectNucleotideFrequences(const string& col, int base_count[], int numBases);
171  static void CollectProteinFrequences(const string& col, int base_count[], int numBases);
172 
173  void RetrieveSegmentSequences(size_t segment, vector<string>& segs) const;
174 
175 protected:
176 
177  CSeqVector& x_GetSeqVector (TNumrow row) const;
179 
180  void CreateConsensus(vector<string>& consens) const;
181 
185 
186 private:
187  // Prohibit copy constructor and assignment operator
188  CAlnVec(const CAlnVec&);
190 
195  vector<int> m_GenCodes;
198 };
199 
200 
201 
203 {
204 public:
205  /// Constructor
206  CAlnVecPrinter(const CAlnVec& aln_vec,
207  CNcbiOstream& out);
208 
209 
210  /// which algorithm to choose
211  enum EAlgorithm {
212  eUseSeqString, /// memory ineficient
213  eUseAlnSeqString, /// memory efficient, recommended for large alns
214  eUseWholeAlnSeqString /// memory ineficient, but very fast
215  };
216 
217  /// Printing methods
218  void PopsetStyle (int scrn_width = 70,
219  EAlgorithm algorithm = eUseAlnSeqString);
220 
221  void ClustalStyle(int scrn_width = 50,
222  EAlgorithm algorithm = eUseAlnSeqString);
223 
224 private:
225  void x_SetChars();
226  void x_UnsetChars();
227 
229 
231 
234 
237 };
238 
239 
240 
241 /////////////////////////////////////////////////////////////////////////////
242 // IMPLEMENTATION of INLINE functions
243 /////////////////////////////////////////////////////////////////////////////
244 
245 
246 inline
248 {
249  return *m_Scope;
250 }
251 
252 
253 inline
255 {
256  if (aln_pos > GetAlnStop()) {
257  return (TResidue) 0; // out of range
258  }
259  TSegTypeFlags type = GetSegType(row, GetSeg(aln_pos));
260  if (type & fSeq) {
261  CSeqVector& seq_vec = x_GetSeqVector(row);
262  TSignedSeqPos pos = GetSeqPosFromAlnPos(row, aln_pos);
263  if (GetWidth(row) == 3) {
264  string na_buff, aa_buff;
265  if (IsPositiveStrand(row)) {
266  seq_vec.GetSeqData(pos, pos + 3, na_buff);
267  } else {
268  TSeqPos size = seq_vec.size();
269  seq_vec.GetSeqData(size - pos - 3, size - pos, na_buff);
270  }
271  TranslateNAToAA(na_buff, aa_buff, GetGenCode(row));
272  return aa_buff[0];
273  } else {
274  return seq_vec[IsPositiveStrand(row) ?
275  pos : seq_vec.size() - pos - 1];
276  }
277  } else {
278  if (type & fNoSeqOnLeft || type & fNoSeqOnRight) {
279  return GetEndChar();
280  } else {
281  return GetGapChar(row);
282  }
283  }
284 }
285 
286 
287 inline
289  TNumrow row,
290  TSeqPos seq_from, TSeqPos seq_to) const
291 {
292  if (GetWidth(row) == 3) {
293  string buff;
294  buffer.erase();
295  if (IsPositiveStrand(row)) {
296  x_GetSeqVector(row).GetSeqData(seq_from, seq_to + 1, buff);
297  } else {
298  CSeqVector& seq_vec = x_GetSeqVector(row);
299  TSeqPos size = seq_vec.size();
300  seq_vec.GetSeqData(size - seq_to - 1, size - seq_from, buff);
301  }
302  TranslateNAToAA(buff, buffer, GetGenCode(row));
303  } else {
304  if (IsPositiveStrand(row)) {
305  x_GetSeqVector(row).GetSeqData(seq_from, seq_to + 1, buffer);
306  } else {
307  CSeqVector& seq_vec = x_GetSeqVector(row);
308  TSeqPos size = seq_vec.size();
309  seq_vec.GetSeqData(size - seq_to - 1, size - seq_from, buffer);
310  }
311  }
312  return buffer;
313 }
314 
315 
316 inline
318  TNumrow row,
319  TNumseg seg, int offset) const
320 {
321  return GetSeqString(buffer, row,
322  GetStart(row, seg, offset),
323  GetStop (row, seg, offset));
324 }
325 
326 
327 inline
329  TNumrow row,
330  const CAlnMap::TRange& seq_rng) const
331 {
332  return GetSeqString(buffer, row,
333  seq_rng.GetFrom(),
334  seq_rng.GetTo());
335 }
336 
337 
338 inline
340 {
341  m_GapChar = gap_char;
342  m_set_GapChar = true;
343 }
344 
345 inline
347 {
348  m_set_GapChar = false;
349 }
350 
351 inline
353 {
354  return m_set_GapChar;
355 }
356 
357 inline
359 {
360  if (IsSetGapChar()) {
361  return m_GapChar;
362  } else {
363  return x_GetSeqVector(row).GetGapChar();
364  }
365 }
366 
367 inline
369 {
370  m_EndChar = end_char;
371  m_set_EndChar = true;
372 }
373 
374 inline
376 {
377  m_set_EndChar = false;
378 }
379 
380 inline
382 {
383  return m_set_EndChar;
384 }
385 
386 inline
388 {
389  if (IsSetEndChar()) {
390  return m_EndChar;
391  } else {
392  return ' ';
393  }
394 }
395 
396 inline
397 void CAlnVec::SetGenCode(int gen_code, TNumrow row)
398 {
399  if (row == -1) {
400  if (IsSetGenCode()) {
401  UnsetGenCode();
402  }
403  m_GenCodes.resize(GetNumRows(), gen_code);
404  } else {
405  if ( !IsSetGenCode() ) {
407  }
408  m_GenCodes[row] = gen_code;
409  }
410 }
411 
412 inline
414 {
415  m_GenCodes.clear();
416 }
417 
418 inline
420 {
421  return !m_GenCodes.empty();
422 }
423 
424 inline
426 {
427  if (IsSetGenCode()) {
428  return m_GenCodes[row];
429  } else {
430  return kDefaultGenCode;
431  }
432 }
433 
434 
435 //
436 // these are a temporary work-around
437 // CSeqportUtil contains routines for converting sequence data from one
438 // format to another, but it places a requirement on the data: it must in
439 // a CSeq_data class. I can get this for my data, but it is a bit of work
440 // (much more work than calling CSeqVector::GetSeqdata(), which, if I use the
441 // internal sequence vector, is guaranteed to be in IUPAC notation)
442 //
443 inline
444 unsigned char CAlnVec::FromIupac(unsigned char c)
445 {
446  switch (c)
447  {
448  case 'A': return 0x01;
449  case 'C': return 0x02;
450  case 'M': return 0x03;
451  case 'G': return 0x04;
452  case 'R': return 0x05;
453  case 'S': return 0x06;
454  case 'V': return 0x07;
455  case 'T': return 0x08;
456  case 'W': return 0x09;
457  case 'Y': return 0x0a;
458  case 'H': return 0x0b;
459  case 'K': return 0x0c;
460  case 'D': return 0x0d;
461  case 'B': return 0x0e;
462  case 'N': return 0x0f;
463  }
464 
465  return 0x00;
466 }
467 
468 inline unsigned char CAlnVec::ToIupac(unsigned char c)
469 {
470  const char *data = "-ACMGRSVTWYHKDBN";
471  return ((c < 16) ? data[c] : 0);
472 }
473 
474 
475 ///////////////////////////////////////////////////////////
476 ////////////////// end of inline methods //////////////////
477 ///////////////////////////////////////////////////////////
478 
479 END_objects_SCOPE // namespace ncbi::objects::
481 
482 #endif /* OBJECTS_ALNMGR___ALNVEC__HPP */
void CollectNucleotideFrequences(const vector< string > &rows, int col, int base_count[], int numBases)
void CollectProteinFrequences(const vector< string > &rows, int col, int base_count[], int numBases)
TSignedSeqPos GetStop(TNumrow row, TNumseg seg, int offset=0) const
Definition: alnmap.hpp:635
list< TSeqPos > TSeqPosList
Definition: alnmap.hpp:73
int GetWidth(TNumrow row) const
Definition: alnmap.hpp:560
TSegTypeFlags GetSegType(TNumrow row, TNumseg seg, int offset=0) const
Definition: alnmap.hpp:503
TSignedSeqPos GetStart(TNumrow row, TNumseg seg, int offset=0) const
Definition: alnmap.hpp:614
bool IsPositiveStrand(TNumrow row) const
Definition: alnmap.hpp:600
TSignedSeqPos GetSeqPosFromAlnPos(TNumrow for_row, TSeqPos aln_pos, ESearchDirection dir=eNone, bool try_reverse_dir=true) const
Definition: alnmap.cpp:663
TNumseg GetSeg(TSeqPos aln_pos) const
Definition: alnmap.cpp:373
@ fNoSeqOnRight
Definition: alnmap.hpp:57
@ fNoSeqOnLeft
Definition: alnmap.hpp:58
@ fSeq
Definition: alnmap.hpp:52
TDim GetNumRows(void) const
Definition: alnmap.hpp:517
unsigned int TSegTypeFlags
Definition: alnmap.hpp:50
CDense_seg::TNumseg TNumseg
Definition: alnmap.hpp:72
TSeqPos GetAlnStop(void) const
Definition: alnmap.hpp:495
CSeqVector::TResidue TResidue
Definition: alnvec.hpp:230
TResidue m_OrigGapChar
Definition: alnvec.hpp:233
bool m_OrigSetEndChar
Definition: alnvec.hpp:235
const CAlnVec & m_AlnVec
Definition: alnvec.hpp:228
EAlgorithm
which algorithm to choose
Definition: alnvec.hpp:211
@ eUseAlnSeqString
memory ineficient
Definition: alnvec.hpp:213
TResidue m_OrigEndChar
Definition: alnvec.hpp:236
bool m_OrigSetGapChar
Definition: alnvec.hpp:232
TResidue m_EndChar
Definition: alnvec.hpp:193
TCoding GetNaCoding(void) const
Definition: alnvec.hpp:111
TCoding GetAaCoding(void) const
Definition: alnvec.hpp:112
CSeqVector & x_GetConsensusSeqVector(void) const
CSeq_data::E_Choice TCoding
Definition: alnvec.hpp:110
bool IsSetGenCode() const
Definition: alnvec.hpp:419
bool IsSetEndChar() const
Definition: alnvec.hpp:381
TResidue GetGapChar(TNumrow row) const
Definition: alnvec.hpp:358
string & GetSeqString(string &buffer, TNumrow row, TSeqPos seq_from, TSeqPos seq_to) const
Definition: alnvec.hpp:288
void SetEndChar(TResidue gap_char)
Definition: alnvec.hpp:368
TResidue GetEndChar() const
Definition: alnvec.hpp:387
static void TranslateNAToAA(const string &na, string &aa, int gen_code=kDefaultGenCode)
Definition: alnvec.cpp:893
TResidue m_GapChar
Definition: alnvec.hpp:191
CAlnMap Tparent
Definition: alnvec.hpp:49
bool m_set_GapChar
Definition: alnvec.hpp:192
map< TNumrow, CBioseq_Handle > TBioseqHandleCache
Definition: alnvec.hpp:50
int GetGenCode(TNumrow row) const
Definition: alnvec.hpp:425
void SetGapChar(TResidue gap_char)
Definition: alnvec.hpp:339
bool IsSetGapChar() const
Definition: alnvec.hpp:352
map< TNumrow, CRef< CSeqVector > > TSeqVectorCache
Definition: alnvec.hpp:51
void SetNaCoding(TCoding coding)
Definition: alnvec.hpp:113
CAlnVec & operator=(const CAlnVec &)
EConstants
Definition: alnvec.hpp:129
@ kDefaultGenCode
Definition: alnvec.hpp:130
TCoding m_AaCoding
Definition: alnvec.hpp:197
CSeqVector & x_GetSeqVector(TNumrow row) const
Definition: alnvec.cpp:108
CScope & GetScope(void) const
Definition: alnvec.hpp:247
vector< int > m_GenCodes
Definition: alnvec.hpp:195
void SetAaCoding(TCoding coding)
Definition: alnvec.hpp:114
bool m_set_EndChar
Definition: alnvec.hpp:194
CSeqVector::TResidue TResidue
Definition: alnvec.hpp:54
void UnsetGenCode()
Definition: alnvec.hpp:413
void SetGenCode(int gen_code, TNumrow row=-1)
Definition: alnvec.hpp:397
CAlnVec(const CAlnVec &)
static unsigned char ToIupac(unsigned char c)
Definition: alnvec.hpp:468
TResidue GetResidue(TNumrow row, TSeqPos aln_pos) const
Definition: alnvec.hpp:254
TBioseqHandleCache m_BioseqHandlesCache
Definition: alnvec.hpp:183
vector< int > TResidueCount
Definition: alnvec.hpp:55
static unsigned char FromIupac(unsigned char c)
Definition: alnvec.hpp:444
TSeqVectorCache m_SeqVectorCache
Definition: alnvec.hpp:184
string & GetSegSeqString(string &buffer, TNumrow row, TNumseg seg, TNumseg offset=0) const
Definition: alnvec.hpp:317
TCoding m_NaCoding
Definition: alnvec.hpp:196
CRef< CScope > m_Scope
Definition: alnvec.hpp:182
void UnsetGapChar()
Definition: alnvec.hpp:346
void UnsetEndChar()
Definition: alnvec.hpp:375
CBioseq_Handle –.
CScope –.
Definition: scope.hpp:92
CSeqVector –.
Definition: seq_vector.hpp:65
objects::CSeqVector::TResidue TResidue
std::ofstream out("events_result.xml")
main entry point for tests
objects::CSeqVectorTypes::TResidue TResidue
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
int TSignedSeqPos
Type for signed sequence position.
Definition: ncbimisc.hpp:887
#define NULL
Definition: ncbistd.hpp:225
unsigned char TResidue
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
TSeqPos size(void) const
Definition: seq_vector.hpp:291
TResidue GetGapChar(ECaseConversion case_cvt=eCaseConversion_none) const
Return gap symbol corresponding to the selected coding.
Definition: seq_vector.hpp:318
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
#define NCBI_XALNMGR_EXPORT
Definition: ncbi_export.h:1065
TTo GetTo(void) const
Get the To member data.
Definition: Range_.hpp:269
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
E_Choice
Choice variants.
Definition: Seq_data_.hpp:102
static const char * GetResidue(TokenStatBlkPtr stoken)
Definition: indx_blk.cpp:234
const struct ncbi::grid::netcache::search::fields::SIZE size
static pcre_uint8 * buffer
Definition: pcretest.c:1051
int offset
Definition: replacements.h:160
Definition: type.c:6
CScope & GetScope()
Modified on Sun Mar 03 03:11:05 2024 by modify_doxy.py rev. 669887