NCBI C++ ToolKit
aln_printer.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: aln_printer.cpp 72378 2016-05-04 14:59:01Z camacho $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's offical duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================*/
25 
26 /*****************************************************************************
27 
28 File name: aln_printer.cpp
29 
30 Author: Greg Boratyn
31 
32 Contents: Printer for standard multiple sequence alignment formats
33 
34 ******************************************************************************/
35 #include <ncbi_pch.hpp>
36 #include <objmgr/bioseq_handle.hpp>
37 #include <objmgr/util/sequence.hpp>
41 
43 USING_SCOPE(align_format);
44 
45 
// Overwrite, in place, every character of str that is not a letter or a
// digit with '_'. The length of the string is left unchanged.
static void s_ReplaceNonAlphaNum(string& str)
{
    for (string::iterator pos = str.begin();  pos != str.end();  ++pos) {
        if (isalnum(*pos)) {
            // letters and digits are kept as they are
            continue;
        }
        *pos = '_';
    }
}
55 
56 // Get sequence title from Bioseq
57 static string s_GetTitle(const CBioseq_Handle& bhandle)
58 {
59  string retval;
60  ITERATE(CSeq_descr::Tdata, it, bhandle.GetDescr().Get()) {
61  if ((*it)->IsTitle()) {
62  if (!retval.empty()) {
63  retval += " ";
64  }
65  retval += (*it)->GetTitle();
66  }
67  }
68  return retval;
69 }
70 
71 // Get sequence label for Phylip formats, returns either sequence title or
72 // Seq-id
73 static string s_GetLabel(const CBioseq_Handle& bhandle)
74 {
75  CConstRef<CSeq_id> id = bhandle.GetSeqId();
76 
77  // for local id with text content, return content
78  if (id->IsLocal() && id->GetLocal().IsStr()) {
79  string label;
80  id->GetLabel(&label, CSeq_id::eContent);
81  return label;
82  }
83 
84  // otherwise return title or Seq-id if the title is empty
85  string retval = s_GetTitle(bhandle);
86  if (retval.empty()) {
87  retval = id->AsFastaString();
88  }
89 
90  return retval;
91 }
92 
93 
95  CScope& scope,
97  /* = eNotSet */)
98  : m_AlnVec(new CAlnVec(seqalign.GetSegs().GetDenseg(), scope)),
99  m_AlignType(type),
100  m_Format(CMultiAlnPrinter::eFastaPlusGaps),
101  m_Width(60)
102 {
103  m_AlnVec->SetGapChar('-');
104  m_AlnVec->SetEndChar('-');
106 }
107 
108 
110 {
111  switch (m_Format) {
112  case eFastaPlusGaps:
113  x_PrintFastaPlusGaps(ostr);
114  break;
115 
116  case eClustal:
117  x_PrintClustal(ostr);
118  break;
119 
120  case ePhylipSequential:
122  break;
123 
124  case ePhylipInterleaved:
126  break;
127 
128  case eNexus:
129  x_PrintNexus(ostr);
130  break;
131  }
132 }
133 
134 
136 {
137  int num_seqs = m_AlnVec->GetNumRows();
138  string seq;
139  for (int i=0;i < num_seqs;i++) {
141  m_AlnVec->GetSeqId(i),
143 
144  ostr << ">";
145  CConstRef<CSeq_id> id = bhandle.GetSeqId();
146  // if Seq-id is local, then ...
147  if (id->IsLocal()) {
148  // ... for numeric ids print full Seq-id
149  if (id->GetLocal().IsId()) {
150  ostr << id->AsFastaString();
151  }
152  else {
153  // ... for text ids, print only content
154  string label;
155  id->GetLabel(&label, CSeq_id::eContent);
156  ostr << label;
157  }
158  }
159  else {
160  // for non-local Seq-ids, print all Seq-ids from Bioseq
161  const vector<CSeq_id_Handle>& ids = bhandle.GetId();
162  ITERATE (vector<CSeq_id_Handle>, it, ids) {
163  ostr << it->GetSeqId()->AsFastaString();
164  if (it + 1 != ids.end()) {
165  ostr << "|";
166  }
167  }
168  }
169  string title = s_GetTitle(bhandle);
170  if (!title.empty()) {
171  ostr << " " << title;
172  }
173  ostr << NcbiEndl;
174 
176 
177  for (int j=0;j < (int)seq.length();j++) {
178  if (j % m_Width == 0 && j != 0) {
179  ostr << NcbiEndl;
180  }
181  ostr << seq[j];
182  }
183  ostr << NcbiEndl;
184  }
185 }
186 
187 
189 {
190  CAlnVecPrinter printer(*m_AlnVec, ostr);
191  printer.ClustalStyle(m_Width);
192 }
193 
194 
196 {
197  int num_sequences = m_AlnVec->GetNumRows();
198  // sequence title must be up to 10 characters long
199  const unsigned int kSeqTitleWidth = 10;
200 
201  string sequence;
202  m_AlnVec->GetWholeAlnSeqString(0, sequence);
203 
204  ostr << " " << num_sequences << " " << sequence.length() << NcbiEndl;
205 
206 
207  for (int i=0;i < num_sequences;i++) {
208 
210  m_AlnVec->GetSeqId(i),
212 
213  string seq_title = s_GetLabel(bhandle);
214  // sequence title width must be 10
215  if (seq_title.length() > kSeqTitleWidth) {
216  seq_title.erase(kSeqTitleWidth - 1, seq_title.size() - 1);
217  }
218  s_ReplaceNonAlphaNum(seq_title);
219  while (seq_title.length() < kSeqTitleWidth) {
220  seq_title += " ";
221  }
222  ostr << seq_title;
223 
224 // if i == 0 the sequence is already retrieved
225  if (i > 0) {
226  m_AlnVec->GetWholeAlnSeqString(i, sequence);
227  }
228 
229  unsigned int j = 0;
230  for (j=0;j < sequence.length() && j < m_Width - kSeqTitleWidth;j++) {
231  ostr << sequence[j];
232  }
233  for (;j < sequence.length();j++) {
234  if ((j + kSeqTitleWidth) % m_Width == 0 && j != 0) {
235  ostr << NcbiEndl;
236  }
237  ostr << sequence[j];
238  }
239  ostr << NcbiEndl;
240  }
241 }
242 
244 {
245  int num_sequences = m_AlnVec->GetNumRows();
246  int aln_width = m_AlnVec->GetAlnStop() + 1;
247  // sequence title must be up to 10 characters long
248  const unsigned int kSeqTitleWidth = 10;
249 
250 // print number of sequences and width (number of characters)
251  // of the alignment
252  ostr << " " << num_sequences << " " << aln_width << NcbiEndl;
253 
254  // print sequence title and the first portions of the sequences
255  for (int i=0;i < num_sequences;i++) {
256 
258  m_AlnVec->GetSeqId(i),
260 
261  string seq_title = s_GetLabel(bhandle);
262  // the space for sequence title must be exactly 10 characters long
263  if (seq_title.length() > kSeqTitleWidth) {
264  seq_title.erase(kSeqTitleWidth - 1, seq_title.size() - 1);
265  }
266  s_ReplaceNonAlphaNum(seq_title);
267  while (seq_title.length() < kSeqTitleWidth) {
268  seq_title += " ";
269  }
270  ostr << seq_title;
271 
272  string seq;
274  min(m_Width - (int)kSeqTitleWidth, aln_width)));
275  ostr << seq << NcbiEndl;
276  }
277  ostr << NcbiEndl;
278 
279  // print remaining portions of the sequences
280  int from = m_Width - kSeqTitleWidth;
281  while (from < (int)aln_width) {
282  int to = min(from + m_Width, aln_width);
283  for (int i=0;i < num_sequences;i++) {
284  string seq;
286  ostr << seq << NcbiEndl;
287  }
288  ostr << NcbiEndl;
289  from = to + 1;
290  }
291 }
292 
294 {
295  if (m_AlignType == eNotSet) {
296  NCBI_THROW(CException, eInvalid, "Alignment type must be set for the "
297  "Nexus format");
298  }
299 
300  int num_sequences = m_AlnVec->GetNumRows();
301  int last_pos = m_AlnVec->GetAlnStop(); /* alignment width - 1 */
302  vector<string> seqids(num_sequences);
303  int max_id_length = 0;
304  for (int i=0;i < num_sequences;i++) {
305 
306  seqids[i] = m_AlnVec->GetSeqId(i).GetSeqIdString();
307  if ((int)seqids[i].length() > max_id_length) {
308  max_id_length = seqids[i].length();
309  }
310  }
311 
312  ostr << "#NEXUS" << NcbiEndl << NcbiEndl
313  << "BEGIN DATA;" << NcbiEndl
314  << "DIMENSIONS ntax=" << num_sequences << " nchar="
315  << last_pos + 1 << ";"
316  << NcbiEndl
317  << "FORMAT datatype="
318  << (m_AlignType == eNucleotide ? "dna" : "protein")
319  << " gap=" << (char)m_AlnVec->GetGapChar(0)
320  << " interleave;"
321  << NcbiEndl
322  << "MATRIX" << NcbiEndl;
323 
324 
325  int from = 0;
326  int seqid_width = max_id_length + 2;
327  while (from < last_pos) {
328  int to = min(from + m_Width, last_pos);
329  for (int i=0;i < num_sequences;i++) {
330 
331  ostr << seqids[i];
332  int margin = seqid_width - seqids[i].length();
333  while (margin > 0) {
334  ostr << " ";
335  margin--;
336  }
337 
338  string sequence;
339  m_AlnVec->GetAlnSeqString(sequence, i,
340  CAlnMap::TSignedRange(from, to));
341  ostr << sequence << NcbiEndl;
342  }
343  ostr << NcbiEndl;
344  from = to + 1;
345  }
346  ostr << ";" << NcbiEndl << "END;" << NcbiEndl;
347 }
348 
static void s_ReplaceNonAlphaNum(string &str)
Definition: aln_printer.cpp:47
static string s_GetLabel(const CBioseq_Handle &bhandle)
Definition: aln_printer.cpp:73
static string s_GetTitle(const CBioseq_Handle &bhandle)
Definition: aln_printer.cpp:57
USING_NCBI_SCOPE
Definition: aln_printer.cpp:42
USING_SCOPE(align_format)
const CSeq_id & GetSeqId(TNumrow row) const
Definition: alnmap.hpp:645
TDim GetNumRows(void) const
Definition: alnmap.hpp:517
TSeqPos GetAlnStop(TNumseg seg) const
Definition: alnmap.hpp:488
void ClustalStyle(int scrn_width=50, EAlgorithm algorithm=eUseAlnSeqString)
TResidue GetGapChar(TNumrow row) const
Definition: alnvec.hpp:358
void SetEndChar(TResidue gap_char)
Definition: alnvec.hpp:368
void SetGapChar(TResidue gap_char)
Definition: alnvec.hpp:339
string & GetWholeAlnSeqString(TNumrow row, string &buffer, TSeqPosList *insert_aln_starts=0, TSeqPosList *insert_starts=0, TSeqPosList *insert_lens=0, unsigned int scrn_width=0, TSeqPosList *scrn_lefts=0, TSeqPosList *scrn_rights=0) const
Definition: alnvec.cpp:199
CScope & GetScope(void) const
Definition: alnvec.hpp:247
void SetAaCoding(TCoding coding)
Definition: alnvec.hpp:114
string & GetAlnSeqString(string &buffer, TNumrow row, const CAlnMap::TSignedRange &aln_rng) const
Definition: alnvec.cpp:145
CBioseq_Handle –.
Printer for popular multiple alignment formats.
Definition: aln_printer.hpp:51
CMultiAlnPrinter(const CSeq_align &seqalign, CScope &scope, EAlignType type=eNotSet)
Constructor.
Definition: aln_printer.cpp:94
void x_PrintPhylipSequential(CNcbiOstream &ostr)
Print alignment in Phylip format with sequential sequences.
void x_PrintNexus(CNcbiOstream &ostr)
Print alignment in Nexus format.
EAlignType
Alignment display type for showing nucleotide or protein-related information.
Definition: aln_printer.hpp:56
int m_Width
Selected width of the text field.
void Print(CNcbiOstream &ostr)
Print alignment.
CRef< CAlnVec > m_AlnVec
Alignment manager.
EFormat m_Format
Selected alignment format.
void x_PrintClustal(CNcbiOstream &ostr)
Print alignment in ClustalW format.
void x_PrintPhylipInterleaved(CNcbiOstream &ostr)
Print alignment in Phylip format with interleaved sequences.
EAlignType m_AlignType
Alignment type.
void x_PrintFastaPlusGaps(CNcbiOstream &ostr)
Print alignment in fasta + gaps format.
CScope –.
Definition: scope.hpp:92
static const char * str(char *buf, int n)
Definition: stats.c:84
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const string AsFastaString(void) const
Definition: Seq_id.cpp:2266
string GetSeqIdString(bool with_version=false) const
Return seqid string with optional version for text seqid type.
Definition: Seq_id.cpp:2145
@ eContent
Untagged human-readable accession or the like.
Definition: Seq_id.hpp:605
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
@ eGetBioseq_All
Search bioseq, load if not loaded yet.
Definition: scope.hpp:128
CConstRef< CSeq_id > GetSeqId(void) const
Get id which can be used to access this bioseq handle Throws an exception if none is available.
const TDescr & GetDescr(void) const
const TId & GetId(void) const
#define NcbiEndl
Definition: ncbistre.hpp:548
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
static const char label[]
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
bool IsId(void) const
Check if variant Id is selected.
Definition: Object_id_.hpp:264
const TLocal & GetLocal(void) const
Get the variant data.
Definition: Seq_id_.cpp:193
bool IsLocal(void) const
Check if variant Local is selected.
Definition: Seq_id_.hpp:775
list< CRef< CSeqdesc > > Tdata
Definition: Seq_descr_.hpp:91
const Tdata & Get(void) const
Get the member data.
Definition: Seq_descr_.hpp:166
@ e_Ncbieaa
extended ASCII 1 letter aa codes
Definition: Seq_data_.hpp:111
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
int isalnum(Uchar c)
Definition: ncbictype.hpp:62
T min(T x_, T y_)
Definition: type.c:6
Modified on Sat Jun 29 13:55:01 2024 by modify_doxy.py rev. 669887