NCBI C++ ToolKit
su_sequence_set.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: su_sequence_set.cpp 45159 2010-03-18 13:32:19Z thiessen $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Paul Thiessen
27 *
28 * File Description:
29 * Classes to hold sets of sequences
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbistd.hpp>
36 #include <corelib/ncbistre.hpp>
37 #include <corelib/ncbistl.hpp>
38 
39 #include <vector>
40 #include <map>
41 
47 #include <objects/seq/Seq_inst.hpp>
48 #include <objects/seq/Seq_data.hpp>
49 #include <objects/seq/NCBIeaa.hpp>
50 #include <objects/seq/IUPACaa.hpp>
52 #include <objects/seq/NCBI4na.hpp>
53 #include <objects/seq/NCBI8na.hpp>
54 #include <objects/seq/NCBI2na.hpp>
55 #include <objects/seq/IUPACna.hpp>
60 #include <objects/seq/Seqdesc.hpp>
64 
66 #include "su_private.hpp"
67 
70 
71 
72 BEGIN_SCOPE(struct_util)
73 
74 static void UnpackSeqSet(CBioseq_set& bss, SequenceSet::SequenceList& seqlist)
75 {
76  CBioseq_set::TSeq_set::iterator q, qe = bss.SetSeq_set().end();
77  for (q=bss.SetSeq_set().begin(); q!=qe; ++q) {
78  if (q->GetObject().IsSeq()) {
79 
80  // only store amino acid or nucleotide sequences
81  if (q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_aa &&
82  q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_dna &&
83  q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_rna &&
84  q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_na)
85  continue;
86 
87  CRef < Sequence > sequence(new Sequence(q->GetObject().SetSeq()));
88  seqlist.push_back(sequence);
89 
90  } else { // Bioseq-set
91  UnpackSeqSet(q->GetObject().SetSet(), seqlist);
92  }
93  }
94 }
95 
96 static void UnpackSeqEntry(CSeq_entry& seqEntry, SequenceSet::SequenceList& seqlist)
97 {
98  if (seqEntry.IsSeq()) {
99  CRef < Sequence > sequence(new Sequence(seqEntry.SetSeq()));
100  seqlist.push_back(sequence);
101  } else { // Bioseq-set
102  UnpackSeqSet(seqEntry.SetSet(), seqlist);
103  }
104 }
105 
107 {
108  SeqEntryList::iterator s, se = seqEntries.end();
109  for (s=seqEntries.begin(); s!=se; ++s)
110  UnpackSeqEntry(s->GetObject(), m_sequences);
111 
112  TRACE_MESSAGE("number of sequences: " << m_sequences.size());
113 }
114 
// Extract the high / low 4-bit code from a byte holding two packed residues.
#define FIRSTOF2(byte) (((byte) & 0xF0) >> 4)
#define SECONDOF2(byte) ((byte) & 0x0F)

// Decodes nibble-packed nucleotide data (two residues per byte, high nibble
// first) into an ASCII string.
//
//   vec   - packed input bytes
//   str   - output; its previous contents are discarded
//   isDNA - if true code 8 decodes to 'T', otherwise to 'U'
//
// Codes 1, 2, 4 and 8 decode to A, C, G and T/U; every other code (including
// 0) decodes to 'X'. If the low nibble of the final byte is zero it is taken
// to be padding, so the result has an odd number of residues. An empty input
// yields an empty string (the original dereferenced vec.back() and computed
// size()*2 - 1 on an empty vector).
static void StringFrom4na(const vector< char >& vec, string *str, bool isDNA)
{
    if (vec.empty()) {
        str->clear();
        return;
    }

    // a zero low nibble in the last byte means the final slot is unused
    const bool lastSlotUsed = (SECONDOF2(vec.back()) > 0);
    str->assign(vec.size() * 2 - (lastSlotUsed ? 0 : 1), 'X');

    const char code8 = isDNA ? 'T' : 'U';
    for (string::size_type pos = 0; pos < str->size(); ++pos) {
        const char packed = vec[pos / 2];
        const int code = (pos % 2 == 0) ? FIRSTOF2(packed) : SECONDOF2(packed);
        switch (code) {
            case 1: (*str)[pos] = 'A'; break;
            case 2: (*str)[pos] = 'C'; break;
            case 4: (*str)[pos] = 'G'; break;
            case 8: (*str)[pos] = code8; break;
            default:
                (*str)[pos] = 'X';
        }
    }
}
144 
// Extract the 1st..4th 2-bit code (most significant pair first) from a byte
// holding four packed residues.
#define FIRSTOF4(byte) (((byte) & 0xC0) >> 6)
#define SECONDOF4(byte) (((byte) & 0x30) >> 4)
#define THIRDOF4(byte) (((byte) & 0x0C) >> 2)
#define FOURTHOF4(byte) ((byte) & 0x03)

// Decodes 2-bit packed nucleotide data (four residues per byte) into an ASCII
// string of exactly vec.size() * 4 characters; codes 0, 1, 2, 3 decode to
// A, C, G and T (or U when 'isDNA' is false). Any padding residues in the last
// byte are decoded too, so the caller must trim to the real length.
static void StringFrom2na(const vector< char >& vec, string *str, bool isDNA)
{
    // each 2-bit code indexes straight into the alphabet
    const char *alphabet = isDNA ? "ACGT" : "ACGU";

    str->resize(vec.size() * 4);
    string::iterator out = str->begin();
    for (vector< char >::const_iterator packed = vec.begin(); packed != vec.end(); ++packed) {
        *out++ = alphabet[FIRSTOF4(*packed)];
        *out++ = alphabet[SECONDOF4(*packed)];
        *out++ = alphabet[THIRDOF4(*packed)];
        *out++ = alphabet[FOURTHOF4(*packed)];
    }
}
173 
// Decodes NCBIstdaa data (one residue code per byte) into an ASCII string of
// the same length, using the table below indexed by residue code.
//
//   vec - residue codes
//   str - output; resized to vec.size() and fully overwritten
//
// Codes outside the table (28 and above, or bytes with the high bit set) are
// decoded as 'X' instead of reading past the end of the table.
static void StringFromStdaa(const vector < char >& vec, string *str)
{
    static const char stdaaMap[] = "-ABCDEFGHIKLMNPQRSTVWXYZU*OJ";
    static const unsigned int stdaaCount = sizeof(stdaaMap) - 1; // exclude the terminating NUL

    str->resize(vec.size());
    for (string::size_type i = 0; i < vec.size(); ++i) {
        // go through unsigned char so that a negative char cannot become a huge index
        const unsigned int code = static_cast<unsigned char>(vec[i]);
        (*str)[i] = (code < stdaaCount) ? stdaaMap[code] : 'X';
    }
}
182 
183 Sequence::Sequence(ncbi::objects::CBioseq& bioseq) :
184  m_bioseqASN(&bioseq), m_isProtein(false)
185 {
186  // fill out description
187  if (bioseq.IsSetDescr()) {
188  string defline, taxid;
189  CSeq_descr::Tdata::const_iterator d, de = bioseq.GetDescr().Get().end();
190  for (d=bioseq.GetDescr().Get().begin(); d!=de; ++d) {
191 
192  // get "defline" from title or compound
193  if ((*d)->IsTitle()) { // prefer title over compound
194  defline = (*d)->GetTitle();
195  } else if (defline.size() == 0 && (*d)->IsPdb() && (*d)->GetPdb().GetCompound().size() > 0) {
196  defline = (*d)->GetPdb().GetCompound().front();
197  }
198 
199  // get taxonomy
200  if ((*d)->IsSource()) {
201  if ((*d)->GetSource().GetOrg().IsSetTaxname())
202  taxid = (*d)->GetSource().GetOrg().GetTaxname();
203  else if ((*d)->GetSource().GetOrg().IsSetCommon())
204  taxid = (*d)->GetSource().GetOrg().GetCommon();
205  }
206  }
207  if (taxid.size() > 0)
208  m_description = string("[") + taxid + ']';
209  if (defline.size() > 0) {
210  if (taxid.size() > 0)
211  m_description += ' ';
212  m_description += defline;
213  }
214  }
215 
216  // get sequence string
217  if (bioseq.GetInst().GetRepr() == CSeq_inst::eRepr_raw && bioseq.GetInst().IsSetSeq_data()) {
218 
219  // protein formats
220  if (bioseq.GetInst().GetSeq_data().IsNcbieaa()) {
221  m_sequenceString = bioseq.GetInst().GetSeq_data().GetNcbieaa().Get();
222  m_isProtein = true;
223  } else if (bioseq.GetInst().GetSeq_data().IsIupacaa()) {
224  m_sequenceString = bioseq.GetInst().GetSeq_data().GetIupacaa().Get();
225  m_isProtein = true;
226  } else if (bioseq.GetInst().GetSeq_data().IsNcbistdaa()) {
227  StringFromStdaa(bioseq.GetInst().GetSeq_data().GetNcbistdaa().Get(), &m_sequenceString);
228  m_isProtein = true;
229  }
230 
231  // nucleotide formats
232  else if (bioseq.GetInst().GetSeq_data().IsIupacna()) {
233  m_sequenceString = bioseq.GetInst().GetSeq_data().GetIupacna().Get();
234  // convert 'T' to 'U' for RNA
235  if (bioseq.GetInst().GetMol() == CSeq_inst::eMol_rna) {
236  for (unsigned int i=0; i<m_sequenceString.size(); ++i) {
237  if (m_sequenceString[i] == 'T')
238  m_sequenceString[i] = 'U';
239  }
240  }
241  } else if (bioseq.GetInst().GetSeq_data().IsNcbi4na()) {
242  StringFrom4na(bioseq.GetInst().GetSeq_data().GetNcbi4na().Get(), &m_sequenceString,
243  (bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna));
244  } else if (bioseq.GetInst().GetSeq_data().IsNcbi8na()) { // same repr. for non-X as 4na
245  StringFrom4na(bioseq.GetInst().GetSeq_data().GetNcbi8na().Get(), &m_sequenceString,
246  (bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna));
247  } else if (bioseq.GetInst().GetSeq_data().IsNcbi2na()) {
248  StringFrom2na(bioseq.GetInst().GetSeq_data().GetNcbi2na().Get(), &m_sequenceString,
249  (bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna));
250  if (bioseq.GetInst().IsSetLength() && bioseq.GetInst().GetLength() < m_sequenceString.length())
251  m_sequenceString.resize(bioseq.GetInst().GetLength());
252  }
253 
254  else
255  THROW_MESSAGE("Sequence::Sequence(): confused by sequence format");
256 
257  // check length
258  if (bioseq.GetInst().IsSetLength() && bioseq.GetInst().GetLength() != m_sequenceString.length())
259  THROW_MESSAGE("Sequence::Sequence() - sequence string length mismatch");
260 
261  // force uppercase
262  for (unsigned int i=0; i<m_sequenceString.size(); ++i)
263  m_sequenceString[i] = toupper((unsigned char) m_sequenceString[i]);
264 
265  } else
266  THROW_MESSAGE("Sequence::Sequence(): confused by sequence representation");
267 }
268 
270 {
271 }
272 
// Expands to a loop over the Seq-ids of m_bioseqASN that returns (by
// dereferencing the stored id twice) the first id for which the member
// predicate named by 'is' -- called as (*i)->is() -- is true. If no id
// matches, control simply falls through to the statement after the macro.
// The expansion relies on the enclosing function having already declared the
// iterators 'i' and 'ie', with 'ie' equal to m_bioseqASN->GetId().end().
// It supplies no trailing semicolon; the invocation must add one.
273 #define RETURN_FIRST_SEQID_THAT_(is) \
274  for (i=m_bioseqASN->GetId().begin(); i!=ie; ++i) \
275  if ((*i)->is()) \
276  return **i
277 
279 {
280  CBioseq::TId::const_iterator i, ie = m_bioseqASN->GetId().end();
281 
282  // try to find one of these first
285 
286  // otherwise, just use the first one
287  return m_bioseqASN->GetId().front().GetObject();
288 }
289 
290 bool Sequence::MatchesSeqId(const CSeq_id& seqID) const
291 {
292  CBioseq::TId::const_iterator i, ie = m_bioseqASN->GetId().end();
293  for (i=m_bioseqASN->GetId().begin(); i!=ie; ++i) {
294  if (seqID.Match(**i))
295  return true;
296  }
297  return false;
298 }
299 
300 string Sequence::IdentifierString(void) const
301 {
303 }
304 
305 END_SCOPE(struct_util)
#define static
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
#define false
Definition: bool.h:36
CRef –.
Definition: ncbiobj.hpp:618
Definition: Seq_entry.hpp:56
std::list< ncbi::CRef< ncbi::objects::CSeq_entry > > SeqEntryList
std::list< ncbi::CRef< Sequence > > SequenceList
SequenceList m_sequences
SequenceSet(SeqEntryList &seqEntries)
void UnpackSeqEntry(const objects::CSeq_entry &seqEntry)
Definition: cav_seqset.cpp:99
std::string m_sequenceString
std::string m_description
const ncbi::objects::CSeq_id & GetPreferredIdentifier(void) const
Sequence(ncbi::objects::CBioseq &bioseq)
ncbi::CRef< ncbi::objects::CBioseq > m_bioseqASN
bool MatchesSeqId(const ncbi::objects::CSeq_id &seqID) const
std::string IdentifierString(void) const
Include a standard set of the NCBI C++ Toolkit most basic headers.
string
Definition: cgiapp.hpp:687
bool Match(const CSeq_id &sid2) const
Match() - TRUE if SeqIds are equivalent.
Definition: Seq_id.hpp:1033
static string GetStringDescr(const CBioseq &bioseq, EStringFormat fmt)
Definition: Seq_id.cpp:2291
@ eFormat_FastA
Definition: Seq_id.hpp:630
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
TSeq & SetSeq(void)
Select the variant.
Definition: Seq_entry_.cpp:108
@ eRepr_raw
continuous sequence
Definition: Seq_inst_.hpp:94
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
int toupper(Uchar c)
Definition: ncbictype.hpp:73
The NCBI C++/STL use hints.
NCBI C++ stream class wrappers for triggering between "new" and "old" C++ stream libraries.
static const char * str(char *buf, int n)
Definition: stats.c:84
#define TRACE_MESSAGE(s)
#define THROW_MESSAGE(str)
Definition: su_private.hpp:48
static void StringFromStdaa(const vector< char > &vec, string *str)
USING_SCOPE(objects)
static void StringFrom4na(const vector< char > &vec, string *str, bool isDNA)
static void UnpackSeqSet(CBioseq_set &bss, SequenceSet::SequenceList &seqlist)
#define RETURN_FIRST_SEQID_THAT_(is)
#define FOURTHOF4(byte)
#define SECONDOF2(byte)
#define SECONDOF4(byte)
#define THIRDOF4(byte)
#define FIRSTOF4(byte)
static void StringFrom2na(const vector< char > &vec, string *str, bool isDNA)
USING_NCBI_SCOPE
#define FIRSTOF2(byte)
static void UnpackSeqEntry(CSeq_entry &seqEntry, SequenceSet::SequenceList &seqlist)
Modified on Sat Dec 02 09:20:31 2023 by modify_doxy.py rev. 669887