NCBI C++ ToolKit
cuReadFastaWrapper.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: cuReadFastaWrapper.cpp 40765 2009-01-15 19:18:36Z lanczyck $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Chris Lanczycki
27  *
28  * File Description:
29  * A general interface to using various possible mechanisms
30  * for reading MFasta-formatted input, with concrete implementations.
31  *
32  */
33 
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbistd.hpp>
36 #include <corelib/ncbistl.hpp>
37 #include <corelib/ncbistre.hpp>
38 #include <serial/serial.hpp>
39 
40 #include <serial/objistrasn.hpp>
41 #include <serial/objostrasn.hpp>
42 #include <serial/objistr.hpp>
43 #include <serial/objostr.hpp>
44 
47 #include <objects/seq/Bioseq.hpp>
52 
54 #include <algorithm>
55 
58 BEGIN_SCOPE(cd_utils)
59 
60 
61 const char CFastaIOWrapper::gt = '>';
62 const char CFastaIOWrapper::nl = '\n';
63 
// NOTE(review): the line holding this function's signature is absent from
// this listing (the embedded numbering jumps from 63 to 65); only the body
// is visible here.
//
// Reads the stream through ReadFile() and, only when that call reports
// success, copies the m_seqEntry member into the caller-supplied seqEntry
// with Assign(). The value returned is exactly what ReadFile() returned.
// seqEntry is dereferenced without a check, so it presumably must already
// point at an object -- confirm against callers.
65 {
66  bool result = ReadFile(iStream);
67  if (result) {
// Copy into the caller's object; seqEntry itself is not re-pointed.
68  seqEntry->Assign(*m_seqEntry);
69  }
70  return result;
71 
72 }
73 
// NOTE(review): the line holding this function's signature is absent from
// this listing (the embedded numbering jumps from 73 to 75); only the body
// is visible here.
//
// Parses FASTA data from iStream into the m_seqEntry member using
// CFastaReader::ReadSet() with the flags held in m_readFastaFlags.
// Returns false, with a message stored in m_error, when the stream is not
// good() on entry or when no Seq-entry is available after parsing (which
// includes the case where the reader threw); returns true otherwise.
75 {
76  bool result = (iStream.good());
77 
78  if (!result) {
79  m_error = "Read Error: invalid stream.\n";
80  } else {
81 
// Copy the stream's entire buffer into 'oss', then seek back to offset 0
// so the reader below consumes the same data.
// NOTE(review): listing lines 86-87 are missing here; whatever consumes
// 'oss' is presumably on those lines -- confirm against the real file.
82  CNcbiOstrstream oss;
83  oss << iStream.rdbuf();
84  iStream.seekg(0);
85 
88 
89  // temporarily turn off warning messages (in case of '.' in *.a2m files)
// The value returned by SetDiagPostLevel() is kept and handed back to
// SetDiagPostLevel() once parsing is finished (see the end of this branch).
90  EDiagSev originalDiagSev = SetDiagPostLevel(eDiag_Error);
91 
92  try{
93  CStreamLineReader lineReader(iStream);
94  CFastaReader fastaReader(lineReader, m_readFastaFlags);
95  //CCounterManager counterMgr(reader.SetIDGenerator(), NULL);
96  m_seqEntry = fastaReader.ReadSet();
97 
98  // If there is only one sequence in the fasta, the Seq-entry returned is a Bioseq and not a Bioseq-set.
99  // In that case, change the Bioseq to a Bioseq-set so caller doesn't have to manage multiple Seq-entry choices.
100  if (m_seqEntry->IsSeq() && m_useBioseqSet) {
// Take a deep copy of the single-Bioseq entry before m_seqEntry is
// switched to the 'set' variant, so it can be appended as the only member.
101  CRef<CSeq_entry> bioseqFromFasta(new CSeq_entry);
102  bioseqFromFasta->Assign(*m_seqEntry);
103 
// NOTE(review): listing line 104 is missing here -- confirm against the
// real file.
105  m_seqEntry->SetSet().SetSeq_set().push_back(bioseqFromFasta);
106  }
107 
108  } catch (...) {
// Any exception from the reader or the conversion above is treated as a
// failed read; the exception itself is discarded and the generic message
// is assigned by the Empty() check that follows.
109  result = false;
110  m_seqEntry.Reset();
111  }
112 
113  if (m_seqEntry.Empty()) {
114  result = false;
115  m_error = "Read Error: empty seq entry.\n";
116  }
// Restore the diagnostic level saved before parsing.
117  SetDiagPostLevel(originalDiagSev);
118 
119  }
120  return result;
121 }
122 
123 
124 unsigned int CFastaIOWrapper::GetNumRead() const
125 {
126  unsigned int n = 0;
127  if (m_seqEntry.NotEmpty()) {
128  if (m_seqEntry->IsSet()) {
129  n = m_seqEntry->GetSet().GetSeq_set().size();
130  } else {
131  n = 1; // other option is type 'seq', which is a single bioseq
132  }
133  }
134  return n;
135 
136 /*
137 #ifdef NCBI_COMPILER_WORKSHOP
138  unsigned int n = 0;
139  count(m_activeFastaString.begin(), m_activeFastaString.end(), gt, n);
140  return n;
141 #else
142  cerr << "m_activeFastaString: " << m_activeFastaString << endl;
143  vector<string> tokenizedString;
144  string delim(&gt);
145  NStr::TokenizePattern(m_activeFastaString, delim, tokenizedString);
146  cerr << "tokenizer: #strings = " << tokenizedString.size() << endl;
147 
148  unsigned int counter = 0;
149  string::size_type pos = 0;
150  while (pos = m_activeFastaString.find_first_of(delim, pos) != string::npos) {
151  ++counter;
152  ++pos;
153  }
154  cerr << "while loop: #delims = " << counter << endl;
155  return counter;
156 
157 // return count(m_activeFastaString.begin(), m_activeFastaString.end(), gt);
158 #endif
159 */
160 }
161 
162 string CFastaIOWrapper::GetSubstring(const string& s, unsigned int index, bool isDefline) const
163 {
164  string result = "";
165  int nFound = -1;
166  SIZE_TYPE pos = 0, nextPos = 0;
167  while (nextPos != NPOS && nFound < (int) index) {
168  nextPos = s.find(gt, pos);
169  if (nextPos != NPOS) {
170  ++nFound;
171  ++nextPos; // advance to next character in the string
172  pos = nextPos;
173  }
174 // cout << "nFound = " << nFound << "; pos = " << pos << "; nextPos = " << nextPos << endl;
175  }
176  if (pos > 0) --pos;
177 
178  if (pos != NPOS && nFound == (int) index) {
179  nextPos = s.find(nl, pos);
180  if (nextPos != NPOS) {
181  if (isDefline) {
182  nextPos = nextPos - pos; // no +1 as I don't care about the new line
183  result = s.substr(pos, nextPos);
184  } else {
185  pos = nextPos + 1; // skip the newline
186  nextPos = s.find(gt, pos);
187  if (nextPos != NPOS) {
188  nextPos = nextPos - pos - 1; // -1 as I don't care about the last new line itself
189  }
190  result = s.substr(pos, nextPos);
191  }
192  }
193  }
194  return result;
195 }
196 
197 string CFastaIOWrapper::GetActiveDefline(unsigned int index) const
198 {
199  return GetSubstring(m_activeFastaString, index, true);
200 }
201 
// Returns a copy of s from which every character classified as whitespace by
// isspace() has been removed; the relative order of the remaining characters
// is unchanged.
string RemoveWhitespace_CJL(const string& s) {
    string newString;
    newString.reserve(s.size());    // the result can never be longer than s

    // string::size_type rather than unsigned int, so the length of a very
    // large string is not truncated.
    for (string::size_type i = 0, len = s.size(); i < len; ++i) {
        // Classify via unsigned char: a plain char may be negative, which the
        // C library's isspace() does not accept.
        const unsigned char ch = static_cast<unsigned char>(s[i]);
        if (!isspace(ch)) {
            newString += s[i];
        }
    }
    return newString;
}
212 
213 string CFastaIOWrapper::GetActiveSequence(unsigned int index, bool removeWhitespace) const
214 {
215  string s = GetSubstring(m_activeFastaString, index, false);
216  if (removeWhitespace) {
217  s = RemoveWhitespace_CJL(s);
218  }
219  return s;
220 }
221 
222 string CFastaIOWrapper::GetRawDefline(unsigned int index) const
223 {
224  if (!m_cacheRawFasta)
225  return "";
226  else
227  return GetSubstring(m_rawFastaString, index, true);
228 }
229 
230 string CFastaIOWrapper::GetRawSequence(unsigned int index, bool removeWhitespace) const
231 {
232  string s = "";
233  if (m_cacheRawFasta) {
234  s = GetSubstring(m_rawFastaString, index, false);
235  if (removeWhitespace) {
236  s = RemoveWhitespace_CJL(s);
237  }
238  }
239  return s;
240 }
241 
242 
243 END_SCOPE(cd_utils)
User-defined methods of the data storage class.
TReadFastaFlags m_readFastaFlags
virtual bool ReadFile(CNcbiIstream &iStream)
bool ReadAsSeqEntry(CNcbiIstream &iStream, CRef< CSeq_entry > &seqEntry)
static const char nl
virtual unsigned int GetNumRead() const
virtual string GetActiveSequence(unsigned int index, bool removeWhitespace=true) const
virtual string GetActiveDefline(unsigned int index) const
static const char gt
string GetSubstring(const string &s, unsigned int index, bool isDefline) const
virtual string GetRawDefline(unsigned int index) const
virtual string GetRawSequence(unsigned int index, bool removeWhitespace=true) const
CRef< CSeq_entry > m_seqEntry
Base class for reading FASTA sequences.
Definition: fasta.hpp:80
The CNcbiOstrstreamToString class helps convert a CNcbiOstrstream to a string. Sample usage:
Definition: ncbistre.hpp:802
Definition: Seq_entry.hpp:56
Simple implementation of ILineReader for i(o)streams.
Include a standard set of the NCBI C++ Toolkit most basic headers.
USING_SCOPE(objects)
string RemoveWhitespace_CJL(const string &s)
EDiagSev SetDiagPostLevel(EDiagSev post_sev=eDiag_Error)
Set the threshold severity for posting the messages.
Definition: ncbidiag.cpp:6129
EDiagSev
Severity level for the posted diagnostics.
Definition: ncbidiag.hpp:650
@ eDiag_Error
Error message.
Definition: ncbidiag.hpp:653
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
CRef< CSeq_entry > ReadSet(int max_seqs=kMax_Int, ILineErrorListener *pMessageListener=nullptr)
Read multiple sequences (by default, as many as are available.)
Definition: fasta.cpp:442
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
#define NPOS
Definition: ncbistr.hpp:133
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
const TSet & GetSet(void) const
Get the variant data.
Definition: Seq_entry_.cpp:124
bool IsSeq(void) const
Check if variant Seq is selected.
Definition: Seq_entry_.hpp:257
bool IsSet(void) const
Check if variant Set is selected.
Definition: Seq_entry_.hpp:263
const TSeq_set & GetSeq_set(void) const
Get the Seq_set member data.
void Select(E_Choice index, EResetVariant reset=eDoResetVariant)
Select the requested variant if needed.
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
int i
yy_size_t n
int len
int isspace(Uchar c)
Definition: ncbictype.hpp:69
The NCBI C++/STL use hints.
NCBI C++ stream class wrappers for triggering between "new" and "old" C++ stream libraries.
else result
Definition: token2.c:20
Modified on Sat Dec 09 04:48:30 2023 by modify_doxy.py rev. 669887