NCBI C++ ToolKit
aln_reader.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef OBJTOOLS_READERS___ALN_READER__HPP
2 #define OBJTOOLS_READERS___ALN_READER__HPP
3 
4 /* $Id: aln_reader.hpp 100576 2023-08-11 17:49:41Z gotvyans $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Authors: Josh Cherry
30  *
31  * File Description: C++ wrappers for alignment file reading
32  *
33  */
34 
35 #include <corelib/ncbistd.hpp>
39 #include <objects/seq/Seq_inst.hpp>
40 #include <objects/seq/Bioseq.hpp>
46 
48 
50 
51 namespace objects {
52  class CSeq_id;
53 }
54 
56 {
57 public:
58 
59  // error categories
60  typedef enum {
66  eAlnErr_BadChar
68 
69  // constructor
70  CAlnError(int category, int line_num, string id, string message);
71 
72  /// Copy constructor.
73  CAlnError(const CAlnError& e);
74 
75  // destructor
77 
78  // accessors
79  EAlnErr GetCategory() const { return m_Category; }
80  int GetLineNum() const { return m_LineNum; }
81  const string& GetID() const { return m_ID; }
82  const string& GetMsg() const { return m_Message; }
83 
84  /// @deprecated Use GetMsg() instead - to avoid conflict with Windows macro
86  const string& GetMessage() const { return m_Message; }
87 
88 private:
90  int m_LineNum;
91  string m_ID;
92  string m_Message;
93 };
94 
95 ///
96 /// class CAlnReader supports importing a large variety of text-based
97 /// alignment formats into standard data structures.
98 ///
100 {
101 public:
102  // alphabets to try
103  enum EAlphabet {
111  };
112 
113  // This class is deprecated
115  {
116  private:
117  list<CAlnError> errors;
118  public:
119  size_t GetErrorCount(CAlnError::EAlnErr /*category*/) const
120  {
121  return 0;
122  }
123 
124  void clear(void) {
125  }
126 
127  void push_back(const CAlnError& /*error*/) {
128  }
129 
130  size_t size(void) const {
131  return 0;
132  }
133 
134  typedef list<CAlnError> TErrors;
135  typedef TErrors::const_iterator const_iterator;
136  const_iterator begin(void) const { return errors.begin(); }
137  const_iterator end(void) const { return errors.end(); }
138  };
139 
140 
141  // error messages
143 
144  static string
145  GetAlphabetLetters(
146  EAlphabet);
147 
149  using FValidateIds =
150  function<void(const list<CRef<objects::CSeq_id>>&,
151  int,
152  objects::CAlnErrorReporter*)>;
153 
154  using FIdValidate =
155  function<void(const objects::CSeq_id&,
156  int,
157  objects::CAlnErrorReporter*)>;
158 
159 
160  // constructor
161  // defaults to protein alphabet and A2M gap characters
162  CAlnReader(CNcbiIstream& is, FValidateIds fIdValidate=nullptr);
163 
164  CAlnReader(CNcbiIstream& is, FIdValidate fSingleIdValidate);
165 
166  // destructor
167  virtual ~CAlnReader(void);
168 
169 
170  const string& GetAlphabet(void) const;
171  void SetAlphabet(const string& value);
172  void SetAlphabet(EAlphabet alpha);
173  bool IsAlphabet(EAlphabet alpha) const;
174 
175  const string& GetBeginningGap(void) const;
176  void SetBeginningGap(const string& value);
177 
178  const string& GetMiddleGap(void) const;
179  void SetMiddleGap(const string& value);
180 
181  const string& GetEndGap(void) const;
182  void SetEndGap(const string& value);
183 
184  bool GetUseNexusInfo() const {return m_UseNexusInfo; };
185  void SetUseNexusInfo(bool useNexusInfo) { m_UseNexusInfo = useNexusInfo; };
186 
187  /// Convenience function for setting beginning, middle, and
188  /// end gap to the same thing
189  void SetAllGap(const string& value);
190 
191  const string& GetMissing(void) const {return mSequenceInfo.Missing();};
192  void SetMissing(const string& value) {mSequenceInfo.SetMissing(value);};
193 
194  const string& GetMatch(void) const {return mSequenceInfo.Match();};
195  void SetMatch(const string& value) {mSequenceInfo.SetMatch(value);};
196 
197 
198  /// Alternative & easy way to choose alphabet, etc.
199  void SetFastaGap(EAlphabet alpha);
200  void SetClustal (EAlphabet alpha);
201  void SetPhylip (EAlphabet alpha);
202  void SetPaup (EAlphabet alpha);
203 
204 
205  /// Read the file
206  /// These are the main functions.
207  /// Either one parses the alignment file and creates the result data.
208  enum EReadFlags {
209  fReadDefaults = 0,
210  fGenerateLocalIDs = 1
211  };
212  typedef int TReadFlags; ///< binary OR of EReadFlags
213 
215  void Read(
216  bool guess,
217  bool generate_local_ids=false,
218  objects::ILineErrorListener* pErrorListener=nullptr);
219 
220  void Read(
221  TReadFlags = fReadDefaults,
222  objects::ILineErrorListener* pErrorListener=nullptr);
223 
224  /// Parsed result data accessors
225  const vector<string>& GetIds(void) const {return m_IdStrings;};
226  const vector<string>& GetSeqs(void) const {return m_Seqs;};
227  NCBI_DEPRECATED const vector<string>& GetOrganisms(void) const {return m_Organisms;};
228  const vector<string>& GetDeflines(void) const {return m_Deflines;};
229  const vector<TLineInfo>& GetDeflineInfo(void) const { return m_DeflineInfo; };
230  int GetDim(void) const {return m_Dim;};
231  EAlignFormat GetLastAlignmentFileFormat(void) const;
232 
234  const TErrorList& GetErrorList(void) const {return m_Errors;};
235 
236  using TFastaFlags = objects::CFastaDeflineReader::TFastaFlags;
237  /// Create ASN.1 classes from the parsed alignment
238  CRef<objects::CSeq_align> GetSeqAlign(TFastaFlags fasta_flags=0,
239  objects::ILineErrorListener* pErrorListener=nullptr);
240  CRef<objects::CSeq_entry> GetSeqEntry(TFastaFlags fasta_flags=objects::CFastaReader::fAddMods,
241  objects::ILineErrorListener* pErrorListener=nullptr);
242 
243  /// Get a sequence's moltype, also considering the alphabet used to read it
244  objects::CSeq_inst::EMol GetSequenceMolType(
245  const string& alphabet,
246  const string& seqData,
247  objects::ILineErrorListener* pErrorListener=nullptr
248  );
249 
250 private:
251  /// Prohibit copy constructor and assignment operator
254 
255  int x_GetGCD(const int a, const int b) const;
256 
257  bool x_IsReplicatedSequence(const char* sequence_data,
258  int sequence_length,
259  int repeat_interval) const;
260 
261  void x_VerifyAlignmentInfo(
263  TReadFlags readFlags);
264 
265  CRef<objects::CSeq_inst> x_GetSeqInst(objects::CSeq_inst::EMol mol,
266  const string& seqData) const;
267 
268  objects::CSeq_inst::EMol x_GetSequenceMolType(
269  const string& alphabet,
270  const string& seqData,
271  const string& seqId="", // Used in error message
272  objects::ILineErrorListener* pErrorListener=nullptr);
273 
274  ncbi::objects::CSequenceInfo mSequenceInfo;
275 
276 
277  /// Parsed result data (analogous to SAlignmentFile)
278  /// Seqs are upper-case strings representing the sequences, with
279  /// '-' for a gap. Ids are ids read from file. Organisms and
280  /// Deflines may not be set, depending on the file.
281 
282  using TIdList = list<CRef<objects::CSeq_id>>;
283  vector<string> m_IdStrings;
284  vector<TIdList> m_Ids;
285  vector<string> m_Seqs;
286  vector<string> m_Organisms; // redundant
287  vector<string> m_Deflines; // redundant
288  vector<TLineInfo> m_DeflineInfo;
289  FValidateIds m_fValidateIds=nullptr;
291 
292 
293  /// Other internal data
297  int m_Dim;
300  vector<string> m_SeqVec;
301  vector<TSeqPos> m_SeqLen;
304 
305  /// characters have different contexts, depending on
306  /// whether they are before the first non-gap character,
307  /// after the last non-gap character, or between the
308  /// first and last non-gap character. This must be
309  /// precalculated before gap characters can be converted.
310  typedef pair<TSeqPos, TSeqPos> TAlignMiddleInterval;
311  typedef vector<TAlignMiddleInterval> TAlignMiddles;
313  void x_CalculateMiddleSections();
315  bool x_IsGap(TNumrow row, TSeqPos pos, const string& residue);
316  void x_AssignDensegIds(
317  TFastaFlags fasta_flags,
318  objects::CDense_seg& denseg);
319 
320  void x_ParseAndValidateSeqIds(
321  const TLineInfo& seqIdInfo,
323  TIdList& ids);
324 
325  void x_AddMods(const TLineInfo& defline_info,
326  objects::CBioseq& bioseq,
327  objects::ILineErrorListener* pErrorListener);
328 
329  void x_AddTitle(const string& defline, objects::CBioseq& bioseq);
330 
331 protected:
332  virtual CRef<objects::CSeq_id> GenerateID(const string& fasta_defline,
333  const TSeqPos& line_number,
334  TFastaFlags fasta_flags);
335 
336  using SLineTextAndLoc = objects::CFastaDeflineReader::SLineTextAndLoc;
337  using TSeqTitles = objects::CFastaDeflineReader::TSeqTitles;
338  using SDeflineParseInfo = objects::CFastaDeflineReader::SDeflineParseInfo;
339  using TIgnoredProblems = objects::CFastaDeflineReader::TIgnoredProblems;
340 
342  void ParseDefline(const string& defline,
343  const SDeflineParseInfo& info,
344  const TIgnoredProblems& ignoredErrors,
345  list<CRef<objects::CSeq_id>>& ids,
346  bool& hasRange,
347  TSeqPos& rangeStart,
348  TSeqPos& rangeEnd,
349  TSeqTitles& seqTitles,
350  objects::ILineErrorListener* pMessageListener);
351 
352 protected:
353  objects::CFastaIdHandler m_FastaIdHandler;
354 };
355 
356 
357 
358 ///////////////////////////////////////////////////////////////////////
359 //
360 // Inline Methods
361 //
362 
/// Accessor for the alphabet string: returns, by const reference, whatever
/// mSequenceInfo.Alphabet() yields. No state is modified (const member).
363 inline
364 const string& CAlnReader::GetAlphabet(void) const
365 {
366  return mSequenceInfo.Alphabet();
367 }
368 
369 
/// Sets the alphabet from an explicit string of letters by forwarding
/// @p value unchanged to mSequenceInfo.SetAlphabet().
/// @param value  alphabet string to store
370 inline
371 void CAlnReader::SetAlphabet(const string& value)
372 {
373  mSequenceInfo.SetAlphabet(value);
374 }
375 
376 
/// Accessor for the beginning-gap string: returns, by const reference,
/// the result of mSequenceInfo.BeginningGap().
/// NOTE(review): presumably the characters treated as gaps before the first
/// non-gap residue of a row -- confirm against CSequenceInfo.
377 inline
378 const string& CAlnReader::GetBeginningGap(void) const
379 {
380  return mSequenceInfo.BeginningGap();
381 }
382 
383 
384 inline
386 {
387  mSequenceInfo.SetBeginningGap(value);
388 }
389 
390 
/// Accessor for the middle-gap string: returns, by const reference,
/// the result of mSequenceInfo.MiddleGap().
/// NOTE(review): presumably the characters treated as gaps between the first
/// and last non-gap residue of a row -- confirm against CSequenceInfo.
391 inline
392 const string& CAlnReader::GetMiddleGap(void) const
393 {
394  return mSequenceInfo.MiddleGap();
395 }
396 
397 
/// Sets the middle-gap string by forwarding @p value unchanged to
/// mSequenceInfo.SetMiddleGap(). Any value returned by that call is ignored.
/// @param value  gap characters to store
398 inline
399 void CAlnReader::SetMiddleGap(const string& value)
400 {
401  mSequenceInfo.SetMiddleGap(value);
402 }
403 
404 
/// Accessor for the end-gap string: returns, by const reference,
/// the result of mSequenceInfo.EndGap().
/// NOTE(review): presumably the characters treated as gaps after the last
/// non-gap residue of a row -- confirm against CSequenceInfo.
405 inline
406 const string& CAlnReader::GetEndGap(void) const
407 {
408  return mSequenceInfo.EndGap();
409 }
410 
/// Sets the end-gap string by forwarding @p value unchanged to
/// mSequenceInfo.SetEndGap(). Any value returned by that call is ignored.
/// @param value  gap characters to store
411 inline
412 void CAlnReader::SetEndGap(const string& value)
413 {
414  mSequenceInfo.SetEndGap(value);
415 }
416 
417 
418 inline
420 {
422 }
423 
424 
425 inline
427 {
428  return (GetAlphabet() == GetAlphabetLetters(alpha));
429 }
430 
431 
/// Convenience setter: applies the same @p value as the beginning, middle
/// and end gap string in a single call.
/// @param value  gap characters to use for all three gap contexts
432 inline
433 void CAlnReader::SetAllGap(const string& value)
434 {
     // The three setters are chained, so each one must return an object the
     // next can be called on (presumably the CSequenceInfo itself -- confirm).
435  mSequenceInfo.SetBeginningGap(value).SetMiddleGap(value).SetEndGap(value);
436 }
437 
438 inline
440 {
441  return m_AlignFormat;
442 }
443 
444 
446 
447 #endif // OBJTOOLS_READERS___ALN_READER__HPP
EAlignFormat
Definition: aln_formats.hpp:37
CAnchoredAln::TDim TDim
struct SLineInfo SLineInfo
EAlnErr
Definition: alnread.h:72
@ eAlnErr_Unknown
Definition: alnread.h:73
@ eAlnErr_NoError
Definition: alnread.h:74
struct SAlignmentFile SAlignmentFile
int GetLineNum() const
Definition: aln_reader.hpp:80
EAlnErr GetCategory() const
Definition: aln_reader.hpp:79
const string & GetMsg() const
Definition: aln_reader.hpp:82
const string & GetID() const
Definition: aln_reader.hpp:81
const string & GetMessage() const
Definition: aln_reader.hpp:86
string m_ID
Definition: aln_reader.hpp:91
@ eAlnErr_BadFormat
Definition: aln_reader.hpp:65
@ eAlnErr_BadData
Definition: aln_reader.hpp:64
string m_Message
Definition: aln_reader.hpp:92
int m_LineNum
Definition: aln_reader.hpp:90
EAlnErr m_Category
Definition: aln_reader.hpp:89
const_iterator end(void) const
Definition: aln_reader.hpp:137
TErrors::const_iterator const_iterator
Definition: aln_reader.hpp:135
void push_back(const CAlnError &)
Definition: aln_reader.hpp:127
size_t GetErrorCount(CAlnError::EAlnErr) const
Definition: aln_reader.hpp:119
const_iterator begin(void) const
Definition: aln_reader.hpp:136
class CAlnReader supports importing a large variety of text-based alignment formats into standard dat...
Definition: aln_reader.hpp:100
vector< string > m_IdStrings
Definition: aln_reader.hpp:283
bool m_ReadSucceeded
Definition: aln_reader.hpp:296
EAlignFormat GetLastAlignmentFileFormat(void) const
Definition: aln_reader.hpp:439
bool m_ReadDone
Definition: aln_reader.hpp:295
const vector< TLineInfo > & GetDeflineInfo(void) const
Definition: aln_reader.hpp:229
@ eAlpha_Rna_no_ambiguity
Definition: aln_reader.hpp:110
@ eAlpha_Dna_no_ambiguity
Definition: aln_reader.hpp:109
objects::CFastaDeflineReader::TIgnoredProblems TIgnoredProblems
Definition: aln_reader.hpp:339
const TErrorList & GetErrorList(void) const
Definition: aln_reader.hpp:234
int x_GetGCD(const int a, const int b) const
const vector< string > & GetSeqs(void) const
Definition: aln_reader.hpp:226
objects::CFastaDeflineReader::SDeflineParseInfo SDeflineParseInfo
Definition: aln_reader.hpp:338
bool x_IsReplicatedSequence(const char *sequence_data, int sequence_length, int repeat_interval) const
CAlnReader & operator=(const CAlnReader &value)
void SetUseNexusInfo(bool useNexusInfo)
Definition: aln_reader.hpp:185
void Read(TReadFlags=fReadDefaults, objects::ILineErrorListener *pErrorListener=nullptr)
vector< string > m_SeqVec
Definition: aln_reader.hpp:300
void Read(bool guess, bool generate_local_ids=false, objects::ILineErrorListener *pErrorListener=nullptr)
TAlignMiddles m_MiddleSections
Definition: aln_reader.hpp:312
function< void(const list< CRef< objects::CSeq_id > > &, int, objects::CAlnErrorReporter *)> FValidateIds
Definition: aln_reader.hpp:152
static string GetAlphabetLetters(EAlphabet)
Definition: aln_reader.cpp:207
vector< TSeqPos > m_SeqLen
Definition: aln_reader.hpp:301
const string & GetMiddleGap(void) const
Definition: aln_reader.hpp:392
vector< string > m_Deflines
Definition: aln_reader.hpp:287
void SetMatch(const string &value)
Definition: aln_reader.hpp:195
bool m_UseNexusInfo
Definition: aln_reader.hpp:303
int TReadFlags
binary OR of EReadFlags
Definition: aln_reader.hpp:212
objects::CDense_seg::TDim TNumrow
Definition: aln_reader.hpp:314
vector< string > m_Organisms
Definition: aln_reader.hpp:286
EReadFlags
Read the file. These are the main functions.
Definition: aln_reader.hpp:208
const string & GetAlphabet(void) const
Definition: aln_reader.hpp:364
pair< TSeqPos, TSeqPos > TAlignMiddleInterval
characters have different contexts, depending on whether they are before the first non-gap character,...
Definition: aln_reader.hpp:310
objects::CFastaDeflineReader::TFastaFlags TFastaFlags
Definition: aln_reader.hpp:236
int GetDim(void) const
Definition: aln_reader.hpp:230
vector< TIdList > m_Ids
Definition: aln_reader.hpp:284
bool GetUseNexusInfo() const
Definition: aln_reader.hpp:184
function< void(const objects::CSeq_id &, int, objects::CAlnErrorReporter *)> FIdValidate
Definition: aln_reader.hpp:157
const vector< string > & GetDeflines(void) const
Definition: aln_reader.hpp:228
ncbi::objects::CSequenceInfo mSequenceInfo
Definition: aln_reader.hpp:274
const vector< string > & GetIds(void) const
Parsed result data accessors.
Definition: aln_reader.hpp:225
objects::CFastaDeflineReader::SLineTextAndLoc SLineTextAndLoc
Definition: aln_reader.hpp:336
EAlignFormat m_AlignFormat
Definition: aln_reader.hpp:290
const vector< string > & GetOrganisms(void) const
Definition: aln_reader.hpp:227
TErrorList m_Errors
Definition: aln_reader.hpp:302
void SetMiddleGap(const string &value)
Definition: aln_reader.hpp:399
bool IsAlphabet(EAlphabet alpha) const
Definition: aln_reader.hpp:426
objects::CFastaIdHandler m_FastaIdHandler
Definition: aln_reader.hpp:353
CNcbiIstream & m_IS
Other internal data.
Definition: aln_reader.hpp:294
vector< string > m_Seqs
Definition: aln_reader.hpp:285
objects::SLineInfo TLineInfo
Definition: aln_reader.hpp:148
CRef< objects::CSeq_entry > m_Entry
Definition: aln_reader.hpp:299
const string & GetEndGap(void) const
Definition: aln_reader.hpp:406
void SetEndGap(const string &value)
Definition: aln_reader.hpp:412
void SetBeginningGap(const string &value)
Definition: aln_reader.hpp:385
CAlnReader(const CAlnReader &value)
Prohibit copy constructor and assignment operator.
const string & GetMissing(void) const
Definition: aln_reader.hpp:191
const string & GetMatch(void) const
Definition: aln_reader.hpp:194
objects::CFastaDeflineReader::TSeqTitles TSeqTitles
Definition: aln_reader.hpp:337
void SetAlphabet(const string &value)
Definition: aln_reader.hpp:371
CRef< objects::CSeq_align > m_Aln
Definition: aln_reader.hpp:298
void SetMissing(const string &value)
Definition: aln_reader.hpp:192
void SetAllGap(const string &value)
Convenience function for setting beginning, middle, and end gap to the same thing.
Definition: aln_reader.hpp:433
list< CRef< objects::CSeq_id > > TIdList
Parsed result data (analogous to SAlignmentFile) Seqs are upper-case strings representing the sequenc...
Definition: aln_reader.hpp:282
vector< TLineInfo > m_DeflineInfo
Definition: aln_reader.hpp:288
vector< TAlignMiddleInterval > TAlignMiddles
Definition: aln_reader.hpp:311
const string & GetBeginningGap(void) const
Definition: aln_reader.hpp:378
CAlnErrorContainer TErrorList
Definition: aln_reader.hpp:142
Include a standard set of the NCBI C++ Toolkit most basic headers.
static uch flags
Operators to edit gaps in sequences.
static unsigned int line_num
Definition: attributes.c:11
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define NCBI_DEPRECATED
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
#define NCBI_XOBJREAD_EXPORT
Definition: ncbi_export.h:1315
static MDB_envinfo info
Definition: mdb_load.c:37
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
unsigned int a
Definition: ncbi_localip.c:102
#define row(bind, expected)
Definition: string_bind.c:73
Modified on Fri Sep 20 14:57:25 2024 by modify_doxy.py rev. 669887