NCBI C++ ToolKit
blast_fasta_input.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blast_fasta_input.hpp 79584 2017-09-22 18:03:01Z boratyng $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Jason Papadopoulos
27  *
28  */
29 
30 /** @file blast_fasta_input.hpp
31  * Interface for converting FASTA files into blast sequence input
32  */
33 
34 #ifndef ALGO_BLAST_BLASTINPUT___BLAST_FASTA_INPUT__HPP
35 #define ALGO_BLAST_BLASTINPUT___BLAST_FASTA_INPUT__HPP
36 
40 #include <util/range.hpp>
41 
43 BEGIN_SCOPE(blast)
44 
45 /// Class representing a text file containing sequences
46 /// in fasta format
47 ///
49 {
50 public:
51 
52  /// Constructor
53  /// @param infile The file to read [in]
54  /// @param iconfig Input configuration object, these options apply to all
55  /// input read [in]
57  const CBlastInputSourceConfig& iconfig);
58 
59  /// Constructor
60  /// @param objmgr Object Manager instance [in]
61  /// @param user_input User provided input in a string [in]
62  /// @param iconfig Input configuration object, these options apply to all
63  /// input read [in]
64  CBlastFastaInputSource(const string& user_input,
65  const CBlastInputSourceConfig& iconfig);
66 
67  /// Destructor
69 
70 protected:
71  /// Retrieve a single sequence (in an SSeqLoc container)
72  /// @param scope CScope object to use in SSeqLoc returned [in]
73  /// @throws CObjReaderParseException if input file is empty or the end of
74  /// file is reached unexpectedly
75  /// @note all masks are returned in either the plus strand (for
76  /// nucleotides) or unknown (for proteins)
77  virtual SSeqLoc GetNextSSeqLoc(CScope& scope);
78 
79  /// Retrieve a single sequence (in a CBlastSearchQuery container)
80  /// @param scope CScope object to use in CBlastSearchQuery returned [in]
81  /// @throws CObjReaderParseException if input file is empty or the end of
82  /// file is reached unexpectedly
83  /// @note all masks are returned in either both strands (for
84  /// nucleotides) or unknown (for proteins)
85  virtual CRef<CBlastSearchQuery> GetNextSequence(CScope& scope);
86 
87  /// Signal whether there are any unread sequences left
88  /// @return true if no unread sequences remaining
89  virtual bool End();
90 
91 private:
92  CBlastInputSourceConfig m_Config; ///< Configuration for the sequences to be read
93  CRef<ILineReader> m_LineReader; ///< interface to read lines
94  /// Reader of FASTA sequences or identifiers
96  bool m_ReadProteins; ///< read protein sequences?
97 
98  /// Read a single sequence from file and convert to a Seq_loc
99  /// @param lcase_mask A Seq_loc that describes the
100  /// lowercase-masked regions in the query that was read in.
101  /// If there are no such locations, the Seq_loc is of type
102  /// 'null', otherwise it is of type 'packed_seqint' [out]
103  /// @param scope CScope object to which the read sequence is added [in]
104  /// @return The sequence in Seq_loc format
105  ///
107  x_FastaToSeqLoc(CRef<objects::CSeq_loc>& lcase_mask, CScope& scope);
108 
109  /// Initialization method for the input reader
110  void x_InitInputReader();
111 };
112 
113 
115  : public CBlastInputSourceOMF
116 {
117 public:
118  /// Input formats
120  eFasta = 0,
122  eFastq
123  };
124 
125 
127  EInputFormat format = eFasta,
128  bool paired = false);
129 
131  EInputFormat format = eFasta);
132 
134 
135  virtual int GetNextSequence(CBioseq_set& bioseq_set);
136 
137  virtual bool End(void) {return m_LineReader->AtEOF();}
138 
139  void SetParseSeqIds(bool val) {m_ParseSeqIds = val;}
140 
141 private:
144 
145  CTempString x_ParseDefline(CTempString& line);
146 
147  /// Read sequences in FASTA or FASTQ format
148  void x_ReadFastaOrFastq(CBioseq_set& bioseq_set);
149 
150  /// Read one sequence from a FASTA file
151  CRef<CSeq_entry> x_ReadFastaOneSeq(CRef<ILineReader> line_reader);
152 
153  /// Read one sequence from a FASTQ file
154  CRef<CSeq_entry> x_ReadFastqOneSeq(CRef<ILineReader> line_reader);
155 
156  /// Read sequences from two FASTA or FASTQ files (for paired reads)
157  bool x_ReadFromTwoFiles(CBioseq_set& bioseq_set, EInputFormat format);
158 
159  /// Read sequences in FASTC format: defline, new line, a pair of sequences
160  /// on a single line separated by '><'
161  void x_ReadFastc(CBioseq_set& bioseq_set);
162 
163  CRef<CSeq_id> x_GetNextSeqId(void);
164 
165  /// Number of bases added so far
167  /// string::capacity() can be used instead
170  // for reading paired reads from two FASTA files
172  string m_Sequence;
173  /// Are paired sequences in the input
175  /// Input format: FASTA, FASTQ, FASTC
177  /// A counter for generating local ids
178  unsigned int m_Id;
179  /// Should defline ids be used in Bioseq objects
181 };
182 
183 
184 END_SCOPE(blast)
186 
187 #endif /* ALGO_BLAST_BLASTINPUT___BLAST_FASTA_INPUT__HPP */
Interface for converting sources of sequence data into blast sequence input.
Declares CBlastScopeSource class to create properly configured CScope objects to invoke the BLAST dat...
Class representing a text file containing sequences in fasta format.
virtual ~CBlastFastaInputSource()
Destructor.
CRef< ILineReader > m_LineReader
interface to read lines
AutoPtr< CFastaReader > m_InputReader
Reader of FASTA sequences or identifiers.
bool m_ReadProteins
read protein sequences?
CBlastInputSourceConfig m_Config
Configuration for the sequences to be read.
Class that centralizes the configuration data for sequences to be converted.
Definition: blast_input.hpp:48
virtual int GetNextSequence(CBioseq_set &bioseq_set)=0
Get one sequence (or a pair for NGS reads)
Base class representing a source of biological sequences.
CRef –.
Definition: ncbiobj.hpp:618
CScope –.
Definition: scope.hpp:92
CShortReadFastaInputSource(const CShortReadFastaInputSource &)
EInputFormat m_Format
Input format: FASTA, FASTQ, FASTC.
TSeqPos m_BasesAdded
Number of bases added so far.
bool m_ParseSeqIds
Should defline ids be used in Bioseq objects.
CRef< ILineReader > m_SecondLineReader
unsigned int m_Id
A counter for generating local ids.
CRef< ILineReader > m_LineReader
TSeqPos m_SeqBuffLen
string::capacity() can be used instead
CShortReadFastaInputSource & operator=(const CShortReadFastaInputSource &)
bool m_IsPaired
Are paired sequences in the input.
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
Operators to edit gaps in sequences.
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
#define NCBI_BLASTINPUT_EXPORT
Definition: ncbi_export.h:336
static Format format
Definition: njn_ioutil.cpp:53
static FILE * infile
Definition: pcre2test.c:950
Structure to represent a single sequence to be fed to BLAST.
Definition: sseqloc.hpp:47
Modified on Fri Sep 20 14:58:12 2024 by modify_doxy.py rev. 669887