NCBI C++ ToolKit
columnar_vcf_reader.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef GUI_WIDGETS___LOADERS___COLUMNAR_VCF_READER__HPP_
2 #define GUI_WIDGETS___LOADERS___COLUMNAR_VCF_READER__HPP_
3 /* $Id: columnar_vcf_reader.hpp 46761 2021-10-01 01:07:59Z evgeniev $
4  * ===========================================================================
5  *
6  * PUBLIC DOMAIN NOTICE
7  * National Center for Biotechnology Information
8  *
9  * This software/database is a "United States Government Work" under the
10  * terms of the United States Copyright Act. It was written as part of
11  * the author's official duties as a United States Government employee and
12  * thus cannot be copyrighted. This software/database is freely available
13  * to the public for use. The National Library of Medicine and the U.S.
14  * Government have not placed any restriction on its use or reproduction.
15  *
16  * Although all reasonable efforts have been taken to ensure the accuracy
17  * and reliability of the software and data, the NLM and the U.S.
18  * Government do not and cannot warrant the performance or results that
19  * may be obtained by using this software or data. The NLM and the U.S.
20  * Government disclaim all warranties, express or implied, including
21  * warranties of performance, merchantability or fitness for any particular
22  * purpose.
23  *
24  * Please cite the author in any work or product based on this material.
25  *
26  * ===========================================================================
27  *
28  * Authors: Andrea Asztalos, Anatoliy Kuznetsov
29  *
30  * File Description:
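31  *   Declares CColumnarVCFReader, a reader for VCF files, and SVcfFieldData, which describes an INFO field of the VCF header.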
32  *
33  */
34 
35 
36 #include <corelib/ncbistd.hpp>
37 #include <corelib/ncbiobj.hpp>
38 #include <corelib/ncbimisc.hpp>
39 #include <gui/gui_export.h>
40 #include <util/icanceled.hpp>
42 #include <unordered_map>
43 #include <functional>
44 #include <mutex>
45 #include <utility>
46 
47 
49 
50 class CVCFVariantList;
52  class CSeq_id;
53  class ILineErrorListener;
56 
57 struct SVcfFieldData;
58 
59 /// Class responsible to read VCF files
61 {
62 public:
63  using TFieldVector = vector<CConstRef<SVcfFieldData>>; ///< Vector of constant references to field descriptors
64  using TSeqIdVarsListPair = pair<CConstRef<objects::CSeq_id>, CRef<CVCFVariantList>>; ///< Pairs a seq-id with a variants list
65  /// Defines a callable object, used when a variants list is processed by the reader.
66  /// The callable receives a reference to the variants list.
67  using TOnVCFVariantListReady = std::function<void(CVCFVariantList&)>;
68  using TReportProgress = function<void(const string&)>; ///< Callable taking a string; it is the type of the prog_func parameter of ReadData() and ReadVariantsForChrs()
69 
71  virtual ~CColumnarVCFReader() {} ///< Virtual destructor with an empty body
72 
73  /// Reads only the header section of the file
74  /// @param[in] in
75  /// Reference to an input stream.
76  /// @param[in] canceled
77  /// Optional pointer to check for cancellation requests.
78  /// @param[in] listener
79  /// Optional pointer to report VCF syntax errors.
    /// @return
    /// NOTE(review): the meaning of the bool result is not documented here;
    /// ReadData() returns false on cancellation - confirm that ReadHeader() follows the same convention.
    /// @note
    /// GetVCFversion(), GetAssembly(), GetInfoFields() and GetSamples() are meant to be used after this call.
80  bool ReadHeader(CNcbiIstream& in, ICanceled* canceled = nullptr, objects::ILineErrorListener* listener = nullptr);
81 
82  void LoadAllInfo(bool value) { m_LoadAllInfo = value; } ///< Sets the flag to load every INFO field
83  set<string>& LoadSelectedInfoFields() { return m_LoadInfoFields; } ///< Mutable access to the list of INFO fields required to be loaded
84 
85  void LoadAllSamples(bool value) { m_LoadAllSamples = value; } ///< Sets the flag to load every SAMPLE column
86  map<unsigned, string>& LoadSelectedSamples() { return m_LoadSamples; } ///< Mutable access to the list of SAMPLES required to be loaded
87 
88  /// Reads only the data section of the file
89  /// @param[in] in
90  /// Reference to an input stream.
91  /// @param[in] canceled
92  /// Optional pointer to check for cancellation requests.
93  /// @param[in] listener
94  /// Optional pointer to report VCF syntax errors.
    /// @param[in] prog_func
    /// Optional callable taking a string; presumably receives progress messages - TODO confirm.
95  /// @param[in] on_variants_list_ready
96  /// Optional function, called when a VCF variants list becomes available.
97  /// @return
98  /// Returns false if the reading was canceled, true otherwise.
99  ///
100  /// The reader has two modes of operation:
101  /// - Events based - when on_variants_list_ready is set to a callable object, the reader calls it for every variants list read from the input and then releases the list;
102  /// - DOM mode - all variants lists are loaded in memory and can be obtained by calling @ref GetVariantsForChr(const string&).
103  bool ReadData(CNcbiIstream& in, ICanceled* canceled = nullptr, objects::ILineErrorListener* listener = nullptr, TReportProgress prog_func = TReportProgress(), TOnVCFVariantListReady on_variants_list_ready = TOnVCFVariantListReady());
104 
105  /// Reads a list of variants
106  /// @param[in] in
107  /// Reference to an input stream.
108  /// @param[in] chr_list
109  /// List of pairs, each holding a seq-id and a list of its synonyms.
110  /// @param[in] canceled
111  /// Optional pointer to check for cancellation requests.
112  /// @param[in] listener
113  /// Optional pointer to report VCF syntax errors.
     /// @param[in] prog_func
     /// Optional callable taking a string; presumably receives progress messages - TODO confirm.
114  /// @param[in] on_variants_list_ready
115  /// Optional function, called when a VCF variants list becomes available.
116  /// @return
117  /// Returns pairs of seq-id - variants list
118  ///
119  /// The reader has two modes of operation:
120  /// - Events based - when on_variants_list_ready is set to a callable object, the reader calls it for every variants list read from the input and then releases the list;
121  /// - DOM mode - all variants lists are loaded in memory and returned. They can be obtained by calling @ref GetVariantsForChr(const string&).
122  vector<TSeqIdVarsListPair>
123  ReadVariantsForChrs(CNcbiIstream& in,
124  const vector<pair<CConstRef<objects::CSeq_id>, vector<string>>>& chr_list,
125  ICanceled* canceled = nullptr,
126  objects::ILineErrorListener* listener = nullptr,
127  TReportProgress prog_func = TReportProgress(),
128  TOnVCFVariantListReady on_variants_list_ready = TOnVCFVariantListReady());
129 
130  /// Returns a vector, holding the chrs/contigs identifiers, read from the file
131  /// @return
132  /// Vector of strings with the chr/contig identifiers.
133  /// @note
134  /// Use this function when the entire file has been read.
135  vector<string> GetChromosomeNames() const;
136  /// Retrieves the variants list for a given chr/contig
137  /// @param[in] chr_name
138  /// String reference specifying the chr/contig
139  /// @return
140  /// Returns a CRef (reference-counted smart pointer) to the variants list for the given chr/contig.
141  /// @note
142  /// Use this function when the entire file has been read.
143  CRef<CVCFVariantList> GetVariantsForChr(const string& chr_name) const;
144 
145  /// Limits the number of variations to be read
146  /// @param[in] count
147  /// Maximum number of variations to read.
     /// The stored limit is initialized to -1 (presumably meaning "no limit" - TODO confirm).
     // NOTE(review): the ';' after the function body below is redundant (an empty declaration).
148  void SetVariationsLimits(int count) { m_VariationsLimit = count; };
149 
150  // Use these functions after ReadHeader() was called
151  const string& GetVCFversion() const { return m_VCFversion; } ///< Returns the stored VCF version string
152  const string& GetAssembly() const { return m_Assembly; } ///< Returns the stored assembly string
153  const set<CConstRef<SVcfFieldData>>& GetInfoFields() const { return m_InfoFields; } ///< Returns the INFO fields parsed from the header of the file
154  const map<unsigned, string>& GetSamples() const { return m_SampleCols; } ///< Returns the SAMPLE columns parsed from the last line of the header
155 
156 protected:
157  void x_InterruptReading();
158  void x_ResetInfo(); ///< Must be updated whenever the list of header-related data members is modified
159 
     // Header-line helpers: each takes the header line text, its line number and an error listener pointer.
     // NOTE(review): the meaning of the unsigned value returned by x_ProcessHeaderLine() is not visible here - confirm in the implementation.
160  unsigned x_ProcessHeaderLine(const string& header_line, unsigned line_nr, objects::ILineErrorListener* listener);
161  void x_GatherSampleColNames(const string& header_line, objects::ILineErrorListener* listener, unsigned line_nr);
162  void x_GetSamplesToLoad(const string& header_line, objects::ILineErrorListener* listener, unsigned line_nr);
163 
     // Error-reporting helpers: each takes a line exception and an error listener pointer.
     // NOTE(review): presumably one helper per severity (warning / error / critical error), as the names suggest - confirm.
164  void x_ProcessWarning(objects::CObjReaderLineException& err, objects::ILineErrorListener* error_cont);
165  void x_ProcessError(objects::CObjReaderLineException& err, objects::ILineErrorListener* error_cont);
166  void x_ProcessCriticalError(objects::CObjReaderLineException& err, objects::ILineErrorListener* error_cont);
167 
168  // members used when reading the header
169  /// @note
170  /// Update @ref x_ResetInfo() if the list is modified
171  string m_VCFversion; ///< Returned by GetVCFversion()
172  string m_Assembly; ///< Returned by GetAssembly()
174  set<CConstRef<SVcfFieldData>> m_InfoFields; ///< List of INFO fields parsed from the header of the file
175  map<unsigned, string> m_SampleCols; ///< List of SAMPLE columns parsed from the last line of the header, order is important
176 
177  // members used when reading the data section
178  bool m_LoadAllInfo{ true }; ///< Flag to load every INFO field
179  set<string> m_LoadInfoFields; ///< List of INFO fields required to be loaded
180 
181  bool m_LoadAllSamples{ true }; ///< Flag to load every SAMPLE column
182  map<unsigned, string> m_LoadSamples; ///< List of SAMPLES required to be loaded
183  unordered_map<string, CRef<CVCFVariantList>> m_ChromosomeMap; ///< Variants lists keyed by a string; presumably the chr/contig name - TODO confirm
184  int m_VariationsLimit { -1 }; ///< Set by SetVariationsLimits(); initialized to -1
185 };
186 
187 /// Structure to store characteristics of an INFO field.
188 /// It is constructed from an INFO meta-information line from the VCF header.
189 /// Example:
190 /// ##INFO=<ID=ID,Number=number,Type=type,Description="description",Source="source",Version="version">
191 /// It can be used/expanded to include other structured meta-information lines: FORMAT, FILTER
193 {
    /// Constructs the field data from a meta-information line; the parsing itself is not defined in this header.
194  SVcfFieldData(const string& line);
    /// Constructs the field data directly from the given name, description and number.
195  SVcfFieldData(const string& name, const string& descr, const string& nr)
196  : m_Name(name), m_Description(descr), m_Number(nr) {}
197 
198  string m_Name; ///< INFO ID (name)
199  string m_Description; ///< INFO Description
200  string m_Number; ///< INFO Number - it describes the number of values that can be included with this field
201 };
202 
203 
204 // Testing the reader
206 {
207 public:
208  // These two functions also output memory footprint and timings.
    // `out` is optional (defaults to nullptr); presumably the footprint and timings are written to it when set - TODO confirm.
209  void SerializeToDisk(const string& prefix, CNcbiOstream* out = nullptr);
210  void Deserialize(const string& prefix, CNcbiOstream* out = nullptr);
211 
    // Each of the functions below takes an output stream; presumably the listing/statistics are written to it - TODO confirm.
    // NOTE(review): the meaning of only_sv_cols (default false) is not visible in this header.
212  void ListColumns(CNcbiOstream& out, bool only_sv_cols = false);
213  void ListIndexVectors(CNcbiOstream& out);
214  void GetStatistics(CNcbiOstream& out);
215 
216 };
217 
219 
220 
221 #endif //GUI_WIDGETS___LOADERS___COLUMNAR_VCF_READER__HPP_
Class responsible to read VCF files.
pair< CConstRef< objects::CSeq_id >, CRef< CVCFVariantList > > TSeqIdVarsListPair
const set< CConstRef< SVcfFieldData > > & GetInfoFields() const
map< unsigned, string > & LoadSelectedSamples()
map< unsigned, string > m_SampleCols
List of SAMPLE columns parsed from the last line of the header, order is important.
map< unsigned, string > m_LoadSamples
List of SAMPLES required to be loaded.
function< void(const string &)> TReportProgress
const string & GetAssembly() const
void LoadAllInfo(bool value)
void LoadAllSamples(bool value)
void SetVariationsLimits(int count)
Limits the number of variations to be read.
set< CConstRef< SVcfFieldData > > m_InfoFields
List of INFO fields parsed from the header of the file.
std::function< void(CVCFVariantList &)> TOnVCFVariantListReady
Defines a callable object, used when a variants list is processed by the reader.
const string & GetVCFversion() const
set< string > & LoadSelectedInfoFields()
vector< CConstRef< SVcfFieldData > > TFieldVector
const map< unsigned, string > & GetSamples() const
unordered_map< string, CRef< CVCFVariantList > > m_ChromosomeMap
set< string > m_LoadInfoFields
List of INFO fields required to be loaded.
CObject –.
Definition: ncbiobj.hpp:180
CRef –.
Definition: ncbiobj.hpp:618
Interface for testing cancellation request in a long lasting operation.
Definition: icanceled.hpp:51
Include a standard set of the NCBI C++ Toolkit most basic headers.
std::ofstream out("events_result.xml")
main entry point for tests
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
#define NCBI_GUIWIDGETS_LOADERS_EXPORT
Definition: gui_export.h:525
Defines to provide correct exporting from DLLs in Windows.
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
Miscellaneous common-use basic types and functionality.
Portable reference counted smart and weak pointers using CWeakRef, CRef, CObject and CObjectEx.
std::istream & in(std::istream &in_, double &x_)
static const char * prefix[]
Definition: pcregrep.c:405
Structure to store characteristics of an INFO field It is constructed from an INFO meta-information l...
string m_Name
INFO ID (name)
SVcfFieldData(const string &name, const string &descr, const string &nr)
string m_Description
INFO Description.
void Deserialize(CNcbiIstream &istr, CRawScoreVector< Key, Score > &)
Modified on Mon May 20 04:58:35 2024 by modify_doxy.py rev. 669887