NCBI C++ ToolKit
demo_read_large_vcf.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: demo_read_large_vcf.cpp 46618 2021-08-05 19:04:11Z asztalos $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Andrea Asztalos
27  *
28  * File Description:
29  * Demo application for reading large (> 100MB) VCF files
30  */
31 
32 
33 #include <ncbi_pch.hpp>
34 
35 #include <corelib/ncbiapp.hpp>
36 #include <corelib/ncbireg.hpp>
37 #include <corelib/ncbi_limits.hpp>
43 #include <chrono>
44 
46 
48 {
49 public:
50  virtual void Init(void);
51  virtual int Run(void);
52  virtual void Exit(void);
53 private:
54  bool x_LoadVCFFile(const string& fname);
55  bool x_LoadSerializedData(const string& fname);
56  void x_UpdateProgress(const string& text);
57  bool x_ListErrors(objects::IMessageListener* errCont);
58 
59  TSeqRange x_ParseRange(const string& range_str);
60  void x_ReadVCFBlob(const string& fname, vector<char>& vcf_blob);
61 };
62 
64 {
65  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
66  arg_desc->SetUsageContext(GetArguments().GetProgramBasename(), "Demo application to read VCF files");
67 
68  arg_desc->AddOptionalKey("i", "InFile", "Input VCF file", CArgDescriptions::eInputFile);
69 
70  arg_desc->AddOptionalKey("iserial", "InFile", "Input File containing the serialized data", CArgDescriptions::eInputFile);
71 
72  arg_desc->AddOptionalKey("o", "OutFile", "Single Output File", CArgDescriptions::eOutputFile);
73 
74  arg_desc->AddFlag("sv_cols", "Output only the first five columns from the data section of the input file");
75 
76  // for serializing the entire list of variants
77  arg_desc->AddOptionalKey("serial", "OutputFile", "Output file for serialized data", CArgDescriptions::eOutputFile);
78 
79  // to deserialize and compare
80  arg_desc->AddFlag("deserialize_and_compare", "Deserialize data and compare it with the existing variant data");
81 
82  arg_desc->AddOptionalKey("range", "GenomicRange", "Specify the genomic range for which to extract the data", CArgDescriptions::eString);
83  // for serializing individual columns
84  //arg_desc->AddOptionalKey("serialize_cols", "OutputFile", "Output file with logging serialization and deserialization", CArgDescriptions::eOutputFile);
85 
86  arg_desc->AddOptionalKey("index", "OutFile", "Output file for listing index vectors", CArgDescriptions::eOutputFile);
87 
88  //arg_desc->AddOptionalKey("stats", "OutputFile", "Output File for listing statistics", CArgDescriptions::eOutputFile);
89 
90  //arg_desc->AddFlag("hist", "Compute histogram");
91 
92  SetupArgDescriptions(arg_desc.release());
93 }
94 
95 // To serialize the data into one file:
96 // demo_read_large_vcf -i bbb.gz -serial serial_bbb.bin -o output_bbb.txt -sv_cols
97 
98 // To read the serialized data from a file:
99 // demo_read_large_vcf -iserial serial_bbb.bin -o output_bbb.txt -sv_cols
100 
101 // demo_read_large_vcf -i bbb.gz -serial serial_bbb.bin -deserialize_and_compare
102 bool CColumnarVCFReaderApp::x_LoadVCFFile(const string& fname)
103 {
104  cout << "Starting to read file: " << fname << endl;
105  CCompressedFile file(fname);
106  CColumnarVCFReader reader;
107  CRef<CErrorContainer> err_cont{ new CErrorContainer(1000) };
108 
109  bool has_errors = false;
110  bool read_data = false;
111  try {
112  CColumnarVCFReader::TReportProgress progress = bind(&CColumnarVCFReaderApp::x_UpdateProgress, this, placeholders::_1);
113  read_data = reader.ReadData(file.GetIstream(false), nullptr, err_cont, progress);
114  has_errors = x_ListErrors(err_cont);
115  }
116  catch (const CException& e) {
117  cout << "Loading VCF file " << fname << " failed: " << e.GetMsg();
118  has_errors |= x_ListErrors(err_cont);
119  }
120  catch (const exception& e) {
121  cout << "Loading VCF file " << fname << " failed: " << e.what();
122  has_errors |= x_ListErrors(err_cont);
123  }
124 
125  if (!read_data)
126  return false;
127 
128  cout << "VCF file was successfully loaded into memory" << endl;
129 
130  vector<string> chr_names = reader.GetChromosomeNames();
131  for (const auto& it : chr_names) {
132  auto var = reader.GetVariantsForChr(it);
133  cout << it << "\t" << var->Count() << " variants\n";
134  }
135 
136  const CArgs& args = GetArgs();
137 
138  if (args["serial"]) {
139  const string& serial_fname = args["serial"].AsString();
140  auto start = chrono::steady_clock::now();
141  for (const auto& it : chr_names) {
142  auto var = reader.GetVariantsForChr(it);
143  var->WriteSerializedData(it + "_" + serial_fname);
144  }
145  auto diff = chrono::steady_clock::now() - start;
146  cout << "Serialization of all data took: " <<
147  chrono::duration_cast<chrono::milliseconds>(diff).count() << " ms" << endl;
148 
149  if (args["deserialize_and_compare"]) {
150  for (const auto& it : chr_names) {
151 
152  auto orig_var = reader.GetVariantsForChr(it);
153  CRef<CVCFVariantList> new_variant(new CVCFVariantList(it, it + "_" + serial_fname));
154  if (*orig_var == *new_variant) {
155  cout << it << "\t" << "are the same\n";
156  }
157  else {
158  cout << it << "\t" << "are not the same\n";
159  return false;
160  }
161  }
162  }
163  }
164 
165  if (args["o"]) {
166  try {
167  for (const auto& it : chr_names) {
168  auto var = reader.GetVariantsForChr(it);
169  string out_fname = it + "_" + args["o"].AsString();
170  CNcbiOfstream out(out_fname.data());
171  var->List(out, args["sv_cols"]);
172  }
173  cout << "Finished listing columns" << endl;
174  }
175  catch (const CException& e) {
176  cout << "Listing data columns has failed: " << e.GetMsg() << endl;
177  return false;
178  }
179  }
180 
181  if (args["index"]) {
182  try {
183  for (const auto& it : chr_names) {
184  auto var = reader.GetVariantsForChr(it);
185  string out_fname = it + "_" + args["index"].AsString();
186  CNcbiOfstream out(out_fname.data());
187  var->ListPositionVectors(out);
188  }
189  cout << "Finished listing index vectors " << endl;
190  }
191  catch (const CException& e) {
192  cout << "Listing index vectors has failed: " << e.GetMsg() << endl;
193  return false;
194  }
195  }
196 
197  return true;
198 }
199 
201 {
202  cout << text << endl;
203 }
204 
205 bool CColumnarVCFReaderApp::x_ListErrors(objects::IMessageListener* errCont)
206 {
207  bool has_errors = (errCont && errCont->Count() > 0);
208  if (has_errors) {
209  for (size_t i = 0; i < errCont->Count(); ++i) {
210  const auto& lerror = errCont->GetError(i);
211  cerr << "Line " << lerror.Line() << ". " << lerror.Message() << endl;
212  }
213  }
214  return has_errors;
215 }
216 
218 {
219 
220  // The VCF blob does not contain the name of the chromosome. For now we use a
221  // placeholder for it: "chr1".
222  const CArgs& args = GetArgs();
223 
224  if (!args["range"]) {
225  CRef<CVCFVariantList> new_variant;
226  try {
227  new_variant.Reset(new CVCFVariantList("chr1", fname));
228  cout << "Number of variants read: " << new_variant->Count() << endl;
229  }
230  catch (const CException& e) {
231  cout << "Failed to load data from " << fname << ": " << e.GetMsg() << endl;
232  }
233  if (!new_variant)
234  return false;
235 
236  return true;
237  }
238 
239 
240 
241  if (args["range"]) {
242  TSeqRange range = x_ParseRange(args["range"].AsString());
243  if (range.Empty())
244  return true;
245 
246  vector<char> vcf_blob;
247  x_ReadVCFBlob(fname, vcf_blob);
248 
250  CVCFSlicedVariants variants(vcf_blob, &range, data_cols);
251 
252  cout << "There are " << variants.Count(range) << " variants in the specified range";
253 
254  string ofname("range_" + NStr::UInt8ToString(range.GetFrom()) + "_" + NStr::UInt8ToString(range.GetTo()) + ".txt");
255  CNcbiOfstream out(ofname.data());
256  variants.List(out);
257  }
258  return true;
259 }
260 
262 {
264  if (range_str.empty())
265  return range;
266 
267  vector<string> positions;
268  NStr::Split(range_str, "-", positions, NStr::fSplit_MergeDelimiters);
269  if (positions.size() != 2) {
270  // don't process bad input data
271  return range;
272  }
273 
274  NStr::TruncateSpacesInPlace(positions[0]);
275  NStr::TruncateSpacesInPlace(positions[1]);
276  TSeqPos from = 1, to = 1;
277  try {
278  from = NStr::StringToUInt8(positions[0]);
279  to = NStr::StringToUInt8(positions[1]);
280  }
281  catch (const CException&) {
282  return range;
283  }
284 
285  cout << "Parsed: " << from << " and " << to << endl;
286  range.Set(from, to);
287  cout << "Range: " << range.GetFrom() << " and " << range.GetTo() << endl;
288  return range;
289 }
290 
291 void CColumnarVCFReaderApp::x_ReadVCFBlob(const string& fname, vector<char>& vcf_blob)
292 {
293  CFileIO fio;
294  try {
295  fio.Open(fname, CFileIO::eOpen, CFileIO::eRead);
296  }
297  catch (const CException& e) {
298  cerr << "Cannot open " + fname + "\nas: " + e.GetMsg() << endl;
299  }
300 
301  vcf_blob.resize(0);
302  vcf_blob.reserve(fio.GetFileSize());
303  char* buf = vcf_blob.data();
304  fio.Read(buf, fio.GetFileSize());
305  fio.Close();
306 }
307 
308 
310 {
311  const CArgs& args = GetArgs();
312 
313  if (args["i"]) {
314  if (!x_LoadVCFFile(args["i"].AsString())) {
315  return 1;
316  }
317  }
318  else if (args["iserial"]) {
319  if (!x_LoadSerializedData(args["iserial"].AsString())) {
320  return 1;
321  }
322  }
323  return 0;
324 }
325 
326 /*
327 
328  if (args["i"]) {
329  if (args["serialize_cols"]) {
330  reader.SerializeToDisk(fname, &args["serialize_cols"].AsOutputFile());
331  reader.Deserialize(fname, &args["serialize_cols"].AsOutputFile());
332  cout << "Finished serializing data and deserialization was successful" << endl;
333  }
334 
335  if (args["stats"]) {
336  reader.GetStatistics(args["stats"].AsOutputFile());
337  cout << "Finished printing statistics" << endl;
338  try {
339  }
340  catch (const CException& e) {
341  cout << "Printing statistics of vectors has failed" << endl;
342  return 4;
343  }
344  }
345  }
346 
347 */
348 
349 
351 {
352  SetDiagStream(0);
353 #ifdef NCBI_OS_MSWIN
354  cout << "Press any key to end the demo app" << endl;
355  getchar();
356 #endif
357 }
358 
359 
360 /////////////////////////////////////////////////////////////////////////////
361 // MAIN
362 int NcbiSys_main(int argc, ncbi::TXChar* argv[])
363 {
364  // Execute main application function
365  return CColumnarVCFReaderApp().AppMain(argc, argv);
366 }
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
bool x_LoadVCFFile(const string &fname)
void x_ReadVCFBlob(const string &fname, vector< char > &vcf_blob)
virtual void Exit(void)
Cleanup on application exit.
bool x_LoadSerializedData(const string &fname)
virtual int Run(void)
Run the application.
TSeqRange x_ParseRange(const string &range_str)
virtual void Init(void)
Initialize the application.
bool x_ListErrors(objects::IMessageListener *errCont)
void x_UpdateProgress(const string &text)
Class responsible to read VCF files.
vector< string > GetChromosomeNames() const
Returns a vector, holding the chrs/contigs identifiers, read from the file.
function< void(const string &)> TReportProgress
CRef< CVCFVariantList > GetVariantsForChr(const string &chr_name) const
Retrieves the variants list for a given chr/contig.
bool ReadData(CNcbiIstream &in, ICanceled *canceled=nullptr, objects::ILineErrorListener *listener=nullptr, TReportProgress prog_func=TReportProgress(), TOnVCFVariantListReady on_variants_list_ready=TOnVCFVariantListReady())
Reads only the data section of the file.
CErrorContainer.
Class for support low level input/output for files.
Definition: ncbifile.hpp:3475
CRef –.
Definition: ncbiobj.hpp:618
void List(CNcbiOstream &out) const
static const vector< string > & s_GetAllColNames()
contains sm_INFO, sm_SAMPLES
int NcbiSys_main(int argc, ncbi::TXChar *argv[])
USING_NCBI_SCOPE
std::ofstream out("events_result.xml")
main entry point for tests
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:285
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:799
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1175
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eOutputFile
Name of file (must be writable)
Definition: ncbiargs.hpp:596
void SetDiagStream(CNcbiOstream *os, bool quick_flush=true, FDiagCleanup cleanup=0, void *cleanup_data=0, const string &stream_name="")
Set diagnostic stream.
Definition: ncbidiag.cpp:8083
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
Definition: ncbiexpt.cpp:342
void Close(void)
Close file.
Definition: ncbifile.cpp:6640
void Open(const string &filename, EOpenMode open_mode, EAccessMode access_mode, EShareMode share_mode=eShare)
Open file.
Definition: ncbifile.cpp:6416
Uint8 GetFileSize(void) const
Get file size.
Definition: ncbifile.cpp:6860
size_t Read(void *buf, size_t count) const
Read file.
Definition: ncbifile.cpp:6662
@ eRead
File can be read.
Definition: ncbifile.hpp:3435
@ eOpen
Open an existing file, or create a new one.
Definition: ncbifile.hpp:3425
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
Definition: ncbistre.hpp:500
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3457
char TXChar
Definition: ncbistr.hpp:172
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
Definition: ncbistr.cpp:3197
static Uint8 StringToUInt8(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to Uint8.
Definition: ncbistr.cpp:873
static string UInt8ToString(Uint8 value, TNumToStringFlags flags=0, int base=10)
Convert UInt8 to string.
Definition: ncbistr.hpp:5167
@ fSplit_MergeDelimiters
Merge adjacent delimiters.
Definition: ncbistr.hpp:2498
FILE * file
char * buf
int i
static void text(MDB_val *v)
Definition: mdb_dump.c:62
range(_Ty, _Ty) -> range< _Ty >
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Process information in the NCBI Registry, including working with configuration files.
C++ I/O stream wrappers to compress/decompress data on-the-fly.
Modified on Sat Dec 02 09:20:20 2023 by modify_doxy.py rev. 669887