NCBI C++ ToolKit
columnar_vcf_variants.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: columnar_vcf_variants.cpp 47479 2023-05-02 13:24:02Z ucko $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Andrea Asztalos, Anatoliy Kuznetsov
27  *
28  * File Description:
29  *
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <util/line_reader.hpp>
36 #include <util/bitset/bmdbg.h>
40 #include <chrono>
41 #include <numeric>
42 
44 
45 CVCFVariantList::CVCFVariantList(const string& chr_name,
46  bool load_all_info,
47  const set<string>& info_fields,
48  const map<unsigned, string>& sample_cols)
49  : CVCFVariantsBase(load_all_info, info_fields, sample_cols), m_ChrName(chr_name)
50 {
51 }
52 
/// Constructs a variant list for @a chr_name from the contents of a file.
///
/// The file is opened for reading, GetFileSize() bytes are read into a
/// temporary heap array, and that array is copied into a newly allocated
/// vector<char> held by m_Buffer. The temporary array is then released, the
/// file is closed and m_Index is reset to 0.
///
/// @param chr_name  chromosome name, stored in m_ChrName
/// @param filename  path of the file to load
/// @throws CVCFDataException (eFileCorrupted) when Open() or Read() raises a
///         CException.
53 CVCFVariantList::CVCFVariantList(const string& chr_name, const string& filename)
54  : m_ChrName(chr_name)
55 {
56  CFileIO fio;
57  try {
58  fio.Open(filename, CFileIO::eOpen, CFileIO::eRead);
59  }
60  catch (const CException& e) {
61  NCBI_THROW(CVCFDataException, eFileCorrupted, "Cannot open " + filename + "\nas: " + e.GetMsg());
62  }
    // NOTE(review): GetFileSize() returns Uint8; the value is stored in a size_t here.
63  size_t total_size = fio.GetFileSize();
64 
    // NOTE(review): this array is only released at the delete[] below; when
    // Read() throws, the catch block rethrows first and the array is leaked.
65  char* buf_ptr = new char[total_size];
66  try {
    // NOTE(review): Read() returns a byte count that is not checked here, so
    // a short read would go unnoticed - confirm whether that can happen.
67  fio.Read(buf_ptr, total_size);
68  }
69  catch (const CException& e) {
70  NCBI_THROW(CVCFDataException, eFileCorrupted, e.GetMsg());
71  }
72 
    // Copy the raw bytes into the long-lived buffer owned through m_Buffer.
73  m_Buffer = new vector<char>(buf_ptr, buf_ptr + total_size);
74  _ASSERT(m_Buffer->size() == total_size);
75 
76  delete[] buf_ptr;
77  buf_ptr = nullptr;
78 
79  fio.Close();
    // NOTE(review): the listing skips one statement here (numbering jumps from
    // 79 to 81) - check the repository copy for what follows Close().
81  m_Index = 0;
82 }
83 
/// Constructs a variant list for @a chr_name from an in-memory byte buffer.
///
/// @a data is copied into a newly allocated vector<char> held by m_Buffer,
/// and m_Index is reset to 0.
///
/// @param chr_name  chromosome name, stored in m_ChrName
/// @param data      bytes to copy into the internal buffer
84 CVCFVariantList::CVCFVariantList(const string& chr_name, const vector<char>& data)
85  : m_ChrName(chr_name)
86 {
87  m_Buffer = new vector<char>(data.begin(), data.end());
    // NOTE(review): the listing skips one statement here (numbering jumps from
    // 87 to 89) - check the repository copy.
89  m_Index = 0;
90 }
91 
/// Constructs a variant list from a blob written in the previous format
/// version and re-packs it into m_Buffer with m_MaxFeatLength added.
///
/// Steps visible in this function:
///  1. reject any @a version other than s_GetPreviousVersion();
///  2. deserialize the index vectors, the ID column and the "END" column;
///  3. if an "END" column was found, derive m_MaxFeatLength from the END
///     values and the stored start positions (refused above 100000 variants);
///  4. write idx_size, nr_cols and m_MaxFeatLength into a new buffer, followed
///     by the rest of @a data after its first two size_t fields.
///
/// @param data     serialized blob in the previous format
/// @param version  format version the blob was written with
/// @throws CColumnarVCFException (eConversionFailed) on a version mismatch or
///         when an "END" column exists and Count() exceeds 100000.
92 CVCFVariantList::CVCFVariantList(const vector<char>& data, const string& version)
93 {
94  if (version != s_GetPreviousVersion()) {
95  NCBI_THROW(CColumnarVCFException, eConversionFailed, "Trying to open a project with version: " + version);
96  }
97 
98  const unsigned char* buf_ptr = (const unsigned char*)data.data();
99  size_t nr_cols = 0;
100  bool skip_max_feat_length = true;
    // buf_ptr and nr_cols are passed by reference here; x_DeserializeColumn
    // below receives buf_ptr by value.
101  x_DeserializeIndexVectors(buf_ptr, nr_cols, skip_max_feat_length);
102 
103  x_DeserializeColumn(CVCFVariantsBase::sm_ID, buf_ptr, nr_cols);
104  auto var_it = m_Descriptors[CVCFVariantsBase::sm_ID].begin();
    // NOTE(review): the listing skips statements here (numbering jumps from 104
    // to 108); var_it is not used in any line that is visible.
108 
109  m_MaxFeatLength = 0;
110  bool found_end_col = x_DeserializeColumn("END", buf_ptr, nr_cols);
111  if (!found_end_col) {
112  // when the 'END' Info subfield is not present, the data can be converted with m_MaxFeatLength = 0
113  }
114  else {
115  // When 'END' subfield is present, we need to fetch the start positions of the variants to
116  // calculate m_MaxFeatLength. This is time expensive due to the way these positions are stored.
117  // Therefore we will convert only those VCF blobs that don't contain more than a given number of variants
118  if (Count() > 100000) {
119  NCBI_THROW(CColumnarVCFException, eConversionFailed,
120  "Too many VCF variants to be converted: " + NStr::UIntToString(Count()));
121  }
122  else {
123  // calculate the max feat length
124  auto start = chrono::steady_clock::now();
125  const auto& end_vector = m_Descriptors.GetInfoField("END");
126 
127  auto end_it = end_vector.begin();
128  auto end_stop_it = end_vector.end();
129  m_MaxFeatLength = 0;
130 
131  for (; end_it != end_stop_it; ++end_it) {
132  if (!end_it.is_null()) {
133  try {
134  unsigned end_point = NStr::StringToUInt(*end_it);
135  unsigned start_point = m_Posindexmap.GetPositionForIndex(end_it.pos());
    // NOTE(review): the subtraction is done on unsigned operands and only then
    // converted to int; the sign flip below relies on that conversion turning
    // a wrapped value negative when END precedes the start position.
136  int tmp_delta = end_point - start_point + 1;
137  if (tmp_delta < 0) {
138  tmp_delta = tmp_delta * (-1);
139  }
140  m_MaxFeatLength = max((unsigned)tmp_delta, m_MaxFeatLength);
141  }
    // A CException raised for this entry is dropped silently, so the entry
    // simply does not contribute to the maximum.
142  catch (const CException&) {}
143  }
144  }
145  auto diff = chrono::steady_clock::now() - start;
146  LOG_POST(Info << "Finding maxfeat length for " << Count() << " variants took: "
147  << chrono::duration_cast<chrono::milliseconds>(diff).count() << "ms");
148  }
149  }
150 
151  // store the serialized data in the current format
152  size_t total_size = data.size() + sizeof(unsigned); // for additionally saving m_MaxFeatLength
153 
154  m_Buffer = new vector<char>(total_size);
155  char* write_ptr = m_Buffer->data();
    // Rewind the read pointer to the start of the input blob.
156  buf_ptr = (const unsigned char*)data.data();
157 
158  // save the number of index vectors
159  size_t idx_size = GetNumberOfIndexVecs();
160  memcpy(write_ptr, &idx_size, sizeof(idx_size));
161  write_ptr += sizeof(idx_size);
    // Skip the corresponding field in the input; the freshly computed value
    // was written instead of copying it.
162  buf_ptr += sizeof(idx_size);
163 
164  // save the number of string columns
165  memcpy(write_ptr, &nr_cols, sizeof(nr_cols));
166  write_ptr += sizeof(nr_cols);
167  buf_ptr += sizeof(nr_cols);
168 
169  // save the maximum feature length
170  memcpy(write_ptr, &m_MaxFeatLength, sizeof(m_MaxFeatLength));
171  write_ptr += sizeof(m_MaxFeatLength);
172 
    // Copy everything that followed the two size_t fields in the input.
    // NOTE(review): the length expression wraps around if data.size() is
    // smaller than two size_t values - confirm callers never pass such a blob.
173  memcpy(write_ptr, buf_ptr, data.size() - sizeof(idx_size) - sizeof(nr_cols));
174 }
175 
177 {
178  // delete the actual data as well
179  if (m_Buffer) {
180  delete m_Buffer;
181  m_Buffer = nullptr;
182  }
183 }
184 
/// Parses one tab-delimited VCF data line and appends its values to the
/// position index map and to the descriptor columns.
///
/// The POS column is registered in m_Posindexmap under the current m_Index.
/// The ID column is split on ';'; when it holds several IDs they are sorted
/// and de-duplicated, and one record per ID is stored, each repeating the
/// remaining column values of the line. m_Index is advanced once per stored
/// record.
///
/// NOTE(review): several statements of this function are absent from this
/// listing (see the gaps in the numbering); the comments below only describe
/// the lines that are visible.
///
/// @param line  a single VCF data line
/// @throws CColumnarVCFException (eParsePosFailed) when the POS value cannot
///         be registered.
185 void CVCFVariantList::ParseLine(const string& line)
186 {
187  size_t pos = line.find("\t");
    // The first column is expected to be this list's chromosome name.
188  _ASSERT(m_ChrName == line.substr(0, pos));
189 
192  }
193 
194  string remainder = line;
195  // parsing the POS column
196  string value = x_ParseNextColumn(remainder, pos);
    // NOTE(review): the statement that converts 'value' is missing from the
    // listing; the check below reads errno, presumably set by that conversion.
198  bool added = (errno == 0);
199  if (added) {
200  added = m_Posindexmap.Add(m_StopPos, static_cast<unsigned int>(m_Index));
202  }
203  if (!added) {
204  NCBI_THROW(CColumnarVCFException, eParsePosFailed, "Failed to parse POS value:\n" + line);
205  }
206 
207  // parsing the ID column
208  value = x_ParseNextColumn(remainder, pos);
209  vector<string> var_ids;
210  if (value.find(";") == NPOS) { // there is a single variant ID
211  var_ids.push_back(value);
212  }
213  else {
214  NStr::Split(value, ";", var_ids);
215  for (auto& it : var_ids) {
217  }
    // Sort, then drop repeated IDs so each ID yields exactly one record.
218  sort(var_ids.begin(), var_ids.end());
219  var_ids.erase(unique(var_ids.begin(), var_ids.end()), var_ids.end());
220  }
221 
222  _ASSERT(!var_ids.empty());
223 
    // The first entry of s_GetAllColNames() receives the variant ID.
224  const auto& col_names = CVCFVariantsBase::s_GetAllColNames();
225  auto col_it = col_names.begin();
226  m_Descriptors.PushBack(*col_it, var_ids[0]);
228 
229  if (var_ids.size() == 1) {
    // All columns but the last are read one tab-delimited field at a time.
230  for (++col_it; col_it != col_names.end() - 1; ++col_it) {
231  value = x_ParseNextColumn(remainder, pos);
232  m_Descriptors.PushBack(*col_it, value);
233  }
    // The last column takes whatever follows the current tab, unsplit
    // (empty when no tab is left).
234  if (pos == NPOS) {
235  value = kEmptyStr;
236  }
237  else {
238  value = remainder.substr(pos + 1, NPOS);
239  }
240  m_Descriptors.PushBack(*col_it, value);
241  }
242  else {
    // Same column walk as above, but the values are also kept in tmp_values
    // so they can be repeated for every additional ID.
243  vector<string> tmp_values;
244  for (++col_it; col_it != col_names.end() - 1; ++col_it) {
245  value = x_ParseNextColumn(remainder, pos);
246  m_Descriptors.PushBack(*col_it, value);
247  tmp_values.push_back(value);
248  }
249  if (pos == NPOS) {
250  value = kEmptyStr;
251  }
252  else {
253  value = remainder.substr(pos + 1, NPOS);
254  }
255  m_Descriptors.PushBack(*col_it, value);
256  tmp_values.push_back(value);
257 
    // One extra record per remaining ID, registered at the same m_StopPos.
258  for (size_t index = 1; index < var_ids.size(); ++index) {
259  m_Index++;
    // NOTE(review): unlike the first Add() above, this result is not checked.
260  m_Posindexmap.Add(m_StopPos, static_cast<unsigned int>(m_Index));
262 
263  col_it = col_names.begin();
264  m_Descriptors.PushBack(*col_it, var_ids[index]);
265 
266  size_t n = 0;
267  for (++col_it; col_it != col_names.end() && n < tmp_values.size(); ++col_it, ++n) {
268  value = tmp_values[n];
269  m_Descriptors.PushBack(*col_it, value);
270  }
271  }
272  }
273 
274  // prepare the index for the next line
275  ++m_Index;
276 }
277 
279 {
283 
284  m_Index = 0;
285  auto var_it = m_Descriptors[CVCFVariantsBase::sm_ID].begin();
288 }
289 
/// Advances to the next tab-delimited column of @a line and returns it.
///
/// Everything up to and including index @a pos is removed from @a line,
/// @a pos is set to the index of the first tab in what remains (NPOS if there
/// is none), and the text before that tab is returned (the whole remainder if
/// there is no tab).
///
/// @param line  in/out: remaining part of the line; shortened on every call
/// @param pos   in/out: index of the tab that ends the previous column on
///              entry, index of the tab that ends the returned column on exit
/// @return the text of the next column
290 string CVCFVariantList::x_ParseNextColumn(string& line, size_t& pos)
291 {
292  if (pos == NPOS) {
    // No tab was left after the previous column: nothing more to parse.
293  line.resize(0);
    // NOTE(review): the listing skips one statement here (numbering jumps from
    // 293 to 295), presumably an early return - check the repository copy.
295  }
296 
297  line = line.substr(pos + 1, NPOS);
298  pos = line.find("\t");
    // The strict check below is disabled, so a missing tab is tolerated.
299  /*
300  if (pos == NPOS) {
301  NCBI_THROW(CColumnarVCFException, eParseTabExpected, "Line expected to be tab delimited");
302  }
303  */
304  return line.substr(0, pos);
305 }
306 
/// Checks whether every non-null value in the ID column occurs only once.
///
/// @param copies  out: cleared first, then filled with one (id, count) entry
///                per ID that occurs more than once; count starts at 2 on the
///                second occurrence and grows by one per further occurrence
/// @return true when no repeated ID was found (@a copies is empty)
307 bool CVCFVariantList::AreVariantIdsUnique(vector<pair<string, unsigned>>& copies)
308 {
    // NOTE(review): the declaration of 'tmp' (original line 309) is missing
    // from this listing; it is used below as a container of the IDs seen so
    // far whose insert() reports whether the value was new.
310  copies.clear();
311 
312  auto it = m_Descriptors[CVCFVariantsBase::sm_ID].begin();
313  auto it_end = m_Descriptors[CVCFVariantsBase::sm_ID].end();
314  for (; it != it_end; ++it) {
315  if (!it.is_null()) {
316  auto ret = tmp.insert(*it);
    // insert() refused the value, so this ID has been seen before.
317  if (ret.second == false) {
318  string search(*it);
    // Linear scan of 'copies' for an existing entry of this ID.
319  auto copies_it = find_if(copies.begin(), copies.end(),
320  [&search](const pair<string, unsigned>& elem) { return elem.first == search; });
321  if (copies_it == copies.end()) {
    // Second occurrence overall: start the count at 2.
322  copies.emplace_back(search, 2);
323  }
324  else {
325  copies_it->second++;
326  }
327  }
328  }
329  }
330  return (copies.empty());
331 }
332 
/// Looks up positions in m_Posindexmap for the bit-vector 'indices' and
/// stores them in @a positions; nothing is looked up when 'indices' has no
/// bits set.
///
/// NOTE(review): the statements that declare and fill 'indices' (original
/// lines 335-336) are missing from this listing; judging by the function name
/// they collect the records whose variant ID is missing - confirm against the
/// repository copy.
///
/// @param positions  out: receives the result of the lookup
333 void CVCFVariantList::GetPositionsForMissingVarID(vector<unsigned>& positions)
334 {
337  if (indices.count() > 0) {
338  m_Posindexmap.Lookup(indices, positions);
339  }
340 }
341 
/// Collects the positions recorded for the variant with the given ID.
///
/// The missing-value marker is delegated to GetPositionsForMissingVarID() and
/// always reports success. For any other ID the matching record indices are
/// obtained from m_Descriptors and translated through m_Posindexmap. The
/// result is sorted and stripped of duplicates before returning.
///
/// @param variant_id  ID to search for, or CVCFVariantsBase::sm_MissingValue
/// @param positions   out: receives the positions; sorted and de-duplicated
///                    on the non-missing path
/// @return true for the missing-value marker, otherwise whether the ID was
///         found
342 bool CVCFVariantList::GetPositionsForVariant(const string& variant_id, vector<unsigned>& positions)
343 {
344  if (variant_id == CVCFVariantsBase::sm_MissingValue) {
    // This path returns without the sort/unique pass applied further down.
345  GetPositionsForMissingVarID(positions);
346  return true;
347  }
348 
    // NOTE(review): the declaration of 'indices' (original line 349) is
    // missing from this listing.
350  bool found = m_Descriptors.GetIndicesForVariant(variant_id, indices);
351  if (found) {
352  m_Posindexmap.Lookup(indices, positions);
353  }
    // Normalize the output: ascending order, each position once.
354  sort(positions.begin(), positions.end());
355  positions.erase(unique(positions.begin(), positions.end()), positions.end());
356  return found;
357 }
358 
359 const vector<char>& CVCFVariantList::GetSerializedData() const
360 {
361  if (!m_Buffer || m_Buffer->empty()) {
362  x_SerializeData();
363  }
364  return *m_Buffer;
365 }
366 
367 void CVCFVariantList::WriteSerializedData(const string& filename)
368 {
369  if (!m_Buffer || m_Buffer->empty()) {
370  x_SerializeData();
371  }
372 
373  _ASSERT(!m_Buffer->empty());
374  NVcfUtil::PrintToFile(m_Buffer->data(), m_Buffer->size(), filename);
375 }
376 
378 {
379  return (m_ChrName == other.m_ChrName) &&
380  (m_Index == other.m_Index) &&
381  (m_StartPos == other.m_StartPos) &&
382  (m_StopPos == other.m_StopPos) &&
383  (m_Descriptors == other.m_Descriptors) &&
384  (m_Posindexmap == other.m_Posindexmap);
385 }
386 
387 void CVCFVariantList::List(CNcbiOstream& out, bool only_sv_cols) const
388 {
389  for (CVCFVariant_CI var_iter(*this); var_iter; ++var_iter) {
390  out << var_iter.GetPosition() << "\t" << var_iter.GetVariantID() << "\t"
391  << var_iter.GetRef() << "\t" << var_iter.GetAlt();
392  if (!only_sv_cols) {
393  out << "\t" << var_iter.GetQual() << "\t" << var_iter.GetFilter()
394  << "\t" << var_iter.GetInfo() << "\t" << var_iter.GetFormat();
395  }
396  out << endl;
397  }
398 }
399 
401 {
402  auto samples = m_Descriptors.GetSampleNames();
403  for (const auto& it : samples) {
404  out << it << "\t";
405  }
406  out << endl;
407  for (CVCFVariant_CI var_iter(*this); var_iter; ++var_iter) {
408  out << var_iter.GetPosition() << "\t" << var_iter.GetVariantID() << "\t";
409  for (const auto& it : samples) {
410  out << var_iter.GetSample(it) << "\t";
411  }
412  out << endl;
413  }
414 }
415 
416 vector<string> CVCFVariantList::GetAllVariantIDS() const
417 {
418  vector<string> variants;
419  for (CVCFVariant_CI var_iter(*this); var_iter; ++var_iter) {
420  variants.push_back(var_iter.GetVariantID());
421  }
422  return variants;
423 }
424 
425 
427 {
428  bm::bv_statistics stat_sum;
429  stat_sum.reset();
430  out << "Resetting stat_sum...." << endl;
431  NVcfUtil::PrintStats(stat_sum, out);
432  bool Jaccard_index = false;
433 
434  const auto& col_names = CVCFVariantsBase::s_GetColNames();
435  for (const auto& it : col_names) {
436  auto& str_vector = m_Descriptors[it];
437  out << "-----------------------" << it << " vector-------------------------" << endl;
438  LOG_POST(Info << "-----------------------" << it << " vector-------------------------");
440 #if defined(BM_SCALAR_VERSION) && BM_SCALAR_VERSION >= 0x070903
441 // Slight anachronism -- if BM_SCALAR_VERSION *is* defined, it'll be
442 // at least 0x070a04.
443  out,
444 #endif
445  str_vector, Jaccard_index);
446  NVcfUtil::AddStats(stat_sum, str_vector, out);
447  }
448  m_Posindexmap.GetStatistics(stat_sum, Jaccard_index, out);
449 }
450 
452 {
453  // prefix of descriptor file names
454  string label = x_GetFilePrefix(prefix);
455  bool status = true;
456 
457  unsigned cum_memory_used = 0;
458  unsigned cum_layout_size = 0;
459 
460  if (out) {
461  *out << "\nStarting to serialize columns for chr: " << m_ChrName << endl;
462  }
463 
464  try {
465  m_Posindexmap.SerializeVectors(label, out, cum_memory_used, cum_layout_size);
466 
467  const auto& col_names = CVCFVariantsBase::s_GetColNames();
468  for (const auto& it : col_names) {
469  auto& str_vector = m_Descriptors[it];
470  NVcfUtil::SerializeColumn(str_vector, label, it, out, cum_memory_used, cum_layout_size);
471  }
472 
473  const auto info_fields = m_Descriptors.GetInfoFieldNames();
474  for (const auto& it : info_fields) {
475  auto& info_subcol = m_Descriptors.GetInfoField(it);
476  NVcfUtil::SerializeColumn(info_subcol, label, it, out, cum_memory_used, cum_layout_size);
477  }
478 
479  const auto samples = m_Descriptors.GetSampleNames();
480  for (const auto& it : samples) {
481  auto& sample_col = m_Descriptors.GetSample(it);
482  NVcfUtil::SerializeColumn(sample_col, label, it, out, cum_memory_used, cum_layout_size);
483  }
484 
485  if (out) {
486  *out << endl << "Total memory used: " << cum_memory_used << endl << endl;
487  *out << "Total layout size: " << cum_layout_size << endl;
488  }
489  }
490  catch (const CException& e) {
491  status = false;
492  LOG_POST(Info << "Serialization of data has failed: " << e.GetMsg());
493  }
494  catch (const exception& e) {
495  status = false;
496  LOG_POST(Info << "Serialization of data has failed: " << e.what());
497  }
498  return status;
499 }
500 
502 {
503  string label = x_GetFilePrefix(prefix);
504  bool status = true;
505 
506  if (out) {
507  *out << "\nStarting to deserialize blobs for chr: " << m_ChrName << endl;
508  }
509 
510  try {
512 
513  const auto& col_names = CVCFVariantsBase::s_GetColNames();
514  for (const auto& it : col_names) {
515  auto& str_vector = m_Descriptors[it];
516  NVcfUtil::DeserializeColumn(str_vector, label, it, out);
517  }
518 
519  const auto info_fields = m_Descriptors.GetInfoFieldNames();
520  for (const auto& it : info_fields) {
521  auto& info_subcol = m_Descriptors.SetInfoField(it);
522  NVcfUtil::DeserializeColumn(info_subcol, label, it, out);
523  }
524 
525  const auto samples = m_Descriptors.GetSampleNames();
526  for (const auto& it : samples) {
527  auto& sample_col = m_Descriptors.SetSample(it);
528  NVcfUtil::DeserializeColumn(sample_col, label, it, out);
529  }
530  }
531  catch (const CException& e) {
532  status = false;
533  LOG_POST(Info << "Data deserialization has failed: " << e.GetMsg());
534  }
535  catch (const exception& e) {
536  status = false;
537  LOG_POST(Info << "Data deserialization has failed: " << e.what());
538  }
539  return status;
540 }
541 
543 {
544  string label = x_GetFilePrefix(prefix);
545  bool status = true;
546  try {
547  const auto& col_names = CVCFVariantsBase::s_GetColNames();
548  for (const auto& it : col_names) {
550  }
551 
552  status = status && m_Posindexmap.RemoveSerializedOutput(label);
553 
554  const auto info_fields = m_Descriptors.GetInfoFieldNames();
555  for (const auto& it : info_fields) {
557  }
558 
559  const auto samples = m_Descriptors.GetSampleNames();
560  for (const auto& it : samples) {
562  }
563  }
564  catch (const CException& e) {
565  status = false;
566  LOG_POST(Info << "Could not delete file: " << e.GetMsg());
567  }
568  catch (const exception& e) {
569  status = false;
570  LOG_POST(Info << "Could not delete file: " << e.what());
571  }
572  return status;
573 }
574 
575 string CVCFVariantList::x_GetFilePrefix(const string& prefix) const
576 {
577  return prefix + "_" + m_ChrName;
578 }
579 
580 
581 
582 // CVCFSlicedVariants
583 
/// Constructs a sliced view over a serialized blob.
///
/// The set of column names is copied into m_ColsDecode, and m_BufferPtr is
/// pointed at the first byte of @a data.
///
/// NOTE(review): the remaining statements of the body (original lines
/// 588-589) are missing from this listing; @a range and @a only_start are not
/// used in any visible line.
/// NOTE(review): m_BufferPtr refers to the storage of @a data rather than to
/// a copy; if it is dereferenced after construction, the caller's vector has
/// to stay alive and unchanged for that long - confirm.
///
/// @param data            serialized blob to read from
/// @param range           not used in the visible lines
/// @param cols_to_decode  names of the data columns to be deserialized
/// @param only_start      not used in the visible lines
584 CVCFSlicedVariants::CVCFSlicedVariants(const vector<char>& data, const TSeqRange* range, const set<string>& cols_to_decode, bool only_start)
585  : m_ColsDecode(cols_to_decode)
586 {
587  m_BufferPtr = (const unsigned char*)data.data();
590 }
591 
/// Collects the positions recorded for the variant with the given ID.
///
/// The missing-value marker is delegated to GetPositionsForMissingVarID() and
/// always reports success. For any other ID the search in m_Descriptors is
/// tried once and, on failure, repeated a second time. Matching indices are
/// translated through m_Posindexmap, and the result is sorted and stripped of
/// duplicates.
///
/// @param variant_id  ID to search for, or CVCFVariantsBase::sm_MissingValue
/// @param positions   out: receives the positions; sorted and de-duplicated
///                    on the non-missing path
/// @return true for the missing-value marker, otherwise whether the ID was
///         found
592 bool CVCFSlicedVariants::GetPositionsForVariant(const string& variant_id, vector<unsigned>& positions)
593 {
594  if (variant_id == CVCFVariantsBase::sm_MissingValue) {
595  GetPositionsForMissingVarID(positions);
596  return true;
597  }
598 
    // NOTE(review): the declaration of 'indices' (original line 599) is
    // missing from this listing.
600  bool found = m_Descriptors.GetIndicesForVariant(variant_id, indices);
601  if (!found) {
602  // deserialize the full ID vector and search in it
    // NOTE(review): the statement that performs this deserialization
    // (original line 603) is missing from this listing.
604  found = m_Descriptors.GetIndicesForVariant(variant_id, indices);
605  }
606 
607  if (found) {
608  m_Posindexmap.Lookup(indices, positions);
609  }
    // Normalize the output: ascending order, each position once.
610  sort(positions.begin(), positions.end());
611  positions.erase(unique(positions.begin(), positions.end()), positions.end());
612  return found;
613 }
614 
/// Looks up positions in m_Posindexmap for the bit-vector 'indices' and
/// stores them in @a positions; nothing is looked up when 'indices' has no
/// bits set.
///
/// NOTE(review): the statements that deserialize the ID vector and fill
/// 'indices' (original lines 618-620) are missing from this listing - check
/// the repository copy.
///
/// @param positions  out: receives the result of the lookup
615 void CVCFSlicedVariants::GetPositionsForMissingVarID(vector<unsigned>& positions)
616 {
617  // deserialize the full ID vector and search in it
621  if (indices.count() > 0) {
622  m_Posindexmap.Lookup(indices, positions);
623  }
624 }
625 
627 {
628  for (CVCFVariant_CI var_iter(*this); var_iter; ++var_iter) {
629  out << var_iter.GetPosition() << "\t" << var_iter.GetVariantID() << "\t"
630  << var_iter.GetRef() << "\t" << var_iter.GetAlt() << "\t"
631  << var_iter.GetQual() << "\t" << var_iter.GetFilter() << "\t"
632  << var_iter.GetInfo();
633  if (var_iter.IsSetFormat()) {
634  out << "\t" << var_iter.GetFormat() << "\t" << var_iter.GetSampleCols();
635  }
636  out << endl;
637  }
638 }
639 
640 
642 
#define BM_SCALAR_VERSION
Definition: bmconst.h:254
Debugging functions (internal). Poorly documented, not well written.
Class for support low level input/output for files.
Definition: ncbifile.hpp:3476
void GetStatistics(bm::bv_statistics &stat_sum, bool Jaccard_index, CNcbiOstream &out)
void DeserializeVectors(const string &prefix, CNcbiOstream *out)
void FinalizeReading()
Flushes the insert iterators after which it remaps and optimizes each vector.
unsigned GetPositionForIndex(const size_t &index) const
void Lookup(const TSparseStrVector::bvector_type &values, vector< unsigned > &indices) const
const size_t & GetMaxIndex() const
bool Add(const unsigned &index, const unsigned &value)
void SerializeVectors(const string &prefix, CNcbiOstream *out, unsigned &cum_memory_used, unsigned &cum_layout_size)
bool RemoveSerializedOutput(const string &prefix)
void List(CNcbiOstream &out) const
virtual void GetPositionsForMissingVarID(vector< unsigned > &positions)
const unsigned char * m_BufferPtr
size_t m_NrCols
number of data columns to be deserialized
virtual bool GetPositionsForVariant(const string &variant_id, vector< unsigned > &positions)
set< string > m_ColsDecode
the name of data columns to be deserialized
CVCFSlicedVariants(const vector< char > &data, const TSeqRange *range=nullptr, const set< string > &cols_to_decode=set< string >(), bool only_start=false)
virtual bool GetPositionsForVariant(const string &variant_id, vector< unsigned > &positions)
void ParseLine(const string &line)
void GetStatistics(CNcbiOstream &out)
CVCFVariantList(const string &chr_name, bool load_all_info=true, const set< string > &info_fields=set< string >(), const map< unsigned, string > &sample_cols=map< unsigned, string >())
bool RemoveSerializedOutput(const string &prefix)
void List(CNcbiOstream &out, bool only_sv_cols=false) const
virtual void GetPositionsForMissingVarID(vector< unsigned > &positions)
const vector< char > & GetSerializedData() const
bool AreVariantIdsUnique(vector< pair< string, unsigned >> &copies)
void WriteSerializedData(const string &filename)
bool operator==(const CVCFVariantList &other) const
string x_GetFilePrefix(const string &prefix) const
string x_ParseNextColumn(string &line, size_t &pos)
void ListSamples(CNcbiOstream &out) const
vector< string > GetAllVariantIDS() const
bool SerializeVariantData(const string &prefix, CNcbiOstream *out=nullptr)
bool DeserializeAndCheck(const string &prefix, CNcbiOstream *out=nullptr)
bool x_DeserializeColumn(const string &col_name, const unsigned char *buf_ptr, const size_t &nr_cols)
void x_DeserializeIndexVectors(const unsigned char *&buf_ptr, size_t &nr_cols, bool skip_feat_length=false)
void x_DeserializeDescr_Range(const unsigned char *buf_ptr, const size_t &nr_cols, const TSeqRange *range=nullptr, const set< string > &cols_to_decode=set< string >(), bool only_start=false)
static const vector< string > & s_GetAllColNames()
contains sm_INFO, sm_SAMPLES
static const vector< string > & s_GetColNames()
does not contain sm_INFO and sm_SAMPLES
TSeqPos m_StartPos
in genomic coordinates (1-based)
static const string sm_MissingValue
TSeqPos m_StopPos
in genomic coordinates (1-based)
unsigned m_MaxFeatLength
maximum feature variant length
CVariantDescriptors m_Descriptors
bool GetIndicesForVariant(const string &variant_id, TSparseStrVector::bvector_type &indices) const
const TSparseOptVector & GetSample(const string &name) const
TSparseOptVector & SetSample(const string &name)
const TSparseOptVector & GetInfoField(const string &field_name) const
TSparseOptVector & SetInfoField(const string &field_name)
vector< string > GetInfoFieldNames() const
void FinalizeReading()
Flushes the insert iterators after which it remaps and optimizes each vector.
void PushBackPos(const unsigned &value)
Push back starting position of a variant.
vector< string > GetSampleNames() const
unsigned GetMaxFeatureLength()
Returns the maximum feature length within the set or 0 if the end points are not specified.
void PushBack(const string &label, const string &value)
Push back 'value' into the vector identified by 'label' The 'value' is not actually stored in the vec...
void GetIndicesForMissingVarID(TSparseStrVector::bvector_type &indices) const
const_iterator begin() const noexcept
Provide const iterator access to container content.
std::ofstream out("events_result.xml")
main entry point for tests
static char tmp[3200]
Definition: utf8.c:42
char data[12]
Definition: iconv.c:80
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
const string & GetMsg(void) const
Get message string.
Definition: ncbiexpt.cpp:461
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
Definition: ncbiexpt.cpp:342
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
void Close(void)
Close file.
Definition: ncbifile.cpp:6640
void Open(const string &filename, EOpenMode open_mode, EAccessMode access_mode, EShareMode share_mode=eShare)
Open file.
Definition: ncbifile.cpp:6416
Uint8 GetFileSize(void) const
Get file size.
Definition: ncbifile.cpp:6860
size_t Read(void *buf, size_t count) const
Read file.
Definition: ncbifile.cpp:6662
@ eRead
File can be read.
Definition: ncbifile.hpp:3436
@ eOpen
Open an existing file, or create a new one.
Definition: ncbifile.hpp:3426
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
#define kEmptyStr
Definition: ncbistr.hpp:123
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3452
#define NPOS
Definition: ncbistr.hpp:133
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string (in-place)
Definition: ncbistr.cpp:3192
static string UIntToString(unsigned int value, TNumToStringFlags flags=0, int base=10)
Convert UInt to string.
Definition: ncbistr.hpp:5111
static unsigned int StringToUInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to unsigned int.
Definition: ncbistr.cpp:642
@ fConvErr_NoThrow
Do not throw an exception on error.
Definition: ncbistr.hpp:285
static const char label[]
yy_size_t n
Lightweight interface for getting lines of data with minimal memory copying.
static int version
Definition: mdb_load.c:29
bool RemoveFile(const string &fname)
void SerializeColumn(SV &vec, const string &prefix, const string &col_name, CNcbiOstream *out, unsigned &cum_memory_used, unsigned &cum_layout_size)
void PrintStats(const bm::bv_statistics &sum, CNcbiOstream &out)
void AddStats(bm::bv_statistics &sum, SV &vec, CNcbiOstream &out)
void DeserializeColumn(SV &vec, const string &prefix, const string &col_name, CNcbiOstream *out)
void PrintToFile(const char *buff, size_t size, const string &fname)
string GenerateColFileName(const string &prefix, const string &col_name)
void print_svector_stat(TOut &tout, const SV &svect, bool print_sim=false)
Definition: bmdbg.h:630
range(_Ty, _Ty) -> range< _Ty >
constexpr auto sort(_Init &&init)
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
Compressed bitset (entry point to bm.h)
T max(T x_, T y_)
static const char * prefix[]
Definition: pcregrep.c:405
Structure with statistical information about memory allocation footprint, serialization projection,...
Definition: bmfunc.h:56
void reset() noexcept
Reset statistics.
Definition: bmfunc.h:94
#define _ASSERT
Modified on Mon Jul 22 05:07:59 2024 by modify_doxy.py rev. 669887