49 :
CVCFVariantsBase(load_all_info, info_fields, sample_cols), m_ChrName(chr_name)
65 char* buf_ptr =
new char[total_size];
67 fio.
Read(buf_ptr, total_size);
73 m_Buffer =
new vector<char>(buf_ptr, buf_ptr + total_size);
87 m_Buffer =
new vector<char>(data.begin(), data.end());
98 const unsigned char* buf_ptr = (
const unsigned char*)data.data();
100 bool skip_max_feat_length =
true;
111 if (!found_end_col) {
118 if (
Count() > 100000) {
124 auto start = chrono::steady_clock::now();
127 auto end_it = end_vector.
begin();
128 auto end_stop_it = end_vector.end();
131 for (; end_it != end_stop_it; ++end_it) {
132 if (!end_it.is_null()) {
136 int tmp_delta = end_point - start_point + 1;
138 tmp_delta = tmp_delta * (-1);
145 auto diff = chrono::steady_clock::now() - start;
147 << chrono::duration_cast<chrono::milliseconds>(diff).count() <<
"ms");
152 size_t total_size = data.size() +
sizeof(unsigned);
154 m_Buffer =
new vector<char>(total_size);
156 buf_ptr = (
const unsigned char*)data.data();
160 memcpy(write_ptr, &idx_size,
sizeof(idx_size));
161 write_ptr +=
sizeof(idx_size);
162 buf_ptr +=
sizeof(idx_size);
165 memcpy(write_ptr, &nr_cols,
sizeof(nr_cols));
166 write_ptr +=
sizeof(nr_cols);
167 buf_ptr +=
sizeof(nr_cols);
173 memcpy(write_ptr, buf_ptr, data.size() -
sizeof(idx_size) -
sizeof(nr_cols));
187 size_t pos = line.find(
"\t");
194 string remainder = line;
198 bool added = (errno == 0);
209 vector<string> var_ids;
211 var_ids.push_back(
value);
215 for (
auto& it : var_ids) {
218 sort(var_ids.begin(), var_ids.end());
219 var_ids.erase(unique(var_ids.begin(), var_ids.end()), var_ids.end());
225 auto col_it = col_names.begin();
229 if (var_ids.size() == 1) {
230 for (++col_it; col_it != col_names.end() - 1; ++col_it) {
243 vector<string> tmp_values;
244 for (++col_it; col_it != col_names.end() - 1; ++col_it) {
247 tmp_values.push_back(
value);
256 tmp_values.push_back(
value);
258 for (
size_t index = 1; index < var_ids.size(); ++index) {
263 col_it = col_names.begin();
267 for (++col_it; col_it != col_names.end() &&
n < tmp_values.size(); ++col_it, ++
n) {
297 line = line.substr(pos + 1,
NPOS);
298 pos = line.find(
"\t");
304 return line.substr(0, pos);
314 for (; it != it_end; ++it) {
316 auto ret =
tmp.insert(*it);
317 if (ret.second ==
false) {
319 auto copies_it = find_if(copies.begin(), copies.end(),
320 [&search](
const pair<string, unsigned>& elem) { return elem.first == search; });
321 if (copies_it == copies.end()) {
322 copies.emplace_back(search, 2);
330 return (copies.empty());
337 if (indices.count() > 0) {
354 sort(positions.begin(), positions.end());
355 positions.erase(unique(positions.begin(), positions.end()), positions.end());
390 out << var_iter.GetPosition() <<
"\t" << var_iter.GetVariantID() <<
"\t"
391 << var_iter.GetRef() <<
"\t" << var_iter.GetAlt();
393 out <<
"\t" << var_iter.GetQual() <<
"\t" << var_iter.GetFilter()
394 <<
"\t" << var_iter.GetInfo() <<
"\t" << var_iter.GetFormat();
403 for (
const auto& it : samples) {
408 out << var_iter.GetPosition() <<
"\t" << var_iter.GetVariantID() <<
"\t";
409 for (
const auto& it : samples) {
410 out << var_iter.GetSample(it) <<
"\t";
418 vector<string> variants;
420 variants.push_back(var_iter.GetVariantID());
430 out <<
"Resetting stat_sum...." << endl;
432 bool Jaccard_index =
false;
435 for (
const auto& it : col_names) {
437 out <<
"-----------------------" << it <<
" vector-------------------------" << endl;
438 LOG_POST(
Info <<
"-----------------------" << it <<
" vector-------------------------");
445 str_vector, Jaccard_index);
457 unsigned cum_memory_used = 0;
458 unsigned cum_layout_size = 0;
461 *
out <<
"\nStarting to serialize columns for chr: " <<
m_ChrName << endl;
468 for (
const auto& it : col_names) {
474 for (
const auto& it : info_fields) {
480 for (
const auto& it : samples) {
486 *
out << endl <<
"Total memory used: " << cum_memory_used << endl << endl;
487 *
out <<
"Total layout size: " << cum_layout_size << endl;
494 catch (
const exception& e) {
507 *
out <<
"\nStarting to deserialize blobs for chr: " <<
m_ChrName << endl;
514 for (
const auto& it : col_names) {
520 for (
const auto& it : info_fields) {
526 for (
const auto& it : samples) {
535 catch (
const exception& e) {
548 for (
const auto& it : col_names) {
555 for (
const auto& it : info_fields) {
560 for (
const auto& it : samples) {
568 catch (
const exception& e) {
585 : m_ColsDecode(cols_to_decode)
610 sort(positions.begin(), positions.end());
611 positions.erase(unique(positions.begin(), positions.end()), positions.end());
621 if (indices.count() > 0) {
629 out << var_iter.GetPosition() <<
"\t" << var_iter.GetVariantID() <<
"\t"
630 << var_iter.GetRef() <<
"\t" << var_iter.GetAlt() <<
"\t"
631 << var_iter.GetQual() <<
"\t" << var_iter.GetFilter() <<
"\t"
632 << var_iter.GetInfo();
633 if (var_iter.IsSetFormat()) {
634 out <<
"\t" << var_iter.GetFormat() <<
"\t" << var_iter.GetSampleCols();
#define BM_SCALAR_VERSION
Debugging functions (internal). Poorly documented, not well written.
Class supporting low-level input/output for files.
void GetStatistics(bm::bv_statistics &stat_sum, bool Jaccard_index, CNcbiOstream &out)
void DeserializeVectors(const string &prefix, CNcbiOstream *out)
void FinalizeReading()
Flushes the insert iterators after which it remaps and optimizes each vector.
unsigned GetPositionForIndex(const size_t &index) const
void Lookup(const TSparseStrVector::bvector_type &values, vector< unsigned > &indices) const
const size_t & GetMaxIndex() const
bool Add(const unsigned &index, const unsigned &value)
void SerializeVectors(const string &prefix, CNcbiOstream *out, unsigned &cum_memory_used, unsigned &cum_layout_size)
bool RemoveSerializedOutput(const string &prefix)
void List(CNcbiOstream &out) const
virtual void GetPositionsForMissingVarID(vector< unsigned > &positions)
const unsigned char * m_BufferPtr
size_t m_NrCols
number of data columns to be deserialized
virtual bool GetPositionsForVariant(const string &variant_id, vector< unsigned > &positions)
set< string > m_ColsDecode
the name of data columns to be deserialized
CVCFSlicedVariants(const vector< char > &data, const TSeqRange *range=nullptr, const set< string > &cols_to_decode=set< string >(), bool only_start=false)
virtual bool GetPositionsForVariant(const string &variant_id, vector< unsigned > &positions)
void ParseLine(const string &line)
void GetStatistics(CNcbiOstream &out)
CVCFVariantList(const string &chr_name, bool load_all_info=true, const set< string > &info_fields=set< string >(), const map< unsigned, string > &sample_cols=map< unsigned, string >())
bool RemoveSerializedOutput(const string &prefix)
void List(CNcbiOstream &out, bool only_sv_cols=false) const
virtual void GetPositionsForMissingVarID(vector< unsigned > &positions)
const vector< char > & GetSerializedData() const
bool AreVariantIdsUnique(vector< pair< string, unsigned >> &copies)
void WriteSerializedData(const string &filename)
bool operator==(const CVCFVariantList &other) const
string x_GetFilePrefix(const string &prefix) const
string x_ParseNextColumn(string &line, size_t &pos)
void ListSamples(CNcbiOstream &out) const
vector< string > GetAllVariantIDS() const
bool SerializeVariantData(const string &prefix, CNcbiOstream *out=nullptr)
bool DeserializeAndCheck(const string &prefix, CNcbiOstream *out=nullptr)
bool x_DeserializeColumn(const string &col_name, const unsigned char *buf_ptr, const size_t &nr_cols)
static const string sm_ID
void x_DeserializeIndexVectors(const unsigned char *&buf_ptr, size_t &nr_cols, bool skip_feat_length=false)
size_t GetNumberOfIndexVecs() const
void x_DeserializeDescr_Range(const unsigned char *buf_ptr, const size_t &nr_cols, const TSeqRange *range=nullptr, const set< string > &cols_to_decode=set< string >(), bool only_start=false)
static const vector< string > & s_GetAllColNames()
contains sm_INFO, sm_SAMPLES
static string s_GetPreviousVersion()
static const vector< string > & s_GetColNames()
does not contain sm_INFO and sm_SAMPLES
TSeqPos m_StartPos
in genomic coordinates (1-based)
CPosToIndex m_Posindexmap
void x_DeserializeAllData()
static const string sm_MissingValue
vector< char > * m_Buffer
TSeqPos m_StopPos
in genomic coordinates (1-based)
void x_SerializeData() const
unsigned m_MaxFeatLength
maximum feature variant length
CVariantDescriptors m_Descriptors
bool GetIndicesForVariant(const string &variant_id, TSparseStrVector::bvector_type &indices) const
bool AreInsertersReady() const
const TSparseOptVector & GetSample(const string &name) const
TSparseOptVector & SetSample(const string &name)
const TSparseOptVector & GetInfoField(const string &field_name) const
TSparseOptVector & SetInfoField(const string &field_name)
vector< string > GetInfoFieldNames() const
void FinalizeReading()
Flushes the insert iterators after which it remaps and optimizes each vector.
void PushBackPos(const unsigned &value)
Push back starting position of a variant.
vector< string > GetSampleNames() const
unsigned GetMaxFeatureLength()
Returns the maximum feature length within the set or 0 if the end points are not specified.
void PushBack(const string &label, const string &value)
Push back 'value' into the vector identified by 'label'. The 'value' is not actually stored in the vec...
void GetIndicesForMissingVarID(TSparseStrVector::bvector_type &indices) const
const_iterator begin() const noexcept
Provide const iterator access to container content.
std::ofstream out("events_result.xml")
main entry point for tests
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
const string & GetMsg(void) const
Get message string.
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
void Info(CExceptionArgs_Base &args)
void Close(void)
Close file.
void Open(const string &filename, EOpenMode open_mode, EAccessMode access_mode, EShareMode share_mode=eShare)
Open file.
Uint8 GetFileSize(void) const
Get file size.
size_t Read(void *buf, size_t count) const
Read file.
@ eOpen
Open an existing file, or create a new one.
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate spaces in a string (in-place)
static string UIntToString(unsigned int value, TNumToStringFlags flags=0, int base=10)
Convert UInt to string.
static unsigned int StringToUInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to unsigned int.
@ fConvErr_NoThrow
Do not throw an exception on error.
static const char label[]
Lightweight interface for getting lines of data with minimal memory copying.
bool RemoveFile(const string &fname)
void SerializeColumn(SV &vec, const string &prefix, const string &col_name, CNcbiOstream *out, unsigned &cum_memory_used, unsigned &cum_layout_size)
void PrintStats(const bm::bv_statistics &sum, CNcbiOstream &out)
void AddStats(bm::bv_statistics &sum, SV &vec, CNcbiOstream &out)
void DeserializeColumn(SV &vec, const string &prefix, const string &col_name, CNcbiOstream *out)
void PrintToFile(const char *buff, size_t size, const string &fname)
string GenerateColFileName(const string &prefix, const string &col_name)
void print_svector_stat(TOut &tout, const SV &svect, bool print_sim=false)
range(_Ty, _Ty) -> range< _Ty >
constexpr auto sort(_Init &&init)
Compressed bitset (entry point to bm.h)
static const char * prefix[]
Structure with statistical information about memory allocation footprint, serialization projection,...
void reset() noexcept
Reset statistics.