58 edit::CParseTextOptions options;
59 options.SetStartText(
"ID=");
60 options.SetStopText(
",");
61 m_Name = options.GetSelectedText(line);
63 options.SetStartText(
",Number=");
64 options.SetStopText(
",");
65 m_Number = options.GetSelectedText(line);
67 options.SetStartText(
",Description=\"");
68 options.SetStopText(
"\"");
77 unsigned nr_lines = 0;
82 if (nr_lines % 200 == 0 && (canceled && canceled->
IsCanceled())) {
108 edit::CParseTextOptions options;
109 options.SetStartText(
"accession=");
110 options.SetStopText(
",");
113 options.SetStopText(
">");
137 "Line starting with ##fileformat is missing",
151 if (header_line.find(
" ") !=
NPOS || header_line.find(
"\t") ==
NPOS) {
156 "Header line expected to be tab delimited",
160 vector<string> col_names;
163 bool is_unique =
false;
165 set<string> unique_strs(col_names.begin(), col_names.end());
166 is_unique = (col_names.size() == unique_strs.
size());
174 "Column names are not unique",
179 auto it = col_names.begin();
180 for (; it != col_names.end(); ++it) {
188 for (; it != col_names.end(); ++it, ++index) {
216 "Error allowance exceeded",
227 bool placed = error_cont->
PutError(err);
235 "Error allowance exceeded",
243 auto start = chrono::steady_clock::now();
246 unsigned nr_lines = 0;
247 unsigned lines_per_contig = 0;
250 string previous_chrom;
252 vector<future<void>> async_calls;
253 auto JoinOptimization = [&async_calls]()
256 for (
auto& task : async_calls) {
263 auto task_start = chrono::steady_clock::now();
265 auto opt_start = chrono::steady_clock::now();
266 var_map->FinalizeReading();
267 auto diff_opt = chrono::steady_clock::now() - opt_start;
268 LOG_POST(
Info <<
"Optimization of " << chr <<
" took " << chrono::duration_cast<chrono::milliseconds>(diff_opt).
count() <<
" ms");
270 if (on_variants_list_ready) {
271 on_variants_list_ready(*var_map);
274 auto diff_opt = chrono::steady_clock::now() - task_start;
279 while (
in.good() && !
in.eof()) {
280 if (nr_lines % 1000 == 0 && (canceled && canceled->
IsCanceled())) {
294 if (line.empty() || (!line.empty() && line[0] ==
'#')) {
298 if (line.find(
"\t") ==
NPOS) {
303 "Has been skipped as it is not tab delimited:\n" + line,
309 size_t pos = line.find(
"\t");
310 string chrom = line.substr(0, pos);
311 if (chrom != previous_chrom) {
312 if (!previous_chrom.empty()) {
314 async_calls.push_back(async(std::launch::async | std::launch::deferred, OptimizeVariantsList, std::ref(
m_ChromosomeMap.at(previous_chrom))));
318 previous_chrom = chrom;
319 lines_per_contig = 0;
324 vars_list = inserted.first->second.GetPointer();
331 chrom +
" data line found out of its block. All entries for a specific CHROM should form a contiguous block within the VCF file.",
338 if (prog_func && lines_per_contig > 0 && lines_per_contig % 500000 == 0) {
345 vars_list->ParseLine(line);
359 auto diff_parsing = chrono::steady_clock::now() - start;
360 LOG_POST(
Info <<
"Parsed " << nr_lines <<
" lines from VCF file in "
361 << chrono::duration_cast<chrono::milliseconds>(diff_parsing).
count() <<
" ms ");
370 if (!
in.eof() && !
in.good()) {
371 LOG_POST(
Error <<
"Reading cannot be completed, as input stream is corrupted");
384 if (on_variants_list_ready) {
396 if (header_line.find(
" ") !=
NPOS || header_line.find(
"\t") ==
NPOS) {
401 "Header line is expected to be tab delimited",
407 const unsigned kMandatoryCols = 8;
408 unsigned nr_tabs =
static_cast<unsigned>(
count(header_line.begin(), header_line.end(),
'\t'));
409 if (nr_tabs + 1 < kMandatoryCols) {
414 "Header line is expected to have at least 8 columns",
458 vector<CColumnarVCFReader::TSeqIdVarsListPair>
468 auto start = chrono::steady_clock::now();
470 unsigned nr_lines = 0;
471 unsigned lines_per_contig = 0;
474 size_t search_chrs = chr_list.size();
477 vector<future<void>> async_calls;
478 auto JoinOptimization = [&async_calls]()
481 for (
auto& task : async_calls) {
487 auto chr = var_map->GetChrName();
488 auto task_start = chrono::steady_clock::now();
490 auto opt_start = chrono::steady_clock::now();
491 var_map->FinalizeReading();
492 auto diff_opt = chrono::steady_clock::now() - opt_start;
493 LOG_POST(
Info <<
"Optimization of " << chr <<
" took " << chrono::duration_cast<chrono::milliseconds>(diff_opt).
count() <<
" ms");
495 if (on_variants_list_ready) {
496 on_variants_list_ready(*var_map);
499 auto diff_opt = chrono::steady_clock::now() - task_start;
503 auto CallOptimizeVarsList = [&]() {
505 async_calls.push_back(async(std::launch::async | std::launch::deferred, OptimizeVariantsList, vcf_vars));
507 OptimizeVariantsList(vcf_vars);
511 while (
in.good() && !
in.eof() && search_chrs > 0) {
512 if (nr_lines % 1000 == 0 && (canceled && canceled->
IsCanceled())) {
515 variants_list.clear();
516 return variants_list;
528 if (line.empty() || (!line.empty() && line[0] ==
'#')) {
532 if (line.find(
"\t") ==
NPOS) {
537 "Has been skipped as it is not tab delimited:\n" + line,
543 size_t pos = line.find(
"\t");
544 string chrom = line.substr(0, pos);
545 if (!vcf_vars || (vcf_vars && !
NStr::EqualCase(vcf_vars->GetChrName(), chrom))) {
546 if (prev_chrom == chrom)
550 for (
const auto& syn_it : chr_list) {
551 const auto& seq_id = syn_it.first;
552 const auto& synonyms = syn_it.second;
553 if (find_if(synonyms.begin(), synonyms.end(),
554 [&chrom](
const string& elem) { return NStr::EqualCase(chrom, elem); }) != synonyms.end()) {
558 CallOptimizeVarsList();
559 lines_per_contig = 0;
563 if (find_if(variants_list.begin(), variants_list.end(),
564 [&seq_id](
const TSeqIdVarsListPair& elem) { return (seq_id->AsFastaString() == elem.first->AsFastaString()); }) == variants_list.end()) {
566 vcf_vars = variants_list.back().second;
573 chrom +
" data line found out of its block. All entries for a specific CHROM should form a contiguous block within the VCF file.",
583 CallOptimizeVarsList();
585 vcf_vars.
Reset(
nullptr);
589 lines_per_contig = 0;
594 if (prog_func && lines_per_contig > 0 && lines_per_contig % 500000 == 0) {
599 vcf_vars->ParseLine(line);
624 auto diff_parsing = chrono::steady_clock::now() - start;
626 << chrono::duration_cast<chrono::milliseconds>(diff_parsing).
count() <<
" ms ");
631 variants_list.clear();
632 return variants_list;
635 if (!
in.good() && !
in.eof()) {
636 LOG_POST(
Error <<
"Reading cannot be completed, as input stream is corrupted");
640 variants_list.clear();
645 OptimizeVariantsList(vcf_vars);
650 if (chr_list.size() != variants_list.size()) {
651 for (
const auto& chr_it : chr_list) {
652 if (find_if(variants_list.begin(), variants_list.end(),
654 { return elem.first->Equals(*chr_it.first); }) == variants_list.end()) {
656 auto id_str = chr_it.first->AsFastaString();
661 "Chromosome " + id_str +
" is not in the file",
668 if (!on_variants_list_ready) {
669 for (
auto& var_it : variants_list) {
674 return variants_list;
679 vector<string>
names;
681 names.push_back(it.first);
700 it.second->GetStatistics(
out);
707 it.second->SerializeVariantData(prefix,
out);
714 it.second->DeserializeAndCheck(prefix,
out);
721 it.second->List(
out, only_sv_cols);
728 it.second->ListPositionVectors(
out);
Debugging functions (internal). Poorly documented, not well written.
Serialization for sparse_vector<>
void ListColumns(CNcbiOstream &out, bool only_sv_cols=false)
void SerializeToDisk(const string &prefix, CNcbiOstream *out=nullptr)
void GetStatistics(CNcbiOstream &out)
void Deserialize(const string &prefix, CNcbiOstream *out=nullptr)
void ListIndexVectors(CNcbiOstream &out)
bool m_LoadAllInfo
Flag to load every INFO field.
pair< CConstRef< objects::CSeq_id >, CRef< CVCFVariantList > > TSeqIdVarsListPair
void x_ProcessCriticalError(objects::CObjReaderLineException &err, objects::ILineErrorListener *error_cont)
vector< string > GetChromosomeNames() const
Returns a vector, holding the chrs/contigs identifiers, read from the file.
map< unsigned, string > m_SampleCols
List of SAMPLE columns parsed from the last line of the header, order is important.
unsigned x_ProcessHeaderLine(const string &header_line, unsigned line_nr, objects::ILineErrorListener *listener)
map< unsigned, string > m_LoadSamples
List of SAMPLES required to be loaded.
function< void(const string &)> TReportProgress
void x_ProcessError(objects::CObjReaderLineException &err, objects::ILineErrorListener *error_cont)
CRef< CVCFVariantList > GetVariantsForChr(const string &chr_name) const
Retrieves the variants list for a given chr/contig.
bool ReadHeader(CNcbiIstream &in, ICanceled *canceled=nullptr, objects::ILineErrorListener *listener=nullptr)
Reads only the header section of the file.
void x_GetSamplesToLoad(const string &header_line, objects::ILineErrorListener *listener, unsigned line_nr)
set< CConstRef< SVcfFieldData > > m_InfoFields
List of INFO fields parsed from the header of the file.
bool m_LoadAllSamples
Flag to load every SAMPLE column.
void x_ProcessWarning(objects::CObjReaderLineException &err, objects::ILineErrorListener *error_cont)
std::function< void(CVCFVariantList &)> TOnVCFVariantListReady
Defines a callable object, used when a variants list is processed by the reader.
void x_InterruptReading()
void x_GatherSampleColNames(const string &header_line, objects::ILineErrorListener *listener, unsigned line_nr)
unordered_map< string, CRef< CVCFVariantList > > m_ChromosomeMap
set< string > m_LoadInfoFields
List of INFO fields required to be loaded.
vector< TSeqIdVarsListPair > ReadVariantsForChrs(CNcbiIstream &in, const vector< pair< CConstRef< objects::CSeq_id >, vector< string >>> &chr_list, ICanceled *canceled=nullptr, objects::ILineErrorListener *listener=nullptr, TReportProgress prog_func=TReportProgress(), TOnVCFVariantListReady on_variants_list_ready=TOnVCFVariantListReady())
Reads a list of variants.
bool ReadData(CNcbiIstream &in, ICanceled *canceled=nullptr, objects::ILineErrorListener *listener=nullptr, TReportProgress prog_func=TReportProgress(), TOnVCFVariantListReady on_variants_list_ready=TOnVCFVariantListReady())
Reads only the data section of the file.
void Throw(void) const
Use this function to throw this object.
static CObjReaderLineException * Create(EDiagSev eSeverity, unsigned int uLine, const std::string &strMessage, EProblem eProblem=eProblem_GeneralParsingError, const std::string &strSeqId=string(""), const std::string &strFeatureName=string(""), const std::string &strQualifierName=string(""), const std::string &strQualifierValue=string(""), CObjReaderLineException::EErrCode eErrCode=eFormat, const TVecOfLines &vecOfOtherLines=TVecOfLines())
Please use this instead of the constructor because the ctor is protected.
std::string Message() const
Simple implementation of ILineReader for i(o)streams.
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
const string & GetChrName() const
static const string sm_FORMAT
Interface for testing cancellation request in a long lasting operation.
virtual bool PutError(const ILineError &)=0
Store error in the container, and return true if error was stored fine, and return false if the calle...
@ eProblem_GeneralParsingError
string SeverityStr() const
iterator_bool insert(const value_type &val)
const_iterator begin() const
const_iterator end() const
const Uint8 kAsyncVarsThreshold
std::ofstream out("events_result.xml")
main entry point for tests
static const struct name_t names[]
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
@ eDiag_Warning
Warning message.
@ eDiag_Critical
Critical error message.
void Error(CExceptionArgs_Base &args)
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
const string & GetMsg(void) const
Get message string.
void Info(CExceptionArgs_Base &args)
char PeekChar(void) const
Returns the first character of the next string without consuming it.
Uint8 GetLineNumber(void) const
Returns the current line number (counting from 1, not 0).
void Reset(void)
Reset reference object.
TObjectType * Release(void)
Release a reference to the object and return a pointer to the object.
uint64_t Uint8
8-byte (64-bit) unsigned integer
#define END_NCBI_SCOPE
End previously defined NCBI scope.
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
virtual bool IsCanceled(void) const =0
CNcbiIstream & NcbiGetlineEOL(CNcbiIstream &is, string &str, string::size_type *count=NULL)
Read from "is" to "str" the next line (taking into account platform specifics of End-of-Line)
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
static bool EqualCase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-sensitive equality of a substring with another string.
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
static string UIntToString(unsigned int value, TNumToStringFlags flags=0, int base=10)
Convert UInt to string.
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
CTempString substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
static string UInt8ToString(Uint8 value, TNumToStringFlags flags=0, int base=10)
Convert UInt8 to string.
Lightweight interface for getting lines of data with minimal memory copying.
Compressed bitset (entry point to bm.h)
std::istream & in(std::istream &in_, double &x_)
Structure to store characteristics of an INFO field It is constructed from an INFO meta-information l...
string m_Name
INFO ID (name)
string m_Description
INFO Description.
SVcfFieldData(const string &line)