NCBI C++ ToolKit
vcf_reader.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: vcf_reader.cpp 93579 2021-05-01 20:54:52Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Frank Ludwig
27  *
28  * File Description:
29  * VCF file reader
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 
36 #include <util/line_reader.hpp>
37 
42 
45 
52 
54 
55 #include <algorithm>
56 
58 
59 #define NCBI_USE_ERRCODE_X Objtools_Rd_RepMask
60 
62 BEGIN_objects_SCOPE // namespace ncbi::objects::
63 
64 // ============================================================================
65 const double CVcfReader::mMaxSupportedVersion = 4.1;
66 // ============================================================================
67 
68 // ============================================================================
69 class CVcfData
70 // ============================================================================
71 {
72 public:
75 
76  CVcfData() { m_pdQual = 0; };
77  ~CVcfData() { delete m_pdQual; };
78 
79  string m_strLine;
80  string m_strChrom;
81  int m_iPos;
82  vector<string> m_Ids;
83  string m_strRef;
84  vector<string> m_Alt;
85  double* m_pdQual;
86  string m_strFilter;
88  vector<string> m_FormatKeys;
89 // vector< vector<string> > m_GenotypeData;
91  enum SetType_t {
96  ST_MIXED
98 };
99 
100 // ----------------------------------------------------------------------------
102  const string& spectype )
103 // ----------------------------------------------------------------------------
104 {
105  static map<string, ESpecType> typemap;
106  if ( typemap.empty() ) {
107  typemap["Integer"] = eType_Integer;
108  typemap["Float"] = eType_Float;
109  typemap["Flag"] = eType_Flag;
110  typemap["Character"] = eType_Character;
111  typemap["String"] = eType_String;
112  }
113  try {
114  return typemap[spectype];
115  }
116  catch( ... ) {
120  0,
121  "CVcfReader::xProcessMetaLineInfo: Unrecognized line or record type.",
123  pErr->Throw();
124  return eType_String;
125  }
126 };
127 
128 // ----------------------------------------------------------------------------
130  const string& specnumber )
131 // ----------------------------------------------------------------------------
132 {
133  if ( specnumber == "R" ) {
135  }
136  if ( specnumber == "A" ) {
137  return eNumber_CountAlleles;
138  }
139  if ( specnumber == "G" ) {
140  return eNumber_CountGenotypes;
141  }
142  if ( specnumber == "." ) {
143  return eNumber_CountUnknown;
144  }
145  try {
146  return ESpecNumber( NStr::StringToInt( specnumber ) );
147  }
148  catch( ... ) {
152  0,
153  "CVcfReader::xProcessMetaLineInfo: Unrecognized SpecNumber type in FORAMT directive. "
154  "Recognized settings are \'A\', \'G\', \'R\', \'.\', or numeric.",
156  pErr->Throw();
157  }
158  return ESpecNumber( 0 );
159 };
160 
161 // ----------------------------------------------------------------------------
163  int flags,
164  CReaderListener* pRL):
165  CReaderBase(flags, "", "", CReadUtil::AsSeqId, pRL),
166  mActualVersion(0.0),
167  m_MetaHandled(false)
168 // ----------------------------------------------------------------------------
169 {
170 }
171 
172 
173 // ----------------------------------------------------------------------------
175 // ----------------------------------------------------------------------------
176 {
177 }
178 
179 // ----------------------------------------------------------------------------
182  ILineReader& lr,
183  ILineErrorListener* pEC )
184 // ----------------------------------------------------------------------------
185 {
186  if (!m_Meta) {
187  m_Meta.Reset( new CAnnotdesc );
188  m_Meta->SetUser().SetType().SetStr( "vcf-meta-info" );
189  }
191  if (pAnnot) {
192  xAssignTrackData(*pAnnot);
193  xAssignVcfMeta(*pAnnot);
194  }
195  return pAnnot;
196 }
197 
198 // ----------------------------------------------------------------------------
201 // ----------------------------------------------------------------------------
202 {
204  pAnnot->SetData().SetFtable();
205  return pAnnot;
206 }
207 
208 // ----------------------------------------------------------------------------
209 void
211  ILineReader& lr,
212  TReaderData& readerData)
213 // ----------------------------------------------------------------------------
214 {
215  readerData.clear();
216  string line;
217  if (!xGetLine(lr, line)) {
218  return;
219  }
220  if (xIsTrackLine(line) && m_uDataCount) {
221  xUngetLine(lr);
222  return;
223  }
224  readerData.push_back(TReaderLine{m_uLineNumber, line});
225 }
226 
227 // ----------------------------------------------------------------------------
228 void
230  const TReaderData& readerData,
231  CSeq_annot& annot)
232 // ----------------------------------------------------------------------------
233 {
234  for (auto lineInfo: readerData) {
235  const auto& line = lineInfo.mData;
236  if (mActualVersion == 0.0) {
237  bool lineContainsVersion(false);
238  xSetFileFormat(line, annot, lineContainsVersion);
239  // Note:
240  // Currently, the line format specifier is also processed as a
241  // meta line even though it really isn't, or at least it's a very
242  // different meta than all other VCF metas.
243  // Uncomment the following lines to no longer process the file format
244  // specifier as a meta.
245  //if (lineContainsVersion) {
246  //return;
247  //}
248  }
249 
250  if (xParseBrowserLine(line, annot)) {
251  return;
252  }
253  if (xProcessTrackLine(line, annot)) {
254  return;
255  }
256  if (xProcessMetaLine(line, annot)) {
257  return;
258  }
259  if (xProcessHeaderLine(line, annot)) {
260  return;
261  }
262  if (xProcessDataLine(line, annot)) {
263  ++m_uDataCount;
264  return;
265  }
266  CReaderMessage warning(
269  "CVcfReader::ReadSeqAnnot: Unrecognized line or record type.");
270  m_pMessageHandler->Report(warning);
271  }
272 }
273 
274 // ----------------------------------------------------------------------------
275 bool
277  const string& line,
278  CSeq_annot& annot)
279 // ----------------------------------------------------------------------------
280 {
281  if ( ! NStr::StartsWith( line, "##" ) ) {
282  if ( !m_MetaDirectives.empty() && !m_MetaHandled ) {
283  m_Meta->SetUser().AddField("meta-information", m_MetaDirectives);
284  }
285  m_MetaHandled = true;
286  return false;
287  }
288  m_MetaDirectives.push_back(line.substr(2));
289 
290  if (xProcessMetaLineInfo(line, annot)) {
291  return true;
292  }
293  if (xProcessMetaLineFilter(line, annot)) {
294  return true;
295  }
296  if (xProcessMetaLineFormat(line, annot)) {
297  return true;
298  }
299  return true;
300 }
301 
302 // ----------------------------------------------------------------------------
303 void
305  const string& line,
306  CSeq_annot& annot,
307  bool& lineContainsVersion)
308 // ----------------------------------------------------------------------------
309 {
310  const string prefix = "##fileformat=VCFv";
311 
312  if (!NStr::StartsWith(line, prefix)) {
313  CReaderMessage warning(
316  string("CVcfReader::xProcessMetaLineFileFormat: ") +
317  "Missing VCF version string. Assuming VCFv" +
319  ". Proceed with care!");
320  m_pMessageHandler->Report(warning);
322  lineContainsVersion = false;
323  return;
324  }
325 
326  lineContainsVersion = true;
327 
328  string versionStr = line.substr(prefix.length(), string::npos);
329  try {
330  mActualVersion = NStr::StringToDouble(versionStr);
331  }
332  catch (std::exception except) {
333  CReaderMessage warning(
336  string("CVcfReader::xProcessMetaLineFileFormat: ") +
337  "Data file contains an unrecognized version string \"" +
338  versionStr +
339  "\". Assuming VCFv" +
341  ". Proceed with care!");
342  m_pMessageHandler->Report(warning);
344  return;
345  }
346 
348  CReaderMessage warning(
351  string("CVcfReader::xProcessMetaLineFileFormat: Data file format \"") +
352  versionStr +
353  "\" exceeds reader supported format \"" +
355  "\". Proceed with care!");
356  m_pMessageHandler->Report(warning);
358  return;
359  }
360 }
361 
362 // ----------------------------------------------------------------------------
363 bool
365  const string& line,
366  CSeq_annot& annot)
367 // ----------------------------------------------------------------------------
368 {
369  const string prefix = "##INFO=<";
370  const string postfix = ">";
371 
372  if ( ! NStr::StartsWith( line, prefix ) || ! NStr::EndsWith( line, postfix ) ) {
373  return false;
374  }
375 
376  try {
377  vector<string> fields;
378  string key, id, numcount, type, description;
379  string info = line.substr(
380  prefix.length(), line.length() - prefix.length() - postfix.length() );
381  NStr::Split( info, ",", fields );
382  NStr::SplitInTwo( fields[0], "=", key, id );
383  if ( key != "ID" ) {
386  eDiag_Error,
387  0,
388  "CVcfReader::xProcessMetaLineInfo: ##INFO with bad or missing \"ID\".",
390  pErr->Throw();
391  }
392  NStr::SplitInTwo( fields[1], "=", key, numcount );
393  if ( key != "Number" ) {
396  eDiag_Error,
397  0,
398  "CVcfReader::xProcessMetaLineInfo: ##INFO with bad or missing \"Number\".",
400  pErr->Throw();
401  }
402  NStr::SplitInTwo( fields[2], "=", key, type );
403  if ( key != "Type" ) {
406  eDiag_Error,
407  0,
408  "CVcfReader::xProcessMetaLineInfo: ##INFO with bad or missing \"Type\".",
410  pErr->Throw();
411  }
412  NStr::SplitInTwo( fields[3], "=", key, description );
413  if ( key != "Description" ) {
416  eDiag_Error,
417  0,
418  "CVcfReader::xProcessMetaLineInfo: ##INFO with bad or missing \"Description\".",
420  pErr->Throw();
421  }
422  m_InfoSpecs[id] = CVcfInfoSpec( id, numcount, type, description );
423  }
424  catch (CObjReaderLineException& err) {
425  ProcessError(err, nullptr);
426  }
427  return true;
428 }
429 
430 // ----------------------------------------------------------------------------
431 bool
433  const string& line,
434  CSeq_annot& annot)
435 // ----------------------------------------------------------------------------
436 {
437  const string prefix = "##FILTER=<";
438  const string postfix = ">";
439 
440  if ( ! NStr::StartsWith( line, prefix ) || ! NStr::EndsWith( line, postfix ) ) {
441  return false;
442  }
443 
444  try {
445  vector<string> fields;
446  string key, id, description;
447  string info = line.substr(
448  prefix.length(), line.length() - prefix.length() - postfix.length() );
449  NStr::Split( info, ",", fields );
450  NStr::SplitInTwo( fields[0], "=", key, id );
451  if ( key != "ID" ) {
454  eDiag_Error,
455  0,
456  "CVcfReader::xProcessMetaLineInfo: ##FILTER with bad or missing \"ID\".",
458  pErr->Throw();
459  }
460  NStr::SplitInTwo( fields[1], "=", key, description );
461  if ( key != "Description" ) {
464  eDiag_Error,
465  0,
466  "CVcfReader::xProcessMetaLineInfo: ##FILTER with bad or missing \"Description\".",
468  pErr->Throw();
469  }
470  m_FilterSpecs[id] = CVcfFilterSpec( id, description );
471  }
472  catch (CObjReaderLineException& err) {
473  ProcessError(err, nullptr);
474  }
475  return true;
476 }
477 
478 // ----------------------------------------------------------------------------
479 bool
481  const string& line,
482  CSeq_annot& annot)
483 // ----------------------------------------------------------------------------
484 {
485  const string prefix = "##FORMAT=<";
486  const string postfix = ">";
487 
488  if ( ! NStr::StartsWith( line, prefix ) || ! NStr::EndsWith( line, postfix ) ) {
489  return false;
490  }
491 
492  try {
493  vector<string> fields;
494  string key, id, numcount, type, description;
495  string info = line.substr(
496  prefix.length(), line.length() - prefix.length() - postfix.length() );
497  NStr::Split( info, ",", fields );
498  NStr::SplitInTwo( fields[0], "=", key, id );
499  if ( key != "ID" ) {
502  eDiag_Error,
503  0,
504  "CVcfReader::xProcessMetaLineInfo: ##FORMAT with bad or missing \"ID\".",
506  pErr->Throw();
507  }
508  NStr::SplitInTwo( fields[1], "=", key, numcount );
509  if ( key != "Number" ) {
512  eDiag_Error,
513  0,
514  "CVcfReader::xProcessMetaLineInfo: "
515  "##FORMAT with bad or missing \"Number\".",
517  pErr->Throw();
518  }
519  NStr::SplitInTwo( fields[2], "=", key, type );
520  if ( key != "Type" ) {
523  eDiag_Error,
524  0,
525  "CVcfReader::xProcessMetaLineInfo: "
526  "##FORMAT with bad or missing \"Type\".",
528  pErr->Throw();
529  }
530  NStr::SplitInTwo( fields[3], "=", key, description );
531  if ( key != "Description" ) {
534  eDiag_Error,
535  0,
536  "CVcfReader::xProcessMetaLineInfo: "
537  "##FORMAT with bad or missing \"Description\".",
539  pErr->Throw();
540  }
541  m_FormatSpecs[id] = CVcfFormatSpec( id, numcount, type, description );
542  }
543  catch (CObjReaderLineException& err) {
544  ProcessError(err, nullptr);
545  }
546  return true;
547 }
548 
549 // ----------------------------------------------------------------------------
550 bool
552  const string& line,
553  CSeq_annot& annot )
554 // ----------------------------------------------------------------------------
555 {
556  if ( ! NStr::StartsWith( line, "#CHROM" ) ) {
557  return false;
558  }
559 
560  //
561  // Per spec:
562  // The header line provides the column headers for the data records that follow.
563  // the first few are fixed and mandatory: CHROM .. FILTER.
564  // If genotype data is present this is followed by the FORMAT header.
565  // After that come the various headers for the genotype information, and these
566  // need to be preserved:
567  //
569  vector<string>::iterator pos_format = find(
570  m_GenotypeHeaders.begin(), m_GenotypeHeaders.end(), "FORMAT");
571  if ( pos_format == m_GenotypeHeaders.end() ) {
572  m_GenotypeHeaders.clear();
573  }
574  else {
575  m_GenotypeHeaders.erase( m_GenotypeHeaders.begin(), pos_format+1 );
576  m_Meta->SetUser().AddField("genotype-headers", m_GenotypeHeaders);
577  }
578 
579  //
580  // The header line signals the end of meta information, so migrate the
581  // accumulated meta information into the seq descriptor:
582  //
583  return true;
584 }
585 
586 // ----------------------------------------------------------------------------
587 bool
589  CSeq_annot& annot)
590 // ----------------------------------------------------------------------------
591 {
592  if (m_Meta && m_Meta->IsUser() && m_Meta->GetUser().IsSetData()) {
593  if (!annot.IsSetDesc()) {
595  annot.SetDesc(*desc);
596  }
597  annot.SetDesc().Set().push_back( m_Meta );
598  }
599  //else { // VCF input ought to include a header
600  // CReaderMessage warning(
601  // eDiag_Warning,
602  // m_uLineNumber,
603  // "CVcfReader::xAssignVcfMeta: Missing VCF header data.");
604  // m_pMessageHandler->Report(warning);
605  //}
606  return true;
607 }
608 
609 // ----------------------------------------------------------------------------
610 bool
612  const string& line,
613  CSeq_annot& annot)
614 // ----------------------------------------------------------------------------
615 {
616  if ( NStr::StartsWith( line, "#" ) ) {
617  return false;
618  }
619 
620  CVcfData data;
621  if (!xParseData(line, data, nullptr)) {
622  return false;
623  }
624  CRef<CSeq_feat> pFeat( new CSeq_feat );
625  pFeat->SetData().SetVariation().SetData().SetSet().SetType(
626  CVariation_ref::C_Data::C_Set::eData_set_type_package );
627  pFeat->SetData().SetVariation().SetVariant_prop().SetVersion( 5 );
628  CSeq_feat::TExt& ext = pFeat->SetExt();
629  ext.SetType().SetStr( "VcfAttributes" );
630 
631  if (!xAssignFeatureLocationSet(data, pFeat)) {
632  return false;
633  }
634  if (!xAssignVariationIds(data, pFeat)) {
635  return false;
636  }
637  if (!xAssignVariationAlleleSet(data, pFeat)) {
638  return false;
639  }
640  if (!xProcessScore(data, pFeat)) {
641  return false;
642  }
643  if (!xProcessFilter(data, pFeat)) {
644  return false;
645  }
646  if (!xProcessInfo( data, pFeat)) {
647  return false;
648  }
649  if (!xProcessFormat(data, pFeat)) {
650  return false;
651  }
652 
653  if ( pFeat->GetExt().GetData().empty() ) {
654  pFeat->ResetExt();
655  }
656  annot.SetData().SetFtable().push_back(pFeat);
657  return true;
658 }
659 
660 // ----------------------------------------------------------------------------
661 bool
663  const CVcfData& data,
664  CRef<CSeq_feat> pFeature )
665 // ----------------------------------------------------------------------------
666 {
668  pFeature->SetData().SetVariation().SetData().SetSet().SetVariations();
669 
670  //make one variation for the reference
671  CRef<CVariation_ref> pIdentity(new CVariation_ref);
672  vector<string> variant;
673 
674  switch(data.m_SetType) {
676  pIdentity->SetDeletion();
677  break;
678  default:
679  variant.push_back(data.m_strRef);
680  pIdentity->SetSNV(variant, CVariation_ref::eSeqType_na);
681  break;
682  }
683  CVariation_inst& instance = pIdentity->SetData().SetInstance();
686  if (data.m_SetType != CVcfData::ST_ALL_INS) {
687  variants.push_back(pIdentity);
688  }
689 
690  bool no_alt = true;
691  for (unsigned int i=0; i < data.m_Alt.size(); ++i) {
692  if (!NStr::Equal(data.m_Alt[i],".")) {
693  no_alt = false;
694  }
695  }
696  if (no_alt) {
698  return true;
699  }
700 
701 
702  //add additional variations, one for each alternative
703  for (unsigned int i=0; i < data.m_Alt.size(); ++i) {
704  if (NStr::Equal(data.m_Alt[i],".")) {
705  continue;
706  }
707  switch(data.m_SetType) {
708  default:
709  if (!xAssignVariantDelins(data, i, pFeature)) {
710  return false;
711  }
712  break;
714  if (!xAssignVariantSnv(data, i, pFeature)) {
715  return false;
716  }
717  break;
719  if (!xAssignVariantMnv(data, i, pFeature)) {
720  return false;
721  }
722  break;
724  if (!xAssignVariantIns(data, i, pFeature)) {
725  return false;
726  }
727  break;
729  if (!xAssignVariantDel(data, i, pFeature)) {
730  return false;
731  }
732  break;
733  }
734  }
735  return true;
736 }
737 
738 // ----------------------------------------------------------------------------
739 bool
741  const CVcfData& data,
742  unsigned int index,
743  CRef<CSeq_feat> pFeature )
744 // ----------------------------------------------------------------------------
745 {
747  pFeature->SetData().SetVariation().SetData().SetSet().SetVariations();
748 
749  CRef<CVariation_ref> pVariant(new CVariation_ref);
750  {{
751  vector<string> variant;
752  variant.push_back(data.m_Alt[index]);
753  pVariant->SetSNV(variant, CVariation_ref::eSeqType_na);
754  }}
755  variants.push_back(pVariant);
756  return true;
757 }
758 
759 // ----------------------------------------------------------------------------
760 bool
762  const CVcfData& data,
763  unsigned int index,
764  CRef<CSeq_feat> pFeature )
765 // ----------------------------------------------------------------------------
766 {
768  pFeature->SetData().SetVariation().SetData().SetSet().SetVariations();
769 
770  CRef<CVariation_ref> pVariant(new CVariation_ref);
771  {{
772  vector<string> variant;
773  variant.push_back(data.m_Alt[index]);
774  pVariant->SetMNP(variant, CVariation_ref::eSeqType_na);
775  }}
776  variants.push_back(pVariant);
777  return true;
778 }
779 
780 // ----------------------------------------------------------------------------
781 static void
783  CVariation_inst& instance )
784 {
785  CRef<CDelta_item> pItem(new CDelta_item);
786 
787  pItem->SetSeq().SetThis();
789  pItem->SetAction(CDelta_item::eAction_del_at);
790  instance.SetDelta().push_back(pItem);
791 }
792 
793 
794 // ----------------------------------------------------------------------------
795 bool
797  const CVcfData& data,
798  unsigned int index,
799  CRef<CSeq_feat> pFeature )
800 // ----------------------------------------------------------------------------
801 {
803  pFeature->SetData().SetVariation().SetData().SetSet().SetVariations();
804 
805  CRef<CVariation_ref> pVariant(new CVariation_ref);
806  {{
807  //pVariant->SetData().SetNote("DEL");
808  pVariant->SetDeletion();
809  CVariation_inst& instance = pVariant->SetData().SetInstance();
810  s_AddDeleteDeltaItem(instance);
811 
812  }}
813  variants.push_back(pVariant);
814  return true;
815 }
816 
817 // ----------------------------------------------------------------------------
818 bool
820  const CVcfData& data,
821  unsigned int index,
822  CRef<CSeq_feat> pFeature )
823 // ----------------------------------------------------------------------------
824 {
826  pFeature->SetData().SetVariation().SetData().SetSet().SetVariations();
827 
828  CRef<CVariation_ref> pVariant(new CVariation_ref);
829  {{
830  string insertion(data.m_Alt[index]);
831  CRef<CSeq_literal> pLiteral(new CSeq_literal);
832  pLiteral->SetSeq_data().SetIupacna().Set(insertion);
833  pLiteral->SetLength(
834  static_cast<TSeqPos>(insertion.size()));
835  CRef<CDelta_item> pItem(new CDelta_item);
836  pItem->SetAction(CDelta_item::eAction_ins_before);
837  pItem->SetSeq().SetLiteral(*pLiteral);
838  CVariation_inst& instance = pVariant->SetData().SetInstance();
840  instance.SetDelta().push_back(pItem);
841  }}
842  variants.push_back(pVariant);
843  return true;
844 }
845 
846 
847 
848 // ----------------------------------------------------------------------------
849 bool
851  const CVcfData& data,
852  unsigned int index,
853  CRef<CSeq_feat> pFeature )
854 // ----------------------------------------------------------------------------
855 {
856  string insertion(data.m_Alt[index]);
857 
859  pFeature->SetData().SetVariation().SetData().SetSet().SetVariations();
860 
861  CRef<CVariation_ref> pVariant(new CVariation_ref);
862  CVariation_inst& instance = pVariant->SetData().SetInstance();
863 
864  // If it is a deletion, add Deleteion Delta-item and be done.
865  if (insertion.size() == 0) {
866 
867  s_AddDeleteDeltaItem(instance);
868  variants.push_back(pVariant);
869  return true;
870  }
871 
872  // Must be a SNV or delins
873  CRef<CSeq_literal> pLiteral(new CSeq_literal);
874  pLiteral->SetSeq_data().SetIupacna().Set(insertion);
875  pLiteral->SetLength(static_cast<TSeqPos>(insertion.size()));
876 
877  CRef<CDelta_item> pItem(new CDelta_item);
878  pItem->SetSeq().SetLiteral(*pLiteral);
879  instance.SetDelta().push_back(pItem);
880 
881  //Let's try to smartly set the Type.
882  if (insertion.size() == 1 && data.m_strRef.size() == 1) {
884  } else {
886  }
887 
888  variants.push_back(pVariant);
889  return true;
890 
891 }
892 
893 // ----------------------------------------------------------------------------
894 bool
896  const string& line,
897  CVcfData& data,
898  ILineErrorListener* pEC)
899 // ----------------------------------------------------------------------------
900 {
901  vector<string> columns;
903  if ( columns.size() < 8 ) {
904  return false;
905  }
906  try {
907  data.m_strLine = line;
908 
909  data.m_strChrom = columns[0];
910  data.m_iPos = NStr::StringToInt( columns[1] );
911  NStr::Split( columns[2], ";", data.m_Ids );
912  if ( (data.m_Ids.size() == 1) && (data.m_Ids[0] == ".") ) {
913  data.m_Ids.clear();
914  }
915  data.m_strRef = columns[3];
916  NStr::Split( columns[4], ",", data.m_Alt );
917  if ( columns[5] != "." ) {
918  data.m_pdQual = new double( NStr::StringToDouble( columns[5] ) );
919  }
920  data.m_strFilter = columns[6];
921 
922  vector<string> infos;
923  if ( columns[7] != "." ) {
925  for ( vector<string>::iterator it = infos.begin();
926  it != infos.end(); ++it )
927  {
928  string key, value;
929  NStr::SplitInTwo( *it, "=", key, value );
930  data.m_Info[key] = vector<string>();
931  NStr::Split( value, ",", data.m_Info[key] );
932  }
933  }
934  if ( columns.size() > 8 ) {
936 
937  for ( size_t u=9; u < columns.size(); ++u ) {
938  vector<string> values;
940  data.m_GenotypeData[ m_GenotypeHeaders[u-9] ] = values;
941  }
942  }
943  }
944  catch ( ... ) {
947  eDiag_Error,
948  0,
949  "Unable to parse given VCF data (syntax error).",
951  ProcessError(*pErr, pEC);
952  return false;
953  }
954 
955  if (!xNormalizeData(data, pEC)) {
956  return false;
957  }
958 
959  //assign set type:
960 
961  //test for all SNVs
962  bool maybeAllSnv = (data.m_strRef.size() == 1);
963  if (maybeAllSnv) {
964  for (size_t u=0; u < data.m_Alt.size(); ++u) {
965  if (data.m_Alt[u].size() != 1) {
966  maybeAllSnv = false;
967  break;
968  }
969  }
970  if (maybeAllSnv) {
971  data.m_SetType = CVcfData::ST_ALL_SNV;
972  return true;
973  }
974  }
975 
976  //test for all mnvs:
977  bool maybeAllMnv = true;
978  size_t refSize = data.m_strRef.size();
979  for (size_t u=0; u < data.m_Alt.size(); ++u) {
980  if (data.m_Alt[u].size() != refSize) {
981  maybeAllMnv = false;
982  break;
983  }
984  }
985  if (maybeAllMnv) {
986  data.m_SetType = CVcfData::ST_ALL_MNV;
987  return true;
988  }
989 
990  //test for all insertions:
991  bool maybeAllIns = true;
992  for (size_t u=0; u < data.m_Alt.size(); ++u) {
993  if (! NStr::StartsWith(data.m_Alt[u], data.m_strRef)) {
994  maybeAllIns = false;
995  break;
996  }
997  }
998  if (maybeAllIns) {
999  data.m_SetType = CVcfData::ST_ALL_INS;
1000  return true;
1001  }
1002 
1003  //test for all deletions:
1004  // note: even it is all deletions we are not able to process them
1005  // as such because those deletions would be at different ASN1
1006  // locations. Hence we punt to "indel" if there is more than one
1007  // alternative.
1008  bool maybeAllDel = false;
1009  for (size_t u=0; u < data.m_Alt.size(); ++u) {
1010  if (data.m_Alt.size() == 1 && data.m_Alt[0].empty()) {
1011  maybeAllDel = true;
1012  }
1013  }
1014  if (maybeAllDel) {
1015  data.m_SetType = CVcfData::ST_ALL_DEL;
1016  return true;
1017  }
1018 
1019  data.m_SetType = CVcfData::ST_MIXED;
1020  return true;
1021 }
1022 
1023 
1024 // ---------------------------------------------------------------------------
1025 bool
1027  CVcfData& data,
1028  ILineErrorListener* pEC)
1029 // ---------------------------------------------------------------------------
1030 {
1031  // make sure none of the alternatives is equal to the reference:
1032  for (size_t u=0; u < data.m_Alt.size(); ++u) {
1033  if (data.m_Alt[u] == data.m_strRef) {
1036  eDiag_Error,
1037  0,
1038  "CVcfReader::xNormalizeData: Invalid alternative.",
1040  ProcessError(*pErr, pEC);
1041  return false;
1042  }
1043  }
1044 
1045  // normalize ref/alt by trimming common prefices and adjusting location
1046  bool trimComplete = false;
1047  while (!data.m_strRef.empty()) {
1048  char leadBase = data.m_strRef[0];
1049  for (size_t u=0; u < data.m_Alt.size(); ++u) {
1050  if (!NStr::StartsWith(data.m_Alt[u], leadBase)) {
1051  trimComplete = true;
1052  break;
1053  }
1054  }
1055  if (trimComplete) {
1056  break;
1057  }
1058  data.m_strRef = data.m_strRef.substr(1);
1059  for (size_t u=0; u < data.m_Alt.size(); ++u) {
1060  data.m_Alt[u] = data.m_Alt[u].substr(1);
1061  }
1062  data.m_iPos++;
1063  }
1064 
1065  // normalize ref/alt by trimming common postfixes and adjusting location
1066  trimComplete = false;
1067  size_t refSize = data.m_strRef.size();
1068  size_t trimSize = 0;
1069  while (refSize > trimSize) {
1070  string postfix = data.m_strRef.substr(refSize-1-trimSize, trimSize+1);
1071  for (size_t u=0; u < data.m_Alt.size(); ++u) {
1072  size_t altSize = data.m_Alt[u].size();
1073  if (altSize < trimSize+1) {
1074  trimComplete = true;
1075  break;
1076  }
1077  string postfixA = data.m_Alt[u].substr(altSize-1-trimSize, trimSize+1);
1078  if (postfix != postfixA) {
1079  trimComplete = true;
1080  break;
1081  }
1082  }
1083  if (trimComplete) {
1084  break;
1085  }
1086  trimSize++;
1087  }
1088  if (trimSize > 0) {
1089  data.m_strRef =
1090  data.m_strRef.substr(0, data.m_strRef.size()-trimSize);
1091  for (size_t u=0; u < data.m_Alt.size(); ++u) {
1092  data.m_Alt[u] =
1093  data.m_Alt[u].substr(0, data.m_Alt[u].size()-trimSize);
1094  }
1095  }
1096  return true;
1097 }
1098 
1099 // ---------------------------------------------------------------------------
1100 bool
1102  const CVcfData& data,
1103  CRef<CSeq_feat> pFeat )
1104 // ---------------------------------------------------------------------------
1105 {
1106  CRef<CSeq_id> pId(CReadUtil::AsSeqId(data.m_strChrom, m_iFlags));
1107 
1108  //context:
1109  // we are trying to package all the allele of this feature into a single
1110  // variation_ref, hence, they all need a common location.
1111  // Referenced location differ between the different types of variations,
1112  // so we need to find the most specific variation type that describes them
1113  // all. Once the actual variation type has been found we can set the location
1114  // accordingly.
1115 
1116  // in practice, we will choose the common variation type if it is indeed
1117  // common for all the alleles. Otherwise, we just make it a MNV.
1118 
1119  if (data.m_SetType == CVcfData::ST_ALL_SNV) {
1120  //set location for SNVs
1121  pFeat->SetLocation().SetPnt().SetPoint(data.m_iPos-1);
1122  pFeat->SetLocation().SetPnt().SetId(*pId);
1123  return true;
1124  }
1125  if (data.m_SetType == CVcfData::ST_ALL_MNV) {
1126  //set location for MNV. This will be the location of the reference
1127  pFeat->SetLocation().SetInt().SetFrom(data.m_iPos-1);
1128  pFeat->SetLocation().SetInt().SetTo(
1129  static_cast<TSeqPos>(data.m_iPos + data.m_strRef.size() - 2));
1130  pFeat->SetLocation().SetInt().SetId(*pId);
1131  return true;
1132  }
1133  if (data.m_SetType == CVcfData::ST_ALL_INS) {
1134  //set location for INSs. Will always be a point!
1135  //m_iPos points to the 1-based position of the first
1136  //nt that is unique between alt and ref
1137  pFeat->SetLocation().SetPnt().SetPoint(data.m_iPos-1);
1138  pFeat->SetLocation().SetPnt().SetId(*pId);
1139  return true;
1140  }
1141  if (data.m_SetType == CVcfData::ST_ALL_DEL) {
1142  if (data.m_strRef.size() == 1) {
1143  //deletion of a single base
1144  pFeat->SetLocation().SetPnt().SetPoint(data.m_iPos-1);
1145  pFeat->SetLocation().SetPnt().SetId(*pId);
1146  }
1147  else {
1148  pFeat->SetLocation().SetInt().SetFrom(data.m_iPos-1);
1149  //-1 for 0-based,
1150  //another -1 for inclusive end-point ( i.e. [], not [) )
1151  pFeat->SetLocation().SetInt().SetTo(
1152  static_cast<TSeqPos>(data.m_iPos -1 + data.m_strRef.length() - 1));
1153  pFeat->SetLocation().SetInt().SetId(*pId);
1154  }
1155  return true;
1156  }
1157 
1158  //default: For MNV's we will use the single starting point
1159  //NB: For references of size >=2, this location will not
1160  //match the reference allele. Future Variation-ref
1161  //normalization code will address these issues,
1162  //and obviate the need for this code altogether.
1163  if (data.m_strRef.size() == 1) {
1164  //deletion of a single base
1165  pFeat->SetLocation().SetPnt().SetPoint(data.m_iPos-1);
1166  pFeat->SetLocation().SetPnt().SetId(*pId);
1167  }
1168  else {
1169  pFeat->SetLocation().SetInt().SetFrom(data.m_iPos-1);
1170  pFeat->SetLocation().SetInt().SetTo(
1171  static_cast<TSeqPos>(data.m_iPos -1 + data.m_strRef.length() - 1));
1172  pFeat->SetLocation().SetInt().SetId(*pId);
1173  }
1174  return true;
1175 }
1176 
1177 // ----------------------------------------------------------------------------
1178 bool
1180  CVcfData& data,
1181  CRef<CSeq_feat> pFeature )
1182 // ----------------------------------------------------------------------------
1183 {
1184  CSeq_feat::TExt& ext = pFeature->SetExt();
1185  if ( data.m_pdQual ) {
1186  ext.AddField( "score", *data.m_pdQual );
1187  }
1188  return true;
1189 }
1190 
1191 // ----------------------------------------------------------------------------
1192 bool
1194  CVcfData& data,
1195  CRef<CSeq_feat> pFeature )
1196 // ----------------------------------------------------------------------------
1197 {
1198  if(!NStr::Equal(data.m_strFilter,".")) {
1199  CSeq_feat::TExt& ext = pFeature->SetExt();
1200  ext.AddField( "filter", data.m_strFilter );
1201  }
1202  return true;
1203 }
1204 
1205 // ----------------------------------------------------------------------------
1206 bool
1208  CVcfData& data,
1209  CRef<CSeq_feat> pFeature)
1210 // ----------------------------------------------------------------------------
1211 {
1212  if (!xAssignVariantProps(data, pFeature)) {
1213  return false;
1214  }
1215  CSeq_feat::TExt& ext = pFeature->SetExt();
1216  if (data.m_Info.empty()) {
1217  return true;
1218  }
1219  vector<string> infos;
1220  for ( map<string,vector<string> >::const_iterator cit = data.m_Info.begin();
1221  cit != data.m_Info.end(); cit++ )
1222  {
1223  const string& key = cit->first;
1224  vector<string> value = cit->second;
1225  if ( value.empty() ) {
1226  infos.push_back( key );
1227  }
1228  else {
1229  string joined = NStr::Join( list<string>( value.begin(), value.end() ), "," );
1230  infos.push_back( key + "=" + joined );
1231  }
1232  }
1233  ext.AddField( "info", NStr::Join( infos, ";" ) );
1234  return true;
1235 }
1236 
1237 // ----------------------------------------------------------------------------
1238 bool
1240  const string& strLine,
1241  CSeq_annot& current)
1242 // ----------------------------------------------------------------------------
1243 {
1244  if (!xIsTrackLine(strLine)) {
1245  return false;
1246  }
1247  vector<string> parts;
1248  CReadUtil::Tokenize( strLine, " \t", parts );
1249  if (parts.size() >= 3) {
1250  const string digits("0123456789");
1251  bool col2_is_numeric =
1252  (string::npos == parts[1].find_first_not_of(digits));
1253  bool col3_is_numeric =
1254  (string::npos == parts[2].find_first_not_of(digits));
1255  if (col2_is_numeric && col3_is_numeric) {
1256  return false;
1257  }
1258  }
1259  if (!CReaderBase::xParseTrackLine(strLine)) {
1260  CReaderMessage warning(
1261  eDiag_Warning,
1262  m_uLineNumber,
1263  "Bad track line: Expected \"track key1=value1 key2=value2 ...\". Ignored.");
1264  m_pMessageHandler->Report(warning);
1265  }
1266  return true;
1267 }
1268 
1269 // ----------------------------------------------------------------------------
1270 bool
1272  CVcfData& data,
1273  CRef<CSeq_feat> pFeature )
1274 // ----------------------------------------------------------------------------
1275 {
1276  if (data.m_FormatKeys.empty()) {
1277  return true;
1278  }
1279 
1280  CSeq_feat::TExt& ext = pFeature->SetExt();
1281  ext.AddField("format", data.m_FormatKeys);
1282 
1283  CRef<CUser_field> pGenotypeData( new CUser_field );
1284  pGenotypeData->SetLabel().SetStr("genotype-data");
1285 
1286  for ( CVcfData::GTDATA::const_iterator cit = data.m_GenotypeData.begin();
1287  cit != data.m_GenotypeData.end(); ++cit) {
1288  pGenotypeData->AddField(cit->first,cit->second);
1289  }
1290  ext.SetData().push_back(pGenotypeData);
1291  return true;
1292 }
1293 
1294 
1295 // ----------------------------------------------------------------------------
1297  const vector<string>& ids,
1298  CRef<CDbtag> pDbtag) const
1299 // ----------------------------------------------------------------------------
1300 {
1301  for (const string& id : ids) {
1302  if (NStr::StartsWith(id, "rs") ||
1303  NStr::StartsWith(id, "ss") )
1304  {
1305  try {
1306  const int idval = NStr::StringToInt(id.substr(2));
1307  pDbtag->SetDb("dbSNP");
1308  pDbtag->SetTag().SetId(idval);
1309  }
1310  catch (...) {
1311  continue;
1312  }
1313  return true;
1314  }
1315  }
1316  return false;
1317 }
1318 
1319 
1320 // ----------------------------------------------------------------------------
1321 bool
1323  CVcfData& data,
1324  CRef<CSeq_feat> pFeature )
1325 // ----------------------------------------------------------------------------
1326 {
1327  if ( data.m_Ids.empty() ) {
1328  return true;
1329  }
1330  CVariation_ref& variation = pFeature->SetData().SetVariation();
1331 // CVariation_ref::TVariant_prop& var_prop = variation.SetVariant_prop();
1332 // var_prop.SetVersion( 5 );
1333 
1334  auto it = data.m_Info.find("SOURCE");
1335 
1336  if (data.m_Info.end() != it) {
1337  vector<string> sources = it->second;
1338  if (sources.size() > 0 &&
1339  NStr::Equal(sources.front(), "dbsnp"))
1340  {
1341  CRef<CDbtag> pDbtag = Ref(new CDbtag());
1342  if (xAssigndbSNPTag(data.m_Ids, pDbtag)) {
1343  variation.SetId(pDbtag.GetNCObject());
1344  return true;
1345  }
1346  }
1347  }
1348 
1349 
1350  if ( data.m_Info.find( "DB" ) != data.m_Info.end() ) {
1351  string id = data.m_Ids[0];
1352  NStr::ToLower(id);
1353  if (NStr::StartsWith(id, "rs") || NStr::StartsWith(id, "ss")) {
1354  variation.SetId().SetDb("dbSNP");
1355  }
1356  else {
1357  variation.SetId().SetDb( "dbVar" );
1358  }
1359  }
1360  else if ( data.m_Info.find( "H2" ) != data.m_Info.end() ) {
1361  variation.SetId().SetDb( "HapMap2" );
1362  }
1363  else {
1364  variation.SetId().SetDb( "local" );
1365  }
1366  variation.SetId().SetTag().SetStr( data.m_Ids[0] );
1367 
1368  for ( size_t i=1; i < data.m_Ids.size(); ++i ) {
1369  if ( data.m_Info.find( "DB" ) != data.m_Info.end()
1370  && data.m_Info.find( "H2" ) != data.m_Info.end() )
1371  {
1372  variation.SetId().SetDb( "HapMap2" );
1373  }
1374  else {
1375  variation.SetId().SetDb( "local" );
1376  }
1377  variation.SetId().SetTag().SetStr( data.m_Ids[i] );
1378  }
1379  return true;
1380 }
1381 
1382 
1383 // ----------------------------------------------------------------------------
1384 bool
1386  CVcfData& data,
1387  CRef<CSeq_feat> pFeat)
1388 // ----------------------------------------------------------------------------
1389 {
1390  typedef CVariantProperties VP;
1391 
1392  CVcfData::INFOS& infos = data.m_Info;
1393  VP& props = pFeat->SetData().SetVariation().SetVariant_prop();
1395 
1396  props.SetResource_link() = 0;
1397  props.SetGene_location() = 0;
1398  props.SetEffect() = 0;
1399  props.SetMapping() = 0;
1400  props.SetFrequency_based_validation() = 0;
1401  props.SetGenotype() = 0;
1402  props.SetQuality_check() = 0;
1403 
1404  //byte F0
1405  props.SetVersion() = 5;
1406 
1407  //superbyte F1
1408  it = infos.find("SLO");
1409  if (infos.end() != it) {
1410  props.SetResource_link() |= VP::eResource_link_submitterLinkout;
1411  infos.erase(it);
1412  }
1413  it = infos.find("S3D");
1414  if (infos.end() != it) {
1415  props.SetResource_link() |= VP::eResource_link_has3D;
1416  infos.erase(it);
1417  }
1418  it = infos.find("TPA");
1419  if (infos.end() != it) {
1420  props.SetResource_link() |= VP::eResource_link_provisional;
1421  infos.erase(it);
1422  }
1423  it = infos.find("PM");
1424  if (infos.end() != it) {
1425  props.SetResource_link() |= VP::eResource_link_preserved;
1426  infos.erase(it);
1427  }
1428  it = infos.find("CLN");
1429  if (infos.end() != it) {
1430  props.SetResource_link() |= VP::eResource_link_clinical;
1431  infos.erase(it);
1432  }
1433  //todo: INFO ID=PMC
1434  it = infos.find("PMC");
1435  if (infos.end() != it) {
1436  infos.erase(it);
1437  }
1438  it = infos.find("PMID");
1439  if (infos.end() != it) {
1440  vector<string> pmids = it->second;
1441  for (vector<string>::const_iterator cit = pmids.begin();
1442  cit != pmids.end(); ++cit)
1443  {
1444  try {
1445  string db, tag;
1446  NStr::SplitInTwo(*cit, ":", db, tag);
1447  if (db != "PM") {
1448  CReaderMessage warning(
1449  eDiag_Warning,
1450  m_uLineNumber,
1451  "CVcfReader::xAssignVariantProps: Invalid PMID database ID.");
1452  m_pMessageHandler->Report(warning);
1453  continue;
1454  }
1455  CRef<CDbtag> pDbtag(new CDbtag);
1456  pDbtag->SetDb(db);
1457  pDbtag->SetTag().SetId(
1459  pFeat->SetDbxref().push_back(pDbtag);
1460  }
1461  catch(...) {}
1462  }
1463  infos.erase(it);
1464  }
1465 
1466  xAssignVariantSource(data, pFeat);
1467 
1468  //superbyte F2
1469  it = infos.find("R5");
1470  if (infos.end() != it) {
1471  props.SetGene_location() |= VP::eGene_location_near_gene_5;
1472  infos.erase(it);
1473  }
1474  it = infos.find("R3");
1475  if (infos.end() != it) {
1476  props.SetGene_location() |= VP::eGene_location_near_gene_3;
1477  infos.erase(it);
1478  }
1479  it = infos.find("INT");
1480  if (infos.end() != it) {
1481  props.SetGene_location() |= VP::eGene_location_intron;
1482  infos.erase(it);
1483  }
1484  it = infos.find("DSS");
1485  if (infos.end() != it) {
1486  props.SetGene_location() |= VP::eGene_location_donor;
1487  infos.erase(it);
1488  }
1489  it = infos.find("ASS");
1490  if (infos.end() != it) {
1491  props.SetGene_location() |= VP::eGene_location_acceptor;
1492  infos.erase(it);
1493  }
1494  it = infos.find("U5");
1495  if (infos.end() != it) {
1496  props.SetGene_location() |= VP::eGene_location_utr_5;
1497  infos.erase(it);
1498  }
1499  it = infos.find("U3");
1500  if (infos.end() != it) {
1501  props.SetGene_location() |= CVariantProperties::eGene_location_utr_3;
1502  infos.erase(it);
1503  }
1504 
1505  it = infos.find("SYN");
1506  if (infos.end() != it) {
1507  props.SetGene_location() |= VP::eEffect_synonymous;
1508  infos.erase(it);
1509  }
1510  it = infos.find("NSN");
1511  if (infos.end() != it) {
1512  props.SetGene_location() |= VP::eEffect_stop_gain;
1513  infos.erase(it);
1514  }
1515  it = infos.find("NSM");
1516  if (infos.end() != it) {
1517  props.SetGene_location() |= VP::eEffect_missense;
1518  infos.erase(it);
1519  }
1520  it = infos.find("NSF");
1521  if (infos.end() != it) {
1522  props.SetGene_location() |= VP::eEffect_frameshift;
1523  infos.erase(it);
1524  }
1525 
1526  //byte F3
1527  it = infos.find("WGT");
1528  if (infos.end() != it) {
1529  int weight = NStr::StringToInt( infos["WGT"][0] );
1530  switch(weight) {
1531  default:
1532  break;
1533  case 1:
1534  props.SetMap_weight() = VP::eMap_weight_is_uniquely_placed;
1535  infos.erase(it);
1536  break;
1537  case 2:
1538  props.SetMap_weight() = VP::eMap_weight_placed_twice_on_same_chrom;
1539  infos.erase(it);
1540  break;
1541  case 3:
1542  props.SetMap_weight() = VP::eMap_weight_placed_twice_on_diff_chrom;
1543  infos.erase(it);
1544  break;
1545  case 10:
1546  props.SetMap_weight() = VP::eMap_weight_many_placements;
1547  break;
1548  }
1549  }
1550 
1551  it = infos.find("ASP");
1552  if (infos.end() != it) {
1553  props.SetMapping() |= VP::eMapping_is_assembly_specific;
1554  infos.erase(it);
1555  }
1556  it = infos.find("CFL");
1557  if (infos.end() != it) {
1558  props.SetMapping() |= VP::eMapping_has_assembly_conflict;
1559  infos.erase(it);
1560  }
1561  it = infos.find("OTH");
1562  if (infos.end() != it) {
1563  props.SetMapping() |= VP::eMapping_has_other_snp;
1564  infos.erase(it);
1565  }
1566 
1567  //byte F4
1568  it = infos.find("OTH");
1569  if (infos.end() != it) {
1570  props.SetFrequency_based_validation() |= VP::eFrequency_based_validation_above_5pct_all;
1571  infos.erase(it);
1572  }
1573  it = infos.find("G5A");
1574  if (infos.end() != it) {
1575  props.SetFrequency_based_validation() |= VP::eFrequency_based_validation_above_5pct_1plus;
1576  infos.erase(it);
1577  }
1578  it = infos.find("VLD");
1579  if (infos.end() != it) {
1580  props.SetFrequency_based_validation() |= VP::eFrequency_based_validation_validated;
1581  infos.erase(it);
1582  }
1583  it = infos.find("MUT");
1584  if (infos.end() != it) {
1585  props.SetFrequency_based_validation() |= VP::eFrequency_based_validation_is_mutation;
1586  infos.erase(it);
1587  }
1588  it = infos.find("GMAF");
1589  if (infos.end() != it) {
1590  props.SetAllele_frequency() = NStr::StringToDouble(infos["GMAF"][0]);
1591  infos.erase(it);
1592  }
1593 
1594  //byte F5
1595  it = infos.find("GNO");
1596  if (infos.end() != it) {
1597  props.SetGenotype() |= VP::eGenotype_has_genotypes;
1598  infos.erase(it);
1599  }
1600  it = infos.find("HD");
1601  if (infos.end() != it) {
1602  props.SetResource_link() |= VP::eResource_link_genotypeKit;
1603  infos.erase(it);
1604  }
1605 
1606  //byte F6
1607  if (infos.end() != infos.find("PH3")) {
1608  CRef<CDbtag> pDbtag(new CDbtag);
1609  pDbtag->SetDb("BioProject");
1610  pDbtag->SetTag().SetId(60835);
1611  pFeat->SetData().SetVariation().SetOther_ids().push_back(pDbtag);
1612  }
1613  if (infos.end() != infos.find("KGPhase1")) {
1614  CRef<CDbtag> pDbtag(new CDbtag);
1615  pDbtag->SetDb("BioProject");
1616  pDbtag->SetTag().SetId(28889);
1617  pFeat->SetData().SetVariation().SetOther_ids().push_back(pDbtag);
1618  }
1619 
1620  //byte F7
1621 
1622  //byte F8
1623  //no relevant information found in VCF
1624 
1625  //byte F9
1626  it = infos.find("GCF");
1627  if (infos.end() != it) {
1628  props.SetQuality_check() |= VP::eQuality_check_genotype_conflict;
1629  infos.erase(it);
1630  }
1631  it = infos.find("NOV");
1632  if (infos.end() != it) {
1633  props.SetQuality_check() |= VP::eQuality_check_non_overlapping_alleles;
1634  infos.erase(it);
1635  }
1636  it = infos.find("WTD");
1637  if (infos.end() != it) {
1638  props.SetQuality_check() |= VP::eQuality_check_withdrawn_by_submitter;
1639  infos.erase(it);
1640  }
1641  it = infos.find("NOC");
1642  if (infos.end() != it) {
1643  props.SetQuality_check() |= VP::eQuality_check_contig_allele_missing;
1644  infos.erase(it);
1645  }
1646  return true;
1647 }
1648 
1649 
1650 // ----------------------------------------------------------------------------
1652  CRef<CSeq_feat> pFeat)
1653 // ----------------------------------------------------------------------------
1654 {
1655  CVcfData::INFOS& infos = data.m_Info;
1656  auto it = infos.find("SOURCE");
1657  if (infos.end() != it) {
1658  vector<string> sources = it->second;
1659  if (sources.size() > 0 &&
1660  NStr::Equal(sources.front(),"dbsnp"))
1661  {
1662  bool valid_id=false;
1663  CRef<CDbtag> pDbtag(new CDbtag());
1664  if (xAssigndbSNPTag(data.m_Ids, pDbtag)) {
1665  pFeat->SetDbxref().push_back(pDbtag);
1666  valid_id = true;
1667  }
1668 
1669  if (!valid_id) {
1670  CReaderMessage warning(
1671  eDiag_Warning,
1672  m_uLineNumber,
1673  "CVcfReader::xAssignVariantProps: No valid dbSNP identifier");
1674  m_pMessageHandler->Report(warning);
1675  }
1676  infos.erase(it);
1677  }
1678  }
1679 }
1680 
1681 
1682 // ----------------------------------------------------------------------------
1684  const CTempString& strLine)
1685 // ----------------------------------------------------------------------------
1686 {
1687  if (NStr::StartsWith(strLine, "#CHROM")) {
1688  return false;
1689  }
1690  return CReaderBase::xIsCommentLine(strLine);
1691 }
1692 
1693 END_objects_SCOPE
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
AutoPtr –.
Definition: ncbimisc.hpp:401
CAnnot_descr –.
Definition: Annot_descr.hpp:66
CAnnotdesc –.
Definition: Annotdesc.hpp:66
Definition: Dbtag.hpp:53
static CObjReaderLineException * Create(EDiagSev eSeverity, unsigned int uLine, const std::string &strMessage, EProblem eProblem=eProblem_GeneralParsingError, const std::string &strSeqId=string(""), const std::string &strFeatureName=string(""), const std::string &strQualifierName=string(""), const std::string &strQualifierValue=string(""), CObjReaderLineException::EErrCode eErrCode=eFormat, const TVecOfLines &vecOfOtherLines=TVecOfLines())
Please use this instead of the constructor because the ctor is protected.
Definition: line_error.cpp:417
Common file reader utility functions.
Definition: read_util.hpp:47
static void Tokenize(const string &instr, const string &delim, vector< string > &tokens)
Tokenize a given string, respecting quoted substrings an atomic units.
Definition: read_util.cpp:47
static CRef< CSeq_id > AsSeqId(const string &rawId, long flags=0, bool localInts=true)
Convert a raw ID string to a Seq-id, based in given customization flags.
Definition: read_util.cpp:89
Defines and provides stubs for a general interface to a variety of file readers.
Definition: reader_base.hpp:63
unique_ptr< CReaderMessageHandler > m_pMessageHandler
virtual bool xUngetLine(ILineReader &)
virtual bool xIsCommentLine(const CTempString &)
virtual CRef< CSeq_annot > xCreateSeqAnnot()
unsigned int m_uDataCount
unsigned int m_uLineNumber
virtual bool xParseBrowserLine(const string &, CSeq_annot &)
void ProcessError(CObjReaderLineException &, ILineErrorListener *)
vector< TReaderLine > TReaderData
Definition: reader_base.hpp:70
virtual bool xGetLine(ILineReader &, string &)
TReaderFlags m_iFlags
virtual bool xParseTrackLine(const string &)
virtual void xAssignTrackData(CSeq_annot &)
virtual CRef< CSeq_annot > ReadSeqAnnot(CNcbiIstream &istr, ILineErrorListener *pErrors=nullptr)
Read an object from a given input stream, render it as a single Seq-annot.
struct SReaderLine { SReaderLine(unsigned int line, string data):mLine(line), mData(data) {} TReaderLine
Definition: reader_base.hpp:66
virtual bool xIsTrackLine(const CTempString &)
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
CUser_field & AddField(const string &label, int value)
add fields to the current user field
Definition: User_field.cpp:92
CUser_object & AddField(const string &label, const string &value, EParseField parse=eParse_String)
add a data field to the user object that holds a given value
CVariation_inst –.
enum CVcfData::SetType_t m_SetType
map< string, vector< string > > INFOS
Definition: vcf_reader.cpp:73
string m_strRef
Definition: vcf_reader.cpp:83
string m_strLine
Definition: vcf_reader.cpp:77
INFOS m_Info
Definition: vcf_reader.cpp:87
string m_strFilter
Definition: vcf_reader.cpp:86
int m_iPos
Definition: vcf_reader.cpp:81
vector< string > m_FormatKeys
Definition: vcf_reader.cpp:88
vector< string > m_Ids
Definition: vcf_reader.cpp:82
string m_strChrom
Definition: vcf_reader.cpp:80
map< string, vector< string > > GTDATA
Definition: vcf_reader.cpp:74
GTDATA m_GenotypeData
Definition: vcf_reader.cpp:90
vector< string > m_Alt
Definition: vcf_reader.cpp:84
double * m_pdQual
Definition: vcf_reader.cpp:85
vector< string > m_MetaDirectives
Definition: vcf_reader.hpp:333
void xProcessData(const TReaderData &, CSeq_annot &) override
Definition: vcf_reader.cpp:229
virtual bool xAssignVariantMnv(const CVcfData &, unsigned int, CRef< CSeq_feat >)
Definition: vcf_reader.cpp:761
virtual bool xProcessFilter(CVcfData &, CRef< CSeq_feat >)
void xAssignVariantSource(CVcfData &, CRef< CSeq_feat >)
virtual bool xAssignVariantProps(CVcfData &, CRef< CSeq_feat >)
virtual bool xProcessScore(CVcfData &, CRef< CSeq_feat >)
virtual bool xProcessTrackLine(const string &, CSeq_annot &)
bool m_MetaHandled
Definition: vcf_reader.hpp:336
virtual bool xAssignVariantIns(const CVcfData &, unsigned int, CRef< CSeq_feat >)
Definition: vcf_reader.cpp:819
virtual bool xProcessDataLine(const string &, CSeq_annot &)
Definition: vcf_reader.cpp:611
virtual bool xProcessMetaLine(const string &, CSeq_annot &)
Definition: vcf_reader.cpp:276
CRef< CAnnotdesc > m_Meta
Definition: vcf_reader.hpp:329
map< string, CVcfFormatSpec > m_FormatSpecs
Definition: vcf_reader.hpp:331
virtual bool xNormalizeData(CVcfData &, ILineErrorListener *=nullptr)
bool xAssigndbSNPTag(const vector< string > &ids, CRef< CDbtag > pDbtag) const
virtual bool xProcessHeaderLine(const string &, CSeq_annot &)
Definition: vcf_reader.cpp:551
static const double mMaxSupportedVersion
Definition: vcf_reader.hpp:327
virtual bool xProcessMetaLineInfo(const string &, CSeq_annot &)
Definition: vcf_reader.cpp:364
virtual bool xAssignVariationAlleleSet(const CVcfData &, CRef< CSeq_feat >)
Definition: vcf_reader.cpp:662
vector< string > m_GenotypeHeaders
Definition: vcf_reader.hpp:334
virtual bool xAssignVariantDelins(const CVcfData &, unsigned int, CRef< CSeq_feat >)
Definition: vcf_reader.cpp:850
map< string, CVcfInfoSpec > m_InfoSpecs
Definition: vcf_reader.hpp:330
map< string, CVcfFilterSpec > m_FilterSpecs
Definition: vcf_reader.hpp:332
bool xIsCommentLine(const CTempString &) override
CVcfReader(int=0, CReaderListener *=nullptr)
Definition: vcf_reader.cpp:162
virtual bool xAssignVcfMeta(CSeq_annot &)
Definition: vcf_reader.cpp:588
double mActualVersion
Definition: vcf_reader.hpp:328
virtual bool xAssignVariationIds(CVcfData &, CRef< CSeq_feat >)
virtual bool xAssignVariantDel(const CVcfData &, unsigned int, CRef< CSeq_feat >)
Definition: vcf_reader.cpp:796
virtual bool xAssignVariantSnv(const CVcfData &, unsigned int, CRef< CSeq_feat >)
Definition: vcf_reader.cpp:740
virtual void xSetFileFormat(const string &, CSeq_annot &, bool &)
Definition: vcf_reader.cpp:304
virtual bool xAssignFeatureLocationSet(const CVcfData &, CRef< CSeq_feat >)
CRef< CSeq_annot > ReadSeqAnnot(ILineReader &, ILineErrorListener *=nullptr) override
Read an object from a given line reader, render it as a single Seq-annot, if possible.
Definition: vcf_reader.cpp:181
void xGetData(ILineReader &, TReaderData &) override
Definition: vcf_reader.cpp:210
virtual bool xProcessMetaLineFilter(const string &, CSeq_annot &)
Definition: vcf_reader.cpp:432
virtual bool xParseData(const string &, CVcfData &, ILineErrorListener *=nullptr)
Definition: vcf_reader.cpp:895
virtual bool xProcessMetaLineFormat(const string &, CSeq_annot &)
Definition: vcf_reader.cpp:480
virtual ~CVcfReader()
Definition: vcf_reader.cpp:174
CRef< CSeq_annot > xCreateSeqAnnot() override
Definition: vcf_reader.cpp:200
virtual bool xProcessFormat(CVcfData &, CRef< CSeq_feat >)
virtual bool xProcessInfo(CVcfData &, CRef< CSeq_feat >)
@ eProblem_BadInfoLine
Definition: line_error.hpp:96
@ eProblem_BadFormatLine
Definition: line_error.hpp:97
@ eProblem_BadFilterLine
Definition: line_error.hpp:98
@ eProblem_GeneralParsingError
Definition: line_error.hpp:105
Abstract base class for lightweight line-by-line reading.
Definition: line_reader.hpp:54
void erase(iterator pos)
Definition: map.hpp:167
const_iterator end() const
Definition: map.hpp:152
bool empty() const
Definition: map.hpp:149
const_iterator find(const key_type &key) const
Definition: map.hpp:153
Definition: map.hpp:338
Include a standard set of the NCBI C++ Toolkit most basic headers.
static uch flags
#define false
Definition: bool.h:36
static int type
Definition: getdata.c:31
static const column_t columns[]
Definition: utf8_2.c:22
char data[12]
Definition: iconv.c:80
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
@ eDiag_Error
Error message.
Definition: ncbidiag.hpp:653
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
CRef< C > Ref(C *object)
Helper functions to get CRef<> and CConstRef<> objects.
Definition: ncbiobj.hpp:2015
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
TObjectType & GetNCObject(void) const
Get object.
Definition: ncbiobj.hpp:1187
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static string DoubleToString(double value, int precision=-1, TNumToStringFlags flags=0)
Convert double to string.
Definition: ncbistr.hpp:5181
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3452
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5424
static double StringToDouble(const CTempStringEx str, TStringToNumFlags flags=0)
Convert string to double.
Definition: ncbistr.cpp:1381
static string Join(const TContainer &arr, const CTempString &delim)
Join strings using the specified delimiter.
Definition: ncbistr.hpp:2699
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5406
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
Definition: ncbistr.cpp:3545
static bool Equal(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2, ECase use_case=eCase)
Test for equality of a substring with another string.
Definition: ncbistr.hpp:5378
static string & ToLower(string &str)
Convert string to lower case – string& version.
Definition: ncbistr.cpp:405
@ fSplit_Truncate
Definition: ncbistr.hpp:2503
@ fSplit_MergeDelimiters
Merge adjacent delimiters.
Definition: ncbistr.hpp:2500
bool IsSetData(void) const
the object itself Check if a value has been assigned to Data data member.
void SetTag(TTag &value)
Assign a value to Tag data member.
Definition: Dbtag_.cpp:66
TData & SetData(void)
Assign a value to Data data member.
void SetLabel(TLabel &value)
Assign a value to Label data member.
const TData & GetData(void) const
Get the Data member data.
void SetType(TType &value)
Assign a value to Type data member.
void SetDb(const TDb &value)
Assign a value to Db data member.
Definition: Dbtag_.hpp:229
TDbxref & SetDbxref(void)
Assign a value to Dbxref data member.
Definition: Seq_feat_.hpp:1339
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
void SetExt(TExt &value)
Assign a value to Ext data member.
Definition: Seq_feat_.cpp:153
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
void ResetExt(void)
Reset Ext data member.
Definition: Seq_feat_.cpp:148
const TExt & GetExt(void) const
Get the Ext member data.
Definition: Seq_feat_.hpp:1219
void SetLength(TLength value)
Assign a value to Length data member.
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
void SetSeq_data(TSeq_data &value)
Assign a value to Seq_data data member.
void SetDesc(TDesc &value)
Assign a value to Desc data member.
Definition: Seq_annot_.cpp:223
TUser & SetUser(void)
Select the variant.
Definition: Annotdesc_.cpp:190
const TUser & GetUser(void) const
Get the variant data.
Definition: Annotdesc_.cpp:184
bool IsSetDesc(void) const
used only for stand alone Seq-annots Check if a value has been assigned to Desc data member.
Definition: Seq_annot_.hpp:840
bool IsUser(void) const
Check if variant User is selected.
Definition: Annotdesc_.hpp:561
list< CRef< CVariation_ref > > TVariations
void SetObservation(TObservation value)
Assign a value to Observation data member.
void SetType(TType value)
Assign a value to Type data member.
TDelta & SetDelta(void)
Assign a value to Delta data member.
@ eType_snv
delta=[morph of length 1] NOTE: this is snV not snP; the latter requires frequency-based validation t...
@ eType_delins
delta=[del, ins]
@ eGene_location_utr_3
In 3' UTR (0x80)
@ eAction_del_at
excise sequence at location if multiplier is specified, delete len(location)*multiplier positions dow...
@ eAction_ins_before
insert seq before the location.start
@ eObservation_variant
inst represent the observed variant at a given position
@ eObservation_reference
inst represents the reference base at the position
n font weight
int i
Lightweight interface for getting lines of data with minimal memory copying.
static MDB_envinfo info
Definition: mdb_load.c:37
const struct ncbi::grid::netcache::search::fields::KEY key
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
const char * tag
static SLJIT_INLINE sljit_ins lr(sljit_gpr dst, sljit_gpr src)
Definition: type.c:6
static void s_AddDeleteDeltaItem(CVariation_inst &instance)
Definition: vcf_reader.cpp:782
ESpecType SpecType(const string &spectype)
Definition: vcf_reader.cpp:101
ESpecNumber SpecNumber(const string &specnumber)
Definition: vcf_reader.cpp:129
ESpecNumber
Definition: vcf_reader.hpp:66
@ eNumber_CountUnknown
Definition: vcf_reader.hpp:69
@ eNumber_CountGenotypes
Definition: vcf_reader.hpp:68
@ eNumber_CountAlleles
Definition: vcf_reader.hpp:67
@ eNumber_CountAllAlleles
Definition: vcf_reader.hpp:70
ESpecType
Definition: vcf_reader.hpp:54
@ eType_Integer
Definition: vcf_reader.hpp:55
@ eType_Float
Definition: vcf_reader.hpp:56
@ eType_String
Definition: vcf_reader.hpp:59
@ eType_Character
Definition: vcf_reader.hpp:58
@ eType_Flag
Definition: vcf_reader.hpp:57
Modified on Fri Sep 20 14:58:13 2024 by modify_doxy.py rev. 669887