NCBI C++ ToolKit
wig2table.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: wig2table.cpp 101904 2024-02-29 18:50:05Z vasilche $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Aaron Ucko
27 *
28 * File Description:
29 * Simple program demonstrating the use of serializable objects (in this
30 * case, biological sequences). Does NOT use the object manager.
31 *
32 * ===========================================================================
33 */
34 
35 #include <ncbi_pch.hpp>
36 #include <corelib/ncbiapp.hpp>
37 #include <corelib/ncbiargs.hpp>
38 #include <corelib/ncbienv.hpp>
39 
40 #include <serial/iterator.hpp>
41 #include <serial/objistr.hpp>
42 #include <serial/objostr.hpp>
43 #include <serial/serial.hpp>
44 
55 #include <corelib/ncbifile.hpp>
56 
59 
61 {
62 public:
63  /// read from the file, "-" (but not "./-") means standard input
64  CWigBufferedLineReader(const string& filename);
65  virtual ~CWigBufferedLineReader();
66 
67  bool AtEOF(void) const;
69  void UngetLine(void);
70  CTempStringEx operator*(void) const;
71  CT_POS_TYPE GetPosition(void) const;
72  Uint8 GetLineNumber(void) const;
73 
74 protected:
75 
76  void x_LoadLong(void);
77  bool x_ReadBuffer(void);
78 
79 private:
81  bool m_Eof;
83  size_t m_BufferSize;
85  char* m_Pos;
86  char* m_End;
88  string m_String;
91 };
92 
93 
95  : m_Reader(CFileReader::New(filename)),
96  m_Eof(false),
97  m_UngetLine(false),
98  m_BufferSize(32*1024),
99  m_Buffer(new char[m_BufferSize]),
100  m_Pos(m_Buffer.get()),
101  m_End(m_Pos),
102  m_InputPos(0),
103  m_LineNumber(0)
104 {
105  x_ReadBuffer();
106 }
107 
108 
110 {
111 }
112 
113 
115 {
116  return m_Eof && !m_UngetLine;
117 }
118 
119 
121 {
123  _ASSERT(m_Line.begin());
124  --m_LineNumber;
125  m_UngetLine = true;
126 }
127 
128 
130 {
131  ++m_LineNumber;
132  if ( m_UngetLine ) {
133  _ASSERT(m_Line.begin());
134  m_UngetLine = false;
135  return *this;
136  }
137  // check if we are at the buffer end
138  char* start = m_Pos;
139  char* end = m_End;
140  for ( char* p = start; p < end; ++p ) {
141  if ( *p == '\n' ) {
142  *p = '\0';
143  m_Line.assign(start, p - start, CTempStringEx::eHasZeroAtEnd);
144  m_Pos = ++p;
145  if ( p == end ) {
146  m_String = m_Line;
147  m_Line = m_String;
148  x_ReadBuffer();
149  }
150  return *this;
151  }
152  else if ( *p == '\r' ) {
153  *p = '\0';
154  m_Line.assign(start, p - start, CTempStringEx::eHasZeroAtEnd);
155  if ( ++p == end ) {
156  m_String = m_Line;
157  m_Line = m_String;
158  if ( x_ReadBuffer() ) {
159  p = m_Pos;
160  if ( *p == '\n' ) {
161  m_Pos = p+1;
162  }
163  }
164  return *this;
165  }
166  if ( *p != '\n' ) {
167  return *this;
168  }
169  m_Pos = ++p;
170  if ( p == end ) {
171  m_String = m_Line;
172  m_Line = m_String;
173  x_ReadBuffer();
174  }
175  return *this;
176  }
177  }
178  x_LoadLong();
179  return *this;
180 }
181 
182 
184 {
185  char* start = m_Pos;
186  char* end = m_End;
187  m_String.assign(start, end);
188  while ( x_ReadBuffer() ) {
189  start = m_Pos;
190  end = m_End;
191  for ( char* p = start; p < end; ++p ) {
192  char c = *p;
193  if ( c == '\r' || c == '\n' ) {
194  m_String.append(start, p - start);
195  m_Line = m_String;
196  if ( ++p == end ) {
197  m_String = m_Line;
198  m_Line = m_String;
199  if ( x_ReadBuffer() ) {
200  p = m_Pos;
201  end = m_End;
202  if ( p < end && c == '\r' && *p == '\n' ) {
203  ++p;
204  m_Pos = p;
205  }
206  }
207  }
208  else {
209  if ( c == '\r' && *p == '\n' ) {
210  if ( ++p == end ) {
211  x_ReadBuffer();
212  p = m_Pos;
213  }
214  }
215  m_Pos = p;
216  }
217  return;
218  }
219  }
220  m_String.append(start, end - start);
221  }
222  m_Line = m_String;
223  return;
224 }
225 
226 
228 {
229  _ASSERT(m_Reader);
230 
231  if ( m_Eof ) {
232  return false;
233  }
234 
236  m_Pos = m_End = m_Buffer.get();
237  for (bool flag = true; flag; ) {
238  size_t size;
241  switch (result) {
242  case eRW_NotImplemented:
243  case eRW_Error:
244  NCBI_THROW(CIOException, eRead, "Read error");
245  /*NOTREACHED*/
246  break;
247  case eRW_Timeout:
248  // keep spinning around
249  break;
250  case eRW_Eof:
251  m_Eof = true;
252  // fall through
253  case eRW_Success:
254  m_End = m_Pos + size;
255  return (result == eRW_Success || size > 0);
256  default:
257  _ASSERT(0);
258  }
259  } // for
260  return false;
261 }
262 
263 
265 {
266  return m_Line;
267 }
268 
269 
271 {
272  return m_InputPos + CT_OFF_TYPE(m_Pos - m_Buffer.get());
273 }
274 
275 
277 {
278  return m_LineNumber;
279 }
280 
281 
283 {
284  virtual void Init(void);
285  virtual int Run(void);
286 
292  void DumpAnnot(void);
293  void DumpChromValues(void);
294  void ResetChromValues(void);
295  void SetChrom(CTempString chrom);
296 
297  double EstimateSize(size_t rows, bool fixed_span) const;
298 
299  void ReadBrowser(void);
300  void ReadTrack(void);
301  void ReadFixedStep(void);
302  void ReadVariableStep(void);
303  void ReadBedLine(CTempString chrom);
304 
307 
309 
310  bool x_GetLine(void);
311  void x_UngetLine(void);
312  bool x_SkipWS(void);
313  bool x_CommentLine(void) const;
314  CTempString x_GetWord(void);
317  bool x_TryGetPos(TSeqPos& v);
318  bool x_TryGetDoubleSimple(double& v);
319  bool x_TryGetDouble(double& v);
320  void x_GetPos(TSeqPos& v);
321  void x_GetDouble(double& v);
322 
323  void x_Error(const char* msg);
324 
325  string m_ChromId;
326  struct SValueInfo {
329  double m_Value;
330 
331  TSeqPos GetEnd(void) const {
332  return m_Pos + m_Span;
333  }
334  bool operator<(const SValueInfo& v) const {
335  return m_Pos < v.m_Pos;
336  }
337  };
338 
339  struct SStat {
345 
346  SStat(void)
347  : m_FixedSpan(true),
348  m_HaveGaps(false),
349  m_IntValues(true),
350  m_Span(1),
351  m_Min(0),
352  m_Max(0),
353  m_Step(1),
354  m_StepMul(1)
355  {
356  }
358  {
359  m_FixedSpan = true;
360  m_Span = span;
361  }
362  void AddSpan(TSeqPos span)
363  {
364  if ( span != m_Span ) {
365  m_FixedSpan = false;
366  }
367  }
368  void SetFirstValue(double v)
369  {
370  m_Min = m_Max = v;
371  m_IntValues = v == int(v);
372  }
373  void AddValue(double v)
374  {
375  if ( v < m_Min ) {
376  m_Min = v;
377  }
378  if ( v > m_Max ) {
379  m_Max = v;
380  }
381  if ( m_IntValues && v != int(v) ) {
382  m_IntValues = false;
383  }
384  }
385  int AsByte(double v) const
386  {
387  return int((v-m_Min)*m_StepMul+.5);
388  }
389  };
390  // sort values and return min and max values
391  SStat x_PreprocessValues(void);
393  void x_SetTotalLoc(CSeq_loc& loc, CSeq_id& chrom_id);
394 
395  void AddValue(const SValueInfo& value) {
396  if ( !m_OmitZeros || value.m_Value != 0 ) {
397  m_Values.push_back(value);
398  }
399  }
400 
401  string m_TrackName;
404  enum ETrackType {
408  };
412  typedef vector<SValueInfo> TValues;
415 
416  bool m_AsGraph;
420  bool m_AsByte;
422  double m_GapValue;
424 };
425 
426 
428 {
429  // Create command-line argument descriptions class
430  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
431 
432  // Specify USAGE context
433  arg_desc->SetUsageContext
434  (GetArguments().GetProgramBasename(),
435  "Object serialization demo program: Seq-entry translator");
436 
437  // Describe the expected command-line arguments
438  arg_desc->AddDefaultKey
439  ("input", "InputFile",
440  "name of file to read from (standard input by default)",
442 
443  arg_desc->AddDefaultKey
444  ("output", "OutputFile",
445  "name of file to write to (standard output by default)",
447  arg_desc->AddDefaultKey("outfmt", "OutputFormat", "format of output file",
449  arg_desc->SetConstraint("outfmt",
450  &(*new CArgAllow_Strings, "asn", "asnb", "xml"));
451 
452  arg_desc->AddOptionalKey("mapfile", "MapFile",
453  "IdMapper config filename",
455  arg_desc->AddDefaultKey("genome", "Genome",
456  "UCSC build number",
458 
459  arg_desc->AddFlag("as-graph",
460  "Generate Seq-graph");
461  arg_desc->AddFlag("single-annot",
462  "Put all Seq-graphs in a single Seq-annot");
463  arg_desc->AddFlag("as-byte",
464  "Convert data in byte range");
465  arg_desc->AddFlag("omit-zeros",
466  "Omit zero values");
467  arg_desc->AddFlag("join-same",
468  "Join equal sequential values");
469  arg_desc->AddFlag("keep-integer",
470  "Keep integer as is if they fit in an output range");
471  arg_desc->AddDefaultKey("gap-value", "GapValue",
472  "Assumed value in gaps",
474 
475  arg_desc->AddOptionalKey("name", "Name",
476  "Track name or graph title",
478 
479  // Setup arg.descriptions for this application
480  SetupArgDescriptions(arg_desc.release());
481 }
482 
483 
484 static ESerialDataFormat s_GetFormat(const string& name)
485 {
486  if (name == "asn") {
487  return eSerial_AsnText;
488  } else if (name == "asnb") {
489  return eSerial_AsnBinary;
490  } else if (name == "xml") {
491  return eSerial_Xml;
492  } else {
493  // Should be caught by argument processing, but as an illustration...
494  THROW1_TRACE(runtime_error, "Bad serial format name " + name);
495  }
496 }
497 
498 
499 double CWig2tableApplication::EstimateSize(size_t rows, bool fixed_span) const
500 {
501  double ret = 0;
502  ret += rows*4;
503  if ( !fixed_span )
504  ret += rows*4;
505  if ( m_AsByte )
506  ret += rows;
507  else
508  ret += 8*rows;
509  return ret;
510 }
511 
512 
514 {
515  SStat stat;
516  bool sorted = true;
517  size_t size = m_Values.size();
518  if ( size ) {
519  stat.SetFirstSpan(m_Values[0].m_Span);
520  stat.SetFirstValue(m_Values[0].m_Value);
521 
522  for ( size_t i = 1; i < size; ++i ) {
523  stat.AddSpan(m_Values[i].m_Span);
524  stat.AddValue(m_Values[i].m_Value);
525  if ( sorted ) {
526  if ( m_Values[i].m_Pos < m_Values[i-1].m_Pos ) {
527  sorted = false;
528  }
529  if ( m_Values[i].m_Pos != m_Values[i-1].GetEnd() ) {
530  stat.m_HaveGaps = true;
531  }
532  }
533  }
534  }
535  if ( !sorted ) {
536  sort(m_Values.begin(), m_Values.end());
537  stat.m_HaveGaps = false;
538  for ( size_t i = 1; i < size; ++i ) {
539  if ( m_Values[i].m_Pos != m_Values[i-1].GetEnd() ) {
540  stat.m_HaveGaps = true;
541  break;
542  }
543  }
544  }
545  if ( m_AsGraph && stat.m_HaveGaps ) {
546  stat.AddValue(m_GapValue);
547  }
548 
549  const int range = 255;
550  if ( stat.m_Max > stat.m_Min &&
551  (!m_KeepInteger ||
552  !stat.m_IntValues ||
553  stat.m_Max-stat.m_Min > range) ) {
554  stat.m_Step = (stat.m_Max-stat.m_Min)/range;
555  stat.m_StepMul = 1/stat.m_Step;
556  }
557 
558  if ( !m_AsGraph && m_JoinSame && size ) {
559  TValues nv;
560  nv.reserve(size);
561  nv.push_back(m_Values[0]);
562  for ( size_t i = 1; i < size; ++i ) {
563  if ( m_Values[i].m_Pos == nv.back().GetEnd() &&
564  m_Values[i].m_Value == nv.back().m_Value ) {
565  nv.back().m_Span += m_Values[i].m_Span;
566  }
567  else {
568  nv.push_back(m_Values[i]);
569  }
570  }
571  if ( nv.size() != size ) {
572  double s = EstimateSize(size, stat.m_FixedSpan);
573  double ns = EstimateSize(nv.size(), false);
574  if ( ns < s*.75 ) {
575  m_Values.swap(nv);
576  size = m_Values.size();
577  LOG_POST("Joined size: "<<size);
578  stat.m_FixedSpan = false;
579  }
580  }
581  }
582 
583  if ( m_AsGraph && !stat.m_FixedSpan ) {
584  stat.m_Span = 1;
585  stat.m_FixedSpan = true;
586  }
587 
588  return stat;
589 }
590 
591 
593 {
595  if ( m_IdMapper ) {
596  m_IdMapper->MapObject(*chrom_id);
597  }
598  return chrom_id;
599 }
600 
601 
603 {
604  if ( m_Values.empty() ) {
605  loc.SetEmpty(chrom_id);
606  }
607  else {
608  CSeq_interval& interval = loc.SetInt();
609  interval.SetId(chrom_id);
610  interval.SetFrom(m_Values.front().m_Pos);
611  interval.SetTo(m_Values.back().GetEnd()-1);
612  }
613 }
614 
615 
617 {
619 
620  table->SetFeat_type(0);
621 
622  CRef<CSeq_id> chrom_id = x_MakeChromId();
623 
624  CRef<CSeq_loc> table_loc(new CSeq_loc);
625  { // Seq-table location
627  table->SetColumns().push_back(col_id);
628  col_id->SetHeader().SetField_name("Seq-table location");
629  col_id->SetDefault().SetLoc(*table_loc);
630  }
631 
632  { // Seq-id
634  table->SetColumns().push_back(col_id);
636  col_id->SetDefault().SetId(*chrom_id);
637  }
638 
639  // position
641  table->SetColumns().push_back(col_pos);
643  CSeqTable_multi_data::TInt& pos = col_pos->SetData().SetInt();
644 
645  SStat stat = x_PreprocessValues();
646 
647  x_SetTotalLoc(*table_loc, *chrom_id);
648 
649  size_t size = m_Values.size();
650  table->SetNum_rows(size);
651  pos.reserve(size);
652 
653  CSeqTable_multi_data::TInt* span_ptr = 0;
654  { // span
656  table->SetColumns().push_back(col_span);
657  col_span->SetHeader().SetField_name("span");
658  if ( stat.m_FixedSpan ) {
659  col_span->SetDefault().SetInt(stat.m_Span);
660  }
661  else {
662  span_ptr = &col_span->SetData().SetInt();
663  span_ptr->reserve(size);
664  }
665  }
666 
667  if ( stat.m_HaveGaps ) {
669  table->SetColumns().push_back(col_step);
670  col_step->SetHeader().SetField_name("value_gap");
671  col_step->SetDefault().SetReal(m_GapValue);
672  }
673 
674  if ( m_AsByte ) { // values
676  table->SetColumns().push_back(col_min);
677  col_min->SetHeader().SetField_name("value_min");
678  col_min->SetDefault().SetReal(stat.m_Min);
679 
681  table->SetColumns().push_back(col_step);
682  col_step->SetHeader().SetField_name("value_step");
683  col_step->SetDefault().SetReal(stat.m_Step);
684 
686  table->SetColumns().push_back(col_val);
687  col_val->SetHeader().SetField_name("values");
688 
689  if ( 1 ) {
690  AutoPtr< vector<char> > values(new vector<char>());
691  values->reserve(size);
692  ITERATE ( TValues, it, m_Values ) {
693  pos.push_back(it->m_Pos);
694  if ( span_ptr ) {
695  span_ptr->push_back(it->m_Span);
696  }
697  values->push_back(stat.AsByte(it->m_Value));
698  }
699  col_val->SetData().SetBytes().push_back(values.release());
700  }
701  else {
702  CSeqTable_multi_data::TInt& values = col_val->SetData().SetInt();
703  values.reserve(size);
704 
705  ITERATE ( TValues, it, m_Values ) {
706  pos.push_back(it->m_Pos);
707  if ( span_ptr ) {
708  span_ptr->push_back(it->m_Span);
709  }
710  values.push_back(stat.AsByte(it->m_Value));
711  }
712  }
713  }
714  else {
716  table->SetColumns().push_back(col_val);
717  col_val->SetHeader().SetField_name("values");
718  CSeqTable_multi_data::TReal& values = col_val->SetData().SetReal();
719  values.reserve(size);
720 
721  ITERATE ( TValues, it, m_Values ) {
722  pos.push_back(it->m_Pos);
723  if ( span_ptr ) {
724  span_ptr->push_back(it->m_Span);
725  }
726  values.push_back(it->m_Value);
727  }
728  }
729  return table;
730 }
731 
732 
734 {
735  CRef<CSeq_graph> graph(new CSeq_graph);
736 
737  CRef<CSeq_id> chrom_id = x_MakeChromId();
738 
739  CRef<CSeq_loc> graph_loc(new CSeq_loc);
740  graph->SetLoc(*graph_loc);
741 
742  SStat stat = x_PreprocessValues();
743 
744  x_SetTotalLoc(*graph_loc, *chrom_id);
745 
746  if ( !m_TrackName.empty() ) {
747  graph->SetTitle(m_TrackName);
748  }
749  graph->SetComp(stat.m_Span);
750  graph->SetA(stat.m_Step);
751  graph->SetB(stat.m_Min);
752 
753  CByte_graph& b_graph = graph->SetGraph().SetByte();
754  b_graph.SetMin(stat.AsByte(stat.m_Min));
755  b_graph.SetMax(stat.AsByte(stat.m_Max));
756  b_graph.SetAxis(0);
757  CByte_graph::TValues& bytes = b_graph.SetValues();
758 
759  if ( m_Values.empty() ) {
760  graph->SetNumval(0);
761  }
762  else {
763  _ASSERT(stat.m_FixedSpan);
764  TSeqPos start = m_Values[0].m_Pos;
765  TSeqPos end = m_Values.back().GetEnd();
766  size_t size = (end-start)/stat.m_Span;
767  graph->SetNumval(size);
768  bytes.resize(size, stat.AsByte(m_GapValue));
769  ITERATE ( TValues, it, m_Values ) {
770  TSeqPos pos = it->m_Pos - start;
771  TSeqPos span = it->m_Span;
772  _ASSERT(pos % stat.m_Span == 0);
773  _ASSERT(span % stat.m_Span == 0);
774  size_t i = pos / stat.m_Span;
775  int v = stat.AsByte(it->m_Value);
776  for ( ; span > 0; span -= stat.m_Span, ++i ) {
777  bytes[i] = v;
778  }
779  }
780  }
781  return graph;
782 }
783 
784 
786 {
787  CRef<CSeq_annot> annot(new CSeq_annot);
788  if ( !m_TrackDescription.empty() ) {
789  CRef<CAnnotdesc> desc(new CAnnotdesc);
791  annot->SetDesc().Set().push_back(desc);
792  }
793  if ( !m_TrackName.empty() ) {
794  CRef<CAnnotdesc> desc(new CAnnotdesc);
795  desc->SetName(m_TrackName);
796  annot->SetDesc().Set().push_back(desc);
797  }
798  if ( true ) {
799  m_TrackParams["track type"] = "graph";
800  }
801  if ( !m_TrackParams.empty() ) {
802  CRef<CAnnotdesc> desc(new CAnnotdesc);
803  annot->SetDesc().Set().push_back(desc);
804  CUser_object& user = desc->SetUser();
805  user.SetType().SetStr("Track Data");
807  CRef<CUser_field> field(new CUser_field);
808  field->SetLabel().SetStr(it->first);
809  field->SetData().SetStr(it->second);
810  user.SetData().push_back(field);
811  }
812  }
813  return annot;
814 }
815 
816 
818 {
819  CRef<CSeq_annot> annot = MakeAnnot();
820  annot->SetData().SetSeq_table(*MakeTable());
821  return annot;
822 }
823 
824 
826 {
827  CRef<CSeq_annot> annot = MakeAnnot();
828  annot->SetData().SetGraph().push_back(MakeGraph());
829  return annot;
830 }
831 
832 
834 {
835  m_ChromId.clear();
836  m_Values.clear();
837 }
838 
839 
841 {
842  ERR_FATAL(GetArgs()["input"].AsString()<<":"<<m_Input->GetLineNumber()<<
843  ": "<<msg<<": \""<<m_CurLine<<"\"");
844 }
845 
846 
848 {
849  const char* ptr = m_CurLine.data();
850  size_t skip = 0;
851  for ( size_t len = m_CurLine.size(); skip < len; ++skip ) {
852  char c = ptr[skip];
853  if ( c != ' ' && c != '\t' ) {
854  break;
855  }
856  }
857  m_CurLine = m_CurLine.substr(skip);
859  return !m_CurLine.empty();
860 }
861 
862 
864 {
865  char c = m_CurLine.data()[0];
866  return c == '#';// || c == '\0';
867 }
868 
869 
871 {
872  const char* ptr = m_CurLine.data();
873  size_t skip = 0;
874  for ( size_t len = m_CurLine.size(); skip < len; ++skip ) {
875  char c = ptr[skip];
876  if ( c == ' ' || c == '\t' ) {
877  break;
878  }
879  }
880  if ( skip == 0 ) {
881  x_Error("Identifier expected");
882  }
883  m_CurLine = m_CurLine.substr(skip);
885  return CTempString(ptr, skip);
886 }
887 
888 
890 {
891  const char* ptr = m_CurLine.data();
892  size_t skip = 0;
893  for ( size_t len = m_CurLine.size(); skip < len; ++skip ) {
894  char c = ptr[skip];
895  if ( c == '=' ) {
896  m_CurLine = m_CurLine.substr(skip+1);
898  return CTempString(ptr, skip);
899  }
900  if ( c == ' ' || c == '\t' ) {
901  break;
902  }
903  }
904  x_Error("'=' expected");
905  return CTempString();
906 }
907 
908 
910 {
911  const char* ptr = m_CurLine.data();
912  size_t len = m_CurLine.size();
913  if ( len && *ptr == '"' ) {
914  size_t pos = 1;
915  for ( ; pos < len; ++pos ) {
916  char c = ptr[pos];
917  if ( c == '"' ) {
918  m_CurLine = m_CurLine.substr(pos+1);
920  return CTempString(ptr+1, pos-1);
921  }
922  }
923  x_Error("Open quotes");
924  }
925  return x_GetWord();
926 }
927 
928 
930 {
931  TSeqPos ret = 0;
932  const char* ptr = m_CurLine.data();
933  for ( size_t skip = 0; ; ++skip ) {
934  char c = ptr[skip];
935  if ( c >= '0' && c <= '9' ) {
936  ret = ret*10 + (c-'0');
937  }
938  else if ( (c == ' ' || c == '\t' || c == '\0') && skip ) {
939  m_CurLine = m_CurLine.substr(skip);
941  v = ret;
942  return;
943  }
944  else {
945  x_Error("Integer value expected");
946  }
947  }
948 }
949 
950 
952 {
953  double ret = 0;
954  const char* ptr = m_CurLine.data();
955  size_t skip = 0;
956  bool negate = false, digits = false;
957  for ( ; ; ++skip ) {
958  char c = ptr[skip];
959  if ( !skip ) {
960  if ( c == '-' ) {
961  negate = true;
962  continue;
963  }
964  if ( c == '+' ) {
965  continue;
966  }
967  }
968  if ( c >= '0' && c <= '9' ) {
969  digits = true;
970  ret = ret*10 + (c-'0');
971  }
972  else if ( c == '.' ) {
973  ++skip;
974  break;
975  }
976  else if ( c == '\0' ) {
977  if ( !digits ) {
978  return false;
979  }
980  m_CurLine.clear();
982  if ( negate ) {
983  ret = -ret;
984  }
985  v = ret;
986  return true;
987  }
988  else {
989  return false;
990  }
991  }
992  double digit_mul = 1;
993  for ( ; ; ++skip ) {
994  char c = ptr[skip];
995  if ( c >= '0' && c <= '9' ) {
996  digits = true;
997  digit_mul *= .1;
998  ret += (c-'0')*digit_mul;
999  }
1000  else if ( (c == ' ' || c == '\t' || c == '\0') && digits ) {
1001  m_CurLine.clear();
1003  v = ret;
1004  if ( negate ) {
1005  ret = -ret;
1006  }
1007  return true;
1008  }
1009  else {
1010  return false;
1011  }
1012  }
1013 }
1014 
1015 
1017 {
1018  if ( x_TryGetDoubleSimple(v) ) {
1019  return true;
1020  }
1021  const char* ptr = m_CurLine.data();
1022  char* endptr = 0;
1023  v = strtod(ptr, &endptr);
1024  if ( endptr == ptr ) {
1025  return false;
1026  }
1027  if ( *endptr ) {
1028  x_Error("extra text in line");
1029  }
1030  m_CurLine.clear();
1032  return true;
1033 }
1034 
1035 
1037 {
1038  char c = m_CurLine.data()[0];
1039  if ( c < '0' || c > '9' ) {
1040  return false;
1041  }
1042  x_GetPos(v);
1043  return true;
1044 }
1045 
1046 
1048 {
1049  if ( !x_TryGetDouble(v) ) {
1050  x_Error("Floating point value expected");
1051  }
1052 }
1053 
1054 
1056 {
1057  if ( m_Input->AtEOF() ) {
1058  return false;
1059  }
1060  m_CurLine = *++*m_Input;
1062  return true;
1063 }
1064 
1065 
1067 {
1068  m_Input->UngetLine();
1069 }
1070 
1071 
1073 {
1074  if ( !m_Annot ) {
1075  return;
1076  }
1077  if ( m_AsGraph ) {
1078  LOG_POST("DumpAnnot");
1079  }
1080  *m_Output << *m_Annot;
1081  m_Annot = null;
1082 }
1083 
1084 
1086 {
1087  if ( m_ChromId.empty() ) {
1088  return;
1089  }
1090  LOG_POST("Chrom: "<<m_ChromId<<" "<<m_Values.size());
1091  if ( !m_Annot ) {
1092  m_Annot = MakeAnnot();
1093  }
1094  if ( m_AsGraph ) {
1095  m_Annot->SetData().SetGraph().push_back(MakeGraph());
1096  }
1097  else {
1098  m_Annot->SetData().SetSeq_table(*MakeTable());
1099  }
1100  if ( !m_SingleAnnot ) {
1101  DumpAnnot();
1102  }
1103  ResetChromValues();
1104 }
1105 
1106 
1108 {
1109  if ( chrom != m_ChromId ) {
1110  DumpChromValues();
1111  m_ChromId = chrom;
1112  }
1113 }
1114 
1115 
1117 {
1118 }
1119 
1120 
1122 {
1123  DumpAnnot();
1124 
1125  m_TrackDescription.clear();
1126  m_TrackTypeValue.clear();
1128  m_TrackParams.clear();
1129  while ( x_SkipWS() ) {
1130  CTempString name = x_GetParamName();
1132  if ( name == "type" ) {
1134  if ( value == "wiggle_0" ) {
1136  }
1137  else if ( value == "bedGraph" ) {
1139  }
1140  else {
1141  x_Error("Invalid track type");
1142  }
1143  }
1144  else if ( name == "name" ) {
1145  m_TrackName = value;
1146  }
1147  else if ( name == "description" ) {
1149  }
1150  else {
1151  m_TrackParams[name] = value;
1152  }
1153  }
1154  if ( m_TrackType == eTrackType_invalid ) {
1155  x_Error("Unknown track type");
1156  }
1157 }
1158 
1159 
1161 {
1162  if ( m_TrackType != eTrackType_wiggle_0 &&
1164  x_Error("Track type=wiggle_0 is required");
1165  }
1166 
1167  size_t start = 0;
1168  size_t step = 0;
1169  size_t span = 1;
1170  while ( x_SkipWS() ) {
1171  CTempString name = x_GetParamName();
1173  if ( name == "chrom" ) {
1174  SetChrom(value);
1175  }
1176  else if ( name == "start" ) {
1177  start = NStr::StringToUInt(value);
1178  }
1179  else if ( name == "step" ) {
1180  step = NStr::StringToUInt(value);
1181  }
1182  else if ( name == "span" ) {
1183  span = NStr::StringToUInt(value);
1184  }
1185  else {
1186  x_Error("Bad param name");
1187  }
1188  }
1189  if ( m_ChromId.empty() ) {
1190  x_Error("No chrom");
1191  }
1192  if ( start == 0 ) {
1193  x_Error("No start");
1194  }
1195  if ( step == 0 ) {
1196  x_Error("No step");
1197  }
1198 
1199  SValueInfo value;
1200  value.m_Pos = start-1;
1201  value.m_Span = span;
1202  while ( x_GetLine() ) {
1203  if ( !x_TryGetDouble(value.m_Value) ) {
1204  x_UngetLine();
1205  break;
1206  }
1207  AddValue(value);
1208  value.m_Pos += step;
1209  }
1210 }
1211 
1212 
1214 {
1215  if ( m_TrackType != eTrackType_wiggle_0 &&
1217  x_Error("Track type=wiggle_0 is required");
1218  }
1219 
1220  size_t span = 1;
1221  while ( x_SkipWS() ) {
1222  CTempString name = x_GetParamName();
1224  if ( name == "chrom" ) {
1225  SetChrom(value);
1226  }
1227  else if ( name == "span" ) {
1228  span = NStr::StringToUInt(value);
1229  }
1230  else {
1231  x_Error("Bad param name");
1232  }
1233  }
1234  if ( m_ChromId.empty() ) {
1235  x_Error("No chrom");
1236  }
1237  SValueInfo value;
1238  value.m_Span = span;
1239  while ( x_GetLine() ) {
1240  if ( !x_TryGetPos(value.m_Pos) ) {
1241  x_UngetLine();
1242  break;
1243  }
1244  x_SkipWS();
1245  x_GetDouble(value.m_Value);
1246  value.m_Pos -= 1;
1247  AddValue(value);
1248  }
1249 }
1250 
1251 
1253 {
1254  if ( m_TrackType != eTrackType_bedGraph &&
1256  x_Error("Track type=bedGraph is required");
1257  }
1258  SetChrom(chrom);
1259  SValueInfo value;
1260  x_SkipWS();
1261  x_GetPos(value.m_Pos);
1262  x_SkipWS();
1263  x_GetPos(value.m_Span);
1264  x_SkipWS();
1265  x_GetDouble(value.m_Value);
1266  value.m_Span -= value.m_Pos;
1267  AddValue(value);
1268 }
1269 
1270 
1272 {
1274  // Get arguments
1275  const CArgs& args = GetArgs();
1276 
1277  if ( args["mapfile"] ) {
1278  m_IdMapper.reset(new CIdMapperConfig(args["mapfile"].AsInputFile(),
1279  args["genome"].AsString(),
1280  false));
1281  }
1282  else if ( !args["genome"].AsString().empty() ) {
1283  LOG_POST("Genome: "<<args["genome"].AsString());
1284  m_IdMapper.reset(new CIdMapperBuiltin(args["genome"].AsString(),
1285  false));
1286  }
1287 
1288  m_AsGraph = args["as-graph"];
1289  m_SingleAnnot = args["single-annot"];
1290  m_OmitZeros = args["omit-zeros"];
1291  m_JoinSame = args["join-same"];
1292  m_AsByte = m_AsGraph || args["as-byte"];
1293  m_KeepInteger = args["keep-integer"];
1294  m_GapValue = args["gap-value"].AsDouble();
1295  if ( m_AsGraph ) {
1296  if ( m_OmitZeros ) {
1297  ERR_POST(Warning<<
1298  "The option -omit_zeros is ignored for Seq-graph");
1299  m_OmitZeros = false;
1300  }
1301  if ( m_JoinSame ) {
1302  ERR_POST(Warning<<
1303  "The option -join_same is ignored for Seq-graph");
1304  m_JoinSame = false;
1305  }
1306  }
1307  else {
1308  if ( m_SingleAnnot ) {
1309  ERR_POST(Warning<<
1310  "The option -single-annot is ignored for Seq-table");
1311  m_SingleAnnot = false;
1312  }
1313  }
1314 
1315  // Read the entry
1316  m_Input.reset(new CWigBufferedLineReader(args["input"].AsString()));
1317  m_Output.reset
1318  (CObjectOStream::Open(s_GetFormat(args["outfmt"].AsString()),
1319  args["output"].AsOutputFile()));
1320 
1321  if ( args["name"] ) {
1322  m_TrackName = args["name"].AsString();
1323  }
1324  else if ( m_TrackName.empty() ) {
1325  m_TrackName = "User Track";
1326  }
1327 
1328  while ( x_GetLine() ) {
1329  if ( x_CommentLine() ) {
1330  continue;
1331  }
1332  CTempString s = x_GetWord();
1333  if ( s == "browser" ) {
1334  ReadBrowser();
1335  }
1336  else if ( s == "track" ) {
1337  ReadTrack();
1338  }
1339  else if ( s == "fixedStep" ) {
1340  ReadFixedStep();
1341  }
1342  else if ( s == "variableStep" ) {
1343  ReadVariableStep();
1344  }
1345  else {
1346  ReadBedLine(s);
1347  }
1348  }
1349  m_Input.reset();
1350 
1351  DumpChromValues();
1352  DumpAnnot();
1353  m_Output.reset();
1354 
1355  // Exit successfully
1356  return 0;
1357 }
1358 
1359 
1360 /////////////////////////////////////////////////////////////////////////////
1361 // MAIN
1362 
1363 
1364 int main(int argc, const char* argv[])
1365 {
1366  // Execute main application function
1367  return CWig2tableApplication().AppMain(argc, argv);
1368 }
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CAnnotdesc –.
Definition: Annotdesc.hpp:66
CArgAllow_Strings –.
Definition: ncbiargs.hpp:1641
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
CByte_graph –.
Definition: Byte_graph.hpp:66
IdMapper implementation using hardcoded values.
Definition: idmapper.hpp:275
IdMapper implementation using an external configuration file.
Definition: idmapper.hpp:189
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
void ReadBedLine(CTempString chrom)
Definition: wig2table.cpp:1252
void x_GetPos(TSeqPos &v)
Definition: wig2table.cpp:929
bool x_TryGetDouble(double &v)
Definition: wig2table.cpp:1016
ETrackType m_TrackType
Definition: wig2table.cpp:409
CTempString x_GetParamValue(void)
Definition: wig2table.cpp:909
CTempString x_GetWord(void)
Definition: wig2table.cpp:870
CRef< CSeq_annot > MakeGraphAnnot(void)
Definition: wig2table.cpp:825
vector< SValueInfo > TValues
Definition: wig2table.cpp:412
void ResetChromValues(void)
Definition: wig2table.cpp:833
CRef< CSeq_annot > m_Annot
Definition: wig2table.cpp:414
bool x_CommentLine(void) const
Definition: wig2table.cpp:863
void SetChrom(CTempString chrom)
Definition: wig2table.cpp:1107
bool x_TryGetPos(TSeqPos &v)
Definition: wig2table.cpp:1036
void x_SetTotalLoc(CSeq_loc &loc, CSeq_id &chrom_id)
Definition: wig2table.cpp:602
AutoPtr< IIdMapper > m_IdMapper
Definition: wig2table.cpp:423
void x_GetDouble(double &v)
Definition: wig2table.cpp:1047
TTrackParams m_TrackParams
Definition: wig2table.cpp:411
CTempString x_GetParamName(void)
Definition: wig2table.cpp:889
CRef< CSeq_annot > MakeTableAnnot(void)
Definition: wig2table.cpp:817
virtual int Run(void)
Run the application.
Definition: wig2table.cpp:1271
AutoPtr< CObjectOStream > m_Output
Definition: wig2table.cpp:306
void ReadVariableStep(void)
Definition: wig2table.cpp:1213
CRef< CSeq_graph > MakeGraph(void)
Definition: wig2table.cpp:733
void x_Error(const char *msg)
Definition: wig2table.cpp:840
void ReadFixedStep(void)
Definition: wig2table.cpp:1160
SStat x_PreprocessValues(void)
Definition: wig2table.cpp:513
void DumpChromValues(void)
Definition: wig2table.cpp:1085
virtual void Init(void)
Initialize the application.
Definition: wig2table.cpp:427
void AddValue(const SValueInfo &value)
Definition: wig2table.cpp:395
double EstimateSize(size_t rows, bool fixed_span) const
Definition: wig2table.cpp:499
bool x_TryGetDoubleSimple(double &v)
Definition: wig2table.cpp:951
CRef< CSeq_table > MakeTable(void)
Definition: wig2table.cpp:616
map< string, string > TTrackParams
Definition: wig2table.cpp:410
AutoPtr< CWigBufferedLineReader > m_Input
Definition: wig2table.cpp:305
CTempStringEx m_CurLine
Definition: wig2table.cpp:308
CRef< CSeq_id > x_MakeChromId(void)
Definition: wig2table.cpp:592
CRef< CSeq_annot > MakeAnnot(void)
Definition: wig2table.cpp:785
CT_POS_TYPE m_InputPos
Definition: wig2table.cpp:89
AutoPtr< IReader > m_Reader
Definition: wig2table.cpp:80
Uint8 GetLineNumber(void) const
Definition: wig2table.cpp:276
CWigBufferedLineReader & operator++(void)
Definition: wig2table.cpp:129
CWigBufferedLineReader(const string &filename)
read from the file, "-" (but not "./-") means standard input
Definition: wig2table.cpp:94
CT_POS_TYPE GetPosition(void) const
Definition: wig2table.cpp:270
bool AtEOF(void) const
Definition: wig2table.cpp:114
virtual ~CWigBufferedLineReader()
Definition: wig2table.cpp:109
AutoArray< char > m_Buffer
Definition: wig2table.cpp:84
CTempStringEx operator*(void) const
Definition: wig2table.cpp:264
CTempStringEx m_Line
Definition: wig2table.cpp:87
virtual void MapObject(CSerialObject &)=0
Map all embedded IDs in a given object at once.
bool empty() const
Definition: map.hpp:149
void clear()
Definition: map.hpp:169
#define true
Definition: bool.h:35
#define false
Definition: bool.h:36
void reset(element_type *p=0, EOwnership ownership=eTakeOwnership)
Reset will delete the old pointer (if owned), set content to the new value, and assume the ownership ...
Definition: ncbimisc.hpp:480
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:832
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1208
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
element_type * get(void) const
Get pointer.
Definition: ncbimisc.hpp:581
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
element_type * release(void)
Release will release ownership of pointer to caller.
Definition: ncbimisc.hpp:472
@ fPreOpen
Open file right away; for eInputFile, eOutputFile, eIOFile.
Definition: ncbiargs.hpp:618
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eDouble
Convertible into a floating point number (double)
Definition: ncbiargs.hpp:594
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eOutputFile
Name of file (must be writable)
Definition: ncbiargs.hpp:596
#define ERR_FATAL(message)
Posting fatal error and abort.
Definition: ncbidiag.hpp:240
EDiagSev SetDiagPostLevel(EDiagSev post_sev=eDiag_Error)
Set the threshold severity for posting the messages.
Definition: ncbidiag.cpp:6132
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define LOG_POST(message)
This macro is deprecated and it's strongly recomended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
void Warning(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1191
#define THROW1_TRACE(exception_class, exception_arg)
Throw trace.
Definition: ncbiexpt.hpp:417
ESerialDataFormat
Data file format.
Definition: serialdef.hpp:71
@ eSerial_AsnText
ASN.1 text.
Definition: serialdef.hpp:73
@ eSerial_Xml
XML.
Definition: serialdef.hpp:75
@ eSerial_AsnBinary
ASN.1 binary.
Definition: serialdef.hpp:74
void SetEmpty(TEmpty &v)
Definition: Seq_loc.hpp:981
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
static CObjectOStream * Open(ESerialDataFormat format, CNcbiOstream &outStream, bool deleteOutStream)
Create serial object writer and attach it to an output stream.
Definition: objostr.cpp:126
uint64_t Uint8
8-byte (64-bit) unsigned integer
Definition: ncbitype.h:105
ERW_Result
Result codes for I/O operations.
#define CT_OFF_TYPE
Definition: ncbistre.hpp:731
virtual ERW_Result Read(void *buf, size_t count, size_t *bytes_read=0)=0
Read as many as "count" bytes into a buffer pointed to by the "buf" argument.
#define CT_POS_TYPE
Definition: ncbistre.hpp:730
@ eRW_NotImplemented
Action / information is not available.
@ eRW_Eof
End of data, should be considered permanent.
@ eRW_Error
Unrecoverable error, no retry possible.
@ eRW_Timeout
Timeout expired, try again later.
@ eRW_Success
Everything is okay, I/O completed.
CTempStringEx substr(size_type pos) const
Obtain a substring from this string, beginning at a given offset.
Definition: tempstr.hpp:1010
const char * data(void) const
Return a pointer to the array represented.
Definition: tempstr.hpp:313
bool empty(void) const
Return true if the represented string is empty (i.e., the length is zero)
Definition: tempstr.hpp:334
void clear(void)
Clear value to an empty string.
Definition: tempstr.hpp:1003
static unsigned int StringToUInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to unsigned int.
Definition: ncbistr.cpp:642
bool HasZeroAtEnd(void) const
Definition: tempstr.hpp:1036
CTempStringEx & assign(const char *str, size_type len)
Assign new values to the content of the a string.
Definition: tempstr.hpp:970
size_type size(void) const
Return the length of the represented array.
Definition: tempstr.hpp:327
const_iterator begin() const
Return an iterator to the string's starting position.
Definition: tempstr.hpp:299
TData & SetData(void)
Assign a value to Data data member.
void SetLabel(TLabel &value)
Assign a value to Label data member.
void SetType(TType &value)
Assign a value to Type data member.
void SetData(TData &value)
Assign a value to Data data member.
void SetHeader(THeader &value)
Assign a value to Header data member.
void SetData(TData &value)
Assign a value to Data data member.
void SetDefault(TDefault &value)
Assign a value to Default data member.
void SetTo(TTo value)
Assign a value to To data member.
void SetId(TId &value)
Assign a value to Id data member.
void SetFrom(TFrom value)
Assign a value to From data member.
@ e_Local
local use
Definition: Seq_id_.hpp:95
void SetA(TA value)
Assign a value to A data member.
void SetMin(TMin value)
Assign a value to Min data member.
void SetTitle(const TTitle &value)
Assign a value to Title data member.
Definition: Seq_graph_.hpp:784
vector< char > TValues
Definition: Byte_graph_.hpp:89
void SetNumval(TNumval value)
Assign a value to Numval data member.
void SetComp(TComp value)
Assign a value to Comp data member.
TValues & SetValues(void)
Assign a value to Values data member.
void SetGraph(TGraph &value)
Assign a value to Graph data member.
Definition: Seq_graph_.cpp:250
void SetB(TB value)
Assign a value to B data member.
void SetMax(TMax value)
Assign a value to Max data member.
void SetLoc(TLoc &value)
Assign a value to Loc data member.
Definition: Seq_graph_.cpp:224
void SetAxis(TAxis value)
Assign a value to Axis data member.
TTitle & SetTitle(void)
Select the variant.
Definition: Annotdesc_.hpp:528
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
void SetDesc(TDesc &value)
Assign a value to Desc data member.
Definition: Seq_annot_.cpp:223
TName & SetName(void)
Select the variant.
Definition: Annotdesc_.hpp:508
TUser & SetUser(void)
Select the variant.
Definition: Annotdesc_.cpp:190
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
<!DOCTYPE HTML >< html > n< header > n< title > PubSeq Gateway Help Page</title > n< style > n table
int i
int len
range(_Ty, _Ty) -> range< _Ty >
constexpr auto sort(_Init &&init)
constexpr bool empty(list< Ts... >) noexcept
const struct ncbi::grid::netcache::search::fields::SIZE size
Magic spell ;-) needed for some weird compilers... very empiric.
const GenericPointer< typename T::ValueType > T2 value
Definition: pointer.h:1227
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines command line argument related classes.
Defines unified interface to application:
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
@ eRead
Definition: ns_types.hpp:56
static SLJIT_INLINE sljit_ins msg(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
void SetFirstValue(double v)
Definition: wig2table.cpp:368
void AddSpan(TSeqPos span)
Definition: wig2table.cpp:362
void SetFirstSpan(TSeqPos span)
Definition: wig2table.cpp:357
int AsByte(double v) const
Definition: wig2table.cpp:385
bool operator<(const SValueInfo &v) const
Definition: wig2table.cpp:334
#define _ASSERT
else result
Definition: token2.c:20
USING_SCOPE(ncbi)
int main(int argc, const char *argv[])
Definition: wig2table.cpp:1364
static ESerialDataFormat s_GetFormat(const string &name)
Definition: wig2table.cpp:484
Modified on Fri Sep 20 14:57:59 2024 by modify_doxy.py rev. 669887