NCBI C++ ToolKit
microarray_reader.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: microarray_reader.cpp 93579 2021-05-01 20:54:52Z stakhovv $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Frank Ludwig
27  *
28  * File Description:
29  * MicroArray file reader
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <util/line_reader.hpp>
35 
45 
47 
49 
51 BEGIN_objects_SCOPE
52 
// Constructor for the microarray reader (the line carrying the qualified
// constructor name is not visible at this point in the file).
//
// Parameters:
//   flags - reader flags, forwarded unchanged to CReaderBase.
//   pRL   - reader listener, forwarded unchanged to CReaderBase.
//
// CReaderBase is additionally given two empty strings and CReadUtil::AsSeqId.
// Member initial state set here:
//   m_currentId   - empty string
//   m_columncount - 15
//   m_usescore    - false
53 // ----------------------------------------------------------------------------
55  int flags,
56  CReaderListener* pRL)
57 // ----------------------------------------------------------------------------
58  : CReaderBase(flags, "", "", CReadUtil::AsSeqId, pRL),
59  m_currentId(""),
60  m_columncount(15),
61  m_usescore(false)
62 {
64 }
65 
// Member with an empty body. NOTE(review): its declarator line is not visible
// here; given its position directly after the constructor this is presumably
// the destructor -- confirm against the class declaration.
66 // ----------------------------------------------------------------------------
68 // ----------------------------------------------------------------------------
69 {
70 }
71 
// Reads one Seq-annot from the line reader `lr`; `pEC` is the line error
// listener supplied by the caller.
//
// The statement that produces `pAnnot` is not visible here (presumably a call
// into the base-class reader -- TODO confirm). When `pAnnot` is non-null this
// override:
//   1. calls xAssignTrackData() on the annot, and
//   2. if m_columncount >= 3, appends an annot descriptor holding a user
//      object of type "NCBI_BED_COLUMN_COUNT" whose single field of the same
//      name carries m_columncount converted to int.
//
// Returns `pAnnot` as is (it may be null, in which case nothing is attached).
72 // ----------------------------------------------------------------------------
75  ILineReader& lr,
76  ILineErrorListener* pEC)
77 // ----------------------------------------------------------------------------
78 {
80  if (pAnnot) {
81  xAssignTrackData(*pAnnot);
82

    // m_columncount is initialized to 15 by the constructor; no other
    // assignment to it is visible in this file.
83  if(m_columncount >= 3) {
84  CRef<CUser_object> columnCountUser( new CUser_object() );
85  columnCountUser->SetType().SetStr( "NCBI_BED_COLUMN_COUNT" );
86  columnCountUser->AddField("NCBI_BED_COLUMN_COUNT", int ( m_columncount ) );
87

    // The descriptor receives a copy (Assign) of the user object.
88  CRef<CAnnotdesc> userDesc( new CAnnotdesc() );
89  userDesc->SetUser().Assign( *columnCountUser );
90  pAnnot->SetDesc().Set().push_back( userDesc );
91  }
92  }
93  return pAnnot;
94 }
95 
// Creates the Seq-annot that features are later added to.
//
// The statements that construct `pAnnot` and `desc` are not visible here.
// What is visible: `desc` is installed as the annot's descriptor set,
// SetData().SetFtable() is invoked on the annot (xProcessFeature later pushes
// features into that same Ftable container), and the annot is returned.
96 // ----------------------------------------------------------------------------
99 // ----------------------------------------------------------------------------
100 {
103  pAnnot->SetDesc(*desc);
104  pAnnot->SetData().SetFtable();
105  return pAnnot;
106 }
107 
108 
// Dispatches the buffered input lines in `readerData` against `annot`.
//
// For each entry, the line text (lineInfo.mData) is offered in order to:
//   1. xParseBrowserLine(line, annot)
//   2. xProcessTrackLine(line)
//   3. xProcessFeature(line, annot)
// The first of steps 1-2 that returns true ends the whole call.
//
// NOTE(review): steps 1-2 use `return`, not `continue`, so any lines after a
// browser/track line in the same readerData are not processed. xGetData in
// this file stores at most one line per call, so this presumably never
// matters in practice -- confirm if readerData can be filled elsewhere.
109 // ----------------------------------------------------------------------------
110 void
112  const TReaderData& readerData,
113  CSeq_annot& annot)
114 // ----------------------------------------------------------------------------
115 {
116  for (const auto& lineInfo: readerData) {
117  const auto& line = lineInfo.mData;
118  if (xParseBrowserLine(line, annot)) {
119  return;
120  }
121  if (xProcessTrackLine(line)) {
122  return;
123  }
124  xProcessFeature(line, annot);
125  }
126 }
127 
// Pulls the next input line from `lr` into `readerData`, storing at most one
// line per call, and maintains the grouping state in m_currentId and
// m_uDataCount.
//
// `readerData` is always cleared first. It is left empty when:
//   - m_uDataCount has reached MAX_RECORDS (nothing is read),
//   - xGetLine() yields no line,
//   - a track line arrives while m_currentId is set (line is pushed back), or
//   - a data line's leading tab-delimited field differs from m_currentId
//     (line is pushed back).
// In the first, third and fourth case m_uDataCount is reset to 0 and
// m_currentId is cleared. NOTE(review): an empty readerData presumably tells
// the caller that the current batch is complete -- confirm in CReaderBase.
//
// Otherwise the line is stored as TReaderLine{m_uLineNumber, line} and
// m_uDataCount is incremented.
128 // ----------------------------------------------------------------------------
129 void
131  ILineReader& lr,
132  TReaderData& readerData)
133 // ----------------------------------------------------------------------------
134 {
    // Upper bound on the number of lines stored before the counter is reset.
135  const int MAX_RECORDS = 100000;
136
137  readerData.clear();
138  if (m_uDataCount == MAX_RECORDS) {
139  m_uDataCount = 0;
140  m_currentId.clear();
141  return;
142  }
143

    // `tail` only receives the second half of the split below; it is not
    // read afterwards.
144  string line, head, tail;
145  if (!xGetLine( lr, line)) {
146  return;
147  }
148  if (xIsTrackLine(line)) {
    // A track line seen while an id is active is un-read so that the next
    // call (which starts with m_currentId empty) picks it up again.
149  if (!m_currentId.empty()) {
150  xUngetLine(lr);
151  m_uDataCount = 0;
152  m_currentId.clear();
153  return;
154  }
155  else {
    // No id active: the track line itself is the stored line. m_currentId
    // stays empty on this path.
156  readerData.push_back(TReaderLine{m_uLineNumber, line});
157  ++m_uDataCount;
158  return;
159  }
160  }
161

    // `head` is the text before the first tab; it is the grouping key that is
    // compared with, and remembered in, m_currentId.
162  NStr::SplitInTwo(line, "\t", head, tail);
163  if (!m_currentId.empty() && head != m_currentId) {
164  xUngetLine(lr);
165  m_uDataCount = 0;
166  m_currentId.clear();
167  return;
168  }
169  readerData.push_back(TReaderLine{m_uLineNumber, line});
170  if (m_currentId.empty()) {
171  m_currentId = head;
172  }
173  ++m_uDataCount;
174 }
175 
// Converts one data line into a CSeq_feat and appends it to the feature table
// of `annot`.
//
// The line is split on spaces and tabs (adjacent delimiters merged), the
// columns are normalized by xCleanColumnValues(), and exactly COLUMNCOUNT (15)
// columns are then required. On any other count the object `error` is thrown;
// its declaration is only partly visible here (severity eDiag_Error and the
// message text below).
//
// Returns true after the feature has been appended; there is no visible path
// that returns false.
//
// NOTE(review): this splits on both ' ' and '\t', whereas xGetData derives its
// grouping key from the text before the first '\t' only. For space-delimited
// input the two would disagree about what the first column is -- confirm
// which delimiters the input is expected to use.
176 // ----------------------------------------------------------------------------
178  const string& line,
179  CSeq_annot& annot)
180 // ----------------------------------------------------------------------------
181 {
182  const size_t COLUMNCOUNT = 15;
183
184  vector<string> fields;
185  NStr::Split(line, " \t", fields, NStr::fSplit_MergeDelimiters);
186  xCleanColumnValues(fields);
187  if (fields.size() != COLUMNCOUNT) {
189  eDiag_Error,
191  "Feature Processing: Bad column count. Should be 15." );
192  throw(error);
193  }
194

    // From here on fields[0..14] are guaranteed to exist, which the two
    // helpers below rely on for their unchecked indexing.
195  CRef<CSeq_feat> feature;
196  feature.Reset(new CSeq_feat);
197  xSetFeatureLocation(feature, fields);
198  xSetFeatureDisplayData(feature, fields);
199  annot.SetData().SetFtable().push_back(feature);
200  return true;
201 }
202 
// Replaces the location of `feature` with an interval built from `fields`.
//
// Columns read (indexed without bounds checks; the only caller visible in
// this file, xProcessFeature, has already verified that 15 columns exist):
//   fields[0] - used verbatim as a local Seq-id string
//   fields[1] - interval "from", parsed with NStr::StringToInt
//   fields[2] - interval "to", parsed with NStr::StringToInt, minus 1
//               (presumably converting an exclusive end to an inclusive one
//               -- TODO confirm against the input format)
//   fields[5] - strand: exactly "+" gives eNa_strand_plus, anything else
//               gives eNa_strand_minus
//
// The declaration of `location` is not visible here.
// NOTE(review): a strand column holding something other than "+" or "-"
// (e.g. "." or an empty/garbled value) is silently treated as minus.
203 // ----------------------------------------------------------------------------
205  CRef<CSeq_feat>& feature,
206  const vector<string>& fields )
207 // ----------------------------------------------------------------------------
208 {
209  feature->ResetLocation();
210
211  CRef<CSeq_id> id( new CSeq_id() );
212  id->SetLocal().SetStr( fields[0] );
213
215  CSeq_interval& interval = location->SetInt();
216  interval.SetFrom( NStr::StringToInt( fields[1] ) );
217  interval.SetTo( NStr::StringToInt( fields[2] ) - 1 );
218  interval.SetStrand(
219  ( fields[5] == "+" ) ? eNa_strand_plus : eNa_strand_minus );
220  location->SetId( *id );
221
222  feature->SetLocation( *location );
223 }
224 
// Builds a "Display Data" user object from `fields` and installs it as the
// data of `feature`.
//
// Field name   <- column (conversion)
//   name        <- fields[3]  (string)
//   score       <- fields[4]  (int), used when m_usescore is false
//   greylevel   <- fields[4]  (int), used when m_usescore is true
//   thickStart  <- fields[6]  (int)
//   thickEnd    <- fields[7]  (int, minus 1)
//   itemRGB     <- fields[8]  (int)
//   blockCount  <- fields[9]  (int)
//   blockSizes  <- fields[10] (string)
//   blockStarts <- fields[11] (string)
// and, only when the fReadAsBed flag is NOT set in m_iFlags:
//   expCount    <- fields[12] (int)
//   expIds      <- fields[13] (string)
//   expStep     <- fields[14] (int)
//
// Integer conversions use NStr::StringToInt with default flags.
// NOTE(review): "score" is emitted when m_usescore is false and "greylevel"
// when it is true, which reads as inverted relative to the flag's name --
// confirm this is intended.
// NOTE(review): itemRGB is parsed as one integer; a comma-separated colour
// triple would presumably fail that conversion -- confirm the expected input.
225 // ----------------------------------------------------------------------------
227  CRef<CSeq_feat>& feature,
228  const vector<string>& fields )
229 // ----------------------------------------------------------------------------
230 {
231  CRef<CUser_object> display_data( new CUser_object );
232  display_data->SetType().SetStr( "Display Data" );
233

    // Columns 3..11 are indexed unconditionally; the only caller visible in
    // this file (xProcessFeature) has already required exactly 15 columns.
234  display_data->AddField( "name", fields[3] );
235  if ( !m_usescore ) {
236  display_data->AddField( "score", NStr::StringToInt(fields[4]) );
237  }
238  else {
239  display_data->AddField( "greylevel", NStr::StringToInt(fields[4]) );
240  }
241  display_data->AddField( "thickStart", NStr::StringToInt(fields[6]) );
242  display_data->AddField( "thickEnd", NStr::StringToInt(fields[7]) - 1 );
243  display_data->AddField( "itemRGB", NStr::StringToInt(fields[8]) );
244  display_data->AddField( "blockCount", NStr::StringToInt(fields[9]) );
245  display_data->AddField( "blockSizes", fields[10] );
246  display_data->AddField( "blockStarts", fields[11] );
247

    // The size checks below are always satisfied when called from
    // xProcessFeature (15 columns enforced there); they only matter for a
    // caller that passes fewer columns.
248  if ( !(m_iFlags & fReadAsBed) ) {
249  if ( fields.size() >= 13 ) {
250  display_data->AddField( "expCount", NStr::StringToInt(fields[12]) );
251  }
252  if ( fields.size() >= 14 ) {
253  display_data->AddField( "expIds", fields[13] );
254  }
255  if ( fields.size() >= 15 ) {
256  display_data->AddField( "expStep", NStr::StringToInt(fields[14]) );
257  }
258  }
259
260  feature->SetData().SetUser( *display_data );
261 }
262 
// Handles a candidate track line.
//
// Resets m_strExpNames, m_iExpScale and m_iExpStep to their "unset" values
// ("" / -1 / -1), then delegates to CReaderBase::xParseTrackLine(strLine).
//
// Returns:
//   false - the base class did not accept the line as a track line.
//   true  - it did. When the fReadAsBed flag is set in m_iFlags nothing more
//           is checked; otherwise one message per still-unset parameter
//           (expName, expScale, expStep) is reported through
//           m_pMessageHandler. These are reported, not thrown, so the
//           function still returns true.
//
// The constructor lines of the three `error` objects are not visible here, so
// their type and severity cannot be stated from this view.
// NOTE(review): nothing visible in this function assigns the three members
// between the reset and the checks. Unless CReaderBase::xParseTrackLine (not
// shown) fills them in, all three messages are reported for every track line
// in non-BED mode -- confirm where these values are populated.
263 // ----------------------------------------------------------------------------
265  const string& strLine)
266 // ----------------------------------------------------------------------------
267 {
268  m_strExpNames = "";
269  m_iExpScale = -1;
270  m_iExpStep = -1;
271
272  if (!CReaderBase::xParseTrackLine(strLine)) {
273  return false;
274  }
275  if ( m_iFlags & fReadAsBed ) {
276  return true;
277  }
278
279  if ( m_strExpNames.empty() ) {
283  "Track Line Processing: Missing \"expName\" parameter.");
284  m_pMessageHandler->Report(error);
285  }
286  if ( m_iExpScale == -1 ) {
290  "Track Line Processing: Missing \"expScale\" parameter." );
291  m_pMessageHandler->Report(error);
292  }
293  if ( m_iExpStep == -1 ) {
297  "Track Line Processing: Missing \"expStep\" parameter." );
298  m_pMessageHandler->Report(error);
299  }
300
301  return true;
302 }
303 
// Normalizes the leading columns of a split data line in place.
//
//   1. With more than one column, if column 0 equals "chr" (case-insensitive)
//      it is prepended to column 1 and then erased, so e.g. {"chr", "1", ...}
//      becomes {"chr1", ...}.
//   2. Commas are stripped from columns[1] (reported as "SeqStart" on error).
//   3. Commas are stripped from columns[2] (reported as "SeqStop" on error).
//
// On a caught CException a new `error` object (severity eDiag_Error, second
// argument 0, message as below) is thrown; the line naming its type is not
// visible here.
//
// NOTE(review): `columnCount` is captured before the erase in step 1 and is
// never refreshed, so the guards before steps 2 and 3 test the pre-merge
// count. They are strict enough to keep the indexing in range even after an
// erase, but when no "chr" merge happened they are one too strict: a line
// with exactly 3 columns returns before column 3 is cleaned, and a line with
// exactly 2 columns returns before column 2 is cleaned. The only caller
// visible in this file requires 15 columns afterwards, so this presumably has
// no observable effect there -- confirm before reusing this helper elsewhere.
// NOTE(review): it is not evident from this file that NStr::Replace can throw
// for these arguments; the catch blocks may be unreachable -- confirm.
304 // ----------------------------------------------------------------------------
305 void
307  vector<string>& columns)
308 // ----------------------------------------------------------------------------
309 {
310  string fixup;
311  auto columnCount = columns.size();
312
313  if (columnCount <= 1) {
314  return;
315  }
316  if (NStr::EqualNocase(columns[0], "chr")) {
317  columns[1] = columns[0] + columns[1];
318  columns.erase(columns.begin());
319  }
320

    // columns.size() may now be columnCount - 1; columnCount is stale.
321  if (columnCount <= 2) {
322  return;
323  }
324  try {
325  NStr::Replace(columns[1], ",", "", fixup);
326  columns[1] = fixup;
327  }
328  catch (CException&) {
330  eDiag_Error,
331  0,
332  "Bad data line: Invalid \"SeqStart\" (column 2) value." );
333  throw(error);
334  }
335
336  if (columnCount <= 3) {
337  return;
338  }
339  try {
340  NStr::Replace(columns[2], ",", "", fixup);
341  columns[2] = fixup;
342  }
343  catch (CException&) {
345  eDiag_Error,
346  0,
347  "Bad data line: Invalid \"SeqStop\" (column 3) value." );
348  throw(error);
349  }
350 }
351 
352 END_objects_SCOPE
User-defined methods of the data storage class.
User-defined methods of the data storage class.
CAnnot_descr –.
Definition: Annot_descr.hpp:66
CAnnotdesc –.
Definition: Annotdesc.hpp:66
CRef< CSeq_annot > ReadSeqAnnot(ILineReader &, ILineErrorListener *=nullptr) override
Read an object from a given line reader, render it as a single Seq-annot, if possible.
CRef< CSeq_annot > xCreateSeqAnnot() override
bool xProcessFeature(const string &, CSeq_annot &)
CMicroArrayReader(int=fDefaults, CReaderListener *=nullptr)
void xSetFeatureLocation(CRef< CSeq_feat > &, const vector< string > &)
static void xCleanColumnValues(vector< string > &)
void xSetFeatureDisplayData(CRef< CSeq_feat > &, const vector< string > &)
vector< string >::size_type m_columncount
virtual bool xProcessTrackLine(const string &)
void xProcessData(const TReaderData &, CSeq_annot &) override
void xGetData(ILineReader &, TReaderData &) override
Common file reader utility functions.
Definition: read_util.hpp:47
Defines and provides stubs for a general interface to a variety of file readers.
Definition: reader_base.hpp:63
unique_ptr< CReaderMessageHandler > m_pMessageHandler
virtual bool xUngetLine(ILineReader &)
virtual CRef< CSeq_annot > xCreateSeqAnnot()
unsigned int m_uDataCount
unsigned int m_uLineNumber
virtual bool xParseBrowserLine(const string &, CSeq_annot &)
vector< TReaderLine > TReaderData
Definition: reader_base.hpp:70
virtual bool xGetLine(ILineReader &, string &)
TReaderFlags m_iFlags
virtual bool xParseTrackLine(const string &)
virtual void xAssignTrackData(CSeq_annot &)
virtual CRef< CSeq_annot > ReadSeqAnnot(CNcbiIstream &istr, ILineErrorListener *pErrors=nullptr)
Read an object from a given input stream, render it as a single Seq-annot.
struct SReaderLine { SReaderLine(unsigned int line, string data):mLine(line), mData(data) {} TReaderLine
Definition: reader_base.hpp:66
virtual bool xIsTrackLine(const CTempString &)
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CUser_object & AddField(const string &label, const string &value, EParseField parse=eParse_String)
add a data field to the user object that holds a given value
Abstract base class for lightweight line-by-line reading.
Definition: line_reader.hpp:54
#define head
Definition: ct_nlmzip_i.h:138
static uch flags
#define false
Definition: bool.h:36
static const char location[]
Definition: config.c:97
static const column_t columns[]
Definition: utf8_2.c:22
@ eDiag_Error
Error message.
Definition: ncbidiag.hpp:653
@ eDiag_Warning
Warning message.
Definition: ncbidiag.hpp:652
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Set object to copy of another one.
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3461
static string & Replace(const string &src, const string &search, const string &replace, string &dst, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3314
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
Definition: ncbistr.cpp:3554
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5353
@ fSplit_MergeDelimiters
Merge adjacent delimiters.
Definition: ncbistr.hpp:2498
void SetType(TType &value)
Assign a value to Type data member.
void SetLocation(TLocation &value)
Assign a value to Location data member.
Definition: Seq_feat_.cpp:131
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_feat_.cpp:94
void ResetLocation(void)
Reset Location data member.
Definition: Seq_feat_.cpp:122
void SetTo(TTo value)
Assign a value to To data member.
void SetFrom(TFrom value)
Assign a value to From data member.
void SetStrand(TStrand value)
Assign a value to Strand data member.
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
void SetData(TData &value)
Assign a value to Data data member.
Definition: Seq_annot_.cpp:244
void SetDesc(TDesc &value)
Assign a value to Desc data member.
Definition: Seq_annot_.cpp:223
TUser & SetUser(void)
Select the variant.
Definition: Annotdesc_.cpp:190
Lightweight interface for getting lines of data with minimal memory copying.
Modified on Wed Jun 19 17:01:39 2024 by modify_doxy.py rev. 669887