NCBI C++ ToolKit
format_guess_ex.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: format_guess_ex.cpp 98242 2022-10-17 14:08:26Z ludwigf $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Nathan Bouk
27  *
28  * File Description:
29  * Wrapper and extension to CFormatGuess, using actual file readers
30  * when CFormatGuess fails
31  *
32  */
33 
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbistd.hpp>
36 #include <corelib/ncbiapp.hpp>
37 #include <corelib/ncbithr.hpp>
38 #include <corelib/ncbiutil.hpp>
39 #include <corelib/ncbiexpt.hpp>
40 #include <corelib/stream_utils.hpp>
41 
42 #include <util/static_map.hpp>
43 #include <util/line_reader.hpp>
44 
45 #include <serial/iterator.hpp>
46 #include <serial/objistrasn.hpp>
47 #include <serial/objistrasnb.hpp>
48 #include <serial/objistrxml.hpp>
49 #include <serial/objistrjson.hpp>
50 
52 
56 #include <objects/seq/Bioseq.hpp>
58 
59 //#include <objtools/hgvs/hgvs_parser.hpp>
70 
71 
72 #define NCBI_USE_ERRCODE_X Objtools_Rd_RepMask
73 
75 using namespace ncbi;
76 using namespace objects;
77 using namespace std;
78 
87 };
88 
89 
91  m_Guesser(new CFormatGuess),
92  m_pEffectiveRecognizedGenbankObjectTypes(&sDefaultRecognizedGenbankObjectTypes)
93 {
94 }
95 
96 
97 CFormatGuessEx::CFormatGuessEx(const string& FileName) :
98  m_Guesser(new CFormatGuess(FileName)),
99  m_pEffectiveRecognizedGenbankObjectTypes(&sDefaultRecognizedGenbankObjectTypes)
100 {
101  CNcbiIfstream FileIn(FileName.c_str());
102  x_FillLocalBuffer(FileIn);
103 }
104 
105 
107  m_Guesser(new CFormatGuess(In)),
108  m_pEffectiveRecognizedGenbankObjectTypes(&sDefaultRecognizedGenbankObjectTypes)
109 {
110  x_FillLocalBuffer(In);
111 }
112 
113 
115 {
116 }
117 
118 
120 {
// Two-stage guess: ask the plain CFormatGuess first, and only when it
// answers eUnknown try the reader-based checks in CheckOrder order.
121  CFormatGuess::EFormat Guess;
122  Guess = m_Guesser->GuessFormat();
123 
// The initial guess is logged as its raw integer value.
124  ERR_POST(Info << " CFormatGuessEx:: Initial CFormatGuess: " << (int)Guess);
125 
126  if(Guess != CFormatGuess::eUnknown) {
127  return Guess;
128  }
129  else {
// NOTE(review): the active entries of CheckOrder are not visible in this
// copy of the listing; only the commented-out entries remain.
130  CFormatGuess::EFormat CheckOrder[] = {
131  //CFormatGuess::eRmo
133  //case CFormatGuess::eXml:
138  //case CFormatGuess::eTextAsn:
142  //CFormatGuess::eHgvs
143  };
// Element count: total array size divided by the size of one EFormat value.
144  constexpr size_t checkCount = sizeof(CheckOrder) / sizeof(CFormatGuess::eAgp);
145  for(size_t loop = 0; loop < checkCount; loop++ ) {
146  auto CheckFormat = CheckOrder[loop];
// A format is tried only if it is enabled in m_Guesser; the first format
// whose reader-based check succeeds is returned.
147  if (m_Guesser->IsEnabled(CheckFormat) && x_TryFormat(CheckFormat)) {
148  return CheckFormat;
149  }
150  }
151  return CFormatGuess::eUnknown;
152  }
153 }
154 
155 
157 {
158  bool TestResult = m_Guesser->TestFormat(Format);
159 
160  if(TestResult) {
161  return true;
162  }
163  else {
164  return x_TryFormat(Format);
165  }
166 }
167 
168 
169 
171 {
172  m_LocalBuffer.str().clear();
173  m_LocalBuffer.clear();
174 
175  streamsize Total = 0;
176  while(!In.eof()) {
177  char buff[4096];
178  In.read(buff, sizeof(buff));
179  streamsize count = In.gcount();
180  if(count == 0)
181  break;
182  m_LocalBuffer.write(buff, count);
183  Total += count;
184  if(Total >= (1024*1024))
185  break;
186  }
187 
188  CStreamUtils::Pushback(In, m_LocalBuffer.str().c_str(), Total);
189  In.clear();
190 
191  return true;
192 }
193 
194 
196 {
// Dispatch Format to the matching reader-based x_Try*() check; any format
// without a check here yields false.
// NOTE(review): the case labels in front of the x_TryWiggle(), x_TryBed15()
// and x_TryFasta() returns are not visible in this copy of the listing.
197  switch(Format) {
198 
199  //case CFormatGuess::eBinaryAsn:
200  // return x_TryBinaryAsn();
201  case CFormatGuess::eRmo:
202  return x_TryRmo();
203  case CFormatGuess::eAgp:
204  return x_TryAgp();
205  //case CFormatGuess::eXml:
206  // return x_TryXml();
208  return x_TryWiggle();
209  case CFormatGuess::eBed:
210  return x_TryBed();
212  return x_TryBed15();
214  return x_TryFasta();
215  //case CFormatGuess::eTextAsn:
216  // return x_TryTextAsn();
217  case CFormatGuess::eGtf:
218  return x_TryGtf();
219  case CFormatGuess::eGff3:
220  return x_TryGff3();
221  case CFormatGuess::eGff2:
222  return x_TryGff2();
223  //case CFormatGuess::eHgvs:
224  // return x_TryHgvs();
225 
226  default:
227  return false;
228  };
229 }
230 
231 
232 // bool x_TryBinaryAsn();
233 
235 {
// Rewind the cached input: clear stale stream state, then seek to the start.
236  m_LocalBuffer.clear();
237  m_LocalBuffer.seekg(0);
238 
// NOTE(review): the initializer of Flags (listing lines 240-241) is not
// visible in this copy.
239  CRmReader::TFlags Flags =
242  CRef<CSerialObject> Result;
243 
244  unique_ptr<CRepeatMaskerReader> reader(new CRepeatMaskerReader(Flags));
245  try
246  {
247  Result = reader->ReadObject(m_LocalBuffer);
248  }
// Any exception from the reader is swallowed; Result then stays empty.
249  catch(...)
250  {
251  }
252 
// The format is accepted when the reader produced an object.
253  return Result.NotEmpty();
254 }
255 
257 {
// Rewind the cached input: clear stale stream state, then seek to the start.
258  m_LocalBuffer.clear();
259  m_LocalBuffer.seekg(0);
260 
// NOTE(review): the declaration of Bioseqs (listing line 261) is not visible
// in this copy.
262  try {
263  CAgpToSeqEntry agp_reader;
// A non-zero return from ReadStream() rejects the format immediately.
264  if( 0 != agp_reader.ReadStream(m_LocalBuffer) ) {
265  return false;
266  }
267  Bioseqs.swap( agp_reader.GetResult() );
// Exceptions are swallowed; Bioseqs then keeps whatever it held before.
268  } catch(CException&) {
269  } catch(...) {
270  }
271 
// Accepted only if the reader's results were swapped in and are non-empty.
272  return (!Bioseqs.empty());
273 }
274 
275 // bool x_TryXml();
276 
278 {
// Rewind the cached input: clear stale stream state, then seek to the start.
279  m_LocalBuffer.clear();
280  m_LocalBuffer.seekg(0);
281 
282  int WiggleCount = 0;
283 
// NOTE(review): the declaration of Reader (listing line 284) is not visible
// in this copy.
285  CStreamLineReader LineReader(m_LocalBuffer);
286 
287  CRef<CSeq_annot> Annot;
288  try {
289  Annot = Reader.ReadSeqAnnot(LineReader);
// Reader exceptions are swallowed; Annot then stays null.
290  } catch(CException&) {
291  } catch(...) {
292  }
293 
// Count the annot only if it exists, has data, and that data is an Ftable.
294  if (!Annot.IsNull() &&
295  Annot->CanGetData() &&
296  Annot->GetData().IsFtable())
297  WiggleCount++;
298 
299  return (WiggleCount > 0);
300 }
301 
303 {
// Rewind the cached input: clear stale stream state, then seek to the start.
304  m_LocalBuffer.clear();
305  m_LocalBuffer.seekg(0);
306 
307  int BedCount = 0;
308 
// NOTE(review): the declaration of Reader (listing line 309) is not visible
// in this copy.
310  CStreamLineReader LineReader(m_LocalBuffer);
311 
312  list<CRef<CSeq_annot> > LocalAnnots;
313  try {
314  Reader.ReadSeqAnnots(LocalAnnots, LineReader);
// Reader exceptions are swallowed; annots collected before the failure are
// still examined below.
315  } catch(CException&) {
316  } catch(...) {
317  }
318 
// Count every non-null annot whose data is an Ftable.
319  ITERATE(list<CRef<CSeq_annot> >, AnnotIter, LocalAnnots) {
320  if(!AnnotIter->IsNull() && (*AnnotIter)->CanGetData() &&
321  (*AnnotIter)->GetData().IsFtable())
322  BedCount++;
323  }
324 
// One qualifying annot is enough to accept the format.
325  return (BedCount > 0);
326 }
327 
329 {
// Rewind the cached input: clear stale stream state, then seek to the start.
330  m_LocalBuffer.clear();
331  m_LocalBuffer.seekg(0);
332 
333  int Bed15Count = 0;
334 
// NOTE(review): the declaration of Reader (listing line 335) is not visible
// in this copy.
336  CStreamLineReader LineReader(m_LocalBuffer);
337 
338  CRef<CSeq_annot> Annot;
339  try {
340  Annot = Reader.ReadSeqAnnot(LineReader);
// Reader exceptions are swallowed; Annot then stays null.
341  } catch(CException&) {
342  } catch(...) {
343  }
344 
// Count the annot only if it exists, has data, and that data is an Ftable.
345  if (!Annot.IsNull() &&
346  Annot->CanGetData() &&
347  Annot->GetData().IsFtable())
348  Bed15Count++;
349 
350  return (Bed15Count > 0);
351 }
352 
354 {
355  m_LocalBuffer.clear();
356  m_LocalBuffer.seekg(0);
357 
358  CRef<CSeq_entry> Result;
359  try {
360  CFastaReader Reader(m_LocalBuffer);
361  Result = Reader.ReadSet(1);
362  }
363  catch(...) {
364  return false;
365  }
366 
367  //
368  return (Result && Result->IsSet() && !Result->GetSet().GetSeq_set().empty());
369 }
370 
371 // bool x_TryTextAsn();
372 
374 {
375  m_LocalBuffer.clear();
376  m_LocalBuffer.seekg(0);
377 
378  int GtfCount = 0;
379 
380  CGtfReader Reader(0);
381  CStreamLineReader LineReader(m_LocalBuffer);
382 
383  CGtfReader::TAnnots LocalAnnots;
384  try {
385  Reader.ReadSeqAnnots(LocalAnnots, LineReader);
386  } catch(CException&) {
387  } catch(...) {
388  }
389 
390  ITERATE(CGtfReader::TAnnots, AnnotIter, LocalAnnots) {
391  if(!AnnotIter->IsNull() && (**AnnotIter).CanGetData() &&
392  (**AnnotIter).GetData().IsFtable())
393  GtfCount++;
394  }
395 
396  return (GtfCount > 0);
397 }
398 
400 {
401  m_LocalBuffer.clear();
402  m_LocalBuffer.seekg(0);
403 
404  int Gff3Count = 0;
405 
406  CGff3Reader Reader(0);
407  CStreamLineReader LineReader(m_LocalBuffer);
408 
409  CGff3Reader::TAnnots LocalAnnots;
410  try {
411  Reader.ReadSeqAnnots(LocalAnnots, LineReader);
412  } catch(CException&) {
413  } catch(...) {
414  }
415 
416  ITERATE(CGff3Reader::TAnnots, AnnotIter, LocalAnnots) {
417  if (!AnnotIter->IsNull() && (**AnnotIter).CanGetData() &&
418  (**AnnotIter).GetData().IsFtable())
419  Gff3Count++;
420  }
421 
422  return (Gff3Count > 0);
423 }
424 
426 {
427  m_LocalBuffer.clear();
428  m_LocalBuffer.seekg(0);
429 
430  int Gff2Count = 0;
431 
432  CGff2Reader Reader(0);
433  CStreamLineReader LineReader(m_LocalBuffer);
434 
435  CGff2Reader::TAnnots LocalAnnots;
436  try {
437  Reader.ReadSeqAnnots(LocalAnnots, LineReader);
438  } catch(CException&) {
439  } catch(...) {
440  }
441 
442  ITERATE(CGff2Reader::TAnnots, AnnotIter, LocalAnnots) {
443  if (!AnnotIter->IsNull() && (**AnnotIter).CanGetData() &&
444  (**AnnotIter).IsFtable())
445  Gff2Count++;
446  }
447 
448  return (Gff2Count > 0);
449 }
450 
451 
453  CFormatGuess::EFormat baseFormat)
454 {
// Open an object stream of the kind matching baseFormat over the cached
// input and ask it which of the recognized types the data holds.
455  unique_ptr<CObjectIStream> pObjStream;
// Rewind the cached input: clear stale stream state, then seek to the start.
456  m_LocalBuffer.clear();
457  m_LocalBuffer.seekg(0);
458 
// NOTE(review): the case labels in front of the CObjectIStreamAsn and
// CObjectIStreamAsnBinary branches are not visible in this copy of the listing.
459  switch(baseFormat) {
460  default:
461  return nullptr;
// eNoOwnership: the object stream does not take ownership of m_LocalBuffer.
463  pObjStream.reset(new CObjectIStreamAsn(m_LocalBuffer, eNoOwnership));
464  break;
466  pObjStream.reset(new CObjectIStreamAsnBinary(m_LocalBuffer, eNoOwnership));
467  break;
468  case CFormatGuess::eXml:
469  pObjStream.reset(new CObjectIStreamXml(m_LocalBuffer, eNoOwnership));
470  break;
471  case CFormatGuess::eJSON:
472  pObjStream.reset(new CObjectIStreamJson(m_LocalBuffer, eNoOwnership));
473  break;
474  }
475  if( !pObjStream.get() ) {
476  return nullptr;
477  }
478 
// Only an unambiguous answer is reported: zero or several candidate types
// both yield nullptr.
479  set<TTypeInfo> types = pObjStream->GuessDataType(*m_pEffectiveRecognizedGenbankObjectTypes);
480  if ( types.size() != 1 ) {
481  return nullptr;
482  }
483  return *types.begin();
484 }
485 
487  const set<TTypeInfo>& recognizedGenbankTypes)
488 {
// Only the address of the caller's set is stored, not a copy, so the set
// must remain valid for as long as this object may consult it.
489  m_pEffectiveRecognizedGenbankObjectTypes = &recognizedGenbankTypes;
490 }
491 
493  CFileContentInfo& contentInfo)
494 {
// Guess the format, then construct the matching member of contentInfo in
// place with placement new.  The guessed format is returned in every case.
// NOTE(review): some case labels (before eXml and before the mInfoAlign
// branch) are not visible in this copy of the listing.
495  auto baseFormat = GuessFormat();
496  switch (baseFormat) {
497  default:
498  new(&contentInfo.mInfoNone) CFileContentInfoNone();
499  break;
502  case CFormatGuess::eXml:
503  case CFormatGuess::eJSON:
504  new(&contentInfo.mInfoGenbank) CFileContentInfoGenbank();
// Record the guessed object type; its name is copied only when a type was
// actually identified (xGuessGenbankObjectType may return nullptr).
505  contentInfo.mInfoGenbank.mTypeInfo = xGuessGenbankObjectType(baseFormat);
506  if (contentInfo.mInfoGenbank.mTypeInfo) {
507  contentInfo.mInfoGenbank.mObjectType =
508  contentInfo.mInfoGenbank.mTypeInfo->GetName();
509  }
510  break;
511  case CFormatGuess::eGff3:
512  new(&contentInfo.mInfoGff3) CFileContentInfoGff3();
513  break;
515  new(&contentInfo.mInfoAlign) CFileContentInfoAlign();
516  break;
517  }
518  return baseFormat;
519 }
520 
521 /*
522  bool CFormatGuessEx::x_TryHgvs()
523  {
524  m_LocalBuffer.clear();
525  m_LocalBuffer.seekg(0);
526 
527  CScope* Dummy = NULL;
528  CHgvsParser Parser(*Dummy);
529 
530  int HgvsCount = 0;
531  while(m_LocalBuffer) {
532  string Line;
533  NcbiGetlineEOL(m_LocalBuffer, Line);
534 
535  if(m_LocalBuffer.eof() || Line.empty() || Line[0] == '#')
536  continue;
537 
538  NStr::ReplaceInPlace(Line, "\r", "");
539  NStr::ReplaceInPlace(Line, "\n", "");
540 
541  bool Parsed;
542  try {
543  Parsed = Parser.CanParseHgvsExpression(Line);
544  //Feat = Parser.AsVariationFeat(Line);
545  } catch(CException&) {
546  } catch(...) {
547  }
548 
549  if(Parsed)
550  HgvsCount++;
551  }
552 
553  return (HgvsCount > 0);
554  }
555 */
556 
557 
558 
User-defined methods of the data storage class.
virtual int ReadStream(CNcbiIstream &is, EFinalize eFinalize=eFinalize_Yes)
Read an AGP file from the given input stream.
Definition: agp_util.cpp:1084
This class is used to turn an AGP file into a vector of Seq-entry's.
vector< CRef< objects::CSeq_entry > > TSeqEntryRefVec
This is the way the results will be returned Each Seq-entry contains just one Bioseq,...
TSeqEntryRefVec & GetResult(void)
This gets the results found, but don't call before finalizing.
CReaderBase implementation that reads BED data files, either a single object or all objects found.
Definition: bed_reader.hpp:109
CFormatGuess::EFormat GuessFormatAndContent(CFileContentInfo &contentInfo)
const set< TTypeInfo > * m_pEffectiveRecognizedGenbankObjectTypes
static set< TTypeInfo > sDefaultRecognizedGenbankObjectTypes
bool x_TryFormat(CFormatGuess::EFormat Format)
unique_ptr< CFormatGuess > m_Guesser
std::stringstream m_LocalBuffer
bool TestFormat(CFormatGuess::EFormat)
void SetRecognizedGenbankTypes(const set< TTypeInfo > &recognizedGenbankTypes)
CFormatGuess::EFormat GuessFormat()
bool x_FillLocalBuffer(CNcbiIstream &In)
TTypeInfo xGuessGenbankObjectType(CFormatGuess::EFormat baseFormat)
Class implements different ad-hoc unreliable file format identifications.
EFormat
The formats are checked in the same order as declared here.
@ eBinaryASN
Binary ASN.1.
@ eGff2
GFF2, CGff2Reader, any GFF-like that doesn't fit the others.
@ eBed
UCSC BED file format, CBedReader.
@ eGtf
New GTF, CGtfReader.
@ eAgp
AGP format assembly, AgpRead.
@ eGff3
GFF3, CGff3Reader.
@ eFasta
FASTA format sequence record, CFastaReader.
@ eUnknown
unknown format
@ eRmo
RepeatMasker Output.
@ eTextASN
Text ASN.1.
@ eAlignment
Text alignment.
@ eBed15
UCSC BED15 or microarray format.
@ eWiggle
UCSC WIGGLE file format.
CObjectIStreamAsnBinary –.
Definition: objistrasnb.hpp:59
CObjectIStreamAsn –.
Definition: objistrasn.hpp:54
CObjectIStreamJson –.
Definition: objistrjson.hpp:54
CObjectIStreamXml –.
Definition: objistrxml.hpp:56
TAnnotList TAnnots
Definition: reader_base.hpp:91
Implements a concrete class for reading RepeatMasker output from tabular form and rendering it as ASN...
Definition: rm_reader.hpp:690
Simple implementation of ILineReader for i(o)streams.
CTypeInfo class contains all information about C++ types (both basic and classes): members and layout...
Definition: typeinfo.hpp:76
@ fIncludeRepeatClass
Same as fIncludeRepeatFamily.
Definition: rm_reader.hpp:536
Include a standard set of the NCBI C++ Toolkit most basic headers.
Operators to edit gaps in sequences.
set< TTypeInfo > sDefaultRecognizedGenbankObjectTypes
Definition: formatguess.cpp:64
static void TestResult(SQLRETURN result0, int level, const char *func)
Definition: raiserror.c:38
static const struct type types[]
Definition: type.c:22
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
@ eNoOwnership
No ownership is assumed.
Definition: ncbi_types.h:135
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
static TTypeInfo GetTypeInfo(void)
Definition: objecttype.hpp:85
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool IsNull(void) const THROWS_NONE
Check if pointer is null – same effect as Empty().
Definition: ncbiobj.hpp:735
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static void Pushback(CNcbiIstream &is, CT_CHAR_TYPE *buf, streamsize buf_size, void *del_ptr)
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
const string & GetName(void) const
Get name of this type.
Definition: typeinfo.cpp:249
const TSet & GetSet(void) const
Get the variant data.
Definition: Seq_entry_.cpp:124
bool IsSet(void) const
Check if variant Set is selected.
Definition: Seq_entry_.hpp:263
const TSeq_set & GetSeq_set(void) const
Get the Seq_set member data.
bool IsFtable(void) const
Check if variant Ftable is selected.
Definition: Seq_annot_.hpp:615
const TData & GetData(void) const
Get the Data member data.
Definition: Seq_annot_.hpp:873
bool CanGetData(void) const
Check if it is safe to call GetData method.
Definition: Seq_annot_.hpp:867
Lightweight interface for getting lines of data with minimal memory copying.
Magic spell ;-) needed for some weird compilers... very empiric.
GenericReader< UTF8< char >, UTF8< char >, CrtAllocator > Reader
Reader with UTF8 encoding and default allocator.
Definition: fwd.h:88
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines NCBI C++ exception handling.
Multi-threading – classes, functions, and features.
Useful/utility classes and methods.
Format
Definition: njn_ioutil.hpp:52
#define count
CFileContentInfoGff3 mInfoGff3
CFileContentInfoNone mInfoNone
CFileContentInfoAlign mInfoAlign
CFileContentInfoGenbank mInfoGenbank
Modified on Fri Sep 20 14:58:09 2024 by modify_doxy.py rev. 669887