NCBI C++ ToolKit
format_guess.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 #ifndef FORMATGUESS__HPP
2 #define FORMATGUESS__HPP
3 
4 /* $Id: format_guess.hpp 102775 2024-07-10 16:05:26Z gotvyans $
5  * ===========================================================================
6  *
7  * PUBLIC DOMAIN NOTICE
8  * National Center for Biotechnology Information
9  *
10  * This software/database is a "United States Government Work" under the
11  * terms of the United States Copyright Act. It was written as part of
12  * the author's official duties as a United States Government employee and
13  * thus cannot be copyrighted. This software/database is freely available
14  * to the public for use. The National Library of Medicine and the U.S.
15  * Government have not placed any restriction on its use or reproduction.
16  *
17  * Although all reasonable efforts have been taken to ensure the accuracy
18  * and reliability of the software and data, the NLM and the U.S.
19  * Government do not and cannot warrant the performance or results that
20  * may be obtained by using this software or data. The NLM and the U.S.
21  * Government disclaim all warranties, express or implied, including
22  * warranties of performance, merchantability or fitness for any particular
23  * purpose.
24  *
25  * Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author: Anatoliy Kuznetsov
30  *
31  * File Description: Different "fuzzy-logic" methods to identify file formats.
32  *
33  */
34 
35 #include <corelib/ncbistd.hpp>
36 #include <util/static_map.hpp>
37 #include <bitset>
38 
40 
41 class CFormatGuessHints;
42 
43 
44 //////////////////////////////////////////////////////////////////
45 ///
46 /// Class implements different ad-hoc unreliable file format
47 /// identifications.
48 ///
49 
51 {
52 public:
53  /// The formats are checked in the same order as declared here.
54  enum EFormat {
55  // WARNING! Never change numeric values of these enumerators!
56  // E.g. these values are hard-coded in the Local Data Storage (LDS)
57  // index databases.
58  eUnknown = 0, ///< unknown format
59  eBinaryASN = 1, ///< Binary ASN.1
60  eRmo = 2, ///< RepeatMasker Output
61  eGtf_POISENED = 3, ///< Old and Dead GFF/GTF style annotations
62  eGlimmer3 = 4, ///< Glimmer3 predictions
63  eAgp = 5, ///< AGP format assembly, AgpRead
64  eXml = 6, ///< XML
65  eWiggle = 7, ///< UCSC WIGGLE file format
66  eBed = 8, ///< UCSC BED file format, CBedReader
67  eBed15 = 9, ///< UCSC BED15 or microarray format
68  eNewick = 10, ///< Newick file
69  eAlignment = 11, ///< Text alignment
70  eDistanceMatrix = 12, ///< Distance matrix file
71  eFlatFileSequence = 13, ///< GenBank/GenPept/DDBJ/EMBL flat-file sequence portion
72  eFiveColFeatureTable = 14, ///< Five-column feature table
73  eSnpMarkers = 15, ///< SNP Marker flat file
74  eFasta = 16, ///< FASTA format sequence record, CFastaReader
75  eTextASN = 17, ///< Text ASN.1
76  eTaxplot = 18, ///< Taxplot file
77  ePhrapAce = 19, ///< Phrap ACE assembly file
78  eTable = 20, ///< Generic table
79  eGtf = 21, ///< New GTF, CGtfReader
80  eGff3 = 22, ///< GFF3, CGff3Reader
81  eGff2 = 23, ///< GFF2, CGff2Reader, any GFF-like that doesn't fit the others
82  eHgvs = 24, ///< HGVS, CHgvsParser
83  eGvf = 25, ///< GVF, CGvfReader
84  eZip = 26, ///< zip compressed file
85  eGZip = 27, ///< GNU zip compressed file
86  eBZip2 = 28, ///< bzip2 compressed file
87  eLzo = 29, ///< lzo compressed file
88  eSra = 30, ///< INSDC Sequence Read Archive file
89  eBam = 31, ///< Binary alignment/map file
90  eVcf = 32, ///< VCF, CVcfReader
91  eUCSCRegion = 33, ///< UCSC Region file format
92  eGffAugustus = 34, ///< GFFish output of Augustus Gene Prediction
93  eJSON = 35, ///< JSON
94  ePsl = 36, ///< PSL alignment format
95  // The following formats are not yet recognized by CFormatGuess - CXX-10039
96  eAltGraphX = 37,
97  eBed5FloatScore = 38,
98  eBedGraph = 39,
99  eBedRnaElements = 40,
100  eBigBarChart = 41,
101  eBigBed = 42,
102  eBigPsl = 43,
103  eBigChain = 44,
104  eBigMaf = 45,
105  eBigWig = 46,
106  eBroadPeak = 47,
107  eChain = 48,
108  eClonePos = 49,
109  eColoredExon = 50,
110  eCtgPos = 51,
111  eDownloadsOnly = 52,
112  eEncodeFiveC = 53,
113  eExpRatio = 54,
114  eFactorSource = 55,
115  eGenePred = 56,
116  eLd2 = 57,
117  eNarrowPeak = 58,
118  eNetAlign = 59,
119  ePeptideMapping = 60,
120  eRmsk = 61,
121  eSnake = 62,
122  eVcfTabix = 63,
123  eWigMaf = 64,
124 
125  // The following formats *are* recognized by CFormatGuess:
126  eFlatFileGenbank = 65,
127  eFlatFileEna = 66,
128  eFlatFileUniProt = 67,
129 
130  eZstd = 68, ///< Zstandard (zstd) compressed data
131 
132  // *** Adding new format codes? ***
133  // (1) A sanity check in the implementation depends on the format codes being
134  // consecutive. Hence no gaps allowed!
135  // (2) Heed the warning above about never changing an already existing format code!
136  // (3) You must provide a display name for the new format. Do that in sm_FormatNames (see .cpp).
137  // (4) You must add your new format to sm_CheckOrder (see .cpp)
138  // (unless you don't want your format actually being checked and recognized).
139 
140  /// Max value of EFormat
141  eFormat_max
142  };
143 
147  eProtein
148  };
149 
150  enum EMode {
152  eThorough
153  };
154 
156  eST_Lax, ///< Implement historic behavior, risking false positives.
157  eST_Default, ///< Be relatively strict, but still allow for typos.
158  eST_Strict ///< Require 100% encodability of printable non-digits.
159  };
160 
161  enum EOnError {
162  eDefault = 0, ///< Return eUnknown
163  eThrowOnBadSource, ///< Throw an exception if the data source (stream, file) can't be read
164  };
165 
166  static bool IsSupportedFormat(EFormat format);
167 
168  /// Hints for guessing formats. Two hint types can be used: preferred and
169  /// disabled. Preferred are checked before any other formats. Disabled
170  /// formats are not checked at all.
172  {
173  public:
175 
176  CFormatHints(void) {}
177 
178  /// Mark the format as preferred.
179  CFormatHints& AddPreferredFormat(TFormat fmt);
180  /// Mark the format as disabled.
181  CFormatHints& AddDisabledFormat(TFormat fmt);
182  /// Disable all formats not marked as preferred
183  CFormatHints& DisableAllNonpreferred(void);
184  /// Remove format hint.
185  void RemoveFormat(TFormat fmt);
186  /// Remove all hints
187  CFormatHints& Reset(void);
188 
189  /// Check if any hints are set at all.
190  bool IsEmpty(void) const;
191  /// Check if the format is listed as preferred.
192  bool IsPreferred(TFormat fmt) const;
193  /// Check if the format is listed as disabled.
194  bool IsDisabled(TFormat fmt) const;
195 
196  private:
197  typedef bitset<CFormatGuess::eFormat_max> THints;
198 
201  };
202 
203  /// Guess sequence type. Function calculates sequence alphabet and
204  /// identifies if the source belongs to nucleotide or protein sequence
205  static ESequenceType SequenceType(const char* str, unsigned length = 0,
206  ESTStrictness strictness = eST_Default);
207 
208  static const char* GetFormatName(EFormat format);
209 
210  // ----------------------------------------------------------------------
211  // "Stateless" interface:
212  // Useful for checking for all formats in one simple call.
213  // May go away; use object interface instead.
214  // ----------------------------------------------------------------------
215 
216  /// Guess file format
217  static
218  EFormat Format(const string& path, EOnError onerror = eDefault);
219 
220  /// Format prediction based on an input stream
221  /// @note On completion, the function pushes whatever data it had to read
222  /// (in order to detect data format) back to the stream -- using
223  /// CStreamUtils::Stepback()
224  static
226 
227 
228  // ----------------------------------------------------------------------
229  // "Object" interface:
230  // Use when interested only in a limited number of formats, in excluding
231  // certain tests, a specific order in which formats are tested, ...
232  // ----------------------------------------------------------------------
233 
234  CFormatGuess();
235 
236  CFormatGuess(const string& fname);
237 
238  /// @note Data format detection methods GuessFormat() and TestFormat()
239  /// take care to push whatever data they read back to the stream
240  /// using CStreamUtils::Stepback()
242 
243  ~CFormatGuess();
244 
245 
246  NCBI_DEPRECATED EFormat GuessFormat(EMode);
247  NCBI_DEPRECATED bool TestFormat(EFormat, EMode);
248 
249  /// @note If the instance of the class is built upon std::istream, then
250  /// on completion this function pushes whatever data it had to read
251  /// (in order to detect data format) back to the stream -- using
252  /// CStreamUtils::Stepback()
253  EFormat GuessFormat(EOnError onerror = eDefault);
254 
255 
256  /// @note If the instance of the class is built upon std::istream, then
257  /// on completion this function pushes whatever data it had to read
258  /// (in order to detect data format) back to the stream -- using
259  /// CStreamUtils::Stepback()
260  bool TestFormat(EFormat, EOnError onerror = eDefault);
261 
262  /// Get format hints
263  CFormatHints& GetFormatHints(void) { return m_Hints; }
264 
265  /// Check whether testing is enabled for given format
266  bool IsEnabled(EFormat format) const { return !m_Hints.IsDisabled(format); };
267 
268 protected:
269  void Initialize();
270 
271  bool EnsureTestBuffer();
272  bool EnsureStats();
273  bool EnsureSplitLines();
274  bool IsAllComment();
275  bool IsAsciiText();
276 
277  bool TestFormatRepeatMasker(EMode);
278  bool TestFormatPhrapAce(EMode);
279  bool TestFormatGtf(EMode);
280  bool TestFormatGvf(EMode);
281  bool TestFormatGff3(EMode);
282  bool TestFormatGff2(EMode);
283  bool TestFormatGlimmer3(EMode);
284  bool TestFormatAgp(EMode);
285  bool TestFormatNewick(EMode);
286  bool TestFormatXml(EMode);
287  bool TestFormatAlignment(EMode);
288  bool TestFormatCLUSTAL(void);
289  bool TestFormatBinaryAsn(EMode);
290  bool TestFormatDistanceMatrix(EMode);
291  bool TestFormatTaxplot(EMode);
292  bool TestFormatFlatFileSequence(EMode);
293  bool TestFormatFiveColFeatureTable(EMode);
294  bool TestFormatTable(EMode);
295  bool TestFormatFasta(EMode);
296  bool TestFormatTextAsn(EMode);
297  bool TestFormatSnpMarkers(EMode);
298  bool TestFormatBed(EMode);
299  bool TestFormatBed15(EMode);
300  bool TestFormatWiggle(EMode);
301  bool TestFormatHgvs(EMode);
302  bool TestFormatZip(EMode);
303  bool TestFormatGZip(EMode);
304  bool TestFormatZstd(EMode);
305  bool TestFormatBZip2(EMode);
306  bool TestFormatLzo(EMode);
307  bool TestFormatSra(EMode);
308  bool TestFormatBam(EMode);
309  bool TestFormatVcf(EMode);
310  bool TestFormatAugustus(EMode);
311  bool TestFormatJson(EMode);
312  bool TestFormatPsl(EMode);
313 
314  bool TestFormatFlatFileGenbank(EMode);
315  bool TestFormatFlatFileEna(EMode);
316  bool TestFormatFlatFileUniProt(EMode);
317 
318  bool IsInputRepeatMaskerWithoutHeader();
319  bool IsInputRepeatMaskerWithHeader();
320 
321  static bool IsLineFlatFileSequence(const std::string&);
322  static bool IsSampleNewick(const std::string&);
323  static bool IsLabelNewick(const std::string&);
324  static bool IsLineAgp(const std::string&);
325  static bool IsLineGlimmer3(const std::string&);
326  static bool IsLineGtf(const std::string&);
327  static bool IsLineGvf(const std::string&);
328  static bool IsLineGff3(const std::string&);
329  static bool IsLineGff2(const std::string&);
330  static bool IsLineAugustus(const std::string&);
331  static bool IsLinePhrapId(const std::string&);
332  static bool IsLineRmo(const std::string&);
333  static bool IsAsnComment(const vector<string>&);
334  static bool IsLineHgvs(const std::string&);
335  static bool IsLinePsl(const std::string&, bool ignoreFirstColumn);
336 
337 private:
338  static bool x_TestInput( CNcbiIstream& input, EOnError onerror );
339 
340  bool x_TestFormat(EFormat format, EMode mode);
341 
342  // to test for a table we check each of the most common delimiter combinations,
343  // ' ' ' \t' '\t' ',' '|'
344  bool x_TestTableDelimiter(const string& delims);
345 
346  // Check that the beginning of testString looks like JSON
347  bool x_CheckJsonStart(const string& testString) const;
348 
349  // In-place deletion of JSON strings
350  void x_StripJsonStrings(string& testString) const;
351 
352  // Starting at from_pos, find the next set of double quotes
353  // indicating the end of a JSON string
354  size_t x_FindNextJsonStringStop(const string& input, const size_t from_pos) const;
355 
356  void x_FindJsonStringLimits(const string& testString, list<size_t>& limits) const;
357 
358  // Checks and removes punctuation from testString
359  bool x_CheckStripJsonPunctuation(string& testString) const;
360 
361  // In-place deletion of JSON punctuation
362  // Returns the number of characters deleted.
363  size_t x_StripJsonPunctuation(string& testString) const;
364 
365  // In-place deletion of JSON keywords: true, false, null
366  void x_StripJsonKeywords(string& testString) const;
367 
368  bool x_CheckStripJsonNumbers(string& testString) const;
369 
370  bool x_IsTruncatedJsonNumber(const string& testString) const;
371 
372  // Is a truncation of true, false, or null
373  bool x_IsTruncatedJsonKeyword(const string& testString) const;
374 
375  bool x_IsNumber(const string& testString) const;
376 
377  // Return true if the string is blank or a list of space-delimited numbers
378  bool x_IsBlankOrNumbers(const string& testString) const;
379 
380  bool x_TryProcessCLUSTALSeqData(const string& line, string& id, size_t& seg_length) const;
381  bool x_LooksLikeCLUSTALConservedInfo(const string& line) const;
382 
383 protected:
387  streamsize m_iTestBufferSize;
388  streamsize m_iTestDataSize;
389 
392  unsigned int m_iStatsCountData;
394  unsigned int m_iStatsCountDnaChars;
395  unsigned int m_iStatsCountAaChars;
396  unsigned int m_iStatsCountBraces;
397  std::list<std::string> m_TestLines;
399 };
400 
401 
404 {
405  m_Disabled.reset(fmt);
406  m_Preferred.set(fmt);
407  return *this;
408 }
409 
410 
413 {
414  m_Preferred.reset(fmt);
415  m_Disabled.set(fmt);
416  return *this;
417 }
418 
421 {
422  m_Disabled = ~m_Preferred;
423  return *this;
424 }
425 
427 {
428  m_Disabled.reset(fmt);
429  m_Preferred.reset(fmt);
430 }
431 
434 {
435  m_Preferred.reset();
436  m_Disabled.reset();
437  return *this;
438 }
439 
440 inline bool CFormatGuess::CFormatHints::IsEmpty(void) const
441 {
442  return m_Preferred.none() && m_Disabled.none(); // empty only when neither hint set has any bit set
443 }
444 
446 {
447  return m_Preferred.test(fmt);
448 }
449 
451 {
452  return m_Disabled.test(fmt);
453 }
454 
456 
457 #endif
Hints for guessing formats.
CFormatHints & AddDisabledFormat(TFormat fmt)
Mark the format as disabled.
void RemoveFormat(TFormat fmt)
Remove format hint.
bitset< CFormatGuess::eFormat_max > THints
bool IsEmpty(void) const
Check if any hints are set at all.
CFormatHints & Reset(void)
Remove all hints.
CFormatGuess::EFormat TFormat
bool IsPreferred(TFormat fmt) const
Check if the format is listed as preferred.
CFormatHints & AddPreferredFormat(TFormat fmt)
Mark the format as preferred.
CFormatHints & DisableAllNonpreferred(void)
Disable all formats not marked as preferred.
bool IsDisabled(TFormat fmt) const
Check if the format is listed as disabled.
Class implements different ad-hoc unreliable file format identifications.
unsigned int m_iStatsCountBraces
CNcbiIstream & m_Stream
CFormatHints & GetFormatHints(void)
Get format hints.
unsigned int m_iStatsCountData
size_t x_FindNextJsonStringStop(const string &input, const size_t from_pos) const
bool IsEnabled(EFormat format) const
Check whether testing is enabled for given format.
EFormat
The formats are checked in the same order as declared here.
unsigned int m_iStatsCountAaChars
streamsize m_iTestBufferSize
char * m_pTestBuffer
unsigned int m_iStatsCountAlNumChars
std::list< std::string > m_TestLines
streamsize m_iTestDataSize
unsigned int m_iStatsCountDnaChars
CFormatHints m_Hints
@ eThrowOnBadSource
Throw an exception if the data source (stream, file) can't be read.
@ eST_Lax
Implement historic behavior, risking false positives.
@ eST_Default
Be relatively strict, but still allow for typos.
Include a standard set of the NCBI C++ Toolkit most basic headers.
static const char * str(char *buf, int n)
Definition: stats.c:84
static const TDS_WORD limits[]
Definition: num_limits.h:85
string
Definition: cgiapp.hpp:690
#define NCBI_DEPRECATED
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
static bool IsSupportedFormat(CFormatGuess::EFormat format)
Definition: lds2.cpp:1001
static int input()
mdb_mode_t mode
Definition: lmdb++.h:38
static Format format
Definition: njn_ioutil.cpp:53
Format
Definition: njn_ioutil.hpp:52
NCBI_XUTIL_EXPORT
Parameter to control printing diagnostic message about conversion of static array data from a differe...
Definition: static_set.hpp:72
Modified on Fri Sep 20 14:57:42 2024 by modify_doxy.py rev. 669887