NCBI C++ ToolKit
format_guess.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: format_guess.cpp 102775 2024-07-10 16:05:26Z gotvyans $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Anatoliy Kuznetsov
27  *
28  * File Description: Implemented methods to identify file formats.
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <util/format_guess.hpp>
34 #include <util/util_exception.hpp>
35 #include <corelib/ncbifile.hpp>
36 #include <corelib/ncbistre.hpp>
37 #include <corelib/stream_utils.hpp>
38 
39 
41 
42 
43 // Must list all *supported* EFormats except eUnknown and eFormat_max.
44 // Will cause assertion if violated!
45 
47 {
48  CFormatGuess::eBam, // must precede eGZip!
55  CFormatGuess::ePsl, // must be checked before eRmo!
87 };
88 constexpr size_t sm_CheckOrder_Size = sizeof(sm_CheckOrder) / sizeof(sm_CheckOrder[0]);
89 
90 
91 // This array must stay in sync with enum CFormatGuess::EFormat,
92 // but that's not supposed to change in the middle anyway,
93 // so the explicit size should suffice to avoid accidental skew.
94 
97 
99 {
100  { CFormatGuess::eUnknown, "unknown" },
101  { CFormatGuess::eBinaryASN, "binary ASN.1" },
102  { CFormatGuess::eRmo, "RepeatMasker" },
103  { CFormatGuess::eGtf_POISENED, "GFF/GTF Poisoned" },
104  { CFormatGuess::eGlimmer3, "Glimmer3" },
105  { CFormatGuess::eAgp, "AGP" },
106  { CFormatGuess::eXml, "XML" },
107  { CFormatGuess::eWiggle, "WIGGLE" },
108  { CFormatGuess::eBed, "BED" },
109  { CFormatGuess::eBed15, "BED15" },
110  { CFormatGuess::eNewick, "Newick" },
111  { CFormatGuess::eAlignment, "alignment" },
112  { CFormatGuess::eDistanceMatrix, "distance matrix" },
113  { CFormatGuess::eFlatFileSequence, "flat-file sequence" },
114  { CFormatGuess::eFiveColFeatureTable, "five-column feature table" },
115  { CFormatGuess::eSnpMarkers, "SNP Markers" },
116  { CFormatGuess::eFasta, "FASTA" },
117  { CFormatGuess::eTextASN, "text ASN.1" },
118  { CFormatGuess::eTaxplot, "Taxplot" },
119  { CFormatGuess::ePhrapAce, "Phrap ACE" },
120  { CFormatGuess::eTable, "table" },
121  { CFormatGuess::eGtf, "GTF" },
122  { CFormatGuess::eGff3, "GFF3" },
123  { CFormatGuess::eGff2, "GFF2" },
124  { CFormatGuess::eHgvs, "HGVS" },
125  { CFormatGuess::eGvf, "GVF" },
126  { CFormatGuess::eZip, "zip" },
127  { CFormatGuess::eGZip, "gzip" },
128  { CFormatGuess::eBZip2, "bzip2" },
129  { CFormatGuess::eLzo, "lzo" },
130  { CFormatGuess::eSra, "SRA" },
131  { CFormatGuess::eBam, "BAM" },
132  { CFormatGuess::eVcf, "VCF" },
133  { CFormatGuess::eUCSCRegion, "UCSC Region" },
134  { CFormatGuess::eGffAugustus, "GFF Augustus" },
135  { CFormatGuess::eJSON, "JSON" },
136  { CFormatGuess::ePsl, "PSL" },
137  { CFormatGuess::eAltGraphX, "altGraphX" },
138  { CFormatGuess::eBed5FloatScore, "BED5 float score" },
139  { CFormatGuess::eBedGraph, "BED graph" },
140  { CFormatGuess::eBedRnaElements, "BED Rna elements" },
141  { CFormatGuess::eBigBarChart, "bigBarChart" },
142  { CFormatGuess::eBigBed, "BigBED" },
143  { CFormatGuess::eBigPsl, "BigPSL" },
144  { CFormatGuess::eBigChain, "BigChain" },
145  { CFormatGuess::eBigMaf, "BigMaf" },
146  { CFormatGuess::eBigWig, "BigWig" },
147  { CFormatGuess::eBroadPeak, "BroadPeak" },
148  { CFormatGuess::eChain, "Chain" },
149  { CFormatGuess::eClonePos, "ClonePos" },
150  { CFormatGuess::eColoredExon, "ColoredExon" },
151  { CFormatGuess::eCtgPos, "CtgPos" },
152  { CFormatGuess::eDownloadsOnly, "DowloadsOnly" },
153  { CFormatGuess::eEncodeFiveC, "EncodeFiveC" },
154  { CFormatGuess::eExpRatio, "ExpRatio" },
155  { CFormatGuess::eFactorSource, "FactorSource" },
156  { CFormatGuess::eGenePred, "GenePred" },
157  { CFormatGuess::eLd2, "Ld2" },
158  { CFormatGuess::eNarrowPeak, "NarrowPeak" },
159  { CFormatGuess::eNetAlign, "NetAlign" },
160  { CFormatGuess::ePeptideMapping, "PeptideMapping" },
161  { CFormatGuess::eRmsk, "Rmsk" },
162  { CFormatGuess::eSnake, "Snake" },
163  { CFormatGuess::eVcfTabix, "VcfTabix" },
164  { CFormatGuess::eWigMaf, "WigMaf" },
165  { CFormatGuess::eFlatFileGenbank, "Genbank FlatFile" },
166  { CFormatGuess::eFlatFileEna, "ENA FlatFile" },
167  { CFormatGuess::eFlatFileUniProt, "UniProt FlatFile" },
168  { CFormatGuess::eZstd, "zstd" },
169 };
171 
172 
173 
175  fDNA_Main_Alphabet = 1<<0, ///< Just ACGTUN-.
176  fDNA_Ambig_Alphabet = 1<<1, ///< Anything else representable in ncbi4na.
177  fProtein_Alphabet = 1<<2, ///< Allows BZX*-, but not JOU.
178  fLineEnd = 1<<3,
179  fAlpha = 1<<4,
180  fDigit = 1<<5,
181  fSpace = 1<<6,
182  fInvalid = 1<<7
183 };
184 
186  eNo = 0,
188  eYes
189 };
190 
191 
192 
193 // ============================================================================
194 // Helper routine--- file scope only:
195 // ============================================================================
196 
// Per-byte lookup table indexed by unsigned character value; entries are tested
// against the f* flag bits (fDNA_Main_Alphabet, fAlpha, fSpace, ...) elsewhere in
// this file. Zero-initialized as a file-scope static.
197 static unsigned char symbol_type_table[256];
198 
199 // ----------------------------------------------------------------------------
// Returns true when strToken is a canonical non-negative decimal integer:
// either exactly "0", or a digit 1-9 followed by zero or more digits 0-9.
// Empty tokens, signs, leading zeros and any non-digit character are rejected.
static bool s_IsTokenPosInt(
    const string& strToken )
{
    if (strToken.empty()) {
        return false;
    }

    const char lead = strToken[0];
    if (strToken.size() == 1 && lead == '0') {
        return true;                    // a lone zero is the only token allowed to start with '0'
    }
    if (lead < '1' || lead > '9') {
        return false;
    }

    for (string::const_iterator p = strToken.begin() + 1; p != strToken.end(); ++p) {
        if (*p < '0' || *p > '9') {
            return false;
        }
    }
    return true;
}
221 
222 // ----------------------------------------------------------------------------
223 static bool s_IsTokenInteger(
224  const string& strToken )
225 // ----------------------------------------------------------------------------
226 {
227  if ( ! strToken.empty() && (strToken[0] == '-' || strToken[0] == '+')) {
228  return s_IsTokenPosInt( strToken.substr( 1 ) );
229  }
230  return s_IsTokenPosInt( strToken );
231 }
232 
233 // ----------------------------------------------------------------------------
234 static bool s_IsTokenDouble(
235  const string& strToken )
236 {
237  string token( strToken );
238  NStr::ReplaceInPlace( token, ".", "1", 0, 1 );
239  if ( token.size() > 1 && token[0] == '-' ) {
240  token[0] = '1';
241  }
242  if (token.size() > 1 && token[0] == '0') {
243  token[0] = '1';
244  }
245  return s_IsTokenPosInt(token);
246 }
247 
248 // ----------------------------------------------------------------------------
// Fills symbol_type_table on first use; the body is skipped once entry 0 is
// non-zero.
// NOTE(review): this listing omits several statements of the function (the embedded
// numbering skips 254, 256, 260, 262, ...), so the actual flag assignments are not
// visible here. Presumably each loop ORs one f* flag into the table for the
// listed characters and their lower-case forms - confirm against the real file.
249 static void init_symbol_type_table(void)
250 {
251  if ( symbol_type_table[0] == 0 ) {
// Letters "ACGNTU" (upper case, then lower-cased via tolower).
252  for ( const char* s = "ACGNTU"; *s; ++s ) {
253  int c = *s;
255  c = tolower(c);
257  }
// Letters "BDHKMRSVWY".
258  for ( const char* s = "BDHKMRSVWY"; *s; ++s ) {
259  int c = *s;
261  c = tolower(c);
263  }
// Letters "ACDEFGHIKLMNPQRSTVWYBZX".
264  for ( const char* s = "ACDEFGHIKLMNPQRSTVWYBZX"; *s; ++s ) {
265  int c = *s;
267  c = tolower(c);
269  }
// Carriage return and line feed.
272  for ( const char* s = "\r\n"; *s; ++s ) {
273  int c = *s;
275  }
// Generic character classes for byte values 1..255 (0 is not visited by this loop).
276  for ( int c = 1; c < 256; ++c ) {
277  if ( isalpha((unsigned char)c) )
279  if ( isdigit((unsigned char)c) )
281  if ( isspace((unsigned char)c) )
283  }
285  }
286 }
287 
288 
// Looks `format` up in sm_FormatNames and returns the associated name string.
// Throws CUtilException (eWrongData) when the value has no entry.
// NOTE(review): the line holding the function name/parameter list and the tail of
// the NCBI_THROW call are missing from this listing.
289 const char*
291 {
292  auto formatIt = sm_FormatNames.find(format);
293  if (formatIt == sm_FormatNames.end()) {
294  NCBI_THROW(CUtilException, eWrongData,
295  "CFormatGuess::GetFormatName: out-of-range format value "
297  }
298  return formatIt->second;
299 }
300 
301 
302 // ============================================================================
303 // Old style class interface:
304 // ============================================================================
305 
306 // ----------------------------------------------------------------------------
// Classifies a character buffer as nucleotide, protein or undefined by counting,
// per character, which symbol_type_table flags apply.
//   str        - buffer to inspect
//   length     - number of characters to inspect; 0 means "use strlen(str)"
//   strictness - eST_Lax / eST_Default / eST_Strict (see the switch below)
// Returns eNucleotide, eProtein, or eUndefined when no test matches.
// NOTE(review): the return-type line and one statement (embedded line 314) are
// missing from this listing.
308 CFormatGuess::SequenceType(const char* str, unsigned length,
309  ESTStrictness strictness)
310 {
311  if (length == 0)
312  length = (unsigned)::strlen(str);
313 
315  unsigned int main_nuc_content = 0, ambig_content = 0, bad_nuc_content = 0,
316  amino_acid_content = 0, exotic_aa_content = 0, bad_aa_content = 0;
317 
// Each character is scored twice: once against the nucleotide alphabets and once
// against the protein alphabet. Whitespace and digits count as neither good nor bad.
318  for (unsigned i = 0; i < length; ++i) {
319  unsigned char c = str[i];
320  unsigned char type = symbol_type_table[c];
321  if ( type & fDNA_Main_Alphabet ) {
322  ++main_nuc_content;
323  } else if ( type & fDNA_Ambig_Alphabet ) {
324  ++ambig_content;
325  } else if ( !(type & (fSpace | fDigit)) ) {
326  ++bad_nuc_content;
327  }
328 
329  if ( type & fProtein_Alphabet ) {
330  ++amino_acid_content;
331  } else if ( type & fAlpha ) {
332  ++exotic_aa_content;
333  } else if ( !(type & (fSpace | fDigit)) ) {
334  ++bad_aa_content;
335  }
336  }
337 
// The cases have no `break`: a case that does not return falls through to the
// next one (Lax -> Default -> Strict).
338  switch (strictness) {
339  case eST_Lax:
340  {
// NOTE(review): if length is still 0 here (empty string), both ratios are 0/0.
341  double dna_content = (double)main_nuc_content / (double)length;
342  double prot_content = (double)amino_acid_content / (double)length;
343 
344  if (dna_content > 0.7) {
345  return eNucleotide;
346  }
347  if (prot_content > 0.7) {
348  return eProtein;
349  }
350  }
351 
352  case eST_Default:
353  if (bad_nuc_content + ambig_content <= main_nuc_content / 9
354  || (bad_nuc_content + ambig_content <= main_nuc_content / 3 &&
355  bad_nuc_content <= (main_nuc_content + ambig_content) / 19)) {
356  // >=90% main alph. (ACGTUN-) or >=75% main and >=95% 4na-encodable
357  return eNucleotide;
358  } else if (bad_aa_content + exotic_aa_content
359  <= amino_acid_content / 9) {
360  // >=90% relatively standard protein residues. (JOU don't count.)
361  return eProtein;
362  }
363 
364  case eST_Strict: // Must be 100% encodable
365  if (bad_nuc_content == 0 && ambig_content <= main_nuc_content / 3) {
366  return eNucleotide;
367  } else if (bad_aa_content == 0
368  && exotic_aa_content <= amino_acid_content / 9) {
369  return eProtein;
370  }
371  }
372 
373  return eUndefined;
374 }
375 
376 
377 // ----------------------------------------------------------------------------
378 CFormatGuess::EFormat CFormatGuess::Format(const string& path, EOnError /*onerror*/)
379 {
380  CNcbiIfstream input(path.c_str(), IOS_BASE::in | IOS_BASE::binary);
381  return Format(input);
382 }
383 
384 // ----------------------------------------------------------------------------
// Wraps `input` in a temporary CFormatGuess and returns its GuessFormat(onerror).
// NOTE(review): the signature line is missing from this listing.
386 {
387  CFormatGuess FG( input );
388  return FG.GuessFormat( onerror );
389 }
390 
391 
392 // ============================================================================
393 // New style object interface:
394 // ============================================================================
395 
396 // ----------------------------------------------------------------------------
// Binds m_Stream to a heap-allocated, unopened CNcbiIfstream that this object owns
// (m_bOwnsStream = true; released in the destructor), then runs Initialize().
// NOTE(review): the signature line is missing from this listing.
398  : m_Stream(* new CNcbiIfstream)
399  , m_bOwnsStream(true)
400  , m_iTestBufferSize(0)
401 {
402  Initialize();
403 }
404 
405 // ----------------------------------------------------------------------------
// Opens FileName in binary mode on a heap-allocated CNcbiIfstream owned by this
// object (m_bOwnsStream = true), then runs Initialize().
// NOTE(review): unlike the constructor above, m_iTestBufferSize is not set in this
// initializer list - confirm it is initialized elsewhere.
407  const string& FileName )
408  : m_Stream( * new CNcbiIfstream( FileName.c_str(), ios::binary ) )
409  , m_bOwnsStream( true )
410 {
411  Initialize();
412 }
413 
414 // ----------------------------------------------------------------------------
// Borrows the caller's stream: m_Stream refers to Stream and m_bOwnsStream is false,
// so the destructor leaves the stream alone. Then runs Initialize().
// NOTE(review): the signature line is missing from this listing.
417  : m_Stream( Stream )
418  , m_bOwnsStream( false )
419 {
420  Initialize();
421 }
422 
423 // ----------------------------------------------------------------------------
// Frees the test buffer and, when this object owns the stream, deletes the stream
// that m_Stream refers to.
425 {
426  delete[] m_pTestBuffer;
427  if ( m_bOwnsStream ) {
428  delete &m_Stream;
429  }
430 }
431 
432 // ----------------------------------------------------------------------------
// Returns true when `format` appears in sm_CheckOrder (linear scan), false otherwise.
// NOTE(review): the line with the function name and parameter list is missing
// from this listing.
433 bool
435 {
436  for (size_t i = 0; i < sm_CheckOrder_Size; ++i) {
437  if (sm_CheckOrder[i] == format) {
438  return true;
439  }
440  }
441  return false;
442 }
443 
444 // ----------------------------------------------------------------------------
// Forwards to GuessFormat(eDefault).
// NOTE(review): the signature lines are missing from this listing; the overload's
// own parameter is not used in the visible body.
447 {
448  return GuessFormat(eDefault);
449 }
450 
451 // ----------------------------------------------------------------------------
// Main guessing routine. Validates the stream, obtains a test buffer, then tries
// formats in sm_CheckOrder order: first those marked preferred in m_Hints, then every
// format not disabled in m_Hints. Returns the first format whose test succeeds, or
// eUnknown. `onerror` is passed to x_TestInput().
// NOTE(review): the signature lines are missing from this listing.
454 {
455  //sqd-4036:
456  // make sure we got something to work with
457  //
458  if (!x_TestInput(m_Stream, onerror)) {
459  return eUnknown;
460  }
461  if (!EnsureTestBuffer()) {
462  //one condition that won't allow us to get a good test buffer is an ascii
463  // file without any line breaks. so before giving up, let's specifically
464  // try any formats that would allow for that:
465  if(TestFormatNewick(eQuick)) {
466  return CFormatGuess::eNewick;
467  }
468  return CFormatGuess::eUnknown;
469  }
470 
471  EMode mode = eQuick;
472 
473  // First, try to use hints
474  if ( !m_Hints.IsEmpty() ) {
475  for (size_t f = 0; f < sm_CheckOrder_Size; ++f) {
476  EFormat fmt = EFormat( sm_CheckOrder[f] );
477  if (m_Hints.IsPreferred(fmt) && x_TestFormat(fmt, mode)) {
478  return fmt;
479  }
480  }
481  }
482 
483  // Check other formats, skip the ones that are disabled through hints
// (preferred formats that already failed above are tested again in this pass)
484  for (size_t f = 0; f < sm_CheckOrder_Size; ++f) {
485  EFormat fmt = EFormat( sm_CheckOrder[f] );
486  if ( ! m_Hints.IsDisabled(fmt) && x_TestFormat(fmt, mode) ) {
487  return fmt;
488  }
489  }
490  return eUnknown;
491 }
492 
493 // ----------------------------------------------------------------------------
// Forwards to TestFormat(format, eDefault).
// NOTE(review): the line with the function name and parameter list is missing
// from this listing.
494 bool
496 {
497  return TestFormat( format, eDefault);
498 }
499 
500 // ----------------------------------------------------------------------------
// Tests the input against one specific format.
//   format  - format to test; for anything other than eUnknown the stream is first
//             validated with x_TestInput()
//   onerror - error policy handed to x_TestInput()
// Returns the result of x_TestFormat(format, eQuick), or false on a bad stream.
// NOTE(review): the first signature line is missing from this listing.
502  EFormat format,
503  EOnError onerror )
504 {
505  if (format != eUnknown && !x_TestInput(m_Stream, onerror)) {
506  return false;
507  }
508  EMode mode = eQuick;
509  return x_TestFormat(format, mode);
510 }
511 
512 // ----------------------------------------------------------------------------
// Dispatches to the format-specific TestFormat*() routine for `format`.
// Returns false immediately when the format is disabled in m_Hints; throws
// CCoreException (eInvalidArg) for a format ID with no case below.
// NOTE(review): the signature line and a few case bodies (eFlatFileSequence,
// eFlatFileGenbank, eFlatFileUniProt) are missing from this listing.
514 {
515  // First check if the format is disabled
516  if ( m_Hints.IsDisabled(format) ) {
517  return false;
518  }
519 
520  switch( format ) {
521 
522  case eBinaryASN:
523  return TestFormatBinaryAsn( mode );
524  case eRmo:
525  return TestFormatRepeatMasker( mode );
526  case eGtf:
527  return TestFormatGtf( mode );
528  case eGvf:
529  return TestFormatGvf( mode );
530  case eGff3:
531  return TestFormatGff3( mode );
532  case eGff2:
533  return TestFormatGff2( mode );
534  case eGlimmer3:
535  return TestFormatGlimmer3( mode );
536  case eAgp:
537  return TestFormatAgp( mode );
538  case eXml:
539  return TestFormatXml( mode );
540  case eNewick:
541  return TestFormatNewick( mode );
542  case eWiggle:
543  return TestFormatWiggle( mode );
544  case eBed:
545  return TestFormatBed( mode );
546  case eBed15:
547  return TestFormatBed15( mode );
548  case eAlignment:
549  return TestFormatAlignment( mode );
550  case eDistanceMatrix:
551  return TestFormatDistanceMatrix( mode );
552  case eFlatFileSequence:
556  case eSnpMarkers:
557  return TestFormatSnpMarkers( mode );
558  case eFasta:
559  return TestFormatFasta( mode );
560  case eTextASN:
561  return TestFormatTextAsn( mode );
562  case eTaxplot:
563  return TestFormatTaxplot( mode );
564  case ePhrapAce:
565  return TestFormatPhrapAce( mode );
566  case eTable:
567  return TestFormatTable( mode );
568  case eHgvs:
569  return TestFormatHgvs( mode );
570  case eZip:
571  return TestFormatZip( mode );
572  case eGZip:
573  return TestFormatGZip( mode );
574  case eZstd:
575  return TestFormatZstd( mode );
576  case eBZip2:
577  return TestFormatBZip2( mode );
578  case eLzo:
579  return TestFormatLzo( mode );
580  case eSra:
581  return TestFormatSra( mode );
582  case eBam:
583  return TestFormatBam( mode );
584  case ePsl:
585  return TestFormatPsl( mode );
586  case eVcf:
587  return TestFormatVcf( mode );
// eUCSCRegion has no detector here: it is never reported as a match.
588  case eUCSCRegion:
589  return false;
590  case eGffAugustus:
591  return TestFormatAugustus( mode );
592  case eJSON:
593  return TestFormatJson( mode );
594  case eFlatFileGenbank:
596  case eFlatFileEna:
597  return TestFormatFlatFileEna( mode );
598  case eFlatFileUniProt:
600  default:
601  NCBI_THROW( CCoreException, eInvalidArg,
602  "CFormatGuess::x_TestFormat(): Unsupported format ID (" +
603  NStr::NumericToString((int)format) + ")." );
604  }
605 }
606 
607 // ----------------------------------------------------------------------------
// Resets per-object state: asserts that sm_FormatNames has eFormat_max entries,
// clears the test-buffer pointer and invalidates the cached statistics / line split.
// NOTE(review): the function-name line and several trailing statements (embedded
// lines 618-621) are missing from this listing.
608 void
610 {
611  NCBI_ASSERT(eFormat_max == sm_FormatNames.size(),
612  "sm_FormatNames does not list all possible formats");
613  m_pTestBuffer = 0;
614 
615  m_bStatsAreValid = false;
616  m_bSplitDone = false;
617  m_iStatsCountData = 0;
622 }
623 
624 // ----------------------------------------------------------------------------
// Makes sure m_pTestBuffer holds a sample of the input.
// Returns true if a buffer is already present or was filled, false if the stream is
// not good() or no data could be read.
// NOTE(review): the function-name line and several statements (embedded lines 650,
// 659, 673-674) are missing from this listing, including the read call itself and
// the condition guarding the `return false` inside the all-comment branch.
625 bool
627 // ----------------------------------------------------------------------------
628 {
629  if ( m_pTestBuffer ) {
630  return true;
631  }
632  if ( ! m_Stream.good() ) {
633  return false;
634  }
635 
636  // Fix to the all-comment problem.
637  // Read a test buffer,
638  // Test it for being all comment
639  // If it's all comment, read a buffer twice as long
640  // Stop when it's no longer all comment, end of the stream,
641  // or Multiplier hits 1024
642 
643  const streamsize k_TestBufferGranularity = 8096;
644 
645  int Multiplier = 1;
646 
647  while(true) {
648  m_iTestBufferSize = Multiplier * k_TestBufferGranularity;
649  m_pTestBuffer = new char[ m_iTestBufferSize ];
651  m_iTestDataSize = m_Stream.gcount();
652  if (m_iTestDataSize == 0) {
653  delete[] m_pTestBuffer;
654  m_pTestBuffer = 0;
655  m_iTestBufferSize = 0;
656  return false; //empty file
657  }
658  m_Stream.clear(); // in case we reached eof
660 
661  if (IsAllComment()) {
662  if (Multiplier >= 1024) {
663  // this is how far we will go and no further.
664  // if it's indeed all comments then none of the format specific
665  // tests will assert.
666  // if something was misidentified as a comment then the relevant
667  // format specific test may still have a good sample to work with.
668  // so it does not hurt to at least try.
669  return true;
670  }
671  Multiplier *= 2;
672  delete [] m_pTestBuffer;
675  return false;
676  }
677  continue;
678  } else {
679  break;
680  }
681  }
682 
683  return true;
684 }
685 
686 // ----------------------------------------------------------------------------
// Computes character statistics over the test buffer once and caches the result
// (m_bStatsAreValid). Returns false only when no test buffer can be obtained.
// Lines are split on "\r\n" characters; a line starting with '>' is treated as a
// header and excluded from the data/DNA/protein counts.
// NOTE(review): the function-name line and the counter increments inside the loop
// (embedded lines 728, 731, 735, 739, 742) are missing from this listing.
687 bool
689 // ----------------------------------------------------------------------------
690 {
691  if ( m_bStatsAreValid ) {
692  return true;
693  }
694  if ( ! EnsureTestBuffer() ) {
695  return false;
696  }
697 
698  string strBuffer(m_pTestBuffer, m_iTestDataSize);
699  CNcbiIstrstream TestBuffer(strBuffer);
700  string strLine;
701 
703  // Things we keep track of:
704  // m_iStatsCountAlNumChars: number of characters that are letters or
705  // digits
706  // m_iStatsCountData: number of characters not part of a line starting
707  // with '>', ignoring whitespace
708  // m_iStatsCountDnaChars: number of characters counted in m_iStatsCountData
709  // from the DNA alphabet
710  // m_iStatsCountAaChars: number of characters counted in m_iStatsCountData
711  // from the AA alphabet
712  // m_iStatsCountBraces: Opening { and closing } braces
713  //
714  while ( ! TestBuffer.fail() ) {
715  NcbiGetline( TestBuffer, strLine, "\r\n" );
716 // code in CFormatGuess::Format counts line ends
717 // so, we will count them here as well
718  if (!strLine.empty()) {
719  strLine += '\n';
720  }
721  size_t size = strLine.size();
722  bool is_header = size > 0 && strLine[0] == '>';
723  for ( size_t i=0; i < size; ++i ) {
724  unsigned char c = strLine[i];
725  unsigned char type = symbol_type_table[c];
726 
727  if ( type & (fAlpha | fDigit | fSpace) ) {
729  }
730  else if (c == '{' || c == '}') {
732  }
733  if ( !is_header ) {
734  if ( !(type & fSpace) ) {
736  }
737 
738  if ( type & fDNA_Main_Alphabet ) {
740  }
741  if ( type & fProtein_Alphabet ) {
743  }
744  }
745  }
746  }
747  m_bStatsAreValid = true;
748  return true;
749 }
750 
751 // ----------------------------------------------------------------------------
// Checks that `input` is usable. Returns true for a stream in a good state.
// For a failed stream: throws CUtilException (eNoInput) when onerror is
// eThrowOnBadSource, otherwise returns false.
// NOTE(review): the signature line is missing from this listing.
753 {
754  if (!input) {
755  if (onerror == eThrowOnBadSource) {
756  NCBI_THROW(CUtilException,eNoInput,"Unreadable input stream");
757  }
758  return false;
759  }
760  return true;
761 }
762 
763 // ----------------------------------------------------------------------------
// Format test that requires both statistics and split lines to be available;
// returns false when either cannot be produced.
// NOTE(review): the function-name line and the actual test/return statements
// (embedded lines 770-771) are missing from this listing.
765  EMode /* not used */ )
766 {
767  if ( ! EnsureStats() || ! EnsureSplitLines() ) {
768  return false;
769  }
772 }
773 
774 
775 // ----------------------------------------------------------------------------
776 
777 static bool s_LooksLikeNucSeqData(const string& line, size_t minLength=10) {
778  if (line.size()<minLength) {
779  return false;
780  }
781 
782  int nucCount=0;
783  for (auto c : line) {
784  if (isalpha(c)) {
785  auto index = static_cast<int>(c);
786  if (symbol_type_table[index] & fDNA_Main_Alphabet) {
787  ++nucCount;
788  }
789  continue;
790  }
791 
792  if (!isspace(c)) {
793  return false;
794  }
795  }
796 
797  return (nucCount/line.size() > 0.9);
798 }
799 
800 
801 // ----------------------------------------------------------------------------
// Phrap ACE test: the sample must contain no NUL bytes, and some line accepted by
// IsLinePhrapId() must be followed (anywhere later) by a line that
// s_LooksLikeNucSeqData() accepts.
// NOTE(review): the function-name line is missing from this listing.
802 bool
804  EMode /* not used */ )
805 {
806  if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
807  return false;
808  }
809 
810  if (memchr(m_pTestBuffer, 0, m_iTestDataSize)) { // Cannot contain NuLL bytes
811  return false; // RW-1102
812  }
813 
814  bool foundId = false;
815  for (const auto& line : m_TestLines) {
816  if (foundId) {
817  if (s_LooksLikeNucSeqData(line)) {
818  return true;
819  }
820  }
821  else if (IsLinePhrapId(line)) {
822  foundId = true;
823  }
824  }
825  return false;
826 }
827 
828 // -----------------------------------------------------------------------------
// GTF test: every non-empty, non-'#' line (after any leading "browser "/"track "
// lines) must satisfy IsLineGtf(), and at least one such line must exist.
// NOTE(review): the function-name line is missing from this listing.
829 bool
831  EMode /* not used */ )
832 {
833  if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
834  return false;
835  }
836 
837  unsigned int uGtfLineCount = 0;
838  list<string>::iterator it = m_TestLines.begin();
839 
840  for ( ; it != m_TestLines.end(); ++it) {
841  //
842  // Make sure to ignore any UCSC track and browser lines prior to the
843  // start of data
844  //
845  if ( it->empty() || (*it)[0] == '#' ) {
846  continue;
847  }
848  if ( !uGtfLineCount && NStr::StartsWith( *it, "browser " ) ) {
849  continue;
850  }
851  if ( !uGtfLineCount && NStr::StartsWith( *it, "track " ) ) {
852  continue;
853  }
854  if ( ! IsLineGtf( *it ) ) {
855  return false;
856  }
857  ++uGtfLineCount;
858  }
859  return (uGtfLineCount != 0);
860 }
861 
862 // -----------------------------------------------------------------------------
// GVF test: a "##gvf-version" pragma line is accepted immediately. Otherwise every
// non-empty, non-'#' line (after any leading "browser "/"track " lines) must
// satisfy IsLineGvf(), and at least one such line must exist.
// NOTE(review): the function-name line is missing from this listing.
863 bool
865  EMode /* not used */ )
866 {
867  if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
868  return false;
869  }
870 
871  unsigned int uGvfLineCount = 0;
872  list<string>::iterator it = m_TestLines.begin();
873 
874  for ( ; it != m_TestLines.end(); ++it) {
875  //
876  // Make sure to ignore any UCSC track and browser lines prior to the
877  // start of data
878  //
879  if ( it->empty() || (*it)[0] == '#' ) {
880  if (NStr::StartsWith(*it, "##gvf-version")) {
881  return true;
882  }
883  continue;
884  }
885  if ( !uGvfLineCount && NStr::StartsWith( *it, "browser " ) ) {
886  continue;
887  }
888  if ( !uGvfLineCount && NStr::StartsWith( *it, "track " ) ) {
889  continue;
890  }
891  if ( ! IsLineGvf( *it ) ) {
892  return false;
893  }
894  ++uGvfLineCount;
895  }
896  return (uGvfLineCount != 0);
897 }
898 
899 
900 // -----------------------------------------------------------------------------
// GFF3 test: a "##gff-version" pragma seen before any data line decides the result
// on its own (true only for "##gff-version 3"). Otherwise every non-empty, non-'#'
// line (after any leading "browser "/"track " lines) must satisfy IsLineGff3(), and
// at least one such line must exist.
// NOTE(review): the function-name line is missing from this listing.
901 bool
903  EMode /* not used */ )
904 {
905  if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
906  return false;
907  }
908 
909  unsigned int uGffLineCount = 0;
910  list<string>::iterator it = m_TestLines.begin();
911 
912  for ( ; it != m_TestLines.end(); ++it) {
913  //
914  // Make sure to ignore any UCSC track and browser lines prior to the
915  // start of data
916  //
917  if (!uGffLineCount && NStr::StartsWith(*it, "##gff-version")) {
918  return NStr::StartsWith(*it, "##gff-version 3");
919  }
920  if ( it->empty() || (*it)[0] == '#' ) {
921  continue;
922  }
923  if ( !uGffLineCount && NStr::StartsWith( *it, "browser " ) ) {
924  continue;
925  }
926  if ( !uGffLineCount && NStr::StartsWith( *it, "track " ) ) {
927  continue;
928  }
929  if ( ! IsLineGff3( *it ) ) {
930  return false;
931  }
932  ++uGffLineCount;
933  }
934  return (uGffLineCount != 0);
935 }
936 
937 
938 // -----------------------------------------------------------------------------
// Augustus GFF test: rejects input that declares "##gff-version 3" before any data
// line, and - unlike the other GFF-family tests in this file - also rejects leading
// "browser "/"track " lines instead of skipping them. Every remaining non-empty,
// non-'#' line must satisfy IsLineAugustus(), and at least one must exist.
// NOTE(review): the function-name line is missing from this listing.
939 bool
941  EMode /*not used*/)
942 {
943  if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
944  return false;
945  }
946 
947  unsigned int uGffLineCount = 0;
948  list<string>::iterator it = m_TestLines.begin();
949 
950  for ( ; it != m_TestLines.end(); ++it) {
951  //
952  // Make sure to ignore any UCSC track and browser lines prior to the
953  // start of data
954  //
955  if (!uGffLineCount && NStr::StartsWith(*it, "##gff-version 3")) {
956  return false;
957  }
958  if ( it->empty() || (*it)[0] == '#' ) {
959  continue;
960  }
961  if ( !uGffLineCount && NStr::StartsWith( *it, "browser " ) ) {
962  return false;
963  }
964  if ( !uGffLineCount && NStr::StartsWith( *it, "track " ) ) {
965  return false;
966  }
967  if ( !IsLineAugustus( *it ) ) {
968  return false;
969  }
970  ++uGffLineCount;
971  }
972  return (uGffLineCount != 0);
973 }
974 
975 
976 // -----------------------------------------------------------------------------
// GFF2 test: every non-empty, non-'#' line (after any leading "browser "/"track "
// lines) must satisfy IsLineGff2(), and at least one such line must exist.
// NOTE(review): the function-name line is missing from this listing.
977 bool
979  EMode /* not used */ )
980 {
981  if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
982  return false;
983  }
984 
985  unsigned int uGffLineCount = 0;
986  list<string>::iterator it = m_TestLines.begin();
987 
988  for ( ; it != m_TestLines.end(); ++it) {
989  //
990  // Make sure to ignore any UCSC track and browser lines prior to the
991  // start of data
992  //
993  if ( it->empty() || (*it)[0] == '#' ) {
994  continue;
995  }
996  if ( !uGffLineCount && NStr::StartsWith( *it, "browser " ) ) {
997  continue;
998  }
999  if ( !uGffLineCount && NStr::StartsWith( *it, "track " ) ) {
1000  continue;
1001  }
1002  if ( ! IsLineGff2( *it ) ) {
1003  return false;
1004  }
1005  ++uGffLineCount;
1006  }
1007  return (uGffLineCount != 0);
1008 }
1009 
1010 
1011 // -----------------------------------------------------------------------------
// Glimmer3 test: the first line must start with '>', at least one further line must
// exist, and every further line must satisfy IsLineGlimmer3().
// NOTE(review): the function-name line is missing from this listing. The first
// iterator is dereferenced without an emptiness check - presumably a successful
// EnsureSplitLines() guarantees at least one line; confirm.
1012 bool
1014  EMode /* not used */ )
1015 {
1016  if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
1017  return false;
1018  }
1019 
1020  /// first line should be a FASTA defline
1021  list<string>::iterator it = m_TestLines.begin();
1022  if (it->empty() || (*it)[0] != '>') {
1023  return false;
1024  }
1025 
1026  /// there should be additional data lines, and they should be easily parseable,
1027  /// with five columns
1028  ++it;
1029  if (it == m_TestLines.end()) {
1030  return false;
1031  }
1032  for ( /**/; it != m_TestLines.end(); ++it) {
1033  if ( !IsLineGlimmer3( *it ) ) {
1034  return false;
1035  }
1036  }
1037  return true;
1038 }
1039 
1040 // -----------------------------------------------------------------------------
// AGP test: every sampled line must satisfy IsLineAgp(). Any exception thrown while
// checking a line is swallowed and treated as "not AGP".
// NOTE(review): the function-name line is missing from this listing.
1041 bool
1043  EMode /* not used */ )
1044 {
1045  if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
1046  return false;
1047  }
1048  ITERATE( list<string>, it, m_TestLines ) {
1049  try {
1050  if ( !IsLineAgp( *it ) ) {
1051  return false;
1052  }
1053  } catch(...) {
1054  return false;
1055  }
1056  }
1057  return true;
1058 }
1059 
1060 // -----------------------------------------------------------------------------
// Newick test. Three stages:
//  1. If no test buffer / split lines are available, read up to 8096 bytes directly
//     and treat them as a single line.
//  2. If any sampled line contains "#NEXUS", scan the remaining stream in 16 KiB
//     chunks for "begin trees;" (case-insensitive); the result of that scan is the
//     answer for NEXUS input.
//  3. Otherwise read a sample of up to 8 KiB - 1 bytes, push it back onto the stream
//     with CStreamUtils::Stepback(), and let IsSampleNewick() decide.
// NOTE(review): the function-name line and two statements of stage 1 (embedded
// lines 1075 and 1077) are missing from this listing.
1061 bool
1063  EMode /* not used */ )
1064 {
1065 // -----------------------------------------------------------------------------
1066  // newick trees can be found in nexus files. check for that first as a special case
1067  if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
1068  const int BUFFSIZE = 8096;
1069  if (m_pTestBuffer) {
1070  delete [] m_pTestBuffer;
1071  }
1072  m_pTestBuffer = new char[BUFFSIZE+1];
1073  m_Stream.read( m_pTestBuffer, BUFFSIZE );
1074  m_iTestDataSize = m_Stream.gcount();
1076  m_Stream.clear(); // in case we reached eof
1078  m_TestLines.push_back(m_pTestBuffer);
1079  }
1080 
1081  // Note: We can live with false negatives. Avoid false positives
1082  // at all cost.
1083 
1084  bool is_nexus = false;
1085  bool has_trees = false;
1086  const size_t check_size = 12;
1087 
1088  ITERATE( list<string>, it, m_TestLines ) {
1089  if ( NPOS != it->find( "#NEXUS" ) ) {
1090  is_nexus = true;
1091  }
1092  }
1093 
1094  // Trees can be anywhere in a nexus file. If nexus is true,
1095  // try to read the whole file to see if there is a tree.
1096  if (is_nexus) {
1097  // Read in file one chunk at a time. Readline would be better
1098  // but is not available for this stream. Since the text we
1099  // are looking for "begin trees;" may span two chunks, we
1100  // copy the last 12 characters of the previous chunk to
1101  // the front of the new one.
1102  const size_t read_size = 16384;
1103  char test_buf[read_size + check_size + 1];
1104  memset(test_buf, ' ', check_size); // "previous chunk" initially blank.
1105 
1106  size_t max_reads = 32768; // max read to locate tree: 512 MB
1107  for (size_t i = 0; i < max_reads; ++i) {
1108  m_Stream.read(test_buf+check_size, read_size);
1109  size_t num_read = m_Stream.gcount();
1110  if (num_read > 0) {
1111  test_buf[num_read + check_size] = 0; // null terminator
1112  if (NPOS != NStr::FindNoCase(CTempString(test_buf), "begin trees;")) {
1113  has_trees = true;
1114  m_Stream.clear(); // in case we reached eof
1115  break;
1116  }
1117  // copy end of buffer to beginning in case string
1118  // spans two buffers:
// NOTE(review): when num_read < check_size the source and destination ranges
// overlap, which strncpy does not permit; memmove would be the safe call.
1119  strncpy(test_buf, test_buf + num_read, check_size);
1120  }
1121 
1122  if (m_Stream.eof() || m_Stream.fail()) {
1123  m_Stream.clear(); // clear eof
1124  break;
1125  }
1126  }
1127  }
1128 
1129  // In a nexus file with a tree, we will just read in the tree (ignoring for now
1130  // the alignment)
1131  if (is_nexus ) {
1132  if (has_trees)
1133  return true;
1134  return false;
1135  }
1136 
1137  // special newick consideration:
1138  // newick files may come with all data cramped into a single run-on line,
1139  // that single oversized line may not have a line terminator
1140  const size_t maxSampleSize = 8*1024-1;
1141  size_t sampleSize = 0;
1142  char* pSample = new char[maxSampleSize+1];
1143  AutoArray<char> autoDelete(pSample);
1144 
1145  m_Stream.read(pSample, maxSampleSize);
1146  sampleSize = (size_t)m_Stream.gcount();
1147  m_Stream.clear(); // in case we reached eof
1148  CStreamUtils::Stepback(m_Stream, pSample, sampleSize);
1149  if (0 == sampleSize) {
1150  return false;
1151  }
1152 
1153  pSample[sampleSize] = 0;
1154  if (!IsSampleNewick(pSample)) { // tolerant of embedded line breaks
1155  return false;
1156  }
1157  return true;
1158 }
1159 
1160 // -----------------------------------------------------------------------------
// Binary ASN.1 test: returns true as soon as the sample contains a byte that is
// neither printable (isgraph) nor whitespace, other than '\1'.
// NOTE(review): `conf` is only ever set to eMaybe, never to eYes, so the final
// `return (conf == eYes)` always yields false - a sample whose only non-printing
// bytes are '\1' is rejected. Confirm whether that is intended.
// NOTE(review): the function-name line is missing from this listing.
1161 bool
1163  EMode /* not used */ )
1164 {
1165  if ( ! EnsureTestBuffer() ) {
1166  return false;
1167  }
1168 
1169  //
1170  // Criterion: Presence of any non-printing characters
1171  //
1172  EConfidence conf = eNo;
1173  for (int i = 0; i < m_iTestDataSize; ++i) {
1174  if ( !isgraph((unsigned char) m_pTestBuffer[i]) &&
1175  !isspace((unsigned char) m_pTestBuffer[i]) )
1176  {
1177  if (m_pTestBuffer[i] == '\1') {
1178  conf = eMaybe;
1179  } else {
1180  return true;
1181  }
1182  }
1183  }
1184  return (conf == eYes);
1185 }
1186 
1187 
1188 // -----------------------------------------------------------------------------
1189 bool
1191  EMode /* not used */ )
1192 {
1193  if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
1194  return false;
1195  }
1196 
1197  //
1198  // criteria are odd:
1199  //
1200  list<string>::const_iterator iter = m_TestLines.begin();
1201  list<string> toks;
1202 
1203  /// first line: one token, one number
1204  NStr::Split(*iter++, "\t ", toks, NStr::fSplit_Tokenize);
1205  if (toks.size() != 1 ||
1206  toks.front().find_first_not_of("0123456789") != string::npos) {
1207  return false;
1208  }
1209 
1210  // now, for remaining ones, we expect an alphanumeric item first,
1211  // followed by a set of floating-point values. Unless we are at the last
1212  // line, the number of values should increase monotonically
1213  for (size_t i = 1; iter != m_TestLines.end(); ++i, ++iter) {
1214  toks.clear();
1215  NStr::Split(*iter, "\t ", toks, NStr::fSplit_Tokenize);
1216  if (toks.size() != i) {
1217  /// we can ignore the last line ; it may be truncated
1218  list<string>::const_iterator it = iter;
1219  ++it;
1220  if (it != m_TestLines.end()) {
1221  return false;
1222  }
1223  }
1224 
1225  list<string>::const_iterator it = toks.begin();
1226  for (++it; it != toks.end(); ++it) {
1227  if ( ! s_IsTokenDouble( *it ) ) {
1228  return false;
1229  }
1230  }
1231  }
1232 
1233  return true;
1234 }
1235 
1236 // -----------------------------------------------------------------------------
// Accepts the sample only if every split sample line passes
// IsLineFlatFileSequence(); the first failing line rejects the sample.
// NOTE(review): the line carrying the function name is absent from this
// listing (inner numbering skips 1238). If m_TestLines can be empty after
// EnsureSplitLines() succeeds, this returns true -- confirm.
1237 bool
1239  EMode /* not used */ )
1240 {
1241  if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
1242  return false;
1243  }
1244 
1245  ITERATE (list<string>, it, m_TestLines) {
1246  if ( !IsLineFlatFileSequence( *it ) ) {
1247  return false;
1248  }
1249  }
1250  return true;
1251 }
1252 
1253 // -----------------------------------------------------------------------------
// Looks at the first non-empty sample line only: it must begin with
// ">Feature " or ">Features " for the sample to be accepted.
// NOTE(review): the line carrying the function name is absent from this
// listing (inner numbering skips 1255).
1254 bool
1256  EMode /* not used */ )
1257 {
1258  if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
1259  return false;
1260  }
1261 
1262  ITERATE( list<string>, it, m_TestLines ) {
1263  if (it->empty()) {
1264  continue;
1265  }
1266 
// find(...) != 0 means "does not start with"; both prefixes must fail
// for the line to be rejected.
1267  if (it->find(">Feature ") != 0 && it->find(">Features ") != 0) {
1268  return false;
1269  }
// Only the first non-empty line is examined.
1270  break;
1271  }
1272 
// NOTE(review): also reached when every line is empty, in which case the
// sample is accepted without any header having been seen -- confirm.
1273  return true;
1274 }
1275 
1276 // -----------------------------------------------------------------------------
// Decides whether the sample is XML by looking at how the buffer starts:
// an XML declaration or DOCTYPE (case-insensitive), or the opening tag of
// one of a short list of known document types (case-sensitive).
// NOTE(review): the line carrying the function name is absent from this
// listing (inner numbering skips 1278).
1277 bool
1279  EMode /* not used */ )
1280 {
1281  if ( ! EnsureTestBuffer() ) {
1282  return false;
1283  }
1284 
1285  string input( m_pTestBuffer, (size_t)m_iTestDataSize );
// NOTE(review): inner numbering skips 1286 here; a statement operating on
// `input` may be missing from this listing -- confirm against the real file.
1287 
1288  //
1289  // Test 1: If it starts with typical XML decorations such as "<?xml..."
1290  // then respect that:
1291  //
1292  if ( NStr::StartsWith( input, "<?XML", NStr::eNocase ) ) {
1293  return true;
1294  }
1295  if ( NStr::StartsWith( input, "<!DOCTYPE", NStr::eNocase ) ) {
1296  return true;
1297  }
1298 
1299  //
1300  // Test 2: In the absence of XML specific declarations, check whether the
1301  // input starts with the opening tag of a well known set of doc types:
1302  //
1303  static const char* known_types[] = {
1304  "<Blast4-request>"
1305  };
1306  for ( size_t i=0; i < ArraySize(known_types); ++i ) {
1307  if ( NStr::StartsWith( input, known_types[i], NStr::eCase ) ) {
1308  return true;
1309  }
1310  }
1311 
1312  return false;
1313 }
1314 
1315 // -----------------------------------------------------------------------------
// Accepts the sample as an alignment when TestFormatCLUSTAL() succeeds or
// when any sample line contains the text "#NEXUS".
// NOTE(review): the line carrying the function name is absent from this
// listing (inner numbering skips 1317).
1316 bool
1318  EMode /* not used */ )
1319 {
1320  if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
1321  return false;
1322  }
1323 
1324 
1325  if (TestFormatCLUSTAL()) {
1326  return true;
1327  }
1328 
1329  // Alignment files come in all different shapes and broken formats,
1330  // and some of them are hard to recognize as such, in particular
1331  // if they have been hacked up in a text editor.
1332 
1333  // This function only concerns itself with the ones that are
1334  // easy to recognize.
1335 
1336  // Note: We can live with false negatives. Avoid false positives
1337  // at all cost.
1338 
// The marker may appear anywhere within a line, not only at its start.
1339  ITERATE( list<string>, it, m_TestLines ) {
1340  if ( NPOS != it->find( "#NEXUS" ) ) {
1341  return true;
1342  }
1343  }
1344  return false;
1345 }
1346 
1347 // -----------------------------------------------------------------------------
// NOTE(review): the signature line is absent from this listing (inner
// numbering skips 1348). Judging by the call in TestFormatCLUSTAL this is
// presumably x_LooksLikeCLUSTALConservedInfo(line) -- confirm.
//
// Returns true when `line` contains nothing but whitespace and the
// characters ':', '*' and '.'; an empty `line` therefore also yields true.
1349 {
1350 
1351  for (auto c : line) {
// NOTE(review): if c is a plain char, passing it to isspace() without an
// unsigned char cast is undefined for negative values; other call sites in
// this file cast first -- confirm the element type of `line`.
1352  if ( isspace(c)) {
1353  continue;
1354  }
1355 
1356  if (c != ':' &&
1357  c != '*' &&
1358  c != '.') {
1359  return false;
1360  }
1361  }
1362  return true;
1363 }
1364 
1365 // -----------------------------------------------------------------------------
1366 bool CFormatGuess::x_TryProcessCLUSTALSeqData(const string& line, string& id, size_t& seg_length) const
1367 {
1368  vector<string> toks;
1370  const size_t num_toks = toks.size();
1371 
1372  if (num_toks != 2 &&
1373  num_toks != 3) {
1374  return false;
1375  }
1376 
1377  const string& seqdata = toks[1];
1378 
1379 
1380  unsigned int cumulated_res = 0;
1381  if (num_toks == 3) {
1382  cumulated_res = NStr::StringToUInt(toks[2], NStr::fConvErr_NoThrow);
1383  if (cumulated_res == 0) {
1384  return false;
1385  }
1386  }
1387 
1388  // Check sequence data
1389  ESequenceType seqtype =
1390  SequenceType(seqdata.c_str(), static_cast<unsigned int>(seqdata.size()), eST_Strict);
1391 
1392  if (seqtype == eUndefined) {
1393  return false;
1394  }
1395 
1396  if (num_toks == 3) {
1397  size_t num_gaps = count(seqdata.begin(), seqdata.end(), '-');
1398  if (((seqdata.size() - num_gaps) > cumulated_res)) {
1399  return false;
1400  }
1401  }
1402 
1403 
1404  id = toks[0];
1405  seg_length = seqdata.size();
1406 
1407  return true;
1408 }
1409 
1410 
1411 // -----------------------------------------------------------------------------
1412 
namespace { // anonymous namespace

/// Bookkeeping for the block of CLUSTAL sequence lines currently being
/// scanned; cleared via Reset() whenever a block ends.
struct SClustalBlockInfo
{
    bool         m_InBlock;  ///< set once a sequence line of the block was seen
    unsigned int m_Size;     ///< number of sequence lines counted in the block
    set<string>  m_Ids;      ///< sequence ids recorded for the block

    /// Start out in the same state Reset() establishes.
    SClustalBlockInfo()
        : m_InBlock(false),
          m_Size(0)
    {
    }

    /// Forget everything recorded about the current block.
    void Reset(void)
    {
        m_Ids.clear();
        m_Size = 0;
        m_InBlock = false;
    }
};

} // anonymous namespace
1432 
1433 // -----------------------------------------------------------------------------
// Walks the sample buffer line by line and checks it against a simple
// CLUSTAL block structure:
//  - lines starting with "CLUSTAL" are skipped;
//  - a blank line or a conserved-info line closes the current block, which
//    must hold at least two sequence lines by then;
//  - every other line must parse as sequence data with a segment of at most
//    60 characters, the same length as the previous line of the block, and
//    an id not yet used in the block.
// Returns true only if at least one block contained two sequence lines of
// equal segment length.
// NOTE(review): the line carrying the function name is absent from this
// listing (inner numbering skips 1435).
1434 bool
1436 {
1437 
1438  if (!EnsureTestBuffer()) {
1439  return false;
1440  }
1441 
1442  string strBuffer(m_pTestBuffer, m_iTestDataSize);
1443  CNcbiIstrstream TestBuffer(strBuffer);
1444  string strLine;
1445 
1446  SClustalBlockInfo block_info;
1447 
1448  bool has_valid_block = false;
1449  size_t seg_length = 0;
1450  size_t seg_length_prev = 0;
1451 
1452 
1453  const bool buffer_full = m_iTestDataSize == m_iTestBufferSize;
1454 
1455  while ( !TestBuffer.eof() ) {
1456  NcbiGetline(TestBuffer, strLine, "\r\n");
1457 
1458  if (buffer_full &&
1459  TestBuffer.eof()) { // Skip last line if buffer is full
1460  break; // to avoid misidentification due to line truncation
1461  }
1462 
1463  if (TestBuffer.fail()) {
1464  break;
1465  }
1466 
1467  if (NStr::StartsWith(strLine, "CLUSTAL")) {
1468  continue;
1469  }
1470 
// A blank line ends the block in progress, if any.
1471  if (NStr::IsBlank(strLine)) {
1472  if (block_info.m_InBlock) {
1473  if (block_info.m_Size < 2) {
1474  return false;
1475  }
1476  block_info.Reset();
1477  }
1478  continue;
1479  }
1480 
// A conserved-info line is only valid directly after a block of at least
// two sequence lines, and it ends that block.
1481  if (x_LooksLikeCLUSTALConservedInfo(strLine)) {
1482  if (! block_info.m_InBlock || block_info.m_Size<2) {
1483  return false;
1484  }
1485  block_info.Reset();
1486  continue;
1487  }
1488 
1489  string seq_id;
1490  if (!x_TryProcessCLUSTALSeqData(strLine, seq_id, seg_length)) {
1491  return false;
1492  }
1493 
1494  if (seg_length > 60) {
1495  return false;
1496  }
// From the second line of a block on, segment lengths must agree.
1497  if (block_info.m_InBlock) {
1498  if(seg_length != seg_length_prev) {
1499  return false;
1500  }
1501  has_valid_block = true;
1502  }
1503 
// An id may appear only once per block.
1504  if (block_info.m_Ids.find(seq_id) != block_info.m_Ids.end()) {
1505  return false;
1506  }
1507  block_info.m_Ids.insert(seq_id);
1508 
1509  seg_length_prev = seg_length;
1510  block_info.m_InBlock = true;
1511  ++(block_info.m_Size);
1512  }
1513 
1514  return has_valid_block;
1515 }
1516 
1517 
1518 // -----------------------------------------------------------------------------
// Checks whether the split sample lines form a table when tokenized with
// the delimiter characters in `delims`: the first usable line fixes the
// column count (at least 2), and later lines must have the same count.
// Empty lines and lines starting with '#' or ';' are ignored.
// NOTE(review): the line carrying the function name and the `delims`
// parameter is absent from this listing (inner numbering skips 1520).
1519  bool
1521  {
1522  list<string>::const_iterator iter = m_TestLines.begin();
1523  list<string> toks;
1524 
1525  // Skip initial lines since not all headers start with comments like # or ;:
1526  // Don't skip though if file is very short - add up to 3, 1 for each line
1527  // over 5:
// NOTE(review): the loop below runs for i = 5 and i = 6 only, so at most
// two lines are skipped, not three as the comment above says -- confirm
// which one is intended.
1528  for (size_t i=5; i<7; ++i)
1529  if (m_TestLines.size() > i)
1530  ++iter;
1531 
1532  /// determine the number of observed columns
1533  size_t ncols = 0;
1534  for ( ; iter != m_TestLines.end(); ++iter) {
1535  if (iter->empty() || (*iter)[0] == '#' || (*iter)[0] == ';') {
1536  continue;
1537  }
1538 
1539  toks.clear();
1540  NStr::Split(*iter, delims, toks, NStr::fSplit_Tokenize);
1541  ncols = toks.size();
1542  break;
1543  }
1544  if ( ncols < 2 ) {
1545  return false;
1546  }
1547 
// iter still points at the line that defined ncols, so that line is
// visited (and counted) again by the loop below.
1548  size_t nlines = 1;
1549  // verify that columns all have the same size
1550  // we can add an exception for the last line
1551  for ( ; iter != m_TestLines.end(); ++iter) {
1552  if (iter->empty() || (*iter)[0] == '#' || (*iter)[0] == ';') {
1553  continue;
1554  }
1555 
1556  toks.clear();
1557  NStr::Split(*iter, delims, toks, NStr::fSplit_Tokenize);
1558  if (toks.size() != ncols) {
1559  list<string>::const_iterator it = iter;
1560  ++it;
// A mismatch is tolerated only on the last line, and only when the data
// size is not below the buffer size.
1561  if (it != m_TestLines.end() || (m_iTestDataSize < m_iTestBufferSize) ) {
1562  return false;
1563  }
1564  } else {
1565  ++nlines;
1566  }
1567  // Tokens should only contain printable characters
1568  for (const auto& token : toks) {
1569  auto it = find_if(token.begin(), token.end(),
1570  [](unsigned char c){ return !isprint(c); });
1571  if (it != token.end()) {
1572  return false;
1573  }
1574  }
1575  }
// nlines starts at 1 and the defining line is counted again above, so
// this requires at least two lines with ncols columns.
1576  return ( nlines >= 3 );
1577  }
1578 
// Accepts the sample as a generic table if it is ASCII text and
// x_TestTableDelimiter() succeeds for one of the delimiter sets tried
// below, in this order: " ", " \t", "\t", ",", "|".
// NOTE(review): the line carrying the function name is absent from this
// listing (inner numbering skips 1580).
1579 bool
1581  EMode /* not used */ )
1582 {
1583  if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
1584  return false;
1585  }
1586  if ( ! IsAsciiText()) {//gp-13007: "table" means "ascii table"
1587  return false;
1588  }
1589 
1590  //
1591  // NOTE 1:
1592  // There is a bunch of file formats that are a special type of table and
1593  // that we want to identify (like Repeat Masker output). So not to shade
1594  // out those more special formats, this test should be performed only after
1595  // all the more specialized table formats have been tested.
1596  //
1597 
1598  //
1599  // NOTE 2:
1600  // The original criterion for this test was "the same number of observed
1601  // columns in every line".
1602  // In order to weed out false positives the following *additional*
1603  // conditions have been imposed:
1604  // - there are at least two observed columns
1605  // - the sample contains at least two non-comment lines.
1606  //
1607 
1608  //' ' ' \t' '\t' ',' '|'
1609  if (x_TestTableDelimiter(" "))
1610  return true;
1611  else if (x_TestTableDelimiter(" \t"))
1612  return true;
1613  else if (x_TestTableDelimiter("\t"))
1614  return true;
1615  else if (x_TestTableDelimiter(","))
1616  return true;
1617  else if (x_TestTableDelimiter("|"))
1618  return true;
1619 
1620  return false;
1621 }
1622 
1623 // -----------------------------------------------------------------------------
// NOTE(review): the signature line is absent from this listing (inner
// numbering skips 1624). Judging by the call `SkipCommentAndBlank(header)`
// in this file, this is presumably that helper, taking `text` by
// reference -- confirm.
//
// As long as `text` starts with one of ';', '#' or '!', drops everything
// before the next CR or LF (searching from position 1).
1625 {
1626  const CTempString COMMENT_SYMBOLS(";#!");
1627  const CTempString NEW_LINE_SYMBOLS("\r\n");
1628  while (true) {
// NOTE(review): inner numbering skips 1629 here; a statement (presumably
// the one dealing with leading blanks) is missing from this listing. As
// listed, the loop stops at the line break that ends the first comment.
1630  if ( COMMENT_SYMBOLS.find(text[0]) != CTempString::npos ) {
1631  CTempString::size_type pos = text.find_first_of(NEW_LINE_SYMBOLS, 1);
1632  text = text.substr(pos);
1633  } else {
1634  break;
1635  }
1636  }
1637 }
1638 
// Decides whether the sample looks like FASTA: after comments are skipped
// the text must start with '>', and the character statistics of the sample
// must look like sequence data.
// NOTE(review): the line carrying the function name is absent from this
// listing (inner numbering skips 1640).
1639 bool
1641 {
1642  if ( ! EnsureStats() ) {
1643  return false;
1644  }
1645 
1646  // reject obvious misfits:
// NOTE(review): inner numbering skips 1647 here; the declaration of
// `header` is missing from this listing -- confirm against the real file.
1648  SkipCommentAndBlank(header);
1649  if ( m_iTestDataSize == 0 || header.length() == 0 || header[0] != '>' ) {
1650  return false;
1651  }
// No data characters counted: require 75% alphanumerics and judge by the
// position of the first '|'.
1652  if ( m_iStatsCountData == 0 ) {
1653  if (0.75 > double(m_iStatsCountAlNumChars)/double(m_iTestDataSize) ) {
1654  return false;
1655  }
// NOTE(review): presumably NStr::Find() yields NPOS when there is no '|',
// which fails this comparison -- confirm.
1656  return ( NStr::Find( m_pTestBuffer, "|" ) <= 10 );
1657  }
1658 
1659  // remaining decision based on text stats:
1660  double dAlNumFraction = (double)m_iStatsCountAlNumChars / (double)m_iTestDataSize;
1661  double dDnaFraction = (double)m_iStatsCountDnaChars / (double)m_iStatsCountData;
1662  double dAaFraction = (double)m_iStatsCountAaChars / (double)m_iStatsCountData;
1663 
1664  // want at least 80% text-ish overall:
1665  if ( dAlNumFraction < 0.8 ) {
1666  return false;
1667  }
1668 
1669  // want more than 91 percent of either DNA content or AA content in what we
1670  // presume is data:
1671  if ( dDnaFraction > 0.91 || dAaFraction > 0.91 ) {
1672  return true;
1673  }
1674  return false;
1675 }
1676 
1677 // ----------------------------------------------------------------------------
1678 bool
1680  EMode /* not used */ )
1681 {
1682  if ( ! EnsureStats() ) {
1683  return false;
1684  }
1685 
1686  // reject obvious misfits:
1687  if ( m_iTestDataSize == 0 || m_pTestBuffer[0] == '>' ) {
1688  return false;
1689  }
1690 
1691  // criteria:
1692  // at least 80% text-ish,
1693  // 1st field of the first non-blank not comment line must start with letter.
1694  // "::=" as the 2nd field of the first non-blank non comment line.
1695  //
1696  double dAlNumFraction = (double)(m_iStatsCountAlNumChars+m_iStatsCountBraces) /
1697  (double)m_iTestDataSize;
1698  if ( dAlNumFraction < 0.80 ) {
1699  return false;
1700  }
1701 
1702  string strBuffer(m_pTestBuffer, m_iTestDataSize);
1703  CNcbiIstrstream TestBuffer(strBuffer);
1704  string strLine;
1705 
1706  while ( ! TestBuffer.fail() ) {
1707  vector<string> Fields;
1708  NcbiGetline(TestBuffer, strLine, "\n\r");
1709  NStr::Split(strLine, " \t", Fields, NStr::fSplit_Tokenize);
1710  if ( IsAsnComment( Fields ) ) {
1711  continue;
1712  }
1713  return ( Fields.size() >= 2 && Fields[1] == "::=" && isalpha(Fields[0][0]));
1714  }
1715  return false;
1716 }
1717 
1718 // -----------------------------------------------------------------------------
// Always reports "no match": this body contains no detection logic.
// NOTE(review): the line carrying the function name is absent from this
// listing (inner numbering skips 1720), so the format this stub belongs
// to cannot be read off the code shown here.
1719 bool
1721  EMode /* not used */ )
1722 {
1723  return false;
1724 }
1725 
1726 // -----------------------------------------------------------------------------
1727 bool
1729  EMode /* not used */ )
1730 {
1731  if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
1732  return false;
1733  }
1734  ITERATE( list<string>, it, m_TestLines ) {
1735  string str = *it;
1736  int rsid, chr, pos, numMatched;
1737  numMatched = sscanf( it->c_str(), "rs%d\t%d\t%d", &rsid, &chr, &pos);
1738  if ( numMatched == 3) {
1739  return true;
1740  }
1741  }
1742  return false;
1743 }
1744 
1745 
1746 // ----------------------------------------------------------------------------
1747 bool
1749  EMode /* not used */ )
1750 {
1751  if ( ! EnsureStats() || ! EnsureSplitLines() ) {
1752  return false;
1753  }
1754 
1755  bool bTrackLineFound( false );
1756  bool bHasStartAndStop ( false );
1757  size_t columncount = 0;
1758  ITERATE( list<string>, it, m_TestLines ) {
1759  string str = NStr::TruncateSpaces( *it );
1760  if ( str.empty() ) {
1761  continue;
1762  }
1763 
1764  // 'chr 8' fixup, the bedreader does this too
1765  if (str.find("chr ") == 0 ||
1766  str.find("Chr ") == 0 ||
1767  str.find("CHR ") == 0)
1768  str.erase(3, 1);
1769 
1770  //
1771  // while occurrence of the following decorations _is_ a good sign, they could
1772  // also be indicator for a variety of other UCSC data formats
1773  //
1774  if ( NStr::StartsWith( str, "track" ) ) {
1775  bTrackLineFound = true;
1776  continue;
1777  }
1778  if ( NStr::StartsWith( str, "browser" ) ) {
1779  continue;
1780  }
1781  if ( NStr::StartsWith( str, "#" ) ) {
1782  continue;
1783  }
1784 
1785  vector<string> columns;
1787  if (columns.size() < 3 || columns.size() > 12) {
1788  return false;
1789  }
1790  if ( columns.size() != columncount ) {
1791  if ( columncount == 0 ) {
1792  columncount = columns.size();
1793  }
1794  else {
1795  return false;
1796  }
1797  }
1798  if(columns.size() >= 3) {
1799  if (s_IsTokenPosInt(columns[1]) &&
1800  s_IsTokenPosInt(columns[2])) {
1801  bHasStartAndStop = true;
1802  }
1803  }
1804  }
1805 
1806  return (bHasStartAndStop || bTrackLineFound);
1807 }
1808 
1809 // ----------------------------------------------------------------------------
1810 bool
1812  EMode /* not used */ )
1813 {
1814  if ( ! EnsureStats() || ! EnsureSplitLines() ) {
1815  return false;
1816  }
1817 
1818  bool LineFound = false;
1819  size_t columncount = 15;
1820  ITERATE( list<string>, it, m_TestLines ) {
1821  if ( NStr::TruncateSpaces( *it ).empty() ) {
1822  continue;
1823  }
1824  //
1825  // while occurrence of the following decorations _is_ a good sign, they could
1826  // also be indicator for a variety of other UCSC data formats
1827  //
1828  if ( NStr::StartsWith( *it, "track" ) ) {
1829  continue;
1830  }
1831  if ( NStr::StartsWith( *it, "browser" ) ) {
1832  continue;
1833  }
1834  if ( NStr::StartsWith( *it, "#" ) ) {
1835  continue;
1836  }
1837 
1838  vector<string> columns;
1840  if ( columns.size() != columncount ) {
1841  return false;
1842  } else {
1843  if (!s_IsTokenPosInt(columns[1]) || //chr start
1844  !s_IsTokenPosInt(columns[2]) || //chr end
1845  !s_IsTokenPosInt(columns[4]) || //score
1846  !s_IsTokenPosInt(columns[6]) || //thick draw start
1847  !s_IsTokenPosInt(columns[7])) //thick draw end
1848  return false;
1849  string strand = NStr::TruncateSpaces(columns[5]);
1850 
1851  if (strand != "+" && strand != "-")
1852  return false;
1853 
1854  LineFound = true;
1855  }
1856  }
1857  return LineFound;
1858 }
1859 
1860 // ----------------------------------------------------------------------------
1861 bool
1863  EMode /* not used */ )
1864 {
1865  if ( ! EnsureStats() || ! EnsureSplitLines() ) {
1866  return false;
1867  }
1868  ITERATE( list<string>, it, m_TestLines ) {
1869  if ( NStr::StartsWith( *it, "track" ) ) {
1870  if ( NStr::Find( *it, "type=wiggle_0" ) != NPOS ) {
1871  return true;
1872  }
1873  if ( NStr::Find( *it, "type=bedGraph" ) != NPOS ) {
1874  return true;
1875  }
1876  }
1877  if ( NStr::StartsWith(*it, "fixedStep") ) { /* MSS-140 */
1878  if ( NStr::Find(*it, "chrom=") && NStr::Find(*it, "start=") ) {
1879  return true;
1880  }
1881  }
1882  if ( NStr::StartsWith(*it, "variableStep") ) { /* MSS-140 */
1883  if ( NStr::Find(*it, "chrom=") ) {
1884  return true;
1885  }
1886  return true;
1887  }
1888  }
1889  return false;
1890 }
1891 
1892 // ----------------------------------------------------------------------------
// Accepts the sample if it has at least one line that is neither empty
// nor starts with '#', and every such line passes IsLineHgvs().
// NOTE(review): the line carrying the function name is absent from this
// listing (inner numbering skips 1894).
1893 bool
1895  EMode /* not used */ )
1896 {
// Fallback when the regular statistics / line splitting is unavailable:
// re-read up to BUFFSIZE bytes from the stream into a fresh buffer and use
// that buffer as the only test line.
1897  if ( ! EnsureStats() || ! EnsureSplitLines() ) {
1898  const int BUFFSIZE = 1024;
1899  if (m_pTestBuffer) {
1900  delete [] m_pTestBuffer;
1901  }
1902  m_pTestBuffer = new char[BUFFSIZE+1];
1903  m_Stream.read( m_pTestBuffer, BUFFSIZE );
1904  m_iTestDataSize = m_Stream.gcount();
// NOTE(review): inner numbering skips 1905 and 1907 around the next
// statement; as listed, the buffer is pushed as a C string without a
// visible NUL terminator and the stream data is not put back -- confirm
// against the real file.
1906  m_Stream.clear(); // in case we reached eof
1908  m_TestLines.push_back(m_pTestBuffer);
1909  }
1910 
1911  unsigned int uHgvsLineCount = 0;
1912  list<string>::iterator it = m_TestLines.begin();
1913 
1914  for ( ; it != m_TestLines.end(); ++it) {
1915  if ( it->empty() || (*it)[0] == '#' ) {
1916  continue;
1917  }
1918  if ( ! IsLineHgvs( *it ) ) {
1919  return false;
1920  }
1921  ++uHgvsLineCount;
1922  }
1923  return (uHgvsLineCount != 0);
1924 }
1925 
1926 
1927 // ----------------------------------------------------------------------------
// Recognizes a zip archive by its first four bytes: "PK" followed by one
// of the byte pairs (1,2), (3,4), (5,6) or (7,8).
// NOTE(review): the line carrying the function name is absent from this
// listing (inner numbering skips 1929).
1927 bool
1930  EMode /* not used */ )
1931 {
1932  if ( ! EnsureTestBuffer() ) {
1933  return false;
1934  }
1935  // check if the first two bytes match with the zip magic number: 0x504B,
1936  // or PK and the next two bytes match with any of 0x0102, 0x0304, 0x0506
1937  // and 0x0708.
1938  if ( m_iTestDataSize < 4) {
1939  return false;
1940  }
1941  if (m_pTestBuffer[0] == 'P' && m_pTestBuffer[1] == 'K' &&
1942  ((m_pTestBuffer[2] == (char)1 && m_pTestBuffer[3] == (char)2) ||
1943  (m_pTestBuffer[2] == (char)3 && m_pTestBuffer[3] == (char)4) ||
1944  (m_pTestBuffer[2] == (char)5 && m_pTestBuffer[3] == (char)6) ||
1945  (m_pTestBuffer[2] == (char)7 && m_pTestBuffer[3] == (char)8) ) ) {
1946  return true;
1947  }
1948  return false;
1949 }
1950 
1951 
1952 // ----------------------------------------------------------------------------
// Recognizes gzip data by its first two bytes (31, 139).
// NOTE(review): the line carrying the function name is absent from this
// listing (inner numbering skips 1954).
1952 bool
1955  EMode /* not used */ )
1956 {
1957  if ( ! EnsureTestBuffer() ) {
1958  return false;
1959  }
1960  // check if the first two bytes match the gzip magic number: 0x1F8B
1961  if ( m_iTestDataSize < 2) {
1962  return false;
1963  }
// Both sides are compared as char, hence the casts on the constants.
1964  if (m_pTestBuffer[0] == (char)31 && m_pTestBuffer[1] == (char)139) {
1965  return true;
1966  }
1967  return false;
1968 }
1969 
1970 
1971 // ----------------------------------------------------------------------------
// Recognizes zstd data by its first four bytes.
// NOTE(review): the line carrying the function name is absent from this
// listing (inner numbering skips 1973).
1971 bool
1974  EMode /* not used */ )
1975 {
1976  if ( ! EnsureTestBuffer() ) {
1977  return false;
1978  }
1979  // check if the first 4 bytes match with the zstd magic number: 0xFD2FB528
1980  if ( m_iTestDataSize < 4) {
1981  return false;
1982  }
// The bytes are checked in the order 28 B5 2F FD, i.e. the magic number
// with its least significant byte first.
1983  if (m_pTestBuffer[0] == (char)0x28 &&
1984  m_pTestBuffer[1] == (char)0xB5 &&
1985  m_pTestBuffer[2] == (char)0x2F &&
1986  m_pTestBuffer[3] == (char)0xFD ) {
1987  return true;
1988  }
1989  return false;
1990 }
1991 
1992 
1993 // ----------------------------------------------------------------------------
// Recognizes bzip2 data by its first four bytes: "BZh" followed by a digit
// from '1' to '9'.
// NOTE(review): the line carrying the function name is absent from this
// listing (inner numbering skips 1995).
1993 bool
1996  EMode /* not used */ )
1997 {
1998  if ( ! EnsureTestBuffer() ) {
1999  return false;
2000  }
2001 
2002  // check if the first two bytes match with the bzip2 magic number: 0x425A,
2003  // or 'BZ' and the next two bytes match with 0x68(h) and 0x31-39(1-9)
2004  if ( m_iTestDataSize < 4) {
2005  return false;
2006  }
2007 
2008  if (m_pTestBuffer[0] == 'B' && m_pTestBuffer[1] == 'Z' &&
2009  m_pTestBuffer[2] == 'h' && m_pTestBuffer[3] >= '1' &&
2010  m_pTestBuffer[3] <= '9') {
2011  return true;
2012  }
2013 
2014  return false;
2015 }
2016 
2017 
2018 // ----------------------------------------------------------------------------
// Recognizes LZO data: the text "LZO" must appear at offset 0 or at
// offset 1 of the sample, and must be followed either by the end of the
// sample or by a NUL byte.
// NOTE(review): the line carrying the function name is absent from this
// listing (inner numbering skips 2020).
2018 bool
2021  EMode /* not used */ )
2022 {
2023  if ( ! EnsureTestBuffer() ) {
2024  return false;
2025  }
2026 
// Variant 1: "LZO" right at the start.
2027  if (m_iTestDataSize >= 3 && m_pTestBuffer[0] == 'L' &&
2028  m_pTestBuffer[1] == 'Z' && m_pTestBuffer[2] == 'O') {
2029  if (m_iTestDataSize == 3 ||
2030  (m_iTestDataSize > 3 && m_pTestBuffer[3] == '\0')) {
2031  return true;
2032  }
2033  }
2034 
// Variant 2: "LZO" after one leading byte, whose value is not checked.
2035  if (m_iTestDataSize >= 4 && m_pTestBuffer[1] == 'L' &&
2036  m_pTestBuffer[2] == 'Z' && m_pTestBuffer[3] == 'O') {
2037  if (m_iTestDataSize == 4 ||
2038  (m_iTestDataSize > 4 && m_pTestBuffer[4] == '\0')) {
2039  return true;
2040  }
2041  }
2042 
2043  return false;
2044 }
2045 
2046 
2047 bool CFormatGuess::TestFormatSra(EMode /* not used */ )
2048 {
2049  if ( !EnsureTestBuffer() || m_iTestDataSize < 16
2050  || CTempString(m_pTestBuffer, 8) != "NCBI.sra") {
2051  return false;
2052  }
2053 
2054  if (m_pTestBuffer[8] == '\x05' && m_pTestBuffer[9] == '\x03'
2055  && m_pTestBuffer[10] == '\x19' && m_pTestBuffer[11] == '\x88') {
2056  return true;
2057  } else if (m_pTestBuffer[8] == '\x88' && m_pTestBuffer[9] == '\x19'
2058  && m_pTestBuffer[10] == '\x03' && m_pTestBuffer[11] == '\x05') {
2059  return true;
2060  } else {
2061  return false;
2062  }
2063 }
2064 
// Always reports "no match"; the comments below explain why BAM is not
// detected in this module.
// NOTE(review): the signature line is absent from this listing (inner
// numbering skips 2065); presumably this is the BAM test -- confirm.
2066 {
2067  //rw-9:
2068  // the original heuristic to "guess" at the content of a gzip archive
2069  // broke down and we found a whole class of false positives for the
2070  // BAM format- on our very own FTP site no less!
2071  //To really be sure we are dealing indeed with BAM we would have to
2072  // decompress the beginning of the archive and peek inside- however, gzip
2073  // decompression is not available in this module.
2074 
2075  //If reliable BAM detection is needed, use objtools/readers/format_guess_ex
2076  // instead. It's a drop in replacement, and it's not any slower for any file
2077  // format that can be detected reliably here (because it calls this code
2078  // before doing any of the more fancy stuff). And because of the fancy
2079  // stuff it does, it will even classify some files this code can't (though
2080  // possibly at considerable extra expense).
2081  return false;
2082 }
2083 
2084 
// Accepts the sample as PSL if, after skipping leading '#' lines, every
// remaining line passes IsLinePsl(). The first data line decides whether
// the first column is ignored for all lines.
// NOTE(review): the signature line is absent from this listing (inner
// numbering skips 2085); presumably this is the PSL test -- confirm.
2086 {
2087  // for the most part, following https://genome.ucsc.edu/FAQ/FAQformat.html#format2.
2088  // note that UCSC downloads often include one extra column, right at the start
2089  // of each line. If that's the case then all records have that extra column. Since
2090  // UCSC downloads are common we will also accept as PSL anything that follows the
2091  // spec after the first column of every line has been tossed.
2092  // Note that I have also seen "#" columns but only at the very beginning of a file.
2093  //
2094  if ( ! EnsureTestBuffer() || ! EnsureSplitLines() ) {
2095  return false;
2096  }
2097 
2098  bool ignoreFirstColumn = false;
2099  unsigned int uPslLineCount = 0;
2100  list<string>::iterator it = m_TestLines.begin();
2101  while (it != m_TestLines.end() && NStr::StartsWith(*it, "#")) {
2102  it++;
2103  }
2104  if (it == m_TestLines.end()) {
2105  return false;
2106  }
// Try the first data line as-is, then once more without its first column.
2107  if (!IsLinePsl(*it, ignoreFirstColumn)) {
2108  ignoreFirstColumn = true;
2109  if (!IsLinePsl(*it, ignoreFirstColumn)) {
2110  return false;
2111  }
2112  }
2113  uPslLineCount++;
2114  it++;
2115  for ( ; it != m_TestLines.end(); ++it) {
2116  if ( ! IsLinePsl(*it, ignoreFirstColumn) ) {
2117  return false;
2118  }
2119  uPslLineCount++;
2120  }
// At least one line has been counted by the time this is reached.
2121  return (uPslLineCount != 0);
2122 }
2123 
2124 // ----------------------------------------------------------------------------
// Advances lineIt to the next line that starts in column 0 and returns
// that line's keyword and data; lineIt is left one past that line.
// Indented lines in between are skipped as long as their indentation is
// acceptable (see below). Returns false when the lines run out, a line is
// longer than 79 characters, an indentation is not acceptable, or the
// call in the try block throws.
// NOTE(review): the line carrying the function name is absent from this
// listing (inner numbering skips 2126); the name used in this file's
// calls is GenbankGetKeywordLine.
2125 bool
2127  list<string>::iterator& lineIt,
2128  list<string>::iterator endIt,
2129  string& keyword,
2130  string& data)
2131 // ----------------------------------------------------------------------------
2132 {
2133  if (lineIt == endIt) {
2134  return false;
2135  }
2136  if (lineIt->size() > 79) {
2137  return false;
2138  }
2139 
2140  vector<int> validIndents = {0, 2, 3, 5, 12, 21};
2141  auto firstNotBlank = lineIt->find_first_not_of(" ");
2142  while (firstNotBlank != 0) {
// An indentation outside the list is accepted only if the first character
// that is neither a blank nor a digit sits at offset 10.
2143  if (std::find(validIndents.begin(), validIndents.end(), firstNotBlank) ==
2144  validIndents.end()) {
2145  auto firstNotBlankOrDigit = lineIt->find_first_not_of(" 1234567890");
2146  if (firstNotBlankOrDigit != 10) {
2147  return false;
2148  }
2149  }
2150  lineIt++;
2151  if (lineIt == endIt) {
2152  return false;
2153  }
2154  firstNotBlank = lineIt->find_first_not_of(" ");
2155  }
// NOTE(review): inner numbering skips 2157 here; the line naming the
// function whose argument list continues below is missing from this
// listing -- confirm against the real file.
2156  try {
2158  *lineIt, " ", keyword, data, NStr::fSplit_MergeDelimiters);
2159  }
2160  catch (CException&) {
2161  return false;
2162  }
2163  lineIt++;
2164  return true;
2165 }
2166 
2167 // ----------------------------------------------------------------------------
// Walks the leading keyword lines of the sample and requires them to come
// in this order: LOCUS, DEFINITION (one or more), ACCESSION (one or more),
// optional NID, VERSION, optional NID (if not seen before), any number of
// PROJECT, any number of DBLINK, then KEYWORDS. The sample is accepted as
// soon as KEYWORDS is reached.
// NOTE(review): the line carrying the return type and function name is
// absent from this listing (inner numbering skips 2168).
2169  EMode /*unused*/)
2170 {
2171  // see ftp://ftp.ncbi.nih.gov/genbank/gbrel.txt
2172 
2173  if ( ! EnsureStats() || ! EnsureSplitLines() ) {
2174  return false;
2175  }
2176 
2177  // smell test:
2178  // note: sample size at least 8000 characters, line length soft limited to
2179  // 80 characters
2180  if (m_TestLines.size() < 9) { // number of required records
2181  return false;
2182  }
2183 
2184  string keyword, data, lookingFor;
2185  auto recordIt = m_TestLines.begin();
2186  auto endIt = m_TestLines.end();
// NOTE(review): inner numbering skips 2187 here; the line naming the
// function whose argument list continues below is missing from this
// listing -- confirm against the real file.
2188  *recordIt, " ", keyword, data, NStr::fSplit_MergeDelimiters);
2189 
2190  lookingFor = "LOCUS"; // exactly one
2191  if (keyword != lookingFor) {
2192  return false;
2193  }
2194  recordIt++;
2195  if (!GenbankGetKeywordLine(recordIt, endIt, keyword, data)) {
2196  return false;
2197  }
2198 
2199  lookingFor = "DEFINITION"; // one or more
2200  if (keyword != lookingFor) {
2201  return false;
2202  }
2203  while (keyword == lookingFor) {
2204  if (!GenbankGetKeywordLine(recordIt, endIt, keyword, data)) {
2205  return false;
2206  }
2207  }
2208 
2209  lookingFor = "ACCESSION"; // one or more
2210  if (keyword != lookingFor) {
2211  return false;
2212  }
2213  while (keyword == lookingFor) {
2214  if (!GenbankGetKeywordLine(recordIt, endIt, keyword, data)) {
2215  return false;
2216  }
2217  }
2218 
2219  bool nidSeen = false;
2220  lookingFor = "NID"; // zero or one, can come before or after VERSION
2221  if (keyword == lookingFor) {
2222  nidSeen = true;
2223  if (!GenbankGetKeywordLine(recordIt, endIt, keyword, data)) {
2224  return false;
2225  }
2226  }
2227 
2228  lookingFor = "VERSION"; // exactly one
2229  if (keyword != lookingFor) {
2230  return false;
2231  }
2232  if (!GenbankGetKeywordLine(recordIt, endIt, keyword, data)) {
2233  return false;
2234  }
2235 
2236  if (!nidSeen) {
2237  lookingFor = "NID"; // zero or one
2238  if (keyword == lookingFor) {
2239  if (!GenbankGetKeywordLine(recordIt, endIt, keyword, data)) {
2240  return false;
2241  }
2242  }
2243  }
2244 
2245  lookingFor = "PROJECT"; // zero or more
2246  while (keyword == lookingFor) {
2247  if (!GenbankGetKeywordLine(recordIt, endIt, keyword, data)) {
2248  return false;
2249  }
2250  }
2251 
2252  lookingFor = "DBLINK"; // zero or more
2253  while (keyword == lookingFor) {
2254  if (!GenbankGetKeywordLine(recordIt, endIt, keyword, data)) {
2255  return false;
2256  }
2257  }
2258 
2259  lookingFor = "KEYWORDS"; // one or more
2260  if (keyword != lookingFor) {
2261  return false;
2262  }
2263 
2264  // I am convinced now. There may be flaws farther down but this input
2265  // definitely wants to be a Genbank flat file.
2266  return true;
2267 }
2268 
2269 // ----------------------------------------------------------------------------
// Skips lines starting with "XX", then splits the next line into its line
// code and data and moves lineIt past it. Returns false only when the
// lines run out. If the call in the try block throws, the whole line
// becomes the line code and the data is left empty.
// NOTE(review): the line carrying the function name is absent from this
// listing (inner numbering skips 2271); the name used in this file's
// calls is EnaGetLineData.
2270 bool
2272  list<string>::iterator& lineIt,
2273  list<string>::iterator endIt,
2274  string& lineCode,
2275  string& lineData)
2276 // ----------------------------------------------------------------------------
2277 {
2278  while (lineIt != endIt && NStr::StartsWith(*lineIt, "XX")) {
2279  lineIt++;
2280  }
2281  if (lineIt == endIt) {
2282  return false;
2283  }
// NOTE(review): inner numbering skips 2285 here; the line naming the
// function whose argument list continues below is missing from this
// listing -- confirm against the real file.
2284  try {
2286  *lineIt, " ", lineCode, lineData, NStr::fSplit_MergeDelimiters);
2287  }
2288  catch(CException&) {
2289  lineCode = *lineIt;
2290  lineData = "";
2291  }
2292  lineIt++;
2293  return true;
2294 }
2295 
2296 // ----------------------------------------------------------------------------
// Walks the leading line codes of the sample and requires them to come in
// this order: ID, AC (one or more), any number of PR, exactly two DT,
// then DE, KW, OS and OC (one or more each).
// Running out of lines is a rejection up to and including the DT lines;
// from DE onwards it is treated as a match.
// NOTE(review): the line carrying the return type and function name is
// absent from this listing (inner numbering skips 2297).
2298  EMode /*unused*/)
2299 {
2300  // see: ftp://ftp.ebi.ac.uk/pub/databases/ena/sequence/release/doc/usrman.txt
2301 
2302  if ( ! EnsureStats() || ! EnsureSplitLines() ) {
2303  return false;
2304  }
2305 
2306  // smell test:
2307  // note: sample size at least 8000 characters, line length soft limited to
2308  // 78 characters
2309  if (m_TestLines.size() < 19) { // number of required records
2310  return false;
2311  }
2312 
2313  string lineCode, lineData, lookingFor;
2314  auto recordIt = m_TestLines.begin();
2315  auto endIt = m_TestLines.end();
// NOTE(review): inner numbering skips 2316 here; the line naming the
// function whose argument list continues below is missing from this
// listing -- confirm against the real file.
2317  *recordIt, " ", lineCode, lineData, NStr::fSplit_MergeDelimiters);
2318 
2319  lookingFor = "ID"; // exactly one
2320  if (lineCode != lookingFor) {
2321  return false;
2322  }
2323  recordIt++;
2324 
2325  lookingFor = "AC"; // one or more
2326  if (!EnaGetLineData(recordIt, endIt, lineCode, lineData)) {
2327  return false;
2328  }
2329  if (lineCode != lookingFor) {
2330  return false;
2331  }
2332  while (lineCode == lookingFor) {
2333  if (!EnaGetLineData(recordIt, endIt, lineCode, lineData)) {
2334  return false;
2335  }
2336  }
2337 
2338  lookingFor = "PR"; // zero or more
2339  while (lineCode == lookingFor) {
2340  if (!EnaGetLineData(recordIt, endIt, lineCode, lineData)) {
2341  return false;
2342  }
2343  }
2344 
2345  lookingFor = "DT"; // two (first hard difference from UniProt)
2346  for (int i = 0; i < 2; ++i) {
2347  if (lineCode != lookingFor) {
2348  return false;
2349  }
2350  if (!EnaGetLineData(recordIt, endIt, lineCode, lineData)) {
2351  return false;
2352  }
2353  }
2354 
2355  lookingFor = "DE"; // one or more
2356  if (lineCode != lookingFor) {
2357  return false;
2358  }
2359  while (lineCode == lookingFor) {
// From here on, running out of lines counts as a match.
2360  if (!EnaGetLineData(recordIt, endIt, lineCode, lineData)) {
2361  return true;
2362  }
2363  }
2364 
2365  lookingFor = "KW"; // one or more
2366  if (lineCode != lookingFor) {
2367  return false;
2368  }
2369  while (lineCode == lookingFor) {
2370  if (!EnaGetLineData(recordIt, endIt, lineCode, lineData)) {
2371  return true;
2372  }
2373  }
2374 
2375  lookingFor = "OS"; // one or more
2376  if (lineCode != lookingFor) {
2377  return false;
2378  }
2379  while (lineCode == lookingFor) {
2380  if (!EnaGetLineData(recordIt, endIt, lineCode, lineData)) {
2381  return true;
2382  }
2383  }
2384 
2385  lookingFor = "OC"; // one or more
2386  if (lineCode != lookingFor) {
2387  return false;
2388  }
2389  while (lineCode == lookingFor) {
2390  if (!EnaGetLineData(recordIt, endIt, lineCode, lineData)) {
2391  return true;
2392  }
2393  }
2394 
2395  // once here it's Ena or someone is messing with me
2396  return true;
2397 }
2398 
2399 // ----------------------------------------------------------------------------
// Reads one sample line for the UniProt test.
//   lineIt   - in/out: line to read; advanced by one on success
//   endIt    - end of the sample lines
//   lineCode - out: receives the code part of the line
//   lineData - out: receives the remainder of the line
// Returns false (touching nothing) when lineIt is already at endIt, true
// otherwise. If splitting the line throws a CException, the whole line
// becomes lineCode and lineData is set to the empty string.
// NOTE(review): the listing lines holding the function name (2401) and the
// head of the split call (2412, presumably NStr::SplitInTwo) are missing from
// this copy.
2400 bool
2402  list<string>::iterator& lineIt,
2403  list<string>::iterator endIt,
2404  string& lineCode,
2405  string& lineData)
2406 // ----------------------------------------------------------------------------
2407 {
2408  if (lineIt == endIt) {
2409  return false;
2410  }
2411  try {
2413  *lineIt, " ", lineCode, lineData, NStr::fSplit_MergeDelimiters);
2414  }
2415  catch(CException&) {
2416  lineCode = *lineIt;
2417  lineData = "";
2418  }
2419  lineIt++;
2420  return true;
2421 }
2422 
2423 // ----------------------------------------------------------------------------
// Tests whether the sampled lines follow the UniProt flat-file line-code
// order: "ID" (one), "AC" (one or more), "DT" (three), "DE" (one or more),
// followed by a "GN" or "OS" line.
// Returns false on the first line code that does not fit, or when the sample
// runs out before the "DE" section; returns true when the sample runs out
// inside the "DE" section or when "GN"/"OS" follows it.
// NOTE(review): the listing line holding the return type and function name
// (2424) is missing from this copy, and so is line 2447, which opens the call
// whose arguments appear on line 2448 (presumably NStr::SplitInTwo - confirm).
2425  EMode /*unused*/)
2426 {
2427  // see: https://web.expasy.org/docs/userman.html#genstruc
2428 
2429  if ( ! EnsureStats() || ! EnsureSplitLines() ) {
2430  return false;
2431  }
2432 
2433  // smell test:
2434  // note: sample size at least 8000 characters, line length soft limited to
2435  // 75 characters
2436  if (m_TestLines.size() < 15) { // number of required records
2437  return false;
2438  }
2439 
2440  // note:
2441  // we are only trying to assert that the input is *meant* to be uniprot.
2442  // we should not be in the business of validation - this should happen
2443  // downstream, with better error messages than we could possibly provide here.
2444  string lineCode, lineData, lookingFor;
2445  auto recordIt = m_TestLines.begin();
2446  auto endIt = m_TestLines.end();
2448  *recordIt, " ", lineCode, lineData, NStr::fSplit_MergeDelimiters);
2449 
2450  lookingFor = "ID"; // exactly one
2451  if (lineCode != lookingFor) {
2452  return false;
2453  }
2454  recordIt++;
2455 
2456  lookingFor = "AC"; // one or more
2457  if (!UniProtGetLineData(recordIt, endIt, lineCode, lineData)) {
2458  return false;
2459  }
2460  if (lineCode != lookingFor) {
2461  return false;
2462  }
2463  while (lineCode == lookingFor) {
2464  if (!UniProtGetLineData(recordIt, endIt, lineCode, lineData)) {
2465  return false;
2466  }
2467  }
2468 
2469  lookingFor = "DT"; // three (first hard difference from ENA, which is tested for two)
2470  for (int i = 0; i < 3; ++i) {
2471  if (lineCode != lookingFor) {
2472  return false;
2473  }
2474  if (!UniProtGetLineData(recordIt, endIt, lineCode, lineData)) {
2475  return false;
2476  }
2477  }
2478 
2479 
2480  lookingFor = "DE"; // one or more
2481  if (lineCode != lookingFor) {
2482  return false;
2483  }
2484  while (lineCode == lookingFor) {
2485  if (!UniProtGetLineData(recordIt, endIt, lineCode, lineData)) {
2486  return true; // sample exhausted inside the DE section: accept
2487  }
2488  }
2489 
2490  // optional "GN" line or first "OS" line
2491  if (lineCode != "GN" && lineCode != "OS") {
2492  return false;
2493  }
2494 
2495  // once here it's UniProt or someone is messing with me
2496  return true;
2497 }
2498 
2499 // ----------------------------------------------------------------------------
// Returns true when any sampled line starts with "##fileformat=VCFv";
// returns false otherwise, or when EnsureStats() / EnsureSplitLines() fail.
// NOTE(review): the listing line holding the return type and function name
// (2500) is missing from this copy.
2501  EMode)
2502 {
2503  // Currently, only look for the header line identifying the VCF version.
2504  // Waive the requirement that this be the first line, but still expect it
2505  // to be in the initial sample.
2506  if ( ! EnsureStats() || ! EnsureSplitLines() ) {
2507  return false;
2508  }
2509 
2510  ITERATE( list<string>, it, m_TestLines ) {
2511  if (NStr::StartsWith(*it, "##fileformat=VCFv")) {
2512  return true;
2513  }
2514  }
2515  return false;
2516 }
2517 // ----------------------------------------------------------------------------
2518 
2519 
2520 // ----------------------------------------------------------------------------
2521 void CFormatGuess::x_StripJsonStrings(string& testString) const
2522 {
2523  list<size_t> limits;
2524  x_FindJsonStringLimits(testString, limits);
2525 
2526  // If no strings found
2527  if ( limits.empty() ) {
2528  return;
2529  }
2530 
2531  if (limits.size()%2 == 1) {
2532  // Perhaps testString ends on an open string
2533  // Tack on an additional set of quotes at the end
2534  testString += "\"";
2535  limits.push_back(testString.size()-1);
2536  }
2537  // The length of the limits container is now even
2538 
2539  // Iterate over string start and stop sites
2540  // Strip strings and copy what remains to complement
2541  string complement = "";
2542 
2543  auto it = limits.begin();
2544  size_t comp_interval_start = 0;
2545  while (it != limits.end()) {
2546  const size_t string_start = *it++;
2547  if (string_start > comp_interval_start) {
2548  const size_t comp_interval_length = string_start-comp_interval_start;
2549  complement += testString.substr(comp_interval_start, comp_interval_length);
2550  }
2551 
2552  const size_t string_stop = *it++;
2553  comp_interval_start = string_stop+1;
2554  }
2555 
2556  if (comp_interval_start < testString.size()) {
2557  complement += testString.substr(comp_interval_start);
2558  }
2559 
2560  testString = complement;
2561  return;
2562 }
2563 // ----------------------------------------------------------------------------
2564 
2565 
2566 // ----------------------------------------------------------------------------
2567 void CFormatGuess::x_FindJsonStringLimits(const string& input, list<size_t>& limits) const
2568 {
2569  limits.clear();
2570  const string& double_quotes = R"(")";
2571 
2572  bool is_start = true;
2573  size_t pos = NStr::Find(input, double_quotes);
2574  // List all string start and stop positions
2575  while ( pos != NPOS ) {
2576  limits.push_back(pos);
2577  if (is_start) {
2578  pos = x_FindNextJsonStringStop(input, pos+1);
2579  } else {
2580  pos = NStr::Find(input, double_quotes, pos+1);
2581  }
2582  is_start = !is_start;
2583  }
2584 }
2585 // ----------------------------------------------------------------------------
2586 
2587 
2588 // ----------------------------------------------------------------------------
// Counts the backslash characters ('\\') that immediately precede position
// pos in input.
//   input - text to inspect
//   pos   - index of the character whose preceding backslashes are counted
// Returns 0 when pos is 0 or not a valid index into input.
size_t s_GetPrecedingFslashCount(const string& input, const size_t pos)
{
    if (pos == 0 || pos >= input.size()) {
        return 0;
    }

    // Walk backwards with an unsigned index; idx is always one past the
    // character being examined, so it never has to go below zero.
    size_t num_backslash = 0;
    for (size_t idx = pos; idx > 0 && input[idx - 1] == '\\'; --idx) {
        ++num_backslash;
    }
    return num_backslash;
}
2606 // ----------------------------------------------------------------------------
2607 
2608 
2609 // ----------------------------------------------------------------------------
2610 size_t CFormatGuess::x_FindNextJsonStringStop(const string& input, const size_t from_pos) const
2611 {
2612  const string& double_quotes = R"(")";
2613  size_t pos = NStr::Find(input, double_quotes, from_pos);
2614 
2615  // Double quotes immediately preceded by an odd number of forward
2616  // slashes, for example, /", ///", are escaped
2617  while (pos != NPOS) {
2618  const size_t num_fslash = s_GetPrecedingFslashCount(input, pos);
2619  // If the number of forward slashes is even,
2620  // return the position of the double quotes
2621  if (num_fslash%2 == 0) {
2622  break;
2623  }
2624  pos = NStr::Find(input, double_quotes, pos+1);
2625  }
2626  return pos;
2627 }
2628 // ----------------------------------------------------------------------------
2629 
2630 
2631 // ----------------------------------------------------------------------------
2632 bool CFormatGuess::x_CheckStripJsonNumbers(string& testString) const
2633 {
2634  if (NStr::IsBlank(testString)) {
2635  return true;
2636  }
2637 
2638  list<string> subStrings;
2639  // Split on white space
2640  NStr::Split(testString, " \r\t\n", subStrings, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
2641 
2642  for (auto it = subStrings.cbegin(); it != subStrings.cend(); ++it) {
2643  const string subString = *it;
2644 
2645  if (!x_IsNumber(subString)) { // The last substring might be a truncated number or keyword
2646  ++it;
2647  if (it == subStrings.cend()) {
2648  testString = subString;
2649  return true;
2650  }
2651  return false;
2652  }
2653  }
2654 
2655  testString.clear();
2656  return true;
2657 }
2658 // ----------------------------------------------------------------------------
2659 
2660 
2661 // -----------------------------------------------------------------------------
2662 bool CFormatGuess::x_IsTruncatedJsonNumber(const string& testString) const
2663 {
2664  // Truncation of a JSON number may result strings of the following type:
2665  // 1.1e
2666  // 1.1E
2667  // 1.7E-
2668  // +
2669  // -
2670  // NStr::StringToDouble cannot handle such truncations, but we can "fix"
2671  // the truncation by appending zero ("0") to the truncated string
2672 
2673  const string extendedString = testString + "0";
2674 
2675  return x_IsNumber(extendedString);
2676 }
2677 // -----------------------------------------------------------------------------
2678 
2679 
2680 // -----------------------------------------------------------------------------
2681 bool CFormatGuess::x_IsNumber(const string& testString) const
2682 {
2683  try {
2684  NStr::StringToDouble(testString);
2685  }
2686  catch (...) {
2687  return false;
2688  }
2689  return true;
2690 }
2691 // -----------------------------------------------------------------------------
2692 
2693 
2694 // -----------------------------------------------------------------------------
2695 bool CFormatGuess::x_IsTruncatedJsonKeyword(const string& testString) const
2696 {
2697  const size_t stringSize = testString.size();
2698  // nul, tru, fals
2699  if (stringSize > 4) {
2700  return false;
2701  }
2702 
2703  const string nullString("null");
2704  const string trueString("true");
2705  const string falseString("false");
2706 
2707  if (testString == nullString.substr(0, stringSize) ||
2708  testString == trueString.substr(0, stringSize) ||
2709  testString == falseString.substr(0, stringSize)) {
2710  return true;
2711  }
2712 
2713  return false;
2714 }
2715 // -----------------------------------------------------------------------------
2716 
2717 
2718 // ----------------------------------------------------------------------------
2719 bool CFormatGuess::x_IsBlankOrNumbers(const string& testString) const
2720 {
2721  if (NStr::IsBlank(testString)) {
2722  return true;
2723  }
2724 
2725  list<string> numStrings;
2726  // Split on white space
2727  NStr::Split(testString, " \r\t\n", numStrings, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
2728 
2729  for (auto numString : numStrings) {
2730  if (!x_IsNumber(numString)) {
2731  return false;
2732  }
2733  }
2734 
2735  return true;
2736 }
2737 // ----------------------------------------------------------------------------
2738 
2739 
2740 // ----------------------------------------------------------------------------
2741 bool CFormatGuess::x_CheckStripJsonPunctuation(string& testString) const
2742 {
2743  // Parentheses are prohibited
2744  if (testString.find_first_of("()") != string::npos) {
2745  return false;
2746  }
2747 
2748  const size_t punctuation_threshold = 4;
2749 
2750  // Reject if the number of punctuation characters falls below some threshold value.
2751  // In this case, the threshold is hardcoded to 4.
2752  if (x_StripJsonPunctuation(testString) < punctuation_threshold) {
2753  return false;
2754  }
2755 
2756  return true;
2757 }
2758 // ----------------------------------------------------------------------------
2759 
2760 
2761 // ----------------------------------------------------------------------------
2762 size_t CFormatGuess::x_StripJsonPunctuation(string& testString) const
2763 {
2764  size_t initial_len = testString.size();
2765 
2766  NStr::ReplaceInPlace(testString, "{", "");
2767  NStr::ReplaceInPlace(testString, "}", "");
2768  NStr::ReplaceInPlace(testString, "[", "");
2769  NStr::ReplaceInPlace(testString, "]", "");
2770  NStr::ReplaceInPlace(testString, ":", "");
2771  NStr::ReplaceInPlace(testString, ",", "");
2772 
2773  return testString.size() - initial_len;
2774 }
2775 // ----------------------------------------------------------------------------
2776 
2777 
2778 // ----------------------------------------------------------------------------
2779 void CFormatGuess::x_StripJsonKeywords(string& testString) const
2780 {
2781  NStr::ReplaceInPlace(testString, "true", "");
2782  NStr::ReplaceInPlace(testString, "false", "");
2783  NStr::ReplaceInPlace(testString, "null", "");
2784 }
2785 // ----------------------------------------------------------------------------
2786 
2787 
2788 // ----------------------------------------------------------------------------
2789 bool CFormatGuess::x_CheckJsonStart(const string& testString) const
2790 {
2791  if (NStr::StartsWith(testString, "{")) {
2792  // Next character must begin a string
2793  const auto next_pos = testString.find_first_not_of("( \t\r\n",1);
2794  if (next_pos != NPOS && testString[next_pos] == '\"') {
2795  return true;
2796  }
2797  }
2798  else
2799  if (NStr::StartsWith(testString, "[")) {
2800  return true;
2801  }
2802 
2803  return false;
2804 }
2805 // ----------------------------------------------------------------------------
2806 
2807 
2808 // ----------------------------------------------------------------------------
// Tests whether the sample buffer looks like JSON. The buffer is copied into
// a string, which must pass x_CheckJsonStart; then strings, punctuation,
// keywords and numbers are stripped in turn. The input is accepted when
// nothing but blanks remains, or when the remainder is a truncated number or
// a truncated keyword.
// NOTE(review): the listing line holding the return type and function name
// (2809) and the statement on line 2820 are missing from this copy.
2810  EMode)
2811 {
2812 
2813  // Convert the test-buffer character array to a string
2814  string testString(m_pTestBuffer, m_iTestDataSize);
2815 
2816  if ( NStr::IsBlank(testString) ) {
2817  return false;
2818  }
2819 
2821 
2822  if (!x_CheckJsonStart(testString)) {
2823  return false;
2824  }
2825 
2826  x_StripJsonStrings(testString);
2827 
2828  if ( !x_CheckStripJsonPunctuation(testString) ) {
2829  return false;
2830  }
2831 
2832  x_StripJsonKeywords(testString);
2833 
2834  if (!x_CheckStripJsonNumbers(testString)) {
2835  return false;
2836  }
2837 
2838  if ( NStr::IsBlank(testString) ) {
2839  return true;
2840  }
2841 
2842  // What remains is either a truncated number
2843  // or a truncated keyword
2844  // NOTE(review): bitwise '|' evaluates both calls; the result equals '||'
2845  // for bool operands, which is probably what was intended - confirm.
2844  return x_IsTruncatedJsonNumber(testString) |
2845  x_IsTruncatedJsonKeyword(testString);
2846 }
2847 // ----------------------------------------------------------------------------
2848 
2849 
2850 // ----------------------------------------------------------------------------
// Returns true when the first non-empty sample line contains the labels
// "SW", "perc", "query", "position", "matching", the following line contains
// "score", "div.", "del.", "ins.", "sequence", and at least one more line
// follows. Within a line each label is searched (case-sensitively) starting
// at the offset where the previous label was found.
// NOTE(review): the listing line holding the function signature (2851) and
// the statement on line 2868 inside the first loop are missing from this
// copy.
2852 {
2853  //
2854  // Repeatmasker files consist of columnar data with a couple of lines
2855  // of column labels prepended to it (but sometimes someone strips those
2856  // labels).
2857  // This function tries to identify repeatmasker data by those column
2858  // label lines. They should be the first non-blanks in the file.
2859  //
2860  // Both label arrays are terminated by an empty string, which ends the loops below.
2860  string labels_1st_line[] = { "SW", "perc", "query", "position", "matching", "" };
2861  string labels_2nd_line[] = { "score", "div.", "del.", "ins.", "sequence", "" };
2862 
2863  //
2864  // Purge junk lines:
2865  //
2866  list<string>::iterator it = m_TestLines.begin();
2867  for ( ; it != m_TestLines.end(); ++it ) {
2869  if ( *it != "" ) {
2870  break;
2871  }
2872  }
2873 
2874  if ( it == m_TestLines.end() ) {
2875  return false;
2876  }
2877 
2878  //
2879  // Verify first line of labels:
2880  //
2881  size_t current_offset = 0;
2882  for ( size_t i=0; labels_1st_line[i] != ""; ++i ) {
2883  current_offset = NStr::FindCase( *it, labels_1st_line[i], current_offset );
2884  if ( current_offset == NPOS ) {
2885  return false;
2886  }
2887  }
2888 
2889  //
2890  // Verify second line of labels:
2891  //
2892  ++it;
2893  if ( it == m_TestLines.end() ) {
2894  return false;
2895  }
2896  current_offset = 0;
2897  for ( size_t j=0; labels_2nd_line[j] != ""; ++j ) {
2898  current_offset = NStr::FindCase( *it, labels_2nd_line[j], current_offset );
2899  if ( current_offset == NPOS ) {
2900  return false;
2901  }
2902  }
2903 
2904  //
2905  // Should have at least one extra line:
2906  //
2907  ++it;
2908  if ( it == m_TestLines.end() ) {
2909  return false;
2910  }
2911 
2912  return true;
2913 }
2914 
2915 
2916 // ----------------------------------------------------------------------------
// Returns true when every sample line that is non-empty after trimming
// spaces is accepted by IsLineRmo; returns false at the first line that is
// not. A sample with no non-empty lines therefore yields true.
// NOTE(review): the listing line holding the function signature (2917) is
// missing from this copy.
2918 {
2919  //
2920  // Repeatmasker files consist of columnar data with a couple of lines
2921  // of column labels prepended to it (but sometimes someone strips those
2922  // labels).
2923  // This function assumes the column labels have been stripped and attempts
2924  // to identify RMO by checking the data itself.
2925  //
2926 
2927  //
2928  // We declare the data as RMO if we are able to parse every record in the
2929  // sample we got:
2930  //
2931  ITERATE( list<string>, it, m_TestLines ) {
2932  string str = NStr::TruncateSpaces( *it );
2933  if ( str == "" ) {
2934  continue;
2935  }
2936  if ( ! IsLineRmo( str ) ) {
2937  return false;
2938  }
2939  }
2940 
2941  return true;
2942 }
2943 
2944 
2945 // ----------------------------------------------------------------------------
// Decides whether cline looks like (the beginning of) a Newick tree.
// The line is trimmed and must start with '('. Then, in order:
//   1. bracketed comments "[...]" are removed,
//   2. single-quoted labels are replaced by the letter 'A',
//   3. distance markers (":" plus optional sign, digits, optional fraction)
//      are removed,
//   4. what remains must still start with '(' and must not contain a ')' or
//      a ',' at parenthesis depth zero.
// Unbalanced opening parentheses are accepted, since the sample may hold only
// the first part of a tree.
// NOTE(review): the listing line holding the function name (2947) is missing
// from this copy.
2946 bool
2948  const string& cline )
2949 // ----------------------------------------------------------------------------
2950 {
2951  // NOTE:
2952  // See http://evolution.genetics.washington.edu/phylip/newick_doc.html
2953  //
2954  // Note that Newick trees tend to be written out as a single long line. Thus,
2955  // we are most likely only seeing the first part of a tree.
2956  //
2957 
2958  // NOTE:
2959  // MSS-112 introduced the concept of multitree files in which after the ";"
2960  // another tree may start. The new logic accepts files as Newick if they
2961  // are Newick up to and including the first semicolon. It does not look
2962  // beyond.
2963 
2964  string line = NStr::TruncateSpaces( cline );
2965  if ( line.empty() || line[0] != '(') {
2966  return false;
2967  }
2968  {{
2969  // Strip out comments:
2970  string trimmed;
2971  bool in_comment = false;
2972  for ( size_t ii=0; line.c_str()[ii] != 0; ++ii ) {
2973  if ( ! in_comment ) {
2974  if ( line.c_str()[ii] != '[' ) {
2975  trimmed += line.c_str()[ii];
2976  }
2977  else {
2978  in_comment = true;
2979  }
2980  }
2981  else /* in_comment */ {
2982  if ( line.c_str()[ii] == ']' ) {
2983  in_comment = false;
2984  }
2985  }
2986  }
2987  line = trimmed;
2988  }}
2989  {{
2990  // Compress quoted labels (each becomes the single placeholder 'A'):
2991  string trimmed;
2992  bool in_quote = false;
2993  for ( size_t ii=0; line.c_str()[ii] != 0; ++ii ) {
2994  if ( ! in_quote ) {
2995  if ( line.c_str()[ii] != '\'' ) {
2996  trimmed += line.c_str()[ii];
2997  }
2998  else {
2999  in_quote = true;
3000  trimmed += 'A';
3001  }
3002  }
3003  else { /* in_quote */
3004  if ( line.c_str()[ii] == '\'' ) {
3005  in_quote = false;
3006  }
3007  }
3008  }
3009  line = trimmed;
3010  }}
3011  {{
3012  // Strip distance markers:
3013  string trimmed;
3014  size_t ii=0;
3015  while ( line.c_str()[ii] != 0 ) {
3016  if ( line.c_str()[ii] != ':' ) {
3017  trimmed += line.c_str()[ii++];
3018  }
3019  else {
3020  // skip ':' then optional sign, integer digits and optional ".digits";
3020  // the terminating NUL of c_str() stops each scan at end of line
3020  ii++;
3021  if ( line.c_str()[ii] == '-' || line.c_str()[ii] == '+' ) {
3022  ii++;
3023  }
3024  while ( '0' <= line.c_str()[ii] && line.c_str()[ii] <= '9' ) {
3025  ii++;
3026  }
3027  if ( line.c_str()[ii] == '.' ) {
3028  ii++;
3029  while ( '0' <= line.c_str()[ii] && line.c_str()[ii] <= '9' ) {
3030  ii++;
3031  }
3032  }
3033  }
3034  }
3035  line = trimmed;
3036  }}
3037  {{
3038  // Rough lexical analysis of what's left. Bail immediately on fault:
3039  if (line.empty() || line[0] != '(') {
3040  return false;
3041  }
3042  // depth starts at 1 for the leading '(' checked above
3042  size_t paren_count = 1;
3043  for ( size_t ii=1; line.c_str()[ii] != 0; ++ii ) {
3044  switch ( line.c_str()[ii] ) {
3045  default:
3046  break;
3047  case '(':
3048  ++paren_count;
3049  break;
3050  case ')':
3051  if ( paren_count == 0 ) {
3052  return false;
3053  }
3054  --paren_count;
3055  break;
3056  case ',':
3057  if ( paren_count == 0 ) {
3058  return false;
3059  }
3060  break;
3061  case ';':
3062 // if ( line[ii+1] != 0 ) {
3063 // return false;
3064 // }
3065  break;
3066  }
3067  }
3068  }}
3069  return true;
3070 }
3071 
3072 
3073 // ----------------------------------------------------------------------------
3075  const string& line )
3076 {
3077  // blocks of ten residues (or permitted punctuation characters)
3078  // with a count at the start or end; require at least four
3079  // (normally six)
3080  SIZE_TYPE pos = line.find_first_not_of("0123456789 \t");
3081  if (pos == NPOS || pos + 45 >= line.size()) {
3082  return false;
3083  }
3084 
3085  for (SIZE_TYPE i = 0; i < 45; ++i) {
3086  char c = line[pos + i];
3087  if (i % 11 == 10) {
3088  if ( !isspace(c) ) {
3089  return false;
3090  }
3091  } else {
3092  if ( !isalpha(c) && c != '-' && c != '*') {
3093  return false;
3094  }
3095  }
3096  }
3097 
3098  return true;
3099 }
3100 
3101 
3102 // ----------------------------------------------------------------------------
// Decides whether label is acceptable as a Newick node label.
// Returns false if label contains '[' or ']'. Returns true if it has no ':'.
// Otherwise the text after the first ':' must be digits, optionally followed
// by '.' and more digits, up to the end of the label; either digit run may be
// empty.
// NOTE(review): the listing line holding the return type and function name
// (3103) is missing from this copy.
3104  const string& label )
3105 {
3106  // Starts with a string of anything other than "[]:", optionally followed by
3107  // a single ':', followed by a number, optionally followed by a dot and
3108  // another number.
3109  if ( NPOS != label.find_first_of( "[]" ) ) {
3110  return false;
3111  }
3112  size_t colon = label.find( ':' );
3113  if ( NPOS == colon ) {
3114  return true;
3115  }
3116  // first non-digit after the colon: must be absent or a '.'
3116  size_t dot = label.find_first_not_of( "0123456789", colon + 1 );
3117  if ( NPOS == dot ) {
3118  return true;
3119  }
3120  if ( label[ dot ] != '.' ) {
3121  return false;
3122  }
3123  size_t end = label.find_first_not_of( "0123456789", dot + 1 );
3124  return ( NPOS == end );
3125 }
3126 
3127 
3128 // ----------------------------------------------------------------------------
// Decides whether strLine looks like an AGP record. Text from the first '#'
// on is dropped; a line that is empty after that is accepted. Otherwise the
// line is tokenized on blanks and tabs and needs at least 8 tokens:
//   tokens 1-3: integers (a leading '-' is tolerated),
//   token 4:    one character out of "ADFGPNOW",
//   if token 4 is "N": token 5 must be a non-negative integer,
//   otherwise: tokens 6 and 7 must be non-negative integers, there must be
//   exactly 9 tokens, and token 8 must be "+" or "-".
// NOTE(review): the listing line holding the return type and function name
// (3129) and the statement on line 3142 are missing from this copy.
3130  const string& strLine )
3131 {
3132  //
3133  // Note: The reader allows for line and endline comments starting with a '#'.
3134  // So we accept them here, too.
3135  //
3136  string line( strLine );
3137  size_t uCommentStart = NStr::Find( line, "#" );
3138 
3139  if ( NPOS != uCommentStart ) {
3140  line = line.substr( 0, uCommentStart );
3141  }
3143  if ( line.empty() ) {
3144  return true;
3145  }
3146 
3147  vector<string> tokens;
3148  if ( NStr::Split(line, " \t", tokens, NStr::fSplit_Tokenize).size() < 8 ) {
3149  return false;
3150  }
3151 
3152  // A leading '-' is overwritten with '1' so that a negative number still
3152  // passes the non-negative integer check below.
3152  if ( tokens[1].size() > 1 && tokens[1][0] == '-' ) {
3153  tokens[1][0] = '1';
3154  }
3155  if ( -1 == NStr::StringToNonNegativeInt( tokens[1] ) ) {
3156  return false;
3157  }
3158 
3159  if ( tokens[2].size() > 1 && tokens[2][0] == '-' ) {
3160  tokens[2][0] = '1';
3161  }
3162  if ( -1 == NStr::StringToNonNegativeInt( tokens[2] ) ) {
3163  return false;
3164  }
3165 
3166  if ( tokens[3].size() > 1 && tokens[3][0] == '-' ) {
3167  tokens[3][0] = '1';
3168  }
3169  if ( -1 == NStr::StringToNonNegativeInt( tokens[3] ) ) {
3170  return false;
3171  }
3172 
3173  if ( tokens[4].size() != 1 || NPOS == tokens[4].find_first_of( "ADFGPNOW" ) ) {
3174  return false;
3175  }
3176  if ( tokens[4] == "N" ) {
3177  if ( -1 == NStr::StringToNonNegativeInt( tokens[5] ) ) {
3178  return false;
3179  }
3180  }
3181  else {
3182  if ( -1 == NStr::StringToNonNegativeInt( tokens[6] ) ) {
3183  return false;
3184  }
3185  if ( -1 == NStr::StringToNonNegativeInt( tokens[7] ) ) {
3186  return false;
3187  }
3188  if ( tokens.size() != 9 ) {
3189  return false;
3190  }
3191  if ( tokens[8].size() != 1 || NPOS == tokens[8].find_first_of( "+-" ) ) {
3192  return false;
3193  }
3194  }
3195 
3196  return true;
3197 }
3198 
3199 
3200 // ----------------------------------------------------------------------------
// Decides whether line looks like a Glimmer3 prediction record: exactly five
// blank/tab separated tokens, of which the second and third are integers,
// the fourth is an integer in the range -3..3, and the fifth is a double.
// The first token is not inspected.
// NOTE(review): the listing line holding the return type and function name
// (3201) is missing from this copy.
3202  const string& line )
3203 {
3204  list<string> toks;
3205  NStr::Split(line, "\t ", toks, NStr::fSplit_Tokenize);
3206  if (toks.size() != 5) {
3207  return false;
3208  }
3209 
3210  list<string>::iterator i = toks.begin();
3211 
3212  /// first column: skip (ascii identifier)
3213  ++i;
3214 
3215  /// second, third columns: both ints
3216  if ( ! s_IsTokenInteger( *i++ ) ) {
3217  return false;
3218  }
3219  if ( ! s_IsTokenInteger( *i++ ) ) {
3220  return false;
3221  }
3222 
3223  /// fourth column: int in the range of -3...3
3224  if ( ! s_IsTokenInteger( *i ) ) {
3225  return false;
3226  }
3227  int frame = NStr::StringToInt( *i++ );
3228  if (frame < -3 || frame > 3) {
3229  return false;
3230  }
3231 
3232  /// fifth column: score; double
3233  if ( ! s_IsTokenDouble( *i ) ) {
3234  return false;
3235  }
3236 
3237  return true;
3238 }
3239 
3240 
3241 // ----------------------------------------------------------------------------
// Decides whether line looks like a GTF record. The line is tokenized on
// blanks and tabs; tokens 3 and 4 must pass s_IsTokenPosInt, token 5 must
// pass s_IsTokenDouble, token 6 must be one of ". + -", token 7 one of
// ". 0 1 2 3", and a ninth token must exist and contain "gene_id" or
// "transcript_id".
// NOTE(review): the listing line holding the return type and function name
// (3242) is missing from this copy.
3243  const string& line )
3244 {
3245  vector<string> tokens;
3246  if ( NStr::Split(line, " \t", tokens, NStr::fSplit_Tokenize).size() < 8 ) {
3247  return false;
3248  }
3249  if ( ! s_IsTokenPosInt( tokens[3] ) ) {
3250  return false;
3251  }
3252  if ( ! s_IsTokenPosInt( tokens[4] ) ) {
3253  return false;
3254  }
3255  if ( ! s_IsTokenDouble( tokens[5] ) ) {
3256  return false;
3257  }
3258  if ( tokens[6].size() != 1 || NPOS == tokens[6].find_first_of( ".+-" ) ) {
3259  return false;
3260  }
3261  if ( tokens[7].size() != 1 || NPOS == tokens[7].find_first_of( ".0123" ) ) {
3262  return false;
3263  }
3264  // the size check comes first so tokens[8] is only read when it exists
3264  if ( tokens.size() < 9 ||
3265  (NPOS == tokens[8].find( "gene_id" ) && NPOS == tokens[8].find( "transcript_id" ) ) ) {
3266  return false;
3267  }
3268  return true;
3269 }
3270 
3271 
3272 // ----------------------------------------------------------------------------
3274  const string& line )
3275 // ----------------------------------------------------------------------------
3276 {
3277  vector<string> tokens;
3278  if ( NStr::Split(line, " \t", tokens, NStr::fSplit_Tokenize).size() < 8 ) {
3279  return false;
3280  }
3281  if ( ! s_IsTokenPosInt( tokens[3] ) ) {
3282  return false;
3283  }
3284  if ( ! s_IsTokenPosInt( tokens[4] ) ) {
3285  return false;
3286  }
3287 
3288  //make sure that "type" is a GVF admissible value:
3289  {{
3290  bool typeOk = false;
3291  list<string> terms;
3292  terms.push_back("snv");
3293  terms.push_back("cnv");
3294  terms.push_back("copy_number_variation");
3295  terms.push_back("gain");
3296  terms.push_back("copy_number_gain");
3297  terms.push_back("loss");
3298  terms.push_back("copy_number_loss");
3299  terms.push_back("loss_of_heterozygosity");
3300  terms.push_back("complex");
3301  terms.push_back("complex_substitution");
3302  terms.push_back("complex_sequence_alteration");
3303  terms.push_back("indel");
3304  terms.push_back("insertion");
3305  terms.push_back("inversion");
3306  terms.push_back("substitution");
3307  terms.push_back("deletion");
3308  terms.push_back("duplication");
3309  terms.push_back("translocation");
3310  terms.push_back("upd");
3311  terms.push_back("uniparental_disomy");
3312  terms.push_back("maternal_uniparental_disomy");
3313  terms.push_back("paternal_uniparental_disomy");
3314  terms.push_back("tandom_duplication");
3315  terms.push_back("structural_variation");
3316  terms.push_back("sequence_alteration");
3317  ITERATE(list<string>, termiter, terms) {
3318  if(NStr::EqualNocase(*termiter, tokens[2])) {
3319  typeOk = true;
3320  break;
3321  }
3322  }
3323  if (!typeOk) {
3324  return false;
3325  }
3326  }}
3327  if ( ! s_IsTokenDouble( tokens[5] ) ) {
3328  return false;
3329  }
3330  if ( tokens[6].size() != 1 || NPOS == tokens[6].find_first_of( ".+-" ) ) {
3331  return false;
3332  }
3333  if ( tokens[7].size() != 1 || NPOS == tokens[7].find_first_of( ".0123" ) ) {
3334  return false;
3335  }
3336 
3337  //make sure all the mandatory attributes are present:
3338  string attrs = tokens[8];
3339  if (string::npos == attrs.find("ID="))
3340  return false;
3341  if (string::npos == attrs.find("Variant_seq=")) {
3342  return false;
3343  }
3344  return true;
3345 }
3346 
3347 
3348 // ----------------------------------------------------------------------------
// Decides whether line looks like a GFF3 record. The line is tokenized on
// blanks and tabs; tokens 3 and 4 must pass s_IsTokenPosInt, token 5 must
// pass s_IsTokenDouble, token 6 must be one of ". + - ?", token 7 one of
// ". 0 1 2 3", and a non-empty ninth token must exist. If that ninth token
// is longer than one character it must contain at least one of "ID",
// "Parent", "Target", "Name", "Alias", "Note", "Dbxref", "Xref".
// NOTE(review): the listing line holding the return type and function name
// (3349) is missing from this copy.
3350  const string& line )
3351 {
3352  vector<string> tokens;
3353  if ( NStr::Split(line, " \t", tokens, NStr::fSplit_Tokenize).size() < 8 ) {
3354  return false;
3355  }
3356  if ( ! s_IsTokenPosInt( tokens[3] ) ) {
3357  return false;
3358  }
3359  if ( ! s_IsTokenPosInt( tokens[4] ) ) {
3360  return false;
3361  }
3362  if ( ! s_IsTokenDouble( tokens[5] ) ) {
3363  return false;
3364  }
3365  if ( tokens[6].size() != 1 || NPOS == tokens[6].find_first_of( ".+-?" ) ) {
3366  return false;
3367  }
3368  if ( tokens[7].size() != 1 || NPOS == tokens[7].find_first_of( ".0123" ) ) {
3369  return false;
3370  }
3371  if ( tokens.size() < 9 || tokens[8].empty()) {
3372  return false;
3373  }
3374  // a single-character ninth column is accepted without looking for attribute names
3374  if ( tokens.size() >= 9 && tokens[8].size() > 1) {
3375  const string& col9 = tokens[8];
3376  if ( NPOS == NStr::Find(col9, "ID") &&
3377  NPOS == NStr::Find(col9, "Parent") &&
3378  NPOS == NStr::Find(col9, "Target") &&
3379  NPOS == NStr::Find(col9, "Name") &&
3380  NPOS == NStr::Find(col9, "Alias") &&
3381  NPOS == NStr::Find(col9, "Note") &&
3382  NPOS == NStr::Find(col9, "Dbxref") &&
3383  NPOS == NStr::Find(col9, "Xref") ) {
3384  return false;
3385  }
3386  }
3387 
3388  return true;
3389 }
3390 
3391 
3392 // ----------------------------------------------------------------------------
3394  const string& line )
3395 {
3396  vector<string> tokens;
3397  string remaining(line), head, tail;
3398 
3399  //column 0: ID, string
3400  if (!NStr::SplitInTwo(remaining, " \t", head, tail)) {
3401  return false;
3402  }
3403  remaining = tail;
3404 
3405  //column 1: method, most likely "AUGUSTUS" but don't want to rely on this
3406  if (!NStr::SplitInTwo(remaining, " \t", head, tail)) {
3407  return false;
3408  }
3409  remaining = tail;
3410 
3411  //column 2: feature type, controlled vocabulary
3412  if (!NStr::SplitInTwo(remaining, " \t", head, tail)) {
3413  return false;
3414  }
3415  remaining = tail;
3416  string featureType = head;
3417 
3418  //column 3: start, integer
3419  if (!NStr::SplitInTwo(remaining, " \t", head, tail) || !s_IsTokenPosInt(head)) {
3420  return false;
3421  }
3422  remaining = tail;
3423 
3424  //column 4: stop, integer
3425  if (!NStr::SplitInTwo(remaining, " \t", head, tail) || !s_IsTokenPosInt(head)) {
3426  return false;
3427  }
3428  remaining = tail;
3429 
3430  //column 5: score, double
3431  if (!NStr::SplitInTwo(remaining, " \t", head, tail) || !s_IsTokenDouble(head)) {
3432  return false;
3433  }
3434  remaining = tail;
3435 
3436  //column 6: strand, one in "+-.?"
3437  const string legalStrands{"+-.?"};
3438  if (!NStr::SplitInTwo(remaining, " \t", head, tail) || head.size() != 1 ||
3439  string::npos == legalStrands.find(head)) {
3440  return false;
3441  }
3442  remaining = tail;
3443 
3444  //column 7: phase, one in ".0123"
3445  const string legalPhases{".0123"};
3446  if (!NStr::SplitInTwo(remaining, " \t", head, tail) || head.size() != 1 ||
3447  string::npos == legalPhases.find(head)) {
3448  return false;
3449  }
3450  remaining = tail;
3451 
3452  //everything else: attributes, format depends on featureType
3453  if (remaining.empty()) {
3454  return false;
3455  }
3456 
3457  if (featureType == "gene") {
3458  if (NPOS != NStr::Find(remaining, ";")) {
3459  return false;
3460  }
3461  if (NPOS != NStr::Find(remaining, " ")) {
3462  return false;
3463  }
3464  return true;
3465  }
3466  if (featureType == "transcript") {
3467  if (NPOS != NStr::Find(remaining, ";")) {
3468  return false;
3469  }
3470  if (NPOS != NStr::Find(remaining, " ")) {
3471  return false;
3472  }
3473  return true;
3474  }
3475  if (NPOS == NStr::Find(remaining, "transcript_id")) {
3476  return false;
3477  }
3478  if (NPOS == NStr::Find(remaining, "gene_id")) {
3479  return false;
3480  }
3481  return true;
3482 }
3483 
3484 
3485 // ----------------------------------------------------------------------------
3487  const string& line )
3488 {
3489  vector<string> tokens;
3490  const size_t num_cols = NStr::Split(line, " \t", tokens, NStr::fSplit_Tokenize).size();
3491  if ( num_cols < 8 ) {
3492  return false;
3493  }
3494  if ( ! s_IsTokenPosInt( tokens[3] ) ) {
3495  return false;
3496  }
3497  if ( ! s_IsTokenPosInt( tokens[4] ) ) {
3498  return false;
3499  }
3500  if ( ! s_IsTokenDouble( tokens[5] ) ) {
3501  return false;
3502  }
3503  if ( tokens[6].size() != 1 || NPOS == tokens[6].find_first_of( ".+-" ) ) {
3504  return false;
3505  }
3506  if ( tokens[7].size() != 1 || NPOS == tokens[7].find_first_of( ".0123" ) ) {
3507  return false;
3508  }
3509  return true;
3510 }
3511 
3512 
3513 // ----------------------------------------------------------------------------
3515  const string& line )
3516 {
3517  vector<string> values;
3518  if ( NStr::Split(line, " \t", values, NStr::fSplit_Tokenize).empty() ) {
3519  return false;
3520  }
3521 
3522  //
3523  // Old style: "^DNA \\w+ "
3524  //
3525  if ( values[0] == "DNA" ) {
3526  return true;
3527  }
3528 
3529  //
3530  // New style: "^AS [0-9]+ [0-9]+"
3531  //
3532  if ( values[0] == "AS" ) {
3533  return ( 0 <= NStr::StringToNonNegativeInt( values[1] ) &&
3534  0 <= NStr::StringToNonNegativeInt( values[2] ) );
3535  }
3536 
3537  return false;
3538 }
3539 
3540 
3541 // ----------------------------------------------------------------------------
3543  const string& line )
3544 {
3545  const size_t MIN_VALUES_PER_RECORD = 14;
3546 
3547  //
3548  // Make sure there is enough stuff on that line:
3549  //
3550  list<string> values;
3551  if ( NStr::Split(line, " \t", values, NStr::fSplit_Tokenize).size() < MIN_VALUES_PER_RECORD ) {
3552  return false;
3553  }
3554 
3555  //
3556  // Look at specific values and make sure they are of the correct type:
3557  //
3558 
3559  // 1: positive integer:
3560  list<string>::iterator it = values.begin();
3561  if ( ! s_IsTokenPosInt( *it ) ) {
3562  return false;
3563  }
3564 
3565  // 2: float:
3566  ++it;
3567  if ( ! s_IsTokenDouble( *it ) ) {
3568  return false;
3569  }
3570 
3571  // 3: float:
3572  ++it;
3573  if ( ! s_IsTokenDouble( *it ) ) {
3574  return false;
3575  }
3576 
3577  // 4: float:
3578  ++it;
3579  if ( ! s_IsTokenDouble( *it ) ) {
3580  return false;
3581  }
3582 
3583  // 5: string, not checked
3584  ++it;
3585 
3586  // 6: positive integer:
3587  ++it;
3588  if ( ! s_IsTokenPosInt( *it ) ) {
3589  return false;
3590  }
3591 
3592  // 7: positive integer:
3593  ++it;
3594  if ( ! s_IsTokenPosInt( *it ) ) {
3595  return false;
3596  }
3597 
3598  // 8: positive integer, likely in paretheses, not checked:
3599  ++it;
3600 
3601  // 9: '+' or 'C':
3602  ++it;
3603  if ( *it != "+" && *it != "C" ) {
3604  return false;
3605  }
3606 
3607  // and that's enough for now. But there are at least two more fields
3608  // with values that look testable.
3609 
3610  return true;
3611 }
3612 
3613 
3614 // ----------------------------------------------------------------------------
3615 bool
3617  const string& line,
3618  bool ignoreFirstLine)
3619 // ----------------------------------------------------------------------------
3620 {
3621  vector<string> tokens;
3622  int firstColumn = (ignoreFirstLine ? 1 : 0);
3623  NStr::Split(line, " \t", tokens, NStr::fSplit_Tokenize);
3624  if (tokens.size() - firstColumn != 21) {
3625  return false;
3626  }
3627  // first 8 columns are positive integers:
3628  for (auto column = firstColumn; column < firstColumn + 8; ++column) {
3629  if (!s_IsTokenPosInt(tokens[column]) ) {
3630  return false;
3631  }
3632  }
3633  // next is one or two "+" or "-":
3634  const string& token = tokens[firstColumn + 8];
3635  if (token.empty() || token.size() > 2) {
3636  return false;
3637  }
3638  if (token.find_first_not_of("-+") != string::npos) {
3639  return false;
3640  }
3641  // columns 11 - 13 are positive integers:
3642  for (auto column = firstColumn + 10; column < firstColumn + 13; ++column) {
3643  if (!s_IsTokenPosInt(tokens[column]) ) {
3644  return false;
3645  }
3646  }
3647  // columns 15 - 18 are positive integers:
3648  for (auto column = firstColumn + 14; column < firstColumn + 18; ++column) {
3649  if (!s_IsTokenPosInt(tokens[column]) ) {
3650  return false;
3651  }
3652  }
3653 
3654  // the following is disabled because we want to allow incorrect but recognizable
3655  // PSL to pass as PSL.
3656  // This will hopefully give the user a better error message as to what is wrong
3657  // with the data than we can do within the constraints of the format guesser.
3658 #if 0
3659  int blockCount = NStr::StringToInt(tokens[firstColumn + 17]);
3660  // columns 19 - 21 are comma separated lists of positive integers, list size
3661  // must be equal to blockCount
3662  for (auto column = firstColumn + 18; column < firstColumn + 21; ++column) {
3663  vector<string> hopefullyInts;
3664  NStr::Split(tokens[column], ",", hopefullyInts, NStr::fSplit_Tokenize);
3665  if (hopefullyInts.size() != blockCount) {
3666  return false;
3667  }
3668  for (auto hopefulInt: hopefullyInts) {
3669  if (!s_IsTokenPosInt(hopefulInt) ) {
3670  return false;
3671  }
3672  }
3673  }
3674 #endif
3675  return true;
3676 }
3677 
3678 
3679 // ----------------------------------------------------------------------------
3680 bool
3682  const vector<string>& Fields )
3683 {
3684  if ( Fields.size() == 0 ) {
3685  return true;
3686  }
3687  return ( NStr::StartsWith( Fields[0], "--" ) );
3688 }
3689 
3690 // ----------------------------------------------------------------------------
3691 
// Break the test buffer (m_pTestBuffer, m_iTestDataSize bytes) into lines and
// keep them in m_TestLines. The work is done once: m_bSplitDone is set on the
// first call and later calls only report the cached outcome.
// Returns true when m_TestLines ends up non-empty, false otherwise.
// NOTE(review): this listing has numbering gaps (3693, 3722, 3725, 3728). The
// line carrying the function name and the bodies of the three line-break
// branches are not visible here - confirm against the repository copy.
3692 bool
3694 // ----------------------------------------------------------------------------
3695 {
// Already split earlier: just report whether any lines were found.
3696  if ( m_bSplitDone ) {
3697  return !m_TestLines.empty();
3698  }
3699  m_bSplitDone = true;
3700 
3701  //
3702  // Make sure the given data is ASCII before checking potential line breaks:
3703  //
// Count bytes with the high bit (0x80) set; give up when the integer quotient
// of buffer size by that count drops below MIN_HIGH_RATIO.
3704  const size_t MIN_HIGH_RATIO = 20;
3705  size_t high_count = 0;
3706  for ( streamsize i=0; i < m_iTestDataSize; ++i ) {
3707  if ( 0x80 & m_pTestBuffer[i] ) {
3708  ++high_count;
3709  }
3710  }
3711  if ( 0 < high_count && m_iTestDataSize / high_count < MIN_HIGH_RATIO ) {
3712  return false;
3713  }
3714 
3715  //
3716  // Let's expect at least one line break in the given data:
3717  //
3718  string data( m_pTestBuffer, (size_t)m_iTestDataSize );
3719  m_TestLines.clear();
3720 
// The line-break styles are probed in the order "\r\n", "\n", "\r".
// NOTE(review): the three branch bodies below are empty in this listing;
// presumably each one splits `data` into m_TestLines on the delimiter it just
// detected - confirm against the repository copy.
3721  if ( string::npos != data.find("\r\n") ) {
3723  }
3724  else if ( string::npos != data.find("\n") ) {
3726  }
3727  else if ( string::npos != data.find("\r") ) {
3729  }
// No line break at all: a completely filled buffer is rejected, anything
// shorter is kept as one single line.
3730  else if ( m_iTestDataSize == m_iTestBufferSize) {
3731  //most likely single truncated line
3732  return false;
3733  }
3734  else {
3735  //test buffer contains the entire file
3736  m_TestLines.push_back(data);
3737  }
3738 
// A completely filled buffer holding several lines: drop the final entry.
3739  if ( m_iTestDataSize == m_iTestBufferSize && m_TestLines.size() > 1 ) {
3740  //multiple lines, last likely truncated
3741  m_TestLines.pop_back();
3742  }
3743  return !m_TestLines.empty();
3744 }
3745 
3746 // ----------------------------------------------------------------------------
// Report whether the test buffer is predominantly printable text: returns true
// when the number of bytes in m_pTestBuffer for which isprint() holds is at
// least REQUIRED_ASCII_RATIO times m_iTestDataSize, false otherwise.
// NOTE(review): the line carrying the function name is missing from this
// listing (numbering jumps from 3747 to 3749); presumably this is the
// IsAsciiText() that the following function calls - confirm.
3747 bool
3749 {
3750  const double REQUIRED_ASCII_RATIO = 0.9;
3751 
3752  // first stab - are we text? comments are only valid if we are text
// `count` is advanced once per loop pass, so after the loop it equals the
// number of bytes examined.
3753  size_t count = 0;
3754  size_t count_print = 0;
3755  for (int i = 0; i < m_iTestDataSize; ++i, ++count) {
3756  if (isprint((unsigned char) m_pTestBuffer[i])) {
3757  ++count_print;
3758  }
3759  }
// Too few printable bytes: not text.
3760  if (count_print < (double)count * REQUIRED_ASCII_RATIO) {
3761  return false;
3762  }
3763  return true;
3764 }
3765 
3766 // ----------------------------------------------------------------------------
// Report whether the test data holds nothing but blank and comment lines.
// Returns false when IsAsciiText() fails or when any non-empty line starts
// with neither "#" nor "--"; returns true otherwise, including when there are
// no lines at all.
// NOTE(review): the line carrying the function name is missing from this
// listing (numbering jumps from 3767 to 3769) - confirm the name against the
// repository copy.
3767 bool
3769 {
3770  if (!IsAsciiText()) {
3771  return false;
3772  }
3773 
// Discard any cached split so that EnsureSplitLines() does the work again
// instead of returning its earlier result.
3774  m_bSplitDone = false;
3775  m_TestLines.clear();
3776  EnsureSplitLines();
3777 
// Skip empty lines and lines opening with "#" or "--"; the first line of any
// other kind settles the answer.
3778  ITERATE(list<string>, it, m_TestLines) {
3779  if(it->empty()) {
3780  continue;
3781  }
3782  if (NStr::StartsWith(*it, "#")) {
3783  continue;
3784  }
3785  if(NStr::StartsWith(*it, "--")) {
3786  continue;
3787  }
3788  return false;
3789  }
3790 
3791  return true;
3792 }
3793 
3794 // ----------------------------------------------------------------------------
3796  const string& line )
3797 {
3798  // This simple check can mistake Newwick, so Newwick is checked first
3799  // /[:alnum:]+:(g|c|r|p|m|mt|n)\.[:alnum:]+/ as in NC_000001.9:g.1234567C>T
3800  int State = 0;
3801  ITERATE(string, Iter, line) {
3802  char Char = *Iter;
3803  char Next = '\0';
3804  string::const_iterator NextI = Iter;
3805  ++NextI;
3806  if(NextI != line.end())
3807  Next = *NextI;
3808 
3809  if(State == 0) {
3810  if( isalnum(Char) )
3811  State++;
3812  } else if(State == 1) {
3813  if(Char == ':')
3814  State++;
3815  } else if(State == 2) {
3816  if (Char == 'g' ||
3817  Char == 'c' ||
3818  Char == 'r' ||
3819  Char == 'p' ||
3820  Char == 'n' ||
3821  Char == 'm' ) {
3822  State++;
3823  if (Char=='m' && Next == 't') {
3824  ++Iter;
3825  }
3826  } else {
3827  return false;
3828  }
3829  } else if(State == 3) {
3830  if(Char == '.')
3831  State++;
3832  else
3833  return false;
3834  } else if(State == 4) {
3835  if( isalnum(Char) )
3836  State++;
3837  }
3838  }
3839 
3840  return (State == 5);
3841 }
3842 
3843 
3844 
CCoreException –.
Definition: ncbiexpt.hpp:1476
bool IsEmpty(void) const
Check if any hints are set at all.
bool IsPreferred(TFormat fmt) const
Check if the format is listed as preferred.
bool IsDisabled(TFormat fmt) const
Check if the format is listed as disabled.
Class implements different ad-hoc unreliable file format identifications.
bool TestFormatLzo(EMode)
bool TestFormatJson(EMode)
bool x_IsTruncatedJsonKeyword(const string &testString) const
static bool IsLineGvf(const std::string &)
unsigned int m_iStatsCountBraces
bool TestFormatBinaryAsn(EMode)
CNcbiIstream & m_Stream
bool TestFormatDistanceMatrix(EMode)
bool EnsureTestBuffer()
bool x_IsTruncatedJsonNumber(const string &testString) const
bool TestFormatBZip2(EMode)
bool TestFormatGff3(EMode)
bool x_CheckStripJsonNumbers(string &testString) const
bool TestFormatTable(EMode)
bool TestFormatTaxplot(EMode)
unsigned int m_iStatsCountData
bool x_TestTableDelimiter(const string &delims)
bool TestFormatSra(EMode)
bool TestFormatFlatFileUniProt(EMode)
size_t x_FindNextJsonStringStop(const string &input, const size_t from_pos) const
static bool IsLineGff3(const std::string &)
bool TestFormatAgp(EMode)
bool x_CheckJsonStart(const string &testString) const
bool TestFormatBed15(EMode)
bool EnsureSplitLines()
bool x_IsBlankOrNumbers(const string &testString) const
static bool IsLineHgvs(const std::string &)
static bool IsLinePhrapId(const std::string &)
bool TestFormatFiveColFeatureTable(EMode)
static bool IsLineFlatFileSequence(const std::string &)
bool EnsureStats()
bool TestFormatGlimmer3(EMode)
static bool IsLabelNewick(const std::string &)
bool TestFormatBed(EMode)
bool TestFormatFlatFileSequence(EMode)
bool TestFormatFlatFileEna(EMode)
bool IsInputRepeatMaskerWithHeader()
bool TestFormat(EFormat, EMode)
bool TestFormatSnpMarkers(EMode)
bool x_LooksLikeCLUSTALConservedInfo(const string &line) const
static bool IsSupportedFormat(EFormat format)
bool TestFormatZip(EMode)
bool TestFormatNewick(EMode)
bool TestFormatCLUSTAL(void)
bool TestFormatWiggle(EMode)
EFormat
The formats are checked in the same order as declared here.
@ eBZip2
bzip2 compressed file
@ eSra
INSDC Sequence Read Archive file.
@ eFiveColFeatureTable
Five-column feature table.
@ eBinaryASN
Binary ASN.1.
@ eLzo
lzo compressed file
@ eVcf
VCF, CVcfReader.
@ eFormat_max
Max value of EFormat.
@ eGff2
GFF2, CGff2Reader, any GFF-like that doesn't fit the others.
@ eBed
UCSC BED file format, CBedReader.
@ eGtf
New GTF, CGtfReader.
@ eGZip
GNU zip compressed file.
@ eZip
zip compressed file
@ eSnpMarkers
SNP Marker flat file.
@ eGvf
GVF, CGvfReader.
@ eHgvs
HGVS, CHgvsParser.
@ eAgp
AGP format assembly, AgpRead.
@ eDistanceMatrix
Distance matrix file.
@ ePhrapAce
Phrap ACE assembly file.
@ eFlatFileSequence
GenBank/GenPept/DDBJ/EMBL flat-file sequence portion.
@ eGff3
GFF3, CGff3Reader.
@ eTable
Generic table.
@ eGtf_POISENED
Old and Dead GFF/GTF style annotations.
@ eTaxplot
Taxplot file.
@ eGlimmer3
Glimmer3 predictions.
@ eNewick
Newick file.
@ eFasta
FASTA format sequence record, CFastaReader.
@ eUnknown
unknown format
@ eGffAugustus
GFFish output of Augustus Gene Prediction.
@ eRmo
RepeatMasker Output.
@ eZstd
Zstandard (zstd) compressed data.
@ eUCSCRegion
UCSC Region file format.
@ eTextASN
Text ASN.1.
@ eAlignment
Text alignment.
@ ePsl
PSL alignment format.
@ eBed15
UCSC BED15 or microarray format.
@ eWiggle
UCSC WIGGLE file format.
@ eBam
Binary alignment/map file.
bool TestFormatZstd(EMode)
bool TestFormatAugustus(EMode)
bool TestFormatBam(EMode)
void x_FindJsonStringLimits(const string &testString, list< size_t > &limits) const
bool x_IsNumber(const string &testString) const
unsigned int m_iStatsCountAaChars
static bool IsLinePsl(const std::string &, bool ignoreFirstColumn)
bool TestFormatGff2(EMode)
bool TestFormatAlignment(EMode)
bool TestFormatFasta(EMode)
void x_StripJsonStrings(string &testString) const
streamsize m_iTestBufferSize
bool TestFormatGvf(EMode)
char * m_pTestBuffer
static bool x_TestInput(CNcbiIstream &input, EOnError onerror)
static bool IsLineGtf(const std::string &)
EFormat GuessFormat(EMode)
static bool IsLineRmo(const std::string &)
bool TestFormatPsl(EMode)
unsigned int m_iStatsCountAlNumChars
static bool IsSampleNewick(const std::string &)
void x_StripJsonKeywords(string &testString) const
std::list< std::string > m_TestLines
bool TestFormatPhrapAce(EMode)
streamsize m_iTestDataSize
bool TestFormatXml(EMode)
bool TestFormatTextAsn(EMode)
unsigned int m_iStatsCountDnaChars
size_t x_StripJsonPunctuation(string &testString) const
CFormatHints m_Hints
static bool IsAsnComment(const vector< string > &)
bool x_CheckStripJsonPunctuation(string &testString) const
static bool IsLineGff2(const std::string &)
static bool IsLineGlimmer3(const std::string &)
static const char * GetFormatName(EFormat format)
static bool IsLineAugustus(const std::string &)
@ eDefault
Return eUnknown.
@ eThrowOnBadSource
Throw an exception if the data source (stream, file) can't be read.
@ eST_Lax
Implement historic behavior, risking false positives.
@ eST_Strict
Require 100% encodability of printable non-digits.
@ eST_Default
Be relatively strict, but still allow for typos.
bool x_TryProcessCLUSTALSeqData(const string &line, string &id, size_t &seg_length) const
bool TestFormatVcf(EMode)
static EFormat Format(const string &path, EOnError onerror=eDefault)
Guess file format.
bool TestFormatHgvs(EMode)
bool TestFormatGtf(EMode)
bool TestFormatRepeatMasker(EMode)
static bool IsLineAgp(const std::string &)
bool TestFormatGZip(EMode)
static ESequenceType SequenceType(const char *str, unsigned length=0, ESTStrictness strictness=eST_Default)
Guess sequence type.
bool TestFormatFlatFileGenbank(EMode)
bool IsInputRepeatMaskerWithoutHeader()
bool x_TestFormat(EFormat format, EMode mode)
class CStaticArrayMap<> is an array adaptor that provides an STLish interface to statically-defined a...
Definition: static_map.hpp:105
CTempString implements a light-weight string on top of a storage buffer whose lifetime management is ...
Definition: tempstr.hpp:65
Concept for reading and writing characters.
void clear()
Definition: set.hpp:153
#define head
Definition: ct_nlmzip_i.h:138
static const TFormatNamesItem s_format_to_name_table[]
static bool s_IsTokenPosInt(const string &strToken)
static unsigned char symbol_type_table[256]
void SkipCommentAndBlank(CTempString &text)
bool EnaGetLineData(list< string >::iterator &lineIt, list< string >::iterator endIt, string &lineCode, string &lineData)
DEFINE_STATIC_ARRAY_MAP(TFormatNamesMap, sm_FormatNames, s_format_to_name_table)
bool GenbankGetKeywordLine(list< string >::iterator &lineIt, list< string >::iterator endIt, string &keyword, string &data)
static bool s_IsTokenDouble(const string &strToken)
static bool s_IsTokenInteger(const string &strToken)
constexpr size_t sm_CheckOrder_Size
ESymbolType
@ fProtein_Alphabet
Allows BZX*-, but not JOU.
@ fInvalid
@ fDigit
@ fDNA_Main_Alphabet
Just ACGTUN-.
@ fLineEnd
@ fDNA_Ambig_Alphabet
Anything else representable in ncbi4na.
@ fAlpha
@ fSpace
EConfidence
@ eMaybe
@ eNo
@ eYes
SStaticPair< CFormatGuess::EFormat, const char * > TFormatNamesItem
static bool s_LooksLikeNucSeqData(const string &line, size_t minLength=10)
static const CFormatGuess::EFormat sm_CheckOrder[]
bool UniProtGetLineData(list< string >::iterator &lineIt, list< string >::iterator endIt, string &lineCode, string &lineData)
CStaticPairArrayMap< CFormatGuess::EFormat, const char * > TFormatNamesMap
static void init_symbol_type_table(void)
#define true
Definition: bool.h:35
#define false
Definition: bool.h:36
static const char * str(char *buf, int n)
Definition: stats.c:84
static const char * column
Definition: stats.c:23
static const TDS_WORD limits[]
Definition: num_limits.h:85
static const column_t columns[]
Definition: utf8_2.c:22
char data[12]
Definition: iconv.c:80
constexpr size_t ArraySize(const Element(&)[Size])
Definition: ncbimisc.hpp:1532
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_ASSERT(expr, mess)
Definition: ncbidbg.hpp:130
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
char Char
Alias for char.
Definition: ncbitype.h:93
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
CNcbiIstream & NcbiGetline(CNcbiIstream &is, string &str, char delim, string::size_type *count=NULL)
Read from "is" to "str" up to the delimiter symbol "delim" (or EOF)
static void Stepback(CNcbiIstream &is, CT_CHAR_TYPE *buf, streamsize buf_size, void *del_ptr=0)
IO_PREFIX::istream CNcbiIstream
Portable alias for istream.
Definition: ncbistre.hpp:146
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
static CTempString TruncateSpaces_Unsafe(const CTempString str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string.
Definition: ncbistr.cpp:3182
static int StringToNonNegativeInt(const CTempString str, TStringToNumFlags flags=0)
Convert string to non-negative integer value.
Definition: ncbistr.cpp:457
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3452
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2984
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
static double StringToDouble(const CTempStringEx str, TStringToNumFlags flags=0)
Convert string to double.
Definition: ncbistr.cpp:1381
#define NPOS
Definition: ncbistr.hpp:133
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string (in-place)
Definition: ncbistr.cpp:3192
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5086
static SIZE_TYPE Find(const CTempString str, const CTempString pattern, ECase use_case=eCase, EDirection direction=eForwardSearch, SIZE_TYPE occurrence=0)
Find the pattern in the string.
Definition: ncbistr.cpp:2882
static SIZE_TYPE FindCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case sensitive search.
Definition: ncbistr.hpp:5492
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5414
size_t size_type
Definition: tempstr.hpp:70
static bool SplitInTwo(const CTempString str, const CTempString delim, string &str1, string &str2, TSplitFlags flags=0)
Split a string into two pieces using the specified delimiters.
Definition: ncbistr.cpp:3545
size_type length(void) const
Return the length of the represented array.
Definition: tempstr.hpp:320
static unsigned int StringToUInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to unsigned int.
Definition: ncbistr.cpp:642
static bool EqualNocase(const CTempString s1, SIZE_TYPE pos, SIZE_TYPE n, const char *s2)
Case-insensitive equality of a substring with another string.
Definition: ncbistr.hpp:5355
static enable_if< is_arithmetic< TNumeric >::value||is_convertible< TNumeric, Int8 >::value, string >::type NumericToString(TNumeric value, TNumToStringFlags flags=0, int base=10)
Convert numeric value to string.
Definition: ncbistr.hpp:673
size_type find(const CTempString match, size_type pos=0) const
Find the first instance of the entire matching string within the current string, beginning at an opti...
Definition: tempstr.hpp:655
static string & ReplaceInPlace(string &src, const string &search, const string &replace, SIZE_TYPE start_pos=0, SIZE_TYPE max_replace=0, SIZE_TYPE *num_replace=0)
Replace occurrences of a substring within a string.
Definition: ncbistr.cpp:3396
static string TruncateSpaces(const string &str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string.
Definition: ncbistr.cpp:3177
static const size_type npos
Definition: tempstr.hpp:72
@ fConvErr_NoThrow
Do not throw an exception on error.
Definition: ncbistr.hpp:285
@ fSplit_Truncate
Definition: ncbistr.hpp:2503
@ fSplit_Tokenize
All delimiters are merged and trimmed, to get non-empty tokens only.
Definition: ncbistr.hpp:2510
@ fSplit_MergeDelimiters
Merge adjacent delimiters.
Definition: ncbistr.hpp:2500
@ eTrunc_Begin
Truncate leading whitespace only.
Definition: ncbistr.hpp:2240
@ eNocase
Case insensitive compare.
Definition: ncbistr.hpp:1206
@ eCase
Case sensitive compare.
Definition: ncbistr.hpp:1205
static const char label[]
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
static int input()
int i
static void text(MDB_val *v)
Definition: mdb_dump.c:62
constexpr bool empty(list< Ts... >) noexcept
mdb_mode_t mode
Definition: lmdb++.h:38
const struct ncbi::grid::netcache::search::fields::SIZE size
int isalpha(Uchar c)
Definition: ncbictype.hpp:61
int isspace(Uchar c)
Definition: ncbictype.hpp:69
int isalnum(Uchar c)
Definition: ncbictype.hpp:62
int tolower(Uchar c)
Definition: ncbictype.hpp:72
int isdigit(Uchar c)
Definition: ncbictype.hpp:64
int isgraph(Uchar c)
Definition: ncbictype.hpp:65
int isprint(Uchar c)
Definition: ncbictype.hpp:67
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
NCBI C++ stream class wrappers for triggering between "new" and "old" C++ stream libraries.
static Format format
Definition: njn_ioutil.cpp:53
std::istream & in(std::istream &in_, double &x_)
double f(double x_, const double &y_)
Definition: njn_root.hpp:188
#define count
static size_t read_size(CNcbiIstream &stream, const char *name)
Definition: reader_snp.cpp:404
Template structure SStaticPair is a simplified replacement of STL pair<>. Main reason of introducing this...
Definition: static_set.hpp:60
Definition: type.c:6
Modified on Wed Sep 04 15:03:22 2024 by modify_doxy.py rev. 669887