NCBI C++ ToolKit
AgpFastaComparator.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: AgpFastaComparator.cpp 100571 2023-08-11 13:06:42Z gotvyans $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Mike DiCuccio, Michael Kornbluh
27  *
28  * File Description:
29  * Makes sure an AGP file builds the same sequence found in a FASTA
30  * file.
31  *
32  */
33 
34 #include <ncbi_pch.hpp>
35 
36 #include "AgpFastaComparator.hpp"
37 
38 #include <algorithm>
39 #include <sstream>
40 
41 #include <corelib/ncbiapp.hpp>
42 #include <corelib/ncbiargs.hpp>
43 #include <corelib/ncbienv.hpp>
44 #include <corelib/ncbiexec.hpp>
45 
46 #include <util/checksum.hpp>
47 
49 #include <objmgr/bioseq_handle.hpp>
50 #include <objmgr/scope.hpp>
54 #include <objects/seq/Seq_ext.hpp>
58 #include <objmgr/bioseq_ci.hpp>
59 #include <objmgr/seq_vector.hpp>
60 #include <objmgr/util/sequence.hpp>
63 #include <objtools/lds2/lds2.hpp>
68 
69 #ifdef COMP_LOG
70 # error COMP_LOG was already defined
71 #endif
72 
73 // convenience macro for writing to the logfile (if it's open)
74 #define COMP_LOG(msg) \
75  do { \
76  if( x_IsLogFileOpen() ) { \
77  *m_pLoadLogFile << msg << endl; \
78  } \
79  } while(false)
80 
81 
84 
85 namespace {
86 
87  // pScope can be NULL
88  CRef<CSeq_id> s_CustomGetSeqIdFromStr( const string & str, CScope * pScope )
89  {
90  // start with parent class's default parsing
92 
93  // optimize for the (hopefully common) fast case of local IDs
94  if( seq_id->IsLocal() ) {
95  return seq_id;
96  }
97 
98  // build what this would look like as a local ID
100 
101  // reject prot-only accessions, or accessions that aren't found
102  CSeq_id::EAccessionInfo fAccnInfo = seq_id->IdentifyAccession();
103  const bool bAccnIsProtOnly = (
104  (fAccnInfo & CSeq_id::fAcc_prot) &&
105  ! (fAccnInfo & CSeq_id::fAcc_nuc));
106  const bool bSeqIdIsFound = ( pScope ? pScope->GetBioseqHandle(*seq_id) : false );
107  if( bAccnIsProtOnly || ! bSeqIdIsFound )
108  {
109  // fall back on local ID
110  return pLocalSeqId;
111  }
112 
113  const bool bLocalSeqIdIsfound = (
114  pScope ? pScope->GetBioseqHandle(*pLocalSeqId) : false );
115  if( bLocalSeqIdIsfound ) {
116  // print a warning that a local ID was overridden
117  cerr << "Warning: '" << str << "' was used as an accession, "
118  "so the local component was ignored." << endl;
119  }
120 
121  // everything looks fine, so return it
122  return seq_id;
123  }
124 
125  // slight customization to CAgpToSeqEntry:
126  // if an ID can be found in GenBank, use that
127  // and otherwise fall back on local ID
128  class CCustomAgpToSeqEntry : public CAgpToSeqEntry {
129  public:
130 
131  CCustomAgpToSeqEntry(CScope * pScope)
132  : m_pScope(pScope)
133  {
134  }
135 
136  protected:
138  {
139  return s_CustomGetSeqIdFromStr(str, m_pScope.GetPointer());
140  }
141 
142  private:
143  CRef<CScope> m_pScope;
144  };
145 }
146 
147 /////////////////////////////////////////////////////////////////////////////
148 // CAgpFastaComparator::
149 
151  : m_bSuccess(true)
152 {
153 }
154 
155 /////////////////////////////////////////////////////////////////////////////
156 // Run test (printout arguments obtained from command-line)
157 
158 
160  const std::list<std::string> & files,
161  const std::string & loadlog,
162  const std::string & agp_as_fasta_file,
163  TDiffsToHide diffsToHide,
164  int diffs_to_find // how many differences to show
165  )
166 {
167  LOG_POST(Error << "" ); // newline
168  LOG_POST(Error << "Starting AGP/Fasta Compare" );
169  LOG_POST(Error << "" ); // newline
170 
171  // figure out which files are AGP and which are FASTA
172  list<string> compAndObjFiles;
173  list<string> agpFiles;
174  ITERATE( std::list<std::string>, file_iter, files ) {
175  const string & file = *file_iter;
176  switch( x_GuessFileType(file) ) {
177  case eFileType_FASTA:
178  case eFileType_ASN1:
179  case eFileType_Unknown: // unknown might be binary ASN.1 (we might want to fix that)
180  compAndObjFiles.push_back(file);
181  break;
182  case eFileType_AGP:
183  agpFiles.push_back(file);
184  break;
185  }
186  }
187 
188  if( ! loadlog.empty() ) {
189  m_pLoadLogFile.reset(
190  new CNcbiOfstream(loadlog.c_str() ) );
191  }
192 
193  if( ! agp_as_fasta_file.empty() ) {
194  m_pAgpAsFastaFile.reset(
195  new CNcbiOfstream(agp_as_fasta_file.c_str()));
196  }
197 
199 
200  // quickly scan the AGP files to determine the component
201  // Seq-ids
202  TSeqIdSet compSeqIds;
203  TSeqIdSet objSeqIds;
204  if( ! x_GetCompAndObjSeqIds( compSeqIds, objSeqIds, agpFiles ) ) {
205  // error message should've been printed inside x_GetCompAndObjSeqIds
207  }
208  if( x_IsLogFileOpen() ) {
209  ITERATE(TSeqIdSet, seq_id_it, compSeqIds) {
210  COMP_LOG("Component seq-id from AGP file(s): "
211  << seq_id_it->AsString());
212  }
213  ITERATE(TSeqIdSet, seq_id_it, objSeqIds) {
214  COMP_LOG("Object seq-id from AGP file(s): "
215  << seq_id_it->AsString());
216  }
217  }
218 
219  // load local component FASTA sequences and Genbank into
220  // local scope for lookups using local data storage
221  unique_ptr<CTmpFile> ldsdb_file;
222  CRef<CLDS2_Manager> lds_mgr;
223  ldsdb_file.reset( new CTmpFile ); // file deleted on object destruction
224  lds_mgr.Reset(new CLDS2_Manager( ldsdb_file->GetFileName() ));
225 
226  // adjust FASTA flags
227  // (workaround for CXX-3453 which caused WGS-246 )
228  CFastaReader::TFlags fasta_flags = lds_mgr->GetFastaFlags();
229  fasta_flags &= ~CFastaReader::fParseGaps;
230  // component ids are always interpreted as local
231  fasta_flags &= ~CFastaReader::fParseRawID;
232  fasta_flags |= CFastaReader::fAddMods;
233  fasta_flags |= CFastaReader::fDisableParseRange; //rw-1155: match what the AGP parser does.
234  lds_mgr->SetFastaFlags(fasta_flags);
235 
236  list<string> objfiles;
237  ITERATE( list<string>, file_iter, compAndObjFiles ) {
238  // check if file is a FASTA component file
239 
240  if( eFileType_FASTA != x_GuessFileType( *file_iter ) ) {
241  // we support text ASN.1 object files
242  COMP_LOG("Object file: " << *file_iter);
243  objfiles.push_back(*file_iter);
244  continue;
245  }
246 
247  ifstream file_strm( file_iter->c_str() );
248  string line;
249  // look at the ids in the file to try to determine whether it is a
250  // component file or an object file
251  while( NcbiGetline(file_strm, line, "\r\n") ) {
252  // extract accession
253  // Get first word, trim final '|' (if any).
254  if( ! NStr::StartsWith(line, ">") ) {
255  continue;
256  }
257  SIZE_TYPE after_seq_id_pos = line.find_first_of(" \t");
258  if( after_seq_id_pos == string::npos ) {
259  after_seq_id_pos = line.length();
260  }
261  string acc_long = line.substr(1, (after_seq_id_pos - 1));
262  CRef<CSeq_id> seq_id = s_CustomGetSeqIdFromStr( acc_long, NULL );
263  CSeq_id_Handle acc_h = CSeq_id_Handle::GetHandle(*seq_id);
264 
265  COMP_LOG("Sample accession from " << *file_iter
266  << ": " << acc_h.AsString());
267  if( compSeqIds.find(acc_h) != compSeqIds.end() ) {
268  // component files go into the component object
269  // temporary database
270  COMP_LOG("Component file: " << *file_iter);
271  lds_mgr->AddDataFile( *file_iter );
272  break;
273  } else if( objSeqIds.find(acc_h) != objSeqIds.end() ) {
274  // object files will be remembered for later processing
275  COMP_LOG("Object file: " << *file_iter);
276  objfiles.push_back(*file_iter);
277  break;
278  }
279  }
280 
281  // no seq-id in the file seems relevant
282  if( ! file_strm ) {
283  // none of the seq-ids seem to be used anywhere
284  cerr << "Warning: This file seems to be unused: '"
285  << *file_iter << "'" << endl;
286  }
287  }
288  lds_mgr->UpdateData();
290  *om, ldsdb_file->GetFileName(), ( fasta_flags & ~CFastaReader::fNoSeqData ),
293  *om, 0,
295 
296  // calculate checksum of the AGP sequences and the FASTA sequences
297 
298  // temporary dir to hold outputs so we can diff.
299  // this is only used if we're showing diffs
300  unique_ptr<CTmpSeqVecStorage> temp_dir;
301  if( diffs_to_find > 0 ) {
302  temp_dir.reset( new CTmpSeqVecStorage );
303  }
304 
305  TUniqueSeqs agp_ids;
306  // process every AGP file
307  if( agpFiles.empty() ) {
308  cerr << "error: could not find any agp files" << endl;
309  return eResult_Failure;
310  }
311  x_ProcessAgps( agpFiles, agp_ids, temp_dir.get() );
312 
313  TUniqueSeqs fasta_ids;
314  // process every objfile
315  if( objfiles.empty() ) {
316  cerr << "error: could not find any obj files" << endl;
317  return eResult_Failure;
318  }
319  x_ProcessObjects( objfiles, fasta_ids, temp_dir.get() );
320 
321  // check for duplicate sequences
322  x_CheckForDups( fasta_ids, "object file(s)" );
323  x_CheckForDups( agp_ids, "AGP file(s)" );
324 
325  // will hold ones that are only in FASTA or only in AGP.
326  // Of course, if one appears in both, we should print it in a more
327  // user-friendly way
328  TSeqIdSet vSeqIdFASTAOnly;
329  TSeqIdSet vSeqIdAGPOnly;
330 
331  TUniqueSeqs::const_iterator iter1 = fasta_ids.begin();
332  TUniqueSeqs::const_iterator iter1_end = fasta_ids.end();
333 
334  TUniqueSeqs::const_iterator iter2 = agp_ids.begin();
335  TUniqueSeqs::const_iterator iter2_end = agp_ids.end();
336 
337  // make sure set of sequences in obj FASTA match AGP's objects.
338  // Print discrepancies.
339  LOG_POST(Error << "Reporting differences...");
340  for ( ; iter1 != iter1_end && iter2 != iter2_end; ) {
341  if (iter1->first < iter2->first) {
342  copy( iter1->second.begin(), iter1->second.end(),
343  inserter(vSeqIdFASTAOnly, vSeqIdFASTAOnly.begin() ) );
344  ++iter1;
345  }
346  else if (iter2->first < iter1->first) {
347  copy( iter2->second.begin(), iter2->second.end(),
348  inserter(vSeqIdAGPOnly, vSeqIdAGPOnly.begin() ) );
349  ++iter2;
350  }
351  else if( iter1->second != iter2->second ) {
352  // Find the ones in FASTA but not AGP
353  set_difference( iter1->second.begin(), iter1->second.end(),
354  iter2->second.begin(), iter2->second.end(),
355  inserter(vSeqIdFASTAOnly,
356  vSeqIdFASTAOnly.begin() ) );
357 
358  // Find the ones in AGP but not FASTA
359  set_difference( iter2->second.begin(), iter2->second.end(),
360  iter1->second.begin(), iter1->second.end(),
361  inserter(vSeqIdAGPOnly,
362  vSeqIdAGPOnly.begin() ) );
363 
364  ++iter1;
365  ++iter2;
366  }
367  else {
368  ++iter1;
369  ++iter2;
370  }
371  }
372 
373  for ( ; iter1 != iter1_end; ++iter1) {
374  copy( iter1->second.begin(), iter1->second.end(),
375  inserter(vSeqIdFASTAOnly, vSeqIdFASTAOnly.begin() ) );
376  }
377 
378  for ( ; iter2 != iter2_end; ++iter2) {
379  copy( iter2->second.begin(), iter2->second.end(),
380  inserter(vSeqIdAGPOnly, vSeqIdAGPOnly.begin() ) );
381  }
382 
383  // look at vSeqIdFASTAOnly and vSeqIdAGPOnly and
384  // print in user-friendly way
385  // Also, fill in SeqIds that are in both
386  TSeqIdSet seqIdIntersection;
387  x_OutputDifferingSeqIds( vSeqIdFASTAOnly, vSeqIdAGPOnly, diffsToHide, seqIdIntersection );
388 
389  const bool bThereWereDifferences = (
390  ( ! vSeqIdFASTAOnly.empty() &&
391  ! (diffsToHide & fDiffsToHide_ObjfileOnly) ) ||
392  ( ! vSeqIdAGPOnly.empty() &&
393  ! (diffsToHide & fDiffsToHide_AGPOnly) ) );
394  if( ! bThereWereDifferences ) {
395  LOG_POST(Error << "No differences found");
396  }
397  if( bThereWereDifferences ) {
398  m_bSuccess = false;
399  }
400 
401  if( bThereWereDifferences && diffs_to_find > 0 &&
402  ! seqIdIntersection.empty() )
403  {
404  x_OutputSeqDifferences( diffs_to_find,
405  seqIdIntersection,
406  *temp_dir );
407  }
408 
409 
411 }
412 
414  m_dir( x_GetTmpDir() )
415 {
416  if( m_dir.Exists() ) {
417  throw std::runtime_error("Temp dir already exists: " + m_dir.GetPath() );
418  }
419 
420  if( ! m_dir.Create() ) {
421  throw std::runtime_error("Could not create temp dir: " + m_dir.GetPath() );
422  }
423 }
424 
426 {
427  if( ! m_dir.Remove() ) {
428  cerr << "Warning: could not delete temporary dir "
429  << m_dir.GetPath() << endl;
430  }
431 }
432 
434 {
435  for (CBioseq_CI bioseq_it(seh, CSeq_inst::eMol_na); bioseq_it; ++bioseq_it)
436  {
437  CSeq_id_Handle idh = sequence::GetId(*bioseq_it,
439  ofstream output_stream( GetFileName(type, idh).c_str() );
440 
441  // write raw sequence, but have a newline every 60 residues.
442  // newlines are important for the "diff" command
443  CSeqVector vec(*bioseq_it, CBioseq_Handle::eCoding_Iupac);
444  CSeqVector::const_iterator iter = vec.begin();
445  int bytes_copied = 0;
446  for( ; iter != vec.end(); ++iter, ++bytes_copied ) {
447  if( bytes_copied > 0 && (bytes_copied % 60) == 0 ) {
448  // use '\n' instead of endl to avoid flushing
449  output_stream << '\n';
450  }
451  output_stream << *iter;
452  }
453  output_stream << endl;
454  }
455 }
456 
457 string
459 {
460  std::stringstream file_name_strm;
461  file_name_strm << m_dir.GetPath() << CDirEntry::GetPathSeparator();
462 
463  switch( type ) {
464  case eType_AGP:
465  file_name_strm << "agp";
466  break;
467  case eType_Obj:
468  file_name_strm << "obj";
469  break;
470  default:
471  _TROUBLE;
472  // in case _TROUBLE falls through, do the best we can:
473  file_name_strm << "UNKNOWN";
474  break;
475  }
476 
477  file_name_strm << '.';
478 
479  // get cleaned version of seqid without any
480  // illegal characters
481  {
482  const string initial_seq_id = idh.AsString();
483  std::stringstream final_seq_id;
484  ITERATE(string, ch_iter, initial_seq_id) {
485  const unsigned char ch = *ch_iter;
486  if( isalnum(ch) ) {
487  final_seq_id << ch;
488  } else {
489  final_seq_id << '_' << setfill('0') << setw(3) << ch;
490  }
491  }
492  file_name_strm << final_seq_id.str();
493  }
494 
495  return file_name_strm.str();
496 }
497 
499 {
500  std::stringstream dir_strm;
501  dir_strm << CDir::GetTmpDir() << '/'
502  << "AgpFastaComparator." << CCurrentProcess::GetPid()
503  << "."
504  << CTime(CTime::eCurrent).AsString("YMDTh:m:s.l");
505  return dir_strm.str();
506 }
507 
509  TUniqueSeqs& seqs,
510  int * in_out_pUniqueBioseqsLoaded,
511  int * in_out_pBioseqsSkipped,
512  CNcbiOfstream *pDataOutFile )
513 {
514  _ASSERT(
515  in_out_pUniqueBioseqsLoaded != NULL &&
516  in_out_pBioseqsSkipped != NULL );
517 
518  // skipped is total minus loaded.
519  int total = 0;
520 
521  for (CBioseq_CI bioseq_it(seh, CSeq_inst::eMol_na); bioseq_it; ++bioseq_it) {
522  ++total;
523  CSeqVector vec(*bioseq_it, CBioseq_Handle::eCoding_Iupac);
524  CSeq_id_Handle idh = sequence::GetId(*bioseq_it,
526  string data;
527  if( ! vec.CanGetRange(0, bioseq_it->GetBioseqLength()) ) {
528  LOG_POST(Error << " Skipping one: could not load due to error "
529  "in AGP file "
530  "(length issue or does not include range [1, "
531  << bioseq_it->GetBioseqLength() << "] or "
532  "doesn't exist) for " << idh
533  << " (though issue could be due to failure to resolve "
534  "one of the contigs. "
535  "Are all necessary components in GenBank or in files "
536  "specified on the command-line?)." );
537 
538  // try to figure out where the length error is
539  x_PrintDetailsOfLengthIssue( *bioseq_it );
540  m_bSuccess = false;
541  continue;
542  }
543  try {
544  vec.GetSeqData(0, bioseq_it->GetBioseqLength(), data);
545  } catch(const CSeqVectorException& ex) {
546  LOG_POST(Error << " Skipping one: could not load due to error, "
547  "probably in AGP file, possibly a length issue, for "
548  << idh << Endl() << Endl()
549  << "Raw technical information about error: " << ex.what() );
550  m_bSuccess = false;
551  continue;
552  }
553 
554  if( pDataOutFile != NULL ) {
555  x_WriteDataAsFasta( *pDataOutFile, idh, data );
556  }
557 
559  cks.AddLine(data);
560 
561  string md5;
562  cks.GetMD5Digest(md5);
563 
564  TKey key(md5, bioseq_it->GetBioseqLength());
565  pair<TSeqIdSet::iterator, bool> insert_result =
566  seqs[key].insert(idh);
567  if( ! insert_result.second ) {
568  LOG_POST(Error << " Error: skipping sequence with same name and values: " << idh);
569  m_bSuccess = false;
570  continue;
571  }
572 
573  if( x_IsLogFileOpen() ) {
574  CNcbiOstrstream os;
575  ITERATE (string, i, key.first) {
576  os << setw(2) << setfill('0') << hex << (int)((unsigned char)*i);
577  }
578  COMP_LOG(" " << idh << ": "
579  << string(CNcbiOstrstreamToString(os))
580  << " / " << key.second);
581  }
582 
583  ++*in_out_pUniqueBioseqsLoaded;
584  }
585 
586  *in_out_pBioseqsSkipped = ( total - *in_out_pUniqueBioseqsLoaded);
587 }
588 
590  CNcbiOfstream & dataOutFile,
591  const CSeq_id_Handle & idh,
592  const std::string & data )
593 {
594  const static SIZE_TYPE kFastaWidth = 60;
595 
596  dataOutFile << '>' << idh << endl;
597 
598  const SIZE_TYPE data_len = data.length();
599  SIZE_TYPE next_idx = 0;
600  for( ; next_idx < data_len ; next_idx += kFastaWidth ) {
601  SIZE_TYPE chars_to_copy = min( kFastaWidth, (data_len - next_idx) );
602  dataOutFile.write( data.c_str() + next_idx, chars_to_copy );
603  dataOutFile << '\n';
604  }
605 }
606 
607 void CAgpFastaComparator::x_PrintDetailsOfLengthIssue(
608  CBioseq_Handle bioseq_h )
609 {
610  const static string kBugInAgpFastaCompare(
611  " This is probably a bug in agp_fasta_compare: could not get "
612  "information on the bioseq with an error" );
613 
614  const CDelta_ext::Tdata *p_delta_data = NULL;
615  try {
616  CScope &scope = bioseq_h.GetScope();
617 
618  p_delta_data = &bioseq_h.GetCompleteBioseq()->GetInst().GetExt().GetDelta().Get();
619 
620  if( p_delta_data == NULL ) {
621  LOG_POST(Error << kBugInAgpFastaCompare);
622  return;
623  }
624 
625 
626  // put it in a reference to make it easier to work with
627  const CDelta_ext::Tdata &delta_data = *p_delta_data;
628 
629  ITERATE( CDelta_ext::Tdata, delta_iter, delta_data ) {
630  if( (*delta_iter)->IsLiteral() ) {
631  continue;
632  }
633 
634  const CSeq_interval & seq_int = (*delta_iter)->GetLoc().GetInt();
635 
636  const TSeqPos highest_pnt =
637  max( seq_int.GetFrom(), seq_int.GetTo() );
638  CSeq_id_Handle seq_id_h =
639  CSeq_id_Handle::GetHandle(seq_int.GetId());
640 
641  CBioseq_Handle inner_bioseq_h;
642  try {
643  inner_bioseq_h = scope.GetBioseqHandle(seq_id_h);
644  if( ! inner_bioseq_h ) {
645  LOG_POST(Error << " Couldn't find bioseq for "
646  << seq_id_h
647  << ". Maybe you need to specify component file(s)." );
648  } else if( ! inner_bioseq_h.IsSetInst_Length() ) {
649  LOG_POST(Error << " Could not get length of bioseq for "
650  << seq_id_h );
651  } else {
652  const TSeqPos bioseq_len = inner_bioseq_h.GetInst_Length();
653  if( highest_pnt >= bioseq_len ) {
654  LOG_POST(Error << " For "
655  << seq_id_h
656  << " length is " << bioseq_len
657  << " but user tries to access the point "
658  << (highest_pnt+1) ); // "+1" because user sees 1-based
659  }
660  }
661  } catch(...) {
662  LOG_POST(Error << " Could not find bioseq for "
663  << seq_id_h
664  << ". Maybe you need to specify component file(s)." );
665  }
666  }
667  } catch(std::exception & ex) {
668  CNcbiOstrstream bioseq_strm;
669  bioseq_strm << MSerial_AsnText << *bioseq_h.GetCompleteBioseq();
670  LOG_POST(Error << kBugInAgpFastaCompare << ": "
671  << Endl() << Endl()
672  << "Raw technical information about error: " << Endl()
673  << ex.what()
674  << Endl()
675  << " Bioseq ASN.1: " << (string)CNcbiOstrstreamToString(bioseq_strm) );
676  return;
677  } catch(...) {
678  CNcbiOstrstream bioseq_strm;
679  bioseq_strm << MSerial_AsnText << *bioseq_h.GetCompleteBioseq();
680  LOG_POST(Error << kBugInAgpFastaCompare << ": "
681  << "(unknown error)"
682  << " Bioseq ASN.1: " << (string)CNcbiOstrstreamToString(bioseq_strm) );
683  return;
684  }
685 }
686 
687 bool CAgpFastaComparator::x_GetCompAndObjSeqIds(
688  TSeqIdSet & out_compSeqIds,
689  TSeqIdSet & out_objSeqIds,
690  const std::list<std::string> & agpFiles )
691 {
692  const static CTempString kDelim("\t");
693 
694  const static CTempString kNotAGPErr(
695  "This file is not in a recognized AGP format: ");
696 
697  // what is held in some of the AGP columns
698  const static int kObjSeqIdCol = 0;
699  const static int kCompTypeCol = 4;
700  const static int kCompSeqIdCol = 5;
701  const static int kMaxColUsed = kCompSeqIdCol;
702 
703  vector<CTempString> vecLineTokens;
704 
705  // for speed, we do the parsing ourselves with only very minimal
706  // error-checking
707  ITERATE( std::list<std::string>, file_iter, agpFiles ) {
708  ifstream file_strm(file_iter->c_str());
709  string line;
710  while( NcbiGetline(file_strm, line, "\r\n") ) {
711  // skip comment lines
712  if( line.empty() || line[0] == '#' ) {
713  continue;
714  }
715 
716  vecLineTokens.clear();
717  NStr::Split(line, kDelim, vecLineTokens, 0);
718 
719  // are there enough columns for an AGP file?
720  if( vecLineTokens.size() <= kMaxColUsed ){
721  cerr << kNotAGPErr << *file_iter << endl;
722  return false;
723  }
724 
725  // skip gaps
726  CTempString sComponentType = vecLineTokens[kCompTypeCol];
727  if( sComponentType.length() != 1 ) {
728  cerr << kNotAGPErr << *file_iter << endl;
729  return false;
730  }
731  const char chCompType = toupper(sComponentType[0]);
732  if( chCompType == 'N' || chCompType == 'U' )
733  {
734  // skip gaps
735  continue;
736  }
737 
738  // get object Seq-id
739  CRef<CSeq_id> objSeqId = s_CustomGetSeqIdFromStr(
740  vecLineTokens[kObjSeqIdCol], NULL);
741  out_objSeqIds.insert(
742  CSeq_id_Handle::GetHandle(*objSeqId));
743 
744  // get component Seq-id
745  CRef<CSeq_id> comp_seq_id =
746  s_CustomGetSeqIdFromStr(
747  vecLineTokens[kCompSeqIdCol], NULL);
748  out_compSeqIds.insert(
749  CSeq_id_Handle::GetHandle(*comp_seq_id) );
750  }
751  }
752 
753  return true;
754 }
755 
756 void CAgpFastaComparator::x_ProcessObjects(
757  const list<string> & filenames,
758  TUniqueSeqs& fasta_ids,
759  CTmpSeqVecStorage *temp_dir )
760 {
761  int iNumLoaded = 0;
762  int iNumSkipped = 0;
763 
764  LOG_POST(Error << "Processing object file(s)...");
765  COMP_LOG("Processing object file(s)...");
766  ITERATE( list<string>, file_iter, filenames ) {
767  const string &filename = *file_iter;
768  try {
769  CFormatGuess guesser( filename );
770  const CFormatGuess::EFormat format =
771  guesser.GuessFormat();
772 
773  if( format == CFormatGuess::eFasta ) {
774  CNcbiIfstream file_istrm( filename.c_str(), ios::binary );
775  CFastaReader reader(file_istrm, CFastaReader::fAddMods);
776  while (file_istrm) {
777  CRef<CSeq_entry> entry = reader.ReadOneSeq();
778 
779  CRef<CScope> scope(new CScope(*CObjectManager::GetInstance()));
780  CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(*entry);
781  x_Process(seh, fasta_ids, &iNumLoaded, &iNumSkipped, NULL );
782  if( temp_dir ) {
783  temp_dir->WriteData( CTmpSeqVecStorage::eType_Obj, seh );
784  }
785  }
786  } else if( format == CFormatGuess::eBinaryASN ||
787  format == CFormatGuess::eTextASN )
788  {
789  // see if it's a submit
790  CRef<CSeq_submit> submit( new CSeq_submit );
791  {
792  CNcbiIfstream file_istrm( filename.c_str(), ios::binary );
793  x_SetBinaryVsText( file_istrm, format );
794  file_istrm >> *submit;
795  }
796 
797  if( submit ) {
798 
799  if( ! submit->IsEntrys() ) {
800  LOG_POST(Error << "Seq-submits must have 'entrys'.");
801  m_bSuccess = false;
802  return;
803  }
804 
805  ITERATE( CSeq_submit::C_Data::TEntrys, entry_iter,
806  submit->GetData().GetEntrys() )
807  {
808  const CSeq_entry &entry = **entry_iter;
809 
810  CRef<CScope> scope(new CScope(*CObjectManager::GetInstance()));
811  CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(entry);
812  x_Process(seh, fasta_ids, &iNumLoaded, &iNumSkipped, NULL );
813  if( temp_dir ) {
814  temp_dir->WriteData( CTmpSeqVecStorage::eType_Obj, seh );
815  }
816  }
817  }
818  else
819  {
820  CRef<CSeq_entry> entry( new CSeq_entry );
821 
822  CNcbiIfstream file_istrm( filename.c_str(), ios::binary );
823  x_SetBinaryVsText( file_istrm, format );
824  file_istrm >> *entry;
825 
826  CRef<CScope> scope(new CScope(*CObjectManager::GetInstance()));
827  CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(*entry);
828  x_Process(seh, fasta_ids, &iNumLoaded, &iNumSkipped, NULL );
829  if( temp_dir ) {
830  temp_dir->WriteData( CTmpSeqVecStorage::eType_Obj, seh );
831  }
832  }
833  } else {
834  LOG_POST(Error << "Could not determine format of " << filename
835  << ", best guess is: " << CFormatGuess::GetFormatName(format) );
836  m_bSuccess = false;
837  return;
838  }
839  }
840  catch(CObjReaderParseException & ex ) {
841  if( ex.GetErrCode() == CObjReaderParseException::eEOF ) {
842  // end of file; no problem
843  } else {
844  LOG_POST(Error << "Error reading object file: " << ex.what() );
845  m_bSuccess = false;
846  return;
847  }
848  }
849  catch (CException& ex ) {
850  LOG_POST(Error << "Error reading object file: " << ex.what() );
851  m_bSuccess = false;
852  return;
853  }
854  }
855 
856  LOG_POST(Error << "Loaded " << iNumLoaded << " object file sequence(s).");
857  if( iNumSkipped > 0 ) {
858  LOG_POST(Error << " Skipped " << iNumSkipped << " FASTA sequence(s).");
859  }
860 }
861 
862 
863 void CAgpFastaComparator::x_ProcessAgps(const list<string> & filenames,
864  TUniqueSeqs& agp_ids,
865  CTmpSeqVecStorage *temp_dir )
866 {
867  int iNumLoaded = 0;
868  int iNumSkipped = 0;
869 
870  LOG_POST(Error << "Processing AGP...");
871  COMP_LOG("Processing AGP...");
872 
873  CRef<CScope> pAgpToSeqEntryScope(new CScope(*CObjectManager::GetInstance()));
874  pAgpToSeqEntryScope->AddDefaults();
875 
876  ITERATE( list<string>, file_iter, filenames ) {
877  const string &filename = *file_iter;
878  CNcbiIfstream istr( filename.c_str() );
879  while (istr) {
880  CCustomAgpToSeqEntry agp_reader(pAgpToSeqEntryScope.GetPointer());
881  int err_code = agp_reader.ReadStream( istr ); // loads entries
882  if( err_code != 0 ) {
883  LOG_POST(Error << "Error occurred reading AGP file: "
884  << agp_reader.GetErrorMessage() );
885  m_bSuccess = false;
886  return;
887  }
888  ITERATE (vector< CRef<CSeq_entry> >, it, agp_reader.GetResult() ) {
889  CRef<CSeq_entry> entry = *it;
890 
891  CRef<CScope> scope(new CScope(*CObjectManager::GetInstance()));
892  CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(*entry);
893  scope->AddDefaults();
894 
895  x_Process(seh, agp_ids, &iNumLoaded, &iNumSkipped, m_pAgpAsFastaFile.get() );
896  if( temp_dir ) {
897  temp_dir->WriteData( CTmpSeqVecStorage::eType_AGP, seh );
898  }
899  }
900  }
901  }
902  LOG_POST(Error << "Loaded " << iNumLoaded << " AGP sequence(s).");
903  if( iNumSkipped > 0 ) {
904  LOG_POST(Error << " Skipped " << iNumSkipped << " AGP sequence(s).");
905  }
906 }
907 
908 void CAgpFastaComparator::x_OutputDifferingSeqIds(
909  const TSeqIdSet & vSeqIdFASTAOnly,
910  const TSeqIdSet & vSeqIdAGPOnly,
911  TDiffsToHide diffs_to_hide,
912  TSeqIdSet & out_seqIdIntersection )
913 {
914  // find the ones in both
915  set_intersection(
916  vSeqIdFASTAOnly.begin(), vSeqIdFASTAOnly.end(),
917  vSeqIdAGPOnly.begin(), vSeqIdAGPOnly.end(),
918  inserter(out_seqIdIntersection, out_seqIdIntersection.begin()) );
919  if( ! out_seqIdIntersection.empty() ) {
920  LOG_POST(Error << " These " << out_seqIdIntersection.size()
921  << " differ between object file and AGP:");
922  ITERATE( TSeqIdSet, id_iter, out_seqIdIntersection ) {
923  LOG_POST(Error << " " << *id_iter);
924  }
925  }
926 
927  // find the ones in FASTA only
928  TSeqIdSet vSeqIdTempSet;
929  set_difference(
930  vSeqIdFASTAOnly.begin(), vSeqIdFASTAOnly.end(),
931  vSeqIdAGPOnly.begin(), vSeqIdAGPOnly.end(),
932  inserter(vSeqIdTempSet, vSeqIdTempSet.begin()) );
933  if( ! vSeqIdTempSet.empty() && ! (diffs_to_hide & fDiffsToHide_ObjfileOnly) ) {
934  LOG_POST(Error << " These " << vSeqIdTempSet.size()
935  << " are in Object file only: " << "\n"
936  << " (Check above: were some AGP sequences skipped due "
937  << "to errors?)");
938  ITERATE( TSeqIdSet, id_iter, vSeqIdTempSet ) {
939  LOG_POST(Error << " " << *id_iter);
940  }
941  }
942 
943  // find the ones in AGP only
944  vSeqIdTempSet.clear();
945  set_difference(
946  vSeqIdAGPOnly.begin(), vSeqIdAGPOnly.end(),
947  vSeqIdFASTAOnly.begin(), vSeqIdFASTAOnly.end(),
948  inserter(vSeqIdTempSet, vSeqIdTempSet.begin()) );
949  if( ! vSeqIdTempSet.empty() && ! (diffs_to_hide & fDiffsToHide_AGPOnly) ) {
950  LOG_POST(Error << " These " << vSeqIdTempSet.size()
951  << " are in AGP only: " << "\n"
952  << " (Check above: were some FASTA sequences skipped due "
953  << "to errors?)");
954  ITERATE( TSeqIdSet, id_iter, vSeqIdTempSet ) {
955  LOG_POST(Error << " " << *id_iter);
956  }
957  }
958 }
959 
960 void CAgpFastaComparator::x_CheckForDups( TUniqueSeqs & unique_ids,
961  const string & file_type )
962 {
963  ITERATE( TUniqueSeqs, unique_id_iter, unique_ids ) {
964  const TSeqIdSet & id_set = unique_id_iter->second;
965  if( id_set.size() > 1 ) {
966  CNcbiOstrstream errmsg;
967  errmsg << "WARNING: Identical sequences in " << file_type << ":";
968  ITERATE( TSeqIdSet, id_iter, id_set ) {
969  errmsg << " '" << *id_iter << "'";
970  }
971  LOG_POST( Error << (string)CNcbiOstrstreamToString(errmsg) );
972  }
973  }
974 }
975 
976 void CAgpFastaComparator::x_OutputSeqDifferences(
977  int diffs_to_find,
978  const TSeqIdSet & seqIdIntersection,
979  CTmpSeqVecStorage & temp_dir )
980 {
981  const static string kDiff = "/usr/bin/diff";
982  if( ! CExec::IsExecutable(kDiff) ) {
983  cerr << "No differences shown because cannot run " << kDiff << endl;
984  return;
985  }
986 
987  const static string kAwk = "/usr/bin/awk";
988  if( ! CExec::IsExecutable(kAwk) ) {
989  cerr << "No differences shown because cannot run " << kAwk << endl;
990  return;
991  }
992 
993  ITERATE( TSeqIdSet, id_iter, seqIdIntersection ) {
994  const CSeq_id_Handle & idh = *id_iter;
995  const string agp_file = temp_dir.GetFileName( CTmpSeqVecStorage::eType_AGP, idh );
996  const string obj_file = temp_dir.GetFileName( CTmpSeqVecStorage::eType_Obj, idh );
997 
998  cout << endl;
999  cout << "##### Comparing " << idh << " for AGP ('<') and Obj ('>'):" << endl;
1000  cout << endl;
1001 
1002  // This is a suboptimal implementation for multiple reasons:
1003  // - It won't work in Windows
1004  // - CExec::System is prone to exploits (though since agp_validate
1005  // is not setuid or setgid, this is less severe an issue than
1006  // it could be).
1007  // - Similarly, building a command-line from a stringstream
1008  // could also be dangerous.
1009  // I'm awaiting JIRA CXX-3145 to see if a superior
1010  // solution is possible. In particular, I would like the NCBI
1011  // C++ toolkit to have a diff library.
1012  std::stringstream cmd_strm;
1013  cmd_strm << kDiff << " '" << agp_file << "' '" << obj_file << "' 2> /dev/null | " << kAwk << " 'BEGIN { max_lines = " << diffs_to_find << "; left_seen = 0; right_seen = 0; } "
1014  << "/^</ { left_seen += 1; if( left_seen <= max_lines ) { print } } "
1015  << "/^>/ { right_seen += 1; if( right_seen <= max_lines ) { print } } "
1016  << "/^[0-9]/ { if( left_seen > right_seen ) { right_seen = left_seen } else { left_seen = right_seen } if( left_seen >= max_lines && right_seen >= max_lines) { exit } ; print } "
1017  << "/^-/ { print }'";
1018  CExec::System( cmd_strm.str().c_str() );
1019  }
1020 }
1021 
1022 void CAgpFastaComparator::x_SetBinaryVsText( CNcbiIstream & file_istrm,
1023  CFormatGuess::EFormat guess_format )
1024 {
1025  // set binary vs. text
1026  switch( guess_format ) {
1027  case CFormatGuess::eBinaryASN:
1028  file_istrm >> MSerial_AsnBinary;
1029  break;
1030  case CFormatGuess::eTextASN:
1031  file_istrm >> MSerial_AsnText;
1032  break;
1033  default:
1034  break;
1035  // a format where binary vs. text is irrelevant
1036  }
1037 }
1038 
1039 CAgpFastaComparator::EFileType CAgpFastaComparator::x_GuessFileType( const string & filename )
1040 {
1041  // To prevent us from reading huge files
1042  int iterations_remaining = 100;
1043 
1044  ifstream file_strm(filename.c_str());
1045  string line;
1046 
1047  // find first non-blank line
1048  while( file_strm && line.empty() &&
1049  iterations_remaining-- > 0 )
1050  {
1051  // get line and trim it
1052  NcbiGetline(file_strm, line, "\r\n");
1053  NStr::TruncateSpacesInPlace( line );
1054  }
1055 
1056  if( line.empty() ) {
1057  return eFileType_Unknown;
1058  }
1059 
1060  if( line[0] == '>' ) {
1061  return eFileType_FASTA;
1062  }
1063 
1064  if( line.find("::=") != NPOS ) {
1065  return eFileType_ASN1;
1066  }
1067 
1068  if( line[0] == '#' ) {
1069  return eFileType_AGP;
1070  }
1071 
1072  int num_tabs = 0;
1073  // did not use std::count because Sun WorkShop compiler defines it in
1074  // a non-standard way and this is cleaner than preprocessor directives
1075  ITERATE( string, str_iter, line ) {
1076  if( *str_iter == '\t' ) {
1077  ++num_tabs;
1078  }
1079  }
1080  if( num_tabs >= 7 ) {
1081  return eFileType_AGP;
1082  }
1083 
1084  return eFileType_Unknown;
1085 }
USING_SCOPE(objects)
#define COMP_LOG(msg)
USING_NCBI_SCOPE
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
User-defined methods of the data storage class.
Checksum and hash calculation classes.
void WriteData(EType type, objects::CSeq_entry_Handle seh)
string GetFileName(EType type, objects::CSeq_id_Handle idh)
EFileType x_GuessFileType(const string &filename)
bool x_GetCompAndObjSeqIds(TSeqIdSet &out_compSeqIds, TSeqIdSet &out_objSeqIds, const std::list< std::string > &agpFiles)
void x_CheckForDups(TUniqueSeqs &unique_ids, const string &file_type)
EResult Run(const std::list< std::string > &files, const std::string &loadlog, const std::string &agp_as_fasta_file, TDiffsToHide diffsToHide, int diffs_to_find)
void x_Process(const objects::CSeq_entry_Handle seh, TUniqueSeqs &seqs, int *in_out_pUniqueBioseqsLoaded, int *in_out_pBioseqsSkipped, CNcbiOfstream *pDataOutFile)
unique_ptr< CNcbiOfstream > m_pLoadLogFile
void x_ProcessObjects(const list< string > &filenames, TUniqueSeqs &fasta_ids, CTmpSeqVecStorage *temp_dir)
void x_ProcessAgps(const list< string > &filenames, TUniqueSeqs &agp_ids, CTmpSeqVecStorage *temp_dir)
pair< string, TSeqPos > TKey
void x_OutputSeqDifferences(int diffs_to_find, const TSeqIdSet &seqIdIntersection, CTmpSeqVecStorage &temp_dir)
void x_PrintDetailsOfLengthIssue(objects::CBioseq_Handle bioseq_h)
void x_OutputDifferingSeqIds(const TSeqIdSet &vSeqIdFASTAOnly, const TSeqIdSet &vSeqIdAGPOnly, TDiffsToHide diffs_to_hide, TSeqIdSet &out_seqIdIntersection)
void x_WriteDataAsFasta(CNcbiOfstream &dataOutFile, const objects::CSeq_id_Handle &idh, const std::string &data)
unique_ptr< CNcbiOfstream > m_pAgpAsFastaFile
This class is used to turn an AGP file into a vector of Seq-entry's.
static CRef< objects::CSeq_id > s_DefaultSeqIdFromStr(const std::string &str)
This is the default method used to turn strings into Seq-ids in AGP contexts.
virtual CRef< objects::CSeq_id > x_GetSeqIdFromStr(const std::string &str)
If you must change exactly how strings are turned into Seq-ids, you can override this in a subclass.
static CRef< objects::CSeq_id > s_LocalSeqIdFromStr(const std::string &str)
Turn a string into a local Seq-id (removing "lcl|" from the beginning if needed)
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
CChecksum – Checksum calculator.
Definition: checksum.hpp:302
Base class for reading FASTA sequences.
Definition: fasta.hpp:80
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, CReader *reader=0, CObjectManager::EIsDefault is_default=CObjectManager::eDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
Definition: gbloader.cpp:366
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, CObjectManager::EIsDefault is_default=CObjectManager::eNonDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
Argument-less loader - for compatibility only, unusable.
Class for managing LDS2 database and related data files.
Definition: lds2.hpp:46
CNcbiOstrstreamToString class helps convert CNcbiOstrstream to a string. Sample usage:
Definition: ncbistre.hpp:802
CScope –.
Definition: scope.hpp:92
SeqVector related exceptions.
CSeqVector –.
Definition: seq_vector.hpp:65
CSeq_entry_Handle –.
CTime –.
Definition: ncbitime.hpp:296
CTmpFile –.
Definition: ncbifile.hpp:2353
container_type::const_iterator const_iterator
Definition: map.hpp:53
const_iterator begin() const
Definition: map.hpp:151
const_iterator end() const
Definition: map.hpp:152
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
Definition: map.hpp:338
const_iterator begin() const
Definition: set.hpp:135
bool empty() const
Definition: set.hpp:133
const_iterator find(const key_type &key) const
Definition: set.hpp:137
const_iterator end() const
Definition: set.hpp:136
Operators to edit gaps in sequences.
#define true
Definition: bool.h:35
#define false
Definition: bool.h:36
static const char * str(char *buf, int n)
Definition: stats.c:84
static void md5(const char *src, const char *out)
Definition: challenge.c:77
char data[12]
Definition: iconv.c:80
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
@ null
Definition: ncbimisc.hpp:646
string
Definition: cgiapp.hpp:687
#define NULL
Definition: ncbistd.hpp:225
void AddLine(const char *line, size_t len)
Definition: checksum.hpp:609
void GetMD5Digest(unsigned char digest[16]) const
Return calculated MD5 digest.
Definition: checksum.hpp:637
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
virtual const char * what(void) const noexcept
Standard report (includes full backlog).
Definition: ncbiexpt.cpp:342
static string GetTmpDir(void)
Get temporary directory.
Definition: ncbifile.cpp:3660
virtual bool Exists(void) const
Check if directory "dirname" exists.
Definition: ncbifile.hpp:4066
static char GetPathSeparator(void)
Get path separator symbol specific for the current platform.
Definition: ncbifile.cpp:433
bool Create(TCreateFlags flags=fCreate_Default) const
Create the directory using "dirname" passed in the constructor.
Definition: ncbifile.cpp:4071
const string & GetPath(void) const
Get entry path.
Definition: ncbifile.hpp:3911
#define ASN(...)
Definition: serialbase.hpp:889
long TFlags
binary OR of EFlags
Definition: fasta.hpp:117
@ fAddMods
Parse defline mods and add to SeqEntry.
Definition: fasta.hpp:104
@ fNoSeqData
Parse the deflines but skip the data.
Definition: fasta.hpp:94
@ fDisableParseRange
No ranges in seq-ids. Ranges part of seq-id instead.
Definition: fasta.hpp:114
static EAccessionInfo IdentifyAccession(const CTempString &accession, TParseFlags flags=fParse_AnyRaw)
Deduces information from a bare accession a la WHICH_db_accession; may report false negatives on prop...
Definition: Seq_id.cpp:1634
EAccessionInfo
For IdentifyAccession (below)
Definition: Seq_id.hpp:220
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
string AsString(void) const
@ fAcc_prot
Definition: Seq_id.hpp:252
@ fAcc_nuc
Definition: Seq_id.hpp:251
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
@ eGetId_Best
return the "best" gi (uses FindBestScore(), with CSeq_id::CalculateScore() as the score function)
Definition: sequence.hpp:101
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CBioseq_Handle GetBioseqHandle(const CSeq_id &id)
Get bioseq handle by seq-id.
Definition: scope.cpp:95
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
bool CanGetRange(TSeqPos start, TSeqPos stop) const
Check if the sequence data is available for the interval [start, stop).
Definition: seq_vector.cpp:292
void GetSeqData(TSeqPos start, TSeqPos stop, string &buffer) const
Fill the buffer string with the sequence data for the interval [start, stop).
Definition: seq_vector.cpp:304
const_iterator begin(void) const
Definition: seq_vector.hpp:298
const_iterator end(void) const
Definition: seq_vector.hpp:305
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
static TPid GetPid(void)
Get process identifier (pid) for the current process.
IO_PREFIX::ofstream CNcbiOfstream
Portable alias for ofstream.
Definition: ncbistre.hpp:500
CNcbiIstream & NcbiGetline(CNcbiIstream &is, string &str, char delim, string::size_type *count=NULL)
Read from "is" to "str" up to the delimiter symbol "delim" (or EOF)
const char * Endl(void)
Platform-specific EndOfLine.
Definition: ncbistre.cpp:184
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5414
string AsString(const CTimeFormat &format=kEmptyStr, TSeconds out_tz=eCurrentTimeZone) const
Transform time to string.
Definition: ncbitime.cpp:1512
@ eCurrent
Use current time. See also CCurrentTime.
Definition: ncbitime.hpp:300
bool IsLocal(void) const
Check if variant Local is selected.
Definition: Seq_id_.hpp:775
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
exit(2)
FILE * file
int i
yy_size_t n
static void hex(unsigned char c)
Definition: mdb_dump.c:56
const struct ncbi::grid::netcache::search::fields::KEY key
unsigned int a
Definition: ncbi_localip.c:102
EIPRangeType t
Definition: ncbi_localip.c:101
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines command line argument related classes.
int isalnum(Uchar c)
Definition: ncbictype.hpp:62
Defines unified interface to application:
Defines a portable execute class.
static Format format
Definition: njn_ioutil.cpp:53
std::istream & in(std::istream &in_, double &x_)
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
void copy(Njn::Matrix< S > *matrix_, const Njn::Matrix< T > &matrix0_)
Definition: njn_matrix.hpp:613
The Object manager core.
#define BEGIN(X)
CRef< objects::CObjectManager > om
Definition: type.c:6
#define _TROUBLE
#define _ASSERT
Modified on Wed Sep 04 15:05:07 2024 by modify_doxy.py rev. 669887