NCBI C++ ToolKit
concat_seqentries.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: concat_seqentries.cpp 95044 2021-09-29 19:04:12Z whlavina $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Cheinan Marks
27  *
28  * File Description:
29  * Concatenate all the seq entry blobs in an ASN cache. No indexing is done.
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbiapp.hpp>
35 #include <corelib/ncbienv.hpp>
36 #include <corelib/ncbiargs.hpp>
37 #include <corelib/ncbifile.hpp>
38 #include <corelib/ncbistre.hpp>
39 
40 #include <util/compress/stream.hpp>
41 #include <util/compress/zlib.hpp>
42 
43 #include <serial/serial.hpp>
44 #include <serial/objostrasnb.hpp>
45 #include <serial/objistrasnb.hpp>
46 
48 
51 
54 
55 
56 /////////////////////////////////////////////////////////////////////////////
57 // CConcatSeqEntriesApplication::
58 
59 
61 {
62 private:
63  virtual void Init(void);
64  virtual int Run(void);
65  virtual void Exit(void);
66 };
67 
68 
69 void DumpSeqEntries( CDir & cache_path, CNcbiOstream & output_stream );
70 
71 /////////////////////////////////////////////////////////////////////////////
72 // Init: describe the command-line arguments accepted by this application
73 
74 
76 {
77  // Create command-line argument descriptions class
78  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
79 
80  // Specify USAGE context
81  arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
82  "Concatenate all the seq entries from a cache.");
83 
84  arg_desc->AddKey("cache", "Cache",
85  "Path to ASN.1 cache",
87 
88  arg_desc->AddDefaultKey( "o", "SeqEntryFile",
89  "Write the seq entries here.",
91  "-");
92 
93  // Setup arg.descriptions for this application
94  SetupArgDescriptions(arg_desc.release());
95 }
96 
97 
98 
100 {
101  // Get arguments
102  const CArgs& args = GetArgs();
103 
104  CStopWatch sw;
105  sw.Start();
106 
107  CDir cache_dir( args["cache"].AsString() );
108  if (! cache_dir.Exists() ) {
109  ERR_POST( Error << cache_dir.GetPath() << " does not exist!" );
110  return 1;
111  } else if ( ! cache_dir.IsDir() ) {
112  ERR_POST( Error << cache_dir.GetPath() << " does not point to a "
113  << "valid cache path!" );
114  return 2;
115  }
116 
117  DumpSeqEntries( cache_dir, args["o"].AsOutputFile() );
118 
119  return 0;
120 }
121 
122 
123 void DumpSeqEntries( CDir & cache_dir, CNcbiOstream & output_stream )
124 {
125  CCache_blob a_blob;
126 
127  CDir::TEntries chunk_list
128  = cache_dir.GetEntries( NASNCacheFileName::GetChunkPrefix() + "*",
130  size_t chunk_count = 0;
131  size_t seq_entry_count = 0;
132  ITERATE( CDir::TEntries, chunk_file_iter, chunk_list ) {
133  CNcbiIfstream chunk_stream( (*chunk_file_iter)->GetPath().c_str() );
134  CObjectIStreamAsnBinary blob_asn_stream( chunk_stream );
135 
136  try {
137  while ( true ) {
138  blob_asn_stream >> a_blob;
139 
140  CCompressionOStream zip(output_stream,
143 
144  CObjectOStreamAsnBinary output_asn_stream( zip );
145  CSeq_entry a_seq_entry;
146  a_blob.UnPack( a_seq_entry );
147  output_asn_stream << a_seq_entry;
148  seq_entry_count++;
149  }
150  } catch( CEofException & ) {
151  // Ignore this -- we reached the end of the file.
152  } catch (...) {
153  ERR_POST( Error << "Object stream exception on chunk number " << chunk_count );
154  throw;
155  }
156 
157  chunk_count++;
158  LOG_POST( Info << "Finished processing " << (*chunk_file_iter)->GetPath() );
159  }
160 
161  LOG_POST( Info << seq_entry_count << " seq entries from "
162  << chunk_count << " chunks written." );
163 }
164 
165 
166 /////////////////////////////////////////////////////////////////////////////
167 // Cleanup
168 
169 
171 {
172  SetDiagStream(0);
173 }
174 
175 
176 /////////////////////////////////////////////////////////////////////////////
177 // MAIN
178 
179 
// Program entry point: constructs a temporary CConcatSeqEntriesApplication
// and returns the result of its AppMain(argc, argv) call to the caller.
180 int main(int argc, const char* argv[])
181 {
182  // Execute main application function
183  return CConcatSeqEntriesApplication().AppMain(argc, argv);
184 }
Contains the class definition for CAsnCache, the main client class for accessing the ASN cache data.
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
virtual int Run(void)
Run the application.
virtual void Init(void)
Initialize the application.
virtual void Exit(void)
Cleanup on application exit.
CDir –.
Definition: ncbifile.hpp:1695
CObjectIStreamAsnBinary –.
Definition: objistrasnb.hpp:59
CObjectOStreamAsnBinary –.
Definition: objostrasnb.hpp:58
Definition: Seq_entry.hpp:56
CStopWatch –.
Definition: ncbitime.hpp:1938
CZipStreamCompressor – zlib based compression stream processor.
Definition: zlib.hpp:765
USING_SCOPE(objects)
void DumpSeqEntries(CDir &cache_path, CNcbiOstream &output_stream)
int main(int argc, const char *argv[])
USING_NCBI_SCOPE
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:285
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:799
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1175
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eOutputFile
Name of file (must be writable)
Definition: ncbiargs.hpp:596
@ fGZip
Set of flags for gzip file support. See each flag description above.
Definition: zlib.hpp:120
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
void SetDiagStream(CNcbiOstream *os, bool quick_flush=true, FDiagCleanup cleanup=0, void *cleanup_data=0, const string &stream_name="")
Set diagnostic stream.
Definition: ncbidiag.cpp:8083
void Error(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1197
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
TEntries GetEntries(const string &mask=kEmptyStr, TGetEntriesFlags flags=0) const
Get directory entries based on the specified "mask".
Definition: ncbifile.cpp:3846
virtual bool Exists(void) const
Check if directory "dirname" exists.
Definition: ncbifile.hpp:4065
bool IsDir(EFollowLinks follow=eFollowLinks) const
Check whether a directory entry is a directory.
Definition: ncbifile.hpp:3946
list< TEntry > TEntries
Definition: ncbifile.hpp:1750
const string & GetPath(void) const
Get entry path.
Definition: ncbifile.hpp:3910
@ eIgnoreRecursive
Definition: ncbifile.hpp:1774
@ fCreateObjects
Create appropriate subclasses of CDirEntry (CFile,CDir,...), not just CDirEntry objects.
Definition: ncbifile.hpp:1758
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
void Start(void)
Start the timer.
Definition: ncbitime.hpp:2765
static CStopWatch sw
string GetChunkPrefix()
Definition: file_names.hpp:53
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines command line argument related classes.
Defines unified interface to application:
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
NCBI C++ stream class wrappers for triggering between "new" and "old" C++ stream libraries.
ZLib Compression API.
Modified on Sat Dec 09 04:44:20 2023 by modify_doxy.py rev. 669887