NCBI C++ ToolKit
tls.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: tls.cpp 92208 2020-12-22 18:37:20Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Colleen Bollin
27  *
28  * File Description:
29  * 1. open two files (one has contigs, one has master)
30  * 2. for each contig, calculate the targeted locus name and add
31  * AutodefOptions object with targeted locus name
32  * 3. calculate the consensus targeted locus name and
33  * apply it to the master
34  *
35  */
36 
37 #include <ncbi_pch.hpp>
38 #include <corelib/ncbistd.hpp>
39 #include <corelib/ncbistre.hpp>
40 #include <corelib/ncbiapp.hpp>
41 #include <corelib/ncbienv.hpp>
42 #include <corelib/ncbiargs.hpp>
43 #include <corelib/ncbiutil.hpp>
44 
45 #include <serial/serial.hpp>
46 #include <serial/objistr.hpp>
47 #include <serial/objostr.hpp>
48 #include <serial/objectio.hpp>
49 #include <serial/objhook.hpp>
50 
52 #include <connect/ncbi_util.h>
53 
54 // Objects includes
56 #include <objects/seq/Bioseq.hpp>
58 
61 #include <util/line_reader.hpp>
62 
64 
65 // Object Manager includes
67 #include <objmgr/scope.hpp>
68 #include <objmgr/seq_descr_ci.hpp>
69 #include <objmgr/bioseq_handle.hpp>
70 #include <objmgr/bioseq_ci.hpp>
71 #include <objmgr/seqdesc_ci.hpp>
72 
74 
75 #include <serial/objcopy.hpp>
76 
77 
78 #include <common/test_assert.h> /* This header must go last */
79 
80 
81 using namespace ncbi;
82 using namespace objects;
83 
// Presumably the application version string. NOTE(review): no use of this
// constant is visible in this listing -- confirm it is still needed.
84 const char * TLS_APP_VER = "1.0";
85 
86 /////////////////////////////////////////////////////////////////////////////
87 //
88 // Application
89 //
90 
91 
93 {
// Body of the CTLSHandler class (the line carrying the `class` declaration is
// absent from this listing). The class owns one input and one output object
// stream and declares the routines that read an ASN.1 object, visit it, and
// write it back out.
94 public:
// Obtains the object manager via CObjectManager::GetInstance(); the
// initializer-list value 0 is overwritten immediately in the body.
95  CTLSHandler() : m_ObjMgr(0)
96  { m_ObjMgr = CObjectManager::GetInstance(); }
97 
98  virtual ~CTLSHandler() { }
99 
// Per-Bioseq hook. The base implementation is empty; it is declared virtual
// so that other classes can supply their own ProcessBioseq.
100  virtual void ProcessBioseq(CBioseq_Handle bh) {}
101 
// Stream setup and top-level processing entry points.
102  void OpenInputFile(const string& filename, bool binary);
103  void OpenOutputFile(const string& filename, bool binary);
104  void ProcessAsnInput(void);
105  void ProcessSeqSubmit(void);
106  void ProcessSeqEntry(void);
107  void ProcessSet(void);
108 
// Overload that processes an already-loaded Seq-entry.
109  void ProcessSeqEntry(CRef<CSeq_entry> se);
110 
// Readers for the object body from m_In.
111  CRef<CBioseq_set> ReadBioseqSet(void);
112  CRef<CSeq_entry> ReadSeqEntry(void);
113 
// NOTE(review): no definition of StreamFile is visible in this listing --
// confirm that it is defined and used somewhere.
114  void StreamFile(const string& infile, const string& outfile, bool binary);
115 
116 protected:
// NOTE(review): listing line 117 is missing here; the cross-reference section
// of this page gives it as the declaration `CRef<CObjectManager> m_ObjMgr`.
// Input and output object streams, (re)assigned by OpenInputFile and
// OpenOutputFile.
118  unique_ptr<CObjectIStream> m_In;
119  unique_ptr<CObjectOStream> m_Out;
120 
// Creates a fresh scope on m_ObjMgr.
121  CRef<CScope> BuildScope(void);
122 
123 };
124 
125 
// Opens `file` as the input object stream and stores it in m_In, replacing
// whatever stream m_In held before.
// NOTE(review): the line that declares `format` (listing line 129) is missing
// from this listing; presumably it chooses between ASN.1 binary and ASN.1 text
// based on `binary` -- confirm against the real source.
126 void CTLSHandler::OpenInputFile(const string& file, bool binary)
127 {
128  // file format
130 
131  m_In.reset(CObjectIStream::Open(file, format));
132 
133 }
134 
135 
// Opens `file` as the output object stream and stores it in m_Out, replacing
// whatever stream m_Out held before.
// NOTE(review): the line that declares `format` (listing line 139) is missing
// from this listing; presumably it chooses between ASN.1 binary and ASN.1 text
// based on `binary` -- confirm against the real source.
136 void CTLSHandler::OpenOutputFile(const string& file, bool binary)
137 {
138  // file format
140 
141  m_Out.reset(CObjectOStream::Open(file, format));
142 
143 }
144 
145 
147 {
// Body of CTLSHandler::BuildScope (the signature line is absent from this
// listing). Creates a new CScope on the handler's object manager, calls
// AddDefaults() on it, and returns it to the caller.
148  CRef<CScope> scope(new CScope(*m_ObjMgr));
149  scope->AddDefaults();
150 
151  return scope;
152 }
153 
154 
156 {
// Body of CTLSHandler::ProcessAsnInput (the signature line is absent from this
// listing). Reads the type name from the header of m_In and dispatches to the
// matching Process* routine.
157  // Process file based on its content
158  // Unless otherwise specified we assume the file in hand is
159  // a Seq-entry ASN.1 file, other options are a Seq-submit or NCBI
160  // Release file (batch processing) where we process each Seq-entry
161  // at a time.
// NOTE(review): despite the wording above, a header that is none of the three
// names tested below is not treated as a Seq-entry; it raises the exception
// at the end of this function.
162  string header = m_In->ReadFileHeader();
163 
164  bool unhandled = false;
165  if (header == "Seq-submit") { // Seq-submit
166  ProcessSeqSubmit();
167  } else if (header == "Seq-entry") { // Seq-entry
168  ProcessSeqEntry();
169  } else if (header == "Bioseq-set") { // Bioseq-set
170  ProcessSet();
171  } else {
172  unhandled = true;
173  }
// Unknown top-level type: report it, including the header text that was read.
174  if (unhandled) {
175  NCBI_THROW(CException, eUnknown, "Unhandled type " + header);
176  }
177 
178 }
179 
181 {
// Body of the CTLSHandler::ProcessSeqEntry overload that uses `se` (the
// signature line is absent from this listing; the class declares
// ProcessSeqEntry(CRef<CSeq_entry> se)). Adds the entry to a freshly built
// scope, calls ProcessBioseq on each item yielded by `bi`, then removes the
// entry from the scope again.
182  CRef<CScope> scope = BuildScope();
183  CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(*se);
// NOTE(review): the declaration of `bi` (listing line 184) is missing from
// this listing; presumably it is a Bioseq iterator over `seh` -- confirm.
185  while (bi) {
186  ProcessBioseq(*bi);
187  ++bi;
188  }
189  scope->RemoveTopLevelSeqEntry(seh);
190 }
191 
192 
194 {
// Body of CTLSHandler::ProcessSeqSubmit (the signature line is absent from
// this listing). Reads a Seq-submit from m_In, processes the entries it
// carries, and writes the whole Seq-submit to m_Out.
// NOTE(review): the declaration of `ss` (listing line 195) is missing here.
196 
// The file header is not read here (eNoFileHeader); ProcessAsnInput reads it
// with ReadFileHeader() before dispatching to this function.
197  // Get seq-submit to process
198  m_In->Read(ObjectInfo(*ss), CObjectIStream::eNoFileHeader);
199 
200  // Process Seq-submit
// NOTE(review): `scope` is not referenced by any line visible in this
// function -- check whether it is needed.
201  CRef<CScope> scope = BuildScope();
202  if (ss->GetData().IsEntrys()) {
// NOTE(review): the loop header (listing line 203) is missing; presumably it
// iterates `se` over the Seq-submit's entries -- confirm.
204  ProcessSeqEntry(*se);
205  }
206  }
207  *m_Out << *ss;
208 
209 }
210 
211 
213 {
// Body of CTLSHandler::ProcessSet (the signature line is absent from this
// listing). Reads a Bioseq-set from m_In, processes the entries of its
// seq-set when that member is set, and writes the set to m_Out.
214  // Get Bioseq-set to process
215  CRef<CBioseq_set> set(ReadBioseqSet());
216  if (set && set->IsSetSeq_set()) {
// NOTE(review): the loop header (listing line 217) is missing; presumably it
// iterates `se` over the members of the seq-set -- confirm.
218  ProcessSeqEntry(*se);
219  }
220  }
// `set` is dereferenced unconditionally here although it is null-checked
// above; the visible body of ReadBioseqSet always returns a newly allocated
// object.
221  *m_Out << *set;
222 }
223 
224 
226 {
// Body of the argument-less CTLSHandler::ProcessSeqEntry (the signature line
// is absent from this listing). Reads one Seq-entry from m_In, runs the
// per-entry processing on it, and writes the entry to m_Out.
227  // Get seq-entry to process
228  CRef<CSeq_entry> se(ReadSeqEntry());
229 
230  ProcessSeqEntry(se);
231  *m_Out << *se;
232 }
233 
234 
235 
237 {
// Body of CTLSHandler::ReadBioseqSet (the signature line is absent from this
// listing). Allocates a CBioseq_set, fills it from m_In, and returns it.
// The read uses eNoFileHeader; ProcessAsnInput reads the header with
// ReadFileHeader() before any reader is called.
238  CRef<CBioseq_set> set(new CBioseq_set());
239  m_In->Read(ObjectInfo(*set), CObjectIStream::eNoFileHeader);
240 
241  return set;
242 }
243 
244 
246 {
// Body of CTLSHandler::ReadSeqEntry (the signature line is absent from this
// listing). Fills `se` from m_In without reading a file header and returns it.
// NOTE(review): the declaration of `se` (listing line 247) is missing here;
// presumably it allocates a new CSeq_entry -- confirm.
248  m_In->Read(ObjectInfo(*se), CObjectIStream::eNoFileHeader);
249 
250  return se;
251 }
252 
253 
254 
256 {
// Body of the CTLSContigHandler class (the line carrying the `class`
// declaration is absent from this listing; the constructor shows it derives
// from CTLSHandler). Tracks a consensus targeted locus name across the
// Bioseqs it processes, or applies a preset one.
257 public:
// Starts with an empty consensus and with consensus calculation enabled.
258  CTLSContigHandler() : CTLSHandler(), m_Consensus(kEmptyStr), m_CalculateConsensus(true) {}
259  virtual ~CTLSContigHandler() {}
260 
261  virtual void ProcessBioseq(CBioseq_Handle bh);
// Returns the current consensus name.
262  const string& GetConsensus() { return m_Consensus; }
// Stores a caller-supplied consensus and turns off calculation, so that
// ProcessBioseq applies this value instead of computing one.
263  void SetConsensus(const string& consensus) { m_Consensus = consensus; m_CalculateConsensus = false; }
// True until ProcessBioseq has seen a non-blank generated name.
// NOTE(review): this is a public data member without the m_ prefix used by
// the other members -- confirm that is intended.
264  bool first = true;
265 
266 protected:
267  string m_Consensus;
// NOTE(review): listing line 268 is missing here; the cross-reference section
// of this page gives it as the declaration `bool m_CalculateConsensus`.
269 };
270 
271 
273 {
// Body of CTLSContigHandler::ProcessBioseq (the signature line is absent from
// this listing). In calculating mode it generates a targeted locus name for
// `bh` and folds it into m_Consensus; otherwise it applies the preset
// m_Consensus to `bh`.
274  if (m_CalculateConsensus) {
275  string tls = edit::GenerateTargetedLocusName(bh);
276 
// A blank generated name is skipped entirely: it neither updates the
// consensus nor clears `first`.
277  if (!NStr::IsBlank(tls)) {
278  if (first || !NStr::IsBlank(m_Consensus)) {
279  // if an earlier collision rendered the consensus name blank, it should stay blank
280  m_Consensus = edit::GetTargetedLocusNameConsensus(m_Consensus, tls);
281  }
// NOTE(review): listing line 282 is missing here; its content cannot be
// determined from this listing.
283  first = false;
284  }
285  } else {
286  edit::SetTargetedLocusName(bh, m_Consensus);
287  }
288 }
289 
290 
292 {
// Body of the CTLSMasterHandler class (the line carrying the `class`
// declaration is absent from this listing; the constructor shows it derives
// from CTLSHandler). Holds a consensus name that its ProcessBioseq applies
// to each Bioseq.
293 public:
// Starts with an empty consensus name.
294  CTLSMasterHandler() : CTLSHandler(), m_Consensus(kEmptyStr) {}
295  virtual ~CTLSMasterHandler() {}
296 
// Stores the name that ProcessBioseq will apply.
297  void SetConsensus(const string& consensus) { m_Consensus = consensus; }
298  virtual void ProcessBioseq(CBioseq_Handle bh);
299 
300 protected:
301  string m_Consensus;
302 };
303 
304 
306 {
// Body of CTLSMasterHandler::ProcessBioseq (the signature line is absent from
// this listing). Applies the stored consensus name to `bh`.
307  edit::SetTargetedLocusName(bh, m_Consensus);
308 }
309 
310 
// Command-line application class: Init() describes the arguments and Run()
// processes the contig files and then the master record.
311 class CTLSApp : public CNcbiApplication
312 {
313 public:
314  CTLSApp(void);
315 
316  virtual void Init(void);
317  virtual int Run (void);
318 
319 private:
320 
321  void Setup(const CArgs& args);
322 
// NOTE(review): listing lines 323-324 are missing here; the cross-reference
// section of this page gives them as the members
// `CTLSContigHandler m_ContigHandler` and `CTLSMasterHandler m_MasterHandler`.
325 };
326 
327 
329 {
// Body of the CTLSApp constructor (the signature line is absent from this
// listing). Intentionally empty.
330 }
331 
332 
// Declares the command-line arguments of the tool and registers them with the
// application.
// NOTE(review): the final line of most Add*Key calls below (listing lines 341,
// 344, 346, 349, 352, 357, 362) is missing from this listing; presumably each
// carries the argument type and the closing `);` -- confirm.
333 void CTLSApp::Init(void)
334 {
335  // Prepare command line descriptions
336 
337  // Create
338  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
339 
// "master" is the only argument added with AddKey; the other keys are optional.
340  arg_desc->AddKey("master", "MasterRecord", "File with master record",
342  arg_desc->AddOptionalKey("contigsindir", "ContigDirectory",
343  "Directory that contains contig files, defaults to current directory",
// NOTE(review): the help text "Suffix for contig flags" looks like a typo for
// "files" -- confirm before changing the user-visible string.
345  arg_desc->AddOptionalKey("contigsinsuffix", "ContigFileSuffix", "Suffix for contig flags",
// NOTE(review): the help text mentions a "-p value", but no "p" argument is
// declared in this function; Run() defaults the output directory to the
// contig input directory.
347  arg_desc->AddOptionalKey("contigsoutdir", "ContigsOutputDirectory",
348  "Contigs Output Directory, defaults to -p value",
350  arg_desc->AddOptionalKey("contigsoutsuffix", "ContigsOutputSuffix",
351  "Suffix for contig output files, defaults to .tls",
353 
354 
355  arg_desc->AddOptionalKey("masterout", "MasterRecordOutputFile",
356  "Master Record Output File (defaults to [master].tls)",
// Run() passes this one flag to both the input and the output streams.
358  arg_desc->AddFlag("b", "Input is in binary format");
359 
360  arg_desc->AddOptionalKey("targetedlocusname", "TargetedLocusName",
361  "Optional value to use for Targeted Locus Name for master and all contigs instead of calculated value",
363 
364  // Program description
365  string prog_description = "Targeted Locus Name Generator\n";
366  arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
367  prog_description, false);
368 
// release() hands the raw pointer to SetupArgDescriptions.
369  // Pass argument descriptions to the application
370  SetupArgDescriptions(arg_desc.release());
371 
372 }
373 
374 
// Main workflow: run the contig handler over every matching contig file,
// then pass the resulting consensus name to the master handler and process
// the master record. Returns 0.
375 int CTLSApp::Run(void)
376 {
377  const CArgs& args = GetArgs();
378  Setup(args);
379 
// A user-supplied name switches the contig handler from calculating a
// consensus to applying this value (see CTLSContigHandler::SetConsensus).
380  if (args["targetedlocusname"]) {
381  m_ContigHandler.SetConsensus(args["targetedlocusname"].AsString());
382  }
383 
384  // process contigs
385 
386  string dir_name = ".";
387  if (args["contigsindir"]) {
388  dir_name = args["contigsindir"].AsString();
389  }
390 
// Input suffix defaults to ".bss". A user-supplied value has one pair of
// surrounding single quotes removed, then one leading '*' removed, because
// the mask built below adds its own leading '*'.
391  string suffix = ".bss";
392  if (args["contigsinsuffix"]) {
393  suffix = args["contigsinsuffix"].AsString();
394  if (NStr::StartsWith(suffix, "'") && NStr::EndsWith(suffix, "'")) {
395  suffix = suffix.substr(1, suffix.length() - 2);
396  }
397  if (NStr::StartsWith(suffix, "*")) {
398  suffix = suffix.substr(1);
399  }
400  }
401 
// Output directory defaults to the input directory.
402  string outdir = dir_name;
403  if (args["contigsoutdir"]) {
404  outdir = args["contigsoutdir"].AsString();
405  }
406 
407  string outsuffix = ".tls";
408  if (args["contigsoutsuffix"]) {
409  outsuffix = args["contigsoutsuffix"].AsString();
410  }
411 
412  CDir dir(dir_name);
413  string mask = "*" + suffix;
414 
// NOTE(review): the declaration of `files` (listing line 415) is missing from
// this listing; presumably it is the list of entries of `dir` matching
// `mask` -- confirm.
// NOTE(review): `fname` already contains `dir_name`, and it is then used as
// the base name under `outdir`, so the output path embeds the input directory
// path. This looks wrong when -contigsindir and -contigsoutdir differ (or the
// input directory is absolute); the bare entry name plus `outsuffix` may have
// been intended -- confirm against CDirEntry::MakePath semantics.
416  ITERATE(CDir::TEntries, ii, files) {
417  string fname = CDirEntry::MakePath(dir_name, (*ii)->GetName());
418  string oname = CDirEntry::MakePath(outdir, fname + outsuffix);
419  m_ContigHandler.OpenInputFile(fname, args["b"]);
420  m_ContigHandler.OpenOutputFile(oname, args["b"]);
421  m_ContigHandler.ProcessAsnInput();
422  }
423 
// Either the value calculated across the contigs or the user-supplied one.
424  const string& consensus = m_ContigHandler.GetConsensus();
425 
426  // update master
427  m_MasterHandler.SetConsensus(consensus);
428  const string& master_file = args["master"].AsString();
429  m_MasterHandler.OpenInputFile(master_file, args["b"]);
// Master output goes to -masterout when given, otherwise "<master>.tls".
430  m_MasterHandler.OpenOutputFile(args["masterout"] ? args["masterout"].AsString() : master_file + ".tls", args["b"]);
431  m_MasterHandler.ProcessAsnInput();
432 
433  return 0;
434 }
435 
436 
// One-time setup for the CONNECT library: sets its registry from the
// application's configuration. `args` is not used by any line visible here.
// NOTE(review): listing line 440 is missing from this listing; given the
// comment below it presumably installs the CONNECT log -- confirm.
437 void CTLSApp::Setup(const CArgs& args)
438 {
439  // Setup application registry and logs for CONNECT library
441  CORE_SetREG(REG_cxx2c(&GetConfig(), false));
// The MT lock setup is left commented out in the original.
442  // Setup MT-safety for CONNECT library
443  // CORE_SetLOCK(MT_LOCK_cxx2c());
444 
445 }
446 
447 
448 
449 
450 /////////////////////////////////////////////////////////////////////////////
451 // MAIN
452 
453 
// Program entry point: constructs a CTLSApp and returns the result of its
// AppMain() call, passing eDS_Default for the diagnostics setup.
454 int main(int argc, const char* argv[])
455 {
456  return CTLSApp().AppMain(argc, argv, 0, eDS_Default, 0);
457 }
User-defined methods of the data storage class.
ncbi::TMaskedQueryRegions mask
#define true
Definition: bool.h:35
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
CBioseq_Handle –.
CDir –.
Definition: ncbifile.hpp:1695
CScope –.
Definition: scope.hpp:92
CSeq_entry_Handle –.
Definition: Seq_entry.hpp:56
Definition: tls.cpp:312
virtual int Run(void)
Definition: tls.cpp:375
CTLSMasterHandler m_MasterHandler
Definition: tls.cpp:324
CTLSApp(void)
Definition: tls.cpp:328
CTLSContigHandler m_ContigHandler
Definition: tls.cpp:323
virtual void Init(void)
Definition: tls.cpp:333
void Setup(const CArgs &args)
Definition: tls.cpp:437
void SetConsensus(const string &consensus)
Definition: tls.cpp:263
virtual ~CTLSContigHandler()
Definition: tls.cpp:259
string m_Consensus
Definition: tls.cpp:267
const string & GetConsensus()
Definition: tls.cpp:262
bool m_CalculateConsensus
Definition: tls.cpp:268
virtual void ProcessBioseq(CBioseq_Handle bh)
Definition: tls.cpp:272
virtual void ProcessBioseq(CBioseq_Handle bh)
Definition: tls.cpp:100
virtual ~CTLSHandler()
Definition: tls.cpp:98
CRef< CBioseq_set > ReadBioseqSet(void)
Definition: tls.cpp:236
unique_ptr< CObjectIStream > m_In
Definition: tls.cpp:118
void ProcessSet(void)
Definition: tls.cpp:212
CTLSHandler()
Definition: tls.cpp:95
void StreamFile(const string &infile, const string &outfile, bool binary)
void ProcessSeqSubmit(void)
Definition: tls.cpp:193
void OpenOutputFile(const string &filename, bool binary)
Definition: tls.cpp:136
CRef< CSeq_entry > ReadSeqEntry(void)
Definition: tls.cpp:245
void ProcessSeqEntry(void)
Definition: tls.cpp:225
unique_ptr< CObjectOStream > m_Out
Definition: tls.cpp:119
void ProcessAsnInput(void)
Definition: tls.cpp:155
void OpenInputFile(const string &filename, bool binary)
Definition: tls.cpp:126
CRef< CObjectManager > m_ObjMgr
Definition: tls.cpp:117
CRef< CScope > BuildScope(void)
Definition: tls.cpp:146
virtual void ProcessBioseq(CBioseq_Handle bh)
Definition: tls.cpp:305
virtual ~CTLSMasterHandler()
Definition: tls.cpp:295
string m_Consensus
Definition: tls.cpp:301
void SetConsensus(const string &consensus)
Definition: tls.cpp:297
Include a standard set of the NCBI C++ Toolkit most basic headers.
static void Init(void)
Definition: cursor6.c:76
static DLIST_TYPE *DLIST_NAME() first(DLIST_LIST_TYPE *list)
Definition: dlist.tmpl.h:46
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define CNcbiApplication
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eOutputFile
Name of file (must be writable)
Definition: ncbiargs.hpp:596
@ eDirectory
Name of file directory.
Definition: ncbiargs.hpp:598
@ eDS_Default
Try standard log file (app.name + ".log") in /log/, use stderr on failure.
Definition: ncbidiag.hpp:1790
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
TEntries GetEntries(const string &mask=kEmptyStr, TGetEntriesFlags flags=0) const
Get directory entries based on the specified "mask".
Definition: ncbifile.cpp:3846
static string MakePath(const string &dir=kEmptyStr, const string &base=kEmptyStr, const string &ext=kEmptyStr)
Assemble a path from basic components.
Definition: ncbifile.cpp:413
list< TEntry > TEntries
Definition: ncbifile.hpp:1750
@ eFile
Regular file.
Definition: ncbifile.hpp:783
@ eUnknown
Definition: app_popup.hpp:72
ESerialDataFormat
Data file format.
Definition: serialdef.hpp:71
@ eSerial_AsnText
ASN.1 text.
Definition: serialdef.hpp:73
@ eSerial_AsnBinary
ASN.1 binary.
Definition: serialdef.hpp:74
pair< TObjectPtr, TTypeInfo > ObjectInfo(C &obj)
Definition: objectinfo.hpp:762
static CObjectOStream * Open(ESerialDataFormat format, CNcbiOstream &outStream, bool deleteOutStream)
Create serial object writer and attach it to an output stream.
Definition: objostr.cpp:126
static CObjectIStream * Open(ESerialDataFormat format, CNcbiIstream &inStream, bool deleteInStream)
Create serial object reader and attach it to an input stream.
Definition: objistr.cpp:195
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
CSeq_entry_Handle AddTopLevelSeqEntry(CSeq_entry &top_entry, TPriority pri=kPriority_Default, EExist action=eExist_Default)
Add seq_entry, default priority is higher than for defaults or loaders. Add object to the scope with p...
Definition: scope.cpp:522
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
Definition: scope.cpp:504
void RemoveTopLevelSeqEntry(const CTSE_Handle &entry)
Revoke TSE previously added using AddTopLevelSeqEntry() or AddBioseq().
Definition: scope.cpp:376
#define kEmptyStr
Definition: ncbistr.hpp:123
static bool EndsWith(const CTempString str, const CTempString end, ECase use_case=eCase)
Check if a string ends with a specified suffix value.
Definition: ncbistr.hpp:5430
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5412
void Run(void)
Enter the main loop.
LOG LOG_cxx2c(void)
Create LOG on top of C++ Toolkit CNcbiDiag.
void CORE_SetREG(REG rg)
Set the registry (no registry if "rg" is passed zero) – to be used by the core internals.
Definition: ncbi_util.c:692
REG REG_cxx2c(IRWRegistry *reg, bool pass_ownership=false)
Convert a C++ Toolkit registry object to a REG registry.
void CORE_SetLOG(LOG lg)
Set the log handle (no logging if "lg" is passed zero) – to be used by the core internals (CORE LOG).
Definition: ncbi_util.c:123
bool IsSetSeq_set(void) const
Check if a value has been assigned to Seq_set data member.
const TSeq_set & GetSeq_set(void) const
Get the Seq_set member data.
list< CRef< CSeq_entry > > TSeq_set
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
list< CRef< CSeq_entry > > TEntrys
const TEntrys & GetEntrys(void) const
Get the variant data.
const TData & GetData(void) const
Get the Data member data.
bool IsEntrys(void) const
Check if variant Entrys is selected.
FILE * file
Lightweight interface for getting lines of data with minimal memory copying.
Magic spell ;-) needed for some weird compilers... very empiric.
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
#define GetArgs
Avoid preprocessor name clash with the NCBI C Toolkit.
Definition: ncbiapp_api.hpp:53
Defines command line argument related classes.
Defines unified interface to application:
NCBI C++ stream class wrappers for triggering between "new" and "old" C++ stream libraries.
Useful/utility classes and methods.
static Format format
Definition: njn_ioutil.cpp:53
The Object manager core.
static const char * suffix[]
Definition: pcregrep.c:408
static FILE * outfile
Definition: pcretest.c:1033
void SetTargetedLocusName(CBioseq_Handle seq, const string &tls)
string GetTargetedLocusNameConsensus(const string &tls1, const string &tls2)
string GenerateTargetedLocusName(CBioseq_Handle seq)
int main(int argc, const char *argv[])
Definition: tls.cpp:454
const char * TLS_APP_VER
Definition: tls.cpp:84
CRef< CScope > BuildScope(void)
Modified on Wed Feb 28 07:12:39 2024 by modify_doxy.py rev. 669887