NCBI C++ ToolKit
bam2graph.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: bam2graph.cpp 91930 2020-12-16 19:23:30Z grichenk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Eugene Vasilchenko
27  *
28  * File Description:
29  * Sample test application for BAM coverage generator
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbiapp.hpp>
35 #include <corelib/ncbifile.hpp>
40 #include <serial/serial.hpp>
41 #include <serial/objostr.hpp>
42 
45 #include <objects/seq/seq__.hpp>
47 
48 #include <common/test_data_path.h>
49 
52 
53 /////////////////////////////////////////////////////////////////////////////
54 // CBam2GraphApp::
55 
56 
58 {
59 private:
60  virtual void Init(void);
61  virtual int Run(void);
62  virtual void Exit(void);
63 
64  void ProcessFile(const string& file);
65  void ProcessSrz(string srz_name);
66 };
67 
68 
69 /////////////////////////////////////////////////////////////////////////////
70 // Init test
71 
72 
73 #define BAM_DIR "1000genomes/ftp/data"
74 #define CONFIG_FILE_NAME "analysis.bam.cfg"
75 
77 {
78  // Create command-line argument descriptions class
79  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
80 
81  // Specify USAGE context
82  arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
83  "CArgDescriptions demo program");
84 
85  arg_desc->AddOptionalKey("srz", "srz",
86  "SRZ accession config file"
87  " (-log and -int options are ignored)",
89  arg_desc->AddOptionalKey("file", "File",
90  "BAM file name",
92  arg_desc->AddOptionalKey("dir", "Dir",
93  "BAM files files directory",
95  arg_desc->AddOptionalKey("odir", "OutputDir",
96  "Destination directory",
98 
99  arg_desc->AddOptionalKey("ref_label", "RefLabel",
100  "RefSeq id in BAM file",
102  arg_desc->AddOptionalKey("seq_id", "SeqId",
103  "RefSeq Seq-id",
105  arg_desc->AddOptionalKey("delta", "Delta",
106  "Delta-ext in text ASN.1",
108  arg_desc->AddOptionalKey("delta_file", "DeltaFile",
109  "File with Delta-ext in text ASN.1",
111 
112  arg_desc->AddFlag("estimated", "Make estimated graph using index only");
113  arg_desc->AddFlag("raw-access", "Make graph using raw access to BAM");
114  arg_desc->AddOptionalKey("min_quality", "MinMapQuality",
115  "Minimal alignment map quality",
117  arg_desc->AddFlag("log", "Generate logarithmic graph");
118  arg_desc->AddFlag("int", "Generate graph with int values");
119  arg_desc->AddOptionalKey("max", "OutlierMax",
120  "Factor over average to treat as outlier",
122  arg_desc->AddFlag("max_details", "Include detailed outlier info");
123  arg_desc->AddOptionalKey("bin_size", "BinSize",
124  "Seq-graph bin size",
126  arg_desc->AddOptionalKey("annot_name", "annot_name",
127  "Annot name with generated Seq-graph",
129  arg_desc->AddOptionalKey("title", "Title",
130  "Title of generated Seq-graph",
132 
133  arg_desc->AddDefaultKey("o", "OutputFile",
134  "Output file of ASN.1",
136  "-");
137  arg_desc->AddFlag("b", "Write binary ASN.1");
138  arg_desc->AddFlag("annot", "Write Seq-annot only");
139 
140  // Setup arg.descriptions for this application
141  SetupArgDescriptions(arg_desc.release());
142 }
143 
144 
145 
146 /////////////////////////////////////////////////////////////////////////////
147 // Run test
148 /////////////////////////////////////////////////////////////////////////////
149 
150 
152 {
154  // Get arguments
155  const CArgs& args = GetArgs();
156 
157  if ( args["file"] ) {
158  ProcessFile(args["file"].AsString());
159  }
160  else if ( args["srz"] ) {
161  ProcessSrz(args["srz"].AsString());
162  }
163  return 0;
164 }
165 
166 
/// Build coverage graph data for a single BAM file and write it to the
/// stream of the "o" argument.
///
/// Steps, in order:
///  - resolve `file` into a usable path;
///  - determine the reference label and the reference Seq-id;
///  - configure a CBam2Seq_graph converter from the command-line arguments;
///  - build a Seq-entry and serialize it.
///
/// @param file
///   BAM file name as passed in the "file" argument.
///
/// NOTE(review): the embedded listing numbers skip 242, 245, 261-262, 278,
/// 286 and 310, so several statements of this function (including the
/// declarations of `delta` and `sw`) are absent from this view.
167 void CBam2GraphApp::ProcessFile(const string& file)
168 {
169  const CArgs& args = GetArgs();
170 
 // Use `file` unchanged when it starts with http://, https:// or ftp://,
 // or when it names an existing file.
171  string path;
172  if ( NStr::StartsWith(file, "http://") ||
173  NStr::StartsWith(file, "https://") ||
174  NStr::StartsWith(file, "ftp://") ||
175  CFile(file).Exists() ) {
176  path = file;
177  }
178  else {
 // Otherwise look for the file under a base directory: the "dir"
 // argument if given, else a directory below the NCBI test data path.
179  string dir;
180  if ( args["dir"] ) {
181  dir = args["dir"].AsString();
182  }
183  else {
184  vector<string> reps;
185  NStr::Split("traces02:traces04", ":", reps);
186  ITERATE ( vector<string>, it, reps ) {
 // NOTE(review): `dir` has not been assigned on this branch, so
 // it is appended as an empty component and the existence test
 // below is applied to `dir`, not to the candidate. This local
 // `path` also shadows the outer `path`. Confirm whether
 // CDirEntry(path) was intended.
187  string path = CFile::MakePath(CFile::MakePath(NCBI_GetTestDataPath(), *it), dir);
188  if ( !CDirEntry(dir).Exists() ) {
189  dir = path;
190  break;
191  }
192  }
193  }
194 
 // First try <dir>/<file>; if that does not exist, fall back to
 // <dir>/<first dot-separated token of the base name>/alignment/<file>.
195  path = CFile::MakePath(dir, file);
196  if ( !CFile(path).Exists() ) {
197  vector<string> tt;
198  NStr::Split(CFile(file).GetBase(), ".", tt);
199  if ( tt.size() > 0 ) {
200  path = CFile::MakePath(dir, tt[0]);
201  path = CFile::MakePath(path, "alignment");
202  path = CFile::MakePath(path, file);
203  }
204  }
205  }
206 
 // Reference label: the "ref_label" argument, or else the second
 // dot-separated token of the file's base name.
207  string ref_label;
208  if ( args["ref_label"] ) {
209  ref_label = args["ref_label"].AsString();
210  }
211  else {
212  vector<string> tt;
213  NStr::Split(CFile(file).GetBase(), ".", tt);
214  if ( tt.size() > 1 ) {
215  ref_label = tt[1];
216  }
217  else {
218  ERR_POST(Fatal<<"Cannot determine RefSeq label");
219  }
220  }
221 
 // Configure the converter. The reference Seq-id is taken from the
 // "seq_id" argument, or else is a local id made from the label.
222  CBam2Seq_graph cvt;
223  cvt.SetRefLabel(ref_label);
224  if ( args["seq_id"] ) {
225  CSeq_id seq_id(args["seq_id"].AsString());
226  cvt.SetRefId(seq_id);
227  }
228  else {
229  CSeq_id seq_id(CSeq_id::e_Local, ref_label);
230  cvt.SetRefId(seq_id);
231  }
232  if ( args["annot_name"] ) {
233  cvt.SetAnnotName(args["annot_name"].AsString());
234  }
235  if ( args["title"] ) {
236  cvt.SetGraphTitle(args["title"].AsString());
237  }
238  if ( args["min_quality"] ) {
239  cvt.SetMinMapQuality(args["min_quality"].AsInteger());
240  }
 // NOTE(review): the bodies of the "log" and "int" branches (listing
 // lines 242 and 245) are not present in this view.
241  if ( args["log"] ) {
243  }
244  if ( args["int"] ) {
246  }
247  if ( args["bin_size"] ) {
248  cvt.SetGraphBinSize(args["bin_size"].AsInteger());
249  }
250  if ( args["max"] ) {
251  cvt.SetOutlierMax(args["max"].AsDouble());
252  }
253  if ( args["max_details"] ) {
254  cvt.SetOutlierDetails();
255  }
256  if ( args["estimated"] ) {
257  cvt.SetEstimated();
258  }
259  if ( args["raw-access"] ) {
260  cvt.SetRawAccess();
261  }
 // Optional Delta-ext read as text ASN.1, either inline ("delta") or from
 // a file ("delta_file"). An inline value that does not start with
 // "Delta-ext" gets the "Delta-ext ::= " header prepended before parsing.
 // NOTE(review): the declaration of `delta` is not visible in this view.
263  if ( args["delta"] ) {
264  delta = new CDelta_ext;
265  string s = args["delta"].AsString();
266  if ( !NStr::StartsWith(s, "Delta-ext") ) {
267  s = "Delta-ext ::= "+s;
268  }
269  CNcbiIstrstream in(s);
270  in >> MSerial_AsnText >> *delta;
271  }
272  else if ( args["delta_file"] ) {
273  delta = new CDelta_ext;
274  args["delta_file"].AsInputFile() >> MSerial_AsnText >> *delta;
275  }
 // When a Delta-ext was read, wrap it in a nucleic-acid Seq-inst and hand
 // it to the converter.
276  if ( delta ) {
277  CRef<CSeq_inst> inst(new CSeq_inst);
279  inst->SetMol(CSeq_inst::eMol_na);
280  inst->SetExt().SetDelta(*delta);
281  cvt.SetSeq_inst(inst);
282  }
283 
284  CRef<CSeq_entry> entry;
285 
 // The index-only branch is switched off by the constant 0 in its
 // condition, so the else branch always runs.
 // NOTE(review): the declaration of `sw` is not visible in this view.
287  if ( 0 && args["estimated"] ) {
288  // faster estimated graph from index only
289  CBamHeader header(path);
290  CBamIndex index(path+".bai");
291  LOG_POST(Info<<"CBam2Seq_graph: Opened BAM file in "<<sw.Restart());
292  CRef<CSeq_annot> annot =
293  index.MakeEstimatedCoverageAnnot(header, ref_label,
294  cvt.GetRefId(),
295  cvt.GetAnnotName());
296  entry = new CSeq_entry;
297  entry->SetSet().SetSeq_set();
298  entry->SetSet().SetAnnot().push_back(annot);
299  }
300  else {
 // Open the BAM file with "<path>.bai" as its index and let the
 // converter build the Seq-entry.
301  CBamMgr mgr;
302  CBamDb bam(mgr, path, path+".bai");
303  LOG_POST(Info<<"CBam2Seq_graph: Opened BAM file in "<<sw.Restart());
304  entry = cvt.MakeSeq_entry(bam);
305  }
306  LOG_POST(Info<<"CBam2Seq_graph: Generated graph in "<<sw.Elapsed());
307 
 // Serialize the result to the "o" stream. With "annot" only the first
 // annotation of the entry is written, otherwise the whole entry.
 // NOTE(review): the statement of the "b" branch (listing line 310) is not
 // present in this view.
308  CNcbiOstream& out = args["o"].AsOutputFile();
309  if ( args["b"] )
311  else
312  out << MSerial_AsnText;
313  if ( args["annot"] ) {
314  out << *entry->GetAnnot().front();
315  }
316  else {
317  out << *entry;
318  }
319 }
320 
321 
/// Process an SRZ configuration file: for each line that requests coverage,
/// build a Seq-entry with a coverage graph and write it to an output file.
///
/// Each line is split on tab characters and its fields are read as:
///   [0] reference label, [1] accession, [2] GI (may be empty),
///   [3] BAM file name, [4] output file name (optional).
/// A line with fewer than four fields is reported with Fatal severity; a line
/// with fewer than five fields is reported and skipped.
///
/// @param srz_name
///   Path of the configuration file, or of a directory; for a directory,
///   CONFIG_FILE_NAME is appended to it.
///
/// NOTE(review): the embedded listing numbers skip 443-444, so the
/// declaration of `out` used at the end of the loop is absent from this view.
322 void CBam2GraphApp::ProcessSrz(string srz_name)
323 {
324  const CArgs& args = GetArgs();
325 
326  if ( CFile(srz_name).IsDir() ) {
327  srz_name = CFile::MakePath(srz_name, CONFIG_FILE_NAME);
328  }
 // Input directory: the "dir" argument, or else the directory of the
 // configuration file.
329  string dir;
330  if ( args["dir"] ) {
331  dir = args["dir"].AsString();
332  }
333  else {
334  dir = CFile(srz_name).GetDir();
335  }
 // Output directory: the "odir" argument, or else the input directory.
336  string odir;
337  if ( args["odir"] ) {
338  odir = args["odir"].AsString();
339  }
340  else {
341  odir = dir;
342  }
343 
 // These objects are declared outside the loop and are reassigned only when
 // the BAM path differs from the previous line's (tracked in db_name).
344  CBamMgr mgr;
345  CBamDb db;
346  CBamRawDb raw_db;
347  CBamHeader header;
348  CBamIndex index;
349  string db_name;
350 
351  vector<string> tokens;
352  string line;
353  CNcbiIfstream srz(srz_name.c_str());
354  while ( getline(srz, line) ) {
355  tokens.clear();
356  NStr::Split(line, "\t", tokens);
357  if ( tokens.size() < 4 ) {
358  ERR_POST(Fatal<<"Bad def line: \""<<line<<"\"");
359  }
360  string ref_label = tokens[0];
361  string acc = tokens[1];
 // Build the Seq-id from the accession when the GI field is empty,
 // otherwise from the GI field.
362  CRef<CSeq_id> id;
363  if ( tokens[2].empty() ) {
364  id = new CSeq_id(acc);
365  }
366  else {
367  id = new CSeq_id(CSeq_id::e_Gi, tokens[2]);
368  }
369  string bam_name = tokens[3];
370  if ( tokens.size() < 5 ) {
371  ERR_POST("No coverage requested for "<<ref_label);
372  continue;
373  }
374  string bam_path = CFile::MakePath(dir, bam_name);
375  string out_name = tokens[4];
376  string out_path = CFile::MakePath(odir, out_name);
377 
378  LOG_POST("Processing "<<ref_label<<" -> "<<out_path);
379 
 // Configure a converter for this line. Annot name and graph title
 // default to values derived from the BAM file name and the label.
380  CBam2Seq_graph cvt;
381  cvt.SetRefLabel(ref_label);
382  cvt.SetRefId(*id);
383  if ( args["bin_size"] ) {
384  cvt.SetGraphBinSize(args["bin_size"].AsInteger());
385  }
386  if ( args["annot_name"] ) {
387  cvt.SetAnnotName(args["annot_name"].AsString());
388  }
389  else {
390  cvt.SetAnnotName(CFile(bam_name).GetBase());
391  }
392  if ( args["title"] ) {
393  cvt.SetGraphTitle(args["title"].AsString());
394  }
395  else {
396  cvt.SetGraphTitle(bam_name+" "+ref_label+" coverage");
397  }
398  if ( args["min_quality"] ) {
399  cvt.SetMinMapQuality(args["min_quality"].AsInteger());
400  }
 // Outlier details are requested unconditionally here.
401  cvt.SetOutlierDetails();
402  if ( args["estimated"] ) {
403  cvt.SetEstimated();
404  }
405  if ( args["raw-access"] ) {
406  cvt.SetRawAccess();
407  }
408 
409  CRef<CSeq_entry> entry;
 // The index-only branch is switched off by the constant 0 in its
 // condition, so the else branch always runs.
410  if ( 0 && args["estimated"] ) {
411  // faster estimated graph from index only
412  if ( bam_path != db_name ) {
413  db_name = bam_path;
414  header.Read(bam_path);
415  index.Read(bam_path+".bai");
416  }
417  CRef<CSeq_annot> annot =
418  index.MakeEstimatedCoverageAnnot(header, ref_label,
419  cvt.GetRefId(),
420  cvt.GetAnnotName());
421  entry = new CSeq_entry;
422  entry->SetSet().SetSeq_set();
423  entry->SetSet().SetAnnot().push_back(annot);
424  }
425  else {
 // Reopen the BAM file only when the path changed. The estimated mode
 // uses the raw reader (raw_db), otherwise the CBamDb reader (db).
426  if ( bam_path != db_name ) {
427  db_name = bam_path;
428  if ( cvt.GetEstimated() ) {
429  raw_db = CBamRawDb(bam_path, bam_path+".bai");
430  }
431  else {
432  db = CBamDb(mgr, bam_path, bam_path+".bai");
433  }
434  }
435  if ( cvt.GetEstimated() ) {
436  entry = cvt.MakeSeq_entry(raw_db, bam_path);
437  }
438  else {
439  entry = cvt.MakeSeq_entry(db, bam_path);
440  }
441  }
442 
 // NOTE(review): `out` is declared on listing lines 443-444, which are
 // not present in this view.
445  *out << *entry;
446  }
447 }
448 
449 
450 /////////////////////////////////////////////////////////////////////////////
451 // Cleanup
452 
453 
455 {
456  SetDiagStream(0);
457 }
458 
459 
460 /////////////////////////////////////////////////////////////////////////////
461 // MAIN
462 
463 
/// Program entry point.
///
/// @param argc  Number of command-line arguments.
/// @param argv  Command-line argument strings.
/// @return      The value returned by CBam2GraphApp::AppMain().
464 int main(int argc, const char* argv[])
465 {
466  // Construct a temporary application object and run it with the
 // unmodified command line.
467  return CBam2GraphApp().AppMain(argc, argv);
468 }
User-defined methods of the data storage class.
USING_SCOPE(objects)
#define CONFIG_FILE_NAME
Definition: bam2graph.cpp:74
int main(int argc, const char *argv[])
Definition: bam2graph.cpp:464
USING_NCBI_SCOPE
Definition: bam2graph.cpp:50
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
virtual void Exit(void)
Cleanup on application exit.
Definition: bam2graph.cpp:454
void ProcessFile(const string &file)
Definition: bam2graph.cpp:167
void ProcessSrz(string srz_name)
Definition: bam2graph.cpp:322
virtual int Run(void)
Run the application.
Definition: bam2graph.cpp:151
virtual void Init(void)
Initialize the application.
Definition: bam2graph.cpp:76
CBam2Seq_graph.
Definition: bamgraph.hpp:73
void SetGraphBinSize(TSeqPos bin_size)
Definition: bamgraph.cpp:133
void SetRawAccess(bool raw_access=true)
Definition: bamgraph.cpp:165
void SetGraphTitle(const string &title)
Definition: bamgraph.cpp:91
bool GetEstimated(void) const
Make estimated graph using BAM index only; the bin size will be derived from the index.
Definition: bamgraph.hpp:271
void SetOutlierMax(double x)
Definition: bamgraph.cpp:139
void SetEstimated(bool estimated=true)
Definition: bamgraph.cpp:171
const string & GetAnnotName(void) const
Annot name of generated Seq-graph.
Definition: bamgraph.hpp:235
void SetGraphValueType(EGraphValueType type)
Definition: bamgraph.cpp:115
void SetAnnotName(const string &name)
Definition: bamgraph.cpp:97
void SetOutlierDetails(bool details=true)
Definition: bamgraph.cpp:159
CRef< CSeq_entry > MakeSeq_entry(CBamMgr &mgr, const string &bam_file, const string &bam_index)
Generate Seq-entry for BAM file.
Definition: bamgraph.cpp:766
void SetRefLabel(const string &ref_label)
Definition: bamgraph.cpp:79
void SetGraphType(EGraphType type)
Definition: bamgraph.cpp:109
void SetRefId(const CSeq_id &ref_id)
Definition: bamgraph.cpp:85
@ eGraphType_logarithmic
Definition: bamgraph.hpp:104
const CSeq_id & GetRefId(void) const
Seq-id for the reference sequence in generated entry.
Definition: bamgraph.hpp:223
void SetSeq_inst(CRef< CSeq_inst > inst)
Use specified Seq-inst object for the virtual sequence.
Definition: bamgraph.cpp:103
void SetMinMapQuality(int qual)
Definition: bamgraph.cpp:127
void Read(CBGZFStream &stream)
Definition: bamindex.cpp:1581
void Read(const string &index_file_name)
Definition: bamindex.cpp:1190
CRef< CSeq_annot > MakeEstimatedCoverageAnnot(const CBamHeader &header, const string &ref_name, const string &seq_id, const string &annot_name, TIndexLevel min_index_level, TIndexLevel max_index_level) const
Definition: bamindex.cpp:1416
CDirEntry –.
Definition: ncbifile.hpp:262
CFile –.
Definition: ncbifile.hpp:1605
Definition: Seq_entry.hpp:56
const TAnnot & GetAnnot(void) const
Definition: Seq_entry.cpp:179
CStopWatch –.
Definition: ncbitime.hpp:1937
std::ofstream out("events_result.xml")
main entry point for tests
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:832
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1208
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
@ eInputFile
Name of file (must exist and be readable)
Definition: ncbiargs.hpp:595
@ eDouble
Convertible into a floating point number (double)
Definition: ncbiargs.hpp:594
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eOutputFile
Name of file (must be writable)
Definition: ncbiargs.hpp:596
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
EDiagSev SetDiagPostLevel(EDiagSev post_sev=eDiag_Error)
Set the threshold severity for posting the messages.
Definition: ncbidiag.cpp:6132
#define ERR_POST(message)
Error posting with file, line number information but without error codes.
Definition: ncbidiag.hpp:186
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
void SetDiagStream(CNcbiOstream *os, bool quick_flush=true, FDiagCleanup cleanup=0, void *cleanup_data=0, const string &stream_name="")
Set diagnostic stream.
Definition: ncbidiag.cpp:8086
@ eDiag_Info
Informational message.
Definition: ncbidiag.hpp:651
void Fatal(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1209
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
string GetDir(EIfEmptyPath mode=eIfEmptyPath_Current) const
Get the directory component for this directory entry.
Definition: ncbifile.cpp:475
static string MakePath(const string &dir=kEmptyStr, const string &base=kEmptyStr, const string &ext=kEmptyStr)
Assemble a path from basic components.
Definition: ncbifile.cpp:413
#define MSerial_AsnBinary
Definition: serialbase.hpp:697
#define MSerial_AsnText
I/O stream manipulators –.
Definition: serialbase.hpp:696
@ eSerial_AsnBinary
ASN.1 binary.
Definition: serialdef.hpp:74
static CObjectOStream * Open(ESerialDataFormat format, CNcbiOstream &outStream, bool deleteOutStream)
Create serial object writer and attach it to an output stream.
Definition: objostr.cpp:126
IO_PREFIX::ostream CNcbiOstream
Portable alias for ostream.
Definition: ncbistre.hpp:149
IO_PREFIX::ifstream CNcbiIfstream
Portable alias for ifstream.
Definition: ncbistre.hpp:439
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3452
static bool StartsWith(const CTempString str, const CTempString start, ECase use_case=eCase)
Check if a string starts with a specified prefix value.
Definition: ncbistr.hpp:5414
double Restart(void)
Return time elapsed since first Start() or last Restart() call (in seconds).
Definition: ncbitime.hpp:2816
double Elapsed(void) const
Return time elapsed since first Start() or last Restart() call (in seconds).
Definition: ncbitime.hpp:2775
@ eStart
Start timer immediately after creating.
Definition: ncbitime.hpp:1941
@ e_Gi
GenInfo Integrated Database.
Definition: Seq_id_.hpp:106
@ e_Local
local use
Definition: Seq_id_.hpp:95
TSet & SetSet(void)
Select the variant.
Definition: Seq_entry_.cpp:130
TAnnot & SetAnnot(void)
Assign a value to Annot data member.
TSeq_set & SetSeq_set(void)
Assign a value to Seq_set data member.
void SetExt(TExt &value)
Assign a value to Ext data member.
Definition: Seq_inst_.cpp:147
void SetRepr(TRepr value)
Assign a value to Repr data member.
Definition: Seq_inst_.hpp:574
void SetMol(TMol value)
Assign a value to Mol data member.
Definition: Seq_inst_.hpp:621
@ eRepr_delta
sequence made by changes (delta) to others
Definition: Seq_inst_.hpp:100
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
static CStopWatch sw
FILE * file
CBioseq_Base_Info & GetBase(CTSE_Info &tse, const CBioObjectId &id)
constexpr bool empty(list< Ts... >) noexcept
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines classes: CDirEntry, CFile, CDir, CSymLink, CMemoryFile, CFileUtil, CFileLock,...
std::istream & in(std::istream &in_, double &x_)
Int4 delta(size_t dimension_, const Int4 *score_)
Defines location of test data folder at NCBI.
static const char * NCBI_GetTestDataPath(void)
Get the directory where test data is stored at NCBI.
Modified on Wed Sep 04 15:06:11 2024 by modify_doxy.py rev. 669887