NCBI C++ ToolKit
test_bm.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: test_bm.cpp 40942 2018-05-02 15:48:55Z katargir $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Roman Katargin
27  *
28  * File Description:
29  *
30  */
31 
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbiapp.hpp>
35 #include <corelib/ncbiargs.hpp>
36 #include <corelib/ncbienv.hpp>
37 
38 #include "bmsparsevec.h"
39 #include "bmsparsevec_serial.h"
40 #include "bmsparsevec_compr.h"
41 
43 #include <objmgr/scope.hpp>
44 #include <objmgr/util/sequence.hpp>
45 
47 
51 
52 #include <gui/objutils/utils.hpp>
53 
54 
57 
59 typedef bm::compressed_sparse_vector<unsigned, svector > compsvector;
60 
62 {
63 public:
64  CTestBMApp();
65 
66 private:
67  virtual void Init(void);
68  virtual int Run(void);
69 
71 
72  void x_Serialize(const unsigned char* buff, size_t size, const string& fileName);
73 
74  string m_HomeDir;
75  int m_MaxCount = 100000;
76 };
77 
78 
80 {
81 }
82 
83 
84 /////////////////////////////////////////////////////////////////////////////
85 // Init test for all different types of arguments
86 
87 
// Application initialization hook.
// As far as this listing shows, it builds an "etc/accguide.txt" path from the
// directory component of `dir`, logs that path, and sets m_HomeDir from the
// APPDATA environment variable.
// NOTE(review): original source lines 90, 93-94 and 100 are absent from this
// listing, so the initial value of `dir` and the statement guarded by the try
// block are not visible here -- confirm against the repository copy.
88 void CTestBMApp::Init(void)
89 {
91 
92  string dir, dir2;
// Split `dir`; its directory component is stored in dir2.
95  CDirEntry::SplitPath(dir, &dir2);
// Rebuild `dir` as <dir2>/etc/accguide.txt.
96  dir = CDirEntry::ConcatPath(dir2, "etc");
97  dir = CDirEntry::ConcatPath(dir, "accguide.txt");
98 
99  try {
101  LOG_POST( Info << "loaded sequence accession map from " << dir );
102  }
// NOTE(review): the message below names CGBenchApplication::x_LoadGuiRegistry(),
// not CTestBMApp::Init() -- looks copied from another application.
103  STD_CATCH_ALL( "CGBenchApplication::x_LoadGuiRegistry(): failed to load accession guide" );
104 
// Output directory used by x_Serialize(): <APPDATA>/GenomeWorkbench2.
105  m_HomeDir = CDirEntry::ConcatPath(GetEnvironment().Get("APPDATA"), "GenomeWorkbench2");
106 }
107 
108 
110 {
111  unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions());
112  arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
113  "BitMagic test app");
114  arg_desc->AddKey("acc", "acc", "Genbank accession", CArgDescriptions::eString);
115  arg_desc->AddOptionalKey("max_count", "max_count", "Maximun number of SNPs", CArgDescriptions::eInteger);
116  SetupArgDescriptions(arg_desc.release());
117 }
118 
119 
120 
121 void CTestBMApp::x_Serialize(const unsigned char* buff, size_t size, const string& fileName)
122 {
123  if (size == 0)
124  return;
125 
126  string fullPath = CDirEntry::ConcatPath(m_HomeDir, fileName);
127  CFileIO fio;
128  fio.Open(fullPath, CFileIO::eCreate, CFileIO::eWrite);
129  fio.Write(buff, size);
130 }
131 
132 
134 {
135  const CArgs& args = GetArgs();
136  string accession = args["acc"].AsString();
137  if (args["max_count"].HasValue()) {
138  int max_count = args["max_count"].AsInteger();
139  if (max_count > 0)
140  m_MaxCount = max_count;
141  }
142 
145 
146  CRef<CScope> scope(new CScope(*objectManager));
147  scope->AddDefaults();
148 
149  CRef<CSeq_id> seqId;
150 
151  try {
152  CSeq_id id(accession);
154  idh = sequence::GetId(idh, *scope, sequence::eGetId_Best);
155  if (idh) {
156  seqId.Reset(new CSeq_id());
157  seqId->Assign(*idh.GetSeqId());
158  }
159 
160  }
161  catch (CSeqIdException&) {
162  }
163 
164  if (!seqId) {
165  cout << "Invalid accession: " << accession;
166  return -1;
167  }
168 
171 
172  CRef<CSeq_loc> seqLoc(new CSeq_loc());
173  seqLoc->SetWhole(*seqId);
174  CFeat_CI feat_it(*scope, *seqLoc, sel);
175 
176  svector sv, svRsId, svRsId2(bm::use_null);
177 
178  // temp buffer to avoid unnecessary re-allocations
180 
181  svector::size_type index = 0;
182  for (; feat_it; ++feat_it) {
183 
184  if ((index % 10000) == 0)
185  cout << index << endl;
186 
187  const CSeq_feat& or_feat(feat_it->GetOriginalFeature());
188 
189  // Grab the database info
191  CRef<CVariation> pVariation(new CVariation());
192  if(!NSNPVariationHelper::ConvertFeat(*pVariation, or_feat))
193  continue;
194 
195  // Retrieve gene associated with SNP
196  tag.Reset(); // reuse tag reference.
197  CConstRef<CSeq_feat> gene_feat;
198  gene_feat = sequence::GetBestOverlapForSNP(or_feat,
200  *scope);
201  if (gene_feat) {
202  tag = gene_feat->GetNamedDbxref("GeneID");
203  if (!tag) {
204  tag = gene_feat->GetNamedDbxref("LocusLink");
205  }
206  }
207  if (tag) {
208  pVariation->SetOther_ids().push_back(CRef<CDbtag>(const_cast<CDbtag*>(tag.GetPointer())));
209  }
210 
211  int location = -1;
212  if (pVariation->CanGetPlacements()) {
213  const CVariation::TPlacements& VarPlacements(pVariation->GetPlacements());
214  if (!VarPlacements.empty()) {
215  CRef<CVariantPlacement> pPlacement(VarPlacements.front());
216  if (pPlacement->CanGetLoc())
217  location = pPlacement->GetLoc().GetTotalRange().GetFrom() + 1;
218  }
219  }
220 
221  if (location < 0)
222  continue;
223 
224  int rsid = -1;
225 
226  if (pVariation->GetId().CanGetTag()) {
227  const auto& id = pVariation->GetId().GetTag();
228  if (id.IsId())
229  rsid = id.GetId();
230  }
231 
232  if (rsid < 0)
233  continue;
234 
235  sv[index] = location;
236  svRsId[index] = rsid;
237  svRsId2[location] = rsid;
238  ++index;
239  }
240 
241  cout << "Total number of SNPs: " << index << endl;
242 
243  {
244  sv.optimize();
246  bm::sparse_vector_serialize(sv, sv_lay, tb);
247  x_Serialize(sv_lay.buf(), sv_lay.size(), "snp_pos.bin");
248  double bpsnp = sv_lay.size()*8.0 / index;
249  cout << "SNP positions size: " << sv_lay.size() << " Bits per SNP: " << bpsnp << endl;
250  }
251 
252  {
253  svRsId.optimize();
255  bm::sparse_vector_serialize(svRsId, sv_lay, tb);
256  x_Serialize(sv_lay.buf(), sv_lay.size(), "snp_rsid.bin");
257  double bpsnp = sv_lay.size()*8.0 / index;
258  cout << "SNP RsIds size: " << sv_lay.size() << " Bits per SNP: " << bpsnp << endl;
259  }
260 
261  {
262  compsvector compv;
263  compv.load_from(svRsId2);
264  compv.optimize();
265 
267  bm::sparse_vector_serialize(compv.sv_, sv_lay, tb);
268  x_Serialize(sv_lay.buf(), sv_lay.size(), "snp_rsid_sparse.bin");
269  double bpsnp = sv_lay.size()*8.0 / index;
270  cout << "Sparse SNP RsIds size: " << sv_lay.size() << " Bits per SNP: " << bpsnp << endl;
271  }
272 
273  return 0;
274 }
275 
277 
279 
280 /////////////////////////////////////////////////////////////////////////////
281 // MAIN
282 
283 int NcbiSys_main(int argc, ncbi::TXChar* argv[])
284 {
285  // Execute main application function
286  return CTestBMApp().AppMain(argc, argv);
287 }
User-defined methods of the data storage class.
User-defined methods of the data storage class.
#define BM_DECLARE_TEMP_BLOCK(x)
Definition: bm.h:47
Sparse container sparse_vector<> for integer types using bit-transposition transform.
Compressed sparse container rsc_sparse_vector<> for integer types.
Serialization for sparse_vector<>
CArgDescriptions –.
Definition: ncbiargs.hpp:541
CArgs –.
Definition: ncbiargs.hpp:379
Definition: Dbtag.hpp:53
CFeat_CI –.
Definition: feat_ci.hpp:64
Class for support low level input/output for files.
Definition: ncbifile.hpp:3475
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, CReader *reader=0, CObjectManager::EIsDefault is_default=CObjectManager::eDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
Definition: gbloader.cpp:366
CScope –.
Definition: scope.hpp:92
CSeqIdException –.
Definition: Seq_id.hpp:1001
namespace ncbi::objects::
Definition: Seq_feat.hpp:58
CConstRef< CDbtag > GetNamedDbxref(const CTempString &db) const
Return a specified DB xref.
Definition: Seq_feat.cpp:415
void x_SetupArgDescriptions()
Definition: test_bm.cpp:109
void x_Serialize(const unsigned char *buff, size_t size, const string &fileName)
Definition: test_bm.cpp:121
string m_HomeDir
Definition: test_bm.cpp:74
int m_MaxCount
Definition: test_bm.cpp:75
virtual void Init(void)
Initialize the application.
Definition: test_bm.cpp:88
CTestBMApp()
Definition: test_bm.cpp:79
virtual int Run(void)
Run the application.
Definition: test_bm.cpp:133
static bool ConvertFeat(CVariation &Variation, const CSeq_feat &SrcFeat)
legacy SNP feature conversion into a variation object
Definition: snp_utils.cpp:336
succinct sparse vector with runtime compression using bit-slicing / transposition method
Definition: bmsparsevec.h:87
void optimize(bm::word_t *temp_block=0, typename bvector_type::optmode opt_mode=bvector_type::opt_compress, typename sparse_vector< Val, BV >::statistics *stat=0)
run memory optimization for all vector planes
Definition: bmsparsevec.h:2148
static const char location[]
Definition: config.c:97
const CNcbiEnvironment & GetEnvironment(void) const
Get the application's cached environment.
const string & GetProgramExecutablePath(EFollowLinks follow_links=eIgnoreLinks) const
Get the application's executable path.
virtual const CArgs & GetArgs(void) const
Get parsed command line arguments.
Definition: ncbiapp.cpp:305
int AppMain(int argc, const char *const *argv, const char *const *envp=0, EAppDiagStream diag=eDS_Default, const char *conf=NcbiEmptyCStr, const string &name=NcbiEmptyString)
Main function (entry point) for the NCBI application.
Definition: ncbiapp.cpp:819
virtual void SetupArgDescriptions(CArgDescriptions *arg_desc)
Setup the command line argument descriptions.
Definition: ncbiapp.cpp:1195
const CNcbiArguments & GetArguments(void) const
Get the application's cached unprocessed command-line arguments.
@ eString
An arbitrary string.
Definition: ncbiargs.hpp:589
@ eInteger
Convertible into an integer number (int or Int8)
Definition: ncbiargs.hpp:592
#define LOG_POST(message)
This macro is deprecated and it's strongly recommended to move in all projects (except tests) to macro...
Definition: ncbidiag.hpp:226
#define STD_CATCH_ALL(message)
Standard handling of "exception"-derived exceptions; catches non-standard exceptions and generates "u...
Definition: ncbiexpt.hpp:570
void Info(CExceptionArgs_Base &args)
Definition: ncbiexpt.hpp:1185
size_t Write(const void *buf, size_t count) const
Write file.
Definition: ncbifile.cpp:6709
void Open(const string &filename, EOpenMode open_mode, EAccessMode access_mode, EShareMode share_mode=eShare)
Open file.
Definition: ncbifile.cpp:6416
static string DeleteTrailingPathSeparator(const string &path)
Delete trailing path separator, if any.
Definition: ncbifile.cpp:465
static string ConcatPath(const string &first, const string &second)
Concatenate two parts of the path for the current OS.
Definition: ncbifile.cpp:776
static void SplitPath(const string &path, string *dir=0, string *base=0, string *ext=0)
Split a path string into its basic components.
Definition: ncbifile.cpp:358
@ eWrite
File can be written.
Definition: ncbifile.hpp:3436
@ eCreate
Create a new file, or truncate an existing one.
Definition: ncbifile.hpp:3421
static objects::SAnnotSelector GetAnnotSelector(TAnnotFlags flags=0)
request an annotation selector for a given type
Definition: utils.cpp:167
virtual void Assign(const CSerialObject &source, ESerialRecursionMode how=eRecursive)
Optimized implementation of CSerialObject::Assign, which is not so efficient.
Definition: Seq_id.cpp:318
CConstRef< CSeq_id > GetSeqId(void) const
static void LoadAccessionGuide(const string &filename)
Definition: Seq_id.cpp:1882
static CSeq_id_Handle GetHandle(const CSeq_id &id)
Normal way of getting a handle, works for any seq-id.
void SetWhole(TWhole &v)
Definition: Seq_loc.hpp:982
TRange GetTotalRange(void) const
Definition: Seq_loc.hpp:913
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
CConstRef< CSeq_feat > GetBestOverlapForSNP(const CSeq_feat &snp_feat, CSeqFeatData::E_Choice type, CScope &scope, bool search_both_strands=true)
Get the best overlapping feature for a SNP (variation) feature.
Definition: sequence.cpp:1345
@ eGetId_Best
return the "best" gi (uses FindBestScore(), with CSeq_id::CalculateScore() as the score function
Definition: sequence.hpp:101
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
void AddDefaults(TPriority pri=kPriority_Default)
Add default data loaders from object manager.
Definition: scope.cpp:504
const CSeq_feat & GetOriginalFeature(void) const
Get original feature with unmapped location/product.
SAnnotSelector & SetExcludeExternal(bool exclude=true)
External annotations for the Object Manager are annotations located in top level Seq-entry different f...
SAnnotSelector & SetMaxSize(TMaxSize max_size)
Set maximum number of annotations to find.
SAnnotSelector & AddNamedAnnots(const CAnnotName &name)
Add named annot to set of annots names to look for.
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
char TXChar
Definition: ncbistr.hpp:172
@ use_null
support "non-assigned" or "NULL" logic
Definition: bmconst.h:230
TFrom GetFrom(void) const
Get the From member data.
Definition: Range_.hpp:222
list< CRef< CVariantPlacement > > TPlacements
bool CanGetLoc(void) const
Check if it is safe to call GetLoc method.
const TLoc & GetLoc(void) const
Get the Loc member data.
void sparse_vector_serialize(const SV &sv, sparse_vector_serial_layout< SV > &sv_layout, bm::word_t *temp_block=0)
Serialize sparse vector into a memory buffer(s) structure.
const TYPE & Get(const CNamedParameterList *param)
const struct ncbi::grid::netcache::search::fields::SIZE size
Magic spell ;-) needed for some weird compilers... very empiric.
const char * tag
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
Defines command line argument related classes.
Defines unified interface to application:
The Object manager core.
SAnnotSelector –.
layout class for serialization buffer structure
const unsigned char * buf() const noexcept
Return serialization buffer pointer.
size_t size() const noexcept
return current serialized size
USING_SCOPE(objects)
int NcbiSys_main(int argc, ncbi::TXChar *argv[])
Definition: test_bm.cpp:283
bm::compressed_sparse_vector< unsigned, svector > compsvector
Definition: test_bm.cpp:59
bm::sparse_vector< unsigned, bm::bvector<> > svector
Definition: test_bm.cpp:58
Modified on Tue Apr 16 20:07:05 2024 by modify_doxy.py rev. 669887