NCBI C++ ToolKit
blastdb_dataextract.hpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blastdb_dataextract.hpp 100517 2023-08-09 13:02:28Z fongah2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Christiam Camacho
27  *
28  */
29 
30 /** @file blastdb_dataextract.hpp
31  * Declares classes which extract data from a BLAST database
32  */
33 
34 #ifndef OBJTOOLS_BLASTDB_FORMAT___BLASTDB_DATAEXTRACT__HPP
35 #define OBJTOOLS_BLASTDB_FORMAT___BLASTDB_DATAEXTRACT__HPP
36 
39 #include <objmgr/util/sequence.hpp>
40 #include <sstream>
41 
42 // Note: move this to corelib and define properly (see blastformat equivalent)
43 // #define NCBI_BLASTDB_FORMAT_EXPORT
44 
47 
48 /// Class to extract data from a BLAST database given an identifier
50 
51 public:
52  /// ctor
56  int filt_algo_id = -1,
57  int fmt_algo_id = -1,
58  int line_width = 80,
59  bool target_only = true,
60  bool ctrl_a = false)
61  : m_BlastDb(blastdb),
62  m_OrigSeqRange(range),
63  m_SeqRange(range),
64  m_Strand(strand),
65  m_FiltAlgoId(filt_algo_id),
66  m_FmtAlgoId(fmt_algo_id),
67  m_LineWidth(line_width),
68  m_TargetOnly(target_only),
69  m_UseCtrlA(ctrl_a),
70  m_Oid(0),
71  m_UseLongSeqIds(false)
72 
73  {
74  m_Gi2TaxidMap.first = -1;
75  m_Gi2AccMap.first = -1;
76  m_Gi2TitleMap.first = -1;
77  m_Oid2Pig.first = -1;
78  m_Gi2SeqIdMap.first = -1;
79 
81  if (app) {
82  const CNcbiRegistry& registry = app->GetConfig();
83  m_UseLongSeqIds = (registry.Get("BLAST", "LONG_SEQID") == "1");
84  }
85  }
86 
87  /// Setting seqid
88  /// @param seq_id sequence identifier [in]
89  void SetSeqId(const CBlastDBSeqId &seq_id, bool get_data = false);
90  string ExtractOid();
91  string ExtractPig();
92  string ExtractGi();
93  string ExtractAccession();
94  string ExtractSeqId();
95  string ExtractTitle();
96  string ExtractTaxId();
97  string ExtractLeafTaxIds();
98  string ExtractCommonTaxonomicName();
99  string ExtractLeafCommonTaxonomicNames();
100  string ExtractScientificName();
101  string ExtractLeafScientificNames();
102  string ExtractBlastName();
103  string ExtractSuperKingdom();
104  string ExtractMaskingData();
105  string ExtractSeqData();
106  string ExtractSeqLen();
107  string ExtractHash();
108  string ExtractLinksInteger();
109  string ExtractMembershipInteger();
110  string ExtractAsn1Defline();
111  string ExtractAsn1Bioseq();
112  string ExtractFasta(const CBlastDBSeqId &seq_id);
113 
114  // Call before ExtractFasta or SetSeqId
115  void SetConfig(TSeqRange range, objects::ENa_strand strand, int filt_algo_id);
116 
117 protected:
118  /// underlying Blast database
120  /// sequence range
122  /// sequence range
124  /// strand
126  /// filtering algorithm for sequence
128  /// filtering algorithm for outfmt
130  /// FASTA output line width
132  /// Should the record contain multiple seqids? (used only with %f)
134  /// Replace with ctrl_a? (used only with %f)
137  /// OID of the record
139  /// the target gi
141  /// bioseq
143  /// Cache the defline (for membership bits)
145  /// Pair with a gi2taxid map for one Oid
146  pair<TOID, map<TGi, TTaxId> > m_Gi2TaxidMap;
147  /// Pair with a gi2taxid-set map for one Oid
148  pair<TOID, map<TGi, set<TTaxId> > > m_Gi2TaxidSetMap;
149  /// Pair with a gi2accession map for one Oid
150  pair<TOID, map<TGi, string> > m_Gi2AccMap;
151  /// Pair with a gi2title map for one Oid
152  pair<TOID, map<TGi, string> > m_Gi2TitleMap;
153  /// Pair with a pig for one Oid.
154  pair<TOID, CSeqDB::TPIG> m_Oid2Pig;
155  /// Pair with a gi2seqid map for one Oid.
156  pair<TOID, map<TGi, string> > m_Gi2SeqIdMap;
157  /// Use long sequence ids (with gi and accessions with database source)
159 private:
160  void x_ExtractMaskingData(CSeqDB::TSequenceRanges &ranges, int algo_id);
161  TTaxId x_ExtractTaxId();
162  void x_ExtractLeafTaxIds(set<TTaxId>& taxids);
163  /// Sets the gi-to-accession map (m_Gi2AccMap)
164  void x_SetGi2AccMap();
165  /// Sets the gi-to-title map (m_Gi2TitleMap)
166  void x_SetGi2TitleMap();
167  // sets the gi to seqid map
168  void x_SetGi2SeqIdMap();
169 
170  /// Initialize the cached defline
171  void x_InitDefline();
172 
173  /// Setting the target_only m_Gi
174  void x_SetGi();
175 };
176 
178 {
179 public:
181  unsigned int accession:1;
182  unsigned int seq_id:1;
183  unsigned int gi:1;
184  unsigned int title:1;
185  unsigned int membership:1;
186  unsigned int tax_id:1;
187  unsigned int leaf_node_tax_ids:1;
188  // The tax names include: scientific_name, common_name, blast_name and super kingdom
189  unsigned int tax_names:1;
190  unsigned int leaf_node_tax_names:1;
191  unsigned int pig:1;
192  unsigned int links:1;
193  unsigned int asn_defline:1;
194 
195  };
196 
197  enum FieldIndex {
198  accession = 0,
200  gi,
214  max_index
215  };
216  static void ExtractDataFromBlastDeflineSet(const CBlast_def_line_set & dl_set,
217  vector<string> & results,
218  BlastDeflineFields fields,
219  string target_id,
220  bool use_long_id);
221 
222  static void ExtractDataFromBlastDefline(const CBlast_def_line & dl,
223  vector<string> & results,
224  BlastDeflineFields fields,
225  bool use_long_id);
226 
227  static void ProcessFastaDeflines(CBioseq & bioseq,
228  string & out,
229  bool use_ctrla);
230 
231  static void ProcessFastaDeflines(CBioseq & bioseq,
232  string & out,
233  bool use_ctrla,
234  const CSeq_loc* location,
235  ENa_strand strand);
236 };
237 
239 {
240 public:
241  static Uint4 GetSeqHash(const char* buffer, int length);
242  static void ApplySeqMask(string & seq,
243  const CSeqDB::TSequenceRanges & masks,
245  static void GetReverseStrandSeq(string & seq);
246  static string GetMasksString(const CSeqDB::TSequenceRanges & masks);
247 
248 };
249 
251 
252 #endif /* OBJTOOLS_BLASTDB_FORMAT___BLASTDB_DATAEXTRACT__HPP */
string ExtractAccession(const string &long_acc)
USING_SCOPE(objects)
Definition of an identifier for a sequence in a BLAST database.
Class to extract data from a BLAST database given an identifier.
pair< TOID, CSeqDB::TPIG > m_Oid2Pig
Pair with a pig for one Oid.
pair< TOID, map< TGi, string > > m_Gi2SeqIdMap
int m_FmtAlgoId
filtering algorithm for outfmt
objects::ENa_strand m_Strand
strand
pair< TOID, map< TGi, set< TTaxId > > > m_Gi2TaxidSetMap
Pair with a gi2taxid-set map for one Oid.
int m_FiltAlgoId
filtering algorithm for sequence
pair< TOID, map< TGi, string > > m_Gi2AccMap
Pair with a gi2accession map for one Oid.
TOID m_Oid
OID of the record.
TSeqRange m_OrigSeqRange
sequence range
pair< TOID, map< TGi, TTaxId > > m_Gi2TaxidMap
Pair with a gi2taxid map for one Oid.
CBlastDBExtractor(CSeqDB &blastdb, TSeqRange range=TSeqRange(), objects::ENa_strand strand=objects::eNa_strand_both, int filt_algo_id=-1, int fmt_algo_id=-1, int line_width=80, bool target_only=true, bool ctrl_a=false)
ctor
CRef< CBioseq > m_Bioseq
bioseq
int m_LineWidth
FASTA output line width.
bool m_UseCtrlA
Replace with ctrl_a? (used only with %f)
bool m_UseLongSeqIds
Use long sequence ids (with gi and accessions with database source)
CSeqDB & m_BlastDb
underlying Blast database
bool m_TargetOnly
Should the record contain multiple seqids? (used only with %f)
TSeqRange m_SeqRange
sequence range
CRef< CBlast_def_line_set > m_Defline
Cache the defline (for membership bits)
pair< TOID, map< TGi, string > > m_Gi2TitleMap
Pair with a gi2title map for one Oid.
Encapsulates identifier to retrieve data from a BLAST database.
CNcbiApplicationAPI –.
CNcbiRegistry –.
Definition: ncbireg.hpp:913
CSeqDB.
Definition: seqdb.hpp:161
int TOID
Sequence type accepted and returned for OID indices.
Definition: seqdb.hpp:216
static CMemoryRegistry registry
Definition: cn3d_tools.cpp:81
bool ExtractGi(const CRef< CBioseq > &bioseq, TGi &gi, unsigned int nth=1)
Definition: cuSequence.cpp:508
std::ofstream out("events_result.xml")
main entry point for tests
#define false
Definition: bool.h:36
static const char location[]
Definition: config.c:97
const CNcbiRegistry & GetConfig(void) const
Get the application's cached configuration parameters (read-only).
static CNcbiApplicationAPI * Instance(void)
Singleton method.
Definition: ncbiapp.cpp:127
SStrictId_Tax::TId TTaxId
Taxon id type.
Definition: ncbimisc.hpp:1048
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
static TThisType GetEmpty(void)
Definition: range.hpp:306
CRange< TSeqPos > TSeqRange
typedefs for sequence ranges
Definition: range.hpp:419
virtual const string & Get(const string &section, const string &name, TFlags flags=0) const
Get the parameter value.
Definition: ncbireg.cpp:262
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define NCBI_BLASTDB_FORMAT_EXPORT
Definition: ncbi_export.h:1089
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
@ eNa_strand_both
in forward orientation
Definition: Na_strand_.hpp:68
range(_Ty, _Ty) -> range< _Ty >
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
static pcre_uint8 * buffer
Definition: pcretest.c:1051
Defines BLAST database access classes.
List of sequence offset ranges.
Definition: seqdb.hpp:236
Modified on Sat May 04 13:18:11 2024 by modify_doxy.py rev. 669887