NCBI C++ ToolKit
seedtop.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: seedtop.cpp 77822 2017-05-09 14:42:25Z madden $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Ning Ma
27  *
28  */
29 
30 /// @file seedtop.cpp
31 /// Implements the CSeedTop class.
32 
33 #include <ncbi_pch.hpp>
34 
43 #include "blast_setup.hpp"
44 
45 /** @addtogroup AlgoBlast
46  *
47  * @{
48  */
49 
50 
52 BEGIN_SCOPE(blast)
53 
54 CSeedTop::CSeedTop(const string & pattern)
55  : m_Pattern(pattern)
56 {
57  x_ParsePattern();
58  x_MakeScoreBlk();
59  x_MakeLookupTable();
60 }
61 
63 {
64  vector <string> units;
66  NStr::Split(NStr::ToUpper(m_Pattern), "-", units, 0);
67  ITERATE(vector<string>, unit, units){
68  if (*unit != "") {
69  char ch = (*unit)[0];
70  if (ch=='[' || ch=='{' || ch=='X' || (*unit).length()==1 || (*unit)[1]=='(') {
71  m_Units.push_back(SPatternUnit(*unit));
72  } else {
73  for (SIZE_TYPE i=0; i<(*unit).length(); ++i) {
74  m_Units.push_back(SPatternUnit(string(*unit, i, 1)));
75  }
76  }
77  }
78  }
79 }
80 
82 {
83  CLookupTableOptions lookup_options;
84  LookupTableOptionsNew(m_Program, &lookup_options);
85  lookup_options->phi_pattern = strdup(m_Pattern.c_str());
86  // Lookup segments, scoreblk, and rps info arguments are irrelevant
87  // and passed as NULL.
88  LookupTableWrapInit(NULL, lookup_options, NULL, NULL,
90 }
91 
93 {
94  CBlastScoringOptions score_options;
95  BlastScoringOptionsNew(m_Program, &score_options);
98  BlastSetup_ScoreBlkInit(NULL, query_info, score_options, m_Program,
100 }
101 
103 {
104  BlastOffsetPair* offset_pairs = (BlastOffsetPair*)
106 
107  CRef<CSeq_id> sid;
108  TSeqPos slen;
109  TSeedTopResults retv;
110 
111  BlastSeqSrcGetSeqArg seq_arg;
112  memset((void*) &seq_arg, 0, sizeof(seq_arg));
114 
115  BlastSeqSrc *seq_src = db->MakeSeqSrc();
116  IBlastSeqInfoSrc *seq_info_src = db->MakeSeqInfoSrc();
118  (MAX(BlastSeqSrcGetNumSeqs(seq_src)/100, 1));
119 
120  while( (seq_arg.oid = BlastSeqSrcIteratorNext(seq_src, itr))
121  != BLAST_SEQSRC_EOF) {
122  if (seq_arg.oid == BLAST_SEQSRC_ERROR) break;
123  if (BlastSeqSrcGetSequence(seq_src, &seq_arg) < 0) continue;
124 
125  Int4 start_offset = 0;
126  GetSequenceLengthAndId(seq_info_src, seq_arg.oid, sid, &slen);
127 
128  while (start_offset < seq_arg.seq->length) {
129  // Query block and array size arguments are not used when scanning
130  // subject for pattern hits, so pass NULL and 0 for respective
131  // arguments.
132  Int4 hit_count =
133  PHIBlastScanSubject(m_Lookup, NULL, seq_arg.seq, &start_offset,
134  offset_pairs, 0);
135 
136  if (hit_count == 0) break;
137 
138  for (int index = 0; index < hit_count; ++index) {
139  vector<vector<int> > pos_list;
140  vector<int> pos(m_Units.size());
141  unsigned int start = offset_pairs[index].phi_offsets.s_start;
142  unsigned int end = offset_pairs[index].phi_offsets.s_end + 1;
143  x_GetPatternRanges(pos, 0, seq_arg.seq->sequence + start, end-start, pos_list);
144  ITERATE(vector<vector<int> >, it_pos, pos_list) {
145  CSeq_loc::TRanges ranges;
146  int r_start(start);
147  int r_end(r_start);
148  int uid(0);
149  ITERATE(vector<int>, q, *it_pos) {
150  if (m_Units[uid].is_x) {
151  ranges.push_back(CRange<TSeqPos>(r_start, r_end-1));
152  r_start = r_end + *q;
153  r_end = r_start;
154  } else {
155  r_end += (*q);
156  }
157  ++uid;
158  }
159  ranges.push_back(CRange<TSeqPos>(r_start, r_end-1));
160  CRef<CSeq_loc> hit(new CSeq_loc(*sid, ranges));
161  retv.push_back(hit);
162  }
163  // skip the next pos_list.size()-1 hits
164  _ASSERT(index + (Int4)(pos_list.size()) - 1 < hit_count);
165  for (unsigned int i = 1; i< pos_list.size(); ++i) {
166  _ASSERT(offset_pairs[index + i].phi_offsets.s_start == start);
167  _ASSERT(offset_pairs[index + i].phi_offsets.s_end + 1 == end);
168  }
169  index += pos_list.size() - 1;
170  }
171  }
172 
173  BlastSeqSrcReleaseSequence(seq_src, &seq_arg);
174  }
175 
176  BlastSequenceBlkFree(seq_arg.seq);
177  itr = BlastSeqSrcIteratorFree(itr);
178  sfree(offset_pairs);
179  return retv;
180 }
181 
183 {
184  CConstRef<CSeq_id> sid = bhl.GetSeqId();
185  CSeq_loc sl;
186  sl.SetWhole();
187  sl.SetId(*sid);
188  SSeqLoc subject(sl, bhl.GetScope());
189  TSeqLocVector subjects;
190  subjects.push_back(subject);
191  CRef<IQueryFactory> qf(new CObjMgr_QueryFactory(subjects));
192  CRef<CBlastOptionsHandle> opt_handle
194  CRef<CLocalDbAdapter> db(new CLocalDbAdapter(qf, opt_handle));
195  return Run(db);
196 }
197 
198 void
199 CSeedTop::x_GetPatternRanges(vector<int> &pos, Uint4 off, Uint1 *seq, Uint4 len,
200  vector<vector<int> > &ranges)
201 {
202  // Not enough sequence letters
203  if (len + off + m_Units[off].at_least < m_Units.size() + 1) return;
204  // at least test
205  unsigned int rep;
206  for (rep =0; rep < m_Units[off].at_least; ++rep) {
207  if (!m_Units[off].test(NCBISTDAA_TO_AMINOACID[seq[rep]])) return;
208  }
209  // at most test
210  while(off < m_Units.size() - 1) {
211  pos[off] = rep;
212  x_GetPatternRanges(pos, off+1, seq+rep, len-rep, ranges);
213  ++rep;
214  if (rep >= m_Units[off].at_most) return;
215  if (len + off + 1 < m_Units.size() + rep) return;
216  if (!m_Units[off].test(NCBISTDAA_TO_AMINOACID[seq[rep]])) return;
217  }
218  // the last unit of the pattern
219  if (m_Units[off].at_most <= len) return;
220  for (; rep < len; ++rep) {
221  if (!m_Units[off].test(NCBISTDAA_TO_AMINOACID[seq[rep]])) return;
222  }
223  pos[off] = rep;
224  ranges.push_back(pos);
225  return;
226 }
227 
228 END_SCOPE(blast)
230 
231 
232 /* @} */
union BlastOffsetPair BlastOffsetPair
This symbol enables the verbose option in makeblastdb and other BLAST+ search command line applicatio...
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
Definition: blast_def.h:112
Int2 BlastScoringOptionsNew(EBlastProgramType program, BlastScoringOptions **options)
Allocate memory for BlastScoringOptions and fill with default values.
Int2 LookupTableOptionsNew(EBlastProgramType program, LookupTableOptions **options)
Allocate memory for lookup table options and fill with default values.
Declares the CBlastOptionsHandle and CBlastOptionsFactory classes.
BlastQueryInfo * BlastQueryInfoNew(EBlastProgramType program, int num_queries)
Allocate memory for query information structure.
Declarations of auxiliary functions using IBlastSeqInfoSrc to retrieve ids and related sequence infor...
#define BLAST_SEQSRC_ERROR
Error while retrieving sequence.
Definition: blast_seqsrc.h:291
Int4 BlastSeqSrcIteratorNext(const BlastSeqSrc *seq_src, BlastSeqSrcIterator *itr)
Increments the BlastSeqSrcIterator.
Definition: blast_seqsrc.c:425
BlastSeqSrcIterator * BlastSeqSrcIteratorFree(BlastSeqSrcIterator *itr)
Frees the BlastSeqSrcIterator structure.
Definition: blast_seqsrc.c:412
BlastSeqSrcIterator * BlastSeqSrcIteratorNewEx(unsigned int chunk_sz)
Allocate and initialize an iterator over a BlastSeqSrc.
Definition: blast_seqsrc.c:387
void BlastSeqSrcReleaseSequence(const BlastSeqSrc *seq_src, BlastSeqSrcGetSeqArg *getseq_arg)
Deallocate individual sequence.
Definition: blast_seqsrc.c:289
Int4 BlastSeqSrcGetNumSeqs(const BlastSeqSrc *seq_src)
Get the number of sequences contained in the sequence source.
Definition: blast_seqsrc.c:177
Int2 BlastSeqSrcGetSequence(const BlastSeqSrc *seq_src, BlastSeqSrcGetSeqArg *getseq_arg)
Retrieve an individual sequence.
Definition: blast_seqsrc.c:271
#define BLAST_SEQSRC_EOF
No more sequences available.
Definition: blast_seqsrc.h:292
Utilities initialize/setup BLAST.
Int2 BlastSetup_ScoreBlkInit(BLAST_SequenceBlk *query_blk, const BlastQueryInfo *query_info, const BlastScoringOptions *scoring_options, EBlastProgramType program_number, BlastScoreBlk **sbpp, double scale_factor, Blast_Message **blast_message, GET_MATRIX_PATH get_path)
Initializes the score block structure.
Definition: blast_setup.c:456
Internal auxiliary setup classes/functions for C++ BLAST APIs.
@ eBlastp
Protein-Protein.
Definition: blast_types.hpp:59
BLAST_SequenceBlk * BlastSequenceBlkFree(BLAST_SequenceBlk *seq_blk)
Deallocate memory for a sequence block.
Definition: blast_util.c:245
CBioseq_Handle –.
Wrapper class for BlastQueryInfo .
Definition: blast_aux.hpp:311
Wrapper class for BlastScoringOptions .
Definition: blast_aux.hpp:334
Wrapper class for Blast_Message .
Definition: blast_aux.hpp:352
Interface to create a BlastSeqSrc suitable for use in CORE BLAST from a variety of BLAST database/s...
Wrapper class for LookupTableOptions .
Definition: blast_aux.hpp:314
NCBI C++ Object Manager dependent implementation of IQueryFactory.
Abstract base class to encapsulate retrieval of sequence identifiers.
#define test(a, b, c, d, e)
Definition: numeric.c:170
TSeedTopResults Run(CRef< CLocalDbAdapter > db)
Definition: seedtop.cpp:102
void x_ParsePattern()
Definition: seedtop.cpp:62
CBlastScoreBlk m_ScoreBlk
Definition: seedtop.hpp:155
BlastSeqSrc * MakeSeqSrc()
Retrieves or constructs the BlastSeqSrc.
static CBlastOptionsHandle * Create(EProgram program, EAPILocality locality=CBlastOptions::eLocal)
Creates an options handle object configured with default options for the requested program,...
CLookupTableWrap m_Lookup
Definition: seedtop.hpp:154
string m_Pattern
Definition: seedtop.hpp:153
void x_GetPatternRanges(vector< int > &pos, Uint4 off, Uint1 *seq, Uint4 len, vector< vector< int > > &ranges)
Definition: seedtop.cpp:199
IBlastSeqInfoSrc * MakeSeqInfoSrc()
Retrieves or constructs the IBlastSeqInfoSrc.
char * BlastFindMatrixPath(const char *matrix_name, Boolean is_prot)
Returns the path to a specified matrix.
vector< CConstRef< CSeq_loc > > TSeedTopResults
Definition: seedtop.hpp:140
vector< struct SPatternUnit > m_Units
Definition: seedtop.hpp:156
void x_MakeLookupTable()
Definition: seedtop.cpp:81
static const EBlastProgramType m_Program
Definition: seedtop.hpp:152
void x_MakeScoreBlk()
Definition: seedtop.cpp:92
const char NCBISTDAA_TO_AMINOACID[]
Translates between ncbieaa and ncbistdaa.
void GetSequenceLengthAndId(const IBlastSeqInfoSrc *seqinfo_src, int oid, CRef< objects::CSeq_id > &seqid, TSeqPos *length)
Retrieves subject sequence Seq-id and length.
@ eBlastEncodingProtein
NCBIstdaa.
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NULL
Definition: ncbistd.hpp:225
void SetWhole(TWhole &v)
Definition: Seq_loc.hpp:982
void SetId(CSeq_id &id)
set the 'id' field in all parts of this location
Definition: Seq_loc.cpp:3474
CPacked_seqint::TRanges TRanges
Definition: Seq_loc.hpp:103
CConstRef< CSeq_id > GetSeqId(void) const
Get id which can be used to access this bioseq handle Throws an exception if none is available.
CScope & GetScope(void) const
Get scope this handle belongs to.
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
NCBI_NS_STD::string::size_type SIZE_TYPE
Definition: ncbistr.hpp:132
static list< string > & Split(const CTempString str, const CTempString delim, list< string > &arr, TSplitFlags flags=0, vector< SIZE_TYPE > *token_pos=NULL)
Split a string using specified delimiters.
Definition: ncbistr.cpp:3452
static void TruncateSpacesInPlace(string &str, ETrunc where=eTrunc_Both)
Truncate whitespace in a string (in-place)
Definition: ncbistr.cpp:3192
static string & ToUpper(string &str)
Convert string to upper case – string& version.
Definition: ncbistr.cpp:424
int i
int len
Int4 GetOffsetArraySize(LookupTableWrap *lookup)
Determine the size of the offsets arrays to be filled by the ScanSubject function.
Definition: lookup_wrap.c:255
Int2 LookupTableWrapInit(BLAST_SequenceBlk *query, const LookupTableOptions *lookup_options, const QuerySetUpOptions *query_options, BlastSeqLoc *lookup_segments, BlastScoreBlk *sbp, LookupTableWrap **lookup_wrap_ptr, const BlastRPSInfo *rps_info, Blast_Message **error_msg, BlastSeqSrc *seqsrc)
Create the lookup table for all query words.
Definition: lookup_wrap.c:47
#define strdup
Definition: ncbi_ansi_ext.h:70
#define MAX(a, b)
returns larger of a and b.
Definition: ncbi_std.h:117
NOTE: This file contains work in progress and the APIs are likely to change, please do not rely on th...
Pseudo lookup table structure and database scanning functions used in PHI-BLAST.
Int4 PHIBlastScanSubject(const LookupTableWrap *lookup_wrap, const BLAST_SequenceBlk *query_blk, const BLAST_SequenceBlk *subject, Int4 *offset, BlastOffsetPair *offset_pairs, Int4 array_size)
Scans the subject sequence from "offset" to the end of the sequence.
Definition: phi_lookup.c:725
Declares the CSeedTop class.
Defines a concrete strategy for the IBlastSeqInfoSrc interface for sequence identifiers retrieval fro...
Implementation of the BlastSeqSrc interface for a vector of sequence locations.
static SLJIT_INLINE sljit_ins msg(sljit_gpr r, sljit_s32 d, sljit_gpr x, sljit_gpr b)
vector< SSeqLoc > TSeqLocVector
Vector of sequence locations.
Definition: sseqloc.hpp:129
Uint1 * sequence
Sequence used for search (could be translation).
Definition: blast_def.h:243
Structure used as the second argument to functions satisfying the GetSeqBlkFnPtr signature,...
Definition: blast_seqsrc.h:257
Int4 oid
Oid in BLAST database, index in an array of sequences, etc [in].
Definition: blast_seqsrc.h:259
EBlastEncoding encoding
Encoding of sequence, i.e.
Definition: blast_seqsrc.h:263
BLAST_SequenceBlk * seq
Sequence to return, if NULL, it should be allocated by GetSeqBlkFnPtr (using BlastSeqBlkNew or BlastSetU...
Definition: blast_seqsrc.h:284
Complete type definition of Blast Sequence Source Iterator.
Complete type definition of Blast Sequence Source ADT.
Definition: blast_seqsrc.c:43
char * phi_pattern
PHI-BLAST pattern.
Structure to represent a single sequence to be fed to BLAST.
Definition: sseqloc.hpp:47
static string subject
#define _ASSERT
This symbol enables the verbose option in makeblastdb and other BLAST+ search command line applicatio...
Definition: blast_def.h:141
Uint4 s_start
Start offset of pattern in subject.
Definition: blast_def.h:147
Uint4 s_end
End offset of pattern in subject.
Definition: blast_def.h:148
struct BlastOffsetPair::@7 phi_offsets
Pattern offsets in subject (PHI BLAST only)
#define const
Definition: zconf.h:232
voidp calloc(uInt items, uInt size)
Modified on Fri Sep 20 14:57:12 2024 by modify_doxy.py rev. 669887