NCBI C++ ToolKit
split_query_aux_priv.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* ===========================================================================
2  *
3  * PUBLIC DOMAIN NOTICE
4  * National Center for Biotechnology Information
5  *
6  * This software/database is a "United States Government Work" under the
7  * terms of the United States Copyright Act. It was written as part of
8  * the author's official duties as a United States Government employee and
9  * thus cannot be copyrighted. This software/database is freely available
10  * to the public for use. The National Library of Medicine and the U.S.
11  * Government have not placed any restriction on its use or reproduction.
12  *
13  * Although all reasonable efforts have been taken to ensure the accuracy
14  * and reliability of the software and data, the NLM and the U.S.
15  * Government do not and cannot warrant the performance or results that
16  * may be obtained by using this software or data. The NLM and the U.S.
17  * Government disclaim all warranties, express or implied, including
18  * warranties of performance, merchantability or fitness for any particular
19  * purpose.
20  *
21  * Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  * Author: Christiam Camacho
26  *
27  */
28 
29 /** @file split_query_aux_priv.cpp
30  * Auxiliary functions and classes to assist in query splitting
31  */
32 
33 #include <ncbi_pch.hpp>
35 #include "blast_setup.hpp"
36 #include "blast_aux_priv.hpp"
37 #include "split_query_aux_priv.hpp"
38 
40 
41 /** @addtogroup AlgoBlast
42  *
43  * @{
44  */
45 
48 BEGIN_SCOPE(blast)
49 
/// Returns the size of the region shared by consecutive query chunks.
/// (The line carrying the function name and parameter list is absent from
/// this listing; presumably this is SplitQuery_GetOverlapChunkSize(program).)
///
/// Value selection, in order of precedence:
///  - a non-blank OVERLAP_CHUNK_SIZE environment variable, parsed as an int;
///  - 297 when Blast_QueryIsTranslated(program) is true;
///  - 100 otherwise.
50 size_t
52 {
53  size_t retval = 100;
54  // used for experimentation purposes
55  char* overlap_sz_str = getenv("OVERLAP_CHUNK_SIZE");
56  if (overlap_sz_str && !NStr::IsBlank(overlap_sz_str)) {
    // NOTE(review): StringToInt yields an int that is stored in a size_t;
    // a negative setting would turn into a very large value, and the
    // divisible-by-3 requirement below is not enforced here — confirm that
    // this override is only ever used for experimentation.
57  retval = NStr::StringToInt(overlap_sz_str);
58  _TRACE("Using overlap chunk size from environment " << retval);
59  return retval;
60  }
61 
62  if (Blast_QueryIsTranslated(program)) {
63  // N.B.: this value must be divisible by 3 to work with translated
64  // queries, as we split them in nucleotide coordinates and then do the
65  // translation
66  retval = 297;
67  }
68  _TRACE("Using overlap chunk size " << retval);
69  return retval;
70 }
71 
/// Decides whether the query should be split into chunks.
/// (The line carrying the function name and the first parameter, `program`,
/// is absent from this listing; presumably SplitQuery_ShouldSplit.)
///
/// Returns false when the program is eBlastTypeMapping, when the subject is
/// a PSSM, when the program is blastx with more than one query, or when the
/// program is PHI-BLAST; returns true in every other case.
/// `chunk_size` and `concatenated_query_length` are only referenced by the
/// commented-out conditions below, so they do not affect the result.
72 bool
74  size_t chunk_size,
75  size_t concatenated_query_length,
76  size_t num_queries)
77 {
78  // TODO: need to model mem usage and when it's advantageous to split
79  bool retval = true;
80 
    // Mapping searches are never split.
81  if (program == eBlastTypeMapping) {
82  return false;
83  }
84 
85  // if ((concatenated_query_length <= chunk_size+SplitQuery_GetOverlapChunkSize(program)) ||
86  // if ((concatenated_query_length <= chunk_size) ||
87  // do not split RPS-BLAST
88  if (Blast_SubjectIsPssm(program) ||
89  // the current implementation does NOT support splitting for multiple
90  // blastx queries, loop over queries individually here...
91  (program == eBlastTypeBlastx && num_queries > 1) ||
92  Blast_ProgramIsPhiBlast(program)) {
93  retval = false;
94  }
95 
96  return retval;
97 }
98 
/// Computes how many chunks the concatenated query is divided into and
/// adjusts *chunk_size accordingly.
/// (The line carrying the function name and the first parameter, `program`,
/// is absent from this listing; presumably SplitQuery_CalculateNumChunks.)
///
/// @param chunk_size  in/out: requested chunk size on entry. On return it is
///        either untouched (splitting is not applicable), set to
///        concatenated_query_length (a single chunk suffices), rounded down
///        to a multiple of CODON_LENGTH (translated queries), or rebalanced
///        so that chunks are of roughly equal size (all other queries).
/// @param concatenated_query_length  length of all queries put together.
/// @param num_queries  number of queries; only forwarded to
///        SplitQuery_ShouldSplit.
/// @return the number of chunks; always at least 1.
99 Uint4
101  size_t *chunk_size,
102  size_t concatenated_query_length,
103  size_t num_queries)
104 {
105  if ( !SplitQuery_ShouldSplit(program, *chunk_size,
106  concatenated_query_length, num_queries)) {
107  _TRACE("Not splitting queries");
108  return 1;
109  }
110 
111  size_t overlap_size = SplitQuery_GetOverlapChunkSize(program);
112  Uint4 num_chunks = 0;
113 
    // Remember the requested size; it is only used by the _TRACE at the end.
114  _DEBUG_ARG(size_t target_chunk_size = *chunk_size);
115 
116  // For translated queries the chunk size should be divisible by CODON_LENGTH
117  if (Blast_QueryIsTranslated(program)) {
118  size_t chunk_size_delta = ((*chunk_size) % CODON_LENGTH);
119  *chunk_size -= chunk_size_delta;
120  _ASSERT((*chunk_size % CODON_LENGTH) == 0);
121  }
122 
123  // Fix for small query size
    // The guard also prevents a zero or wrapped-around divisor when the
    // chunk size does not exceed the overlap; num_chunks then stays 0.
    // NOTE(review): the size_t quotient is narrowed to Uint4 here.
124  if ((*chunk_size) > overlap_size) {
125  num_chunks = concatenated_query_length / ((*chunk_size) - overlap_size);
126  }
127 
128  // Only one chunk, just return;
129  if (num_chunks <= 1) {
130  *chunk_size = concatenated_query_length;
131  return 1;
132  }
133 
134  // Re-adjust the chunk_size to make load even
135  if (!Blast_QueryIsTranslated(program)) {
    // Total length to cover is the query plus one overlap per chunk
    // boundary, shared evenly among the chunks (integer division).
136  *chunk_size = (concatenated_query_length + (num_chunks - 1) * overlap_size) / num_chunks;
137  // Round up only if this will not decrease the number of chunks
138  if (num_chunks < (*chunk_size) - overlap_size ) (*chunk_size)++;
139  }
140 
141  _TRACE("Number of chunks: " << num_chunks << "; "
142  "Target chunk size: " << target_chunk_size << "; "
143  "Returned chunk size: " << *chunk_size);
144 
145  return num_chunks;
146 }
147 
148 
/// Computes the effective search space for every context of the full
/// (unsplit) query and stores the values in `options`.
/// (The line carrying the function name and the first parameter, `options`,
/// is absent from this listing; presumably
/// SplitQuery_SetEffectiveSearchSpace.)
///
/// Does nothing when options->GetEffectiveSearchSpace() is already non-zero.
///
/// @param full_query_fact  query factory handed to the search space
///        calculator.
/// @param full_data  internal data of the full search; its m_SeqSrc,
///        m_ScoreBlk and m_QueryInfo members are read.
149 void
151  CRef<IQueryFactory> full_query_fact,
152  CRef<SInternalData> full_data)
153 {
154  _ASSERT(full_data);
155  _ASSERT(full_data->m_SeqSrc);
156 
157  // If the effective search options have been set, we don't need to
158  // recompute those...
159  if (options->GetEffectiveSearchSpace() != 0) {
160  return;
161  }
162 
    // Prefer the "Stats" figures from the sequence source and fall back to
    // the plain totals whenever the stats value is not positive.
163  const BlastSeqSrc* seqsrc = full_data->m_SeqSrc->GetPointer();
164  Int8 total_length = BlastSeqSrcGetTotLenStats(seqsrc);
165  if (total_length <= 0)
166  total_length = BlastSeqSrcGetTotLen(seqsrc);
167  Int4 num_seqs = BlastSeqSrcGetNumSeqsStats(seqsrc);
168  if (num_seqs <= 0)
169  num_seqs = BlastSeqSrcGetNumSeqs(seqsrc);
170 
171  CEffectiveSearchSpaceCalculator calc(full_query_fact, *options,
172  num_seqs, total_length,
173  full_data->m_ScoreBlk->GetPointer());
174  BlastQueryInfo* qinfo = full_data->m_QueryInfo;
175  _ASSERT(qinfo);
176 
    // One entry per context index, from 0 through last_context inclusive.
177  vector<Int8> eff_searchsp;
178  for (size_t index = 0; index <= (size_t)qinfo->last_context; index++) {
179  eff_searchsp.push_back(calc.GetEffSearchSpaceForContext(index));
180  }
181  options->SetEffectiveSearchSpace(eff_searchsp);
182 }
183 
186  CRef<CBlastOptions> options,
187  CRef<SInternalData> full_data,
188  size_t num_threads /*=1 No Thread*/)
189 {
    // Builds the internal data used to search a single query chunk and
    // returns setup_data->m_InternalData.
    // NOTE(review): this listing has lost several lines of the definition
    // (the return type and function name, the call that produces setup_data
    // together with one of its arguments, a statement before the m_SeqSrc
    // reset, and the expression that initializes `bp`); presumably this is
    // SplitQuery_CreateChunkData — confirm against the real source.

    // Work on a private copy of the subject sequence source taken from the
    // full search's data.
190  BlastSeqSrc* seqsrc =
191  BlastSeqSrcCopy(full_data->m_SeqSrc->GetPointer());
192  CRef<SBlastSetupData> setup_data =
194  qf, options,
196  seqsrc, num_threads);
    // The wrapper is given BlastSeqSrcFree as the deleter for the copy.
198  setup_data->m_InternalData->m_SeqSrc.Reset(new TBlastSeqSrc(seqsrc,
199  BlastSeqSrcFree));
200 
    // A chunk must not itself be split again.
201  _ASSERT(setup_data->m_QuerySplitter->IsQuerySplit() == false);
202 
    // When the full search has a progress structure, reuse its interrupt
    // callback and give the chunk its own progress monitor.
    // NOTE(review): m_ProgressMonitor is dereferenced without a visible
    // emptiness check — assumes it is always set; TODO confirm.
203  if (full_data->m_ProgressMonitor->Get()) {
204  setup_data->m_InternalData->m_FnInterrupt = full_data->m_FnInterrupt;
205  SBlastProgress* bp =
207  setup_data->m_InternalData->m_ProgressMonitor.Reset(new CSBlastProgress(bp));
208  }
209  return setup_data->m_InternalData;
210 }
211 
213  vector< CRef<IQueryFactory> >* query_chunk_factories /* = NULL */,
214  const CBlastOptions* options /* = NULL */)
215 {
    // Constructor body: caches, for each chunk of `sqb`, the list of
    // contexts returned by sqb.GetQueryContexts(chunk).
    // NOTE(review): the line with the constructor name and its first
    // parameter (`sqb`) is absent from this listing, as is one statement
    // inside the inner loop below.
216  const size_t kNumChunks(sqb.GetNumChunks());
217  m_ContextsPerChunk.reserve(kNumChunks);
218  for (size_t i = 0; i < kNumChunks; i++) {
219  m_ContextsPerChunk.push_back(sqb.GetQueryContexts(i));
220  }
221 
    // Without both the chunk factories and the options, the printable
    // tables (m_StartingChunks / m_AbsoluteContexts) are left empty.
222  if (query_chunk_factories == NULL || options == NULL) {
223  return;
224  }
225 
226  /// Populate the data to print out
227  m_StartingChunks.resize(kNumChunks);
228  m_AbsoluteContexts.resize(kNumChunks);
229  for (size_t i = 0; i < kNumChunks; i++) {
230  CRef<IQueryFactory> chunk_qf((*query_chunk_factories)[i]);
231  CRef<ILocalQueryData> chunk_qd(chunk_qf->MakeLocalQueryData(options));
232  BlastQueryInfo* chunk_qinfo = chunk_qd->GetQueryInfo();
    // Visit every context of this chunk, first_context..last_context
    // inclusive.
233  for (Int4 ctx = chunk_qinfo->first_context;
234  ctx <= chunk_qinfo->last_context; ctx++) {
235  m_StartingChunks[i].push_back(GetStartingChunk(i, ctx));
    // NOTE(review): presumably the missing statement fills
    // m_AbsoluteContexts[i] in the same way — confirm.
237  }
238  }
239 }
240 
/// Maps a context number local to a chunk onto the value stored for it in
/// m_ContextsPerChunk.
/// (The line carrying the method name and the `chunk_num` parameter is
/// absent from this listing; presumably
/// CContextTranslator::GetAbsoluteContext.)
///
/// Both indices are validated with _ASSERT only.
/// NOTE(review): a negative context_in_chunk is not rejected by the
/// assertions — confirm callers never pass one.
241 int
243  Int4 context_in_chunk) const
244 {
245  _ASSERT(chunk_num < m_ContextsPerChunk.size());
246  _ASSERT(context_in_chunk < (Int4)m_ContextsPerChunk[chunk_num].size());
247  return m_ContextsPerChunk[chunk_num][context_in_chunk];
248 }
249 
/// Finds the position of `absolute_context` within the context list of chunk
/// `chunk_num`.
/// (The line carrying the method name and the `chunk_num` parameter is
/// absent from this listing; presumably
/// CContextTranslator::GetContextInChunk.)
///
/// @return the zero-based position of the first match in
///         m_ContextsPerChunk[chunk_num], or kInvalidContext when the chunk
///         does not contain that context.
250 int
252  int absolute_context) const
253 {
254  _ASSERT(chunk_num < m_ContextsPerChunk.size());
255  const vector<int>& context_indices = m_ContextsPerChunk[chunk_num];
    // Linear search over the chunk's context list.
256  vector<int>::const_iterator itr = find(context_indices.begin(),
257  context_indices.end(),
258  absolute_context);
259  if (itr == context_indices.end()) {
260  return kInvalidContext;
261  }
262  return static_cast<int>(itr - context_indices.begin()); // distance from the start of the list is the context number within the chunk
263 }
264 
/// Walks backwards from `curr_chunk` to find the earliest chunk of the
/// unbroken run of chunks that contain the given context.
/// (The line carrying the method name and the `curr_chunk` parameter is
/// absent from this listing; presumably
/// CContextTranslator::GetStartingChunk.)
///
/// @return kInvalidContext when GetAbsoluteContext yields kInvalidContext
///         for (curr_chunk, context_in_chunk); otherwise the lowest chunk
///         number reached before hitting a chunk that lacks the context
///         (curr_chunk itself if the preceding chunk lacks it).
265 int
267  Int4 context_in_chunk) const
268 {
269  int absolute_context = GetAbsoluteContext(curr_chunk, context_in_chunk);
270  if (absolute_context == kInvalidContext) {
271  return kInvalidContext;
272  }
273 
274  size_t retval = curr_chunk;
275 
    // NOTE(review): curr_chunk appears to be unsigned (it initializes a
    // size_t above); the loop ends once decrementing past chunk 0 produces a
    // value whose int conversion is negative — confirm this is the intent.
276  for (--curr_chunk; static_cast<int>(curr_chunk) >= 0; --curr_chunk) {
277  if (GetContextInChunk(curr_chunk, absolute_context) ==
278  kInvalidContext) {
279  break;
280  }
281  retval = curr_chunk;
282  }
283  return static_cast<int>(retval);
284 }
285 
286 ostream& operator<<(ostream& out, const CContextTranslator& rhs)
287 {
288  if (rhs.m_StartingChunks.front().empty() ||
289  rhs.m_AbsoluteContexts.front().empty()) {
290  return out;
291  }
292 
293  const size_t kNumChunks = rhs.m_ContextsPerChunk.size();
294  out << endl << "NumChunks = " << kNumChunks << endl;
295 
296  for (size_t i = 0; i < kNumChunks; i++) {
297  out << "Chunk" << i << "StartingChunks = "
298  << s_PrintVector(rhs.m_StartingChunks[i]) << endl;
299  }
300  out << endl;
301  for (size_t i = 0; i < kNumChunks; i++) {
302  out << "Chunk" << i << "AbsoluteContexts = "
303  << s_PrintVector(rhs.m_AbsoluteContexts[i]) << endl;
304  }
305  out << endl;
306 
307  return out;
308 }
309 
311  EBlastProgramType program,
312  CRef<ILocalQueryData> local_query_data)
313  : m_Program(program)
314 {
    // Constructor body: records which queries appear in each chunk of `sqb`
    // and the length of every distinct query.
    // NOTE(review): the line with the constructor name and its first
    // parameter (`sqb`) is absent from this listing, as is the last
    // statement of the body; presumably that statement sizes
    // m_LastChunkForQueryCache — confirm against the real source.
315  const size_t kNumChunks(sqb.GetNumChunks());
316  m_QueryIndicesPerChunk.reserve(kNumChunks);
317 
318  // unique list of query indices in global query
319  set<size_t> global_query_indices;
320 
321  for (size_t i = 0; i < kNumChunks; i++) {
322  m_QueryIndicesPerChunk.push_back(sqb.GetQueryIndices(i));
323  const vector<size_t>& query_indices = m_QueryIndicesPerChunk.back();
324  ITERATE(vector<size_t>, itr, query_indices) {
325  global_query_indices.insert(*itr);
326  }
327  }
328 
    // One length per distinct query index, appended in the set's iteration
    // order. NOTE(review): GetQueryLength(int) indexes this vector directly
    // by global query index, which assumes the distinct indices are
    // 0..N-1 with no gaps — confirm.
329  m_QueryLengths.reserve(global_query_indices.size());
330  ITERATE(set<size_t>, itr, global_query_indices) {
331  m_QueryLengths.push_back(local_query_data->GetSeqLength(*itr));
332  }
333 
335 }
336 
337 size_t
338 CQueryDataPerChunk::GetQueryLength(int global_query_index) const
339 {
340  _ASSERT(global_query_index < (int)m_QueryLengths.size());
341  return m_QueryLengths[global_query_index];
342 }
343 
344 size_t
345 CQueryDataPerChunk::GetQueryLength(size_t chunk_num, int context_in_chunk) const
346 {
347  _ASSERT(chunk_num < m_QueryIndicesPerChunk.size());
348  size_t pos = x_ContextInChunkToQueryIndex(context_in_chunk);
349  _ASSERT(pos < m_QueryIndicesPerChunk[chunk_num].size());
350  return GetQueryLength(static_cast<int>(m_QueryIndicesPerChunk[chunk_num][pos]));
351 }
352 
/// Converts a context number local to a chunk into a query index by calling
/// Blast_GetQueryIndexFromContext with the stored program type.
/// (The line carrying the method name and parameter list is absent from this
/// listing; presumably
/// CQueryDataPerChunk::x_ContextInChunkToQueryIndex(int) const.)
///
/// A result of -1 from the helper is caught by _ASSERT only.
/// NOTE(review): if the assertion is compiled out, -1 would be converted to
/// a very large size_t — confirm callers cannot trigger this.
353 size_t
355 {
356  Int4 retval = Blast_GetQueryIndexFromContext(context_in_chunk, m_Program);
357  _ASSERT(retval != -1);
358  return static_cast<size_t>(retval);
359 }
360 
361 int
362 CQueryDataPerChunk::GetLastChunk(size_t chunk_num, int context_in_chunk)
363 {
364  _ASSERT(chunk_num < m_QueryIndicesPerChunk.size());
365  size_t pos = x_ContextInChunkToQueryIndex(context_in_chunk);
366  _ASSERT(pos < m_QueryIndicesPerChunk[chunk_num].size());
367  return GetLastChunk(static_cast<int>(m_QueryIndicesPerChunk[chunk_num][pos]));
368 }
369 
370 int
371 CQueryDataPerChunk::GetLastChunk(int global_query_index)
372 {
373  bool found = false;
374  int retval = m_LastChunkForQueryCache[global_query_index];
375 
376  if (retval != kUninitialized) {
377  return retval;
378  }
379 
380  for (size_t i = 0; i < m_QueryIndicesPerChunk.size(); i++) {
381  vector<size_t>::const_iterator itr =
382  find(m_QueryIndicesPerChunk[i].begin(),
383  m_QueryIndicesPerChunk[i].end(),
384  (size_t)global_query_index);
385  if (itr == m_QueryIndicesPerChunk[i].end()) {
386  if (found) {
387  break;
388  } else {
389  continue;
390  }
391  }
392  found = true;
393  retval = static_cast<int>(i);
394  }
395 
396  if ( !found ) {
397  return -1;
398  }
399  m_LastChunkForQueryCache[global_query_index] = retval;
400  return retval;
401 }
402 
403 END_SCOPE(blast)
405 
406 /* @} */
407 
User-defined methods of the data storage class.
Auxiliary functions for BLAST.
SBlastProgress * SBlastProgressNew(void *user_data)
Allocates and initializes a new SBlastProgress structure.
Definition: blast_util.c:1387
#define CODON_LENGTH
Codons are always of length 3.
Definition: blast_def.h:63
Boolean Blast_ProgramIsPhiBlast(EBlastProgramType p)
Returns true if program is PHI-BLAST (i.e.
Definition: blast_program.c:70
Boolean Blast_QueryIsTranslated(EBlastProgramType p)
Returns true if the query is translated.
Definition: blast_program.c:60
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm.
Definition: blast_program.h:72
@ eBlastTypeBlastx
Definition: blast_program.h:75
@ eBlastTypeMapping
Definition: blast_program.h:88
Boolean Blast_SubjectIsPssm(EBlastProgramType p)
Returns true if the subject is PSSM.
Definition: blast_program.c:56
Int4 Blast_GetQueryIndexFromContext(Int4 context, EBlastProgramType program)
Given a context from BLAST engine core, return the query index.
Int8 BlastSeqSrcGetTotLenStats(const BlastSeqSrc *seq_src)
Get the total length of all sequences for calculation of expect value etc.
Definition: blast_seqsrc.c:227
BlastSeqSrc * BlastSeqSrcCopy(const BlastSeqSrc *seq_src)
Copy function: needed to guarantee thread safety.
Definition: blast_seqsrc.c:138
Int4 BlastSeqSrcGetNumSeqs(const BlastSeqSrc *seq_src)
Get the number of sequences contained in the sequence source.
Definition: blast_seqsrc.c:177
Int8 BlastSeqSrcGetTotLen(const BlastSeqSrc *seq_src)
Get the total length of all sequences in the sequence source.
Definition: blast_seqsrc.c:219
BlastSeqSrc * BlastSeqSrcFree(BlastSeqSrc *seq_src)
Frees the BlastSeqSrc structure by invoking the destructor function set by the user-defined construct...
Definition: blast_seqsrc.c:112
Int4 BlastSeqSrcGetNumSeqsStats(const BlastSeqSrc *seq_src)
Get the number of sequences used for calculation of expect values etc.
Definition: blast_seqsrc.c:185
void BlastSeqSrcResetChunkIterator(BlastSeqSrc *seq_src)
Reset the internal "bookmark" of the last chunk for iteration provided by this object.
Definition: blast_seqsrc.c:436
Internal auxiliary setup classes/functions for C++ BLAST APIs.
Encapsulates ALL the BLAST algorithm's options.
Auxiliary class to provide convenient and efficient access to conversions between contexts local to q...
Auxiliary class to compute the effective search space.
Wrapper class for SBlastProgress .
Definition: blast_aux.hpp:357
Wrapper class around SSplitQueryBlk structure.
iterator_bool insert(const value_type &val)
Definition: set.hpp:149
size_type size() const
Definition: set.hpp:132
static const int chunk_size
Declares auxiliary class to calculate the effective search space.
std::ofstream out("events_result.xml")
main entry point for tests
CS_CONTEXT * ctx
Definition: t0006.c:12
vector< vector< int > > m_StartingChunks
CQueryDataPerChunk(const CSplitQueryBlk &sqb, EBlastProgramType program, CRef< ILocalQueryData > local_query_data)
Constructor.
void SplitQuery_SetEffectiveSearchSpace(CRef< CBlastOptions > options, CRef< IQueryFactory > full_query_fact, CRef< SInternalData > full_data)
this might supersede the function below...
size_t SplitQuery_GetOverlapChunkSize(EBlastProgramType program)
Size of the region that overlaps in between each query chunk.
CRef< SBlastSetupData > BlastSetupPreliminarySearchEx(CRef< IQueryFactory > qf, CRef< CBlastOptions > options, CConstRef< CPssmWithParameters > pssm, BlastSeqSrc *seqsrc, size_t num_threads)
Extended interface to set up internal data structures used by the BLAST CORE engine.
vector< vector< int > > m_ContextsPerChunk
Each element in this vector represents a chunk, and it contains the contexts numbers that correspond ...
int GetLastChunk(int global_query_index)
get the last chunk where query identified with global_query_index is found
CRef< TBlastSeqSrc > m_SeqSrc
The source of subject sequence data.
size_t GetNumChunks() const
Retrieve the number of chunks.
int GetAbsoluteContext(size_t chunk_num, Int4 context_in_chunk) const
Get the context number in the absolute (i.e.
Int8 GetEffSearchSpaceForContext(size_t ctx_index) const
Retrieve the effective search space calculated for a given query context.
void SetEffectiveSearchSpace(Int8 eff)
CRef< ILocalQueryData > MakeLocalQueryData(const CBlastOptions *opts)
Creates and caches an ILocalQueryData.
Definition: query_data.cpp:52
CRef< SInternalData > SplitQuery_CreateChunkData(CRef< IQueryFactory > qf, CRef< CBlastOptions > options, CRef< SInternalData > full_data, size_t num_threads)
Function used by search class to retrieve a query factory for a given chunk.
ostream & operator<<(ostream &out, const CContextTranslator &rhs)
SBlastProgress * Get() const
Definition: blast_aux.hpp:357
size_t GetQueryLength(size_t chunk_num, int context_in_chunk) const
Get the length of the query.
vector< size_t > m_QueryLengths
Lengths of the queries.
vector< int > m_LastChunkForQueryCache
Lists the last chunk where the query can be found.
CStructWrapper< BlastSeqSrc > TBlastSeqSrc
CRef< TBlastScoreBlk > m_ScoreBlk
BLAST score block structure.
CContextTranslator(const CSplitQueryBlk &sqb, vector< CRef< IQueryFactory > > *query_chunk_factories=NULL, const CBlastOptions *options=NULL)
Constructor.
TInterruptFnPtr m_FnInterrupt
The interrupt callback.
virtual BlastQueryInfo * GetQueryInfo()=0
Accessor for the BlastQueryInfo structure.
CRef< CSBlastProgress > m_ProgressMonitor
The user data structure to aid in progress monitoring.
BlastQueryInfo * m_QueryInfo
The query information structure.
size_t x_ContextInChunkToQueryIndex(int context_in_chunk) const
Convert a context in a chunk to a query index (within the chunk)
int GetContextInChunk(size_t chunk_num, int absolute_context) const
Get the context number in the split query chunk.
vector< size_t > GetQueryIndices(size_t chunk_num) const
Get the indices of the queries contained in a given chunk.
EBlastProgramType m_Program
BLAST program type.
vector< vector< int > > m_AbsoluteContexts
int GetStartingChunk(size_t curr_chunk, Int4 context_in_chunk) const
Get the chunk number where context_in_chunk starts (i.e.
virtual size_t GetSeqLength(size_t index)=0
Get the length of the sequence indicated by index.
bool SplitQuery_ShouldSplit(EBlastProgramType program, size_t chunk_size, size_t concatenated_query_length, size_t num_queries)
Determines if the input query sequence(s) should be split because it.
vector< int > GetQueryContexts(size_t chunk_num) const
Get the contexts of the queries contained in a given chunk.
Uint4 SplitQuery_CalculateNumChunks(EBlastProgramType program, size_t *chunk_size, size_t concatenated_query_length, size_t num_queries)
Calculate the number of chunks that a query will be split into based upon query length,...
string s_PrintVector(const vector< T > &data2print)
Auxiliary function to print a vector.
Int8 GetEffectiveSearchSpace() const
vector< vector< size_t > > m_QueryIndicesPerChunk
Each element in this vector represents a chunk, and it contains the query indices that correspond in ...
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NULL
Definition: ncbistd.hpp:225
#define _TRACE(message)
Definition: ncbidbg.hpp:122
#define _DEBUG_ARG(arg)
Definition: ncbidbg.hpp:134
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define USING_SCOPE(ns)
Use the specified namespace.
Definition: ncbistl.hpp:78
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static int StringToInt(const CTempString str, TStringToNumFlags flags=0, int base=10)
Convert string to int.
Definition: ncbistr.cpp:630
static bool IsBlank(const CTempString str, SIZE_TYPE pos=0)
Check if a string is blank (has no text).
Definition: ncbistr.cpp:106
int i
const struct ncbi::grid::netcache::search::fields::SIZE size
const Int4 kInvalidContext
Value to represent an invalid context.
Definition: split_query.c:39
Auxiliary functions and classes to assist in query splitting.
The query related information.
Int4 first_context
Index of the first element of the context array.
Int4 last_context
Index of the last element of the context array.
Complete type definition of Blast Sequence Source ADT.
Definition: blast_seqsrc.c:43
Progress monitoring structure.
Definition: blast_def.h:341
void * user_data
Pointer to user-provided data.
Definition: blast_def.h:344
#define _ASSERT
Modified on Sun Apr 14 05:26:18 2024 by modify_doxy.py rev. 669887