NCBI C++ ToolKit
split_query_cxx.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 
2 /* ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Christiam Camacho
27  *
28  */
29 
30 /** @file split_query_cxx.cpp
31  * Defines CQuerySplitter, a class to split the query sequence(s)
32  */
33 
34 #include <ncbi_pch.hpp>
35 #include "split_query.hpp"
39 
41 #include <objmgr/util/sequence.hpp>
43 
44 #include "split_query_aux_priv.hpp"
45 #include "blast_setup.hpp"
46 
48 BEGIN_SCOPE(blast)
49 
51  const CBlastOptions* options)
52  : m_QueryFactory(query_factory), m_Options(options), m_NumChunks(0),
53  m_LocalQueryData(0), m_TotalQueryLength(0), m_ChunkSize(0)
54 {
55  m_ChunkSize = SplitQuery_GetChunkSize(m_Options->GetProgram());
56  m_LocalQueryData = m_QueryFactory->MakeLocalQueryData(m_Options);
57  m_TotalQueryLength = m_LocalQueryData->GetSumOfSequenceLengths();
58  m_NumChunks = SplitQuery_CalculateNumChunks(m_Options->GetProgramType(),
59  &m_ChunkSize, m_TotalQueryLength, m_LocalQueryData->GetNumQueries());
60  /* No split for ungapped mode JIRA SB-1082 */
61  if (!options->GetGappedMode()) m_NumChunks = 1;
62  x_ExtractCScopesAndMasks();
63 }
64 
65 ostream& operator<<(ostream& out, const CQuerySplitter& rhs)
66 {
67  ILocalQueryData* query_data =
68  const_cast<ILocalQueryData*>(&*rhs.m_LocalQueryData);
69  const size_t kNumQueries = query_data->GetNumQueries();
70  const size_t kNumChunks = rhs.GetNumberOfChunks();
71 
72  out << endl << "; This is read by x_ReadQueryBoundsPerChunk"
73  << endl << "; Format: query start, query end, strand" << endl;
74 
75  // for all query indices {
76  // iterate every chunk and collect the coords, print them out
77  // }
78  for (size_t query_index = 0; query_index < kNumQueries; query_index++) {
79  CConstRef<CSeq_id> query_id
80  (query_data->GetSeq_loc(query_index)->GetId());
81  _ASSERT(query_id);
82 
83  for (size_t chunk_index = 0; chunk_index < kNumChunks; chunk_index++) {
84  ASSERT(rhs.m_SplitQueriesInChunk.size());
85  CRef<CBlastQueryVector> queries_in_chunk =
86  rhs.m_SplitQueriesInChunk[chunk_index];
87 
88  for (size_t qidx = 0; qidx < queries_in_chunk->Size(); qidx++) {
89  CConstRef<CSeq_loc> query_loc_in_chunk =
90  queries_in_chunk->GetQuerySeqLoc(qidx);
91  _ASSERT(query_loc_in_chunk);
92  CConstRef<CSeq_id> query_id_in_chunk
93  (query_loc_in_chunk->GetId());
94  _ASSERT(query_id_in_chunk);
95 
96  if (query_id->Match(*query_id_in_chunk)) {
97  const CSeq_loc::TRange& range =
98  query_loc_in_chunk->GetTotalRange();
99  out << "Chunk" << chunk_index << "Query" << query_index
100  << " = " << range.GetFrom() << ", "
101  << range.GetToOpen() << ", "
102  << (int)query_loc_in_chunk->GetStrand() << endl;
103  }
104  }
105  }
106  out << endl;
107  }
108 
109  return out;
110 }
111 
112 void
114 {
116  _ASSERT(m_Scopes.empty());
117  _ASSERT(m_UserSpecifiedMasks.empty());
118 
119  const size_t num_queries = m_LocalQueryData->GetNumQueries();
120 
121  CObjMgr_QueryFactory* objmgr_qf = NULL;
122  if ( (objmgr_qf = dynamic_cast<CObjMgr_QueryFactory*>(&*m_QueryFactory)) ) {
123  // extract the scopes and masks ...
124  m_Scopes = objmgr_qf->ExtractScopes();
126  _ASSERT(m_Scopes.size() == num_queries);
127  } else {
128  m_NumChunks = 1;
129  m_UserSpecifiedMasks.assign(num_queries, TMaskedQueryRegions());
130  }
131  _ASSERT(m_UserSpecifiedMasks.size() == num_queries);
132 }
133 
134 void
136 {
138 
139  // Note that this information might not need to be stored in the
140  // SSplitQueryBlk structure, as these ranges can be calculated as follows:
141  // chunk_start = (chunk_num*chunk_size) - (chunk_num*overlap_size);
142  // chunk_end = chunk_start + chunk_size > query_size ? query_size :
143  // chunk_start + chunk_size;
144 
145  size_t chunk_start = 0;
146  const size_t kOverlapSize =
148  for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) {
149  size_t chunk_end = chunk_start + m_ChunkSize;
150 
151  // if the chunk end is larger than the sequence ...
152  if (chunk_end >= m_TotalQueryLength ||
153  // ... or this is the last chunk and it didn't make it to the end
154  // of the sequence
155  (chunk_end < m_TotalQueryLength && (chunk_num + 1) == m_NumChunks))
156  {
157  // ... assign this chunk's end to the end of the sequence
158  chunk_end = m_TotalQueryLength;
159  }
160 
161  m_SplitBlk->SetChunkBounds(chunk_num,
162  TChunkRange(static_cast<unsigned int>(chunk_start), static_cast<unsigned int>(chunk_end)));
163  _TRACE("Chunk " << chunk_num << ": ranges from " << chunk_start
164  << " to " << chunk_end);
165 
166  chunk_start += (m_ChunkSize - kOverlapSize);
167  if (chunk_start > m_TotalQueryLength ||
168  chunk_end == m_TotalQueryLength) {
169  break;
170  }
171  }
172 
173  // For purposes of having an accurate overlap size when stitching back
174  // HSPs, save the overlap size
175  const size_t kOverlap =
177  ? kOverlapSize / CODON_LENGTH : kOverlapSize;
178  m_SplitBlk->SetChunkOverlapSize(kOverlap);
179 }
180 
181 /// Auxiliary function to assign the split query's Seq-interval so that it's
182 /// constrained within the chunk boundaries
183 /// @param chunk Range for the chunk [in]
184 /// @param query_range Range of sequence data corresponding to the full query
185 /// [in]
186 /// @param split_query_loc Seq-loc for this query constrained by the chunk's
187 /// boundaries [out]
188 static void
190  const TChunkRange& query_range,
191  int query_offset,
192  CRef<CSeq_loc> split_query_loc)
193 {
194  _ASSERT(split_query_loc.NotEmpty());
195  _ASSERT(chunk.IntersectingWith(query_range));
196  CSeq_interval& interval = split_query_loc->SetInt();
197  const int qstart = chunk.GetFrom() - query_range.GetFrom();
198  const int qend = chunk.GetToOpen() - query_range.GetToOpen();
199 
200  interval.SetFrom(max(0, qstart) + query_offset);
201 
202  if (qend >= 0) {
203  interval.SetTo(query_range.GetToOpen() - query_range.GetFrom() + query_offset);
204  } else {
205  interval.SetTo(chunk.GetToOpen() - query_range.GetFrom() + query_offset);
206  }
207 
208  // Note subtraction, as Seq-intervals are assumed to be
209  // open/inclusive
210  interval.SetTo() -= 1;
211 }
212 
213 void
215 {
216  const int kNumQueries = static_cast<int>(m_LocalQueryData->GetNumQueries());
217  const EBlastProgramType kProgram = m_Options->GetProgramType();
218  const ENa_strand kStrandOption = m_Options->GetStrandOption();
219 
220  // Build vector of query ranges along the concatenated query
221  vector<TChunkRange> query_ranges;
222  query_ranges.reserve(kNumQueries);
223  query_ranges.push_back(TChunkRange(0, static_cast<unsigned int>(m_LocalQueryData->GetSeqLength(0)))); // FIXED
224  _TRACE("Query 0: " << query_ranges.back().GetFrom() << "-" <<
225  query_ranges.back().GetToOpen());
226  for (int i = 1; i < kNumQueries; i++) {
227  TSeqPos query_start = query_ranges[i-1].GetTo() + 1;
228  TSeqPos query_end = query_start + static_cast<TSeqPos>(m_LocalQueryData->GetSeqLength(i));
229  query_ranges.push_back(TChunkRange(query_start, query_end));
230  _TRACE("Query " << i << ": " << query_ranges.back().GetFrom()
231  << "-" << query_ranges.back().GetToOpen());
232  }
233 
235  _ASSERT(m_UserSpecifiedMasks.size() == (size_t)kNumQueries);
236 
237  // determine intersection between query ranges and chunk ranges
238  for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) {
239  const TChunkRange chunk = m_SplitBlk->GetChunkBounds(chunk_num);
240  // FIXME: can be optimized to avoid examining those that have already
241  // been assigned to a given chunk
242  for (size_t qindex = 0; qindex < query_ranges.size(); qindex++) {
243  const TChunkRange& query_range = query_ranges[qindex];
244  if ( !chunk.IntersectingWith(query_range) ) {
245  continue;
246  }
247  m_SplitBlk->AddQueryToChunk(chunk_num, static_cast<Int4>(qindex));
248  if (m_SplitQueriesInChunk[chunk_num].Empty()) {
249  m_SplitQueriesInChunk[chunk_num].Reset(new CBlastQueryVector);
250  }
251 
252  // Build the split query seqloc for this query
253 
254  CConstRef<CSeq_loc> query_seqloc(m_LocalQueryData->GetSeq_loc(qindex));
255 
256  CRef<CSeq_loc> split_query_loc(new CSeq_loc);
257  {
258  int q_sl_offset = 0;
259  if(query_seqloc->IsInt() && (query_seqloc->GetInt().GetFrom() > 0 )) {
260  q_sl_offset = query_seqloc->GetInt().GetFrom();
261  }
262  s_SetSplitQuerySeqInterval(chunk, query_range, q_sl_offset, split_query_loc);
263  }
264 
265 
266 
267  split_query_loc->SetId(*query_seqloc->GetId());
268  const ENa_strand kStrand =
269  BlastSetup_GetStrand(*query_seqloc, kProgram, kStrandOption);
270  split_query_loc->SetStrand(kStrand);
271  _TRACE("Chunk " << chunk_num << ": query " << qindex << " ("
272  << split_query_loc->GetId()->AsFastaString() << ")"
273  << " " << split_query_loc->GetInt().GetFrom()
274  << "-" << split_query_loc->GetInt().GetTo()
275  << " strand " << (int)split_query_loc->GetStrand());
276 
277  // retrieve the split mask corresponding to this chunk of the query
278  CRef<CSeq_loc> mask_query_loc(new CSeq_loc);
279  s_SetSplitQuerySeqInterval(chunk, query_range, 0, mask_query_loc);
280  TMaskedQueryRegions split_mask =
281  m_UserSpecifiedMasks[qindex].RestrictToSeqInt(mask_query_loc->GetInt());
282 
283  // retrieve the scope to retrieve this query
284  CRef<CScope> scope(m_Scopes[qindex]);
285  _ASSERT(scope.NotEmpty());
286 
287  // our split query chunk :)
288  CRef<CBlastSearchQuery> split_query
289  (new CBlastSearchQuery(*split_query_loc, *scope, split_mask));
290  m_SplitQueriesInChunk[chunk_num]->AddQuery(split_query);
291  }
292  }
293 }
294 
295 /** Adds the necessary shift to the context to record the query contexts for
296  * the query chunks
297  * @param context query context [in]
298  * @param shift shift to add [in]
299  */
300 static inline unsigned int
301 s_AddShift(unsigned int context, int shift)
302 {
303  _ASSERT(context == 3 || context == 4 || context == 5);
304  _ASSERT(shift == 0 || shift == 1 || shift == -1);
305 
306  unsigned int retval;
307  if (shift == 0) {
308  retval = context;
309  } else if (shift == 1) {
310  retval = context == 3 ? 5 : context - shift;
311  } else if (shift == -1) {
312  retval = context == 5 ? 3 : context - shift;
313  } else {
314  abort();
315  }
316  return retval;
317 }
318 
319 /**
320  * @brief Retrieve the shift for the negative strand
321  *
322  * @param query_length length of the query [in]
323  *
324  * @return shift (either 1, -1, or 0)
325  */
326 static inline int
328 {
329  int retval;
330  switch (query_length % CODON_LENGTH) {
331  case 1: retval = -1; break;
332  case 2: retval = 1; break;
333  case 0: default: retval = 0; break;
334  }
335  return retval;
336 }
337 
338 void
340 {
341  const EBlastProgramType kProgram = m_Options->GetProgramType();
342  const unsigned int kNumContexts = GetNumberOfContexts(kProgram);
343  const ENa_strand kStrandOption = m_Options->GetStrandOption();
344  unique_ptr<CQueryDataPerChunk> qdpc;
345 
346  if (Blast_QueryIsTranslated(kProgram)) {
347  qdpc.reset(new CQueryDataPerChunk(*m_SplitBlk, kProgram,
349  }
350 
351  for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) {
352  vector<size_t> queries = m_SplitBlk->GetQueryIndices(chunk_num);
353 
354  for (size_t i = 0; i < queries.size(); i++) {
356  const ENa_strand kStrand =
357  BlastSetup_GetStrand(*sl, kProgram, kStrandOption);
358 
359  if (Blast_QueryIsTranslated(kProgram)) {
360  size_t qlength = qdpc->GetQueryLength(static_cast<int>(queries[i]));
361  int last_query_chunk = qdpc->GetLastChunk(static_cast<int>(queries[i]));
362  _ASSERT(last_query_chunk != -1);
363  int shift = s_GetShiftForTranslatedNegStrand(qlength);
364 
365  for (unsigned int ctx = 0; ctx < kNumContexts; ctx++) {
366  // handle the plus strand...
367  if (ctx % NUM_FRAMES < CODON_LENGTH) {
368  if (kStrand == eNa_strand_minus) {
369  m_SplitBlk->AddContextToChunk(chunk_num,
371  } else {
372  m_SplitBlk->AddContextToChunk(chunk_num,
373  static_cast<Int4>(kNumContexts*queries[i]+ctx));
374  }
375  } else { // handle the negative strand
376  if (kStrand == eNa_strand_plus) {
377  m_SplitBlk->AddContextToChunk(chunk_num,
379  } else {
380  if (chunk_num == (size_t)last_query_chunk) {
381  // last chunk doesn't have shift
382  m_SplitBlk->AddContextToChunk(chunk_num,
383  static_cast<Int4>(kNumContexts*queries[i]+ctx));
384  } else {
385  m_SplitBlk->AddContextToChunk(chunk_num,
386  static_cast<Int4>(kNumContexts*queries[i]+
387  s_AddShift(ctx, shift)));
388  }
389  }
390  }
391  }
392  } else if (Blast_QueryIsNucleotide(kProgram)) {
393 
394  for (unsigned int ctx = 0; ctx < kNumContexts; ctx++) {
395  // handle the plus strand...
396  if (ctx % NUM_STRANDS == 0) {
397  if (kStrand == eNa_strand_minus) {
398  m_SplitBlk->AddContextToChunk(chunk_num,
400  } else {
401  m_SplitBlk->AddContextToChunk(chunk_num,
402  static_cast<Int4>(kNumContexts*queries[i]+ctx));
403  }
404  } else { // handle the negative strand
405  if (kStrand == eNa_strand_plus) {
406  m_SplitBlk->AddContextToChunk(chunk_num,
408  } else {
409  m_SplitBlk->AddContextToChunk(chunk_num,
410  static_cast<Int4>(kNumContexts*queries[i]+ctx));
411  }
412  }
413  }
414 
415  } else if (Blast_QueryIsProtein(kProgram)) {
416  m_SplitBlk->AddContextToChunk(chunk_num,
417  static_cast<Int4>(kNumContexts*queries[i]));
418  } else {
419  abort();
420  }
421  }
422  }
423 }
424 
425 /**
426  * @brief Determine whether a given context corresponds to the plus or minus
427  * strand
428  *
429  * @param qinfo BlastQueryInfo structure to determine the strand [in]
430  * @param context_number Context number in the BlastQueryInfo structure (index
431  * into the BlastContextInfo array) [in]
432  */
433 static inline bool
434 s_IsPlusStrand(const BlastQueryInfo* qinfo, Int4 context_number)
435 {
436  return qinfo->contexts[context_number].frame >= 0;
437 }
438 
439 /**
440  * @brief Get the length of a context in absolute terms (i.e.: in the context
441  * of the full, non-split sequence)
442  *
443  * @param chunk_qinfo vector of BlastQueryInfo structures corresponding to the
444  * various query chunks [in]
445  * @param chunk_num Chunk number, index into the vector above [in]
446  * @param ctx_translator auxiliary context translator object [in]
447  * @param absolute_context context in the full, non-split query
448  *
449  * @return length of the requested context
450  */
451 static size_t
452 s_GetAbsoluteContextLength(const vector<const BlastQueryInfo*>& chunk_qinfo,
453  int chunk_num,
454  const CContextTranslator& ctx_translator,
455  int absolute_context)
456 {
457  if (chunk_num < 0) {
458  return 0;
459  }
460 
461  int pos = ctx_translator.GetContextInChunk((size_t)chunk_num,
462  absolute_context);
463  if (pos != kInvalidContext) {
464  return chunk_qinfo[chunk_num]->contexts[pos].query_length;
465  }
466  return 0;
467 }
468 
469 //#define DEBUG_COMPARE_SEQUENCES 1
470 
471 #ifdef DEBUG_COMPARE_SEQUENCES
472 
473 /// Convert a sequence into its printable representation
474 /// @param seq sequence data [in]
475 /// @param len length of sequence to print [in]
476 /// @param is_prot whether the sequence is protein or not [in]
477 static string s_GetPrintableSequence(const Uint1* seq, size_t len, bool is_prot)
478 {
479  string retval;
480  for (size_t i = 0; i < len; i++) {
481  retval.append(1, (is_prot
482  ? NCBISTDAA_TO_AMINOACID[seq[i]]
483  : BLASTNA_TO_IUPACNA[seq[i]]));
484  }
485  return retval;
486 }
487 
488 /** Auxiliary function to validate the context offset corrections
489  * @param global global query sequence data [in]
490  * @param chunk sequence data for chunk [in]
491  * @param len length of the data to compare [in]
492  * @param is_prot whether the sequence is protein or not [in]
493  * @return true if sequence data is identical, false otherwise
494  */
495 static bool cmp_sequence(const Uint1* global, const Uint1* chunk, size_t len,
496  bool is_prot)
497 {
498  bool retval = true;
499 
500  for (size_t i = 0; i < len; i++) {
501  if (global[i] != chunk[i]) {
502  retval = false;
503  break;
504  }
505  }
506 
507  if (retval == false) {
508  _TRACE("Comparing global: '"
509  << s_GetPrintableSequence(global, len, is_prot) << "'");
510  _TRACE("with chunk: '"
511  << s_GetPrintableSequence(chunk, len, is_prot) << "'");
512  }
513 
514  return retval;
515 }
516 #endif
517 
518 /* ----------------------------------------------------------------------------
519  * To compute the offset correction for the plus strand we need to add the
520  * length of all the contexts spread across the various chunks and subtract
521  * the corresponding overlap regions.
522  *
523  * Assumptions: chunk size and overlap (in nucleotide coordinates) are
524  * divisible by CODON_LENGTH.
525  *
526  * let C = current split query chunk
527  * let SC = starting chunk such that the context in question is found in this
528  * chunk
529  * let ctx = context number in split query chunk C
530  * let O = overlap size in final sequence data coordinates (aa for blastx, na
531  * for blastn)
532  * let x = ctx equivalent in the global (absolute) unsplit query
533  * let ctxlen(a, b) be a function which computes the length of context a (given
534  * in global coordinates) in chunk b. If a is not found in b or if b doesn't
535  * exist (i.e.: is negative), 0 is returned.
536  * let corr = correction to be added to the global query offset for chunk C,
537  * context ctx
538  *
539  * corr = 0;
540  * for (; C != SC; C--) {
541  * let prev_len = ctxlen(x, C - 1);
542  * let curr_len = ctxlen(x, C);
543  * let overlap = min(O, curr_len);
544  * corr += prev_len - min(overlap, prev_len);
545  * }
546  *
547  * ----------------------------------------------------------------------------
548  * To compute the offset correction for the negative strand we need to subtract
549  * from the query length how far we have advanced into the query's negative
550  * strand (starting from the right) minus the overlap regions.
551  *
552  * Define the same variables as for the plus strand (except corr, defined
553  * below):
554  * let L = length of context x in the absolute query
555  * let corr = L - S, where S is computed as follows:
556  *
557  * S = 0;
558  * for (; C >= SC; C--) {
559  * let prev_len = ctxlen(x, C - 1);
560  * let curr_len = ctxlen(x, C);
561  * let overlap = min(O, curr_len);
562  * corr += curr_len - min(overlap, prev_len);
563  * }
564  *
565 */
566 
567 void
569 {
572  } else {
574  }
575 }
576 
577 // Record the correction needed to synchronize with the complete query all the
578 // query offsets within HSPs that fall within this context
579 void
581 {
582  _ASSERT( !m_QueryChunkFactories.empty() );
583 
584  const EBlastProgramType kProgram = m_Options->GetProgramType();
585  _ASSERT( !Blast_QueryIsTranslated(kProgram) );
586  const BlastQueryInfo* global_qinfo = m_LocalQueryData->GetQueryInfo();
587 #ifdef DEBUG_COMPARE_SEQUENCES
588  const BLAST_SequenceBlk* global_seq = m_LocalQueryData->GetSequenceBlk();
589 #endif
590  const size_t kOverlap = SplitQuery_GetOverlapChunkSize(kProgram);
592  m_Options);
593  vector<const BlastQueryInfo*> chunk_qinfo(m_NumChunks, 0);
594 
595  for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) {
596  CRef<IQueryFactory> chunk_qf(m_QueryChunkFactories[chunk_num]);
598 #ifdef DEBUG_COMPARE_SEQUENCES
599  const BLAST_SequenceBlk* chunk_seq = chunk_qd->GetSequenceBlk();
600 #endif
601 
602  // BlastQueryInfo structure corresponding to chunk number chunk_num
603  chunk_qinfo[chunk_num] = chunk_qd->GetQueryInfo();
604  _ASSERT(chunk_qinfo[chunk_num]);
605 
606  // In case the first context differs from 0, for consistency with the
607  // other data returned by this class...
608  for (Int4 ctx = 0; ctx < chunk_qinfo[chunk_num]->first_context; ctx++) {
610  }
611 
612  for (Int4 ctx = chunk_qinfo[chunk_num]->first_context;
613  ctx <= chunk_qinfo[chunk_num]->last_context;
614  ctx++) {
615 
616  size_t correction = 0;
617  const int starting_chunk =
618  ctx_translator.GetStartingChunk(chunk_num, ctx);
619  const int absolute_context =
620  ctx_translator.GetAbsoluteContext(chunk_num, ctx);
621 
622  if (absolute_context == kInvalidContext ||
623  starting_chunk == kInvalidContext) {
624  _ASSERT( !chunk_qinfo[chunk_num]->contexts[ctx].is_valid );
625  // INT4_MAX is the sentinel value for invalid contexts
627  continue;
628  }
629 
630  if (s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {
631 
632  for (int c = static_cast<int>(chunk_num); c != starting_chunk; c--) {
633  size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo,
634  c - 1,
635  ctx_translator,
636  absolute_context);
637  size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo, c,
638  ctx_translator,
639  absolute_context);
640  size_t overlap = min(kOverlap, curr_len);
641  correction += prev_len - min(overlap, prev_len);
642  }
643 
644  } else {
645 
646  size_t subtrahend = 0;
647 
648  for (int c = static_cast<int>(chunk_num); c >= starting_chunk && c >= 0; c--) {
649  size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo,
650  c - 1,
651  ctx_translator,
652  absolute_context);
653  size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo,
654  c,
655  ctx_translator,
656  absolute_context);
657  size_t overlap = min(kOverlap, curr_len);
658  subtrahend += (curr_len - min(overlap, prev_len));
659  }
660  correction =
661  global_qinfo->contexts[absolute_context].query_length -
662  subtrahend;
663 
664  }
665  _ASSERT((chunk_qinfo[chunk_num]->contexts[ctx].is_valid));
666  m_SplitBlk->AddContextOffsetToChunk(chunk_num, static_cast<int>(correction));
667 #ifdef DEBUG_COMPARE_SEQUENCES
668 {
669  int global_offset = global_qinfo->contexts[absolute_context].query_offset +
670  correction;
671  int chunk_offset = chunk_qinfo[chunk_num]->contexts[ctx].query_offset;
672  if (!cmp_sequence(&global_seq->sequence[global_offset],
673  &chunk_seq->sequence[chunk_offset], 10,
674  Blast_QueryIsProtein(kProgram))) {
675  cerr << "Failed to compare sequence data!" << endl;
676  }
677 }
678 #endif
679 
680  }
681  }
682  _TRACE("CContextTranslator contents: " << ctx_translator);
683 }
684 
685 void
687 {
688  _ASSERT( !m_QueryChunkFactories.empty() );
689 
690  const EBlastProgramType kProgram = m_Options->GetProgramType();
691  _ASSERT(Blast_QueryIsTranslated(kProgram));
692  const BlastQueryInfo* global_qinfo = m_LocalQueryData->GetQueryInfo();
693 #ifdef DEBUG_COMPARE_SEQUENCES
694  const BLAST_SequenceBlk* global_seq = m_LocalQueryData->GetSequenceBlk();
695 #endif
696  const size_t kOverlap =
699  m_Options);
701  vector<const BlastQueryInfo*> chunk_qinfo(m_NumChunks, 0);
702 
703  for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) {
704  CRef<IQueryFactory> chunk_qf(m_QueryChunkFactories[chunk_num]);
706 #ifdef DEBUG_COMPARE_SEQUENCES
707  const BLAST_SequenceBlk* chunk_seq = chunk_qd->GetSequenceBlk();
708 #endif
709 
710  // BlastQueryInfo structure corresponding to chunk number chunk_num
711  chunk_qinfo[chunk_num] = chunk_qd->GetQueryInfo();
712  _ASSERT(chunk_qinfo[chunk_num]);
713 
714  // In case the first context differs from 0, for consistency with the
715  // other data returned by this class...
716  for (Int4 ctx = 0; ctx < chunk_qinfo[chunk_num]->first_context; ctx++) {
718  }
719 
720  for (Int4 ctx = chunk_qinfo[chunk_num]->first_context;
721  ctx <= chunk_qinfo[chunk_num]->last_context;
722  ctx++) {
723 
724  size_t correction = 0;
725  const int starting_chunk =
726  ctx_translator.GetStartingChunk(chunk_num, ctx);
727  const int absolute_context =
728  ctx_translator.GetAbsoluteContext(chunk_num, ctx);
729  const int last_query_chunk = qdpc.GetLastChunk(chunk_num, ctx);
730 
731  if (absolute_context == kInvalidContext ||
732  starting_chunk == kInvalidContext) {
733  _ASSERT( !chunk_qinfo[chunk_num]->contexts[ctx].is_valid );
734  // INT4_MAX is the sentinel value for invalid contexts
736  continue;
737  }
738 
739  // The corrections for the contexts corresponding to the negative
740  // strand in the last chunk of a query sequence are all 0
741  if (!s_IsPlusStrand(chunk_qinfo[chunk_num], ctx) &&
742  (chunk_num == (size_t)last_query_chunk) &&
743  (ctx % NUM_FRAMES >= 3)) {
744  correction = 0;
745  goto error_check;
746  }
747 
748  // The corrections for the contexts corresponding to the plus
749  // strand are always the same, so only calculate the first one
750  if (s_IsPlusStrand(chunk_qinfo[chunk_num], ctx) &&
751  (ctx % NUM_FRAMES == 1 || ctx % NUM_FRAMES == 2)) {
752  correction = m_SplitBlk->GetContextOffsets(chunk_num).back();
753  goto error_check;
754  }
755 
756  // If the query length is divisible by CODON_LENGTH, the
757  // corrections for all contexts corresponding to a given strand are
758  // the same, so only calculate the first one
759  if ((qdpc.GetQueryLength(chunk_num, ctx) % CODON_LENGTH == 0) &&
760  (ctx % NUM_FRAMES != 0) && (ctx % NUM_FRAMES != 3)) {
761  correction = m_SplitBlk->GetContextOffsets(chunk_num).back();
762  goto error_check;
763  }
764 
765  // If the query length % CODON_LENGTH == 1, the corrections for the
766  // first two contexts of the negative strand are the same, and the
767  // correction for the last context is one more than that.
768  if ((qdpc.GetQueryLength(chunk_num, ctx) % CODON_LENGTH == 1) &&
769  !s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {
770 
771  if (ctx % NUM_FRAMES == 4) {
772  correction =
773  m_SplitBlk->GetContextOffsets(chunk_num).back();
774  goto error_check;
775  } else if (ctx % NUM_FRAMES == 5) {
776  correction =
777  m_SplitBlk->GetContextOffsets(chunk_num).back() + 1;
778  goto error_check;
779  }
780  }
781 
782  // If the query length % CODON_LENGTH == 2, the corrections for the
783  // last two contexts of the negative strand are the same, which is
784  // one more that the first context on the negative strand.
785  if ((qdpc.GetQueryLength(chunk_num, ctx) % CODON_LENGTH == 2) &&
786  !s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {
787 
788  if (ctx % NUM_FRAMES == 4) {
789  correction =
790  m_SplitBlk->GetContextOffsets(chunk_num).back() + 1;
791  goto error_check;
792  } else if (ctx % NUM_FRAMES == 5) {
793  correction =
794  m_SplitBlk->GetContextOffsets(chunk_num).back();
795  goto error_check;
796  }
797  }
798 
799  if (s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {
800 
801  for (int c = static_cast<int>(chunk_num); c != starting_chunk; c--) {
802  size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo,
803  c - 1,
804  ctx_translator,
805  absolute_context);
806  size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo, c,
807  ctx_translator,
808  absolute_context);
809  size_t overlap = min(kOverlap, curr_len);
810  correction += prev_len - min(overlap, prev_len);
811  }
812 
813  } else {
814 
815  size_t subtrahend = 0;
816 
817  for (int c = static_cast<int>(chunk_num); c >= starting_chunk && c >= 0; c--) {
818  size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo,
819  c - 1,
820  ctx_translator,
821  absolute_context);
822  size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo,
823  c,
824  ctx_translator,
825  absolute_context);
826  size_t overlap = min(kOverlap, curr_len);
827  subtrahend += (curr_len - min(overlap, prev_len));
828  }
829  correction =
830  global_qinfo->contexts[absolute_context].query_length -
831  subtrahend;
832  }
833 
834 error_check:
835  _ASSERT((chunk_qinfo[chunk_num]->contexts[ctx].is_valid));
836  m_SplitBlk->AddContextOffsetToChunk(chunk_num, static_cast<Int4>(correction));
837 #ifdef DEBUG_COMPARE_SEQUENCES
838 {
839  int global_offset = global_qinfo->contexts[absolute_context].query_offset +
840  correction;
841  int chunk_offset = chunk_qinfo[chunk_num]->contexts[ctx].query_offset;
842  int num_bases2compare =
843  min(10, chunk_qinfo[chunk_num]->contexts[ctx].query_length);
844  if (!cmp_sequence(&global_seq->sequence[global_offset],
845  &chunk_seq->sequence[chunk_offset],
846  num_bases2compare, Blast_QueryIsProtein(kProgram))) {
847  cerr << "Failed to compare sequence data for chunk " << chunk_num
848  << ", context " << ctx << endl;
849  }
850 }
851 #endif
852  }
853  }
854  _TRACE("CContextTranslator contents: " << ctx_translator);
855 }
856 
859 {
860  if (m_SplitBlk.NotEmpty()) {
861  return m_SplitBlk;
862  }
863 
867 
868  if (m_NumChunks == 1) {
870  } else {
871  _TRACE("Splitting into " << m_NumChunks << " query chunks");
875 
876  for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) {
878  (new CObjMgr_QueryFactory(*m_SplitQueriesInChunk[chunk_num]));
879  m_QueryChunkFactories.push_back(qf);
880  }
881 
883  }
884 
885  _TRACE("CSplitQuerBlk contents: " << *m_SplitBlk);
886  _TRACE("CQuerySplitter contents: " << *this);
887 
888  return m_SplitBlk;
889 }
890 
893 {
894  if (chunk_num >= m_NumChunks) {
895  string msg("Invalid query chunk number: ");
896  msg += NStr::IntToString(chunk_num) + " out of " +
898  throw out_of_range(msg);
899  }
900 
901  if (m_SplitBlk.Empty()) {
902  Split();
903  }
904 
905  return m_QueryChunkFactories[chunk_num];
906 }
907 
908 END_SCOPE(blast)
910 
#define CODON_LENGTH
Codons are always of length 3.
Definition: blast_def.h:63
#define NUM_STRANDS
Number of frames in a nucleotide sequence.
Definition: blast_def.h:93
#define NUM_FRAMES
Number of frames to which we translate in translating searches.
Definition: blast_def.h:88
Declares class to encapsulate all BLAST options.
Boolean Blast_QueryIsTranslated(EBlastProgramType p)
Returns true if the query is translated.
Definition: blast_program.c:60
Boolean Blast_QueryIsNucleotide(EBlastProgramType p)
Returns true if the query is nucleotide.
Definition: blast_program.c:43
Boolean Blast_QueryIsProtein(EBlastProgramType p)
Returns true if the query is protein.
Definition: blast_program.c:40
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm.
Definition: blast_program.h:72
Internal auxiliary setup classes/functions for C++ BLAST APIs.
Encapsulates ALL the BLAST algorithm's options.
Query Vector.
Definition: sseqloc.hpp:276
size_type Size() const
Returns the number of queries found in this query vector.
Definition: sseqloc.hpp:305
CConstRef< objects::CSeq_loc > GetQuerySeqLoc(size_type i) const
Get the query Seq-loc for a query by index.
Definition: sseqloc.hpp:313
Search Query.
Definition: sseqloc.hpp:147
Auxiliary class to provide convenient and efficient access to conversions between contexts local to q...
NCBI C++ Object Manager dependant implementation of IQueryFactory.
Auxiliary class to determine information about the query that was split into chunks.
Class responsible for splitting query sequences and providing data to the BLAST search class to searc...
Definition: split_query.hpp:66
CRef –.
Definition: ncbiobj.hpp:618
Wrapper class around SSplitQueryBlk structure.
Provides access (not ownership) to the C structures used to configure local BLAST search class implem...
Definition: query_data.hpp:55
Source of query sequence data for BLAST Provides an interface for search classes to retrieve sequence...
Definition: query_data.hpp:147
Collection of masked regions for a single query sequence.
Definition: seqlocinfo.hpp:113
CS_CONTEXT * ctx
Definition: t0006.c:12
std::ofstream out("events_result.xml")
main entry point for tests
bool Empty(const CNcbiOstrstream &src)
Definition: fileutil.cpp:523
virtual CConstRef< objects::CSeq_loc > GetSeq_loc(size_t index)=0
Get the Seq_loc for the sequence indicated by index.
vector< CRef< IQueryFactory > > m_QueryChunkFactories
Vector of query factories, each element corresponds to a chunk.
size_t SplitQuery_GetOverlapChunkSize(EBlastProgramType program)
Size of the region that overlaps in between each query chunk.
size_t m_TotalQueryLength
Length of the concatenated query.
void AddQueryToChunk(size_t chunk_num, Int4 query_index)
Adds a query index to a given chunk.
TChunkRange GetChunkBounds(size_t chunk_num) const
Get the boundaries of a chunk in the concatenated query.
TScopeVector m_Scopes
Vector of CScope objects.
COpenRange< TSeqPos > TChunkRange
Range describing a query chunk.
int GetLastChunk(int global_query_index)
get the last chunk where query identified with global_query_index is found
void x_ComputeContextOffsetsForChunks()
Compute the context offsets which are used to adjust the results.
int GetAbsoluteContext(size_t chunk_num, Int4 context_in_chunk) const
Get the context number in the absolute (i.e.
void SetChunkOverlapSize(size_t size)
Sets the size (# of bases/residues) of overlap between query chunks.
objects::ENa_strand GetStrandOption() const
const char BLASTNA_TO_IUPACNA[]
Translates between blastna and iupacna.
virtual BLAST_SequenceBlk * GetSequenceBlk()=0
Accessor for the BLAST_SequenceBlk structure.
void SetChunkBounds(size_t chunk_num, const TChunkRange &chunk_range)
Set the boundaries of a chunk in the concatenated query.
CRef< ILocalQueryData > MakeLocalQueryData(const CBlastOptions *opts)
Creates and caches an ILocalQueryData.
Definition: query_data.cpp:52
Uint4 m_NumChunks
Number of chunks, if this is 1, no splitting occurs.
void x_ComputeContextOffsets_NonTranslatedQueries()
Compute the context offsets which are used to adjust the results for non-translated queries.
void AddContextToChunk(size_t chunk_num, Int4 context_index)
Adds a query context to a given chunk.
size_t GetQueryLength(size_t chunk_num, int context_in_chunk) const
Get the length of the query.
size_t SplitQuery_GetChunkSize(EProgram program)
Returns the optimal chunk size for a given task.
Definition: local_blast.cpp:54
EBlastProgramType GetProgramType() const
Returns the CORE BLAST notion of program type.
void AddContextOffsetToChunk(size_t chunk_num, Int4 context_offset)
Adds a context offset (correction) to a given chunk.
virtual BlastQueryInfo * GetQueryInfo()=0
Accessor for the BlastQueryInfo structure.
objects::ENa_strand BlastSetup_GetStrand(const objects::CSeq_loc &query_seqloc, EBlastProgramType program, objects::ENa_strand strand_option)
Choose between a Seq-loc specified query strand and the strand obtained from the CBlastOptions.
void x_ComputeQueryIndicesForChunks()
Compute query indices that correspond to each chunk.
CRef< ILocalQueryData > m_LocalQueryData
Source of local query data.
CRef< IQueryFactory > GetQueryFactoryForChunk(Uint4 chunk_num)
Returns a IQueryFactory suitable to be executed by a BLAST search class.
void x_ComputeContextOffsets_TranslatedQueries()
Compute the context offsets which are used to adjust the results for translated queries.
size_t m_ChunkSize
Size of the query chunks.
unsigned int GetNumberOfContexts(EBlastProgramType p)
Returns the number of contexts for a given BLAST program.
virtual size_t GetNumQueries()=0
Get the number of queries.
TSeqLocInfoVector ExtractUserSpecifiedMasks()
Retrieve any user specified masking locations.
TSeqLocInfoVector m_UserSpecifiedMasks
Vector of masking locations.
const CBlastOptions * m_Options
BLAST options.
int GetContextInChunk(size_t chunk_num, int absolute_context) const
Get the context number in the split query chunk.
void x_ComputeQueryContextsForChunks()
Compute query contexts that correspond to each chunk.
vector< size_t > GetQueryIndices(size_t chunk_num) const
Get the indices of the queries contained in a given chunk.
CRef< CSplitQueryBlk > Split()
Split the query sequence(s)
int GetStartingChunk(size_t curr_chunk, Int4 context_in_chunk) const
Get the chunk number where context_in_chunk starts (i.e.
virtual size_t GetSeqLength(size_t index)=0
Get the length of the sequence indicated by index.
CRef< IQueryFactory > m_QueryFactory
The original, unsplit query factory.
vector< CRef< objects::CScope > > ExtractScopes()
Retrieve the CScope objects associated with the query sequences associated with this object.
Uint4 SplitQuery_CalculateNumChunks(EBlastProgramType program, size_t *chunk_size, size_t concatenated_query_length, size_t num_queries)
Calculate the number of chunks that a query will be split into based upon query length,...
void x_ExtractCScopesAndMasks()
Auxiliary method to extract the CScope objects from the query factory.
Uint4 GetNumberOfChunks() const
Returns the number of chunks the query/queries will be split into.
Definition: split_query.hpp:85
CRef< CSplitQueryBlk > m_SplitBlk
Split query block structure.
vector< size_t > GetContextOffsets(size_t chunk_num) const
Get the context offsets (corrections) of the queries contained in a given chunk.
TSplitQueryVector m_SplitQueriesInChunk
Vector of split queries.
const char NCBISTDAA_TO_AMINOACID[]
Translates between ncbieaa and ncbistdaa.
bool GetGappedMode() const
Returns true if gapped BLAST is set, false otherwise.
void x_ComputeChunkRanges()
Compute all chunk ranges.
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define NULL
Definition: ncbistd.hpp:225
#define _TRACE(message)
Definition: ncbidbg.hpp:122
const string AsFastaString(void) const
Definition: Seq_id.cpp:2265
bool Match(const CSeq_id &sid2) const
Match() - TRUE if SeqIds are equivalent.
Definition: Seq_id.hpp:1033
ENa_strand GetStrand(void) const
Get the location's strand.
Definition: Seq_loc.cpp:882
void SetId(CSeq_id &id)
set the 'id' field in all parts of this location
Definition: Seq_loc.cpp:3474
TRange GetTotalRange(void) const
Definition: Seq_loc.hpp:913
void SetInt(TInt &v)
Definition: Seq_loc.hpp:983
const CSeq_id * GetId(void) const
Get the id of the location return NULL if has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
void SetStrand(ENa_strand strand)
Set the strand for all of the location's ranges.
Definition: Seq_loc.cpp:5196
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
bool IntersectingWith(const TThisType &r) const
Definition: range.hpp:331
position_type GetToOpen(void) const
Definition: range.hpp:138
position_type GetFrom(void) const
Definition: range.hpp:134
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
static string IntToString(int value, TNumToStringFlags flags=0, int base=10)
Convert int to string.
Definition: ncbistr.hpp:5084
void SetTo(TTo value)
Assign a value to To data member.
ENa_strand
strand of nucleic acid
Definition: Na_strand_.hpp:64
TFrom GetFrom(void) const
Get the From member data.
void SetFrom(TFrom value)
Assign a value to From data member.
TTo GetTo(void) const
Get the To member data.
bool IsInt(void) const
Check if variant Int is selected.
Definition: Seq_loc_.hpp:528
const TInt & GetInt(void) const
Get the variant data.
Definition: Seq_loc_.cpp:194
@ eNa_strand_plus
Definition: Na_strand_.hpp:66
@ eNa_strand_minus
Definition: Na_strand_.hpp:67
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
int i
int len
Main class to perform a BLAST search on the local machine.
const string kStrand
range(_Ty, _Ty) -> range< _Ty >
#define INT4_MAX
largest nubmer represented by signed int
Definition: ncbi_std.h:141
#define ASSERT
macro for assert.
Definition: ncbi_std.h:107
T max(T x_, T y_)
T min(T x_, T y_)
void abort()
NOTE: This file contains work in progress and the APIs are likely to change, please do not rely on th...
const Int4 kInvalidContext
Value to represent an invalid context.
Definition: split_query.c:39
Declares CQuerySplitter, a class to split the query sequence(s)
Auxiliary functions and classes to assist in query splitting.
ostream & operator<<(ostream &out, const CQuerySplitter &rhs)
static int s_GetShiftForTranslatedNegStrand(size_t query_length)
Retrieve the shift for the negative strand.
static unsigned int s_AddShift(unsigned int context, int shift)
Adds the necessary shift to the context to record the query contexts for the query chunks.
static void s_SetSplitQuerySeqInterval(const TChunkRange &chunk, const TChunkRange &query_range, int query_offset, CRef< CSeq_loc > split_query_loc)
Auxiliary function to assign the split query's Seq-interval so that it's constrained within the chunk...
static size_t s_GetAbsoluteContextLength(const vector< const BlastQueryInfo * > &chunk_qinfo, int chunk_num, const CContextTranslator &ctx_translator, int absolute_context)
Get the length of a context in absolute terms (i.e.
static bool s_IsPlusStrand(const BlastQueryInfo *qinfo, Int4 context_number)
Determine whether a given context corresponds to the plus or minus strand.
Definition of SSeqLoc structure.
Structure to hold a sequence.
Definition: blast_def.h:242
Uint1 * sequence
Sequence used for search (could be translation).
Definition: blast_def.h:243
Int4 query_length
Length of this query, strand or frame.
Int4 query_offset
Offset of this query, strand or frame in the concatenated super-query.
Int1 frame
Frame number (-1, -2, -3, 0, 1, 2, or 3)
The query related information.
BlastContextInfo * contexts
Information per context.
#define _ASSERT
#define const
Definition: zconf.h:230
Modified on Fri Feb 23 11:43:25 2024 by modify_doxy.py rev. 669887