NCBI C++ ToolKit
repeats_filter_cxx.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: repeats_filter_cxx.cpp 72378 2016-05-04 14:59:01Z camacho $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Ilya Dondoshansky
27  *
28  * Initial Version Creation Date: November 13, 2003
29  *
30  *
31  * File Description:
32  * C++ version of repeats filtering
33  *
34  * */
35 
36 /// @file repeats_filter_cxx.cpp
37 /// C++ version of repeats filtering
38 #include <ncbi_pch.hpp>
39 #include <serial/iterator.hpp>
44 #include <objmgr/util/sequence.hpp>
46 
48 
53 #include "blast_setup.hpp"
54 
58 
60 
61 /** @addtogroup AlgoBlast
62  *
63  * @{
64  */
65 
68 BEGIN_SCOPE(blast)
69 
70 /** Convert a list of mask locations to a CSeq_loc object.
 * Each element of loc_list contributes one interval (ssr->left, ssr->right)
 * to a packed-int Seq-loc, using the Seq-id that sequence::GetId() reports
 * for the query location.
71  * @param query Query sequence location [in]
72  * @param scope Scope for use by object manager [in]
73  * @param loc_list List of mask locations [in]
74  * @return List of mask locations in a CSeq_loc form or NULL if loc_list is
75  * NULL. The object is allocated with new and returned as a raw pointer;
 * the callers in this file immediately place it in a CRef/CConstRef.
76  */
77 static CSeq_loc*
79  CScope * scope,
80  BlastSeqLoc * loc_list)
81 {
  // Nothing to convert: report "no mask" as a NULL pointer.
82  if ( !loc_list ) {
83  return NULL;
84  }
85 
86  CSeq_loc* seqloc = new CSeq_loc();
87  BlastSeqLoc* loc;
88 
  // The Seq-loc is first set to Null; loc_list is non-NULL here, so the loop
  // below runs at least once and calls SetPacked_int() on it.
89  seqloc->SetNull();
90  for (loc = loc_list; loc; loc = loc->next) {
  // NOTE(review): the Seq-id lookup does not depend on loc and is repeated
  // for every interval.
91  seqloc->SetPacked_int().AddInterval(
92  sequence::GetId(query, scope),
93  loc->ssr->left, loc->ssr->right);
94  }
95 
96  return seqloc;
97 }
98 
99 /** Convert a list of mask locations to a CSeq_loc object.
 * Convenience overload: forwards the query's seqloc and scope members to the
 * three-argument s_BlastSeqLoc2CSeqloc.
100  * @param query Query sequence location [in]
101  * @param loc_list List of mask locations [in]
102  * @return List of mask locations in a CSeq_loc form, or NULL if loc_list is
 * NULL (as returned by the three-argument overload).
103  */
104 static CSeq_loc*
106 {
107  return s_BlastSeqLoc2CSeqloc(*query.seqloc, &*query.scope, loc_list);
108 }
109 
110 /** Convert a list of mask locations to TMaskedQueryRegions.
 * The list is first converted to a CSeq_loc with s_BlastSeqLoc2CSeqloc and
 * the result is passed to PackedSeqLocToMaskedQueryRegions.
111  * @param query Query sequence location [in]
112  * @param scope Scope for use by object manager [in]
113  * @param loc_list List of mask locations [in]
114  * @param program type of blast search [in]
115  * @return List of mask locations in TMaskedQueryRegions form.
116  */
119  CScope * scope,
120  BlastSeqLoc * loc_list,
121  EBlastProgramType program)
122 {
  // sloc holds the freshly allocated Seq-loc. It is an empty reference when
  // loc_list is NULL, because the conversion returns NULL in that case.
  // NOTE(review): how PackedSeqLocToMaskedQueryRegions treats an empty
  // reference is not visible here - confirm.
123  CConstRef<CSeq_loc> sloc(s_BlastSeqLoc2CSeqloc(query, scope, loc_list));
124 
125  return PackedSeqLocToMaskedQueryRegions(sloc, program);
126 }
127 
128 
129 /// Build a list of BlastSeqLoc's from a set of Dense-seg contained in a
130 /// Seq-align-set.
131 ///
132 /// This function processes Dense-segs, and adds the range of each hit to
133 /// a list of BlastSeqLoc structures. The first two strand entries are
134 /// compared to decide which end of the alignment supplies each endpoint. All of the
135 /// HSPs should refer to the same query; both the query and subject in
136 /// the HSP are ignored. This is used to construct a set of filtered
137 /// areas from hits against a repeats database.
/// Exactly one BlastSeqLoc is added per alignment in the set.
138 ///
139 /// @param alignment Seq-align-set containing Dense-segs which specify the
140 /// ranges of hits. [in]
141 /// @param locs Filtered areas for this query are added here. [out]
142 
143 static void
145  BlastSeqLoc ** locs)
146 {
147  ITERATE(CSeq_align_set::Tdata, itr, alignment.Get()) {
148  _ASSERT((*itr)->GetSegs().IsDenseg());
149  const CDense_seg& seg = (*itr)->GetSegs().GetDenseg();
150  const int kNumSegments = seg.GetNumseg();
  // kNumDim is declared only when _DEBUG is set; it is referenced solely
  // inside _ASSERT expressions below.
151 #if _DEBUG /* to eliminate compiler warning in release mode */
152  const int kNumDim = seg.GetDim();
153 #endif
154  _ASSERT(kNumDim == 2);
155 
156  const CDense_seg::TStarts& starts = seg.GetStarts();
157  const CDense_seg::TLens& lengths = seg.GetLens();
158  const CDense_seg::TStrands& strands = seg.GetStrands();
  // NOTE(review): these size checks are assertions only, while strands[0],
  // strands[1] and the starts/lengths elements are read unconditionally
  // below - confirm the inputs always carry them.
159  _ASSERT(kNumSegments*kNumDim == (int) starts.size());
160  _ASSERT(kNumSegments == (int) lengths.size());
161  _ASSERT(kNumSegments*kNumDim == (int) strands.size());
162 
163  int left(0), right(0);
164 
  // With two rows per segment, index 0 and index (kNumSegments-1)*2 are the
  // first-row starts of the first and last segment respectively.
165  if (strands[0] == strands[1]) {
  // Same strand: range runs from the first segment's start to the last
  // segment's end.
166  left = starts.front();
167  right = starts[(kNumSegments-1)*2] + lengths[kNumSegments-1] - 1;
168  } else {
  // Different strands: range runs from the last segment's start to the
  // first segment's end.
169  left = starts[(kNumSegments-1)*2];
170  right = starts.front() + lengths.front() - 1;
171  }
172 
173  BlastSeqLocNew(locs, left, right);
174  }
175 }
176 
177 /** Fills the mask locations in the query SSeqLoc structures, as if it was a
178  * lower case mask, given the results of a BLAST search against a database of
179  * repeats.
 * Queries whose result has no alignments keep their current mask unchanged.
 * For the others the mask is replaced by a Seq-loc built from the previous
 * mask locations plus the hit ranges.
180  * @param query Vector of query sequence locations structures [in] [out]
181  * @param results alignments returned from a BLAST search against a repeats
182  * database; results[i] is used for query[i] [in]
183  */
184 static void
186  const CSearchResultSet& results)
187 {
188  _ASSERT(results.GetNumResults() == query.size());
189 
190  for (size_t query_index = 0; query_index < query.size(); ++query_index) {
191  const CSearchResults& result = results[query_index];
192 
  // No hits for this query: leave its mask as it is.
193  if (result.GetSeqAlign().Empty() || result.GetSeqAlign()->IsEmpty()) {
194  continue;
195  }
196 
197  // Get the previous mask locations
198  BlastSeqLoc* loc_list = CSeqLoc2BlastSeqLoc(query[query_index].mask);
199 
200  // Find all HSP intervals in query
201 /* DELME
202  ITERATE(CSeq_align_set::Tdata, alignment, result.GetSeqAlign()->Get()) {
203  _ASSERT((*alignment)->GetSegs().IsDisc());
204  s_SeqAlignToBlastSeqLoc((*alignment)->GetSegs().GetDisc(),
205  &loc_list);
206  }
207 */
208  s_SeqAlignToBlastSeqLoc(*(result.GetSeqAlign()), &loc_list);
209 
210 
211  // Make the intervals unique
213  BlastSeqLoc* ordered_loc_list = loc_list;
214  loc_list = NULL;
215 
216  /* Create a CSeq_loc with these locations and fill it for the
217  respective query */
  // NOTE(review): ordered_loc_list is a raw list released only by the
  // BlastSeqLocFree call below; if this conversion throws, it is not freed.
218  CRef<CSeq_loc> filter_seqloc(s_BlastSeqLoc2CSeqloc(query[query_index],
219  ordered_loc_list));
220 
221  // Free the combined mask list in the BlastSeqLoc form.
222  ordered_loc_list = BlastSeqLocFree(ordered_loc_list);
223 
  // Replace the query's mask with the combined locations.
224  query[query_index].mask.Reset(filter_seqloc);
225  }
226 }
227 
228 /** Fills the mask locations in the BlastSearchQuery structures, as if it was a
229  * lower case mask, given the results of a BLAST search against a database of
230  * repeats.
 * Queries whose result has no alignments keep their current masked regions.
 * For the others the masked regions are replaced by a set built from the
 * previous per-frame mask lists plus the hit ranges.
231  * @param query Vector of queries [in] [out]
232  * @param results alignments returned from a BLAST search against a repeats
233  * database; results[i] is used for query i [in]
234  * @param program type of blast search [in]
235  */
236 static void
238  const CSearchResultSet& results,
239  EBlastProgramType program)
240 {
241  _ASSERT(results.GetNumResults() == query.Size());
242 
243  for (size_t qindex = 0; qindex < query.Size(); ++qindex) {
244  const CSearchResults& result = results[qindex];
245 
  // No hits for this query: leave its masked regions as they are.
246  if (result.GetSeqAlign().Empty() || result.GetSeqAlign()->IsEmpty()) {
247  continue;
248  }
249 
250  // Get the previous mask locations
251  TMaskedQueryRegions mqr = query.GetMaskedRegions(qindex);
252 
254  (new CBlastQueryFilteredFrames(program, mqr));
255 
256  typedef set<CSeqLocInfo::ETranslationFrame> TFrameSet;
257  const TFrameSet& used = frames->ListFrames();
258 
259  BlastSeqLoc* loc_list = 0;
260 
261  ITERATE(TFrameSet, itr, used) {
262  // Every frame listed by ListFrames() is visited; its list is appended to loc_list.
263  int pframe = *itr;
264 
  // Take the frame's list and call Release() for that frame
  // (presumably so that frames no longer owns it - confirm).
265  BlastSeqLoc* locs1 = *(*frames)[pframe];
266  frames->Release(pframe);
267 
268  BlastSeqLoc ** pplast = & loc_list;
269 
  // Walk to the terminating next-pointer of loc_list so that the
  // frame's list can be spliced onto the end.
270  while(*pplast) {
271  pplast = & (*pplast)->next;
272  }
273 
274  *pplast = locs1;
275  }
276 
277  // Find all HSP intervals in query
278 /* DELME
279  ITERATE(CSeq_align_set::Tdata, alignment, result.GetSeqAlign()->Get()) {
280  _ASSERT((*alignment)->GetSegs().IsDisc());
281  s_SeqAlignToBlastSeqLoc((*alignment)->GetSegs().GetDisc(),
282  &loc_list);
283  }
284 */
285  s_SeqAlignToBlastSeqLoc(*(result.GetSeqAlign()), &loc_list);
286 
287  // Make the intervals unique
289  BlastSeqLoc* ordered_loc_list = loc_list;
290  loc_list = NULL;
291 
292  /* Convert these locations to TMaskedQueryRegions for the
293  respective query */
294 
  // NOTE(review): ordered_loc_list is a raw list released only by the
  // BlastSeqLocFree call below; if this conversion throws, it is not freed.
295  TMaskedQueryRegions filter_seqloc =
296  s_BlastSeqLoc2MaskedRegions(*query.GetQuerySeqLoc(qindex),
297  query.GetScope(qindex),
298  ordered_loc_list,
299  program);
300 
301  // Free the combined mask list in the BlastSeqLoc form.
302  ordered_loc_list = BlastSeqLocFree(ordered_loc_list);
303 
  // Replace the query's masked regions with the combined set.
304  query.SetMaskedRegions(qindex, filter_seqloc);
305  }
306 }
307 
308 /// Create an options handle with the defaults set for a search for repeats.
/// Dust filtering is switched off on the handle, and an assertion checks that
/// no MB index is reported as loaded before the handle is returned.
/// @return The options handle wrapped in a CRef<CBlastOptionsHandle>.
309 static
311 {
321  opts->SetDustFiltering(false); // FIXME, is this correct?
323  // just to be safe (SB-1067)
324  _ASSERT(opts->GetOptions().GetMBIndexLoaded() == false);
325  return CRef<CBlastOptionsHandle>(opts);
326 }
327 
/// Entry point that decides from the options handle whether repeat filtering
/// should run. It returns without doing anything when opts_handle is not a
/// CBlastNucleotideOptionsHandle (a NULL handle also fails the cast) or when
/// that handle reports repeat filtering as disabled.
/// @param opts_handle Options handle to inspect [in]
328 void
330  const CBlastOptionsHandle* opts_handle)
331 {
  // dynamic_cast yields NULL for any handle that is not a nucleotide
  // options handle.
332  const CBlastNucleotideOptionsHandle* nucl_handle =
333  dynamic_cast<const CBlastNucleotideOptionsHandle*>(opts_handle);
334 
335  // Either non-blastn search or repeat filtering not desired.
336  if (nucl_handle == NULL || nucl_handle->GetRepeatFiltering() == false)
337  return;
338 
340 }
341 
/// Runs a local BLAST search of the queries against the repeats database
/// named by filter_db. The queries' masks are cleared for the duration of the
/// search and put back afterwards.
342 void
344 {
345  const CSearchDatabase target_db(filter_db,
347 
349 
350  // Remove any lower case masks, because they should not be used for the
351  // repeat locations search.
  // The original masks are kept in lcase_mask_v, index-aligned with query.
352  vector< CRef<CSeq_loc> > lcase_mask_v;
353  lcase_mask_v.reserve(query.size());
354 
355  for (unsigned int index = 0; index < query.size(); ++index) {
356  lcase_mask_v.push_back(query[index].mask);
357  query[index].mask.Reset(NULL);
358  }
359 
  // NOTE(review): there is no try/catch or guard object around the search;
  // if construction or Run() throws, the masks cleared above are not
  // restored - confirm callers can tolerate that.
360  CRef<IQueryFactory> query_factory(new CObjMgr_QueryFactory(query));
361  CLocalBlast blaster(query_factory, repeat_opts, target_db);
362  CRef<CSearchResultSet> results = blaster.Run();
363 
364  // Restore the lower case masks
365  for (unsigned int index = 0; index < query.size(); ++index) {
366  query[index].mask.Reset(lcase_mask_v[index]);
367  }
368 
369  // Extract the repeat locations and combine them with the previously
370  // existing mask in queries.
372 }
373 
/// Runs a local BLAST search against the repeats database named by filter_db
/// for the given query vector. The search uses a temporary copy of the
/// queries that carries no masks; the caller's vector is not modified by the
/// search set-up itself.
/// @param queries Query vector [in]
/// @param filter_db Name of the repeats database to search [in]
374 void
375 Blast_FindRepeatFilterLoc(CBlastQueryVector& queries, const char* filter_db)
376 {
377  const CSearchDatabase target_db(filter_db,
379 
381 
382  // Remove any lower case masks, because they should not be used for the
383  // repeat locations search.
  // Instead of clearing masks in place, each query is rebuilt from its
  // Seq-loc and scope with an empty TMaskedQueryRegions.
384  CBlastQueryVector temp_queries;
385  for (size_t i = 0; i < queries.Size(); ++i) {
386  TMaskedQueryRegions no_masks;
388  (new CBlastSearchQuery(*queries.GetQuerySeqLoc(i),
389  *queries.GetScope(i), no_masks));
390  temp_queries.AddQuery(query);
391  }
392 
  // The search runs on the mask-free copy.
393  CRef<IQueryFactory> query_factory(new CObjMgr_QueryFactory(temp_queries));
394  CLocalBlast blaster(query_factory, repeat_opts, target_db);
395  CRef<CSearchResultSet> results = blaster.Run();
396 
397  // Extract the repeat locations and combine them with the previously
398  // existing mask in queries.
400 }
402 END_SCOPE(blast)
404 
405 /* @} */
Contains C++ wrapper classes to structures in algo/blast/core as well as some auxiliary functions to ...
BLAST filtering functions.
#define REPEATS_SEARCH_GAP_OPEN
Default gap opening cost.
Definition: blast_filter.h:61
#define REPEATS_SEARCH_WORD_SIZE
Default word size.
Definition: blast_filter.h:63
#define REPEATS_SEARCH_REWARD
Default match reward.
Definition: blast_filter.h:60
#define REPEATS_SEARCH_XDROP_FINAL
Default X-dropoff for gapped extension with traceback.
Definition: blast_filter.h:66
#define REPEATS_SEARCH_PENALTY
Default mismatch penalty.
Definition: blast_filter.h:59
void BlastSeqLocCombine(BlastSeqLoc **mask_loc, Int4 link_value)
Go through all mask locations in one sequence and combine any that overlap, deallocating the unneeded...
Definition: blast_filter.c:972
#define REPEATS_SEARCH_GAP_EXTEND
Default gap extension cost.
Definition: blast_filter.h:62
BlastSeqLoc * BlastSeqLocFree(BlastSeqLoc *loc)
Deallocate all BlastSeqLoc objects in a chain.
Definition: blast_filter.c:737
BlastSeqLoc * BlastSeqLocNew(BlastSeqLoc **head, Int4 from, Int4 to)
Create and initialize a new sequence interval.
Definition: blast_filter.c:608
#define REPEAT_MASK_LINK_VALUE
Largest gap allowed to be filled between repeat mask intervals.
Definition: blast_filter.h:72
#define REPEATS_SEARCH_MINSCORE
Default score cutoff.
Definition: blast_filter.h:58
#define REPEATS_SEARCH_XDROP_UNGAPPED
Default X-dropoff for ungapped extension.
Definition: blast_filter.h:64
Structures and API used for saving BLAST hits.
Declares the CBlastNucleotideOptionsHandle class.
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm.
Definition: blast_program.h:72
@ eBlastTypeBlastn
Definition: blast_program.h:74
Declaration of ADT to retrieve sequences for the BLAST engine.
Internal auxiliary setup classes/functions for C++ BLAST APIs.
Definitions of special type used in BLAST.
ncbi::TMaskedQueryRegions mask
Handle to the nucleotide-nucleotide options to the BLAST algorithm.
Handle to the options to the BLAST algorithm.
Collection of BlastSeqLoc lists for filtering processing.
Query Vector.
Definition: sseqloc.hpp:276
void AddQuery(CRef< CBlastSearchQuery > q)
Add a query to the set.
Definition: sseqloc.hpp:293
CRef< objects::CScope > GetScope(size_type i) const
Get the scope containing a query by index.
Definition: sseqloc.hpp:322
size_type Size() const
Returns the number of queries found in this query vector.
Definition: sseqloc.hpp:305
CConstRef< objects::CSeq_loc > GetQuerySeqLoc(size_type i) const
Get the query Seq-loc for a query by index.
Definition: sseqloc.hpp:313
Search Query.
Definition: sseqloc.hpp:147
Class to perform a BLAST search on local BLAST databases Note that PHI-BLAST can be run using this cl...
Definition: local_blast.hpp:62
NCBI C++ Object Manager dependent implementation of IQueryFactory.
CScope –.
Definition: scope.hpp:92
Blast Search Subject.
Search Results for All Queries.
Search Results for One Query.
Collection of masked regions for a single query sequence.
Definition: seqlocinfo.hpp:113
Definition: set.hpp:45
static void s_SeqAlignToBlastSeqLoc(const CSeq_align_set &alignment, BlastSeqLoc **locs)
Build a list of BlastSeqLoc's from a set of Dense-seg contained in a Seq-align-set.
BlastSeqLoc * CSeqLoc2BlastSeqLoc(const objects::CSeq_loc *slp)
Converts a CSeq_loc into a BlastSeqLoc structure used in NewBlast.
Definition: blast_aux.cpp:539
TMaskedQueryRegions PackedSeqLocToMaskedQueryRegions(CConstRef< objects::CSeq_loc > sloc, EBlastProgramType program, bool assume_both_strands=false)
Auxiliary function to convert a Seq-loc describing masked query regions to a TMaskedQueryRegions obje...
void SetMatchReward(int r)
Sets MatchReward.
CRef< CSearchResultSet > Run()
Executes the search.
void SetTraditionalBlastnDefaults()
Sets TraditionalBlastnDefaults.
void SetMismatchPenalty(int p)
Sets MismatchPenalty.
static CRef< CBlastOptionsHandle > s_CreateRepeatsSearchOptions()
Create an options handle with the defaults set for a search for repeats.
static CSeq_loc * s_BlastSeqLoc2CSeqloc(const CSeq_loc &query, CScope *scope, BlastSeqLoc *loc_list)
Convert a list of mask locations to a CSeq_loc object.
const CBlastOptions & GetOptions() const
Return the object which this object is a handle for.
void SetXDropoff(double x)
Sets XDropoff.
bool GetRepeatFiltering() const
Is repeat filtering enabled?
TMaskedQueryRegions s_BlastSeqLoc2MaskedRegions(const CSeq_loc &query, CScope *scope, BlastSeqLoc *loc_list, EBlastProgramType program)
Convert a list of mask locations to TMaskedQueryRegions.
void Blast_FindRepeatFilterLoc(TSeqLocVector &query_loc, const CBlastOptionsHandle *opts_handle)
Finds repeats locations for a given set of sequences.
void SetGapXDropoffFinal(double x)
Sets GapXDropoffFinal.
void SetGapExtensionCost(int e)
Sets GapExtensionCost.
bool GetMBIndexLoaded() const
const char * GetRepeatFilteringDB() const
Get the repeat filtering database.
void SetWordSize(int ws)
Sets WordSize.
static void s_FillMaskLocFromBlastResults(TSeqLocVector &query, const CSearchResultSet &results)
Fills the mask locations in the query SSeqLoc structures, as if it was a lower case mask,...
void SetDustFiltering(bool val)
Enable dust filtering.
void SetCutoffScore(int s)
Sets CutoffScore.
void SetGapOpeningCost(int g)
Sets GapOpeningCost.
size_type GetNumResults() const
Return the number of results contained by this object.
@ eBlastDbIsNucleotide
nucleotide
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NULL
Definition: ncbistd.hpp:225
void SetPacked_int(TPacked_int &v)
Definition: Seq_loc.hpp:984
void SetNull(void)
Override all setters to incorporate cache invalidation.
Definition: Seq_loc.hpp:960
const CSeq_id & GetId(const CSeq_loc &loc, CScope *scope)
If all CSeq_ids embedded in CSeq_loc refer to the same CBioseq, returns the first CSeq_id found,...
TObjectType * Release(void)
Release a reference to the object and return a pointer to the object.
Definition: ncbiobj.hpp:846
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define USING_SCOPE(ns)
Use the specified namespace.
Definition: ncbistl.hpp:78
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
vector< TSeqPos > TLens
Definition: Dense_seg_.hpp:108
const TStarts & GetStarts(void) const
Get the Starts member data.
Definition: Dense_seg_.hpp:530
vector< ENa_strand > TStrands
Definition: Dense_seg_.hpp:109
const TLens & GetLens(void) const
Get the Lens member data.
Definition: Dense_seg_.hpp:555
vector< TSignedSeqPos > TStarts
Definition: Dense_seg_.hpp:107
TDim GetDim(void) const
Get the Dim member data.
Definition: Dense_seg_.hpp:421
TNumseg GetNumseg(void) const
Get the Numseg member data.
Definition: Dense_seg_.hpp:465
list< CRef< CSeq_align > > Tdata
const TStrands & GetStrands(void) const
Get the Strands member data.
Definition: Dense_seg_.hpp:580
const Tdata & Get(void) const
Get the member data.
int i
Main class to perform a BLAST search on the local machine.
NOTE: This file contains work in progress and the APIs are likely to change, please do not rely on th...
C++ implementation of repeats filtering for C++ BLAST.
Implementation of the BlastSeqSrc interface using the C++ BLAST databases API.
vector< SSeqLoc > TSeqLocVector
Vector of sequence locations.
Definition: sseqloc.hpp:129
Used to hold a set of positions, mostly used for filtering.
Definition: blast_def.h:204
SSeqRange * ssr
location data on the sequence.
Definition: blast_def.h:206
struct BlastSeqLoc * next
next in linked list
Definition: blast_def.h:205
Structure to represent a single sequence to be fed to BLAST.
Definition: sseqloc.hpp:47
Int4 left
left endpoint of range (zero based)
Definition: blast_def.h:156
Int4 right
right endpoint of range (zero based)
Definition: blast_def.h:157
static string query
#define _ASSERT
else result
Definition: token2.c:20
Modified on Fri Jun 14 16:55:07 2024 by modify_doxy.py rev. 669887