NCBI C++ ToolKit
dust_filter.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: dust_filter.cpp 72378 2016-05-04 14:59:01Z camacho $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Tom Madden
27  *
28  * Initial Version Creation Date: June 20, 2005
29  *
30  *
31  * */
32 
33 /// @file dust_filter.cpp
34 /// Calls sym dust lib in algo/dustmask and returns CSeq_locs for use by BLAST.
35 #include <ncbi_pch.hpp>
36 #include "dust_filter.hpp"
37 #include <serial/iterator.hpp>
39 #include <objmgr/util/sequence.hpp>
43 
44 #include <objmgr/seq_vector.hpp>
45 
47 
48 #include <string.h>
49 
50 /** @addtogroup AlgoBlast
51  *
52  * @{
53  */
54 
57 BEGIN_SCOPE(blast)
58 
/// Finds dust locations for a set of query sequences, taking the DUST
/// parameters from a nucleotide options handle.
///
/// Returns without doing anything when @a nucl_handle is NULL or when its
/// GetDustFiltering() reports false. Otherwise it forwards to the
/// (queries, level, window, linker) form of Blast_FindDustFilterLoc, passing
/// the values of GetDustFilteringLevel(), GetDustFilteringWindow() and
/// GetDustFilteringLinker().
///
/// @param nucl_handle
///   Options handle the DUST settings are read from; may be NULL.
///
/// NOTE(review): the line holding the function name and the first parameter
/// is absent from this listing (its numbering skips 60). The cross-reference
/// index gives the signature as
/// Blast_FindDustFilterLoc(TSeqLocVector& queries,
///                         const CBlastNucleotideOptionsHandle* nucl_handle)
/// -- confirm against the real file.
59 void
61  const CBlastNucleotideOptionsHandle* nucl_handle)
62 {
63  // Either non-blastn search or dust filtering not desired.
64  if (nucl_handle == NULL || nucl_handle->GetDustFiltering() == false)
65  return;
66 
 // Delegate to the overload that takes the three DUST parameters explicitly.
 // NOTE(review): the index lists these getters as returning int while the
 // explicit-parameter overloads declare Uint4, so the values would be
 // converted implicitly -- confirm that negative settings cannot occur.
67  Blast_FindDustFilterLoc(queries, nucl_handle->GetDustFilteringLevel(),
68  nucl_handle->GetDustFilteringWindow(),
69  nucl_handle->GetDustFilteringLinker());
70 }
71 
72 /// Auxiliary function to create CSeq_loc_Mapper from a copy of the target
73 /// Seq-loc.
///
/// Builds a "whole" Seq-loc for @a query_id and constructs a CSeq_loc_Mapper
/// from that whole-sequence location, the target Seq-loc and the scope.
///
/// @param target_seqloc
///   Second Seq-loc given to the mapper; asserted to be non-NULL.
/// @param scope
///   Scope handed to the mapper; asserted to be non-NULL.
///
/// NOTE(review): the summary above says "copy", but the visible code passes
/// @a target_seqloc to the mapper through a const_cast and makes no copy --
/// confirm which is intended.
/// NOTE(review): the lines holding the return type, the function name and the
/// query_id parameter, and the line that starts the return statement, are
/// absent from this listing (its numbering skips 74-75 and 86). The
/// cross-reference index gives the signature as
/// static CRef<CSeq_loc_Mapper> s_CreateSeqLocMapper(CSeq_id& query_id,
///     const CSeq_loc* target_seqloc, CScope* scope).
76  const CSeq_loc* target_seqloc,
77  CScope* scope)
78 {
79  _ASSERT(target_seqloc);
80  _ASSERT(scope);
81 
82  // Create a Seq-loc for the entire query sequence
83  CRef<CSeq_loc> entire_slp(new CSeq_loc);
84  entire_slp->SetWhole().Assign(query_id);
85 
 // The mapper is constructed from the whole-sequence location and the target
 // location; presumably it is wrapped in the returned CRef on the missing
 // line -- confirm.
87  (new CSeq_loc_Mapper(*entire_slp,
88  const_cast<CSeq_loc&>(*target_seqloc),
89  scope));
90 }
91 
/// Runs the symmetric DUST masker over one query's sequence data and folds
/// the resulting intervals into the mask that already exists for that query.
///
/// Steps, as visible in the body:
///  1. Mask @a data with a CSymDustMasker built from level/window/linker.
///  2. If nothing was masked, return and leave @a orig_query_mask untouched.
///  3. Turn every masked interval into a Seq-loc on @a query_id and map the
///     collection through the mapper from s_CreateSeqLocMapper.
///  4. Combine the mapped masks with @a orig_query_mask (or install them if
///     there was no usable mask yet).
///  5. Reset the mask if the result is Null/Empty; otherwise normalise it to
///     a packed-int Seq-loc.
///
/// @param seqloc           Query Seq-loc handed to s_CreateSeqLocMapper as
///                         the mapping target.
/// @param scope            Scope used for mapping and for the length lookup.
/// @param query_id         Id the DUST intervals are expressed on.
/// @param orig_query_mask  In/out: existing mask on entry, combined mask on
///                         exit (reset to an empty CRef if nothing remains).
/// @param level,window,linker  Parameters passed to CSymDustMasker.
///
/// NOTE(review): the line holding the function name and the `data` parameter
/// is absent from this listing (its numbering skips 92). The cross-reference
/// index gives the signature as
/// void s_CombineDustMasksWithUserProvidedMasks(CSeqVector& data,
///     CConstRef<CSeq_loc> seqloc, CRef<CScope> scope,
///     CRef<CSeq_id> query_id, CRef<CSeq_loc>& orig_query_mask,
///     Uint4 level, Uint4 window, Uint4 linker).
93  CConstRef<CSeq_loc> seqloc,
94  CRef<CScope> scope,
95  CRef<CSeq_id> query_id,
96  CRef<CSeq_loc>& orig_query_mask,
97  Uint4 level, Uint4 window, Uint4 linker)
98 {
99  CSymDustMasker duster(level, window, linker);
100 
101  CRef<CPacked_seqint> masked_locations =
102  duster.GetMaskedInts(*query_id, data);
 // Copies the interval list out of the packed-int result.
103  CPacked_seqint::Tdata locs = masked_locations->Get();
 // No low-complexity regions found: the existing mask stays as it is.
104  if (locs.empty()) {
105  return;
106  }
107 
 // Re-express every masked interval as its own Seq-loc on query_id and
 // accumulate them in a single Seq-loc.
108  CRef<CSeq_loc> query_masks(new CSeq_loc);
109  ITERATE(CPacked_seqint::Tdata, masked_loc, locs) {
110  CRef<CSeq_loc> seq_interval(new CSeq_loc(*query_id,
111  (*masked_loc)->GetFrom(),
112  (*masked_loc)->GetTo()));
113  query_masks->Add(*seq_interval);
114  }
115 
 // NOTE(review): installs a diagnostic post filter with the string
 // "!(1305.31)"; presumably this silences a message emitted while mapping.
 // Nothing visible in this function restores the previous filter -- confirm
 // that this is intended.
116  SetDiagFilter(eDiagFilter_Post, "!(1305.31)");
117  CRef<CSeq_loc_Mapper> mapper = s_CreateSeqLocMapper(*query_id, seqloc,
118  scope);
 // Replace the raw DUST intervals with their mapped counterparts.
119  query_masks.Reset(mapper->Map(*query_masks));
120 
 // NOTE(review): kTopFlags is defined on a line absent from this listing
 // (numbering skips 121); the index mentions fStrand_Ignore and fMerge_All,
 // so it is presumably a combination of those -- confirm.
122  if (orig_query_mask.NotEmpty() && !orig_query_mask->IsNull()) {
 // A usable mask already exists: combine it with the DUST masks and keep
 // the newly returned Seq-loc.
123  CRef<CSeq_loc> tmp = orig_query_mask->Add(*query_masks, kTopFlags, 0);
124  orig_query_mask.Reset(tmp);
125  } else {
 // NOTE(review): the index describes CSeq_loc::Merge as a const function
 // that creates and returns a new Seq-loc. Its return value is discarded
 // here, so query_masks appears to be stored unmerged -- confirm whether
 // the result should be assigned.
126  query_masks->Merge(kTopFlags, 0);
127  orig_query_mask.Reset(query_masks);
128  }
129 
 // Nothing left to mask: hand back an empty CRef instead of a Null/Empty
 // Seq-loc.
130  if (orig_query_mask->IsNull() || orig_query_mask->IsEmpty()) {
131  orig_query_mask.Reset();
132  return;
133  }
134 
135  // in the event this happens, change to Seq-interval so that
136  // CSeq_loc::ChangeToPackedInt can process it
137  if (orig_query_mask->IsWhole()) {
138  orig_query_mask.Reset
139  (new CSeq_loc(*query_id, 0,
140  sequence::GetLength(*query_id, scope) -1));
141  }
 // Normalise the final mask to the packed-int variant.
142  orig_query_mask->ChangeToPackedInt();
143  _ASSERT(orig_query_mask->IsPacked_int());
144 }
145 
/// Applies DUST masking with explicit parameters to each query in a
/// collection whose elements expose `seqloc`, `scope` and `mask` members.
///
/// For every element `query` the body builds a CSeqVector over the query's
/// Seq-loc and scope, takes the Seq-loc's id, and passes
/// query->scope, query_id, query->mask and the three DUST parameters to a
/// callee, so query->mask can be updated in place.
///
/// @param level,window,linker  DUST parameters forwarded for every query.
///
/// NOTE(review): several lines are absent from this listing (its numbering
/// skips 147, 151, 154 and 157): the function name and first parameter, the
/// loop header, the remaining CSeqVector constructor arguments, and the start
/// of the call. Presumably this is
/// Blast_FindDustFilterLoc(TSeqLocVector& queries, ...) looping over the
/// vector and calling s_CombineDustMasksWithUserProvidedMasks -- confirm
/// against the real file.
146 void
148  Uint4 level, Uint4 window, Uint4 linker)
149 {
150 
152  {
153  CSeqVector data(*query->seqloc, *query->scope,
 // The id is obtained as a pointer-to-const and cast to non-const so it can
 // be held in a CRef<CSeq_id>.
155  CRef<CSeq_id> query_id
156  (const_cast<CSeq_id*>(query->seqloc->GetId()));
158  query->scope, query_id,
159  query->mask, level, window,
160  linker);
161  }
162 
163 }
164 
/// Applies DUST masking with explicit parameters to every query of an
/// index-addressable query container (Size(), GetQuerySeqLoc(i), GetScope(i),
/// GetMasks(i), SetMaskedRegions(i, ...)).
///
/// For each index the body builds a CSeqVector over the query's Seq-loc and
/// scope, fetches the current masks with GetMasks(i), passes them with the
/// DUST parameters to a callee, and, if the masks CRef is non-empty
/// afterwards, stores a TMaskedQueryRegions value back with
/// SetMaskedRegions(i, ...).
///
/// @param level,window,linker  DUST parameters forwarded for every query.
///
/// NOTE(review): several lines are absent from this listing (its numbering
/// skips 166, 172, 177 and 183): the function name and first parameter, the
/// remaining CSeqVector constructor arguments, the start of the call, and
/// the expression that initialises `mqr`. Presumably this is
/// Blast_FindDustFilterLoc(CBlastQueryVector& queries, ...), the call is to
/// s_CombineDustMasksWithUserProvidedMasks, and `mqr` comes from
/// PackedSeqLocToMaskedQueryRegions -- confirm against the real file.
165 void
167  Uint4 level, Uint4 window, Uint4 linker)
168 {
169  for(size_t i = 0; i < queries.Size(); i++)
170  {
171  CSeqVector data(*queries.GetQuerySeqLoc(i), *queries.GetScope(i),
173 
 // The id is obtained as a pointer-to-const and cast to non-const so it can
 // be held in a CRef<CSeq_id>.
174  CRef<CSeq_id> query_id
175  (const_cast<CSeq_id*>(queries.GetQuerySeqLoc(i)->GetId()));
 // Local CRef to the current masks; it is passed to the call below and
 // checked again afterwards.
176  CRef<CSeq_loc> masks = queries.GetMasks(i);
178  queries.GetQuerySeqLoc(i),
179  queries.GetScope(i), query_id,
180  masks, level, window, linker);
 // Only write back when a mask is present after the call.
181  if (masks.NotEmpty()) {
182  TMaskedQueryRegions mqr =
184  queries.SetMaskedRegions(i, mqr);
185  }
186  }
187 }
188 
189 END_SCOPE(blast)
191 
192 /* @} */
Declares the CBlastNucleotideOptionsHandle class.
@ eBlastTypeBlastn
Definition: blast_program.h:74
Definitions of special type used in BLAST.
Handle to the nucleotide-nucleotide options to the BLAST algorithm.
Query Vector.
Definition: sseqloc.hpp:276
CRef< objects::CScope > GetScope(size_type i) const
Get the scope containing a query by index.
Definition: sseqloc.hpp:322
size_type Size() const
Returns the number of queries found in this query vector.
Definition: sseqloc.hpp:305
CConstRef< objects::CSeq_loc > GetQuerySeqLoc(size_type i) const
Get the query Seq-loc for a query by index.
Definition: sseqloc.hpp:313
void SetMaskedRegions(size_type i, TMaskedQueryRegions mqr)
Assign a list of masked regions to one query.
Definition: sseqloc.hpp:350
CRef< objects::CSeq_loc > GetMasks(size_type i) const
Convenience method to get a CSeq_loc representing the masking locations.
Definition: sseqloc.hpp:341
CScope –.
Definition: scope.hpp:92
CSeqVector –.
Definition: seq_vector.hpp:65
CSeq_loc_Mapper –.
Looks for low complexity parts of sequences according to the symmetric version of DUST algorithm.
Definition: symdust.hpp:61
CRef< objects::CPacked_seqint > GetMaskedInts(objects::CSeq_id &seq_id, const sequence_type &seq)
Mask a sequence and return result as a CPacked_seqint instance.
Definition: symdust.cpp:309
Collection of masked regions for a single query sequence.
Definition: seqlocinfo.hpp:113
Calls sym dust lib in algo/dustmask and returns CSeq_locs for use by BLAST.
static char tmp[3200]
Definition: utf8.c:42
char data[12]
Definition: iconv.c:80
int GetDustFilteringLinker() const
Get linker parameter for dust.
static CRef< CSeq_loc_Mapper > s_CreateSeqLocMapper(CSeq_id &query_id, const CSeq_loc *target_seqloc, CScope *scope)
Auxiliary function to create CSeq_loc_Mapper from a copy of the target Seq-loc.
Definition: dust_filter.cpp:75
TMaskedQueryRegions PackedSeqLocToMaskedQueryRegions(CConstRef< objects::CSeq_loc > sloc, EBlastProgramType program, bool assume_both_strands=false)
Auxiliary function to convert a Seq-loc describing masked query regions to a TMaskedQueryRegions obje...
bool GetDustFiltering() const
Is dust filtering enabled?
int GetDustFilteringLevel() const
Get level parameter for dust.
void s_CombineDustMasksWithUserProvidedMasks(CSeqVector &data, CConstRef< CSeq_loc > seqloc, CRef< CScope > scope, CRef< CSeq_id > query_id, CRef< CSeq_loc > &orig_query_mask, Uint4 level, Uint4 window, Uint4 linker)
Definition: dust_filter.cpp:92
int GetDustFilteringWindow() const
Get window parameter for dust.
void Blast_FindDustFilterLoc(TSeqLocVector &queries, const CBlastNucleotideOptionsHandle *nucl_handle)
Finds dust locations for a given set of sequences by calling the symmetric dust lib.
Definition: dust_filter.cpp:60
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define NULL
Definition: ncbistd.hpp:225
void SetDiagFilter(EDiagFilter what, const char *filter_str)
Set diagnostic filter.
Definition: ncbidiag.cpp:7673
@ eDiagFilter_Post
for all non-TRACE, non-FATAL
Definition: ncbidiag.hpp:2530
void SetWhole(TWhole &v)
Definition: Seq_loc.hpp:982
void ChangeToPackedInt(void)
Works only if location is currently an interval, point, packed-int (handled trivially),...
Definition: Seq_loc.cpp:3670
CRef< CSeq_loc > Merge(TOpFlags flags, ISynonymMapper *syn_mapper) const
All functions create and return a new seq-loc object.
Definition: Seq_loc.cpp:5037
void Add(const CSeq_loc &other)
Simple adding of seq-locs.
Definition: Seq_loc.cpp:3875
@ fStrand_Ignore
Definition: Seq_loc.hpp:325
@ fMerge_All
Definition: Seq_loc.hpp:331
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
CRef< CSeq_loc > Map(const CSeq_loc &src_loc)
Map seq-loc.
@ eCoding_Iupac
Set coding to printable coding (Iupacna or Iupacaa)
void Reset(void)
Reset reference object.
Definition: ncbiobj.hpp:773
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define USING_SCOPE(ns)
Use the specified namespace.
Definition: ncbistl.hpp:78
#define END_SCOPE(ns)
End the previously defined scope.
Definition: ncbistl.hpp:75
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
#define BEGIN_SCOPE(ns)
Define a new scope.
Definition: ncbistl.hpp:72
bool IsEmpty(void) const
Check if variant Empty is selected.
Definition: Seq_loc_.hpp:516
list< CRef< CSeq_interval > > Tdata
bool IsPacked_int(void) const
Check if variant Packed_int is selected.
Definition: Seq_loc_.hpp:534
bool IsWhole(void) const
Check if variant Whole is selected.
Definition: Seq_loc_.hpp:522
bool IsNull(void) const
Check if variant Null is selected.
Definition: Seq_loc_.hpp:504
int i
vector< SSeqLoc > TSeqLocVector
Vector of sequence locations.
Definition: sseqloc.hpp:129
static string query
#define _ASSERT
Modified on Fri Sep 20 14:57:28 2024 by modify_doxy.py rev. 669887