NCBI C++ ToolKit
vectorscreen.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: vectorscreen.cpp 47479 2023-05-02 13:24:02Z ucko $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors: Yoon Choi
27  */
28 
29 
30 #include <ncbi_pch.hpp>
31 
32 ////@begin includes
33 ////@end includes
34 
36 #include <objmgr/scope.hpp>
37 #include <objmgr/bioseq_ci.hpp>
38 
41 
46 
48 
51 
52 #include <wx/app.h>
53 #include <wx/dir.h>
54 #include <wx/filename.h>
55 
56 #include <sstream>
57 
58 ////@begin XPM images
59 ////@end XPM images
60 
63 USING_SCOPE(blast);
64 
65 
66 // Relative path to blast databases
// Both branches name the same database (UniVec_Core inside a screen directory);
// only the directory separator differs: a backslash when NCBI_OS_MSWIN is
// defined, a forward slash otherwise.
// NOTE(review): the base directory this path is relative to is not shown in
// this listing - confirm at the point of use.
67 #if defined(NCBI_OS_MSWIN)
68 const string kUniVecDatabase = "screen\\UniVec_Core";
69 #else
70 const string kUniVecDatabase = "screen/UniVec_Core";
71 #endif
72 
73 
74 
75 /*!
76  * CVectorScreen constructor. The body below is empty: no statements run inside the braces.
77  */
78 
80 {
81 }
82 
83 
84 /*!
85  * CVectorScreen destructor. The body below is empty: nothing is released explicitly here.
86  */
87 
89 {
90 }
91 
93 {
    // Returns true when GetSeqDb() hands back a non-null CSeqDB reference,
    // false otherwise.
    // NOTE(review): db is declared on a line that is not visible in this
    // listing - presumably a search-database object built from
    // kUniVecDatabase; confirm against the full file.
95  CRef<CSeqDB> refdb = db.GetSeqDb();
96  return refdb.NotNull();
97 }
98 
/*!
 * Screens the sequences under a top-level entry against the UniVec database.
 *
 * Visible steps: store seh in m_TopSeqEntry, build a CScope, collect one
 * whole-sequence CSeq_loc per iterated bioseq into m_vecSearchLocs, then for
 * each location run a CLocalBlast search with options created for the
 * vecscreen task (hit list size 100), post-process the first result with
 * CVecscreen, and record every match whose type is not eNoMatch.
 *
 * \param seh       Entry handle; saved in m_TopSeqEntry and its scope is added
 *                  to the local scope.
 * \param progress  Gauge whose range is set to the number of search locations
 *                  and whose value is advanced by one after each search.
 *
 * Throws std::runtime_error when objmgr evaluates to false.
 *
 * NOTE(review): several declarations used below (objmgr, the loader
 * registration, bioseq_ci, opts, db, query, summary and the insert call that
 * produces ins_res) sit on lines that are not visible in this listing; the
 * comments here describe only what the visible lines show.
 */
99 void CVectorScreen::Run(objects::CSeq_entry_Handle seh,
100  wxGauge& progress)
101 {
102  // Our toplevel blob
103  m_TopSeqEntry = seh;
104 
105  // Set up objmgr and scope
107  if (!objmgr) {
108  throw std::runtime_error("Could not initialize object manager");
109  }
110  CRef<CScope> scope( new CScope(*objmgr) );
111 
112  // Add UniVec database to scope
    // The loader is attached to the scope by its name; how blast_loader is
    // obtained is on the hidden lines.
113  CDataLoader* blast_loader =
118  scope->AddDataLoader(blast_loader->GetName());
119 
120  // Add toplevel blob to our scope
121  scope->AddScope(m_TopSeqEntry.GetScope());
122 
123  // Iterate over all nuc bioseqs and collect Seq_locs
124  typedef vector< CRef<CSeq_loc> > TSearchLocVec;
    // Results of any earlier call are discarded before collecting again.
125  m_vecSearchLocs.clear();
127  for( ; bioseq_ci; ++bioseq_ci ) {
128  CBioseq_Handle bioseq_handle = *bioseq_ci;
129 
130  // Select an appropriate seqid
131  CRef<CSeq_id> selected_seq_id = x_SelectSeqId(bioseq_handle);
    // NOTE(review): x_SelectSeqId returns an empty CRef when it finds no
    // usable id, and this ASSERT is the only guard before the dereference
    // below - confirm ASSERT is active in the builds that ship, or that a
    // null id cannot occur here.
132  ASSERT( !selected_seq_id.IsNull() );
133 
134  // Search on the "whole" bioseq
135  CRef<CSeq_loc> search_loc( new CSeq_loc );
136  search_loc->SetWhole(*selected_seq_id);
137 
138  m_vecSearchLocs.push_back( search_loc );
139  }
140 
141  // Run blast program on UniVec database and post-process via vecscreen
142  m_vecscreen_summary.clear();
145  opts(CBlastOptionsFactory::CreateTask("vecscreen"));
146  opts->SetHitlistSize(100);
147 
    // One gauge step per search location.
148  progress.SetRange(static_cast<int>(m_vecSearchLocs.size()));
149  int progressCount = 0;
150 
152  NON_CONST_ITERATE( TSearchLocVec, searchLocIter, m_vecSearchLocs ) {
153  CRef<CSeq_loc> search_loc = *searchLocIter;
154 
155  // Blast against UniVec
    // A fresh blaster is constructed for every location, with that single
    // location pushed into query.
157  SSeqLoc ssl(*search_loc, *scope);
158  query.push_back(ssl);
159  CRef<IQueryFactory> query_factory(new CObjMgr_QueryFactory(query));
160 
161  CLocalBlast blaster(query_factory, opts, db);
162  CRef<CSearchResultSet> results;
163  results = blaster.Run();
164 
165  // Vecscreen post-process
166  TSeqPos seqlen = sequence::GetLength(*search_loc, scope);
167  CRef< CSeq_align_set > seq_align_set;
    // NOTE(review): the first result's alignment is dereferenced without a
    // null check - presumably it can be empty when nothing matched; confirm.
168  CVecscreen vecscreen(*((*results)[0].GetSeqAlign()), seqlen);
    // seq_align_set is assigned here but not read again in the visible lines.
169  seq_align_set = vecscreen.ProcessSeqAlign();
170 
171  const list<CVecscreen::AlnInfo*>* aln_info =
172  vecscreen.GetAlnInfoList();
173 
174  // Alignment graphic view - html output references gif files
175  wxString resPath = CSysPath::GetResourcePath();
176  vecscreen.SetImagePath( resPath.ToStdString() + string("/") );
    // The HTML page is assembled in ssAlignGraphic; the stream is not read
    // again in the lines visible in this listing.
177  stringstream ssAlignGraphic;
178  ssAlignGraphic << "<HTML>"
179  << "<TITLE>BLAST Search Results</TITLE>"
180  << "<BODY BGCOLOR=\"#FFFFFF\" LINK=\"#0000FF\" VLINK=\"#660099\" ALINK=\"#660099\" >"
181  << "<PRE>";
182  vecscreen.VecscreenPrint(ssAlignGraphic);
183  ssAlignGraphic << "</PRE>"
184  << "</BODY>"
185  << "</HTML>";
186 
187  // Produce summary
188  list<CVecscreen::AlnInfo*>::const_iterator itr = aln_info->begin();
189  for ( ; itr != aln_info->end(); ++itr )
190  {
    // Entries classified as eNoMatch contribute nothing to either summary.
191  if ((*itr)->type == CVecscreen::eNoMatch)
192  continue;
193 
194  // Fill in standard summary
196  summary.seqid = search_loc->GetId();
197  summary.range = (*itr)->range;
198  summary.match_type = CVecscreen::GetStrengthString((*itr)->type);
199  m_vecscreen_summary.push_back(summary);
200 
201  // Fill in summary organized around seqid
    // A one-element result is prepared first; it is what gets inserted when
    // the key is new.
202  SRangeMatch rangematch;
203  rangematch.m_range = summary.range;
204  rangematch.m_match_type = summary.match_type;
205  vector<SRangeMatch> rangematchArr;
206  rangematchArr.push_back(rangematch);
207 
208  SVecscreenResult vecres(summary.seqid->AsFastaString(),
209  rangematchArr, seqlen);
210 
211  pair<TVecscreenSummaryBySeqid::iterator, bool> ins_res =
213  vecres));
214  if ( !ins_res.second )
215  {
216  // If insert failed, add res to existing container.
217  // (Add any "suspect" to the front of the container.)
    // This reference shadows the outer vecres and refers to the entry
    // already stored under the key.
218  SVecscreenResult& vecres = ins_res.first->second;
219  if ( NStr::FindNoCase(rangematch.m_match_type, "suspect") != NPOS )
220  {
221  vecres.m_arrRangeMatch.insert(vecres.m_arrRangeMatch.begin(), rangematch);
222  }
223  else
224  {
225  vecres.m_arrRangeMatch.push_back(rangematch);
226  }
227  }
228  }
229 
230  progress.SetValue(++progressCount);
    // Presumably lets the application process pending events between
    // searches - confirm against the wxWidgets documentation.
231  wxTheApp->Yield();
232  }
233 }
234 
235 
// Read-only access to m_vecscreen_summary, the flat list that Run() clears and
// then appends one entry to for every match it records.
236 const vector<CVecscreenRun::SVecscreenSummary>&
238 {
239  return m_vecscreen_summary;
240 }
241 
242 
245 {
    // Hands back the m_seqidSummary member unchanged.
    // NOTE(review): presumably this is the per-seqid container filled in by
    // Run(); the insert itself is on a line not visible here - confirm.
246  return m_seqidSummary;
247 }
248 
249 
251 {
    // Picks one Seq-id for the given bioseq handle by trying three lookups in a
    // fixed order (accession, then local id, then general id) and returning
    // the first non-empty result. Returns an empty CRef when the handle cannot
    // supply ids or when all three lookups come back empty.
252  if ( !bioseq_handle.CanGetId() ) {
253  return CRef<CSeq_id>();
254  }
255 
256  // Try accession
257  CRef<CSeq_id> seqid = x_FindAccession(bioseq_handle);
258  if ( seqid )
259  {
260  return seqid;
261  }
262 
263  // Try localid
264  seqid = x_FindLocalId(bioseq_handle);
265  if ( seqid )
266  {
267  return seqid;
268  }
269 
270  // Try type general
271  seqid = x_FindTypeGeneral(bioseq_handle);
272  if ( seqid )
273  {
274  return seqid;
275  }
276 
    // Nothing usable was found.
277  return CRef<CSeq_id>();
278 }
279 
280 
282 {
    // Walks the handle's ids in order and returns a newly allocated CSeq_id
    // constructed from the accession string of the first id that has a
    // text-seq-id with a gettable accession. Returns an empty CRef when no id
    // qualifies.
283  const CBioseq_Handle::TId& ids = bioseq_handle.GetId();
284  ITERATE( CBioseq_Handle::TId, id_itr, ids ) {
285  const CSeq_id_Handle& id_handle = *id_itr;
286  CConstRef<CSeq_id> id = id_handle.GetSeqIdOrNull();
287 
    // Handles without an underlying Seq-id are skipped.
288  if ( !id )
289  {
290  continue;
291  }
292 
293  const CTextseq_id* textseq_id = id->GetTextseq_Id();
294  if ( textseq_id != NULL && textseq_id->CanGetAccession() ) {
295  // Found accession
296  return CRef<CSeq_id>(new CSeq_id(textseq_id->GetAccession()));
297  }
298  }
299 
300  return CRef<CSeq_id>();
301 }
302 
303 
305 {
    // Walks the handle's ids in order and returns a newly allocated CSeq_id
    // constructed from the General variant of the first id for which
    // IsGeneral() is true. Returns an empty CRef when there is none.
306  const CBioseq_Handle::TId& ids = bioseq_handle.GetId();
307  ITERATE( CBioseq_Handle::TId, id_itr, ids ) {
308  const CSeq_id_Handle& id_handle = *id_itr;
309  CConstRef<CSeq_id> id = id_handle.GetSeqIdOrNull();
310 
    // Handles without an underlying Seq-id are skipped.
311  if ( !id )
312  {
313  continue;
314  }
315 
316  if ( id->IsGeneral() )
317  {
318  // Found type general
319  return CRef<CSeq_id>(new CSeq_id(id->GetGeneral()));
320  }
321  }
322 
323  return CRef<CSeq_id>();
324 }
325 
326 
328 {
    // Walks the handle's ids in order looking for one whose Local variant is
    // selected, then branches on whether that local object id holds an integer
    // (IsId) or a string (IsStr). Returns an empty CRef when the loop finishes
    // without returning.
    // NOTE(review): the first half of each statement inside the two branches
    // is on a line not visible in this listing - presumably each returns a new
    // local CSeq_id built from the integer or string value; confirm.
329  const CBioseq_Handle::TId& ids = bioseq_handle.GetId();
330  ITERATE( CBioseq_Handle::TId, id_itr, ids ) {
331  const CSeq_id_Handle& id_handle = *id_itr;
332  CConstRef<CSeq_id> id = id_handle.GetSeqIdOrNull();
333 
    // Handles without an underlying Seq-id are skipped.
334  if ( !id )
335  {
336  continue;
337  }
338 
339  if ( id->IsLocal() )
340  {
341  // Found localid
342  const CObject_id& localid = id->GetLocal();
343  if ( localid.IsId() )
344  {
346  localid.GetId()));
347  }
348  else
349  if ( localid.IsStr() )
350  {
352  localid.GetStr()));
353  }
    // A local id that is neither integer nor string falls through and the
    // scan continues with the next id.
354  }
355  }
356 
357  return CRef<CSeq_id>();
358 }
359 
360 
362 
Data loader implementation that uses the blast databases.
Declares the CBlastNucleotideOptionsHandle class.
Declares the CBlastOptionsHandle and CBlastOptionsFactory classes.
CBioseq_CI –.
Definition: bioseq_ci.hpp:69
CBioseq_Handle –.
static TRegisterLoaderInfo RegisterInObjectManager(CObjectManager &om, const string &dbname="nr", const EDbType dbtype=eUnknown, bool use_fixed_size_slices=true, CObjectManager::EIsDefault is_default=CObjectManager::eNonDefault, CObjectManager::TPriority priority=CObjectManager::kPriority_NotSet)
Definition: bdbloader.cpp:52
@ eNucleotide
nucleotide database
Definition: bdbloader.hpp:58
Class to perform a BLAST search on local BLAST databases Note that PHI-BLAST can be run using this cl...
Definition: local_blast.hpp:62
NCBI C++ Object Manager dependent implementation of IQueryFactory.
CScope –.
Definition: scope.hpp:92
Blast Search Subject.
static wxString GetResourcePath(void)
the shared resource path corresponds to the <res> alias above.
Definition: sys_path.cpp:189
Example:
void SetImagePath(string path)
Set path to pre-made image gif files with different colors.
const list< AlnInfo * > * GetAlnInfoList() const
return alignment info list
void VecscreenPrint(CNcbiOstream &out)
show alignment graphic view
static const string & GetStrengthString(MatchType match_type)
Returns a string concerning the strength of the match for a given enum value.
CRef< objects::CSeq_align_set > ProcessSeqAlign(void)
Process alignment to show.
vector< CRef< CSeq_loc > > TSearchLocVec
void Run(objects::CSeq_entry_Handle seh, wxGauge &progress)
CRef< CSeq_id > x_FindTypeGeneral(CBioseq_Handle bioseq_handle)
CVectorScreen()
Constructors.
CRef< CSeq_id > x_SelectSeqId(CBioseq_Handle bioseq_handle)
vector< CVecscreenRun::SVecscreenSummary > m_vecscreen_summary
static bool IsDbAvailable()
TSearchLocVec m_vecSearchLocs
const vector< CVecscreenRun::SVecscreenSummary > & GetVecscreenSummary() const
CRef< CSeq_id > x_FindAccession(CBioseq_Handle bioseq_handle)
objects::CSeq_entry_Handle m_TopSeqEntry
const TVecscreenSummaryBySeqid & GetVecscreenSummaryBySeqid() const
CRef< CSeq_id > x_FindLocalId(CBioseq_Handle bioseq_handle)
~CVectorScreen()
Destructor.
TVecscreenSummaryBySeqid m_seqidSummary
iterator_bool insert(const value_type &val)
Definition: map.hpp:165
void clear()
Definition: map.hpp:169
CRef< CSeqDB > GetSeqDb() const
Obtain a reference to the database.
CRef< CSearchResultSet > Run()
Executes the search.
void SetHitlistSize(int s)
Sets HitlistSize.
static CBlastOptionsHandle * CreateTask(string task, EAPILocality locality=CBlastOptions::eLocal)
Creates an options handle object configured with default options for the requested task,...
@ eBlastDbIsNucleotide
nucleotide
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NON_CONST_ITERATE(Type, Var, Cont)
Non constant version of ITERATE macro.
Definition: ncbimisc.hpp:822
#define NULL
Definition: ncbistd.hpp:225
const string AsFastaString(void) const
Definition: Seq_id.cpp:2266
CConstRef< CSeq_id > GetSeqIdOrNull(void) const
void SetWhole(TWhole &v)
Definition: Seq_loc.hpp:982
const CSeq_id * GetId(void) const
Get the id of the location; returns NULL if the location has multiple ids or no id at all.
Definition: Seq_loc.hpp:941
TSeqPos GetLength(const CSeq_id &id, CScope *scope)
Get sequence length if scope not null, else return max possible TSeqPos.
TLoader * GetLoader(void) const
Get pointer to the loader.
void AddDataLoader(const string &loader_name, TPriority pri=kPriority_Default)
Add data loader by name.
Definition: scope.cpp:510
static CRef< CObjectManager > GetInstance(void)
Return the existing object manager or create one.
string GetName(void) const
void AddScope(CScope &scope, TPriority pri=kPriority_Default)
Add the scope's datasources as a single group with the given priority All data sources (data loaders ...
Definition: scope.cpp:516
@ kPriority_NotSet
Deprecated: use kPriority_Default instead.
vector< CSeq_id_Handle > TId
bool CanGetId(void) const
const TId & GetId(void) const
bool NotNull(void) const THROWS_NONE
Check if pointer is not null – same effect as NotEmpty().
Definition: ncbiobj.hpp:744
bool IsNull(void) const THROWS_NONE
Check if pointer is null – same effect as Empty().
Definition: ncbiobj.hpp:735
#define END_NCBI_SCOPE
End previously defined NCBI scope.
Definition: ncbistl.hpp:103
#define BEGIN_NCBI_SCOPE
Define ncbi namespace.
Definition: ncbistl.hpp:100
static SIZE_TYPE FindNoCase(const CTempString str, const CTempString pattern, SIZE_TYPE start, SIZE_TYPE end, EOccurrence which=eFirst)
Find the pattern in the specified range of a string using a case insensitive search.
Definition: ncbistr.cpp:2993
#define NPOS
Definition: ncbistr.hpp:133
bool IsStr(void) const
Check if variant Str is selected.
Definition: Object_id_.hpp:291
bool IsId(void) const
Check if variant Id is selected.
Definition: Object_id_.hpp:264
const TStr & GetStr(void) const
Get the variant data.
Definition: Object_id_.hpp:297
TId GetId(void) const
Get the variant data.
Definition: Object_id_.hpp:270
bool IsGeneral(void) const
Check if variant General is selected.
Definition: Seq_id_.hpp:877
bool IsLocal(void) const
Check if variant Local is selected.
Definition: Seq_id_.hpp:775
bool CanGetAccession(void) const
Check if it is safe to call GetAccession method.
const TGeneral & GetGeneral(void) const
Get the variant data.
Definition: Seq_id_.cpp:369
const TAccession & GetAccession(void) const
Get the Accession member data.
@ e_Local
local use
Definition: Seq_id_.hpp:95
@ eMol_na
just a nucleic acid
Definition: Seq_inst_.hpp:113
USING_SCOPE(objects)
const string kUniVecDatabase
Main class to perform a BLAST search on the local machine.
#define ASSERT
macro for assert.
Definition: ncbi_std.h:107
The Object manager core.
NOTE: This file contains work in progress and the APIs are likely to change, please do not rely on th...
vector< SSeqLoc > TSeqLocVector
Vector of sequence locations.
Definition: sseqloc.hpp:129
const CSeq_id * seqid
Seq-id of query.
CRange< TSeqPos > range
range of match.
string match_type
Categorizes strength of match.
string m_match_type
CRange< TSeqPos > m_range
Structure to represent a single sequence to be fed to BLAST.
Definition: sseqloc.hpp:47
vector< SRangeMatch > m_arrRangeMatch
static string query
Modified on Thu Apr 11 15:03:43 2024 by modify_doxy.py rev. 669887