NCBI C++ ToolKit
kblastapi.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: kblastapi.cpp 100101 2023-06-15 14:10:29Z merezhuk $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Tom Madden
27  *
28  * File Description:
29  * API for KMER BLAST searches
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbiapp.hpp>
43 #include <math.h>
44 
47 
49 
54 
57 USING_SCOPE(blast);
58 
59 /// Places all the needed subject sequences into a scope.
60 /// The database loader (or cache perhaps?) does not seem to be thread safe, so it
61 /// is avoided: every Bioseq is fetched from the CSeqDB and added to the scope directly.
/// @param resultSet KMER results for all queries; the Seq-id of every match is collected.
/// @param scope     Scope that receives one Bioseq per collected OID.
/// @param seqdb     Database used to translate Seq-ids/GIs to OIDs and to fetch the Bioseqs.
62 static void
64 {
65  int numSearches=static_cast<int>(resultSet->GetNumQueries());
66  vector<int> oid_v; // OIDs of the matches of all queries, in discovery order.
67  for (int index=0; index<numSearches; index++)
68  {
69  CBlastKmerResults& results = (*resultSet)[index];
70  TBlastKmerScoreVector scores = results.GetScores(); // Copies the vector (GetScores returns a const reference).
// NOTE(review): oid is uninitialized and the bool results of GiToOid/SeqidToOid
// are ignored below, so a failed lookup presumably pushes an indeterminate
// (or stale) value into oid_v -- confirm.
71  int oid;
72  for(TBlastKmerScoreVector::const_iterator iter=scores.begin(); iter != scores.end(); ++iter)
73  {
74  CRef<CSeq_id> sid = (*iter).first; // Each score entry is a (Seq-id, score) pair.
75  if (sid->IsGi())
76  seqdb->GiToOid(sid->GetGi(), oid);
77  else
78  seqdb->SeqidToOid(*sid, oid);
79  oid_v.push_back(oid);
80  }
81  }
82  if (oid_v.size() == 0) // Nothing to be loaded.
83  return;
84 
// NOTE(review): the OIDs are sorted but not de-duplicated; a subject matched by
// several queries would presumably be fetched and added more than once (AddBioseq
// defaults to eExist_Throw) -- verify whether duplicates can occur here.
85  sort(oid_v.begin(), oid_v.end());
86 
87  for(vector<int>::iterator iter=oid_v.begin(); iter!=oid_v.end(); ++iter)
88  {
89  CRef<CBioseq> bioseq = seqdb->GetBioseq(*iter);
90  scope->AddBioseq(*bioseq);
91  }
92  return;
93 }
94 
96 {
// Body of s_AddNewResultSet(resultSet, myResultSet): appends every element of
// myResultSet to resultSet, preserving the order of myResultSet.
97  for (CSearchResultSet::iterator iter=myResultSet->begin(); iter!=myResultSet->end(); ++iter)
98  {
99  resultSet->push_back(*iter);
100  }
101 }
102 
104 CRef<CLocalDbAdapter> dbAdapter, TSearchMessages& msg_vec)
105 {
// Builds a CSearchResultSet holding, for every query, an alignment entry and
// placeholder ancillary data; used when no BLAST search is actually run.
// msg_vec is modified: entries are appended in the inner loop and it is finally
// resized to the number of result entries.
106  // Search was not run, but we send back an empty CSearchResultSet.
107  CRef<ILocalQueryData> local_query_data = qf->MakeLocalQueryData(&opts);
108  vector< CConstRef<objects::CSeq_id> > seqid_vec;
109  vector< CRef<CBlastAncillaryData> > ancill_vec;
110  TSeqAlignVector sa_vec;
111  size_t index;
112  EResultType res_type = eDatabaseSearch;
113  unsigned int num_subjects = 0;
// A non-empty adapter that is neither a BLAST database nor in db-scan mode is
// treated as a sequence comparison; the subject count then comes from the
// adapter's sequence-info source.
114  if (dbAdapter.NotEmpty() && !dbAdapter->IsBlastDb() && !dbAdapter->IsDbScanMode()) {
115  res_type = eSequenceComparison;
116  IBlastSeqInfoSrc * subject_infosrc = dbAdapter->MakeSeqInfoSrc(); // Ownership of this pointer is not visible here.
117  if(subject_infosrc != NULL) {
118  num_subjects = static_cast<unsigned int>(subject_infosrc->Size());
119  }
120  }
121  for (index=0; index<local_query_data->GetNumQueries(); index++)
122  {
123  CConstRef<objects::CSeq_id> query_id(local_query_data->GetSeq_loc(index)->GetId());
124  TQueryMessages q_msg; // Unused while the two lines below stay commented out.
125  /// FIXME, PROBLEM??
126  // local_query_data->GetQueryMessages(index, q_msg);
127  // msg_vec.push_back(q_msg);
128  seqid_vec.push_back(query_id);
// NOTE: the declaration of tmp_align is not visible in this listing.
130  sa_vec.push_back(tmp_align);
131  pair<double, double> tmp_pair(-1.0, -1.0); // Placeholder values (presumably "not computed" -- confirm).
132  CRef<CBlastAncillaryData> tmp_ancillary_data(new CBlastAncillaryData(tmp_pair, tmp_pair, tmp_pair, 0));
133  ancill_vec.push_back(tmp_ancillary_data);
134 
// One entry was already pushed above, so starting at 1 adds num_subjects-1 more
// entries for this query (nothing is added when num_subjects is 0 or 1).
135  for(unsigned int i =1; i < num_subjects; i++)
136  {
137  TQueryMessages msg;
138  msg_vec.push_back(msg);
139  seqid_vec.push_back(query_id);
// NOTE: the declaration of this tmp_align is likewise not visible in this listing.
141  sa_vec.push_back(tmp_align);
142  CRef<CBlastAncillaryData> tmp_ancillary_data(new CBlastAncillaryData(tmp_pair, tmp_pair, tmp_pair, 0));
143  ancill_vec.push_back(tmp_ancillary_data);
144  }
145  }
// msg_vec received no entry for the first result of each query (the push above is
// commented out), so it is resized to match the number of result entries.
146  msg_vec.resize(seqid_vec.size()); // FIXME
147  CRef<CSearchResultSet> result_set(new CSearchResultSet(seqid_vec, sa_vec, msg_vec, ancill_vec, 0, res_type));
148  return result_set;
149 }
150 
151 /////////////////////////////////////////////////////////////////////////////
152 // Perform a KMER search then a BLAST search.
// For every query the KMER search selects candidate subjects; queries with
// candidates are then searched with CLocalBlast against just those subjects,
// queries without candidates get a separately built result set. All per-query
// results are appended to one CSearchResultSet, which is returned.
153 
155 {
// NOTE: the declaration of seqdb is not visible in this listing.
157  seqdb->SetNumberOfThreads(1, true);
158 
// NOTE(review): the statement guarded by this `if` is not visible in this listing;
// presumably it sets the DB length from the database (SetDbLength/GetTotalLength)
// -- confirm against the original file.
159  if (m_OptsHandle->GetDbLength() == 0)
161 
// NOTE(review): the result of this dynamic_cast is used without a NULL check.
162  CBlastpKmerOptionsHandle* kmerOptHndl = dynamic_cast<CBlastpKmerOptionsHandle*> (&*m_OptsHandle);
163  CRef<CBlastKmerOptions> opts(new CBlastKmerOptions()); // KMER specific options.
164  opts->SetThresh(kmerOptHndl->GetThresh());
165  opts->SetMinHits(kmerOptHndl->GetMinHits());
166  opts->SetNumTargetSeqs(kmerOptHndl->GetCandidateSeqs());
167 
168  CObjMgr_QueryFactory* objmgr_qf = NULL;
169  TSeqLocVector tsl_v;
// Queries are only extracted when the factory is a CObjMgr_QueryFactory;
// otherwise tsl_v stays empty and objmgr_qf stays NULL.
170  if ( (objmgr_qf = dynamic_cast<CObjMgr_QueryFactory*>(&*m_QueryFactory)) )
171  {
172  tsl_v = objmgr_qf->GetTSeqLocVector();
173  _ASSERT(!tsl_v.empty());
174  }
175  CRef<CBlastKmer> blastkmer(new CBlastKmer(tsl_v, opts, seqdb));
// A positive GI list takes precedence over a negative one.
176  if (!m_GIList.Empty())
177  blastkmer->SetGiListLimit(m_GIList);
178  else if(!m_NegGIList.Empty())
179  blastkmer->SetGiListLimit(m_NegGIList);
180 
181  CRef<CBlastKmerResultsSet> resultSet = blastkmer->RunSearches();
182 
183  //FIXME: check if all are same or use all?
// NOTE(review): objmgr_qf is dereferenced unconditionally although it is NULL when
// the cast above fails; only the first extracted scope is used from here on.
184  vector< CRef<CScope> > scope_v = objmgr_qf->ExtractScopes();
185  s_GetSequencesIntoScope(resultSet, scope_v[0], seqdb);
186 
187  int numSearches=static_cast<int>(resultSet->GetNumQueries());
188  CRef<CSearchResultSet> search_results(new CSearchResultSet());
189  for (int index=0; index<numSearches; index++)
190  {
191  CBlastKmerResults& results = (*resultSet)[index];
// Report KMER-stage warnings/errors for this query on stderr.
192  if (results.HasErrors() || results.HasWarnings())
193  {
// NOTE: the declaration of msg is not visible in this listing.
195  string queryId = msg.GetQueryId();
196  for(TQueryMessages::const_iterator iter=msg.begin(); iter != msg.end(); ++iter)
197  {
198  cerr << queryId << " " << (*iter)->GetMessage() << '\n';
199  }
200  }
201  const TBlastKmerScoreVector& scores = results.GetScores();
202 
// Candidates found: run BLAST for this single query against only those subjects.
203  if (scores.size() > 0)
204  {
205  TSeqLocVector subjectTSL;
206  results.GetTSL(subjectTSL, scope_v[0]);
207 
208  TSeqLocVector query_vector_temp;
209  query_vector_temp.push_back(tsl_v[index]);
210  CRef<IQueryFactory> qfactory(new CObjMgr_QueryFactory(query_vector_temp));
211 
212  BlastSeqSrc* seq_src = MultiSeqBlastSeqSrcInit(subjectTSL, eBlastTypeBlastp, true); // Third argument enables dbscan_mode.
213  CRef<IBlastSeqInfoSrc> seqinfo_src(new CSeqVecSeqInfoSrc(subjectTSL));
214 
215  CLocalBlast lcl_blast(qfactory, CRef<CBlastOptionsHandle> (m_OptsHandle.GetPointer()), seq_src, seqinfo_src);
216  CRef<CSearchResultSet> my_blast_results = lcl_blast.Run();
217  s_AddNewResultSet(search_results, my_blast_results);
// seq_src is released manually; NOTE(review): it would not be freed if
// lcl_blast.Run() exits via an exception -- confirm whether that can happen.
218  seq_src = BlastSeqSrcFree(seq_src);
219  }
220  else
221  {
// No candidates: collect any warnings for this query and append a result set
// built without running BLAST.
222  TSeqLocVector query_vector_temp;
223  query_vector_temp.push_back(tsl_v[index]);
224  CRef<IQueryFactory> qfactory(new CObjMgr_QueryFactory(query_vector_temp));
225  TSearchMessages msg;
226  if (results.HasErrors() || results.HasWarnings())
227  {
228  TQueryMessages qmsg = results.GetErrors(eBlastSevWarning);
229  msg.push_back(qmsg);
230  }
// NOTE: the declaration of my_blast_results is not visible in this listing;
// presumably it is produced by s_MakeEmptyResults from qfactory and msg -- confirm.
232  s_AddNewResultSet(search_results, my_blast_results);
233  }
234  }
235 
236  return search_results;
237 }
238 
Declares the CBlastAdvancedProteinOptionsHandle class.
@ eBlastSevWarning
Definition: blast_message.h:57
@ eBlastTypeBlastp
Definition: blast_program.h:73
Declares the CBlastProteinOptionsHandle class.
Declares CBlastScopeSource class to create properly configured CScope objects to invoke the BLAST dat...
BlastSeqSrc * BlastSeqSrcFree(BlastSeqSrc *seq_src)
Frees the BlastSeqSrc structure by invoking the destructor function set by the user-defined construct...
Definition: blast_seqsrc.c:112
vector< CRef< objects::CSeq_align_set > > TSeqAlignVector
Vector of Seq-align-sets.
EResultType
Specifies the style of Seq-aligns that should be built from the internal BLAST data structures.
@ eDatabaseSearch
Seq-aligns in the style of a database search.
@ eSequenceComparison
Seq-aligns in the BLAST 2 Sequence style (one alignment per query-subject pair)
vector< pair< CRef< CSeq_id >, double > > TBlastKmerScoreVector
Vector of pairs of seq-ids and scores.
Class used to return ancillary data from a blast search, i.e.
Class of options for the KMER search.
void SetNumTargetSeqs(int matches)
Sets the number of matches (subject sequences) to return.
void SetThresh(double thresh)
Set the threshold.
void SetMinHits(int minhits)
Set the minimum number of LSH hits to initiate a calculation of the Jaccard distance.
This class represents the results for one KMER search (one query).
void GetTSL(TSeqLocVector &tsl, CRef< CScope > scope) const
Get the results as a TSeqLocVector.
bool HasWarnings() const
Returns true if there are warnings among the results for this object.
TQueryMessages GetErrors(int min_severity=eBlastSevError) const
Accessor for the error/warning messages for this query.
const TBlastKmerScoreVector & GetScores() const
Get the vector of GIs and scores for the matches.
bool HasErrors() const
Returns true if there are errors among the results for this object.
CRef< IQueryFactory > m_QueryFactory
Holds the query seqloc and scope.
Definition: kblastapi.hpp:90
CRef< CSeqDBNegativeList > m_NegGIList
Negative GIList to limit search by.
Definition: kblastapi.hpp:103
CRef< CSeqDBGiList > m_GIList
GIList to limit search by.
Definition: kblastapi.hpp:99
CRef< CBlastpKmerOptionsHandle > m_OptsHandle
Options for KMER search.
Definition: kblastapi.hpp:93
CRef< CLocalDbAdapter > m_Database
Database to search.
Definition: kblastapi.hpp:96
CRef< CSearchResultSet > Run(void)
Run a KMER and then BLAST search.
Definition: kblastapi.cpp:154
Class to perform a KMER-BLASTP search.
Definition: blastkmer.hpp:72
Encapsulates ALL the BLAST algorithm's options.
Handle to the KMER BLASTP options.
Class to perform a BLAST search on local BLAST databases Note that PHI-BLAST can be run using this cl...
Definition: local_blast.hpp:62
NCBI C++ Object Manager dependent implementation of IQueryFactory.
CRef –.
Definition: ncbiobj.hpp:618
Search Results for All Queries.
CSeqDB.
Definition: seqdb.hpp:161
Uint8 GetTotalLength() const
Returns the sum of the lengths of all available sequences.
Definition: seqdb.cpp:685
@ eProtein
Definition: seqdb.hpp:174
bool SeqidToOid(const CSeq_id &seqid, int &oid) const
Translate a Seq-id to any matching OID.
Definition: seqdb.cpp:903
CRef< CBioseq > GetBioseq(int oid, TGi target_gi=ZERO_GI, const CSeq_id *target_seq_id=NULL) const
Get a CBioseq for a sequence.
Definition: seqdb.cpp:504
void SetNumberOfThreads(int num_threads, bool force_mt=false)
Setting the number of threads.
Definition: seqdb.cpp:1321
bool GiToOid(TGi gi, int &oid) const
Translate a GI to an OID.
Definition: seqdb.cpp:808
Implementation of the IBlastSeqInfoSrc interface to encapsulate retrieval of sequence identifiers and...
Abstract base class to encapsulate retrieval of sequence identifiers.
Class for the messages for an individual query sequence.
typedef for the messages for an entire BLAST search, which could be comprised of multiple query seque...
virtual CConstRef< objects::CSeq_loc > GetSeq_loc(size_t index)=0
Get the Seq_loc for the sequence indicated by index.
string GetDatabaseName() const
Returns the database name if appropriate, else kEmptyStr for subject sequences.
const_iterator begin() const
Returns const_iterator to beginning of container, provided to facilitate STL-style iteration.
void SetDbLength(Int8 len)
Sets DbLength.
int GetCandidateSeqs() const
Gets the max number of candidate matches to process with BLAST.
Int8 GetDbLength() const
Returns DbLength.
TSeqLocVector GetTSeqLocVector()
Retrieves the TSeqLocVector used to construct this object or a conversion of the CBlastQueryVector pr...
int GetMinHits() const
Returns the number of hits to initiate calculation of Jaccard distance.
CRef< CSearchResultSet > Run()
Executes the search.
CRef< ILocalQueryData > MakeLocalQueryData(const CBlastOptions *opts)
Creates and caches an ILocalQueryData.
Definition: query_data.cpp:52
bool IsBlastDb() const
Returns true if this object represents a BLAST database.
BlastSeqSrc * MultiSeqBlastSeqSrcInit(TSeqLocVector &seq_vector, EBlastProgramType program, bool dbscan_mode=false)
Initialize the sequence source structure.
string GetQueryId() const
Get the query id as a string.
Definition: blast_aux.cpp:972
const CBlastOptions & GetOptions() const
Return the object which this object is a handle for.
IBlastSeqInfoSrc * MakeSeqInfoSrc()
Retrieves or constructs the IBlastSeqInfoSrc.
vector< value_type >::iterator iterator
iterator type definition
virtual size_t GetNumQueries()=0
Get the number of queries.
virtual size_t Size() const =0
Returns the size of the underlying container of sequences.
void push_back(value_type &element)
Add a value to the back of this container.
vector< CRef< objects::CScope > > ExtractScopes()
Retrieve the CScope objects associated with the query sequences associated with this object.
const_iterator end() const
Returns const_iterator to end of container, provided to facilitate STL-style iteration.
double GetThresh() const
Returns threshold for Jaccard distance (range: 0-1)
bool IsDbScanMode() const
Returns true if this is not a database but is database scanning mode.
#define NULL
Definition: ncbistd.hpp:225
CBioseq_Handle AddBioseq(CBioseq &bioseq, TPriority pri=kPriority_Default, EExist action=eExist_Throw)
Add bioseq, return bioseq handle.
Definition: scope.cpp:530
TObjectType * GetPointer(void) THROWS_NONE
Get pointer.
Definition: ncbiobj.hpp:998
bool NotEmpty(void) const THROWS_NONE
Check if CRef is not empty – pointing to an object and has a non-null value.
Definition: ncbiobj.hpp:726
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
TGi GetGi(void) const
Get the variant data.
Definition: Seq_id_.hpp:889
bool IsGi(void) const
Check if variant Gi is selected.
Definition: Seq_id_.hpp:883
USING_SCOPE(objects)
CRef< CSearchResultSet > s_MakeEmptyResults(CRef< IQueryFactory > qf, const CBlastOptions &opts, CRef< CLocalDbAdapter > dbAdapter, TSearchMessages &msg_vec)
Definition: kblastapi.cpp:103
static void s_GetSequencesIntoScope(CRef< CBlastKmerResultsSet > resultSet, CRef< CScope > scope, CRef< CSeqDB > seqdb)
Places all the needed subject sequences into a scope.
Definition: kblastapi.cpp:63
void s_AddNewResultSet(CRef< CSearchResultSet > resultSet, CRef< CSearchResultSet > myResultSet)
Definition: kblastapi.cpp:95
USING_NCBI_SCOPE
Definition: kblastapi.cpp:55
int i
Main class to perform a BLAST search on the local machine.
constexpr auto sort(_Init &&init)
Defines the CNcbiApplication and CAppException classes for creating NCBI applications.
The Object manager core.
NOTE: This file contains work in progress and the APIs are likely to change, please do not rely on th...
Defines BLAST database access classes.
Defines a concrete strategy for the IBlastSeqInfoSrc interface for sequence identifiers retrieval fro...
Implementation of the BlastSeqSrc interface for a vector of sequence locations.
vector< SSeqLoc > TSeqLocVector
Vector of sequence locations.
Definition: sseqloc.hpp:129
Complete type definition of Blast Sequence Source ADT.
Definition: blast_seqsrc.c:43
#define _ASSERT
Uniform BLAST Search Interface.
Modified on Wed Nov 29 02:22:51 2023 by modify_doxy.py rev. 669887