NCBI C++ ToolKit
data4xml2format.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: data4xml2format.cpp 87258 2019-08-13 14:11:06Z fongah2 $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Amelia fong
27 */
28 
29 /** @file data4xml2format.cpp
30  * Produce data required for generating BLAST XML2 output
31  */
32 
33 #include <ncbi_pch.hpp>
36 #include <algo/blast/format/data4xml2format.hpp> /* NCBI_FAKE_WARNING */
37 
38 #ifndef SKIP_DOXYGEN_PROCESSING
40 USING_SCOPE(blast);
42 USING_SCOPE(align_format);
43 #endif
44 
// CCmdLineBlastXML2ReportData constructor: database search with a single
// result set (initializes m_isBl2seq = false, m_isIterative = false).
// NOTE(review): the declarator line(s) preceding this parameter list, and
// listing line 48, are not present in this listing.
47  const CSearchResults& results,
49  CRef<CScope> scope,
50  const vector<align_format::CAlignFormatUtil::SDbInfo> & dbsInfo) :
51  m_Query(query), m_Options (opts), m_Scope(scope),
52  m_DbName(kEmptyStr), m_NumSequences(0), m_NumBases(0),
53  m_TaxDBFound(false), m_isBl2seq(false), m_isIterative(false), m_Matrix(NULL)
54 {
 // Validate options and scope, set up the score matrix and the taxdb flag.
55  x_InitCommon(results, opts);
 // Accumulate database name(s), sequence count and base count.
56  x_InitDB(dbsInfo);
 // Retrieve the masked query regions into m_QueryMasks.
57  results.GetMaskedQueryRegions(m_QueryMasks);
 // Record alignments, ancillary data and messages for this result.
58  x_InitResults(results);
59 }
60 
// CCmdLineBlastXML2ReportData constructor: subject-sequence (bl2seq) search
// with a single result set (initializes m_isBl2seq = true,
// m_isIterative = false).
// NOTE(review): the declarator line(s) preceding this parameter list, and
// listing line 64, are not present in this listing.
63  const CSearchResults& results,
65  CRef<CScope> scope,
66  CConstRef<IBlastSeqInfoSrc> subjectsInfo) :
67  m_Query(query), m_Options (opts), m_Scope(scope),
68  m_DbName(kEmptyStr), m_NumSequences(0), m_NumBases(0),
69  m_TaxDBFound(false), m_isBl2seq(true), m_isIterative(false), m_Matrix(NULL)
70 {
 // Validate options and scope, set up the score matrix and the taxdb flag.
71  x_InitCommon(results, opts);
 // Collect one Seq-id string per subject sequence into m_SubjectIds.
72  x_InitSubjects(subjectsInfo);
 // Retrieve the masked query regions into m_QueryMasks.
73  results.GetMaskedQueryRegions(m_QueryMasks);
 // Record alignments, ancillary data and messages for this result.
74  x_InitResults(results);
75 }
76 
// CCmdLineBlastXML2ReportData constructor: database search over a result set
// holding several results (initializes m_isBl2seq = false,
// m_isIterative = true).
// NOTE(review): the declarator line(s) preceding this parameter list, and
// listing line 80, are not present in this listing.
79  const CSearchResultSet& resultSet,
81  CRef<CScope> scope,
82  const vector<CAlignFormatUtil::SDbInfo> & dbsInfo) :
83  m_Query(query), m_Options (opts), m_Scope(scope),
84  m_DbName(kEmptyStr), m_NumSequences(0), m_NumBases(0),
85  m_TaxDBFound(false), m_isBl2seq(false), m_isIterative(true), m_Matrix(NULL)
86 {
 // Common setup and query masks are taken from the first result only.
 // NOTE(review): resultSet[0] is read before any visible check that the
 // result set is non-empty - confirm callers guarantee at least one result.
87  x_InitCommon(resultSet[0], opts);
88  x_InitDB(dbsInfo);
89  resultSet[0].GetMaskedQueryRegions(m_QueryMasks);
 // One entry per result is appended to m_Alignments, m_AncillaryData and
 // m_Errors.
90  for(unsigned int i = 0; i < resultSet.size(); i++) {
91  x_InitResults(resultSet[i]);
92  }
93 }
94 
// CCmdLineBlastXML2ReportData constructor: subject-sequence (bl2seq) search
// over a result set holding several results (initializes m_isBl2seq = true,
// m_isIterative = true).
// NOTE(review): the declarator line(s) preceding this parameter list, and
// listing line 98, are not present in this listing.
97  const CSearchResultSet& resultSet,
99  CRef<CScope> scope,
100  CConstRef<IBlastSeqInfoSrc> subjectsInfo) :
101  m_Query(query), m_Options (opts), m_Scope(scope),
102  m_DbName(kEmptyStr), m_NumSequences(0), m_NumBases(0),
103  m_TaxDBFound(false), m_isBl2seq(true), m_isIterative(true), m_Matrix(NULL)
104 {
 // Common setup and query masks are taken from the first result only.
 // NOTE(review): resultSet[0] is read before any visible check that the
 // result set is non-empty - confirm callers guarantee at least one result.
105  x_InitCommon(resultSet[0], opts);
106  x_InitSubjects(subjectsInfo);
107  resultSet[0].GetMaskedQueryRegions(m_QueryMasks);
 // One entry per result is appended to m_Alignments, m_AncillaryData and
 // m_Errors.
108  for(unsigned int i = 0; i < resultSet.size(); i++) {
109  x_InitResults(resultSet[i]);
110  }
111 }
112 
/// x_InitResults: record one search result.
/// Appends the result's Seq-align set to m_Alignments, its ancillary data to
/// m_AncillaryData, and a single combined message string to m_Errors.
/// The message is built from, in order: the error strings, the warning
/// strings (only when HasWarnings()), and CBlastFormatUtil::kNoHitsFound
/// (only when the result has no alignments); non-empty parts are separated
/// by a single space.
/// (The declarator line, listing line 114, is not present in this listing.)
113 void
115 {
116  m_Alignments.push_back(results.GetSeqAlign());
117  m_AncillaryData.push_back(results.GetAncillaryData());
118  string errors = results.GetErrorStrings();
119  if (results.HasWarnings()) {
 // Separate warnings from any preceding error text.
120  if ( !errors.empty() ) {
121  errors += " ";
122  }
123  errors += results.GetWarningStrings();
124  }
125  if ( !results.HasAlignments() ) {
 // Add a separator only if something precedes the "no hits" message.
126  errors += (errors.empty() ? kEmptyStr : " ");
127  errors += CBlastFormatUtil::kNoHitsFound;
128  }
129 
 // An entry is pushed even when the message is empty, so m_Errors gets
 // exactly one entry per call, like m_Alignments and m_AncillaryData.
130  m_Errors.push_back(errors);
131 }
132 
/// x_InitCommon: setup shared by all constructors.
/// Throws CException (eUnknown) when `opts` or m_Scope is empty, initializes
/// the formatting score matrix from the matrix name in m_Options, and sets
/// m_TaxDBFound when "taxdb.bti" can be resolved by SeqDB_ResolveDbPath.
/// NOTE(review): listing lines 134, 136 and 153 are not present in this
/// listing; `results` is not used by any visible line, so its use (if any)
/// cannot be confirmed from here.
133 void
135  const CSearchResults & results,
137 {
138  if(opts.Empty()) {
139  NCBI_THROW(CException, eUnknown, "blastxml2: Empty blast options");
140  }
141 
142  if(m_Scope.Empty()) {
143  NCBI_THROW(CException, eUnknown, "blastxml2: Empty scope");
144  }
145 
 // Reads the matrix name from m_Options (the member), not from `opts`.
146  x_FillScoreMatrix(m_Options->GetMatrixName());
147 
 // A non-empty resolved path is treated as "taxonomy database found".
148  string resolved = SeqDB_ResolveDbPath("taxdb.bti");
149  if(!resolved.empty()) {
150  m_TaxDBFound = true;
151  }
152 
154 }
155 
/// x_InitDB: summarize the searched database(s).
/// Throws CException (eUnknown) when `dbsInfo` is empty. Otherwise builds
/// m_DbName as the space-separated list of database names and adds each
/// entry's number_seqs and total_length to m_NumSequences and m_NumBases.
/// (The declarator line, listing line 157, is not present in this listing.)
156 void
158  const vector<CAlignFormatUtil::SDbInfo> & dbsInfo)
159 {
160  if(dbsInfo.empty()){
161  NCBI_THROW(CException, eUnknown, "blastxml2: Empty db info");
162  }
163  ITERATE(vector<CBlastFormatUtil::SDbInfo>, i, dbsInfo) {
 // Put a space before every name except the first.
164  if(i != dbsInfo.begin()) {
165  m_DbName += " " ;
166  }
167  m_DbName += i->name;
168  m_NumSequences += i->number_seqs;
169  m_NumBases += i->total_length;
170  }
171 }
172 
/// x_InitSubjects: collect subject identifiers for a bl2seq-style search.
/// Throws CException (eUnknown) when the source reports zero sequences.
/// Otherwise appends, for each index, the Seq-id string produced by
/// CAlignFormatUtil::GetSeqIdString(ids, true) to m_SubjectIds.
/// NOTE(review): `subjectsInfo` is dereferenced without an explicit
/// emptiness check - confirm callers never pass an empty reference.
/// (The declarator line, listing line 174, is not present in this listing.)
173 void
175 {
176  if(subjectsInfo->Size() == 0) {
177  NCBI_THROW(CException, eUnknown, "blastxml2: Empty seq info src");
178  }
179 
180  for(unsigned int i =0; i < subjectsInfo->Size(); i++) {
181  list<CRef<objects::CSeq_id> > ids = subjectsInfo->GetId(i);
182  m_SubjectIds.push_back(CAlignFormatUtil::GetSeqIdString(ids, true));
183  }
184 }
185 
// Presumably the body of the class destructor (the declarator line, listing
// line 186, is not present in this listing): frees the formatting matrix
// held in m_Matrix.
187 {
 // NOTE(review): the null check is redundant; `delete` on a null pointer
 // is a no-op.
188  if(m_Matrix) {
189  delete m_Matrix;
190  }
191 }
192 
/// x_FillScoreMatrix: initialize the score matrix used for formatting.
/// Returns immediately when `matrix_name` is NULL. Otherwise selects one of
/// the built-in packed matrices by exact, case-sensitive name match
/// (BLOSUM45/50/62/80/90, PAM30/70/250, IDENTITY). An unrecognized name
/// throws blast::CBlastException (eInvalidArgument) unless the program name
/// is "blastn" or "megablast".
/// NOTE(review): listing lines 194, 231 and 243 are not present in this
/// listing, so the declaration of `m` and whatever consumes `tmp`/`matrix`
/// after the copy cannot be seen here.
193 void
195 {
196  if (matrix_name == NULL)
197  return;
198 
 // Local scratch matrix plus an array of row pointers into it.
199  int matrix[kMatrixCols][kMatrixCols];
200  int * tmp[kMatrixCols];
201  const SNCBIPackedScoreMatrix *packed_matrix = 0;
202 
203  if (strcmp(matrix_name, "BLOSUM45") == 0)
204  packed_matrix = &NCBISM_Blosum45;
205  else if (strcmp(matrix_name, "BLOSUM50") == 0)
206  packed_matrix = &NCBISM_Blosum50;
207  else if (strcmp(matrix_name, "BLOSUM62") == 0)
208  packed_matrix = &NCBISM_Blosum62;
209  else if (strcmp(matrix_name, "BLOSUM80") == 0)
210  packed_matrix = &NCBISM_Blosum80;
211  else if (strcmp(matrix_name, "BLOSUM90") == 0)
212  packed_matrix = &NCBISM_Blosum90;
213  else if (strcmp(matrix_name, "PAM30") == 0)
214  packed_matrix = &NCBISM_Pam30;
215  else if (strcmp(matrix_name, "PAM70") == 0)
216  packed_matrix = &NCBISM_Pam70;
217  else if (strcmp(matrix_name, "PAM250") == 0)
218  packed_matrix = &NCBISM_Pam250;
219  else if (strcmp(matrix_name, "IDENTITY") == 0)
220  packed_matrix = &NCBISM_Identity;
221  else {
 // Unknown matrix names are tolerated only for blastn and megablast.
222  string prog_name = Blast_ProgramNameFromType(
223  m_Options->GetProgramType());
224  if (prog_name != "blastn" && prog_name != "megablast") {
225  NCBI_THROW(blast::CBlastException, eInvalidArgument,
226  "unsupported score matrix");
227  }
228  }
229 
 // NOTE(review): when no packed matrix was selected, the visible code
 // leaves `matrix` and `tmp` unfilled - verify the code on listing line 243
 // does not read them in that case.
230  if (packed_matrix) {
232 
 // Expand the packed matrix into `m`, which can be indexed directly.
233  NCBISM_Unpack(packed_matrix, &m);
234 
 // Copy the first kMatrixCols x kMatrixCols scores into `matrix` and
 // point tmp[i] at row i.
235  for (unsigned int i = 0; i < kMatrixCols; i++) {
236  tmp[i] = matrix[i];
237  for (unsigned int j = 0; j < kMatrixCols; j++) {
238  matrix[i][j] = m.s[i][j];
239  }
240  }
241  }
242 
244 }
245 
/// GetBlastProgramName: name of the BLAST program for this report.
/// Returns "deltablast" when the options' program is blast::eDeltaBlast;
/// otherwise returns the name derived from the options' program type.
/// (The declarator line, listing line 247, is not present in this listing.)
246 string
248 {
249  // Program type for deltablast is eBlastTypePsiBlast, because the
250  // sequence search is done by CPsiBlast
251  if ( m_Options->GetProgram() == blast::eDeltaBlast) {
252  return "deltablast";
253  }
254  return blast::Blast_ProgramNameFromType(m_Options->GetProgramType());
255 }
256 
/// GetLambda: Karlin-Altschul Lambda for result number `num`.
/// Throws CException (eUnknown) when `num` is not below the number of stored
/// ancillary-data entries. Returns Lambda from the gapped Karlin block when
/// present, else from the ungapped block, else -1.0.
/// NOTE(review): a negative `num` passes the visible bounds check and would
/// index m_AncillaryData out of range - confirm callers never pass one.
/// (The declarator line, listing line 258, is not present in this listing.)
257 double
259 {
260  if (num >= (int)m_AncillaryData.size()) {
261  NCBI_THROW(CException, eUnknown, "blastxml2: Invalid iteration number");
262  }
263 
 // Prefer the gapped statistics; fall back to ungapped.
264  const Blast_KarlinBlk *kbp =
265  m_AncillaryData[num]->GetGappedKarlinBlk();
266  if (kbp)
267  return kbp->Lambda;
268 
269  kbp = m_AncillaryData[num]->GetUngappedKarlinBlk();
270  if (kbp)
271  return kbp->Lambda;
 // Neither block is available.
272  return -1.0;
273 }
274 
/// GetKappa: Karlin-Altschul K for result number `num`.
/// Throws CException (eUnknown) when `num` is not below the number of stored
/// ancillary-data entries. Returns K from the gapped Karlin block when
/// present, else from the ungapped block, else -1.0.
/// NOTE(review): a negative `num` passes the visible bounds check and would
/// index m_AncillaryData out of range - confirm callers never pass one.
/// (The declarator line, listing line 276, is not present in this listing.)
275 double
277 {
278  if (num >= (int)m_AncillaryData.size()) {
279  NCBI_THROW(CException, eUnknown, "blastxml2: Invalid iteration number");
280  }
281 
 // Prefer the gapped statistics; fall back to ungapped.
282  const Blast_KarlinBlk *kbp =
283  m_AncillaryData[num]->GetGappedKarlinBlk();
284  if (kbp)
285  return kbp->K;
286 
287  kbp = m_AncillaryData[num]->GetUngappedKarlinBlk();
288  if (kbp)
289  return kbp->K;
 // Neither block is available.
290  return -1.0;
291 }
292 
/// GetEntropy: Karlin-Altschul H for result number `num`.
/// Throws CException (eUnknown) when `num` is not below the number of stored
/// ancillary-data entries. Returns H from the gapped Karlin block when
/// present, else from the ungapped block, else -1.0.
/// NOTE(review): a negative `num` passes the visible bounds check and would
/// index m_AncillaryData out of range - confirm callers never pass one.
/// (The declarator line, listing line 294, is not present in this listing.)
293 double
295 {
296  if (num >= (int)m_AncillaryData.size()) {
297  NCBI_THROW(CException, eUnknown, "blastxml2: Invalid iteration number");
298  }
299 
 // Prefer the gapped statistics; fall back to ungapped.
300  const Blast_KarlinBlk *kbp =
301  m_AncillaryData[num]->GetGappedKarlinBlk();
302  if (kbp)
303  return kbp->H;
304 
305  kbp = m_AncillaryData[num]->GetUngappedKarlinBlk();
306  if (kbp)
307  return kbp->H;
 // Neither block is available.
308  return -1.0;
309 }
310 
// GetMatrix: returns the raw m_Matrix pointer as stored; no ownership is
// transferred by the visible code.
// (The return type and declarator, listing lines 311-312, are not present in
// this listing.)
313 {
314  return m_Matrix;
315 }
316 
/// GetLengthAdjustment: length adjustment for result number `num`.
/// Throws CException (eUnknown) when `num` is not below the number of stored
/// ancillary-data entries; the stored value is cast to int on return.
/// NOTE(review): a negative `num` passes the visible bounds check and would
/// index m_AncillaryData out of range - confirm callers never pass one.
/// (The declarator line, listing line 318, is not present in this listing.)
317 int
319 {
320  if (num >= (int)m_AncillaryData.size()) {
321  NCBI_THROW(CException, eUnknown, "blastxml2: Invalid iteration number");
322  }
323  return (int)m_AncillaryData[num]->GetLengthAdjustment();
324 }
325 
// GetAlignmentSet: returns the alignment set stored for result number `num`.
// Throws CException (eUnknown) when `num` is not below the number of stored
// alignment sets.
// NOTE(review): a negative `num` passes the visible bounds check and would
// index m_Alignments out of range - confirm callers never pass one.
// (The return type and declarator, listing lines 326-327, are not present in
// this listing.)
328 {
329  if (num >= (int) m_Alignments.size()) {
330  NCBI_THROW(CException, eUnknown, "blastxml2: Invalid iteration number");
331  }
332  return m_Alignments[num];
333 }
334 
/// GetEffectiveSearchSpace: search space for result number `num`, as
/// reported by the stored ancillary data.
/// Throws CException (eUnknown) when `num` is not below the number of stored
/// ancillary-data entries.
/// NOTE(review): a negative `num` passes the visible bounds check and would
/// index m_AncillaryData out of range - confirm callers never pass one.
/// (The declarator line, listing line 336, is not present in this listing.)
335 Int8
337 {
338  if (num >= (int)m_AncillaryData.size()) {
339  NCBI_THROW(CException, eUnknown, "blastxml2: Invalid iteration number");
340  }
341  return m_AncillaryData[num]->GetSearchSpace();
342 }
343 
// GetQueryGeneticCode: returns the query genetic code from the options when
// the program type translates the query; otherwise returns 0.
// (The return type and declarator, listing lines 343-344, are not present in
// this listing.)
345 {
346  if(Blast_QueryIsTranslated(m_Options->GetProgramType()))
347  return m_Options->GetQueryGeneticCode();
348 
 // Query is not translated for this program type.
349  return 0;
350 }
351 
// GetDbGeneticCode: returns the database genetic code from the options when
// the program type translates the subject; otherwise returns 0.
// (The return type and declarator, listing lines 351-352, are not present in
// this listing.)
353 {
354  if(Blast_SubjectIsTranslated(m_Options->GetProgramType()))
355  return m_Options->GetDbGeneticCode();
356 
 // Subject is not translated for this program type.
357  return 0;
358 }
static CRef< CScope > m_Scope
Boolean Blast_QueryIsTranslated(EBlastProgramType p)
Returns true if the query is translated.
Definition: blast_program.c:60
Boolean Blast_SubjectIsTranslated(EBlastProgramType p)
Returns true if the subject is translated.
Definition: blast_program.c:63
@ eDeltaBlast
Delta Blast.
Definition: blast_types.hpp:71
static string GetSeqIdString(const objects::CBioseq &cbs, bool believe_local_id=true)
Returns a full '|'-delimited Seq-id string for a Bioseq.
256x256 matrix used for calculating positives etc.
void x_InitSubjects(CConstRef< blast::IBlastSeqInfoSrc > subjectsInfo)
Int8 m_NumSequences
Number of sequences in all BLAST databases involved in this search.
static const unsigned int kMatrixCols
Number of columns used in score matrices.
CBlastFormattingMatrix * GetMatrix(void) const
Returns a 256x256 ASCII-alphabet matrix, needed for formatting.
void x_FillScoreMatrix(const char *matrix_name=BLAST_DEFAULT_MATRIX)
Initialize the score matrix to be used for formatting (if applicable)
double GetKappa(int num) const
Returns Karlin-Altschul K parameter for a given query.
int GetDbGeneticCode() const
slave genetic code
vector< CConstRef< CSeq_align_set > > m_Alignments
the alignments
double GetEntropy(int num) const
Returns Karlin-Altschul H parameter for a given query.
int GetQueryGeneticCode() const
master genetic code
int GetLengthAdjustment(int num) const
Returns length adjustment for a given query.
CConstRef< blast::CBlastOptions > m_Options
BLAST algorithm options.
CBlastFormattingMatrix * m_Matrix
Score matrix used to determine neighboring protein residues.
Int8 GetEffectiveSearchSpace(int num) const
Returns effective search space for a given query.
CCmdLineBlastXML2ReportData(CConstRef< blast::CBlastSearchQuery > query, const blast::CSearchResults &results, CConstRef< blast::CBlastOptions > opts, CRef< objects::CScope > scope, const vector< align_format::CAlignFormatUtil::SDbInfo > &dbsInfo)
Constructor db search.
Int8 m_NumBases
Number of bases in all BLAST databases involved in this search.
CRef< objects::CScope > m_Scope
CConstRef< CSeq_align_set > GetAlignmentSet(int num) const
Returns a vector containing the set of alignments found for a given query.
string m_DbName
name of blast database
void x_InitDB(const vector< align_format::CAlignFormatUtil::SDbInfo > &dbsInfo)
vector< string > m_Errors
Error messages.
string GetBlastProgramName(void) const
@inheritDoc
void x_InitResults(const blast::CSearchResults &results)
vector< CRef< blast::CBlastAncillaryData > > m_AncillaryData
ancillary results data
double GetLambda(int num) const
Returns Karlin-Altschul Lambda parameter for a given query.
void x_InitCommon(const blast::CSearchResults &results, CConstRef< blast::CBlastOptions > opts)
CConstRef –.
Definition: ncbiobj.hpp:1266
Search Results for All Queries.
Search Results for One Query.
USING_SCOPE(blast)
USING_NCBI_SCOPE
Implementation of interface class to produce data required for generating BLAST XML2 output.
#define true
Definition: bool.h:35
#define false
Definition: bool.h:36
static char tmp[3200]
Definition: utf8.c:42
void GetMaskedQueryRegions(TMaskedQueryRegions &flt_query_regions) const
Retrieve the query regions which were masked by BLAST.
CConstRef< objects::CSeq_align_set > GetSeqAlign() const
Accessor for the Seq-align results.
bool IsIterativeSearch() const
CRef< CBlastAncillaryData > GetAncillaryData() const
Accessor for the query's search ancillary.
size_type size() const
Identical to GetNumResults, provided to facilitate STL-style iteration.
bool HasWarnings() const
Returns true if there are warnings among the results for this object.
string GetWarningStrings() const
Retrieve a string with the query identifier followed by the warnings produced, returns an empty string...
string Blast_ProgramNameFromType(EBlastProgramType program)
Returns a string program name, given a blast::EBlastProgramType enumeration.
Definition: blast_aux.cpp:813
string GetErrorStrings() const
Retrieve a string with the query identifier followed by the errors produced, returns an empty string i...
bool HasAlignments() const
Return true if there are any alignments for this query.
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
bool Empty(void) const THROWS_NONE
Check if CConstRef is empty – not pointing to any object which means having a null value.
Definition: ncbiobj.hpp:1385
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
#define kEmptyStr
Definition: ncbistr.hpp:123
int i
int strcmp(const char *str1, const char *str2)
Definition: odbc_utils.hpp:160
const SNCBIPackedScoreMatrix NCBISM_Pam30
Definition: sm_pam30.c:92
const SNCBIPackedScoreMatrix NCBISM_Blosum62
Definition: sm_blosum62.c:92
const SNCBIPackedScoreMatrix NCBISM_Pam250
Definition: sm_pam250.c:92
const SNCBIPackedScoreMatrix NCBISM_Blosum50
Definition: sm_blosum50.c:92
const SNCBIPackedScoreMatrix NCBISM_Blosum80
Definition: sm_blosum80.c:92
void NCBISM_Unpack(const SNCBIPackedScoreMatrix *psm, SNCBIFullScoreMatrix *fsm)
Expand a packed score matrix into an unpacked one, which callers can proceed to index directly by sta...
Definition: raw_scoremat.c:81
const SNCBIPackedScoreMatrix NCBISM_Pam70
Definition: sm_pam70.c:92
const SNCBIPackedScoreMatrix NCBISM_Blosum45
The standard matrices.
Definition: sm_blosum45.c:92
const SNCBIPackedScoreMatrix NCBISM_Identity
Definition: sm_identity.c:92
const SNCBIPackedScoreMatrix NCBISM_Blosum90
Definition: sm_blosum90.c:92
string SeqDB_ResolveDbPath(const string &filename)
Resolve a file path using SeqDB's path algorithms.
Structure to hold the Karlin-Altschul parameters.
Definition: blast_stat.h:66
double K
K value used in statistics.
Definition: blast_stat.h:68
double Lambda
Lambda value used in statistics.
Definition: blast_stat.h:67
double H
H value used in statistics.
Definition: blast_stat.h:70
TNCBIScore s[128][128]
Definition: raw_scoremat.h:87
static string query
Modified on Wed Jul 17 13:24:10 2024 by modify_doxy.py rev. 669887