NCBI C++ ToolKit
data4xmlformat.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Jason Papadopoulos, Christiam Camacho
27 *
28 */
29 
30 /** @file data4xmlformat.cpp
31  * Produce data required for generating BLAST XML output
32  */
33 
34 #include <ncbi_pch.hpp>
37 #include <algo/blast/format/data4xmlformat.hpp> /* NCBI_FAKE_WARNING */
38 
39 #ifndef SKIP_DOXYGEN_PROCESSING
41 USING_SCOPE(blast);
43 USING_SCOPE(align_format);
44 #endif
45 
48  const blast::CSearchResultSet& results,
49  const blast::CBlastOptions& opts,
50  const string& dbname, bool db_is_aa,
51  int qgencode, int dbgencode,
52  bool is_remote,
53  int dbfilt_algorithm /* = -1 */)
54 : m_Queries(queries), m_Options(opts),
55  m_DbName(dbname),
56  m_QueryGeneticCode(qgencode),
57  m_DbGeneticCode(dbgencode),
58  m_NoHitsFound(false),
59  m_NumSequences(0),
60  m_NumBases(0)
61 {
62  _ASSERT( !m_Queries->Empty() );
63 
64  vector<CBlastFormatUtil::SDbInfo> dbinformation;
65  if ( !m_DbName.empty() ){
66  CBlastFormatUtil::GetBlastDbInfo(dbinformation, m_DbName, db_is_aa,
67  dbfilt_algorithm, is_remote);
68  }
69 
70  x_Init(queries, results, opts, dbinformation, qgencode, dbgencode, is_remote, dbfilt_algorithm);
71 
72 }
73 
76  const blast::CSearchResultSet& results,
77  const blast::CBlastOptions& opts,
78  const vector<CAlignFormatUtil::SDbInfo> & dbInfo,
79  int qgencode,
80  int dbgencode,
81  bool is_remote,
82  int dbfilt_algorithm)
83  : m_Queries(queries), m_Options(opts),
84  m_DbName(kEmptyStr),
85  m_QueryGeneticCode(qgencode),
86  m_DbGeneticCode(dbgencode),
87  m_NoHitsFound(false),
88  m_NumSequences(0),
89  m_NumBases(0)
90 {
91  _ASSERT(!m_Queries->Empty());
92  _ASSERT(!dbInfo.empty());
93 
94  ITERATE(vector<CBlastFormatUtil::SDbInfo>, i, dbInfo) {
95  if(i != dbInfo.begin())
96  m_DbName += " " ;
97  m_DbName += i->name;
98  }
99  x_Init(queries, results, opts, dbInfo, qgencode, dbgencode, is_remote, dbfilt_algorithm);
100 }
101 
102 void
105  const blast::CSearchResultSet& results,
106  const blast::CBlastOptions& opts,
107  const vector<CAlignFormatUtil::SDbInfo> & dbInfo,
108  int qgencode,
109  int dbgencode,
110  bool is_remote,
111  int dbfilt_algorithm)
112 {
113 
114  x_FillScoreMatrix(m_Options.GetMatrixName());
115 
116  if(!dbInfo.empty()) {
117  ITERATE(vector<CBlastFormatUtil::SDbInfo>, i, dbInfo) {
118  m_NumSequences += i->number_seqs;
119  m_NumBases += i->total_length;
120  }
121  }
122 
123  /// @todo FIXME add means to specify masked database (SB-343)
124  // Is this appropriate? What if it breaks parsers?
125  //if (dbfilt_algorithm != -1) {
126  // int x = 0; // should be the index of the masked DB
127  // _ASSERT(!dbinformation[x].filt_algorithm_name.empty());
128  // m_DbName += ", masked using: '" + dbinformation[x].filt_algorithm_name + "'";
129  // if ( !dbinformation[x].filt_algorithm_options.empty() ) {
130  // m_DbName += ", options: '" + dbinformation[x].filt_algorithm_options + "'";
131  // }
132  //}
133 
134  if (results.size() == 0) {
135  m_NoHitsFound = true;
136  m_Errors.insert(m_Errors.end(), m_Queries->Size(),
137  CBlastFormatUtil::kNoHitsFound);
138  } else {
139 
140  if (opts.GetProgram() == ePSIBlast && m_Queries->Size() == 1) {
141  // artificially increment the number of 'queries' to match the
142  // number of results, which represents the actual number of
143  // iterations in PSI-BLAST
144  for (size_t i = 0; i < results.size() - 1; i++) {
145  m_Queries->AddQuery(m_Queries->GetBlastSearchQuery(0));
146  }
147  }
148 
149  m_Masks.resize(GetNumQueries());
150  for (size_t i = 0; i < GetNumQueries(); i++) {
151 
152  m_Alignments.push_back(results[i].GetSeqAlign());
153  m_AncillaryData.push_back(results[i].GetAncillaryData());
154  results[i].GetMaskedQueryRegions(m_Masks[i]);
155 
156  // Check in case there are any errors/warnings
157  {
158  string errors = results[i].GetErrorStrings();
159  if (results[i].HasWarnings()) {
160  if ( !errors.empty() ) {
161  errors += " ";
162  }
163  errors += results[i].GetWarningStrings();
164  }
165  if ( !results[i].HasAlignments() ) {
166  errors += (errors.empty() ? kEmptyStr : " ");
167  errors += CBlastFormatUtil::kNoHitsFound;
168  }
169  m_Errors.push_back(errors);
170  }
171  }
172  }
173 
174 }
175 
176 
178 {
179  for (unsigned int i = 0; i < kMatrixCols; i++)
180  delete [] m_Matrix[i];
181 }
182 
183 
184 void
186 {
187  for (unsigned int i = 0; i < kMatrixCols; i++)
188  m_Matrix[i] = new int[kMatrixCols];
189 
190  if (matrix_name == NULL)
191  return;
192 
193  const SNCBIPackedScoreMatrix *packed_matrix = 0;
194 
195  if (strcmp(matrix_name, "BLOSUM45") == 0)
196  packed_matrix = &NCBISM_Blosum45;
197  else if (strcmp(matrix_name, "BLOSUM50") == 0)
198  packed_matrix = &NCBISM_Blosum50;
199  else if (strcmp(matrix_name, "BLOSUM62") == 0)
200  packed_matrix = &NCBISM_Blosum62;
201  else if (strcmp(matrix_name, "BLOSUM80") == 0)
202  packed_matrix = &NCBISM_Blosum80;
203  else if (strcmp(matrix_name, "BLOSUM90") == 0)
204  packed_matrix = &NCBISM_Blosum90;
205  else if (strcmp(matrix_name, "PAM30") == 0)
206  packed_matrix = &NCBISM_Pam30;
207  else if (strcmp(matrix_name, "PAM70") == 0)
208  packed_matrix = &NCBISM_Pam70;
209  else if (strcmp(matrix_name, "PAM250") == 0)
210  packed_matrix = &NCBISM_Pam250;
211  else if (strcmp(matrix_name, "IDENTITY") == 0)
212  packed_matrix = &NCBISM_Identity;
213  else {
214  string prog_name = Blast_ProgramNameFromType(
215  m_Options.GetProgramType());
216  if (prog_name != "blastn" && prog_name != "megablast") {
217  NCBI_THROW(blast::CBlastException, eInvalidArgument,
218  "unsupported score matrix");
219  }
220  }
221 
222  if (packed_matrix) {
224 
225  NCBISM_Unpack(packed_matrix, &m);
226 
227  for (unsigned int i = 0; i < kMatrixCols; i++) {
228  for (unsigned int j = 0; j < kMatrixCols; j++) {
229  m_Matrix[i][j] = m.s[i][j];
230  }
231  }
232  }
233 }
234 
235 double
237 {
238  if (m_NoHitsFound || query_index >= (int)m_AncillaryData.size()) {
239  return -1.0;
240  }
241 
242  const Blast_KarlinBlk *kbp =
243  m_AncillaryData[query_index]->GetGappedKarlinBlk();
244  if (kbp)
245  return kbp->Lambda;
246 
247  kbp = m_AncillaryData[query_index]->GetUngappedKarlinBlk();
248  if (kbp)
249  return kbp->Lambda;
250  return -1.0;
251 }
252 
253 double
255 {
256  if (m_NoHitsFound || query_index >= (int)m_AncillaryData.size()) {
257  return -1.0;
258  }
259 
260  const Blast_KarlinBlk *kbp =
261  m_AncillaryData[query_index]->GetGappedKarlinBlk();
262  if (kbp)
263  return kbp->K;
264 
265  kbp = m_AncillaryData[query_index]->GetUngappedKarlinBlk();
266  if (kbp)
267  return kbp->K;
268  return -1.0;
269 }
270 
271 double
273 {
274  if (m_NoHitsFound || query_index >= (int)m_AncillaryData.size()) {
275  return -1.0;
276  }
277 
278  const Blast_KarlinBlk *kbp =
279  m_AncillaryData[query_index]->GetGappedKarlinBlk();
280  if (kbp)
281  return kbp->H;
282 
283  kbp = m_AncillaryData[query_index]->GetUngappedKarlinBlk();
284  if (kbp)
285  return kbp->H;
286  return -1.0;
287 }
288 
290 {
291  return new CBlastFormattingMatrix((int **)m_Matrix,
293 }
294 
296  if (m_NoHitsFound || query_index >= (int)m_AncillaryData.size()) {
297  return 0;
298  }
299  return (int)m_AncillaryData[query_index]->GetLengthAdjustment();
300 }
@ ePSIBlast
PSI Blast.
Definition: blast_types.hpp:67
256x256 matrix used for calculating positives etc.
CRef< blast::CBlastQueryVector > m_Queries
Query sequences.
vector< CRef< blast::CBlastAncillaryData > > m_AncillaryData
ancillary results data
int m_NumSequences
Number of sequences in all BLAST databases involved in this search.
bool m_NoHitsFound
True if results did not find any hits.
int * m_Matrix[kMatrixCols]
Score matrix used to determine neighboring protein residues.
TSeqLocInfoVector m_Masks
masks for the queries
unsigned int GetNumQueries(void) const
@inheritDoc
const blast::CBlastOptions & m_Options
BLAST algorithm options.
Int8 m_NumBases
Number of bases in all BLAST databases involved in this search.
void x_FillScoreMatrix(const char *matrix_name=BLAST_DEFAULT_MATRIX)
Initialize the score matrix to be used for formatting (if applicable)
string m_DbName
name of blast database
int GetLengthAdjustment(int) const
@inheritDoc
CCmdLineBlastXMLReportData(CRef< blast::CBlastQueryVector > queries, const blast::CSearchResultSet &results, const blast::CBlastOptions &opts, const string &dbname, bool db_is_aa, int qgencode=BLAST_GENETIC_CODE, int dbgencode=BLAST_GENETIC_CODE, bool is_remote=false, int dbfilt_algorithm=-1)
Constructor.
double GetEntropy(int query_index) const
@inheritDoc
vector< string > m_Errors
Error messages (one element per query)
void x_Init(CRef< blast::CBlastQueryVector > queries, const blast::CSearchResultSet &results, const blast::CBlastOptions &opts, const vector< align_format::CAlignFormatUtil::SDbInfo > &dbInfo, int qgencode, int dbgencode, bool is_remote, int dbfilt_algorith)
vector< CConstRef< CSeq_align_set > > m_Alignments
the alignments
double GetLambda(int query_index) const
@inheritDoc
~CCmdLineBlastXMLReportData()
Destructor.
double GetKappa(int query_index) const
@inheritDoc
static const unsigned int kMatrixCols
Number of columns used in score matrices.
CBlastFormattingMatrix * GetMatrix(void) const
@inheritDoc
USING_SCOPE(blast)
USING_NCBI_SCOPE
Implementation of interface class to produce data required for generating BLAST XML output.
#define false
Definition: bool.h:36
string Blast_ProgramNameFromType(EBlastProgramType program)
Returns a string program name, given a blast::EBlastProgramType enumeration.
Definition: blast_aux.cpp:813
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
bool Empty(void) const THROWS_NONE
Check if CRef is empty – not pointing to any object, which means having a null value.
Definition: ncbiobj.hpp:719
#define kEmptyStr
Definition: ncbistr.hpp:123
char * dbname(DBPROCESS *dbproc)
Get name of current database.
Definition: dblib.c:6929
int i
int strcmp(const char *str1, const char *str2)
Definition: odbc_utils.hpp:160
const SNCBIPackedScoreMatrix NCBISM_Pam30
Definition: sm_pam30.c:92
const SNCBIPackedScoreMatrix NCBISM_Blosum62
Definition: sm_blosum62.c:92
const SNCBIPackedScoreMatrix NCBISM_Pam250
Definition: sm_pam250.c:92
const SNCBIPackedScoreMatrix NCBISM_Blosum50
Definition: sm_blosum50.c:92
const SNCBIPackedScoreMatrix NCBISM_Blosum80
Definition: sm_blosum80.c:92
void NCBISM_Unpack(const SNCBIPackedScoreMatrix *psm, SNCBIFullScoreMatrix *fsm)
Expand a packed score matrix into an unpacked one, which callers can proceed to index directly by sta...
Definition: raw_scoremat.c:81
const SNCBIPackedScoreMatrix NCBISM_Pam70
Definition: sm_pam70.c:92
const SNCBIPackedScoreMatrix NCBISM_Blosum45
The standard matrices.
Definition: sm_blosum45.c:92
const SNCBIPackedScoreMatrix NCBISM_Identity
Definition: sm_identity.c:92
const SNCBIPackedScoreMatrix NCBISM_Blosum90
Definition: sm_blosum90.c:92
Structure to hold the Karlin-Altschul parameters.
Definition: blast_stat.h:66
double K
K value used in statistics.
Definition: blast_stat.h:68
double Lambda
Lambda value used in statistics.
Definition: blast_stat.h:67
double H
H value used in statistics.
Definition: blast_stat.h:70
TNCBIScore s[128][128]
Definition: raw_scoremat.h:87
#define _ASSERT
Modified on Sun Jul 14 04:59:21 2024 by modify_doxy.py rev. 669887