NCBI C++ ToolKit
hspfilter_collector.c
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: hspfilter_collector.c 87828 2019-10-09 11:00:47Z fongah2 $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Ilya Dondoshansky
27  *
28  */
29 
30 /** @file hspfilter_collector.c
31  * Default implementation of the BlastHSPWriter interface to save hits from
32  * a BLAST search, and subsequently return them in sorted order.
33  */
34 
35 
38 #include "blast_hits_priv.h"
39 
40 /** Data structure used by the writer */
41 typedef struct BlastHSPCollectorData {
42  BlastHSPCollectorParams* params; /**< how many hits to save */
43  BlastHSPResults* results; /**< place to store hits */
45 
46 /*************************************************************/
47 /** The following are implementations for BlastHSPWriter ADT */
48 
49 /** Perform pre-run stage-specific initialization
50  * @param data The internal data structure [in][out]
51  * @param results The HSP results to operate on [in]
52  */
53 static int
54 s_BlastHSPCollectorInit(void* data, void* hsp_results)
55 {
56  BlastHSPCollectorData * col_data = data;
57  BlastHSPResults* results = (BlastHSPResults*)hsp_results;
58  /* grab the results as destination to store collected hsps */
59  col_data->results = results;
60  return 0;
61 }
62 
63 /** Perform post-run clean-ups
64  * @param data The buffered data structure [in]
65  * @param results The HSP results to propagate [in][out]
66  */
67 static int
68 s_BlastHSPCollectorFinal(void* data, void* results)
69 {
70  BlastHSPCollectorData * col_data = data;
71  /* results already stored during run, no action needed */
72  col_data->results = NULL;
73  return 0;
74 }
75 
76 /** Perform writing task
77  * ownership of the HSP list and sets the dereferenced pointer to NULL.
78  * @param data To store results to [in][out]
79  * @param hsp_list Pointer to the HSP list to save in the collector. [in]
80  */
81 static int
83 {
84  BlastHSPCollectorData * col_data = data;
85  BlastHSPResults* results = col_data->results;
86  BlastHSPCollectorParams* params = col_data->params;
87  EBlastProgramType program = params->program;
88 
89  if (!hsp_list)
90  return 0;
91 
92  if (!results || !params)
93  return -1;
94 
95  /* The HSP list should already be sorted by score coming into this function.
96  * Still check that this assumption is true. Note that HSP list does not need to be
97  * sorted after preliminary stage for vdb search becuase of offset adjustment.
98  */
99 #ifdef ERR_POST_EX_DEFINED
100  if(!Blast_HSPListIsSortedByScore(hsp_list)) {
101  ErrPostEx(SEV_WARNING, 0, 0, "HSP List is not sorted by score");
102  }
103 #endif
104 
105  /* Rearrange HSPs into multiple hit lists if more than one query */
106  if (results->num_queries > 1) {
107  BlastHSP* hsp;
108  BlastHSPList** hsp_list_array;
109  BlastHSPList* tmp_hsp_list;
110  Int4 index;
111 
112  hsp_list_array = calloc(results->num_queries, sizeof(BlastHSPList*));
113  if (hsp_list_array == NULL)
114  return -1;
115 
116  for (index = 0; index < hsp_list->hspcnt; index++) {
117  Int4 query_index;
118  hsp = hsp_list->hsp_array[index];
119  query_index = Blast_GetQueryIndexFromContext(hsp->context, program);
120 
121  if (!(tmp_hsp_list = hsp_list_array[query_index])) {
122  hsp_list_array[query_index] = tmp_hsp_list =
123  Blast_HSPListNew(params->hsp_num_max);
124  if (tmp_hsp_list == NULL)
125  {
126  sfree(hsp_list_array);
127  return -1;
128  }
129  tmp_hsp_list->oid = hsp_list->oid;
130  }
131 
132  Blast_HSPListSaveHSP(tmp_hsp_list, hsp);
133  hsp_list->hsp_array[index] = NULL;
134  }
135 
136  /* All HSPs from the hsp_list structure are now moved to the results
137  structure, so set the HSP count back to 0 */
138  hsp_list->hspcnt = 0;
139  Blast_HSPListFree(hsp_list);
140 
141  /* Insert the hit list(s) into the appropriate places in the results
142  structure */
143  for (index = 0; index < results->num_queries; index++) {
144  if (hsp_list_array[index]) {
145  if (!results->hitlist_array[index]) {
146  results->hitlist_array[index] =
148  }
149  Blast_HitListUpdate(results->hitlist_array[index],
150  hsp_list_array[index]);
151  }
152  }
153  sfree(hsp_list_array);
154  } else if (hsp_list->hspcnt > 0) {
155  /* Single query; save the HSP list directly into the results
156  structure */
157  if (!results->hitlist_array[0]) {
158  results->hitlist_array[0] =
160  }
161  Blast_HitListUpdate(results->hitlist_array[0], hsp_list);
162  } else {
163  /* Empty HSPList - free it. */
164  Blast_HSPListFree(hsp_list);
165  }
166 
167  return 0;
168 }
169 
170 /** Callback used for sorting HSPs by score, with HSPs
171  * from different contexts segregated from each other
172  */
173 static int
174 s_ScoreCompareHSPWithContext(const void* h1, const void* h2)
175 {
176  BlastHSP* hsp1,* hsp2; /* the HSPs to be compared */
177  int result = 0; /* the result of the comparison */
178 
179  hsp1 = *((BlastHSP**) h1);
180  hsp2 = *((BlastHSP**) h2);
181 
182  /* Null HSPs are "greater" than any non-null ones, so they go to the end
183  of a sorted list. */
184  if (!hsp1 && !hsp2)
185  return 0;
186  else if (!hsp1)
187  return 1;
188  else if (!hsp2)
189  return -1;
190 
191  if ((result = BLAST_CMP(hsp1->context, hsp2->context)) != 0)
192  return result;
193  return ScoreCompareHSPs(h1, h2);
194 }
195 
196 /** Perform writing task for RPS case
197  * For RPS BLAST saving procedure is different, because HSPs from different
198  * subjects are bundled in one HSP list
199  * ownership of the HSP list and sets the dereferenced pointer to NULL.
200  * @param data To store results to [in][out]
201  * @param hsp_list Pointer to the HSP list to save in the collector. [in]
202  */
203 static int
205 {
206  Int4 index, next_index;
207  BlastHitList* hit_list;
208  BlastHSPCollectorData * col_data = data;
209  BlastHSPResults* results = col_data->results;
210  BlastHSPCollectorParams* params = col_data->params;
211 
212  if (!hsplist_in || hsplist_in->hspcnt == 0)
213  return 0;
214 
215  /* Check that the query index is in the correct range. */
216  ASSERT(hsplist_in->query_index < results->num_queries);
217 
218  /* Check that program is indeed RPS Blast */
220 
221  /* If hit list for this query has not yet been allocated, do it here. */
222  hit_list = results->hitlist_array[hsplist_in->query_index];
223  if (!hit_list) {
224  results->hitlist_array[hsplist_in->query_index] =
225  hit_list = Blast_HitListNew(params->prelim_hitlist_size);
226  }
227 
228  /* Sort the input HSPList with context (i.e. oid) as the first priority,
229  and then score, etc. */
230  qsort(hsplist_in->hsp_array, hsplist_in->hspcnt, sizeof(BlastHSP*),
232 
233  /* Sequentially extract HSPs corresponding to one subject into a new
234  HSPList, and save these new HSPLists in a normal way, as in all other
235  BLAST programs. */
236  next_index = 0;
237 
238  for (index = 0; index < hsplist_in->hspcnt; index = next_index) {
239  BlastHSPList* hsp_list;
240  Int4 oid = hsplist_in->hsp_array[index]->context;
241  Int4 hspcnt;
242  /* Find the first HSP that corresponds to a different subject.
243  At the same time, set all HSP contexts to 0, since this is what
244  traceback code expects. */
245  for (next_index = index; next_index < hsplist_in->hspcnt;
246  ++next_index) {
247  if (hsplist_in->hsp_array[next_index]->context != oid)
248  break;
249  hsplist_in->hsp_array[next_index]->context = 0;
250  }
251  hspcnt = next_index - index;
252  hsp_list = Blast_HSPListNew(hspcnt);
253  /* Set the oid field for this HSPList. */
254  hsp_list->oid = oid;
255  hsp_list->query_index = hsplist_in->query_index;
256  /* Save all HSPs corresponding to this subject. */
257  for ( ; index < next_index; ++index)
258  Blast_HSPListSaveHSP(hsp_list, hsplist_in->hsp_array[index]);
259  /* Check that HSPs are correctly sorted by score, as they should be. */
261  /* Insert this HSPList into this query's hit list. */
262  Blast_HitListUpdate(hit_list, hsp_list);
263  }
264 
265  /* All HSPs have been moved from the input HSPList to new HSPLists, so
266  set the input HSPList's count to 0. */
267  hsplist_in->hspcnt = 0;
268  Blast_HSPListFree(hsplist_in);
269 
270  return 0;
271 }
272 
273 /** Free the writer
274  * @param writer The writer to free [in]
275  * @return NULL.
276  */
277 static
280 {
281  BlastHSPCollectorData *data = writer->data;
282  sfree(data->params);
283  sfree(writer->data);
284  sfree(writer);
285  return NULL;
286 }
287 
288 /** create the writer
289  * @param params Pointer to the hit paramters [in]
290  * @param query_info BlastQueryInfo (not used) [in]
291  * @return writer
292  */
293 static
295 s_BlastHSPCollectorNew(void* params, BlastQueryInfo* query_info,
296  BLAST_SequenceBlk* sequence)
297 {
298  BlastHSPWriter * writer = NULL;
300  BlastHSPCollectorParams * col_param = params;
301 
302  /* allocate space for writer */
303  writer = malloc(sizeof(BlastHSPWriter));
304 
305  /* fill up the function pointers */
309  writer->RunFnPtr = (Blast_ProgramIsRpsBlast(col_param->program))
312 
313  /* allocate for data structure */
314  writer->data = malloc(sizeof(BlastHSPCollectorData));
315  data = writer->data;
316  data->params = params;
317 
318  return writer;
319 }
320 
321 /*************************************************************/
322 /** The following are exported functions to be used by APP */
323 
326  Int4 compositionBasedStats,
327  Boolean gapped_calculation)
328 {
330 
331  if (hit_options == NULL)
332  return NULL;
333 
335 
337  compositionBasedStats, gapped_calculation);
338  retval->hsp_num_max = BlastHspNumMax(gapped_calculation, hit_options);
339  retval->program = hit_options->program_number;
340  return retval;
341 }
342 
345 {
346  if ( !opts )
347  return NULL;
348  sfree(opts);
349  return NULL;
350 }
351 
354  BlastHSPWriterInfo * writer_info =
355  malloc(sizeof(BlastHSPWriterInfo));
356  writer_info->NewFnPtr = &s_BlastHSPCollectorNew;
357  writer_info->params = params;
358  return writer_info;
359 }
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
Definition: blast_def.h:112
#define BLAST_CMP(a, b)
A macro expression that returns 1, 0, -1 if a is greater than, equal to or less than b,...
Definition: blast_def.h:107
int ScoreCompareHSPs(const void *h1, const void *h2)
Comparison callback function for sorting HSPs, first by score in descending order,...
Definition: blast_hits.c:1330
Int4 BlastHspNumMax(Boolean gapped_calculation, const BlastHitSavingOptions *options)
Calculates the number of HSPs that should be saved.
Definition: blast_hits.c:213
BlastHitList * Blast_HitListNew(Int4 hitlist_size)
Allocate memory for a hit list of a given size.
Definition: blast_hits.c:3117
BlastHSPList * Blast_HSPListNew(Int4 hsp_max)
Creates HSP list structure with a default size HSP array.
Definition: blast_hits.c:1558
Boolean Blast_HSPListIsSortedByScore(const BlastHSPList *hsp_list)
Check if HSP list is sorted by score.
Definition: blast_hits.c:1358
Int2 Blast_HSPListSaveHSP(BlastHSPList *hsp_list, BlastHSP *hsp)
Saves HSP information into a BlastHSPList structure.
Definition: blast_hits.c:1754
Int4 GetPrelimHitlistSize(Int4 hitlist_size, Int4 compositionBasedStats, Boolean gapped_calculation)
Definition: blast_hits.c:44
BlastHSPList * Blast_HSPListFree(BlastHSPList *hsp_list)
Deallocate memory for an HSP list structure as well as all its components.
Definition: blast_hits.c:1542
Int2 Blast_HitListUpdate(BlastHitList *hit_list, BlastHSPList *hsp_list)
Insert a new HSP list into the hit list.
Definition: blast_hits.c:3235
Utilities for dealing with BLAST HSPs in the core of BLAST.
Boolean Blast_ProgramIsRpsBlast(EBlastProgramType p)
Returns true if program is RPS-BLAST (i.e.
Definition: blast_program.c:73
EBlastProgramType
Defines the engine's notion of the different applications of the BLAST algorithm.
Definition: blast_program.h:72
Int4 Blast_GetQueryIndexFromContext(Int4 context, EBlastProgramType program)
Given a context from BLAST engine core, return the query index.
Various auxiliary BLAST utility functions.
char data[12]
Definition: iconv.c:80
#define SEV_WARNING
Definition: gicache.c:90
#define NULL
Definition: ncbistd.hpp:225
#define ErrPostEx(sev, err_code,...)
Definition: ncbierr.hpp:78
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
BlastHSPCollectorParams * BlastHSPCollectorParamsNew(const BlastHitSavingOptions *hit_options, Int4 compositionBasedStats, Boolean gapped_calculation)
The following are exported functions to be used by APP.
BlastHSPWriterInfo * BlastHSPCollectorInfoNew(BlastHSPCollectorParams *params)
WriterInfo to create a default writer: the collector.
struct BlastHSPCollectorData BlastHSPCollectorData
Data structure used by the writer.
static BlastHSPWriter * s_BlastHSPCollectorNew(void *params, BlastQueryInfo *query_info, BLAST_SequenceBlk *sequence)
create the writer
BlastHSPCollectorParams * BlastHSPCollectorParamsFree(BlastHSPCollectorParams *opts)
Deallocates the BlastHSPCollectorParams structure passed in.
static BlastHSPWriter * s_BlastHSPCollectorFree(BlastHSPWriter *writer)
Free the writer.
static int s_BlastHSPCollectorRun_RPS(void *data, BlastHSPList *hsplist_in)
Perform writing task for RPS case For RPS BLAST saving procedure is different, because HSPs from diff...
static int s_BlastHSPCollectorInit(void *data, void *hsp_results)
The following are implementations for BlastHSPWriter ADT.
static int s_ScoreCompareHSPWithContext(const void *h1, const void *h2)
Callback used for sorting HSPs by score, with HSPs from different contexts segregated from each other...
static int s_BlastHSPCollectorRun(void *data, BlastHSPList *hsp_list)
Perform writing task ownership of the HSP list and sets the dereferenced pointer to NULL.
static int s_BlastHSPCollectorFinal(void *data, void *results)
Perform post-run clean-ups.
Implementation of a number of BlastHSPWriters to save hits from a BLAST search, and subsequently retu...
Uint1 Boolean
bool replacement for C
Definition: ncbi_std.h:94
#define ASSERT
macro for assert.
Definition: ncbi_std.h:107
Structure to hold a sequence.
Definition: blast_def.h:242
Data structure used by the writer.
BlastHSPResults * results
place to store hits
BlastHSPCollectorParams * params
how many hits to save
Keeps prelim_hitlist_size and HitSavingOptions together.
EBlastProgramType program
program type
Int4 hsp_num_max
number of HSPs to save per db sequence.
Int4 prelim_hitlist_size
number of hits saved during preliminary part of search.
The structure to hold all HSPs for a given sequence after the gapped alignment.
Definition: blast_hits.h:153
Int4 oid
The ordinal id of the subject sequence this HSP list is for.
Definition: blast_hits.h:154
Int4 hspcnt
Number of HSPs saved.
Definition: blast_hits.h:158
BlastHSP ** hsp_array
Array of pointers to individual HSPs.
Definition: blast_hits.h:157
Int4 query_index
Index of the query which this HSPList corresponds to.
Definition: blast_hits.h:155
The structure to contain all BLAST results, for multiple queries.
Definition: blast_hits.h:183
BlastHitList ** hitlist_array
Array of results for individual query sequences.
Definition: blast_hits.h:185
Int4 num_queries
Number of query sequences.
Definition: blast_hits.h:184
A wrap of data structure used to create a writer.
BlastHSPWriterNewFn NewFnPtr
ADT definition of BlastHSPWriter.
void * data
data structure
BlastHSPWriterFinalFn FinalFnPtr
BlastHSPWriterFreeFn FreeFnPtr
BlastHSPWriterRunFn RunFnPtr
BlastHSPWriterInitFn InitFnPtr
Structure holding all information about an HSP.
Definition: blast_hits.h:126
Int4 context
Context number of query.
Definition: blast_hits.h:133
The structure to contain all BLAST results for one query sequence.
Definition: blast_hits.h:169
Options used when evaluating and saving hits These include: a.
EBlastProgramType program_number
indicates blastn, blastp, etc.
Int4 hitlist_size
Maximal number of database sequences to return results for.
The query related information.
else result
Definition: token2.c:20
voidp malloc(uInt size)
voidp calloc(uInt items, uInt size)
Modified on Wed Apr 17 13:09:10 2024 by modify_doxy.py rev. 669887