NCBI C++ ToolKit
blast_test_util.cpp
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: blast_test_util.cpp 65550 2014-12-11 16:00:59Z camacho $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Christiam Camacho
27  *
28  */
29 
30 /** @file blast_test_util.cpp
31  * Utilities to develop and debug unit tests for BLAST
32  */
33 
34 #include <ncbi_pch.hpp>
35 #include "blast_test_util.hpp"
36 #include <corelib/ncbimisc.hpp>
37 #include <corelib/ncbitype.h>
38 #include <util/random_gen.hpp>
39 
40 // BLAST includes
42 #include "blast_objmgr_priv.hpp"
43 
44 // Serialization includes
45 #include <serial/serial.hpp>
46 #include <serial/objistr.hpp>
47 
48 // Object manager includes
49 #include <objmgr/bioseq_handle.hpp>
50 #include <objmgr/seq_vector.hpp>
52 
53 // Object includes
55 
56 // Formatter includes
58 
59 #include <sstream>
60 
61 #include <corelib/test_boost.hpp>
62 
63 using namespace std;
64 using namespace ncbi;
65 using namespace ncbi::objects;
66 using namespace ncbi::blast;
67 
68 namespace TestUtil {
69 
70 objects::CSeq_id* GenerateRandomSeqid_Gi()
71 {
72  static CRandom random_gen((CRandom::TValue)time(0));
73  return new CSeq_id(CSeq_id::e_Gi, random_gen.GetRand(1, 20000000));
74 }
75 
76 vector<EBlastProgramType> GetAllBlastProgramTypes()
77 {
78  vector<EBlastProgramType> retval;
79  retval.push_back(eBlastTypeBlastp);
80  retval.push_back(eBlastTypeBlastn);
81  retval.push_back(eBlastTypeBlastx);
82  retval.push_back(eBlastTypeTblastn);
83  retval.push_back(eBlastTypeTblastx);
84 
85  retval.push_back(eBlastTypePsiBlast);
86  retval.push_back(eBlastTypePsiTblastn);
87 
88  retval.push_back(eBlastTypeRpsBlast);
89  retval.push_back(eBlastTypeRpsTblastn);
90 
91  retval.push_back(eBlastTypePhiBlastp);
92  retval.push_back(eBlastTypePhiBlastn);
93 
94  return retval;
95 }
96 
99 {
100  CRef<CSeq_align_set> retval(new CSeq_align_set());
101 
102  ITERATE(CSeq_align_set::Tdata, i, sset.Get()) {
103  ASSERT((*i)->GetSegs().IsDisc());
104 
105  ITERATE(CSeq_align::C_Segs::TDisc::Tdata, hsp_itr,
106  (*i)->GetSegs().GetDisc().Get()) {
107  retval->Set().push_back((*hsp_itr));
108  }
109  }
110 
111  return retval;
112 }
113 
114 #if 0
// Disabled code: this function sits inside an `#if 0` region and is not
// compiled. It sets up a CDisplaySeqalign formatter for an alignment set,
// applies the `align_opt` display flags and writes the result to `out`.
// NOTE(review): this copy is incomplete -- the `align_opt` initializer ends
// on a dangling `|` and `saset` is used without a visible declaration.
// Both must be restored before the block can be re-enabled.
115 void PrintFormattedSeqAlign(ostream& out,
116  const CSeq_align_set* sas,
117  CScope& scope)
118 {
119  ASSERT(sas);
120 
121  int align_opt = CDisplaySeqalign::eShowMiddleLine |
125 
127 
128  CDisplaySeqalign formatter(*saset, scope);
129  formatter.SetAlignOption(align_opt);
130  formatter.DisplaySeqalign(out);
131 }
132 #endif
133 
134 // Pretty print sequence
135 void PrintSequence(const Uint1* seq, TSeqPos len, ostream& out,
136  bool show_markers, TSeqPos chars_per_line)
137 {
138  TSeqPos nlines = len/chars_per_line;
139 
140  for (TSeqPos line = 0; line < nlines + 1; line++) {
141 
142  // print chars_per_line residues/bases
143  for (TSeqPos i = (chars_per_line*line);
144  i < chars_per_line*(line+1) && (i < len); i++) {
145  out << GetResidue(seq[i]);
146  }
147  out << endl;
148 
149  if ( !show_markers )
150  continue;
151 
152  // print the residue/base markers
153  for (TSeqPos i = (chars_per_line*line);
154  i < chars_per_line*(line+1) && (i < len); i++) {
155  if (i == 0 || ((i%10) == 0)) {
156  out << i;
157  stringstream ss;
158  ss << i;
159  TSeqPos marker_length = ss.str().size();
160  i += (marker_length-1);
161  } else {
162  out << " ";
163  }
164  }
165  out << endl;
166  }
167 }
168 
169 void PrintSequence(const CSeqVector svector, ostream& out,
170  bool show_markers, TSeqPos chars_per_line)
171 {
172  TSeqPos nlines = svector.size()/chars_per_line;
173 
174  for (TSeqPos line = 0; line < nlines + 1; line++) {
175 
176  // print chars_per_line residues/bases
177  for (TSeqPos i = (chars_per_line*line);
178  i < chars_per_line*(line+1) && (i < svector.size()); i++) {
179  out << GetResidue(svector[i]);
180  }
181  out << endl;
182 
183  if ( !show_markers )
184  continue;
185 
186  // print the residue/base markers
187  for (TSeqPos i = (chars_per_line*line);
188  i < chars_per_line*(line+1) && (i < svector.size()); i++) {
189  if (i == 0 || ((i%10) == 0)) {
190  out << i;
191  stringstream ss;
192  ss << i;
193  TSeqPos marker_length = ss.str().size();
194  i += (marker_length-1);
195  } else {
196  out << " ";
197  }
198  }
199  out << endl;
200  }
201 }
202 
203 char GetResidue(unsigned int res)
204 {
205  if ( !(res < BLASTAA_SIZE)) {
206  std::stringstream ss;
207  ss << "TestUtil::GetResidue(): Invalid residue " << res;
208  throw std::runtime_error(ss.str());
209  }
210  return NCBISTDAA_TO_AMINOACID[res];
211 
212 }
213 
215 CreateProtQueryInfo(unsigned int query_size)
216 {
218  if ( !retval ) {
219  return NULL;
220  }
221  retval->contexts[0].query_length = query_size;
222  retval->max_length = query_size;
223  return retval;
224 }
225 
228 {
229  if ( !seqsrc ) {
230  return;
231  }
232 
233  char* error_str = BlastSeqSrcGetInitError(seqsrc);
234  if (error_str) {
235  string msg(error_str);
236  sfree(error_str);
237  NCBI_THROW(CBlastException, eSeqSrcInit, msg);
238  }
239 }
240 
241 Uint4
243  Uint4 byte_length,
244  Uint4 swap_size,
245  Uint4 hash_seed)
246 {
247  Uint4 hash = hash_seed;
248  Uint4 swap_mask = swap_size - 1;
249 
250  // Check that swapsize is a power of two.
251  _ASSERT((swap_size) && (0 == (swap_mask & swap_size)));
252 
253  // Insure that the byte_length is a multiple of swap_size
254  _ASSERT((byte_length & swap_mask) == 0);
255 
256  Uint1 end_bytes[] = { 0x44, 0x33, 0x22, 0x11 };
257  Uint4 end_value = *((int *) & end_bytes);
258 
259  if (end_value == 0x11223344) {
260  // Prevent actual swapping on little endian machinery.
261  swap_size = 1;
262  swap_mask = 0;
263  }
264 
265  Uint4 keep_mask = ~ swap_mask;
266 
267  // Logical address is the address if the data was little endian.
268 
269  for(Uint4 logical = 0; logical < byte_length; logical++) {
270  Uint4 physical =
271  (logical & keep_mask) | (swap_mask - (logical & swap_mask));
272 
273  // Alternate addition and XOR. This technique destroys most
274  // of the possible mathematical relationships between similar
275  // input strings.
276 
277  if (logical & 1) {
278  hash += int(buffer[physical]) & 0xFF;
279  } else {
280  hash ^= int(buffer[physical]) & 0xFF;
281  }
282 
283  // 1. "Rotate" by a value relatively prime to 32 (any odd
284  // value), to insure that each input bit will eventually
285  // affect each position.
286  // 2. Add a per-iteration constant to detect changes in length.
287 
288  hash = ((hash << 13) | (hash >> 19)) + 1234;
289  }
290 
291  return hash;
292 }
293 
294 }
295 
Sequence alignment display tool.
Contains C++ wrapper classes to structures in algo/blast/core as well as some auxiliary functions to ...
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
Definition: blast_def.h:112
Definitions which are dependent on the NCBI C++ Object Manager.
@ eBlastTypeBlastn
Definition: blast_program.h:74
@ eBlastTypeBlastx
Definition: blast_program.h:75
@ eBlastTypePsiTblastn
Definition: blast_program.h:83
@ eBlastTypeRpsTblastn
Definition: blast_program.h:85
@ eBlastTypePhiBlastn
Definition: blast_program.h:87
@ eBlastTypeTblastx
Definition: blast_program.h:79
@ eBlastTypePsiBlast
Definition: blast_program.h:82
@ eBlastTypePhiBlastp
Definition: blast_program.h:86
@ eBlastTypeRpsBlast
Definition: blast_program.h:84
@ eBlastTypeTblastn
Definition: blast_program.h:77
@ eBlastTypeBlastp
Definition: blast_program.h:73
BlastQueryInfo * BlastQueryInfoNew(EBlastProgramType program, int num_queries)
Allocate memory for query information structure.
char * BlastSeqSrcGetInitError(const BlastSeqSrc *seq_src)
Function to retrieve NULL terminated string containing the description of an initialization error or ...
Definition: blast_seqsrc.c:159
Defines BLAST error codes (user errors included)
CRandom::
Definition: random_gen.hpp:66
CScope –.
Definition: scope.hpp:92
CSeqVector –.
Definition: seq_vector.hpp:65
size_type size() const noexcept
return size of the vector
Definition: bmsparsevec.h:729
std::ofstream out("events_result.xml")
main entry point for tests
#define BLASTAA_SIZE
Size of aminoacid alphabet.
const char NCBISTDAA_TO_AMINOACID[]
Translates between ncbieaa and ncbistdaa.
unsigned int TSeqPos
Type for sequence locations and lengths.
Definition: ncbimisc.hpp:875
#define ITERATE(Type, Var, Cont)
ITERATE macro to sequence through container elements.
Definition: ncbimisc.hpp:815
#define NULL
Definition: ncbistd.hpp:225
#define NCBI_THROW(exception_class, err_code, message)
Generic macro to throw an exception, given the exception class, error code and message string.
Definition: ncbiexpt.hpp:704
#define THROWS(x)
Definition: ncbiexpt.hpp:75
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
uint32_t Uint4
4-byte (32-bit) unsigned integer
Definition: ncbitype.h:103
Uint4 TValue
Type of the generated integer value and/or the seed value.
Definition: random_gen.hpp:69
TValue GetRand(void)
Get the next random number in the interval [0..GetMax()] (inclusive)
Definition: random_gen.hpp:238
Tdata & Set(void)
Assign a value to data member.
list< CRef< CSeq_align > > Tdata
const Tdata & Get(void) const
Get the member data.
@ e_Gi
GenInfo Integrated Database.
Definition: Seq_id_.hpp:106
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1210
static const char * GetResidue(TokenStatBlkPtr stoken)
Definition: indx_blk.cpp:234
int i
int len
void PrintFormattedSeqAlign(ostream &out, const CSeq_align_set *sas, CScope &scope)
objects::CSeq_id * GenerateRandomSeqid_Gi()
vector< EBlastProgramType > GetAllBlastProgramTypes()
BlastQueryInfo * CreateProtQueryInfo(unsigned int query_size)
Creates and initializes a BlastQueryInfo structure for a single protein sequence.
CRef< CSeq_align_set > FlattenSeqAlignSet(const CSeq_align_set &sset)
void CheckForBlastSeqSrcErrors(const BlastSeqSrc *seqsrc)
Uint4 EndianIndependentBufferHash(const char *buffer, Uint4 byte_length, Uint4 swap_size, Uint4 hash_seed)
Endianness independent hash function.
void PrintSequence(const CSeqVector svector, ostream &out, bool show_markers, TSeqPos chars_per_line)
Magic spell ;-) needed for some weird compilers... very empiric.
#define ASSERT
macro for assert.
Definition: ncbi_std.h:107
Miscellaneous common-use basic types and functionality.
Defines Limits for the types used in NCBI C/C++ toolkit.
Uint4 end_value
char end_bytes[4]
Utilities to develop and debug unit tests for BLAST.
static pcre_uint8 * buffer
Definition: pcretest.c:1051
Int4 query_length
Length of this query, strand or frame.
The query related information.
BlastContextInfo * contexts
Information per context.
Uint4 max_length
Length of the longest among the concatenated queries.
Complete type definition of Blast Sequence Source ADT.
Definition: blast_seqsrc.c:43
Definition: _hash_fun.h:40
#define _ASSERT
Utility stuff for more convenient using of Boost.Test library.
Modified on Tue Apr 30 06:41:10 2024 by modify_doxy.py rev. 669887