NCBI C++ ToolKit
lookup_util.c
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /* $Id: lookup_util.c 73100 2016-06-20 15:45:40Z boratyng $
2  * ===========================================================================
3  *
4  * PUBLIC DOMAIN NOTICE
5  * National Center for Biotechnology Information
6  *
7  * This software/database is a "United States Government Work" under the
8  * terms of the United States Copyright Act. It was written as part of
9  * the author's official duties as a United States Government employee and
10  * thus cannot be copyrighted. This software/database is freely available
11  * to the public for use. The National Library of Medicine and the U.S.
12  * Government have not placed any restriction on its use or reproduction.
13  *
14  * Although all reasonable efforts have been taken to ensure the accuracy
15  * and reliability of the software and data, the NLM and the U.S.
16  * Government do not and cannot warrant the performance or results that
17  * may be obtained by using this software or data. The NLM and the U.S.
18  * Government disclaim all warranties, express or implied, including
19  * warranties of performance, merchantability or fitness for any particular
20  * purpose.
21  *
22  * Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  */
26 
27 /** @file lookup_util.c
28  * Utility functions for lookup table generation.
29  */
30 
32 
33 static void fkm_output(Int4 * a,
34  Int4 n,
35  Int4 p,
36  Uint1 * output,
37  Int4 * cursor,
38  Uint1 * alphabet);
39 static void fkm(Int4 * a,
40  Int4 n,
41  Int4 k,
42  Uint1 * output,
43  Int4 * cursor,
44  Uint1 * alphabet);
45 
46 Int4
48  Int4 n)
49 {
50  Int4 r, y;
51 
52  r = 1;
53  y = x;
54 
55  if (n == 0)
56  return 1;
57  if (x == 0)
58  return 0;
59 
60  while (n > 1) {
61  if ((n % 2) == 1)
62  r *= y;
63  n = n >> 1;
64  y = y * y;
65  }
66  r = r * y;
67  return r;
68 }
69 
70 Int4
72 {
73  Int4 lg = 0;
74 
75  if (x == 0)
76  return 0;
77 
78  while ((x = x >> 1))
79  lg++;
80 
81  return lg;
82 }
83 
84 /** Output a Lyndon word as part of a de Bruijn sequence.
85  *
86  *if the length of a lyndon word is divisible by n, print it.
87  * @param a the shift register
88  * @param p
89  * @param n
90  * @param output the output sequence
91  * @param cursor current location in the output sequence
92  * @param alphabet optional translation alphabet
93  */
94 
95 static void
97  Int4 n,
98  Int4 p,
99  Uint1 * output,
100  Int4 * cursor,
101  Uint1 * alphabet)
102 {
103  Int4 i;
104 
105  if (n % p == 0)
106  for (i = 1; i <= p; i++) {
107  if (alphabet != NULL)
108  output[*cursor] = alphabet[a[i]];
109  else
110  output[*cursor] = a[i];
111  *cursor = *cursor + 1;
112  }
113 }
114 
115 /**
116  * iterative fredricksen-kessler-maiorana algorithm
117  * to generate de bruijn sequences.
118  *
119  * generates all lyndon words, in lexicographic order.
120  * the concatenation of all lyndon words whose length is
121  * divisible by n yields a de bruijn sequence.
122  *
123  * further, the sequence generated is of shortest lexicographic length.
124  *
125  * references:
126  * http://mathworld.wolfram.com/deBruijnSequence.html
127  * http://www.theory.csc.uvic.ca/~cos/inf/neck/NecklaceInfo.html
128  * http://www.cs.usyd.edu.au/~algo4301/ , chapter 7.2
129  * http://citeseer.nj.nec.com/ruskey92generating.html
130  *
131  * @param a the shift register
132  * @param n the number of letters in each word
133  * @param k the size of the alphabet
134  * @param output the output sequence
135  * @param cursor the current location in the output sequence
136  * @param alphabet optional translation alphabet
137  */
138 
139 static void
141  Int4 n,
142  Int4 k,
143  Uint1 * output,
144  Int4 * cursor,
145  Uint1 * alphabet)
146 {
147  Int4 i, j;
148 
149  fkm_output(a, n, 1, output, cursor, alphabet);
150 
151  i = n;
152 
153  do {
154  a[i] = a[i] + 1;
155 
156  for (j = 1; j <= n - i; j++)
157  a[j + i] = a[j];
158 
159  fkm_output(a, n, i, output, cursor, alphabet);
160 
161  i = n;
162 
163  while (a[i] == k - 1)
164  i--;
165  }
166  while (i > 0);
167 }
168 
169 void
171  Int4 k,
172  Uint1 * output,
173  Uint1 * alphabet)
174 {
175  Int4 *a;
176  Int4 cursor = 0;
177 
178  /* n+1 because the array is indexed from one, not zero */
179  a = (Int4 *) calloc((n + 1), sizeof(Int4));
180 
181  /* compute the (n,k) de Bruijn sequence and store it in output */
182 
183  fkm(a, n, k, output, &cursor, alphabet);
184 
185  sfree(a);
186  return;
187 }
188 
189 Int4
191 {
192  Int4 num_entries = 0;
193  Int4 curr_max = 0;
194  BlastSeqLoc *loc = location;
195 
196  while (loc) {
197  num_entries += loc->ssr->right - loc->ssr->left;
198  curr_max = MAX(curr_max, loc->ssr->right);
199  loc = loc->next;
200  }
201 
202  *max_off = curr_max;
203  return num_entries;
204 }
#define sfree(x)
Safe free a pointer: belongs to a higher level header.
Definition: blast_def.h:112
static SQLCHAR output[256]
Definition: print.c:5
static const char location[]
Definition: config.c:97
#define NULL
Definition: ncbistd.hpp:225
uint8_t Uint1
1-byte (8-bit) unsigned integer
Definition: ncbitype.h:99
int32_t Int4
4-byte (32-bit) signed integer
Definition: ncbitype.h:102
int64_t Int8
8-byte (64-bit) signed integer
Definition: ncbitype.h:104
int i
yy_size_t n
Int4 EstimateNumTableEntries(BlastSeqLoc *location, Int4 *max_off)
Given a list of query locations, estimate the number of words that would need to be added to a lookup...
Definition: lookup_util.c:190
static void fkm(Int4 *a, Int4 n, Int4 k, Uint1 *output, Int4 *cursor, Uint1 *alphabet)
iterative fredricksen-kessler-maiorana algorithm to generate de bruijn sequences.
Definition: lookup_util.c:140
Int4 iexp(Int4 x, Int4 n)
Integer exponentiation using right to left binary algorithm.
Definition: lookup_util.c:47
Int4 ilog2(Int8 x)
Integer base two logarithm.
Definition: lookup_util.c:71
void debruijn(Int4 n, Int4 k, Uint1 *output, Uint1 *alphabet)
generates a de Bruijn sequence containing all substrings of length n over an alphabet of size k.
Definition: lookup_util.c:170
static void fkm_output(Int4 *a, Int4 n, Int4 p, Uint1 *output, Int4 *cursor, Uint1 *alphabet)
Output a Lyndon word as part of a de Bruijn sequence.
Definition: lookup_util.c:96
Utility functions for lookup table generation.
unsigned int a
Definition: ncbi_localip.c:102
#define MAX(a, b)
returns larger of a and b.
Definition: ncbi_std.h:117
double r(size_t dimension_, const Int4 *score_, const double *prob_, double theta_)
Used to hold a set of positions, mostly used for filtering.
Definition: blast_def.h:204
SSeqRange * ssr
location data on the sequence.
Definition: blast_def.h:206
struct BlastSeqLoc * next
next in linked list
Definition: blast_def.h:205
Int4 left
left endpoint of range (zero based)
Definition: blast_def.h:156
Int4 right
right endpoint of range (zero based)
Definition: blast_def.h:157
voidp calloc(uInt items, uInt size)
Modified on Sun Apr 14 05:24:18 2024 by modify_doxy.py rev. 669887