NCBI C++ ToolKit
mdb.c
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /** @file mdb.c
2  * @brief Lightning memory-mapped database library
3  *
4  * A Btree-based database management library modeled loosely on the
5  * BerkeleyDB API, but much simplified.
6  */
7 /*
8  * Copyright 2011-2019 Howard Chu, Symas Corp.
9  * All rights reserved.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted only as authorized by the OpenLDAP
13  * Public License.
14  *
15  * A copy of this license is available in the file LICENSE in the
16  * top-level directory of the distribution or, alternatively, at
17  * <http://www.OpenLDAP.org/license.html>.
18  *
19  * This code is derived from btree.c written by Martin Hedenfalk.
20  *
21  * Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se>
22  *
23  * Permission to use, copy, modify, and distribute this software for any
24  * purpose with or without fee is hereby granted, provided that the above
25  * copyright notice and this permission notice appear in all copies.
26  *
27  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
28  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
29  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
30  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
31  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
32  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
33  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
34  */
35 #ifndef _GNU_SOURCE
36 #define _GNU_SOURCE 1
37 #endif
38 #if defined(__WIN64__)
39 #define _FILE_OFFSET_BITS 64
40 #endif
41 #ifdef _WIN32
42 #include <malloc.h>
43 #include <windows.h>
44 #include <wchar.h> /* get wcscpy() */
45 
46 /** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it
47  * as int64 which is wrong. MSVC doesn't define it at all, so just
48  * don't use it.
49  */
50 #define MDB_PID_T int
51 #define MDB_THR_T DWORD
52 #include <sys/types.h>
53 #include <sys/stat.h>
54 #ifdef __GNUC__
55 # include <sys/param.h>
56 #else
57 # define LITTLE_ENDIAN 1234
58 # define BIG_ENDIAN 4321
59 # define BYTE_ORDER LITTLE_ENDIAN
60 # ifndef SSIZE_MAX
61 # define SSIZE_MAX INT_MAX
62 # endif
63 #endif
64 #else
65 #include <sys/types.h>
66 #include <sys/stat.h>
67 #define MDB_PID_T pid_t
68 #define MDB_THR_T pthread_t
69 #include <sys/param.h>
70 #include <sys/uio.h>
71 #include <sys/mman.h>
72 #ifdef HAVE_SYS_FILE_H
73 #include <sys/file.h>
74 #endif
75 #include <fcntl.h>
76 #endif
77 
78 #if defined(__mips) && defined(__linux)
79 /* MIPS has cache coherency issues, requires explicit cache control */
80 #include <asm/cachectl.h>
81 extern int cacheflush(char *addr, int nbytes, int cache);
82 #define CACHEFLUSH(addr, bytes, cache) cacheflush(addr, bytes, cache)
83 #else
84 #define CACHEFLUSH(addr, bytes, cache)
85 #endif
86 
87 #if defined(__linux) && !defined(MDB_FDATASYNC_WORKS)
88 /** fdatasync is broken on ext3/ext4fs on older kernels, see
89  * description in #mdb_env_open2 comments. You can safely
90  * define MDB_FDATASYNC_WORKS if this code will only be run
91  * on kernels 3.6 and newer.
92  */
93 #define BROKEN_FDATASYNC
94 #endif
95 
96 #include <errno.h>
97 #include <limits.h>
98 #include <stddef.h>
99 #include <inttypes.h>
100 #include <stdio.h>
101 #include <stdlib.h>
102 #include <string.h>
103 #include <time.h>
104 
105 #ifdef _MSC_VER
106 #include <io.h>
107 typedef SSIZE_T ssize_t;
108 #else
109 #include <unistd.h>
110 #endif
111 
112 #if defined(__sun) || defined(ANDROID)
113 /* Most platforms have posix_memalign, older may only have memalign */
114 #define HAVE_MEMALIGN 1
115 #include <malloc.h>
116 /* On Solaris, we need the POSIX sigwait function */
117 #if defined (__sun)
118 # define _POSIX_PTHREAD_SEMANTICS 1
119 #endif
120 #endif
121 
122 #if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER))
123 #include <netinet/in.h>
124 #include <resolv.h> /* defines BYTE_ORDER on HPUX and Solaris */
125 #endif
126 
127 #if defined(__APPLE__) || defined (BSD) || defined(__FreeBSD_kernel__)
128 # define MDB_USE_POSIX_SEM 1
129 # define MDB_FDATASYNC fsync
130 #elif defined(ANDROID)
131 # define MDB_FDATASYNC fsync
132 #endif
133 
134 #ifndef _WIN32
135 #include <pthread.h>
136 #include <signal.h>
137 #ifdef MDB_USE_POSIX_SEM
138 # define MDB_USE_HASH 1
139 #include <semaphore.h>
140 #else
141 #define MDB_USE_POSIX_MUTEX 1
142 #endif
143 #endif
144 
145 #if defined(_WIN32) + defined(MDB_USE_POSIX_SEM) \
146  + defined(MDB_USE_POSIX_MUTEX) != 1
147 # error "Ambiguous shared-lock implementation"
148 #endif
149 
150 #ifdef USE_VALGRIND
151 #include <valgrind/memcheck.h>
152 #define VGMEMP_CREATE(h,r,z) VALGRIND_CREATE_MEMPOOL(h,r,z)
153 #define VGMEMP_ALLOC(h,a,s) VALGRIND_MEMPOOL_ALLOC(h,a,s)
154 #define VGMEMP_FREE(h,a) VALGRIND_MEMPOOL_FREE(h,a)
155 #define VGMEMP_DESTROY(h) VALGRIND_DESTROY_MEMPOOL(h)
156 #define VGMEMP_DEFINED(a,s) VALGRIND_MAKE_MEM_DEFINED(a,s)
157 #else
158 #define VGMEMP_CREATE(h,r,z)
159 #define VGMEMP_ALLOC(h,a,s)
160 #define VGMEMP_FREE(h,a)
161 #define VGMEMP_DESTROY(h)
162 #define VGMEMP_DEFINED(a,s)
163 #endif
164 
165 #ifndef BYTE_ORDER
166 # if (defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN))
167 /* Solaris just defines one or the other */
168 # define LITTLE_ENDIAN 1234
169 # define BIG_ENDIAN 4321
170 # ifdef _LITTLE_ENDIAN
171 # define BYTE_ORDER LITTLE_ENDIAN
172 # else
173 # define BYTE_ORDER BIG_ENDIAN
174 # endif
175 # else
176 # define BYTE_ORDER __BYTE_ORDER
177 # endif
178 #endif
179 
180 #ifndef LITTLE_ENDIAN
181 #define LITTLE_ENDIAN __LITTLE_ENDIAN
182 #endif
183 #ifndef BIG_ENDIAN
184 #define BIG_ENDIAN __BIG_ENDIAN
185 #endif
186 
187 #if defined(__i386) || defined(__x86_64) || defined(_M_IX86)
188 #define MISALIGNED_OK 1
189 #endif
190 
191 #include "lmdb.h"
192 #include "midl.h"
193 
194 #if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN)
195 # error "Unknown or unsupported endianness (BYTE_ORDER)"
196 #elif (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF
197 # error "Two's complement, reasonably sized integer types, please"
198 #endif
199 
200 #ifdef __GNUC__
201 /** Put infrequently used env functions in separate section */
202 # ifdef __APPLE__
203 # define ESECT __attribute__ ((section("__TEXT,text_env")))
204 # else
205 # define ESECT __attribute__ ((section("text_env")))
206 # endif
207 #else
208 #define ESECT
209 #endif
210 
211 #ifdef _WIN32
212 #define CALL_CONV WINAPI
213 #else
214 #define CALL_CONV
215 #endif
216 
217 /** @defgroup internal LMDB Internals
218  * @{
219  */
220 /** @defgroup compat Compatibility Macros
221  * A bunch of macros to minimize the amount of platform-specific ifdefs
222  * needed throughout the rest of the code. When the features this library
223  * needs are similar enough to POSIX to be hidden in a one-or-two line
224  * replacement, this macro approach is used.
225  * @{
226  */
227 
228  /** Features under development */
229 #ifndef MDB_DEVEL
230 #define MDB_DEVEL 0
231 #endif
232 
233  /** Wrapper around __func__, which is a C99 feature */
234 #if __STDC_VERSION__ >= 199901L
235 # define mdb_func_ __func__
236 #elif __GNUC__ >= 2 || _MSC_VER >= 1300
237 # define mdb_func_ __FUNCTION__
238 #else
239 /* If a debug message says <mdb_unknown>(), update the #if statements above */
240 # define mdb_func_ "<mdb_unknown>"
241 #endif
242 
243 /* Internal error codes, not exposed outside liblmdb */
244 #define MDB_NO_ROOT (MDB_LAST_ERRCODE + 10)
245 #ifdef _WIN32
246 #define MDB_OWNERDEAD ((int) WAIT_ABANDONED)
247 #elif defined(MDB_USE_POSIX_MUTEX) && defined(EOWNERDEAD)
248 #define MDB_OWNERDEAD EOWNERDEAD /**< #LOCK_MUTEX0() result if dead owner */
249 #endif
250 
251 #ifdef __GLIBC__
252 #define GLIBC_VER ((__GLIBC__ << 16 )| __GLIBC_MINOR__)
253 #endif
254 /** Some platforms define the EOWNERDEAD error code
255  * even though they don't support Robust Mutexes.
256  * Compile with -DMDB_USE_ROBUST=0, or use some other
257  * mechanism like -DMDB_USE_POSIX_SEM instead of
258  * -DMDB_USE_POSIX_MUTEX.
259  * (Posix semaphores are not robust.)
260  */
261 #ifndef MDB_USE_ROBUST
262 # ifdef __CYGWIN__
263 # define MDB_USE_ROBUST 0
264 # endif
265 #endif
266 
267 #ifndef MDB_USE_ROBUST
268 /* Android currently lacks Robust Mutex support. So does glibc < 2.4. */
269 # if defined(MDB_USE_POSIX_MUTEX) && (defined(ANDROID) || \
270  (defined(__GLIBC__) && GLIBC_VER < 0x020004))
271 # define MDB_USE_ROBUST 0
272 # else
273 # define MDB_USE_ROBUST 1
274 # endif
275 #endif /* !MDB_USE_ROBUST */
276 
277 #if defined(MDB_USE_POSIX_MUTEX) && (MDB_USE_ROBUST)
278 /* glibc < 2.12 only provided _np API */
279 # if (defined(__GLIBC__) && GLIBC_VER < 0x02000c) || \
280  (defined(PTHREAD_MUTEX_ROBUST_NP) && !defined(PTHREAD_MUTEX_ROBUST))
281 # define PTHREAD_MUTEX_ROBUST PTHREAD_MUTEX_ROBUST_NP
282 # define pthread_mutexattr_setrobust(attr, flag) pthread_mutexattr_setrobust_np(attr, flag)
283 # define pthread_mutex_consistent(mutex) pthread_mutex_consistent_np(mutex)
284 # endif
285 #endif /* MDB_USE_POSIX_MUTEX && MDB_USE_ROBUST */
286 
287 #if defined(MDB_OWNERDEAD) && (MDB_USE_ROBUST)
288 #define MDB_ROBUST_SUPPORTED 1
289 #endif
290 
291 #ifdef _WIN32
292 #define MDB_USE_HASH 1
293 #define MDB_PIDLOCK 0
294 #define THREAD_RET DWORD
295 #define pthread_t HANDLE
296 #define pthread_mutex_t HANDLE
297 #define pthread_cond_t HANDLE
299 #define pthread_key_t DWORD
300 #define pthread_self() GetCurrentThreadId()
301 #define pthread_key_create(x,y) \
302  ((*(x) = TlsAlloc()) == TLS_OUT_OF_INDEXES ? ErrCode() : 0)
303 #define pthread_key_delete(x) TlsFree(x)
304 #define pthread_getspecific(x) TlsGetValue(x)
305 #define pthread_setspecific(x,y) (TlsSetValue(x,y) ? 0 : ErrCode())
306 #define pthread_mutex_unlock(x) ReleaseMutex(*x)
307 #define pthread_mutex_lock(x) WaitForSingleObject(*x, INFINITE)
308 #define pthread_cond_signal(x) SetEvent(*x)
309 #define pthread_cond_wait(cond,mutex) do{SignalObjectAndWait(*mutex, *cond, INFINITE, FALSE); WaitForSingleObject(*mutex, INFINITE);}while(0)
310 #define THREAD_CREATE(thr,start,arg) \
311  (((thr) = CreateThread(NULL, 0, start, arg, 0, NULL)) ? 0 : ErrCode())
312 #define THREAD_FINISH(thr) \
313  (WaitForSingleObject(thr, INFINITE) ? ErrCode() : 0)
314 #define LOCK_MUTEX0(mutex) WaitForSingleObject(mutex, INFINITE)
315 #define UNLOCK_MUTEX(mutex) ReleaseMutex(mutex)
316 #define mdb_mutex_consistent(mutex) 0
317 #define getpid() GetCurrentProcessId()
318 #define MDB_FDATASYNC(fd) (!FlushFileBuffers(fd))
319 #define MDB_MSYNC(addr,len,flags) (!FlushViewOfFile(addr,len))
320 #define ErrCode() GetLastError()
321 #define GET_PAGESIZE(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;}
322 #define close(fd) (CloseHandle(fd) ? 0 : -1)
323 #define munmap(ptr,len) UnmapViewOfFile(ptr)
324 #ifdef PROCESS_QUERY_LIMITED_INFORMATION
325 #define MDB_PROCESS_QUERY_LIMITED_INFORMATION PROCESS_QUERY_LIMITED_INFORMATION
326 #else
327 #define MDB_PROCESS_QUERY_LIMITED_INFORMATION 0x1000
328 #endif
329 #define Z "I"
330 #else
331 #define THREAD_RET void *
332 #define THREAD_CREATE(thr,start,arg) pthread_create(&thr,NULL,start,arg)
333 #define THREAD_FINISH(thr) pthread_join(thr,NULL)
334 #define Z "z" /**< printf format modifier for size_t */
335 
336  /** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */
337 #define MDB_PIDLOCK 1
338 
339 #ifdef MDB_USE_POSIX_SEM
340 
341 typedef sem_t *mdb_mutex_t, *mdb_mutexref_t;
342 #define LOCK_MUTEX0(mutex) mdb_sem_wait(mutex)
343 #define UNLOCK_MUTEX(mutex) sem_post(mutex)
344 
/** Wait on a POSIX semaphore, retrying if interrupted by a signal.
 * @param[in] sem the semaphore to wait on
 * @return 0 on success, otherwise the errno left by the failing sem_wait().
 */
static int
mdb_sem_wait(sem_t *sem)
{
	for (;;) {
		if (sem_wait(sem) == 0)
			return 0;
		if (errno != EINTR)	/* only signal interruptions are retried */
			return errno;
	}
}
352 
353 #else /* MDB_USE_POSIX_MUTEX: */
354  /** Shared mutex/semaphore as the original is stored.
355  *
356  * Not for copies. Instead it can be assigned to an #mdb_mutexref_t.
357  * When mdb_mutexref_t is a pointer and mdb_mutex_t is not, then it
358  * is array[size 1] so it can be assigned to the pointer.
359  */
360 typedef pthread_mutex_t mdb_mutex_t[1];
361  /** Reference to an #mdb_mutex_t */
362 typedef pthread_mutex_t *mdb_mutexref_t;
363  /** Lock the reader or writer mutex.
364  * Returns 0 or a code to give #mdb_mutex_failed(), as in #LOCK_MUTEX().
365  */
366 #define LOCK_MUTEX0(mutex) pthread_mutex_lock(mutex)
367  /** Unlock the reader or writer mutex.
368  */
369 #define UNLOCK_MUTEX(mutex) pthread_mutex_unlock(mutex)
370  /** Mark mutex-protected data as repaired, after death of previous owner.
371  */
372 #define mdb_mutex_consistent(mutex) pthread_mutex_consistent(mutex)
373 #endif /* MDB_USE_POSIX_SEM */
374 
375  /** Get the error code for the last failed system function.
376  */
377 #define ErrCode() errno
378 
379  /** An abstraction for a file handle.
380  * On POSIX systems file handles are small integers. On Windows
381  * they're opaque pointers.
382  */
383 #define HANDLE int
384 
385  /** A value for an invalid file handle.
386  * Mainly used to initialize file variables and signify that they are
387  * unused.
388  */
389 #define INVALID_HANDLE_VALUE (-1)
390 
391  /** Get the size of a memory page for the system.
392  * This is the basic size that the platform's memory manager uses, and is
393  * fundamental to the use of memory-mapped files.
394  */
395 #define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE))
396 #endif
397 
398 #if defined(_WIN32) || defined(MDB_USE_POSIX_SEM)
399 #define MNAME_LEN 32
400 #else
401 #define MNAME_LEN (sizeof(pthread_mutex_t))
402 #endif
403 
404 /** @} */
405 
406 #ifdef MDB_ROBUST_SUPPORTED
407  /** Lock mutex, handle any error, set rc = result.
408  * Return 0 on success, nonzero (not rc) on error.
409  */
410 #define LOCK_MUTEX(rc, env, mutex) \
411  (((rc) = LOCK_MUTEX0(mutex)) && \
412  ((rc) = mdb_mutex_failed(env, mutex, rc)))
413 static int mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc);
414 #else
415 #define LOCK_MUTEX(rc, env, mutex) ((rc) = LOCK_MUTEX0(mutex))
416 #define mdb_mutex_failed(env, mutex, rc) (rc)
417 #endif
418 
419 #ifndef _WIN32
420 /** A flag for opening a file and requesting synchronous data writes.
421  * This is only used when writing a meta page. It's not strictly needed;
422  * we could just do a normal write and then immediately perform a flush.
423  * But if this flag is available it saves us an extra system call.
424  *
425  * @note If O_DSYNC is undefined but exists in /usr/include,
426  * preferably set some compiler flag to get the definition.
427  */
428 #ifndef MDB_DSYNC
429 # ifdef O_DSYNC
430 # define MDB_DSYNC O_DSYNC
431 # else
432 # define MDB_DSYNC O_SYNC
433 # endif
434 #endif
435 #endif
436 
437 /** Function for flushing the data of a file. Define this to fsync
438  * if fdatasync() is not supported.
439  */
440 #ifndef MDB_FDATASYNC
441 # define MDB_FDATASYNC fdatasync
442 #endif
443 
444 #ifndef MDB_MSYNC
445 # define MDB_MSYNC(addr,len,flags) msync(addr,len,flags)
446 #endif
447 
448 #ifndef MS_SYNC
449 #define MS_SYNC 1
450 #endif
451 
452 #ifndef MS_ASYNC
453 #define MS_ASYNC 0
454 #endif
455 
456  /** A page number in the database.
457  * Note that 64 bit page numbers are overkill, since pages themselves
458  * already represent 12-13 bits of addressable memory, and the OS will
459  * always limit applications to a maximum of 63 bits of address space.
460  *
461  * @note In the #MDB_node structure, we only store 48 bits of this value,
462  * which thus limits us to only 60 bits of addressable data.
463  */
464 typedef MDB_ID pgno_t;
465 
466  /** A transaction ID.
467  * See struct MDB_txn.mt_txnid for details.
468  */
469 typedef MDB_ID txnid_t;
470 
471 /** @defgroup debug Debug Macros
472  * @{
473  */
474 #ifndef MDB_DEBUG
475  /** Enable debug output. Needs variable argument macros (a C99 feature).
476  * Set this to 1 for copious tracing. Set to 2 to add dumps of all IDLs
477  * read from and written to the database (used for free space management).
478  */
479 #define MDB_DEBUG 0
480 #endif
481 
482 #if MDB_DEBUG
483 static int mdb_debug;
484 static txnid_t mdb_debug_start;
485 
486  /** Print a debug message with printf formatting.
487  * Requires double parenthesis around 2 or more args.
488  */
489 # define DPRINTF(args) ((void) ((mdb_debug) && DPRINTF0 args))
490 # define DPRINTF0(fmt, ...) \
491  fprintf(stderr, "%s:%d " fmt "\n", mdb_func_, __LINE__, __VA_ARGS__)
492 #else
493 # define DPRINTF(args) ((void) 0)
494 #endif
495  /** Print a debug string.
496  * The string is printed literally, with no format processing.
497  */
498 #define DPUTS(arg) DPRINTF(("%s", arg))
499  /** Debugging output value of a cursor DBI: Negative in a sub-cursor. */
500 #define DDBI(mc) \
501  (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi)
502 /** @} */
503 
504  /** @brief The maximum size of a database page.
505  *
506  * It is 32k or 64k, since value-PAGEBASE must fit in
507  * #MDB_page.%mp_upper.
508  *
509  * LMDB will use database pages < OS pages if needed.
510  * That causes more I/O in write transactions: The OS must
511  * know (read) the whole page before writing a partial page.
512  *
513  * Note that we don't currently support Huge pages. On Linux,
514  * regular data files cannot use Huge pages, and in general
515  * Huge pages aren't actually pageable. We rely on the OS
516  * demand-pager to read our data and page it out when memory
517  * pressure from other processes is high. So until OSs have
518  * actual paging support for Huge pages, they're not viable.
519  */
520 #define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000)
521 
522  /** The minimum number of keys required in a database page.
523  * Setting this to a larger value will place a smaller bound on the
524  * maximum size of a data item. Data items larger than this size will
525  * be pushed into overflow pages instead of being stored directly in
526  * the B-tree node. This value used to default to 4. With a page size
527  * of 4096 bytes that meant that any item larger than 1024 bytes would
528  * go into an overflow page. That also meant that on average 2-3KB of
529  * each overflow page was wasted space. The value cannot be lower than
530  * 2 because then there would no longer be a tree structure. With this
531  * value, items larger than 2KB will go into overflow pages, and on
532  * average only 1KB will be wasted.
533  */
534 #define MDB_MINKEYS 2
535 
536  /** A stamp that identifies a file as an LMDB file.
537  * There's nothing special about this value other than that it is easily
538  * recognizable, and it will reflect any byte order mismatches.
539  */
540 #define MDB_MAGIC 0xBEEFC0DE
541 
542  /** The version number for a database's datafile format. */
543 #define MDB_DATA_VERSION ((MDB_DEVEL) ? 999 : 1)
544  /** The version number for a database's lockfile format. */
545 #define MDB_LOCK_VERSION 1
546 
547  /** @brief The max size of a key we can write, or 0 for computed max.
548  *
549  * This macro should normally be left alone or set to 0.
550  * Note that a database with big keys or dupsort data cannot be
551  * reliably modified by a liblmdb which uses a smaller max.
552  * The default is 511 for backwards compat, or 0 when #MDB_DEVEL.
553  *
554  * Other values are allowed, for backwards compat. However:
555  * A value bigger than the computed max can break if you do not
556  * know what you are doing, and liblmdb <= 0.9.10 can break when
557  * modifying a DB with keys/dupsort data bigger than its max.
558  *
559  * Data items in an #MDB_DUPSORT database are also limited to
560  * this size, since they're actually keys of a sub-DB. Keys and
561  * #MDB_DUPSORT data items must fit on a node in a regular page.
562  */
563 #ifndef MDB_MAXKEYSIZE
564 #define MDB_MAXKEYSIZE ((MDB_DEVEL) ? 0 : 511)
565 #endif
566 
567  /** The maximum size of a key we can write to the environment. */
568 #if MDB_MAXKEYSIZE
569 #define ENV_MAXKEY(env) (MDB_MAXKEYSIZE)
570 #else
571 #define ENV_MAXKEY(env) ((env)->me_maxkey)
572 #endif
573 
574  /** @brief The maximum size of a data item.
575  *
576  * We only store a 32 bit value for node sizes.
577  */
578 #define MAXDATASIZE 0xffffffffUL
579 
580 #if MDB_DEBUG
581  /** Key size which fits in a #DKBUF.
582  * @ingroup debug
583  */
584 #define DKBUF_MAXKEYSIZE ((MDB_MAXKEYSIZE) > 0 ? (MDB_MAXKEYSIZE) : 511)
585  /** A key buffer.
586  * @ingroup debug
587  * This is used for printing a hex dump of a key's contents.
588  */
589 #define DKBUF char kbuf[DKBUF_MAXKEYSIZE*2+1]
590  /** Display a key in hex.
591  * @ingroup debug
592  * Invoke a function to display a key in hex.
593  */
594 #define DKEY(x) mdb_dkey(x, kbuf)
595 #else
596 #define DKBUF
597 #define DKEY(x) 0
598 #endif
599 
600  /** An invalid page number.
601  * Mainly used to denote an empty tree.
602  */
603 #define P_INVALID (~(pgno_t)0)
604 
605  /** Test if the flags \b f are set in a flag word \b w. */
606 #define F_ISSET(w, f) (((w) & (f)) == (f))
607 
608  /** Round \b n up to an even number. */
609 #define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */
610 
611  /** Used for offsets within a single page.
612  * Since memory pages are typically 4 or 8KB in size, 12-13 bits,
613  * this is plenty.
614  */
615 typedef uint16_t indx_t;
616 
617  /** Default size of memory map.
618  * This is certainly too small for any actual applications. Apps should always set
619  * the size explicitly using #mdb_env_set_mapsize().
620  */
621 #define DEFAULT_MAPSIZE 1048576
622 
623 /** @defgroup readers Reader Lock Table
624  * Readers don't acquire any locks for their data access. Instead, they
625  * simply record their transaction ID in the reader table. The reader
626  * mutex is needed just to find an empty slot in the reader table. The
627  * slot's address is saved in thread-specific data so that subsequent read
628  * transactions started by the same thread need no further locking to proceed.
629  *
630  * If #MDB_NOTLS is set, the slot address is not saved in thread-specific data.
631  *
632  * No reader table is used if the database is on a read-only filesystem, or
633  * if #MDB_NOLOCK is set.
634  *
635  * Since the database uses multi-version concurrency control, readers don't
636  * actually need any locking. This table is used to keep track of which
637  * readers are using data from which old transactions, so that we'll know
638  * when a particular old transaction is no longer in use. Old transactions
639  * that have discarded any data pages can then have those pages reclaimed
640  * for use by a later write transaction.
641  *
642  * The lock table is constructed such that reader slots are aligned with the
643  * processor's cache line size. Any slot is only ever used by one thread.
644  * This alignment guarantees that there will be no contention or cache
645  * thrashing as threads update their own slot info, and also eliminates
646  * any need for locking when accessing a slot.
647  *
648  * A writer thread will scan every slot in the table to determine the oldest
649  * outstanding reader transaction. Any freed pages older than this will be
650  * reclaimed by the writer. The writer doesn't use any locks when scanning
651  * this table. This means that there's no guarantee that the writer will
652  * see the most up-to-date reader info, but that's not required for correct
653  * operation - all we need is to know the upper bound on the oldest reader,
654  * we don't care at all about the newest reader. So the only consequence of
655  * reading stale information here is that old pages might hang around a
656  * while longer before being reclaimed. That's actually good anyway, because
657  * the longer we delay reclaiming old pages, the more likely it is that a
658  * string of contiguous pages can be found after coalescing old pages from
659  * many old transactions together.
660  * @{
661  */
662  /** Number of slots in the reader table.
663  * This value was chosen somewhat arbitrarily. 126 readers plus a
664  * couple mutexes fit exactly into 8KB on my development machine.
665  * Applications should set the table size using #mdb_env_set_maxreaders().
666  */
667 #define DEFAULT_READERS 126
668 
669  /** The size of a CPU cache line in bytes. We want our lock structures
670  * aligned to this size to avoid false cache line sharing in the
671  * lock table.
672  * This value works for most CPUs. For Itanium this should be 128.
673  */
674 #ifndef CACHELINE
675 #define CACHELINE 64
676 #endif
677 
678  /** The information we store in a single slot of the reader table.
679  * In addition to a transaction ID, we also record the process and
680  * thread ID that owns a slot, so that we can detect stale information,
681  * e.g. threads or processes that went away without cleaning up.
682  * @note We currently don't check for stale records. We simply re-init
683  * the table when we know that we're the only process opening the
684  * lock file.
685  */
686 typedef struct MDB_rxbody {
687  /** Current Transaction ID when this transaction began, or (txnid_t)-1.
688  * Multiple readers that start at the same time will probably have the
689  * same ID here. Again, it's not important to exclude them from
690  * anything; all we need to know is which version of the DB they
691  * started from so we can avoid overwriting any data used in that
692  * particular version.
693  */
694  volatile txnid_t mrb_txnid;
695  /** The process ID of the process owning this reader txn. */
696  volatile MDB_PID_T mrb_pid;
697  /** The thread ID of the thread owning this txn. */
698  volatile MDB_THR_T mrb_tid;
700 
701  /** The actual reader record, with cacheline padding. */
702 typedef struct MDB_reader {
703  union {
705  /** shorthand for mrb_txnid */
706 #define mr_txnid mru.mrx.mrb_txnid
707 #define mr_pid mru.mrx.mrb_pid
708 #define mr_tid mru.mrx.mrb_tid
709  /** cache line alignment */
710  char pad[(sizeof(MDB_rxbody)+CACHELINE-1) & ~(CACHELINE-1)];
711  } mru;
713 
714  /** The header for the reader table.
715  * The table resides in a memory-mapped file. (This is a different file
716  * than is used for the main database.)
717  *
718  * For POSIX the actual mutexes reside in the shared memory of this
719  * mapped file. On Windows, mutexes are named objects allocated by the
720  * kernel; we store the mutex names in this mapped file so that other
721  * processes can grab them. This same approach is also used on
722  * MacOSX/Darwin (using named semaphores) since MacOSX doesn't support
723  * process-shared POSIX mutexes. For these cases where a named object
724  * is used, the object name is derived from a 64 bit FNV hash of the
725  * environment pathname. As such, naming collisions are extremely
726  * unlikely. If a collision occurs, the results are unpredictable.
727  */
728 typedef struct MDB_txbody {
729  /** Stamp identifying this as an LMDB file. It must be set
730  * to #MDB_MAGIC. */
732  /** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. */
734 #if defined(_WIN32) || defined(MDB_USE_POSIX_SEM)
735  char mtb_rmname[MNAME_LEN];
736 #else
737  /** Mutex protecting access to this table.
738  * This is the reader table lock used with LOCK_MUTEX().
739  */
741 #endif
742  /** The ID of the last transaction committed to the database.
743  * This is recorded here only for convenience; the value can always
744  * be determined by reading the main database meta pages.
745  */
746  volatile txnid_t mtb_txnid;
747  /** The number of slots that have been used in the reader table.
748  * This always records the maximum count, it is not decremented
749  * when readers release their slots.
750  */
751  volatile unsigned mtb_numreaders;
753 
754  /** The actual reader table definition. */
755 typedef struct MDB_txninfo {
756  union {
758 #define mti_magic mt1.mtb.mtb_magic
759 #define mti_format mt1.mtb.mtb_format
760 #define mti_rmutex mt1.mtb.mtb_rmutex
761 #define mti_rmname mt1.mtb.mtb_rmname
762 #define mti_txnid mt1.mtb.mtb_txnid
763 #define mti_numreaders mt1.mtb.mtb_numreaders
764  char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)];
765  } mt1;
766  union {
767 #if defined(_WIN32) || defined(MDB_USE_POSIX_SEM)
768  char mt2_wmname[MNAME_LEN];
769 #define mti_wmname mt2.mt2_wmname
770 #else
772 #define mti_wmutex mt2.mt2_wmutex
773 #endif
774  char pad[(MNAME_LEN+CACHELINE-1) & ~(CACHELINE-1)];
775  } mt2;
778 
779  /** Lockfile format signature: version, features and field layout */
780 #define MDB_LOCK_FORMAT \
781  ((uint32_t) \
782  ((MDB_LOCK_VERSION) \
783  /* Flags which describe functionality */ \
784  + (((MDB_PIDLOCK) != 0) << 16)))
785 /** @} */
786 
787 /** Common header for all page types. The page type depends on #mp_flags.
788  *
789  * #P_BRANCH and #P_LEAF pages have unsorted '#MDB_node's at the end, with
790  * sorted #mp_ptrs[] entries referring to them. Exception: #P_LEAF2 pages
791  * omit mp_ptrs and pack sorted #MDB_DUPFIXED values after the page header.
792  *
793  * #P_OVERFLOW records occupy one or more contiguous pages where only the
794  * first has a page header. They hold the real data of #F_BIGDATA nodes.
795  *
796  * #P_SUBP sub-pages are small leaf "pages" with duplicate data.
797  * A node with flag #F_DUPDATA but not #F_SUBDATA contains a sub-page.
798  * (Duplicate data can also go in sub-databases, which use normal pages.)
799  *
800  * #P_META pages contain #MDB_meta, the start point of an LMDB snapshot.
801  *
802  * Each non-metapage up to #MDB_meta.%mm_last_pg is reachable exactly once
803  * in the snapshot: Either used by a database or listed in a freeDB record.
804  */
805 typedef struct MDB_page {
806 #define mp_pgno mp_p.p_pgno
807 #define mp_next mp_p.p_next
808  union {
809  pgno_t p_pgno; /**< page number */
810  struct MDB_page *p_next; /**< for in-memory list of freed pages */
811  } mp_p;
812  uint16_t mp_pad; /**< key size if this is a LEAF2 page */
813 /** @defgroup mdb_page Page Flags
814  * @ingroup internal
815  * Flags for the page headers.
816  * @{
817  */
818 #define P_BRANCH 0x01 /**< branch page */
819 #define P_LEAF 0x02 /**< leaf page */
820 #define P_OVERFLOW 0x04 /**< overflow page */
821 #define P_META 0x08 /**< meta page */
822 #define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */
823 #define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */
824 #define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */
825 #define P_LOOSE 0x4000 /**< page was dirtied then freed, can be reused */
826 #define P_KEEP 0x8000 /**< leave this page alone during spill */
827 /** @} */
828  uint16_t mp_flags; /**< @ref mdb_page */
829 #define mp_lower mp_pb.pb.pb_lower
830 #define mp_upper mp_pb.pb.pb_upper
831 #define mp_pages mp_pb.pb_pages
832  union {
833  struct {
834  indx_t pb_lower; /**< lower bound of free space */
835  indx_t pb_upper; /**< upper bound of free space */
836  } pb;
837  uint32_t pb_pages; /**< number of overflow pages */
838  } mp_pb;
839  indx_t mp_ptrs[1]; /**< dynamic size */
841 
842  /** Size of the page header, excluding dynamic data at the end */
843 #define PAGEHDRSZ ((unsigned) offsetof(MDB_page, mp_ptrs))
844 
845  /** Address of first usable data byte in a page, after the header */
846 #define METADATA(p) ((void *)((char *)(p) + PAGEHDRSZ))
847 
848  /** ITS#7713, change PAGEBASE to handle 65536 byte pages */
849 #define PAGEBASE ((MDB_DEVEL) ? PAGEHDRSZ : 0)
850 
851  /** Number of nodes on a page */
852 #define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ-PAGEBASE)) >> 1)
853 
854  /** The amount of space remaining in the page */
855 #define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower)
856 
857  /** The percentage of space used in the page, in tenths of a percent. */
858 #define PAGEFILL(env, p) (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \
859  ((env)->me_psize - PAGEHDRSZ))
860  /** The minimum page fill factor, in tenths of a percent.
861  * Pages emptier than this are candidates for merging.
862  */
863 #define FILL_THRESHOLD 250
864 
865  /** Test if a page is a leaf page */
866 #define IS_LEAF(p) F_ISSET((p)->mp_flags, P_LEAF)
867  /** Test if a page is a LEAF2 page */
868 #define IS_LEAF2(p) F_ISSET((p)->mp_flags, P_LEAF2)
869  /** Test if a page is a branch page */
870 #define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH)
871  /** Test if a page is an overflow page */
872 #define IS_OVERFLOW(p) F_ISSET((p)->mp_flags, P_OVERFLOW)
873  /** Test if a page is a sub page */
874 #define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP)
875 
876  /** The number of overflow pages needed to store the given size. */
877 #define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1)
878 
879  /** Link in #MDB_txn.%mt_loose_pgs list.
880  * Kept outside the page header, which is needed when reusing the page.
881  */
882 #define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2))
883 
884  /** Header for a single key/data pair within a page.
885  * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2.
886  * We guarantee 2-byte alignment for 'MDB_node's.
887  *
888  * #mn_lo and #mn_hi are used for data size on leaf nodes, and for child
889  * pgno on branch nodes. On 64 bit platforms, #mn_flags is also used
890  * for pgno. (Branch nodes have no flags). Lo and hi are in host byte
891  * order in case some accesses can be optimized to 32-bit word access.
892  *
893  * Leaf node flags describe node contents. #F_BIGDATA says the node's
894  * data part is the page number of an overflow page with actual data.
895  * #F_DUPDATA and #F_SUBDATA can be combined giving duplicate data in
896  * a sub-page/sub-database, and named databases (just #F_SUBDATA).
897  */
898 typedef struct MDB_node {
899  /** part of data size or pgno
900  * @{ */
901 #if BYTE_ORDER == LITTLE_ENDIAN
902  unsigned short mn_lo, mn_hi;
903 #else
904  unsigned short mn_hi, mn_lo;
905 #endif
906  /** @} */
907 /** @defgroup mdb_node Node Flags
908  * @ingroup internal
909  * Flags for node headers.
910  * @{
911  */
912 #define F_BIGDATA 0x01 /**< data put on overflow page */
913 #define F_SUBDATA 0x02 /**< data is a sub-database */
914 #define F_DUPDATA 0x04 /**< data has duplicates */
915 
916 /** valid flags for #mdb_node_add() */
917 #define NODE_ADD_FLAGS (F_DUPDATA|F_SUBDATA|MDB_RESERVE|MDB_APPEND)
918 
919 /** @} */
920  unsigned short mn_flags; /**< @ref mdb_node */
921  unsigned short mn_ksize; /**< key size */
922  char mn_data[1]; /**< key and data are appended here */
924 
925  /** Size of the node header, excluding dynamic data at the end */
926 #define NODESIZE offsetof(MDB_node, mn_data)
927 
928  /** Bit position of top word in page number, for shifting mn_flags */
929 #define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0)
930 
931  /** Size of a node in a branch page with a given key.
932  * This is just the node header plus the key, there is no data.
933  */
934 #define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size))
935 
936  /** Size of a node in a leaf page with a given key and data.
937  * This is node header plus key plus data size.
938  */
939 #define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size)
940 
941  /** Address of node \b i in page \b p */
942 #define NODEPTR(p, i) ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE))
943 
944  /** Address of the key for the node */
945 #define NODEKEY(node) (void *)((node)->mn_data)
946 
947  /** Address of the data for a node */
948 #define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize)
949 
950  /** Get the page number pointed to by a branch node */
951 #define NODEPGNO(node) \
952  ((node)->mn_lo | ((pgno_t) (node)->mn_hi << 16) | \
953  (PGNO_TOPWORD ? ((pgno_t) (node)->mn_flags << PGNO_TOPWORD) : 0))
954  /** Set the page number in a branch node */
955 #define SETPGNO(node,pgno) do { \
956  (node)->mn_lo = (pgno) & 0xffff; (node)->mn_hi = (pgno) >> 16; \
957  if (PGNO_TOPWORD) (node)->mn_flags = (pgno) >> PGNO_TOPWORD; } while(0)
958 
959  /** Get the size of the data in a leaf node */
960 #define NODEDSZ(node) ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16))
961  /** Set the size of the data for a leaf node */
962 #define SETDSZ(node,size) do { \
963  (node)->mn_lo = (size) & 0xffff; (node)->mn_hi = (size) >> 16;} while(0)
964  /** The size of a key in a node */
965 #define NODEKSZ(node) ((node)->mn_ksize)
966 
967  /** Copy a page number from src to dst */
968 #ifdef MISALIGNED_OK
969 #define COPY_PGNO(dst,src) dst = src
970 #else
971 #if SIZE_MAX > 4294967295UL
972 #define COPY_PGNO(dst,src) do { \
973  unsigned short *s, *d; \
974  s = (unsigned short *)&(src); \
975  d = (unsigned short *)&(dst); \
976  *d++ = *s++; \
977  *d++ = *s++; \
978  *d++ = *s++; \
979  *d = *s; \
980 } while (0)
981 #else
982 #define COPY_PGNO(dst,src) do { \
983  unsigned short *s, *d; \
984  s = (unsigned short *)&(src); \
985  d = (unsigned short *)&(dst); \
986  *d++ = *s++; \
987  *d = *s; \
988 } while (0)
989 #endif
990 #endif
991  /** The address of a key in a LEAF2 page.
992  * LEAF2 pages are used for #MDB_DUPFIXED sorted-duplicate sub-DBs.
993  * There are no node headers, keys are stored contiguously.
994  */
995 #define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i)*(ks)))
996 
997  /** Set the \b node's key into \b keyptr, if requested. */
998 #define MDB_GET_KEY(node, keyptr) { if ((keyptr) != NULL) { \
999  (keyptr)->mv_size = NODEKSZ(node); (keyptr)->mv_data = NODEKEY(node); } }
1000 
1001  /** Set the \b node's key into \b key. */
1002 #define MDB_GET_KEY2(node, key) { key.mv_size = NODEKSZ(node); key.mv_data = NODEKEY(node); }
1003 
1004  /** Information about a single database in the environment. */
1005 typedef struct MDB_db {
1006  uint32_t md_pad; /**< also ksize for LEAF2 pages */
1007  uint16_t md_flags; /**< @ref mdb_dbi_open */
1008  uint16_t md_depth; /**< depth of this tree */
1009  pgno_t md_branch_pages; /**< number of internal pages */
1010  pgno_t md_leaf_pages; /**< number of leaf pages */
1011  pgno_t md_overflow_pages; /**< number of overflow pages */
1012  size_t md_entries; /**< number of data items */
1013  pgno_t md_root; /**< the root page of this tree */
1015 
1016 #define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */
1017 #define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID))
1018  /** #mdb_dbi_open() flags */
1019 #define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\
1020  MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE)
1021 
1022  /** Handle for the DB used to track free pages. */
1023 #define FREE_DBI 0
1024  /** Handle for the default DB. */
1025 #define MAIN_DBI 1
1026  /** Number of DBs in metapage (free and main) - also hardcoded elsewhere */
1027 #define CORE_DBS 2
1028 
1029  /** Number of meta pages - also hardcoded elsewhere */
1030 #define NUM_METAS 2
1031 
1032  /** Meta page content.
1033  * A meta page is the start point for accessing a database snapshot.
1034  * Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2).
1035  */
1036 typedef struct MDB_meta {
1037  /** Stamp identifying this as an LMDB file. It must be set
1038  * to #MDB_MAGIC. */
1040  /** Version number of this file. Must be set to #MDB_DATA_VERSION. */
1042  void *mm_address; /**< address for fixed mapping */
1043  size_t mm_mapsize; /**< size of mmap region */
1044  MDB_db mm_dbs[CORE_DBS]; /**< first is free space, 2nd is main db */
1045  /** The size of pages used in this DB */
1046 #define mm_psize mm_dbs[FREE_DBI].md_pad
1047  /** Any persistent environment flags. @ref mdb_env */
1048 #define mm_flags mm_dbs[FREE_DBI].md_flags
1049  /** Last used page in the datafile.
1050  * Actually the file may be shorter if the freeDB lists the final pages.
1051  */
1053  volatile txnid_t mm_txnid; /**< txnid that committed this page */
1055 
1056  /** Buffer for a stack-allocated meta page.
1057  * The members define size and alignment, and silence type
1058  * aliasing warnings. They are not used directly; that could
1059  * mean incorrectly using several union members in parallel.
1060  */
1061 typedef union MDB_metabuf {
1063  struct {
1068 
1069  /** Auxiliary DB info.
1070  * The information here is mostly static/read-only. There is
1071  * only a single copy of this record in the environment.
1072  */
1073 typedef struct MDB_dbx {
1074  MDB_val md_name; /**< name of the database */
1075  MDB_cmp_func *md_cmp; /**< function for comparing keys */
1076  MDB_cmp_func *md_dcmp; /**< function for comparing data items */
1077  MDB_rel_func *md_rel; /**< user relocate function */
1078  void *md_relctx; /**< user-provided context for md_rel */
1080 
1081  /** A database transaction.
1082  * Every operation requires a transaction handle.
1083  */
1084 struct MDB_txn {
1085  MDB_txn *mt_parent; /**< parent of a nested txn */
1086  /** Nested txn under this txn, set together with flag #MDB_TXN_HAS_CHILD */
1088  pgno_t mt_next_pgno; /**< next unallocated page */
1089  /** The ID of this transaction. IDs are integers incrementing from 1.
1090  * Only committed write transactions increment the ID. If a transaction
1091  * aborts, the ID may be re-used by the next writer.
1092  */
1094  MDB_env *mt_env; /**< the DB environment */
1095  /** The list of pages that became unused during this transaction.
1096  */
1098  /** The list of loose pages that became unused and may be reused
1099  * in this transaction, linked through #NEXT_LOOSE_PAGE(page).
1100  */
1102  /** Number of loose pages (#mt_loose_pgs) */
1104  /** The sorted list of dirty pages we temporarily wrote to disk
1105  * because the dirty list was full. page numbers in here are
1106  * shifted left by 1, deleted slots have the LSB set.
1107  */
1109  union {
1110  /** For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */
1112  /** For read txns: This thread/txn's reader table slot, or NULL. */
1114  } mt_u;
1115  /** Array of records for each DB known in the environment. */
1117  /** Array of MDB_db records for each known DB */
1119  /** Array of sequence numbers for each DB handle */
1120  unsigned int *mt_dbiseqs;
1121 /** @defgroup mt_dbflag Transaction DB Flags
1122  * @ingroup internal
1123  * @{
1124  */
1125 #define DB_DIRTY 0x01 /**< DB was written in this txn */
1126 #define DB_STALE 0x02 /**< Named-DB record is older than txnID */
1127 #define DB_NEW 0x04 /**< Named-DB handle opened in this txn */
1128 #define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */
1129 #define DB_USRVALID 0x10 /**< As #DB_VALID, but not set for #FREE_DBI */
1130 #define DB_DUPDATA 0x20 /**< DB is #MDB_DUPSORT data */
1131 /** @} */
1132  /** In write txns, array of cursors for each DB */
1134  /** Array of flags for each DB */
1135  unsigned char *mt_dbflags;
1136  /** Number of DB records in use, or 0 when the txn is finished.
1137  * This number only ever increments until the txn finishes; we
1138  * don't decrement it when individual DB handles are closed.
1139  */
1141 
1142 /** @defgroup mdb_txn Transaction Flags
1143  * @ingroup internal
1144  * @{
1145  */
1146  /** #mdb_txn_begin() flags */
1147 #define MDB_TXN_BEGIN_FLAGS MDB_RDONLY
1148 #define MDB_TXN_RDONLY MDB_RDONLY /**< read-only transaction */
1149  /* internal txn flags */
1150 #define MDB_TXN_WRITEMAP MDB_WRITEMAP /**< copy of #MDB_env flag in writers */
1151 #define MDB_TXN_FINISHED 0x01 /**< txn is finished or never began */
1152 #define MDB_TXN_ERROR 0x02 /**< txn is unusable after an error */
1153 #define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */
1154 #define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */
1155 #define MDB_TXN_HAS_CHILD 0x10 /**< txn has an #MDB_txn.%mt_child */
1156  /** most operations on the txn are currently illegal */
1157 #define MDB_TXN_BLOCKED (MDB_TXN_FINISHED|MDB_TXN_ERROR|MDB_TXN_HAS_CHILD)
1158 /** @} */
1159  unsigned int mt_flags; /**< @ref mdb_txn */
1160  /** #dirty_list room: Array size - \#dirty pages visible to this txn.
1161  * Includes ancestor txns' dirty pages not hidden by other txns'
1162  * dirty/spilled pages. Thus commit(nested txn) has room to merge
1163  * dirty_list into mt_parent after freeing hidden mt_parent pages.
1164  */
1165  unsigned int mt_dirty_room;
1166 };
1167 
1168 /** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty.
1169  * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to
1170  * raise this on a 64 bit machine.
1171  */
1172 #define CURSOR_STACK 32
1173 
1174 struct MDB_xcursor;
1175 
1176  /** Cursors are used for all DB operations.
1177  * A cursor holds a path of (page pointer, key index) from the DB
1178  * root to a position in the DB, plus other state. #MDB_DUPSORT
1179  * cursors include an xcursor to the current data item. Write txns
1180  * track their cursors and keep them up to date when data moves.
1181  * Exception: An xcursor's pointer to a #P_SUBP page can be stale.
1182  * (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage).
1183  */
1184 struct MDB_cursor {
1185  /** Next cursor on this DB in this txn */
1187  /** Backup of the original cursor if this cursor is a shadow */
1189  /** Context used for databases with #MDB_DUPSORT, otherwise NULL */
1191  /** The transaction that owns this cursor */
1193  /** The database handle this cursor operates on */
1195  /** The database record for this cursor */
1197  /** The database auxiliary record for this cursor */
1199  /** The @ref mt_dbflag for this database */
1200  unsigned char *mc_dbflag;
1201  unsigned short mc_snum; /**< number of pushed pages */
1202  unsigned short mc_top; /**< index of top page, normally mc_snum-1 */
1203 /** @defgroup mdb_cursor Cursor Flags
1204  * @ingroup internal
1205  * Cursor state flags.
1206  * @{
1207  */
1208 #define C_INITIALIZED 0x01 /**< cursor has been initialized and is valid */
1209 #define C_EOF 0x02 /**< No more data */
1210 #define C_SUB 0x04 /**< Cursor is a sub-cursor */
1211 #define C_DEL 0x08 /**< last op was a cursor_del */
1212 #define C_UNTRACK 0x40 /**< Un-track cursor when closing */
1213 /** @} */
1214  unsigned int mc_flags; /**< @ref mdb_cursor */
1215  MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */
1216  indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */
1217 };
1218 
1219  /** Context for sorted-dup records.
1220  * We could have gone to a fully recursive design, with arbitrarily
1221  * deep nesting of sub-databases. But for now we only handle these
1222  * levels - main DB, optional sub-DB, sorted-duplicate DB.
1223  */
1224 typedef struct MDB_xcursor {
1225  /** A sub-cursor for traversing the Dup DB */
1227  /** The database record for this Dup DB */
1229  /** The auxiliary DB record for this Dup DB */
1231  /** The @ref mt_dbflag for this Dup DB */
1232  unsigned char mx_dbflag;
1234 
1235  /** Check if there is an inited xcursor */
1236 #define XCURSOR_INITED(mc) \
1237  ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))
1238 
1239  /** Update the xcursor's sub-page pointer, if any, in \b mc. Needed
1240  * when the node which contains the sub-page may have moved. Called
1241  * with leaf page \b mp = mc->mc_pg[\b top].
1242  */
1243 #define XCURSOR_REFRESH(mc, top, mp) do { \
1244  MDB_page *xr_pg = (mp); \
1245  MDB_node *xr_node; \
1246  if (!XCURSOR_INITED(mc) || (mc)->mc_ki[top] >= NUMKEYS(xr_pg)) break; \
1247  xr_node = NODEPTR(xr_pg, (mc)->mc_ki[top]); \
1248  if ((xr_node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) \
1249  (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \
1250 } while (0)
1251 
1252  /** State of FreeDB old pages, stored in the MDB_env */
1253 typedef struct MDB_pgstate {
1254  pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */
1255  txnid_t mf_pglast; /**< ID of last used record, or 0 if !mf_pghead */
1257 
1258  /** The database environment. */
1259 struct MDB_env {
1260  HANDLE me_fd; /**< The main data file */
1261  HANDLE me_lfd; /**< The lock file */
1262  HANDLE me_mfd; /**< For writing and syncing the meta pages */
1263  /** Failed to update the meta page. Probably an I/O error. */
1264 #define MDB_FATAL_ERROR 0x80000000U
1265  /** Some fields are initialized. */
1266 #define MDB_ENV_ACTIVE 0x20000000U
1267  /** me_txkey is set */
1268 #define MDB_ENV_TXKEY 0x10000000U
1269  /** fdatasync is unreliable */
1270 #define MDB_FSYNCONLY 0x08000000U
1271  uint32_t me_flags; /**< @ref mdb_env */
1272  unsigned int me_psize; /**< DB page size, inited from me_os_psize */
1273  unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */
1274  unsigned int me_maxreaders; /**< size of the reader table */
1275  /** Max #MDB_txninfo.%mti_numreaders of interest to #mdb_env_close() */
1276  volatile int me_close_readers;
1277  MDB_dbi me_numdbs; /**< number of DBs opened */
1278  MDB_dbi me_maxdbs; /**< size of the DB table */
1279  MDB_PID_T me_pid; /**< process ID of this env */
1280  char *me_path; /**< path to the DB files */
1281  char *me_map; /**< the memory map of the data file */
1282  MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */
1283  MDB_meta *me_metas[NUM_METAS]; /**< pointers to the two meta pages */
1284  void *me_pbuf; /**< scratch area for DUPSORT put() */
1285  MDB_txn *me_txn; /**< current write transaction */
1286  MDB_txn *me_txn0; /**< prealloc'd write transaction */
1287  size_t me_mapsize; /**< size of the data memory map */
1288  off_t me_size; /**< current file size */
1289  pgno_t me_maxpg; /**< me_mapsize / me_psize */
1290  MDB_dbx *me_dbxs; /**< array of static DB info */
1291  uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */
1292  unsigned int *me_dbiseqs; /**< array of dbi sequence numbers */
1293  pthread_key_t me_txkey; /**< thread-key for readers */
1294  txnid_t me_pgoldest; /**< ID of oldest reader last time we looked */
1295  MDB_pgstate me_pgstate; /**< state of old pages from freeDB */
1296 # define me_pglast me_pgstate.mf_pglast
1297 # define me_pghead me_pgstate.mf_pghead
1298  MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */
1299  /** IDL of pages that became unused in a write txn */
1301  /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */
1303  /** Max number of freelist items that can fit in a single overflow page */
1305  /** Max size of a node on a page */
1306  unsigned int me_nodemax;
1307 #if !(MDB_MAXKEYSIZE)
1308  unsigned int me_maxkey; /**< max size of a key */
1309 #endif
1310  int me_live_reader; /**< have liveness lock in reader table */
1311 #ifdef _WIN32
1312  int me_pidquery; /**< Used in OpenProcess */
1313 #endif
1314 #ifdef MDB_USE_POSIX_MUTEX /* Posix mutexes reside in shared mem */
1315 # define me_rmutex me_txns->mti_rmutex /**< Shared reader lock */
1316 # define me_wmutex me_txns->mti_wmutex /**< Shared writer lock */
1317 #else
1320 #endif
1321  void *me_userctx; /**< User-settable context */
1322  MDB_assert_func *me_assert_func; /**< Callback for assertion failures */
1323 };
1324 
1325  /** Nested transaction */
1326 typedef struct MDB_ntxn {
1327  MDB_txn mnt_txn; /**< the transaction */
1328  MDB_pgstate mnt_pgstate; /**< parent transaction's saved freestate */
1330 
1331  /** max number of pages to commit in one writev() call */
1332 #define MDB_COMMIT_PAGES 64
1333 #if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES
1334 #undef MDB_COMMIT_PAGES
1335 #define MDB_COMMIT_PAGES IOV_MAX
1336 #endif
1337 
1338  /** max bytes to write in one call */
1339 #define MAX_WRITE (0x40000000U >> (sizeof(ssize_t) == 4))
1340 
1341  /** Check \b txn and \b dbi arguments to a function */
1342 #define TXN_DBI_EXIST(txn, dbi, validity) \
1343  ((txn) && (dbi)<(txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & (validity)))
1344 
1345  /** Check for misused \b dbi handles */
1346 #define TXN_DBI_CHANGED(txn, dbi) \
1347  ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi])
1348 
1349 static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp);
1350 static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp);
1351 static int mdb_page_touch(MDB_cursor *mc);
1352 
1353 #define MDB_END_NAMES {"committed", "empty-commit", "abort", "reset", \
1354  "reset-tmp", "fail-begin", "fail-beginchild"}
1355 enum {
1356  /* mdb_txn_end operation number, for logging */
1359 };
1360 #define MDB_END_OPMASK 0x0F /**< mask for #mdb_txn_end() operation number */
1361 #define MDB_END_UPDATE 0x10 /**< update env state (DBIs) */
1362 #define MDB_END_FREE 0x20 /**< free txn unless it is #MDB_env.%me_txn0 */
1363 #define MDB_END_SLOT MDB_NOTLS /**< release any reader slot if #MDB_NOTLS */
1364 static void mdb_txn_end(MDB_txn *txn, unsigned mode);
1365 
1366 static int mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **mp, int *lvl);
1367 static int mdb_page_search_root(MDB_cursor *mc,
1368  MDB_val *key, int modify);
1369 #define MDB_PS_MODIFY 1
1370 #define MDB_PS_ROOTONLY 2
1371 #define MDB_PS_FIRST 4
1372 #define MDB_PS_LAST 8
1373 static int mdb_page_search(MDB_cursor *mc,
1374  MDB_val *key, int flags);
1375 static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst);
1376 
1377 #define MDB_SPLIT_REPLACE MDB_APPENDDUP /**< newkey is not new */
1378 static int mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata,
1379  pgno_t newpgno, unsigned int nflags);
1380 
1381 static int mdb_env_read_header(MDB_env *env, MDB_meta *meta);
1382 static MDB_meta *mdb_env_pick_meta(const MDB_env *env);
1383 static int mdb_env_write_meta(MDB_txn *txn);
1384 #ifdef MDB_USE_POSIX_MUTEX /* Drop unused excl arg */
1385 # define mdb_env_close0(env, excl) mdb_env_close1(env)
1386 #endif
1387 static void mdb_env_close0(MDB_env *env, int excl);
1388 
1389 static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp);
1390 static int mdb_node_add(MDB_cursor *mc, indx_t indx,
1391  MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags);
1392 static void mdb_node_del(MDB_cursor *mc, int ksize);
1393 static void mdb_node_shrink(MDB_page *mp, indx_t indx);
1394 static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft);
1395 static int mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data);
1396 static size_t mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data);
1397 static size_t mdb_branch_size(MDB_env *env, MDB_val *key);
1398 
1399 static int mdb_rebalance(MDB_cursor *mc);
1400 static int mdb_update_key(MDB_cursor *mc, MDB_val *key);
1401 
1402 static void mdb_cursor_pop(MDB_cursor *mc);
1403 static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp);
1404 
1405 static int mdb_cursor_del0(MDB_cursor *mc);
1406 static int mdb_del0(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, unsigned flags);
1407 static int mdb_cursor_sibling(MDB_cursor *mc, int move_right);
1411  int *exactp);
1412 static int mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data);
1413 static int mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data);
1414 
1415 static void mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx);
1416 static void mdb_xcursor_init0(MDB_cursor *mc);
1417 static void mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node);
1418 static void mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int force);
1419 
1420 static int mdb_drop0(MDB_cursor *mc, int subs);
1421 static void mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi);
1422 static int mdb_reader_check0(MDB_env *env, int rlocked, int *dead);
1423 
1424 /** @cond */
1426 /** @endcond */
1427 
1428 /** Compare two items pointing at size_t's of unknown alignment. */
1429 #ifdef MISALIGNED_OK
1430 # define mdb_cmp_clong mdb_cmp_long
1431 #else
1432 # define mdb_cmp_clong mdb_cmp_cint
1433 #endif
1434 
1435 #ifdef _WIN32
1436 static SECURITY_DESCRIPTOR mdb_null_sd;
1437 static SECURITY_ATTRIBUTES mdb_all_sa;
1438 static int mdb_sec_inited;
1439 
1440 struct MDB_name;
1441 static int utf8_to_utf16(const char *src, struct MDB_name *dst, int xtra);
1442 #endif
1443 
1444 /** Return the library version info. */
1445 char * ESECT
1446 mdb_version(int *major, int *minor, int *patch)
1447 {
1448  if (major) *major = MDB_VERSION_MAJOR;
1449  if (minor) *minor = MDB_VERSION_MINOR;
1450  if (patch) *patch = MDB_VERSION_PATCH;
1451  return MDB_VERSION_STRING;
1452 }
1453 
/** Table of descriptions for LMDB @ref errors.
 * Indexed by (err - MDB_KEYEXIST) in mdb_strerror(); the entries must
 * stay in exactly the same order as the MDB_* error codes are defined,
 * from #MDB_KEYEXIST through #MDB_LAST_ERRCODE.
 */
static char *const mdb_errstr[] = {
	"MDB_KEYEXIST: Key/data pair already exists",
	"MDB_NOTFOUND: No matching key/data pair found",
	"MDB_PAGE_NOTFOUND: Requested page not found",
	"MDB_CORRUPTED: Located page was wrong type",
	"MDB_PANIC: Update of meta page failed or environment had fatal error",
	"MDB_VERSION_MISMATCH: Database environment version mismatch",
	"MDB_INVALID: File is not an LMDB file",
	"MDB_MAP_FULL: Environment mapsize limit reached",
	"MDB_DBS_FULL: Environment maxdbs limit reached",
	"MDB_READERS_FULL: Environment maxreaders limit reached",
	"MDB_TLS_FULL: Thread-local storage keys full - too many environments open",
	"MDB_TXN_FULL: Transaction has too many dirty pages - transaction too big",
	"MDB_CURSOR_FULL: Internal error - cursor stack limit reached",
	"MDB_PAGE_FULL: Internal error - page has no more space",
	"MDB_MAP_RESIZED: Database contents grew beyond environment mapsize",
	"MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed",
	"MDB_BAD_RSLOT: Invalid reuse of reader locktable slot",
	"MDB_BAD_TXN: Transaction must abort, has a child, or is invalid",
	"MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size",
	"MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly",
};
1477 
1478 char *
1480 {
1481 #ifdef _WIN32
1482  /** HACK: pad 4KB on stack over the buf. Return system msgs in buf.
1483  * This works as long as no function between the call to mdb_strerror
1484  * and the actual use of the message uses more than 4K of stack.
1485  */
1486 #define MSGSIZE 1024
1487 #define PADSIZE 4096
1488  char buf[MSGSIZE+PADSIZE], *ptr = buf;
1489 #endif
1490  int i;
1491  if (!err)
1492  return ("Successful return: 0");
1493 
1494  if (err >= MDB_KEYEXIST && err <= MDB_LAST_ERRCODE) {
1495  i = err - MDB_KEYEXIST;
1496  return mdb_errstr[i];
1497  }
1498 
1499 #ifdef _WIN32
1500  /* These are the C-runtime error codes we use. The comment indicates
1501  * their numeric value, and the Win32 error they would correspond to
1502  * if the error actually came from a Win32 API. A major mess, we should
1503  * have used LMDB-specific error codes for everything.
1504  */
1505  switch(err) {
1506  case ENOENT: /* 2, FILE_NOT_FOUND */
1507  case EIO: /* 5, ACCESS_DENIED */
1508  case ENOMEM: /* 12, INVALID_ACCESS */
1509  case EACCES: /* 13, INVALID_DATA */
1510  case EBUSY: /* 16, CURRENT_DIRECTORY */
1511  case EINVAL: /* 22, BAD_COMMAND */
1512  case ENOSPC: /* 28, OUT_OF_PAPER */
1513  return strerror(err);
1514  default:
1515  ;
1516  }
1517  buf[0] = 0;
1518  FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM |
1519  FORMAT_MESSAGE_IGNORE_INSERTS,
1520  NULL, err, 0, ptr, MSGSIZE, (va_list *)buf+MSGSIZE);
1521  return ptr;
1522 #else
1523  return strerror(err);
1524 #endif
1525 }
1526 
1527 /** assert(3) variant in cursor context */
1528 #define mdb_cassert(mc, expr) mdb_assert0((mc)->mc_txn->mt_env, expr, #expr)
1529 /** assert(3) variant in transaction context */
1530 #define mdb_tassert(txn, expr) mdb_assert0((txn)->mt_env, expr, #expr)
1531 /** assert(3) variant in environment context */
1532 #define mdb_eassert(env, expr) mdb_assert0(env, expr, #expr)
1533 
1534 #ifndef NDEBUG
1535 # define mdb_assert0(env, expr, expr_txt) ((expr) ? (void)0 : \
1536  mdb_assert_fail(env, expr_txt, mdb_func_, __FILE__, __LINE__))
1537 
1538 static void ESECT
1539 mdb_assert_fail(MDB_env *env, const char *expr_txt,
1540  const char *func, const char *file, int line)
1541 {
1542  char buf[400];
1543  sprintf(buf, "%.100s:%d: Assertion '%.200s' failed in %.40s()",
1544  file, line, expr_txt, func);
1545  if (env->me_assert_func)
1546  env->me_assert_func(env, buf);
1547  fprintf(stderr, "%s\n", buf);
1548  abort();
1549 }
1550 #else
1551 # define mdb_assert0(env, expr, expr_txt) ((void) 0)
1552 #endif /* NDEBUG */
1553 
1554 #if MDB_DEBUG
/** Return the page number of \b mp which may be sub-page, for debug output */
static pgno_t
mdb_dbg_pgno(MDB_page *mp)
{
	pgno_t ret;
	/* A sub-page's mp_pgno may not be naturally aligned, so copy it
	 * through COPY_PGNO, which reads 16 bits at a time unless
	 * MISALIGNED_OK is defined.
	 */
	COPY_PGNO(ret, mp->mp_pgno);
	return ret;
}
1563 
1564 /** Display a key in hexadecimal and return the address of the result.
1565  * @param[in] key the key to display
1566  * @param[in] buf the buffer to write into. Should always be #DKBUF.
1567  * @return The key in hexadecimal form.
1568  */
1569 char *
1570 mdb_dkey(MDB_val *key, char *buf)
1571 {
1572  char *ptr = buf;
1573  unsigned char *c = key->mv_data;
1574  unsigned int i;
1575 
1576  if (!key)
1577  return "";
1578 
1579  if (key->mv_size > DKBUF_MAXKEYSIZE)
1580  return "MDB_MAXKEYSIZE";
1581  /* may want to make this a dynamic check: if the key is mostly
1582  * printable characters, print it as-is instead of converting to hex.
1583  */
1584 #if 1
1585  buf[0] = '\0';
1586  for (i=0; i<key->mv_size; i++)
1587  ptr += sprintf(ptr, "%02x", *c++);
1588 #else
1589  sprintf(buf, "%.*s", key->mv_size, key->mv_data);
1590 #endif
1591  return buf;
1592 }
1593 
1594 static const char *
1595 mdb_leafnode_type(MDB_node *n)
1596 {
1597  static char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}};
1598  return F_ISSET(n->mn_flags, F_BIGDATA) ? ": overflow page" :
1599  tp[F_ISSET(n->mn_flags, F_DUPDATA)][F_ISSET(n->mn_flags, F_SUBDATA)];
1600 }
1601 
1602 /** Display all the keys in the page. */
1603 void
1604 mdb_page_list(MDB_page *mp)
1605 {
1606  pgno_t pgno = mdb_dbg_pgno(mp);
1607  const char *type, *state = (mp->mp_flags & P_DIRTY) ? ", dirty" : "";
1608  MDB_node *node;
1609  unsigned int i, nkeys, nsize, total = 0;
1610  MDB_val key;
1611  DKBUF;
1612 
1614  case P_BRANCH: type = "Branch page"; break;
1615  case P_LEAF: type = "Leaf page"; break;
1616  case P_LEAF|P_SUBP: type = "Sub-page"; break;
1617  case P_LEAF|P_LEAF2: type = "LEAF2 page"; break;
1618  case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page"; break;
1619  case P_OVERFLOW:
1620  fprintf(stderr, "Overflow page %"Z"u pages %u%s\n",
1621  pgno, mp->mp_pages, state);
1622  return;
1623  case P_META:
1624  fprintf(stderr, "Meta-page %"Z"u txnid %"Z"u\n",
1625  pgno, ((MDB_meta *)METADATA(mp))->mm_txnid);
1626  return;
1627  default:
1628  fprintf(stderr, "Bad page %"Z"u flags 0x%X\n", pgno, mp->mp_flags);
1629  return;
1630  }
1631 
1632  nkeys = NUMKEYS(mp);
1633  fprintf(stderr, "%s %"Z"u numkeys %d%s\n", type, pgno, nkeys, state);
1634 
1635  for (i=0; i<nkeys; i++) {
1636  if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */
1637  key.mv_size = nsize = mp->mp_pad;
1638  key.mv_data = LEAF2KEY(mp, i, nsize);
1639  total += nsize;
1640  fprintf(stderr, "key %d: nsize %d, %s\n", i, nsize, DKEY(&key));
1641  continue;
1642  }
1643  node = NODEPTR(mp, i);
1644  key.mv_size = node->mn_ksize;
1645  key.mv_data = node->mn_data;
1646  nsize = NODESIZE + key.mv_size;
1647  if (IS_BRANCH(mp)) {
1648  fprintf(stderr, "key %d: page %"Z"u, %s\n", i, NODEPGNO(node),
1649  DKEY(&key));
1650  total += nsize;
1651  } else {
1652  if (F_ISSET(node->mn_flags, F_BIGDATA))
1653  nsize += sizeof(pgno_t);
1654  else
1655  nsize += NODEDSZ(node);
1656  total += nsize;
1657  nsize += sizeof(indx_t);
1658  fprintf(stderr, "key %d: nsize %d, %s%s\n",
1659  i, nsize, DKEY(&key), mdb_leafnode_type(node));
1660  }
1661  total = EVEN(total);
1662  }
1663  fprintf(stderr, "Total: header %d + contents %d + unused %d\n",
1664  IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower, total, SIZELEFT(mp));
1665 }
1666 
1667 void
1668 mdb_cursor_chk(MDB_cursor *mc)
1669 {
1670  unsigned int i;
1671  MDB_node *node;
1672  MDB_page *mp;
1673 
1674  if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) return;
1675  for (i=0; i<mc->mc_top; i++) {
1676  mp = mc->mc_pg[i];
1677  node = NODEPTR(mp, mc->mc_ki[i]);
1678  if (NODEPGNO(node) != mc->mc_pg[i+1]->mp_pgno)
1679  printf("oops!\n");
1680  }
1681  if (mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i]))
1682  printf("ack!\n");
1683  if (XCURSOR_INITED(mc)) {
1684  node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
1685  if (((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) &&
1686  mc->mc_xcursor->mx_cursor.mc_pg[0] != NODEDATA(node)) {
1687  printf("blah!\n");
1688  }
1689  }
1690 }
1691 #endif
1692 
1693 #if (MDB_DEBUG) > 2
/** Count all the pages in each DB and in the freelist
 * and make sure it matches the actual number of pages
 * being used.
 * All named DBs must be open for a correct count.
 * Debug-only consistency check; complains to stderr on a mismatch
 * instead of asserting.
 */
static void mdb_audit(MDB_txn *txn)
{
	MDB_cursor mc;
	MDB_val key, data;
	MDB_ID freecount, count;
	MDB_dbi i;
	int rc;

	/* Sum the free list: each freeDB record's data begins with its
	 * page count (an MDB_ID length prefix).
	 */
	freecount = 0;
	mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
	while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0)
		freecount += *(MDB_ID *)data.mv_data;
	mdb_tassert(txn, rc == MDB_NOTFOUND);

	/* Sum the B-tree pages of every valid DB in this txn */
	count = 0;
	for (i = 0; i<txn->mt_numdbs; i++) {
		MDB_xcursor mx;
		if (!(txn->mt_dbflags[i] & DB_VALID))
			continue;
		mdb_cursor_init(&mc, txn, i, &mx);
		if (txn->mt_dbs[i].md_root == P_INVALID)
			continue;
		count += txn->mt_dbs[i].md_branch_pages +
			txn->mt_dbs[i].md_leaf_pages +
			txn->mt_dbs[i].md_overflow_pages;
		if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) {
			/* DUPSORT data lives in sub-DBs: walk every leaf and
			 * add each F_SUBDATA node's sub-DB page counts too.
			 */
			rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST);
			for (; rc == MDB_SUCCESS; rc = mdb_cursor_sibling(&mc, 1)) {
				unsigned j;
				MDB_page *mp;
				mp = mc.mc_pg[mc.mc_top];
				for (j=0; j<NUMKEYS(mp); j++) {
					MDB_node *leaf = NODEPTR(mp, j);
					if (leaf->mn_flags & F_SUBDATA) {
						MDB_db db;
						/* memcpy: node data may be unaligned */
						memcpy(&db, NODEDATA(leaf), sizeof(db));
						count += db.md_branch_pages + db.md_leaf_pages +
							db.md_overflow_pages;
					}
				}
			}
			mdb_tassert(txn, rc == MDB_NOTFOUND);
		}
	}
	/* free pages + used pages + meta pages must account for the
	 * whole file up to mt_next_pgno
	 */
	if (freecount + count + NUM_METAS != txn->mt_next_pgno) {
		fprintf(stderr, "audit: %"Z"u freecount: %"Z"u count: %"Z"u total: %"Z"u next_pgno: %"Z"u\n",
			txn->mt_txnid, freecount, count+NUM_METAS,
			freecount+count+NUM_METAS, txn->mt_next_pgno);
	}
}
1749 #endif
1750 
1751 int
1752 mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
1753 {
1754  return txn->mt_dbxs[dbi].md_cmp(a, b);
1755 }
1756 
1757 int
1758 mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
1759 {
1760  MDB_cmp_func *dcmp = txn->mt_dbxs[dbi].md_dcmp;
1761 #if UINT_MAX < SIZE_MAX
1762  if (dcmp == mdb_cmp_int && a->mv_size == sizeof(size_t))
1763  dcmp = mdb_cmp_clong;
1764 #endif
1765  return dcmp(a, b);
1766 }
1767 
1768 /** Allocate memory for a page.
1769  * Re-use old malloc'd pages first for singletons, otherwise just malloc.
1770  * Set #MDB_TXN_ERROR on failure.
1771  */
1772 static MDB_page *
1773 mdb_page_malloc(MDB_txn *txn, unsigned num)
1774 {
1775  MDB_env *env = txn->mt_env;
1776  MDB_page *ret = env->me_dpages;
1777  size_t psize = env->me_psize, sz = psize, off;
1778  /* For ! #MDB_NOMEMINIT, psize counts how much to init.
1779  * For a single page alloc, we init everything after the page header.
1780  * For multi-page, we init the final page; if the caller needed that
1781  * many pages they will be filling in at least up to the last page.
1782  */
1783  if (num == 1) {
1784  if (ret) {
1785  VGMEMP_ALLOC(env, ret, sz);
1786  VGMEMP_DEFINED(ret, sizeof(ret->mp_next));
1787  env->me_dpages = ret->mp_next;
1788  return ret;
1789  }
1790  psize -= off = PAGEHDRSZ;
1791  } else {
1792  sz *= num;
1793  off = sz - psize;
1794  }
1795  if ((ret = malloc(sz)) != NULL) {
1796  VGMEMP_ALLOC(env, ret, sz);
1797  if (!(env->me_flags & MDB_NOMEMINIT)) {
1798  memset((char *)ret + off, 0, psize);
1799  ret->mp_pad = 0;
1800  }
1801  } else {
1802  txn->mt_flags |= MDB_TXN_ERROR;
1803  }
1804  return ret;
1805 }
/** Free a single page.
 * Saves single pages to a list, for future reuse.
 * (This is not used for multi-page overflow pages.)
 * The page is pushed on the head of the env's me_dpages free list,
 * from which #mdb_page_malloc() pops singletons.
 */
static void
{
	/* link via mp_next into the singly-linked dpage list */
	mp->mp_next = env->me_dpages;
	VGMEMP_FREE(env, mp);
	env->me_dpages = mp;
}
1817 
/** Free a dirty page.
 * Single pages (including single-page overflow pages) go back on the
 * env's reuse list; larger overflow chunks go straight to the heap.
 */
static void
{
	if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) {
		mdb_page_free(env, dp);
	} else {
		/* large pages just get freed directly */
		VGMEMP_FREE(env, dp);
		free(dp);
	}
}
1830 
/** Return all dirty pages to dpage list */
static void
{
	MDB_env *env = txn->mt_env;
	MDB_ID2L dl = txn->mt_u.dirty_list;
	unsigned i, n = dl[0].mid;	/* dl[0].mid is the entry count */

	/* entries are 1-based; dl[0] is the header */
	for (i = 1; i <= n; i++) {
		mdb_dpage_free(env, dl[i].mptr);
	}
	dl[0].mid = 0;	/* mark the dirty list empty */
}
1844 
/** Loosen or free a single page.
 * Saves single pages to a list for future reuse
 * in this same txn. It has been pulled from the freeDB
 * and already resides on the dirty list, but has been
 * deleted. Use these pages first before pulling again
 * from the freeDB.
 *
 * If the page wasn't dirtied in this txn, just add it
 * to this txn's free list.
 */
static int
{
	int loose = 0;
	pgno_t pgno = mp->mp_pgno;
	MDB_txn *txn = mc->mc_txn;

	/* Only pages this txn dirtied itself can be recycled directly,
	 * and never pages belonging to the freeDB.
	 */
	if ((mp->mp_flags & P_DIRTY) && mc->mc_dbi != FREE_DBI) {
		if (txn->mt_parent) {
			MDB_ID2 *dl = txn->mt_u.dirty_list;
			/* If txn has a parent, make sure the page is in our
			 * dirty list.
			 */
			if (dl[0].mid) {
				unsigned x = mdb_mid2l_search(dl, pgno);
				if (x <= dl[0].mid && dl[x].mid == pgno) {
					if (mp != dl[x].mptr) { /* bad cursor? */
						/* dirty-list entry and cursor disagree:
						 * treat as corruption, invalidate both
						 */
						mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
						txn->mt_flags |= MDB_TXN_ERROR;
						return MDB_CORRUPTED;
					}
					/* ok, it's ours */
					loose = 1;
				}
			}
		} else {
			/* no parent txn, so it's just ours */
			loose = 1;
		}
	}
	if (loose) {
		/* push onto the txn's loose-page list for quick reuse */
		DPRINTF(("loosen db %d page %"Z"u", DDBI(mc),
			mp->mp_pgno));
		NEXT_LOOSE_PAGE(mp) = txn->mt_loose_pgs;
		txn->mt_loose_pgs = mp;
		txn->mt_loose_count++;
		mp->mp_flags |= P_LOOSE;
	} else {
		/* not ours to recycle: record the pgno in the txn free list */
		int rc = mdb_midl_append(&txn->mt_free_pgs, pgno);
		if (rc)
			return rc;
	}

	return MDB_SUCCESS;
}
1900 
/** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn.
 * @param[in] mc A cursor handle for the current operation.
 * @param[in] pflags Flags of the pages to update:
 * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it.
 * @param[in] all No shortcuts. Needed except after a full #mdb_page_flush().
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all)
{
	enum { Mask = P_SUBP|P_DIRTY|P_LOOSE|P_KEEP };
	MDB_txn *txn = mc->mc_txn;
	MDB_cursor *m3, *m0 = mc;
	MDB_xcursor *mx;
	MDB_page *dp, *mp;
	MDB_node *leaf;
	unsigned i, j;
	int rc = MDB_SUCCESS, level;

	/* Mark pages seen by cursors: first the caller's cursor (unless it
	 * is also tracked in mt_cursors, to avoid visiting it twice), then
	 * every tracked cursor of every DB, iterating i from mt_numdbs
	 * down to 0.
	 */
	if (mc->mc_flags & C_UNTRACK)
		mc = NULL; /* will find mc in mt_cursors */
	for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) {
		for (; mc; mc=mc->mc_next) {
			if (!(mc->mc_flags & C_INITIALIZED))
				continue;
			/* Toggle P_KEEP on each page of this cursor's stack,
			 * then follow its xcursor while it is positioned
			 * inside a sub-database.
			 */
			for (m3 = mc;; m3 = &mx->mx_cursor) {
				mp = NULL;
				for (j=0; j<m3->mc_snum; j++) {
					mp = m3->mc_pg[j];
					if ((mp->mp_flags & Mask) == pflags)
						mp->mp_flags ^= P_KEEP;
				}
				mx = m3->mc_xcursor;
				/* Proceed to mx if it is at a sub-database */
				if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED)))
					break;
				if (! (mp && (mp->mp_flags & P_LEAF)))
					break;
				leaf = NODEPTR(mp, m3->mc_ki[j-1]);
				if (!(leaf->mn_flags & F_SUBDATA))
					break;
			}
		}
		if (i == 0)
			break;
	}

	if (all) {
		/* Mark dirty root pages */
		for (i=0; i<txn->mt_numdbs; i++) {
			if (txn->mt_dbflags[i] & DB_DIRTY) {
				pgno_t pgno = txn->mt_dbs[i].md_root;
				if (pgno == P_INVALID)
					continue;
				if ((rc = mdb_page_get(m0, pgno, &dp, &level)) != MDB_SUCCESS)
					break;
				/* level <= 1: page belongs to this txn itself,
				 * not merely inherited from a parent txn
				 */
				if ((dp->mp_flags & Mask) == pflags && level <= 1)
					dp->mp_flags ^= P_KEEP;
			}
		}
	}

	return rc;
}
1966 
1967 static int mdb_page_flush(MDB_txn *txn, int keep);
1968 
/** Spill pages from the dirty list back to disk.
 * This is intended to prevent running into #MDB_TXN_FULL situations,
 * but note that they may still occur in a few cases:
 * 1) our estimate of the txn size could be too small. Currently this
 * seems unlikely, except with a large number of #MDB_MULTIPLE items.
 * 2) child txns may run out of space if their parents dirtied a
 * lot of pages and never spilled them. TODO: we probably should do
 * a preemptive spill during #mdb_txn_begin() of a child txn, if
 * the parent's dirty_room is below a given threshold.
 *
 * Otherwise, if not using nested txns, it is expected that apps will
 * not run into #MDB_TXN_FULL any more. The pages are flushed to disk
 * the same way as for a txn commit, e.g. their P_DIRTY flag is cleared.
 * If the txn never references them again, they can be left alone.
 * If the txn only reads them, they can be used without any fuss.
 * If the txn writes them again, they can be dirtied immediately without
 * going thru all of the work of #mdb_page_touch(). Such references are
 * handled by #mdb_page_unspill().
 *
 * Also note, we never spill DB root pages, nor pages of active cursors,
 * because we'll need these back again soon anyway. And in nested txns,
 * we can't spill a page in a child txn if it was already spilled in a
 * parent txn. That would alter the parent txns' data even though
 * the child hasn't committed yet, and we'd have no way to undo it if
 * the child aborted.
 *
 * @param[in] m0 cursor A cursor handle identifying the transaction and
 * database for which we are checking space.
 * @param[in] key For a put operation, the key being stored.
 * @param[in] data For a put operation, the data being stored.
 * @return 0 on success, non-zero on failure.
 */
static int
{
	MDB_txn *txn = m0->mc_txn;
	MDB_page *dp;
	MDB_ID2L dl = txn->mt_u.dirty_list;
	unsigned int i, j, need;
	int rc;

	/* Sub-cursors share the parent txn's dirty accounting */
	if (m0->mc_flags & C_SUB)
		return MDB_SUCCESS;

	/* Estimate how much space this op will take */
	i = m0->mc_db->md_depth;
	/* Named DBs also dirty the main DB */
	if (m0->mc_dbi >= CORE_DBS)
		i += txn->mt_dbs[MAIN_DBI].md_depth;
	/* For puts, roughly factor in the key+data size */
	if (key)
		i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize;
	i += i; /* double it for good measure */
	need = i;

	/* Still enough dirty room: nothing to spill yet */
	if (txn->mt_dirty_room > i)
		return MDB_SUCCESS;

	if (!txn->mt_spill_pgs) {
		/* NOTE(review): the allocation of mt_spill_pgs is not visible
		 * in this listing; the check below returns ENOMEM if it is
		 * still NULL afterwards.
		 */
		if (!txn->mt_spill_pgs)
			return ENOMEM;
	} else {
		/* purge deleted slots */
		MDB_IDL sl = txn->mt_spill_pgs;
		unsigned int num = sl[0];
		j=0;
		for (i=1; i<=num; i++) {
			/* the LSB marks an entry deleted by mdb_page_unspill() */
			if (!(sl[i] & 1))
				sl[++j] = sl[i];
		}
		sl[0] = j;
	}

	/* Preserve pages which may soon be dirtied again */
	if ((rc = mdb_pages_xkeep(m0, P_DIRTY, 1)) != MDB_SUCCESS)
		goto done;

	/* Less aggressive spill - we originally spilled the entire dirty list,
	 * with a few exceptions for cursor pages and DB root pages. But this
	 * turns out to be a lot of wasted effort because in a large txn many
	 * of those pages will need to be used again. So now we spill only 1/8th
	 * of the dirty pages. Testing revealed this to be a good tradeoff,
	 * better than 1/2, 1/4, or 1/10.
	 */
	if (need < MDB_IDL_UM_MAX / 8)
		need = MDB_IDL_UM_MAX / 8;

	/* Save the page IDs of all the pages we're flushing */
	/* flush from the tail forward, this saves a lot of shifting later on. */
	for (i=dl[0].mid; i && need; i--) {
		MDB_ID pn = dl[i].mid << 1;	/* pgno<<1: LSB reserved as delete mark */
		dp = dl[i].mptr;
		if (dp->mp_flags & (P_LOOSE|P_KEEP))
			continue;
		/* Can't spill twice, make sure it's not already in a parent's
		 * spill list.
		 */
		if (txn->mt_parent) {
			MDB_txn *tx2;
			for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) {
				if (tx2->mt_spill_pgs) {
					j = mdb_midl_search(tx2->mt_spill_pgs, pn);
					if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == pn) {
						dp->mp_flags |= P_KEEP;
						break;
					}
				}
			}
			if (tx2)	/* found in an ancestor: skip this page */
				continue;
		}
		if ((rc = mdb_midl_append(&txn->mt_spill_pgs, pn)))
			goto done;
		need--;
	}

	/* Flush the spilled part of dirty list */
	if ((rc = mdb_page_flush(txn, i)) != MDB_SUCCESS)
		goto done;

	/* Reset any dirty pages we kept that page_flush didn't see */
	rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i);

done:
	txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS;
	return rc;
}
2098 
/** Find oldest txnid still referenced. Expects txn->mt_txnid > 0.
 * Scans the shared reader table for the smallest active read txnid;
 * with no lock table (or no active readers) the answer is simply
 * the txnid preceding this txn's.
 */
static txnid_t
{
	int i;
	txnid_t mr, oldest = txn->mt_txnid - 1;
	if (txn->mt_env->me_txns) {
		/* NOTE(review): the declaration of r (the reader-slot array,
		 * presumably me_txns->mti_readers) is not visible in this
		 * listing. A zero mr_pid marks an unused slot.
		 */
		for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) {
			if (r[i].mr_pid) {
				mr = r[i].mr_txnid;
				if (oldest > mr)
					oldest = mr;
			}
		}
	}
	return oldest;
}
2117 
/** Add a page to the txn's dirty list */
static void
{
	MDB_ID2 mid;
	int rc, (*insert)(MDB_ID2L, MDB_ID2 *);

	/* In WRITEMAP mode a plain append suffices (presumably pages
	 * arrive in ascending pgno order there -- TODO confirm);
	 * otherwise a sorted insert keeps the list ordered by pgno.
	 */
	if (txn->mt_flags & MDB_TXN_WRITEMAP) {
		insert = mdb_mid2l_append;
	} else {
		insert = mdb_mid2l_insert;
	}
	mid.mid = mp->mp_pgno;
	mid.mptr = mp;
	rc = insert(txn->mt_u.dirty_list, &mid);
	mdb_tassert(txn, rc == 0);
	txn->mt_dirty_room--;	/* one less slot before the txn is full */
}
2136 
/** Allocate page numbers and memory for writing. Maintain me_pglast,
 * me_pghead and mt_next_pgno. Set #MDB_TXN_ERROR on failure.
 *
 * If there are free pages available from older transactions, they
 * are re-used first. Otherwise allocate a new page at mt_next_pgno.
 * Do not modify the freedB, just merge freeDB records into me_pghead[]
 * and move me_pglast to say which records were consumed. Only this
 * function can create me_pghead and move me_pglast/mt_next_pgno.
 * @param[in] mc cursor A cursor handle identifying the transaction and
 * database for which we are allocating.
 * @param[in] num the number of pages to allocate.
 * @param[out] mp Address of the allocated page(s). Requests for multiple pages
 * will always be satisfied by a single contiguous chunk of memory.
 * @return 0 on success, non-zero on failure.
 */
static int
{
#ifdef MDB_PARANOID	/* Seems like we can ignore this now */
	/* Get at most <Max_retries> more freeDB records once me_pghead
	 * has enough pages. If not enough, use new pages from the map.
	 * If <Paranoid> and mc is updating the freeDB, only get new
	 * records if me_pghead is empty. Then the freelist cannot play
	 * catch-up with itself by growing while trying to save it.
	 */
	enum { Paranoid = 1, Max_retries = 500 };
#else
	enum { Paranoid = 0, Max_retries = INT_MAX /*infinite*/ };
#endif
	int rc, retry = num * 60;
	MDB_txn *txn = mc->mc_txn;
	MDB_env *env = txn->mt_env;
	pgno_t pgno, *mop = env->me_pghead;	/* mop: in-memory free-page list */
	unsigned i, j, mop_len = mop ? mop[0] : 0, n2 = num-1;
	MDB_page *np;
	txnid_t oldest = 0, last;
	MDB_cursor_op op;
	MDB_cursor m2;
	int found_old = 0;	/* whether mdb_find_oldest() was consulted yet */

	/* If there are any loose pages, just use them */
	if (num == 1 && txn->mt_loose_pgs) {
		np = txn->mt_loose_pgs;
		txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np);
		txn->mt_loose_count--;
		DPRINTF(("db %d use loose page %"Z"u", DDBI(mc),
				np->mp_pgno));
		*mp = np;
		return MDB_SUCCESS;
	}

	*mp = NULL;

	/* If our dirty list is already full, we can't do anything */
	if (txn->mt_dirty_room == 0) {
		rc = MDB_TXN_FULL;
		goto fail;
	}

	/* Each pass scans me_pghead for a fit, then pulls one more
	 * freeDB record and merges it in; exits via search_done on a
	 * fit, or breaks out to allocate fresh pages from the map.
	 */
	for (op = MDB_FIRST;; op = MDB_NEXT) {
		MDB_val key, data;
		MDB_node *leaf;
		pgno_t *idl;

		/* Seek a big enough contiguous page range. Prefer
		 * pages at the tail, just truncating the list.
		 */
		if (mop_len > n2) {
			i = mop_len;
			do {
				pgno = mop[i];
				/* list is sorted descending, so a run of num
				 * consecutive pgnos shows up as this equality
				 */
				if (mop[i-n2] == pgno+n2)
					goto search_done;
			} while (--i > n2);
			if (--retry < 0)
				break;
		}

		if (op == MDB_FIRST) {	/* 1st iteration */
			/* Prepare to fetch more and coalesce */
			last = env->me_pglast;
			oldest = env->me_pgoldest;
			mdb_cursor_init(&m2, txn, FREE_DBI, NULL);
			if (last) {
				op = MDB_SET_RANGE;
				key.mv_data = &last; /* will look up last+1 */
				key.mv_size = sizeof(last);
			}
			if (Paranoid && mc->mc_dbi == FREE_DBI)
				retry = -1;
		}
		if (Paranoid && retry < 0 && mop_len)
			break;

		last++;
		/* Do not fetch more if the record will be too recent */
		if (oldest <= last) {
			/* cached oldest may be stale; recompute once */
			if (!found_old) {
				oldest = mdb_find_oldest(txn);
				env->me_pgoldest = oldest;
				found_old = 1;
			}
			if (oldest <= last)
				break;
		}
		rc = mdb_cursor_get(&m2, &key, NULL, op);
		if (rc) {
			if (rc == MDB_NOTFOUND)
				break;
			goto fail;
		}
		last = *(txnid_t*)key.mv_data;
		if (oldest <= last) {
			if (!found_old) {
				oldest = mdb_find_oldest(txn);
				env->me_pgoldest = oldest;
				found_old = 1;
			}
			if (oldest <= last)
				break;
		}
		np = m2.mc_pg[m2.mc_top];
		leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]);
		if ((rc = mdb_node_read(&m2, leaf, &data)) != MDB_SUCCESS)
			goto fail;

		idl = (MDB_ID *) data.mv_data;
		i = idl[0];
		/* Grow (or create) me_pghead to hold the new record */
		if (!mop) {
			if (!(env->me_pghead = mop = mdb_midl_alloc(i))) {
				rc = ENOMEM;
				goto fail;
			}
		} else {
			if ((rc = mdb_midl_need(&env->me_pghead, i)) != 0)
				goto fail;
			mop = env->me_pghead;
		}
		env->me_pglast = last;
#if (MDB_DEBUG) > 1
		DPRINTF(("IDL read txn %"Z"u root %"Z"u num %u",
			last, txn->mt_dbs[FREE_DBI].md_root, i));
		for (j = i; j; j--)
			DPRINTF(("IDL %"Z"u", idl[j]));
#endif
		/* Merge in descending sorted order */
		mdb_midl_xmerge(mop, idl);
		mop_len = mop[0];
	}

	/* Use new pages from the map when nothing suitable in the freeDB */
	i = 0;	/* i == 0 signals "fresh pages", see below */
	pgno = txn->mt_next_pgno;
	if (pgno + num >= env->me_maxpg) {
			DPUTS("DB size maxed out");
		rc = MDB_MAP_FULL;
		goto fail;
	}

search_done:
	if (env->me_flags & MDB_WRITEMAP) {
		/* WRITEMAP: the page lives directly in the mapped file */
		np = (MDB_page *)(env->me_map + env->me_psize * pgno);
	} else {
		if (!(np = mdb_page_malloc(txn, num))) {
			rc = ENOMEM;
			goto fail;
		}
	}
	if (i) {
		/* Took pages from me_pghead: shrink it and close the gap */
		mop[0] = mop_len -= num;
		/* Move any stragglers down */
		for (j = i-num; j < mop_len; )
			mop[++j] = mop[++i];
	} else {
		txn->mt_next_pgno = pgno + num;
	}
	np->mp_pgno = pgno;
	mdb_page_dirty(txn, np);
	*mp = np;

	return MDB_SUCCESS;

fail:
	txn->mt_flags |= MDB_TXN_ERROR;
	return rc;
}
2323 
/** Copy the used portions of a non-overflow page.
 * A page's used space is the header + node pointers at the bottom
 * (up to mp_lower) and the node data at the top (from mp_upper);
 * the gap in between is unused and can be skipped.
 * @param[in] dst page to copy into
 * @param[in] src page to copy from
 * @param[in] psize size of a page
 */
static void
mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize)
{
	enum { Align = sizeof(pgno_t) };
	indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper-lower;

	/* If page isn't full, just copy the used portion. Adjust
	 * alignment so memcpy may copy words instead of bytes.
	 */
	/* unused &= -Align rounds the gap down to a multiple of Align;
	 * if that leaves nothing (page nearly full) the two-part copy
	 * isn't worth it. LEAF2 pages have no upper/lower split.
	 */
	if ((unused &= -Align) && !IS_LEAF2(src)) {
		/* bottom part rounded up, top part rounded down, so the
		 * skipped middle is entirely inside the unused gap
		 */
		upper = (upper + PAGEBASE) & -Align;
		memcpy(dst, src, (lower + PAGEBASE + (Align-1)) & -Align);
		memcpy((pgno_t *)((char *)dst+upper), (pgno_t *)((char *)src+upper),
			psize - upper);
	} else {
		memcpy(dst, src, psize - unused);
	}
}
2347 
/** Pull a page off the txn's spill list, if present.
 * If a page being referenced was spilled to disk in this txn, bring
 * it back and make it dirty/writable again.
 * @param[in] txn the transaction handle.
 * @param[in] mp the page being referenced. It must not be dirty.
 * @param[out] ret the writable page, if any. ret is unchanged if
 * mp wasn't spilled.
 */
static int
{
	MDB_env *env = txn->mt_env;
	const MDB_txn *tx2;
	unsigned x;
	pgno_t pgno = mp->mp_pgno, pn = pgno << 1;	/* spill IDs are pgno<<1; LSB = deleted */

	/* Search this txn's spill list and every ancestor's */
	for (tx2 = txn; tx2; tx2=tx2->mt_parent) {
		if (!tx2->mt_spill_pgs)
			continue;
		x = mdb_midl_search(tx2->mt_spill_pgs, pn);
		if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) {
			MDB_page *np;
			int num;
			/* Re-dirtying the page consumes a dirty slot */
			if (txn->mt_dirty_room == 0)
				return MDB_TXN_FULL;
			if (IS_OVERFLOW(mp))
				num = mp->mp_pages;
			else
				num = 1;
			if (env->me_flags & MDB_WRITEMAP) {
				np = mp;	/* map is writable: dirty in place */
			} else {
				np = mdb_page_malloc(txn, num);
				if (!np)
					return ENOMEM;
				/* multi-page chunks are copied raw; single pages
				 * get the compacting used-portion copy
				 */
				if (num > 1)
					memcpy(np, mp, num * env->me_psize);
				else
					mdb_page_copy(np, mp, env->me_psize);
			}
			if (tx2 == txn) {
				/* If in current txn, this page is no longer spilled.
				 * If it happens to be the last page, truncate the spill list.
				 * Otherwise mark it as deleted by setting the LSB.
				 */
				if (x == txn->mt_spill_pgs[0])
					txn->mt_spill_pgs[0]--;
				else
					txn->mt_spill_pgs[x] |= 1;
			}	/* otherwise, if belonging to a parent txn, the
				 * page remains spilled until child commits
				 */

			mdb_page_dirty(txn, np);
			np->mp_flags |= P_DIRTY;
			*ret = np;
			break;
		}
	}
	return MDB_SUCCESS;
}
2409 
/** Touch a page: make it dirty and re-insert into tree with updated pgno.
 * Set #MDB_TXN_ERROR on failure.
 * @param[in] mc cursor pointing to the page to be touched
 * @return 0 on success, non-zero on failure.
 */
static int
{
	MDB_page *mp = mc->mc_pg[mc->mc_top], *np;
	MDB_txn *txn = mc->mc_txn;
	MDB_cursor *m2, *m3;
	pgno_t pgno;
	int rc;

	if (!F_ISSET(mp->mp_flags, P_DIRTY)) {
		/* Clean page. First see if it was spilled earlier in this
		 * txn: unspilling is cheaper than a full copy-on-write.
		 */
		if (txn->mt_flags & MDB_TXN_SPILLS) {
			np = NULL;
			rc = mdb_page_unspill(txn, mp, &np);
			if (rc)
				goto fail;
			if (np)
				goto done;
		}
		/* Copy-on-write: reserve room in the free list first so the
		 * retire of the old pgno below cannot fail, then allocate.
		 */
		if ((rc = mdb_midl_need(&txn->mt_free_pgs, 1)) ||
			(rc = mdb_page_alloc(mc, 1, &np)))
			goto fail;
		pgno = np->mp_pgno;
		DPRINTF(("touched db %d page %"Z"u -> %"Z"u", DDBI(mc),
			mp->mp_pgno, pgno));
		mdb_cassert(mc, mp->mp_pgno != pgno);
		mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno);
		/* Update the parent page, if any, to point to the new page */
		if (mc->mc_top) {
			MDB_page *parent = mc->mc_pg[mc->mc_top-1];
			MDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top-1]);
			SETPGNO(node, pgno);
		} else {
			mc->mc_db->md_root = pgno;
		}
	} else if (txn->mt_parent && !IS_SUBP(mp)) {
		MDB_ID2 mid, *dl = txn->mt_u.dirty_list;
		pgno = mp->mp_pgno;
		/* If txn has a parent, make sure the page is in our
		 * dirty list.
		 */
		if (dl[0].mid) {
			unsigned x = mdb_mid2l_search(dl, pgno);
			if (x <= dl[0].mid && dl[x].mid == pgno) {
				if (mp != dl[x].mptr) { /* bad cursor? */
					mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
					txn->mt_flags |= MDB_TXN_ERROR;
					return MDB_CORRUPTED;
				}
				return 0;	/* already in our own dirty list */
			}
		}
		mdb_cassert(mc, dl[0].mid < MDB_IDL_UM_MAX);
		/* No - copy it */
		np = mdb_page_malloc(txn, 1);
		if (!np)
			return ENOMEM;
		mid.mid = pgno;
		mid.mptr = np;
		rc = mdb_mid2l_insert(dl, &mid);
		mdb_cassert(mc, rc == 0);
	} else {
		/* Page already dirty in this txn: nothing to do */
		return 0;
	}

	mdb_page_copy(np, mp, txn->mt_env->me_psize);
	np->mp_pgno = pgno;
	np->mp_flags |= P_DIRTY;

done:
	/* Adjust cursors pointing to mp */
	mc->mc_pg[mc->mc_top] = np;
	m2 = txn->mt_cursors[mc->mc_dbi];
	if (mc->mc_flags & C_SUB) {
		/* mc is a sub-cursor: its siblings live inside xcursors */
		for (; m2; m2=m2->mc_next) {
			m3 = &m2->mc_xcursor->mx_cursor;
			if (m3->mc_snum < mc->mc_snum) continue;
			if (m3->mc_pg[mc->mc_top] == mp)
				m3->mc_pg[mc->mc_top] = np;
		}
	} else {
		for (; m2; m2=m2->mc_next) {
			if (m2->mc_snum < mc->mc_snum) continue;
			if (m2 == mc) continue;
			if (m2->mc_pg[mc->mc_top] == mp) {
				m2->mc_pg[mc->mc_top] = np;
				/* keep any xcursor aimed at this leaf in sync */
				if (IS_LEAF(np))
					XCURSOR_REFRESH(m2, mc->mc_top, np);
			}
		}
	}
	return 0;

fail:
	txn->mt_flags |= MDB_TXN_ERROR;
	return rc;
}
2511 
/** Flush the environment's buffers to disk.
 * Read-only environments cannot sync: returns EACCES.
 * Honors #MDB_NOSYNC unless force is nonzero. WRITEMAP envs use
 * msync (async when #MDB_MAPASYNC and not forced); otherwise
 * fdatasync (or fsync where BROKEN_FDATASYNC applies).
 */
int
{
	int rc = 0;
	if (env->me_flags & MDB_RDONLY)
		return EACCES;
	if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) {
		if (env->me_flags & MDB_WRITEMAP) {
			int flags = ((env->me_flags & MDB_MAPASYNC) && !force)
				? MS_ASYNC : MS_SYNC;
			if (MDB_MSYNC(env->me_map, env->me_mapsize, flags))
				rc = ErrCode();
#ifdef _WIN32
			/* Windows: also flush the file handle for a true
			 * synchronous sync
			 */
			else if (flags == MS_SYNC && MDB_FDATASYNC(env->me_fd))
				rc = ErrCode();
#endif
		} else {
#ifdef BROKEN_FDATASYNC
			/* some filesystems need a full fsync to be durable */
			if (env->me_flags & MDB_FSYNCONLY) {
				if (fsync(env->me_fd))
					rc = ErrCode();
			} else
#endif
			if (MDB_FDATASYNC(env->me_fd))
				rc = ErrCode();
		}
	}
	return rc;
}
2541 
/** Back up parent txn's cursors, then grab the originals for tracking.
 * Each of src's tracked cursors gets a malloc'd backup copy (with its
 * xcursor copied immediately after it in the same allocation); the
 * original cursor is retargeted at dst and relinked into dst's lists.
 * #mdb_cursors_close() later restores or merges from the backups.
 */
static int
{
	MDB_cursor *mc, *bk;
	MDB_xcursor *mx;
	size_t size;
	int i;

	for (i = src->mt_numdbs; --i >= 0; ) {
		if ((mc = src->mt_cursors[i]) != NULL) {
			/* one allocation holds the cursor copy plus, if
			 * present, its xcursor copy
			 */
			size = sizeof(MDB_cursor);
			if (mc->mc_xcursor)
				size += sizeof(MDB_xcursor);
			for (; mc; mc = bk->mc_next) {
				bk = malloc(size);
				if (!bk)
					return ENOMEM;
				*bk = *mc;
				mc->mc_backup = bk;
				mc->mc_db = &dst->mt_dbs[i];
				/* Kill pointers into src to reduce abuse: The
				 * user may not use mc until dst ends. But we need a valid
				 * txn pointer here for cursor fixups to keep working.
				 */
				mc->mc_txn = dst;
				mc->mc_dbflag = &dst->mt_dbflags[i];
				if ((mx = mc->mc_xcursor) != NULL) {
					*(MDB_xcursor *)(bk+1) = *mx;
					mx->mx_cursor.mc_txn = dst;
				}
				/* relink into dst's tracked-cursor chain */
				mc->mc_next = dst->mt_cursors[i];
				dst->mt_cursors[i] = mc;
			}
		}
	}
	return MDB_SUCCESS;
}
2580 
2581 /** Close this write txn's cursors, give parent txn's cursors back to parent.
2582  * @param[in] txn the transaction handle.
2583  * @param[in] merge true to keep changes to parent cursors, false to revert.
2584  * @return 0 on success, non-zero on failure.
2585  */
2586 static void
2587 mdb_cursors_close(MDB_txn *txn, unsigned merge)
2588 {
2589  MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk;
2590  MDB_xcursor *mx;
2591  int i;
2592 
2593  for (i = txn->mt_numdbs; --i >= 0; ) {
2594  for (mc = cursors[i]; mc; mc = next) {
2595  next = mc->mc_next;
2596  if ((bk = mc->mc_backup) != NULL) {
2597  if (merge) {
2598  /* Commit changes to parent txn */
2599  mc->mc_next = bk->mc_next;
2600  mc->mc_backup = bk->mc_backup;
2601  mc->mc_txn = bk->mc_txn;
2602  mc->mc_db = bk->mc_db;
2603  mc->mc_dbflag = bk->mc_dbflag;
2604  if ((mx = mc->mc_xcursor) != NULL)
2605  mx->mx_cursor.mc_txn = bk->mc_txn;
2606  } else {
2607  /* Abort nested txn */
2608  *mc = *bk;
2609  if ((mx = mc->mc_xcursor) != NULL)
2610  *mx = *(MDB_xcursor *)(bk+1);
2611  }
2612  mc = bk;
2613  }
2614  /* Only malloced cursors are permanently tracked. */
2615  free(mc);
2616  }
2617  cursors[i] = NULL;
2618  }
2619 }
2620 
2621 #if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */
2622 enum Pidlock_op {
2623  Pidset, Pidcheck
2624 };
2625 #else
2627  Pidset = F_SETLK, Pidcheck = F_GETLK
2628 };
2629 #endif
2630 
/** Set or check a pid lock. Set returns 0 on success.
 * Check returns 0 if the process is certainly dead, nonzero if it may
 * be alive (the lock exists or an error happened so we do not know).
 *
 * On Windows Pidset is a no-op, we merely check for the existence
 * of the process with the given pid. On POSIX we use a single byte
 * lock on the lockfile, set at an offset equal to the pid.
 */
static int
{
#if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */
	int ret = 0;
	HANDLE h;
	if (op == Pidcheck) {
		h = OpenProcess(env->me_pidquery, FALSE, pid);
		/* No documented "no such process" code, but other program use this: */
		if (!h)
			return ErrCode() != ERROR_INVALID_PARAMETER;
		/* A process exists until all handles to it close. Has it exited? */
		ret = WaitForSingleObject(h, 0) != 0;
		CloseHandle(h);
	}
	return ret;
#else
	/* POSIX: take/test a write lock on byte <pid> of the lockfile */
	for (;;) {
		int rc;
		struct flock lock_info;
		memset(&lock_info, 0, sizeof(lock_info));
		lock_info.l_type = F_WRLCK;
		lock_info.l_whence = SEEK_SET;
		lock_info.l_start = pid;
		lock_info.l_len = 1;
		if ((rc = fcntl(env->me_lfd, op, &lock_info)) == 0) {
			/* F_GETLK succeeded: if any lock is reported, the
			 * owner may still be alive -- return nonzero
			 */
			if (op == F_GETLK && lock_info.l_type != F_UNLCK)
				rc = -1;
		} else if ((rc = ErrCode()) == EINTR) {
			continue;	/* interrupted by a signal: retry */
		}
		return rc;
	}
#endif
}
2674 
2675 /** Common code for #mdb_txn_begin() and #mdb_txn_renew().
2676  * @param[in] txn the transaction handle to initialize
2677  * @return 0 on success, non-zero on failure.
2678  */
2679 static int
 /* NOTE(review): the source line carrying the signature,
  * mdb_txn_renew0(MDB_txn *txn), is absent from this extraction —
  * confirm against upstream mdb.c. */
2681 {
2682  MDB_env *env = txn->mt_env;
2683  MDB_txninfo *ti = env->me_txns;
2684  MDB_meta *meta;
2685  unsigned int i, nr, flags = txn->mt_flags;
2686  uint16_t x;
2687  int rc, new_notls = 0;
2688 
 /* Read-only path: acquire (or reuse) a slot in the shared reader table. */
2689  if ((flags &= MDB_TXN_RDONLY) != 0) {
2690  if (!ti) {
 /* No lock table: just snapshot the newest meta page. */
2691  meta = mdb_env_pick_meta(env);
2692  txn->mt_txnid = meta->mm_txnid;
2693  txn->mt_u.reader = NULL;
2694  } else {
 /* With MDB_NOTLS the reader slot travels with the txn, not the thread. */
2695  MDB_reader *r = (env->me_flags & MDB_NOTLS) ? txn->mt_u.reader :
2696  pthread_getspecific(env->me_txkey);
2697  if (r) {
 /* A slot from another pid, or one whose txnid is still live, is an error. */
2698  if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1)
2699  return MDB_BAD_RSLOT;
2700  } else {
2701  MDB_PID_T pid = env->me_pid;
2702  MDB_THR_T tid = pthread_self();
2703  mdb_mutexref_t rmutex = env->me_rmutex;
2704 
 /* First reader in this process: register our pid in the lock file. */
2705  if (!env->me_live_reader) {
2706  rc = mdb_reader_pid(env, Pidset, pid);
2707  if (rc)
2708  return rc;
2709  env->me_live_reader = 1;
2710  }
2711 
2712  if (LOCK_MUTEX(rc, env, rmutex))
2713  return rc;
 /* Find a free slot (mr_pid == 0) in the reader table. */
2714  nr = ti->mti_numreaders;
2715  for (i=0; i<nr; i++)
2716  if (ti->mti_readers[i].mr_pid == 0)
2717  break;
2718  if (i == env->me_maxreaders) {
2719  UNLOCK_MUTEX(rmutex);
2720  return MDB_READERS_FULL;
2721  }
2722  r = &ti->mti_readers[i];
2723  /* Claim the reader slot, carefully since other code
2724  * uses the reader table un-mutexed: First reset the
2725  * slot, next publish it in mti_numreaders. After
2726  * that, it is safe for mdb_env_close() to touch it.
2727  * When it will be closed, we can finally claim it.
2728  */
2729  r->mr_pid = 0;
2730  r->mr_txnid = (txnid_t)-1;
2731  r->mr_tid = tid;
2732  if (i == nr)
2733  ti->mti_numreaders = ++nr;
2734  env->me_close_readers = nr;
2735  r->mr_pid = pid;
2736  UNLOCK_MUTEX(rmutex);
2737 
 /* With MDB_NOTLS the caller owns the slot; otherwise stash it in TLS. */
2738  new_notls = (env->me_flags & MDB_NOTLS);
2739  if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) {
2740  r->mr_pid = 0;
2741  return rc;
2742  }
2743  }
2744  do /* LY: Retry on a race, ITS#7970. */
2745  r->mr_txnid = ti->mti_txnid;
2746  while(r->mr_txnid != ti->mti_txnid);
2747  txn->mt_txnid = r->mr_txnid;
2748  txn->mt_u.reader = r;
2749  meta = env->me_metas[txn->mt_txnid & 1];
2750  }
2751 
2752  } else {
2753  /* Not yet touching txn == env->me_txn0, it may be active */
2754  if (ti) {
 /* Serialize writers via the writer mutex; released in mdb_txn_end(). */
2755  if (LOCK_MUTEX(rc, env, env->me_wmutex))
2756  return rc;
2757  txn->mt_txnid = ti->mti_txnid;
2758  meta = env->me_metas[txn->mt_txnid & 1];
2759  } else {
2760  meta = mdb_env_pick_meta(env);
2761  txn->mt_txnid = meta->mm_txnid;
2762  }
 /* A write txn works under the next txnid. */
2763  txn->mt_txnid++;
2764 #if MDB_DEBUG
2765  if (txn->mt_txnid == mdb_debug_start)
2766  mdb_debug = 1;
2767 #endif
2768  txn->mt_child = NULL;
2769  txn->mt_loose_pgs = NULL;
2770  txn->mt_loose_count = 0;
 /* NOTE(review): a source line is missing here in this extraction
  * (upstream resets txn->mt_dirty_room) — confirm. */
2772  txn->mt_u.dirty_list = env->me_dirty_list;
2773  txn->mt_u.dirty_list[0].mid = 0;
2774  txn->mt_free_pgs = env->me_free_pgs;
2775  txn->mt_free_pgs[0] = 0;
2776  txn->mt_spill_pgs = NULL;
2777  env->me_txn = txn;
2778  memcpy(txn->mt_dbiseqs, env->me_dbiseqs, env->me_maxdbs * sizeof(unsigned int));
2779  }
2780 
2781  /* Copy the DB info and flags */
2782  memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDB_db));
2783 
2784  /* Moved to here to avoid a data race in read TXNs */
2785  txn->mt_next_pgno = meta->mm_last_pg+1;
2786 
2787  txn->mt_flags = flags;
2788 
2789  /* Setup db info */
2790  txn->mt_numdbs = env->me_numdbs;
2791  for (i=CORE_DBS; i<txn->mt_numdbs; i++) {
2792  x = env->me_dbflags[i];
2793  txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS;
2794  txn->mt_dbflags[i] = (x & MDB_VALID) ? DB_VALID|DB_USRVALID|DB_STALE : 0;
2795  }
 /* NOTE(review): a source line is missing here (upstream also marks
  * MAIN_DBI valid) — confirm. */
2797  txn->mt_dbflags[FREE_DBI] = DB_VALID;
2798 
2799  if (env->me_flags & MDB_FATAL_ERROR) {
2800  DPUTS("environment had fatal error, must shutdown!");
2801  rc = MDB_PANIC;
2802  } else if (env->me_maxpg < txn->mt_next_pgno) {
2803  rc = MDB_MAP_RESIZED;
2804  } else {
2805  return MDB_SUCCESS;
2806  }
 /* Failure: tear the txn back down, releasing the reader slot if we took one. */
2807  mdb_txn_end(txn, new_notls /*0 or MDB_END_SLOT*/ | MDB_END_FAIL_BEGIN);
2808  return rc;
2809 }
2810 
2811 int
 /* NOTE(review): the signature line, mdb_txn_renew(MDB_txn *txn), is absent
  * from this extraction. Renews a previously-reset read-only transaction. */
2813 {
2814  int rc;
2815 
 /* Only valid on a read-only txn that has already been reset/finished. */
2816  if (!txn || !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY|MDB_TXN_FINISHED))
2817  return EINVAL;
2818 
2819  rc = mdb_txn_renew0(txn);
2820  if (rc == MDB_SUCCESS) {
2821  DPRINTF(("renew txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
2822  txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
2823  (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root));
2824  }
2825  return rc;
2826 }
2827 
2828 int
2829 mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret)
2830 {
2831  MDB_txn *txn;
2832  MDB_ntxn *ntxn;
2833  int rc, size, tsize;
2834 
 /* NOTE(review): a source line is missing here in this extraction
  * (upstream validates caller-supplied flags / env fatal state) — confirm. */
2836  flags |= env->me_flags & MDB_WRITEMAP;
2837 
2838  if (env->me_flags & MDB_RDONLY & ~flags) /* write txn in RDONLY env */
2839  return EACCES;
2840 
2841  if (parent) {
2842  /* Nested transactions: Max 1 child, write txns only, no writemap */
2843  flags |= parent->mt_flags;
 /* NOTE(review): the condition line is missing here — upstream rejects
  * RDONLY/WRITEMAP/blocked parents before this return. */
2845  return (parent->mt_flags & MDB_TXN_RDONLY) ? EINVAL : MDB_BAD_TXN;
2846  }
2847  /* Child txns save MDB_pgstate and use own copy of cursors */
2848  size = env->me_maxdbs * (sizeof(MDB_db)+sizeof(MDB_cursor *)+1);
2849  size += tsize = sizeof(MDB_ntxn);
2850  } else if (flags & MDB_RDONLY) {
2851  size = env->me_maxdbs * (sizeof(MDB_db)+1);
2852  size += tsize = sizeof(MDB_txn);
2853  } else {
2854  /* Reuse preallocated write txn. However, do not touch it until
2855  * mdb_txn_renew0() succeeds, since it currently may be active.
2856  */
2857  txn = env->me_txn0;
2858  goto renew;
2859  }
2860  if ((txn = calloc(1, size)) == NULL) {
2861  DPRINTF(("calloc: %s", strerror(errno)));
2862  return ENOMEM;
2863  }
 /* DB table and flag array live inside the single allocation. */
2864  txn->mt_dbxs = env->me_dbxs; /* static */
2865  txn->mt_dbs = (MDB_db *) ((char *)txn + tsize);
2866  txn->mt_dbflags = (unsigned char *)txn + size - env->me_maxdbs;
2867  txn->mt_flags = flags;
2868  txn->mt_env = env;
2869 
2870  if (parent) {
2871  unsigned int i;
2872  txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs);
2873  txn->mt_dbiseqs = parent->mt_dbiseqs;
2874  txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE);
2875  if (!txn->mt_u.dirty_list ||
 /* NOTE(review): a line is missing here — upstream also checks the
  * mdb_midl_alloc() of mt_free_pgs in this condition. */
2877  {
2878  free(txn->mt_u.dirty_list);
2879  free(txn);
2880  return ENOMEM;
2881  }
2882  txn->mt_txnid = parent->mt_txnid;
2883  txn->mt_dirty_room = parent->mt_dirty_room;
2884  txn->mt_u.dirty_list[0].mid = 0;
2885  txn->mt_spill_pgs = NULL;
2886  txn->mt_next_pgno = parent->mt_next_pgno;
2887  parent->mt_flags |= MDB_TXN_HAS_CHILD;
2888  parent->mt_child = txn;
2889  txn->mt_parent = parent;
2890  txn->mt_numdbs = parent->mt_numdbs;
2891  memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db));
2892  /* Copy parent's mt_dbflags, but clear DB_NEW */
2893  for (i=0; i<txn->mt_numdbs; i++)
2894  txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW;
2895  rc = 0;
2896  ntxn = (MDB_ntxn *)txn;
2897  ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */
2898  if (env->me_pghead) {
 /* Child gets its own copy of the reclaimed-page list. */
2899  size = MDB_IDL_SIZEOF(env->me_pghead);
2900  env->me_pghead = mdb_midl_alloc(env->me_pghead[0]);
2901  if (env->me_pghead)
2902  memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size);
2903  else
2904  rc = ENOMEM;
2905  }
2906  if (!rc)
2907  rc = mdb_cursor_shadow(parent, txn);
2908  if (rc)
 /* NOTE(review): a line is missing here — upstream ends the failed
  * child txn (mdb_txn_end with a fail-begin-child mode). */
2910  } else { /* MDB_RDONLY */
2911  txn->mt_dbiseqs = env->me_dbiseqs;
2912 renew:
2913  rc = mdb_txn_renew0(txn);
2914  }
2915  if (rc) {
2916  if (txn != env->me_txn0)
2917  free(txn);
2918  } else {
2919  txn->mt_flags |= flags; /* could not change txn=me_txn0 earlier */
2920  *ret = txn;
2921  DPRINTF(("begin txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
2922  txn->mt_txnid, (flags & MDB_RDONLY) ? 'r' : 'w',
2923  (void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root));
2924  }
2925 
2926  return rc;
2927 }
2928 
2929 MDB_env *
 /* NOTE(review): signature line (mdb_txn_env(MDB_txn *txn)) is missing from
  * this extraction. Returns the environment the txn belongs to, or NULL. */
2931 {
2932  if(!txn) return NULL;
2933  return txn->mt_env;
2934 }
2935 
2936 size_t
 /* NOTE(review): signature line (mdb_txn_id(MDB_txn *txn)) is missing from
  * this extraction. Returns the txn's snapshot id, or 0 for NULL. */
2938 {
2939  if(!txn) return 0;
2940  return txn->mt_txnid;
2941 }
2942 
2943 /** Export or close DBI handles opened in this txn. */
2944 static void
2945 mdb_dbis_update(MDB_txn *txn, int keep)
2946 {
2947  int i;
2948  MDB_dbi n = txn->mt_numdbs;
2949  MDB_env *env = txn->mt_env;
2950  unsigned char *tdbflags = txn->mt_dbflags;
2951 
2952  for (i = n; --i >= CORE_DBS;) {
2953  if (tdbflags[i] & DB_NEW) {
2954  if (keep) {
2955  env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID;
2956  } else {
2957  char *ptr = env->me_dbxs[i].md_name.mv_data;
2958  if (ptr) {
2959  env->me_dbxs[i].md_name.mv_data = NULL;
2960  env->me_dbxs[i].md_name.mv_size = 0;
2961  env->me_dbflags[i] = 0;
2962  env->me_dbiseqs[i]++;
2963  free(ptr);
2964  }
2965  }
2966  }
2967  }
2968  if (keep && env->me_numdbs < n)
2969  env->me_numdbs = n;
2970 }
2971 
2972 /** End a transaction, except successful commit of a nested transaction.
2973  * May be called twice for readonly txns: First reset it, then abort.
2974  * @param[in] txn the transaction handle to end
2975  * @param[in] mode why and how to end the transaction
2976  */
2977 static void
2978 mdb_txn_end(MDB_txn *txn, unsigned mode)
2979 {
2980  MDB_env *env = txn->mt_env;
2981 #if MDB_DEBUG
2982  static const char *const names[] = MDB_END_NAMES;
2983 #endif
2984 
2985  /* Export or close DBI handles opened in this txn */
 /* NOTE(review): a source line is missing here — upstream calls
  * mdb_dbis_update(txn, mode & MDB_END_UPDATE) at this point. */
2987 
2988  DPRINTF(("%s txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
 /* NOTE(review): the first DPRINTF argument line (mode name lookup)
  * is missing from this extraction. */
2990  txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
2991  (void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root));
2992 
2993  if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
2994  if (txn->mt_u.reader) {
 /* Mark the reader slot idle; who clears mr_pid depends on TLS mode. */
2995  txn->mt_u.reader->mr_txnid = (txnid_t)-1;
2996  if (!(env->me_flags & MDB_NOTLS)) {
2997  txn->mt_u.reader = NULL; /* txn does not own reader */
2998  } else if (mode & MDB_END_SLOT) {
2999  txn->mt_u.reader->mr_pid = 0;
3000  txn->mt_u.reader = NULL;
3001  } /* else txn owns the slot until it does MDB_END_SLOT */
3002  }
3003  txn->mt_numdbs = 0; /* prevent further DBI activity */
3004  txn->mt_flags |= MDB_TXN_FINISHED;
3005 
3006  } else if (!F_ISSET(txn->mt_flags, MDB_TXN_FINISHED)) {
3007  pgno_t *pghead = env->me_pghead;
3008 
3009  if (!(mode & MDB_END_UPDATE)) /* !(already closed cursors) */
3010  mdb_cursors_close(txn, 0);
3011  if (!(env->me_flags & MDB_WRITEMAP)) {
3012  mdb_dlist_free(txn);
3013  }
3014 
3015  txn->mt_numdbs = 0;
3016  txn->mt_flags = MDB_TXN_FINISHED;
3017 
3018  if (!txn->mt_parent) {
 /* NOTE(review): a line is missing here — upstream shrinks
  * txn->mt_free_pgs before handing it back to the env. */
3020  env->me_free_pgs = txn->mt_free_pgs;
3021  /* me_pgstate: */
3022  env->me_pghead = NULL;
3023  env->me_pglast = 0;
3024 
3025  env->me_txn = NULL;
3026  mode = 0; /* txn == env->me_txn0, do not free() it */
3027 
3028  /* The writer mutex was locked in mdb_txn_begin. */
3029  if (env->me_txns)
3030  UNLOCK_MUTEX(env->me_wmutex);
3031  } else {
3032  txn->mt_parent->mt_child = NULL;
 /* NOTE(review): missing line — upstream clears the parent's
  * MDB_TXN_HAS_CHILD flag here. */
3034  env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate;
3035  mdb_midl_free(txn->mt_free_pgs);
 /* NOTE(review): missing line — upstream also frees
  * txn->mt_spill_pgs here. */
3037  free(txn->mt_u.dirty_list);
3038  }
3039 
3040  mdb_midl_free(pghead);
3041  }
3042 
3043  if (mode & MDB_END_FREE)
3044  free(txn);
3045 }
3046 
3047 void
 /* NOTE(review): signature line (mdb_txn_reset(MDB_txn *txn)) is missing
  * from this extraction. Resets a read-only txn so it can be renewed. */
3049 {
3050  if (txn == NULL)
3051  return;
3052 
3053  /* This call is only valid for read-only txns */
3054  if (!(txn->mt_flags & MDB_TXN_RDONLY))
3055  return;
3056 
3057  mdb_txn_end(txn, MDB_END_RESET);
3058 }
3059 
3060 void
 /* NOTE(review): signature line (mdb_txn_abort(MDB_txn *txn)) is missing
  * from this extraction. */
3062 {
3063  if (txn == NULL)
3064  return;
3065 
 /* A nested child must be aborted first; it shares this txn's state. */
3066  if (txn->mt_child)
3067  mdb_txn_abort(txn->mt_child);
3068 
 /* NOTE(review): the mdb_txn_end(...) call (original line 3069) is missing
  * from this extraction — confirm against upstream. */
3070 }
3071 
3072 /** Save the freelist as of this transaction to the freeDB.
3073  * This changes the freelist. Keep trying until it stabilizes.
3074  */
3075 static int
 /* NOTE(review): signature line (mdb_freelist_save(MDB_txn *txn)) is
  * missing from this extraction. */
3077 {
3078  /* env->me_pghead[] can grow and shrink during this call.
3079  * env->me_pglast and txn->mt_free_pgs[] can only grow.
3080  * Page numbers cannot disappear from txn->mt_free_pgs[].
3081  */
3082  MDB_cursor mc;
3083  MDB_env *env = txn->mt_env;
3084  int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1;
3085  txnid_t pglast = 0, head_id = 0;
3086  pgno_t freecnt = 0, *free_pgs, *mop;
3087  ssize_t head_room = 0, total_room = 0, mop_len, clean_limit;
3088 
3089  mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
3090 
3091  if (env->me_pghead) {
3092  /* Make sure first page of freeDB is touched and on freelist */
 /* NOTE(review): a line is missing here — upstream performs a
  * page-search (first page, modify) whose rc is tested below. */
3094  if (rc && rc != MDB_NOTFOUND)
3095  return rc;
3096  }
3097 
3098  if (!env->me_pghead && txn->mt_loose_pgs) {
3099  /* Put loose page numbers in mt_free_pgs, since
3100  * we may be unable to return them to me_pghead.
3101  */
3102  MDB_page *mp = txn->mt_loose_pgs;
3103  MDB_ID2 *dl = txn->mt_u.dirty_list;
3104  unsigned x;
3105  if ((rc = mdb_midl_need(&txn->mt_free_pgs, txn->mt_loose_count)) != 0)
3106  return rc;
3107  for (; mp; mp = NEXT_LOOSE_PAGE(mp)) {
3108  mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno);
3109  /* must also remove from dirty list */
3110  if (txn->mt_flags & MDB_TXN_WRITEMAP) {
3111  for (x=1; x<=dl[0].mid; x++)
3112  if (dl[x].mid == mp->mp_pgno)
3113  break;
3114  mdb_tassert(txn, x <= dl[0].mid);
3115  } else {
3116  x = mdb_mid2l_search(dl, mp->mp_pgno);
3117  mdb_tassert(txn, dl[x].mid == mp->mp_pgno);
3118  mdb_dpage_free(env, mp);
3119  }
3120  dl[x].mptr = NULL;
3121  }
3122  {
3123  /* squash freed slots out of the dirty list */
3124  unsigned y;
3125  for (y=1; dl[y].mptr && y <= dl[0].mid; y++);
3126  if (y <= dl[0].mid) {
3127  for(x=y, y++;;) {
3128  while (!dl[y].mptr && y <= dl[0].mid) y++;
3129  if (y > dl[0].mid) break;
3130  dl[x++] = dl[y++];
3131  }
3132  dl[0].mid = x-1;
3133  } else {
3134  /* all slots freed */
3135  dl[0].mid = 0;
3136  }
3137  }
3138  txn->mt_loose_pgs = NULL;
3139  txn->mt_loose_count = 0;
3140  }
3141 
3142  /* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */
3143  clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP))
3144  ? SSIZE_MAX : maxfree_1pg;
3145 
3146  for (;;) {
3147  /* Come back here after each Put() in case freelist changed */
3148  MDB_val key, data;
3149  pgno_t *pgs;
3150  ssize_t j;
3151 
3152  /* If using records from freeDB which we have not yet
3153  * deleted, delete them and any we reserved for me_pghead.
3154  */
3155  while (pglast < env->me_pglast) {
3156  rc = mdb_cursor_first(&mc, &key, NULL);
3157  if (rc)
3158  return rc;
3159  pglast = head_id = *(txnid_t *)key.mv_data;
3160  total_room = head_room = 0;
3161  mdb_tassert(txn, pglast <= env->me_pglast);
3162  rc = mdb_cursor_del(&mc, 0);
3163  if (rc)
3164  return rc;
3165  }
3166 
3167  /* Save the IDL of pages freed by this txn, to a single record */
3168  if (freecnt < txn->mt_free_pgs[0]) {
3169  if (!freecnt) {
3170  /* Make sure last page of freeDB is touched and on freelist */
 /* NOTE(review): a line is missing here — upstream performs a
  * page-search (last page, modify) whose rc is tested below. */
3172  if (rc && rc != MDB_NOTFOUND)
3173  return rc;
3174  }
3175  free_pgs = txn->mt_free_pgs;
3176  /* Write to last page of freeDB */
3177  key.mv_size = sizeof(txn->mt_txnid);
3178  key.mv_data = &txn->mt_txnid;
3179  do {
3180  freecnt = free_pgs[0];
3181  data.mv_size = MDB_IDL_SIZEOF(free_pgs);
3182  rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
3183  if (rc)
3184  return rc;
3185  /* Retry if mt_free_pgs[] grew during the Put() */
3186  free_pgs = txn->mt_free_pgs;
3187  } while (freecnt < free_pgs[0]);
3188  mdb_midl_sort(free_pgs);
3189  memcpy(data.mv_data, free_pgs, data.mv_size);
3190 #if (MDB_DEBUG) > 1
3191  {
3192  unsigned int i = free_pgs[0];
3193  DPRINTF(("IDL write txn %"Z"u root %"Z"u num %u",
3194  txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i));
3195  for (; i; i--)
3196  DPRINTF(("IDL %"Z"u", free_pgs[i]));
3197  }
3198 #endif
3199  continue;
3200  }
3201 
3202  mop = env->me_pghead;
3203  mop_len = (mop ? mop[0] : 0) + txn->mt_loose_count;
3204 
3205  /* Reserve records for me_pghead[]. Split it if multi-page,
3206  * to avoid searching freeDB for a page range. Use keys in
3207  * range [1,me_pglast]: Smaller than txnid of oldest reader.
3208  */
3209  if (total_room >= mop_len) {
3210  if (total_room == mop_len || --more < 0)
3211  break;
3212  } else if (head_room >= maxfree_1pg && head_id > 1) {
3213  /* Keep current record (overflow page), add a new one */
3214  head_id--;
3215  head_room = 0;
3216  }
3217  /* (Re)write {key = head_id, IDL length = head_room} */
3218  total_room -= head_room;
3219  head_room = mop_len - total_room;
3220  if (head_room > maxfree_1pg && head_id > 1) {
3221  /* Overflow multi-page for part of me_pghead */
3222  head_room /= head_id; /* amortize page sizes */
3223  head_room += maxfree_1pg - head_room % (maxfree_1pg + 1);
3224  } else if (head_room < 0) {
3225  /* Rare case, not bothering to delete this record */
3226  head_room = 0;
3227  }
3228  key.mv_size = sizeof(head_id);
3229  key.mv_data = &head_id;
3230  data.mv_size = (head_room + 1) * sizeof(pgno_t);
3231  rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
3232  if (rc)
3233  return rc;
3234  /* IDL is initially empty, zero out at least the length */
3235  pgs = (pgno_t *)data.mv_data;
3236  j = head_room > clean_limit ? head_room : 0;
3237  do {
3238  pgs[j] = 0;
3239  } while (--j >= 0);
3240  total_room += head_room;
3241  }
3242 
3243  /* Return loose page numbers to me_pghead, though usually none are
3244  * left at this point. The pages themselves remain in dirty_list.
3245  */
3246  if (txn->mt_loose_pgs) {
3247  MDB_page *mp = txn->mt_loose_pgs;
3248  unsigned count = txn->mt_loose_count;
3249  MDB_IDL loose;
3250  /* Room for loose pages + temp IDL with same */
3251  if ((rc = mdb_midl_need(&env->me_pghead, 2*count+1)) != 0)
3252  return rc;
3253  mop = env->me_pghead;
3254  loose = mop + MDB_IDL_ALLOCLEN(mop) - count;
3255  for (count = 0; mp; mp = NEXT_LOOSE_PAGE(mp))
3256  loose[ ++count ] = mp->mp_pgno;
3257  loose[0] = count;
3258  mdb_midl_sort(loose);
3259  mdb_midl_xmerge(mop, loose);
3260  txn->mt_loose_pgs = NULL;
3261  txn->mt_loose_count = 0;
3262  mop_len = mop[0];
3263  }
3264 
3265  /* Fill in the reserved me_pghead records */
3266  rc = MDB_SUCCESS;
3267  if (mop_len) {
3268  MDB_val key, data;
3269 
3270  mop += mop_len;
3271  rc = mdb_cursor_first(&mc, &key, &data);
3272  for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) {
3273  txnid_t id = *(txnid_t *)key.mv_data;
3274  ssize_t len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1;
3275  MDB_ID save;
3276 
3277  mdb_tassert(txn, len >= 0 && id <= env->me_pglast);
3278  key.mv_data = &id;
3279  if (len > mop_len) {
3280  len = mop_len;
3281  data.mv_size = (len + 1) * sizeof(MDB_ID);
3282  }
3283  data.mv_data = mop -= len;
3284  save = mop[0];
3285  mop[0] = len;
3286  rc = mdb_cursor_put(&mc, &key, &data, MDB_CURRENT);
3287  mop[0] = save;
3288  if (rc || !(mop_len -= len))
3289  break;
3290  }
3291  }
3292  return rc;
3293 }
3294 
3295 /** Flush (some) dirty pages to the map, after clearing their dirty flag.
3296  * @param[in] txn the transaction that's being committed
3297  * @param[in] keep number of initial pages in dirty_list to keep dirty.
3298  * @return 0 on success, non-zero on failure.
3299  */
3300 static int
3301 mdb_page_flush(MDB_txn *txn, int keep)
3302 {
3303  MDB_env *env = txn->mt_env;
3304  MDB_ID2L dl = txn->mt_u.dirty_list;
3305  unsigned psize = env->me_psize, j;
3306  int i, pagecount = dl[0].mid, rc;
3307  size_t size = 0, pos = 0;
3308  pgno_t pgno = 0;
3309  MDB_page *dp = NULL;
3310 #ifdef _WIN32
3311  OVERLAPPED ov;
3312 #else
3313  struct iovec iov[MDB_COMMIT_PAGES];
3314  ssize_t wpos = 0, wsize = 0, wres;
3315  size_t next_pos = 1; /* impossible pos, so pos != next_pos */
3316  int n = 0;
3317 #endif
3318 
 /* i scans the dirty list; j tracks the compacted (kept) prefix. */
3319  j = i = keep;
3320 
3321  if (env->me_flags & MDB_WRITEMAP) {
3322  /* Clear dirty flags */
3323  while (++i <= pagecount) {
3324  dp = dl[i].mptr;
3325  /* Don't flush this page yet */
3326  if (dp->mp_flags & (P_LOOSE|P_KEEP)) {
3327  dp->mp_flags &= ~P_KEEP;
3328  dl[++j] = dl[i];
3329  continue;
3330  }
3331  dp->mp_flags &= ~P_DIRTY;
3332  }
3333  goto done;
3334  }
3335 
3336  /* Write the pages */
3337  for (;;) {
3338  if (++i <= pagecount) {
3339  dp = dl[i].mptr;
3340  /* Don't flush this page yet */
3341  if (dp->mp_flags & (P_LOOSE|P_KEEP)) {
3342  dp->mp_flags &= ~P_KEEP;
3343  dl[i].mid = 0;
3344  continue;
3345  }
3346  pgno = dl[i].mid;
3347  /* clear dirty flag */
3348  dp->mp_flags &= ~P_DIRTY;
3349  pos = pgno * psize;
3350  size = psize;
3351  if (IS_OVERFLOW(dp)) size *= dp->mp_pages;
3352  }
3353 #ifdef _WIN32
3354  else break;
3355 
3356  /* Windows actually supports scatter/gather I/O, but only on
3357  * unbuffered file handles. Since we're relying on the OS page
3358  * cache for all our data, that's self-defeating. So we just
3359  * write pages one at a time. We use the ov structure to set
3360  * the write offset, to at least save the overhead of a Seek
3361  * system call.
3362  */
3363  DPRINTF(("committing page %"Z"u", pgno));
3364  memset(&ov, 0, sizeof(ov));
3365  ov.Offset = pos & 0xffffffff;
 /* >>16>>16 instead of >>32: avoids UB when size_t is 32 bits wide. */
3366  ov.OffsetHigh = pos >> 16 >> 16;
3367  if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) {
3368  rc = ErrCode();
3369  DPRINTF(("WriteFile: %d", rc));
3370  return rc;
3371  }
3372 #else
3373  /* Write up to MDB_COMMIT_PAGES dirty pages at a time. */
 /* Flush the gathered iovecs when the run breaks: non-contiguous page,
  * iovec array full, or the batch would exceed MAX_WRITE. */
3374  if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) {
3375  if (n) {
3376 retry_write:
3377  /* Write previous page(s) */
3378 #ifdef MDB_USE_PWRITEV
3379  wres = pwritev(env->me_fd, iov, n, wpos);
3380 #else
3381  if (n == 1) {
3382  wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos);
3383  } else {
3384 retry_seek:
3385  if (lseek(env->me_fd, wpos, SEEK_SET) == -1) {
3386  rc = ErrCode();
3387  if (rc == EINTR)
3388  goto retry_seek;
3389  DPRINTF(("lseek: %s", strerror(rc)));
3390  return rc;
3391  }
3392  wres = writev(env->me_fd, iov, n);
3393  }
3394 #endif
3395  if (wres != wsize) {
3396  if (wres < 0) {
3397  rc = ErrCode();
3398  if (rc == EINTR)
3399  goto retry_write;
3400  DPRINTF(("Write error: %s", strerror(rc)));
3401  } else {
3402  rc = EIO; /* TODO: Use which error code? */
3403  DPUTS("short write, filesystem full?");
3404  }
3405  return rc;
3406  }
3407  n = 0;
3408  }
3409  if (i > pagecount)
3410  break;
3411  wpos = pos;
3412  wsize = 0;
3413  }
3414  DPRINTF(("committing page %"Z"u", pgno));
3415  next_pos = pos + size;
3416  iov[n].iov_len = size;
3417  iov[n].iov_base = (char *)dp;
3418  wsize += size;
3419  n++;
3420 #endif /* _WIN32 */
3421  }
3422 
3423  /* MIPS has cache coherency issues, this is a no-op everywhere else
3424  * Note: for any size >= on-chip cache size, entire on-chip cache is
3425  * flushed.
3426  */
3427  CACHEFLUSH(env->me_map, txn->mt_next_pgno * env->me_psize, DCACHE);
3428 
 /* Second pass: free written page copies, keep the skipped ones
  * (marked with mid==0 above) compacted at the front of the list. */
3429  for (i = keep; ++i <= pagecount; ) {
3430  dp = dl[i].mptr;
3431  /* This is a page we skipped above */
3432  if (!dl[i].mid) {
3433  dl[++j] = dl[i];
3434  dl[j].mid = dp->mp_pgno;
3435  continue;
3436  }
3437  mdb_dpage_free(env, dp);
3438  }
3439 
3440 done:
3441  i--;
3442  txn->mt_dirty_room += i - j;
3443  dl[0].mid = j;
3444  return MDB_SUCCESS;
3445 }
3446 
3447 int
 /* NOTE(review): signature line (mdb_txn_commit(MDB_txn *txn)) is missing
  * from this extraction. */
3449 {
3450  int rc;
3451  unsigned int i, end_mode;
3452  MDB_env *env;
3453 
3454  if (txn == NULL)
3455  return EINVAL;
3456 
3457  /* mdb_txn_end() mode for a commit which writes nothing */
 /* NOTE(review): the end_mode initialization line is missing from this
  * extraction — confirm against upstream. */
3459 
3460  if (txn->mt_child) {
3461  rc = mdb_txn_commit(txn->mt_child);
3462  if (rc)
3463  goto fail;
3464  }
3465 
3466  env = txn->mt_env;
3467 
3468  if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
3469  goto done;
3470  }
3471 
3472  if (txn->mt_flags & (MDB_TXN_FINISHED|MDB_TXN_ERROR)) {
3473  DPUTS("txn has failed/finished, can't commit");
3474  if (txn->mt_parent)
3475  txn->mt_parent->mt_flags |= MDB_TXN_ERROR;
3476  rc = MDB_BAD_TXN;
3477  goto fail;
3478  }
3479 
 /* Nested txn: merge all state into the parent instead of writing. */
3480  if (txn->mt_parent) {
3481  MDB_txn *parent = txn->mt_parent;
3482  MDB_page **lp;
3483  MDB_ID2L dst, src;
3484  MDB_IDL pspill;
3485  unsigned x, y, len, ps_len;
3486 
3487  /* Append our free list to parent's */
3488  rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs);
3489  if (rc)
3490  goto fail;
3491  mdb_midl_free(txn->mt_free_pgs);
3492  /* Failures after this must either undo the changes
3493  * to the parent or set MDB_TXN_ERROR in the parent.
3494  */
3495 
3496  parent->mt_next_pgno = txn->mt_next_pgno;
3497  parent->mt_flags = txn->mt_flags;
3498 
3499  /* Merge our cursors into parent's and close them */
3500  mdb_cursors_close(txn, 1);
3501 
3502  /* Update parent's DB table. */
3503  memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db));
3504  parent->mt_numdbs = txn->mt_numdbs;
3505  parent->mt_dbflags[FREE_DBI] = txn->mt_dbflags[FREE_DBI];
3506  parent->mt_dbflags[MAIN_DBI] = txn->mt_dbflags[MAIN_DBI];
3507  for (i=CORE_DBS; i<txn->mt_numdbs; i++) {
3508  /* preserve parent's DB_NEW status */
3509  x = parent->mt_dbflags[i] & DB_NEW;
3510  parent->mt_dbflags[i] = txn->mt_dbflags[i] | x;
3511  }
3512 
3513  dst = parent->mt_u.dirty_list;
3514  src = txn->mt_u.dirty_list;
3515  /* Remove anything in our dirty list from parent's spill list */
3516  if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) {
3517  x = y = ps_len;
 /* Sentinel so the descending scan below always terminates. */
3518  pspill[0] = (pgno_t)-1;
3519  /* Mark our dirty pages as deleted in parent spill list */
3520  for (i=0, len=src[0].mid; ++i <= len; ) {
3521  MDB_ID pn = src[i].mid << 1;
3522  while (pn > pspill[x])
3523  x--;
3524  if (pn == pspill[x]) {
3525  pspill[x] = 1;
3526  y = --x;
3527  }
3528  }
3529  /* Squash deleted pagenums if we deleted any */
3530  for (x=y; ++x <= ps_len; )
3531  if (!(pspill[x] & 1))
3532  pspill[++y] = pspill[x];
3533  pspill[0] = y;
3534  }
3535 
3536  /* Remove anything in our spill list from parent's dirty list */
3537  if (txn->mt_spill_pgs && txn->mt_spill_pgs[0]) {
3538  for (i=1; i<=txn->mt_spill_pgs[0]; i++) {
3539  MDB_ID pn = txn->mt_spill_pgs[i];
3540  if (pn & 1)
3541  continue; /* deleted spillpg */
3542  pn >>= 1;
3543  y = mdb_mid2l_search(dst, pn);
3544  if (y <= dst[0].mid && dst[y].mid == pn) {
3545  free(dst[y].mptr);
3546  while (y < dst[0].mid) {
3547  dst[y] = dst[y+1];
3548  y++;
3549  }
3550  dst[0].mid--;
3551  }
3552  }
3553  }
3554 
3555  /* Find len = length of merging our dirty list with parent's */
3556  x = dst[0].mid;
3557  dst[0].mid = 0; /* simplify loops */
3558  if (parent->mt_parent) {
3559  len = x + src[0].mid;
3560  y = mdb_mid2l_search(src, dst[x].mid + 1) - 1;
3561  for (i = x; y && i; y--) {
3562  pgno_t yp = src[y].mid;
3563  while (yp < dst[i].mid)
3564  i--;
3565  if (yp == dst[i].mid) {
3566  i--;
3567  len--;
3568  }
3569  }
3570  } else { /* Simplify the above for single-ancestor case */
3571  len = MDB_IDL_UM_MAX - txn->mt_dirty_room;
3572  }
3573  /* Merge our dirty list with parent's */
 /* In-place backward merge; a page dirty in both lists keeps the
  * child's copy and the parent's copy is freed. */
3574  y = src[0].mid;
3575  for (i = len; y; dst[i--] = src[y--]) {
3576  pgno_t yp = src[y].mid;
3577  while (yp < dst[x].mid)
3578  dst[i--] = dst[x--];
3579  if (yp == dst[x].mid)
3580  free(dst[x--].mptr);
3581  }
3582  mdb_tassert(txn, i == x);
3583  dst[0].mid = len;
3584  free(txn->mt_u.dirty_list);
3585  parent->mt_dirty_room = txn->mt_dirty_room;
3586  if (txn->mt_spill_pgs) {
3587  if (parent->mt_spill_pgs) {
3588  /* TODO: Prevent failure here, so parent does not fail */
3589  rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs);
3590  if (rc)
3591  parent->mt_flags |= MDB_TXN_ERROR;
 /* NOTE(review): a line is missing here — upstream frees
  * txn->mt_spill_pgs at this point. */
3593  mdb_midl_sort(parent->mt_spill_pgs);
3594  } else {
3595  parent->mt_spill_pgs = txn->mt_spill_pgs;
3596  }
3597  }
3598 
3599  /* Append our loose page list to parent's */
3600  for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(*lp))
3601  ;
3602  *lp = txn->mt_loose_pgs;
3603  parent->mt_loose_count += txn->mt_loose_count;
3604 
3605  parent->mt_child = NULL;
3606  mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead);
3607  free(txn);
3608  return rc;
3609  }
3610 
3611  if (txn != env->me_txn) {
3612  DPUTS("attempt to commit unknown transaction");
3613  rc = EINVAL;
3614  goto fail;
3615  }
3616 
3617  mdb_cursors_close(txn, 0);
3618 
3619  if (!txn->mt_u.dirty_list[0].mid &&
 /* NOTE(review): the second half of this condition (checking the txn's
  * dirty/spill flags) is missing from this extraction. */
3621  goto done;
3622 
3623  DPRINTF(("committing txn %"Z"u %p on mdbenv %p, root page %"Z"u",
3624  txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root));
3625 
3626  /* Update DB root pointers */
3627  if (txn->mt_numdbs > CORE_DBS) {
3628  MDB_cursor mc;
3629  MDB_dbi i;
3630  MDB_val data;
3631  data.mv_size = sizeof(MDB_db);
3632 
3633  mdb_cursor_init(&mc, txn, MAIN_DBI, NULL);
3634  for (i = CORE_DBS; i < txn->mt_numdbs; i++) {
3635  if (txn->mt_dbflags[i] & DB_DIRTY) {
3636  if (TXN_DBI_CHANGED(txn, i)) {
3637  rc = MDB_BAD_DBI;
3638  goto fail;
3639  }
3640  data.mv_data = &txn->mt_dbs[i];
3641  rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data,
3642  F_SUBDATA);
3643  if (rc)
3644  goto fail;
3645  }
3646  }
3647  }
3648 
3649  rc = mdb_freelist_save(txn);
3650  if (rc)
3651  goto fail;
3652 
3653  mdb_midl_free(env->me_pghead);
3654  env->me_pghead = NULL;
 /* NOTE(review): a source line is missing here in this extraction. */
3656 
3657 #if (MDB_DEBUG) > 2
3658  mdb_audit(txn);
3659 #endif
3660 
 /* Data pages, fsync, then meta page — in that order for durability. */
3661  if ((rc = mdb_page_flush(txn, 0)) ||
3662  (rc = mdb_env_sync(env, 0)) ||
3663  (rc = mdb_env_write_meta(txn)))
3664  goto fail;
3665  end_mode = MDB_END_COMMITTED|MDB_END_UPDATE;
3666 
3667 done:
3668  mdb_txn_end(txn, end_mode);
3669  return MDB_SUCCESS;
3670 
3671 fail:
3672  mdb_txn_abort(txn);
3673  return rc;
3674 }
3675 
3676 /** Read the environment parameters of a DB environment before
3677  * mapping it into memory.
3678  * @param[in] env the environment handle
3679  * @param[out] meta address of where to store the meta information
3680  * @return 0 on success, non-zero on failure.
3681  */
3682 static int ESECT
 /* NOTE(review): signature line (mdb_env_read_header(MDB_env *env,
  * MDB_meta *meta)) is missing from this extraction. */
3684 {
3685  MDB_metabuf pbuf;
3686  MDB_page *p;
3687  MDB_meta *m;
3688  int i, rc, off;
3689  enum { Size = sizeof(pbuf) };
3690 
3691  /* We don't know the page size yet, so use a minimum value.
3692  * Read both meta pages so we can use the latest one.
3693  */
3694 
 /* First iteration reads at offset 0; the second uses the psize just
  * learned from meta page 0 to locate meta page 1. */
3695  for (i=off=0; i<NUM_METAS; i++, off += meta->mm_psize) {
3696 #ifdef _WIN32
3697  DWORD len;
3698  OVERLAPPED ov;
3699  memset(&ov, 0, sizeof(ov));
3700  ov.Offset = off;
3701  rc = ReadFile(env->me_fd, &pbuf, Size, &len, &ov) ? (int)len : -1;
3702  if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF)
3703  rc = 0;
3704 #else
3705  rc = pread(env->me_fd, &pbuf, Size, off);
3706 #endif
3707  if (rc != Size) {
 /* Zero bytes at offset 0 means a brand-new (empty) file. */
3708  if (rc == 0 && off == 0)
3709  return ENOENT;
3710  rc = rc < 0 ? (int) ErrCode() : MDB_INVALID;
3711  DPRINTF(("read: %s", mdb_strerror(rc)));
3712  return rc;
3713  }
3714 
3715  p = (MDB_page *)&pbuf;
3716 
3717  if (!F_ISSET(p->mp_flags, P_META)) {
3718  DPRINTF(("page %"Z"u not a meta page", p->mp_pgno));
3719  return MDB_INVALID;
3720  }
3721 
3722  m = METADATA(p);
3723  if (m->mm_magic != MDB_MAGIC) {
3724  DPUTS("meta has invalid magic");
3725  return MDB_INVALID;
3726  }
3727 
3728  if (m->mm_version != MDB_DATA_VERSION) {
3729  DPRINTF(("database is version %u, expected version %u",
 /* NOTE(review): the DPRINTF argument line is missing from this
  * extraction. */
3731  return MDB_VERSION_MISMATCH;
3732  }
3733 
 /* Keep whichever meta page carries the newer txnid. */
3734  if (off == 0 || m->mm_txnid > meta->mm_txnid)
3735  *meta = *m;
3736  }
3737  return 0;
3738 }
3739 
3740 /** Fill in most of the zeroed #MDB_meta for an empty database environment */
3741 static void ESECT
 /* NOTE(review): signature line (mdb_env_init_meta0(MDB_env *env,
  * MDB_meta *meta)) is missing from this extraction. */
3743 {
3744  meta->mm_magic = MDB_MAGIC;
3745  meta->mm_version = MDB_DATA_VERSION;
3746  meta->mm_mapsize = env->me_mapsize;
3747  meta->mm_psize = env->me_psize;
 /* The meta pages themselves are the only pages used so far. */
3748  meta->mm_last_pg = NUM_METAS-1;
3749  meta->mm_flags = env->me_flags & 0xffff;
3750  meta->mm_flags |= MDB_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */
 /* Both core DBs start out empty. */
3751  meta->mm_dbs[FREE_DBI].md_root = P_INVALID;
3752  meta->mm_dbs[MAIN_DBI].md_root = P_INVALID;
3753 }
3754 
3755 /** Write the environment parameters of a freshly created DB environment.
3756  * @param[in] env the environment handle
3757  * @param[in] meta the #MDB_meta to write
3758  * @return 0 on success, non-zero on failure.
3759  */
3760 static int ESECT
 /* NOTE(review): signature line (mdb_env_init_meta(MDB_env *env,
  * MDB_meta *meta)) is missing from this extraction. */
3762 {
3763  MDB_page *p, *q;
3764  int rc;
3765  unsigned int psize;
3766 #ifdef _WIN32
3767  DWORD len;
3768  OVERLAPPED ov;
3769  memset(&ov, 0, sizeof(ov));
3770 #define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \
3771  ov.Offset = pos; \
3772  rc = WriteFile(fd, ptr, size, &len, &ov); } while(0)
3773 #else
3774  int len;
 /* POSIX variant retries the pwrite on EINTR; rc is 1 on success. */
3775 #define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \
3776  len = pwrite(fd, ptr, size, pos); \
3777  if (len == -1 && ErrCode() == EINTR) continue; \
3778  rc = (len >= 0); break; } while(1)
3779 #endif
3780 
3781  DPUTS("writing new meta page");
3782 
3783  psize = env->me_psize;
3784 
 /* Build both meta pages in one zeroed buffer, then write them together. */
3785  p = calloc(NUM_METAS, psize);
3786  if (!p)
3787  return ENOMEM;
3788 
3789  p->mp_pgno = 0;
3790  p->mp_flags = P_META;
3791  *(MDB_meta *)METADATA(p) = *meta;
3792 
3793  q = (MDB_page *)((char *)p + psize);
3794  q->mp_pgno = 1;
3795  q->mp_flags = P_META;
3796  *(MDB_meta *)METADATA(q) = *meta;
3797 
3798  DO_PWRITE(rc, env->me_fd, p, psize * NUM_METAS, len, 0);
3799  if (!rc)
3800  rc = ErrCode();
 /* A short write that is not an error means the disk filled up. */
3801  else if ((unsigned) len == psize * NUM_METAS)
3802  rc = MDB_SUCCESS;
3803  else
3804  rc = ENOSPC;
3805  free(p);
3806  return rc;
3807 }
3808 
3809 /** Update the environment info to commit a transaction.
3810  * @param[in] txn the transaction that's being committed
3811  * @return 0 on success, non-zero on failure.
3812  */
/* NOTE(review): the extraction dropped the signature line here; upstream
 * this is mdb_env_write_meta(MDB_txn *txn) — confirm before editing.
 */
static int
{
	MDB_env *env;
	MDB_meta meta, metab, *mp;
	unsigned flags;
	size_t mapsize;
	off_t off;
	int rc, len, toggle;
	char *ptr;
	HANDLE mfd;
#ifdef _WIN32
	OVERLAPPED ov;
#else
	int r2;
#endif

	/* Meta pages alternate by txnid parity; pick this txn's slot */
	toggle = txn->mt_txnid & 1;
	DPRINTF(("writing meta page %d for root page %"Z"u",
		toggle, txn->mt_dbs[MAIN_DBI].md_root));

	env = txn->mt_env;
	flags = env->me_flags;
	mp = env->me_metas[toggle];
	mapsize = env->me_metas[toggle ^ 1]->mm_mapsize;
	/* Persist any increases of mapsize config */
	if (mapsize < env->me_mapsize)
		mapsize = env->me_mapsize;

	if (flags & MDB_WRITEMAP) {
		/* Writable map: update the meta page in place, then msync
		 * it unless the user disabled (meta)syncs.
		 */
		mp->mm_mapsize = mapsize;
		mp->mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI];
		mp->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
		mp->mm_last_pg = txn->mt_next_pgno - 1;
#if (__GNUC__ * 100 + __GNUC_MINOR__ >= 404) && /* TODO: portability */	\
	!(defined(__i386__) || defined(__x86_64__))
		/* LY: issue a memory barrier, if not x86. ITS#7969 */
		__sync_synchronize();
#endif
		/* txnid is stored after the other fields (see barrier above) */
		mp->mm_txnid = txn->mt_txnid;
		if (!(flags & (MDB_NOMETASYNC|MDB_NOSYNC))) {
			unsigned meta_size = env->me_psize;
			rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC;
			ptr = (char *)mp - PAGEHDRSZ;
#ifndef _WIN32	/* POSIX msync() requires ptr = start of OS page */
			r2 = (ptr - env->me_map) & (env->me_os_psize - 1);
			ptr -= r2;
			meta_size += r2;
#endif
			if (MDB_MSYNC(ptr, meta_size, rc)) {
				rc = ErrCode();
				goto fail;
			}
		}
		goto done;
	}
	/* Read-only map: save the old txnid/last_pg so they can be
	 * written back if the pwrite below fails (see failure path).
	 */
	metab.mm_txnid = mp->mm_txnid;
	metab.mm_last_pg = mp->mm_last_pg;

	meta.mm_mapsize = mapsize;
	meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI];
	meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
	meta.mm_last_pg = txn->mt_next_pgno - 1;
	meta.mm_txnid = txn->mt_txnid;

	/* Write only the tail of the struct, from mm_mapsize to the end;
	 * the fields before mm_mapsize are not modified here.
	 */
	off = offsetof(MDB_meta, mm_mapsize);
	ptr = (char *)&meta + off;
	len = sizeof(MDB_meta) - off;
	off += (char *)mp - env->me_map;

	/* Write to the SYNC fd unless MDB_NOSYNC/MDB_NOMETASYNC.
	 * (me_mfd goes to the same file as me_fd, but writing to it
	 * also syncs to disk.  Avoids a separate fdatasync() call.)
	 */
	mfd = (flags & (MDB_NOSYNC|MDB_NOMETASYNC)) ? env->me_fd : env->me_mfd;
#ifdef _WIN32
	{
		memset(&ov, 0, sizeof(ov));
		ov.Offset = off;
		if (!WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov))
			rc = -1;
	}
#else
retry_write:
	rc = pwrite(mfd, ptr, len, off);
#endif
	if (rc != len) {
		rc = rc < 0 ? ErrCode() : EIO;
#ifndef _WIN32
		if (rc == EINTR)
			goto retry_write;
#endif
		DPUTS("write failed, disk error?");
		/* On a failure, the pagecache still contains the new data.
		 * Write some old data back, to prevent it from being used.
		 * Use the non-SYNC fd; we know it will fail anyway.
		 */
		meta.mm_last_pg = metab.mm_last_pg;
		meta.mm_txnid = metab.mm_txnid;
#ifdef _WIN32
		memset(&ov, 0, sizeof(ov));
		ov.Offset = off;
		WriteFile(env->me_fd, ptr, len, NULL, &ov);
#else
		r2 = pwrite(env->me_fd, ptr, len, off);
		(void)r2;	/* Silence warnings. We don't care about pwrite's return value */
#endif
fail:
		/* The env can no longer be trusted after a failed meta write */
		env->me_flags |= MDB_FATAL_ERROR;
		return rc;
	}
	/* MIPS has cache coherency issues, this is a no-op everywhere else */
	CACHEFLUSH(env->me_map + off, len, DCACHE);
done:
	/* Memory ordering issues are irrelevant; since the entire writer
	 * is wrapped by wmutex, all of these changes will become visible
	 * after the wmutex is unlocked. Since the DB is multi-version,
	 * readers will get consistent data regardless of how fresh or
	 * how stale their view of these values is.
	 */
	if (env->me_txns)
		env->me_txns->mti_txnid = txn->mt_txnid;

	return MDB_SUCCESS;
}
3938 
3939 /** Check both meta pages to see which one is newer.
3940  * @param[in] env the environment handle
3941  * @return newest #MDB_meta.
3942  */
/* NOTE(review): signature line elided by extraction; upstream this is
 * mdb_env_pick_meta(const MDB_env *env).
 */
static MDB_meta *
{
	/* The comparison yields 0 or 1, indexing straight to the meta
	 * page with the larger (newer) transaction id.
	 */
	MDB_meta *const *metas = env->me_metas;
	return metas[ metas[0]->mm_txnid < metas[1]->mm_txnid ];
}
3949 
/* Allocate and initialize an MDB_env handle (public mdb_env_create()).
 * NOTE(review): the extraction dropped the signature line
 * (mdb_env_create(MDB_env **env)) and several initialization lines in
 * the body — upstream also sets me_maxreaders, the me_fd/me_lfd/me_mfd
 * handles to INVALID_HANDLE_VALUE, and calls GET_PAGESIZE().  Restore
 * from the pristine source; do not edit this scraped copy blindly.
 */
int ESECT
{
	MDB_env *e;

	/* calloc() zero-fills, so every field not set below starts at 0/NULL */
	e = calloc(1, sizeof(MDB_env));
	if (!e)
		return ENOMEM;

	e->me_maxdbs = e->me_numdbs = CORE_DBS;
#ifdef MDB_USE_POSIX_SEM
	/* Semaphore handles must start as SEM_FAILED, not NULL */
	e->me_rmutex = SEM_FAILED;
	e->me_wmutex = SEM_FAILED;
#endif
	e->me_pid = getpid();
	VGMEMP_CREATE(e,0,0);
	*env = e;
	return MDB_SUCCESS;
}
3974 
/* Map the datafile into memory, honoring MDB_RDONLY/MDB_WRITEMAP, and
 * set up env->me_map and the two meta-page pointers.
 * NOTE(review): signature line elided by extraction; upstream this is
 * mdb_env_map(MDB_env *env, void *addr) where addr is the address hint
 * (used by MDB_FIXEDMAP).
 */
static int ESECT
{
	MDB_page *p;
	unsigned int flags = env->me_flags;
#ifdef _WIN32
	int rc;
	HANDLE mh;
	LONG sizelo, sizehi;
	size_t msize;

	if (flags & MDB_RDONLY) {
		/* Don't set explicit map size, use whatever exists */
		msize = 0;
		sizelo = 0;
		sizehi = 0;
	} else {
		msize = env->me_mapsize;
		sizelo = msize & 0xffffffff;
		sizehi = msize >> 16 >> 16; /* only needed on Win64 */

		/* Windows won't create mappings for zero length files,
		 * and won't map more than the file size.
		 * Just set the maxsize right now.
		 */
		if (!(flags & MDB_WRITEMAP) && (SetFilePointer(env->me_fd, sizelo, &sizehi, 0) != (DWORD)sizelo
			|| !SetEndOfFile(env->me_fd)
			|| SetFilePointer(env->me_fd, 0, NULL, 0) != 0))
			return ErrCode();
	}

	mh = CreateFileMapping(env->me_fd, NULL, flags & MDB_WRITEMAP ?
		PAGE_READWRITE : PAGE_READONLY,
		sizehi, sizelo, NULL);
	if (!mh)
		return ErrCode();
	env->me_map = MapViewOfFileEx(mh, flags & MDB_WRITEMAP ?
		FILE_MAP_WRITE : FILE_MAP_READ,
		0, 0, msize, addr);
	rc = env->me_map ? 0 : ErrCode();
	/* The view keeps the mapping alive; the handle can go now */
	CloseHandle(mh);
	if (rc)
		return rc;
#else
	int prot = PROT_READ;
	if (flags & MDB_WRITEMAP) {
		prot |= PROT_WRITE;
		/* Grow the file to the full map size before mapping writable */
		if (ftruncate(env->me_fd, env->me_mapsize) < 0)
			return ErrCode();
	}
	env->me_map = mmap(addr, env->me_mapsize, prot, MAP_SHARED,
		env->me_fd, 0);
	if (env->me_map == MAP_FAILED) {
		env->me_map = NULL;
		return ErrCode();
	}

	if (flags & MDB_NORDAHEAD) {
		/* Turn off readahead. It's harmful when the DB is larger than RAM. */
#ifdef MADV_RANDOM
		madvise(env->me_map, env->me_mapsize, MADV_RANDOM);
#else
#ifdef POSIX_MADV_RANDOM
		posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM);
#endif /* POSIX_MADV_RANDOM */
#endif /* MADV_RANDOM */
	}
#endif /* _WIN32 */

	/* Can happen because the address argument to mmap() is just a
	 * hint.  mmap() can pick another, e.g. if the range is in use.
	 * The MAP_FIXED flag would prevent that, but then mmap could
	 * instead unmap existing pages to make room for the new map.
	 */
	if (addr && env->me_map != addr)
		return EBUSY;	/* TODO: Make a new MDB_* error code? */

	/* Meta pages 0 and 1 sit at the start of the map */
	p = (MDB_page *)env->me_map;
	env->me_metas[0] = METADATA(p);
	env->me_metas[1] = (MDB_meta *)((char *)env->me_metas[0] + env->me_psize);

	return MDB_SUCCESS;
}
4058 
/* Public mdb_env_set_mapsize(): set or grow the memory map size.
 * NOTE(review): signature line elided by extraction; upstream this is
 * mdb_env_set_mapsize(MDB_env *env, size_t size).  size == 0 means
 * "adopt the size recorded in the current meta page".
 */
int ESECT
{
	/* If env is already open, caller is responsible for making
	 * sure there are no active txns.
	 */
	if (env->me_map) {
		int rc;
		MDB_meta *meta;
		void *old;
		/* Cannot remap under an active write transaction */
		if (env->me_txn)
			return EINVAL;
		meta = mdb_env_pick_meta(env);
		if (!size)
			size = meta->mm_mapsize;
		{
			/* Silently round up to minimum if the size is too small */
			size_t minsize = (meta->mm_last_pg + 1) * env->me_psize;
			if (size < minsize)
				size = minsize;
		}
		/* Drop the old mapping and rebuild it at the new size.
		 * With MDB_FIXEDMAP the old address is passed back as the hint.
		 */
		munmap(env->me_map, env->me_mapsize);
		env->me_mapsize = size;
		old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL;
		rc = mdb_env_map(env, old);
		if (rc)
			return rc;
	}
	env->me_mapsize = size;
	if (env->me_psize)
		env->me_maxpg = env->me_mapsize / env->me_psize;
	return MDB_SUCCESS;
}
4092 
/* Public mdb_env_set_maxdbs(): reserve slots for named databases.
 * NOTE(review): signature line elided by extraction; upstream this is
 * mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs).
 */
int ESECT
{
	/* Only valid before the environment is mapped/opened */
	if (env->me_map)
		return EINVAL;
	/* CORE_DBS accounts for the internal free-list and main DBs */
	env->me_maxdbs = dbs + CORE_DBS;
	return MDB_SUCCESS;
}
4101 
4102 int ESECT
4103 mdb_env_set_maxreaders(MDB_env *env, unsigned int readers)
4104 {
4105  if (env->me_map || readers < 1)
4106  return EINVAL;
4107  env->me_maxreaders = readers;
4108  return MDB_SUCCESS;
4109 }
4110 
4111 int ESECT
4112 mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers)
4113 {
4114  if (!env || !readers)
4115  return EINVAL;
4116  *readers = env->me_maxreaders;
4117  return MDB_SUCCESS;
4118 }
4119 
/** Get the size of an open file in bytes.
 * @param[in] fd File handle (Windows HANDLE) or descriptor (Unix).
 * @param[out] size Receives the file size on success.
 * @return 0 on success, a system error code on failure.
 */
static int ESECT
mdb_fsize(HANDLE fd, size_t *size)
{
#ifdef _WIN32
	LARGE_INTEGER fsize;

	if (!GetFileSizeEx(fd, &fsize))
		return ErrCode();

	*size = fsize.QuadPart;
#else
	struct stat st;

	if (fstat(fd, &st))
		return ErrCode();

	*size = st.st_size;
#endif
	return MDB_SUCCESS;
}
4140 
4141 
4142 #ifdef _WIN32
4143 typedef wchar_t mdb_nchar_t;
4144 # define MDB_NAME(str) L##str
4145 # define mdb_name_cpy wcscpy
4146 #else
4147 /** Character type for file names: char on Unix, wchar_t on Windows */
4148 typedef char mdb_nchar_t;
4149 # define MDB_NAME(str) str /**< #mdb_nchar_t[] string literal */
4150 # define mdb_name_cpy strcpy /**< Copy name (#mdb_nchar_t string) */
4151 #endif
4152 
4153 /** Filename - string of #mdb_nchar_t[] */
4154 typedef struct MDB_name {
4155  int mn_len; /**< Length */
4156  int mn_alloced; /**< True if #mn_val was malloced */
4157  mdb_nchar_t *mn_val; /**< Contents */
4159 
4160 /** Filename suffixes [datafile,lockfile][without,with MDB_NOSUBDIR] */
4161 static const mdb_nchar_t *const mdb_suffixes[2][2] = {
4162  { MDB_NAME("/data.mdb"), MDB_NAME("") },
4163  { MDB_NAME("/lock.mdb"), MDB_NAME("-lock") }
4164 };
4165 
4166 #define MDB_SUFFLEN 9 /**< Max string length in #mdb_suffixes[] */
4167 
4168 /** Set up filename + scratch area for filename suffix, for opening files.
4169  * It should be freed with #mdb_fname_destroy().
4170  * On Windows, paths are converted from char *UTF-8 to wchar_t *UTF-16.
4171  *
4172  * @param[in] path Pathname for #mdb_env_open().
4173  * @param[in] envflags Whether a subdir and/or lockfile will be used.
4174  * @param[out] fname Resulting filename, with room for a suffix if necessary.
4175  */
4176 static int ESECT
4177 mdb_fname_init(const char *path, unsigned envflags, MDB_name *fname)
4178 {
4179  int no_suffix = F_ISSET(envflags, MDB_NOSUBDIR|MDB_NOLOCK);
4180  fname->mn_alloced = 0;
4181 #ifdef _WIN32
4182  return utf8_to_utf16(path, fname, no_suffix ? 0 : MDB_SUFFLEN);
4183 #else
4184  fname->mn_len = strlen(path);
4185  if (no_suffix)
4186  fname->mn_val = (char *) path;
4187  else if ((fname->mn_val = malloc(fname->mn_len + MDB_SUFFLEN+1)) != NULL) {
4188  fname->mn_alloced = 1;
4189  strcpy(fname->mn_val, path);
4190  }
4191  else
4192  return ENOMEM;
4193  return MDB_SUCCESS;
4194 #endif
4195 }
4196 
4197 /** Destroy \b fname from #mdb_fname_init() */
4198 #define mdb_fname_destroy(fname) \
4199  do { if ((fname).mn_alloced) free((fname).mn_val); } while (0)
4200 
4201 #ifdef O_CLOEXEC /* POSIX.1-2008: Set FD_CLOEXEC atomically at open() */
4202 # define MDB_CLOEXEC O_CLOEXEC
4203 #else
4204 # define MDB_CLOEXEC 0
4205 #endif
4206 
4207 /** File type, access mode etc. for #mdb_fopen() */
4209 #ifdef _WIN32
4211 #else
4212  /* A comment in mdb_fopen() explains some O_* flag choices. */
4213  MDB_O_RDONLY= O_RDONLY, /**< for RDONLY me_fd */
4214  MDB_O_RDWR = O_RDWR |O_CREAT, /**< for me_fd */
4215  MDB_O_META = O_WRONLY|MDB_DSYNC |MDB_CLOEXEC, /**< for me_mfd */
4216  MDB_O_COPY = O_WRONLY|O_CREAT|O_EXCL|MDB_CLOEXEC, /**< for #mdb_env_copy() */
4217  /** Bitmask for open() flags in enum #mdb_fopen_type. The other bits
4218  * distinguish otherwise-equal MDB_O_* constants from each other.
4219  */
4221  MDB_O_LOCKS = MDB_O_RDWR|MDB_CLOEXEC | ((MDB_O_MASK+1) & ~MDB_O_MASK) /**< for me_lfd */
4222 #endif
4223 };
4224 
4225 /** Open an LMDB file.
4226  * @param[in] env The LMDB environment.
 * @param[in,out] fname Path from #mdb_fname_init(). A suffix is
4228  * appended if necessary to create the filename, without changing mn_len.
4229  * @param[in] which Determines file type, access mode, etc.
4230  * @param[in] mode The Unix permissions for the file, if we create it.
4231  * @param[out] res Resulting file handle.
4232  * @return 0 on success, non-zero on failure.
4233  */
/* NOTE(review): the extraction dropped the first signature line; upstream
 * it reads: mdb_fopen(const MDB_env *env, MDB_name *fname, — confirm
 * against the pristine source before editing.
 */
static int ESECT
	enum mdb_fopen_type which, mdb_mode_t mode,
	HANDLE *res)
{
	int rc = MDB_SUCCESS;
	HANDLE fd;
#ifdef _WIN32
	DWORD acc, share, disp, attrs;
#else
	int flags;
#endif

	/* Append the proper suffix (data/lock, subdir or not) into the
	 * scratch area that mdb_fname_init() reserved past mn_len.
	 */
	if (fname->mn_alloced)		/* modifiable copy */
		mdb_name_cpy(fname->mn_val + fname->mn_len,
			mdb_suffixes[which==MDB_O_LOCKS][F_ISSET(env->me_flags, MDB_NOSUBDIR)]);

	/* The directory must already exist.  Usually the file need not.
	 * MDB_O_META requires the file because we already created it using
	 * MDB_O_RDWR.  MDB_O_COPY must not overwrite an existing file.
	 *
	 * With MDB_O_COPY we do not want the OS to cache the writes, since
	 * the source data is already in the OS cache.
	 *
	 * The lockfile needs FD_CLOEXEC (close file descriptor on exec*())
	 * to avoid the flock() issues noted under Caveats in lmdb.h.
	 * Also set it for other filehandles which the user cannot get at
	 * and close himself, which he may need after fork().  I.e. all but
	 * me_fd, which programs do use via mdb_env_get_fd().
	 */

#ifdef _WIN32
	/* Defaults cover MDB_O_RDWR; the switch narrows them per type */
	acc = GENERIC_READ|GENERIC_WRITE;
	share = FILE_SHARE_READ|FILE_SHARE_WRITE;
	disp = OPEN_ALWAYS;
	attrs = FILE_ATTRIBUTE_NORMAL;
	switch (which) {
	case MDB_O_RDONLY:			/* read-only datafile */
		acc = GENERIC_READ;
		disp = OPEN_EXISTING;
		break;
	case MDB_O_META:			/* for writing metapages */
		acc = GENERIC_WRITE;
		disp = OPEN_EXISTING;
		attrs = FILE_ATTRIBUTE_NORMAL|FILE_FLAG_WRITE_THROUGH;
		break;
	case MDB_O_COPY:			/* mdb_env_copy() & co */
		acc = GENERIC_WRITE;
		share = 0;
		disp = CREATE_NEW;
		attrs = FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH;
		break;
	default: break;	/* silence gcc -Wswitch (not all enum values handled) */
	}
	fd = CreateFileW(fname->mn_val, acc, share, NULL, disp, attrs, NULL);
#else
	/* The low bits of the enum value are literal open() flags */
	fd = open(fname->mn_val, which & MDB_O_MASK, mode);
#endif

	if (fd == INVALID_HANDLE_VALUE)
		rc = ErrCode();
#ifndef _WIN32
	else {
		if (which != MDB_O_RDONLY && which != MDB_O_RDWR) {
			/* Set CLOEXEC if we could not pass it to open() */
			if (!MDB_CLOEXEC && (flags = fcntl(fd, F_GETFD)) != -1)
				(void) fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
		}
		if (which == MDB_O_COPY && env->me_psize >= env->me_os_psize) {
			/* This may require buffer alignment.  There is no portable
			 * way to ask how much, so we require OS pagesize alignment.
			 */
# ifdef F_NOCACHE	/* __APPLE__ */
			(void) fcntl(fd, F_NOCACHE, 1);
# elif defined O_DIRECT
			/* open(...O_DIRECT...) would break on filesystems without
			 * O_DIRECT support (ITS#7682). Try to set it here instead.
			 */
			if ((flags = fcntl(fd, F_GETFL)) != -1)
				(void) fcntl(fd, F_SETFL, flags | O_DIRECT);
# endif
		}
	}
#endif	/* !_WIN32 */

	*res = fd;
	return rc;
}
4322 
4323 
4324 #ifdef BROKEN_FDATASYNC
4325 #include <sys/utsname.h>
4326 #include <sys/vfs.h>
4327 #endif
4328 
4329 /** Further setup required for opening an LMDB environment
4330  */
/* NOTE(review): the extraction dropped the signature line; depending on
 * the LMDB release this is mdb_env_open2(MDB_env *env) or
 * mdb_env_open2(MDB_env *env, int prev) — the body below never uses a
 * second parameter, but verify against the pristine source.
 */
static int ESECT
{
	unsigned int flags = env->me_flags;
	int i, newenv = 0, rc;
	MDB_meta meta;

#ifdef _WIN32
	/* See if we should use QueryLimited */
	rc = GetVersion();
	if ((rc & 0xff) > 5)
		env->me_pidquery = MDB_PROCESS_QUERY_LIMITED_INFORMATION;
	else
		env->me_pidquery = PROCESS_QUERY_INFORMATION;
#endif /* _WIN32 */

#ifdef BROKEN_FDATASYNC
	/* ext3/ext4 fdatasync is broken on some older Linux kernels.
	 * https://lkml.org/lkml/2012/9/3/83
	 * Kernels after 3.6-rc6 are known good.
	 * https://lkml.org/lkml/2012/9/10/556
	 * See if the DB is on ext3/ext4, then check for new enough kernel
	 * Kernels 2.6.32.60, 2.6.34.15, 3.2.30, and 3.5.4 are also known
	 * to be patched.
	 */
	{
		struct statfs st;
		fstatfs(env->me_fd, &st);
		/* 0xEF53 is the ext2/3/4 superblock magic; the "loop" runs
		 * once and uses break as a structured goto.
		 */
		while (st.f_type == 0xEF53) {
			struct utsname uts;
			int i;
			uname(&uts);
			if (uts.release[0] < '3') {
				if (!strncmp(uts.release, "2.6.32.", 7)) {
					i = atoi(uts.release+7);
					if (i >= 60)
						break;	/* 2.6.32.60 and newer is OK */
				} else if (!strncmp(uts.release, "2.6.34.", 7)) {
					i = atoi(uts.release+7);
					if (i >= 15)
						break;	/* 2.6.34.15 and newer is OK */
				}
			} else if (uts.release[0] == '3') {
				i = atoi(uts.release+2);
				if (i > 5)
					break;	/* 3.6 and newer is OK */
				if (i == 5) {
					i = atoi(uts.release+4);
					if (i >= 4)
						break;	/* 3.5.4 and newer is OK */
				} else if (i == 2) {
					i = atoi(uts.release+4);
					if (i >= 30)
						break;	/* 3.2.30 and newer is OK */
				}
			} else {	/* 4.x and newer is OK */
				break;
			}
			/* Known-bad kernel: fall back to full fsync() */
			env->me_flags |= MDB_FSYNCONLY;
			break;
		}
	}
#endif

	if ((i = mdb_env_read_header(env, &meta)) != 0) {
		if (i != ENOENT)
			return i;
		/* No header: this is a brand-new environment */
		DPUTS("new mdbenv");
		newenv = 1;
		env->me_psize = env->me_os_psize;
		if (env->me_psize > MAX_PAGESIZE)
			env->me_psize = MAX_PAGESIZE;
		memset(&meta, 0, sizeof(meta));
		mdb_env_init_meta0(env, &meta);
		meta.mm_mapsize = DEFAULT_MAPSIZE;
	} else {
		/* Existing DB dictates the page size */
		env->me_psize = meta.mm_psize;
	}

	/* Was a mapsize configured? */
	if (!env->me_mapsize) {
		env->me_mapsize = meta.mm_mapsize;
	}
	{
		/* Make sure mapsize >= committed data size.  Even when using
		 * mm_mapsize, which could be broken in old files (ITS#7789).
		 */
		size_t minsize = (meta.mm_last_pg + 1) * meta.mm_psize;
		if (env->me_mapsize < minsize)
			env->me_mapsize = minsize;
	}
	meta.mm_mapsize = env->me_mapsize;

	if (newenv && !(flags & MDB_FIXEDMAP)) {
		/* mdb_env_map() may grow the datafile.  Write the metapages
		 * first, so the file will be valid if initialization fails.
		 * Except with FIXEDMAP, since we do not yet know mm_address.
		 * We could fill in mm_address later, but then a different
		 * program might end up doing that - one with a memory layout
		 * and map address which does not suit the main program.
		 */
		rc = mdb_env_init_meta(env, &meta);
		if (rc)
			return rc;
		newenv = 0;
	}

	rc = mdb_env_map(env, (flags & MDB_FIXEDMAP) ? meta.mm_address : NULL);
	if (rc)
		return rc;

	if (newenv) {
		/* FIXEDMAP case deferred above: now mm_address is known */
		if (flags & MDB_FIXEDMAP)
			meta.mm_address = env->me_map;
		i = mdb_env_init_meta(env, &meta);
		if (i != MDB_SUCCESS) {
			return i;
		}
	}

	/* Derived per-page limits, all functions of the page size */
	env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1;
	env->me_nodemax = (((env->me_psize - PAGEHDRSZ) / MDB_MINKEYS) & -2)
		- sizeof(indx_t);
#if !(MDB_MAXKEYSIZE)
	env->me_maxkey = env->me_nodemax - (NODESIZE + sizeof(MDB_db));
#endif
	env->me_maxpg = env->me_mapsize / env->me_psize;

#if MDB_DEBUG
	{
		MDB_meta *meta = mdb_env_pick_meta(env);
		MDB_db *db = &meta->mm_dbs[MAIN_DBI];

		DPRINTF(("opened database version %u, pagesize %u",
			meta->mm_version, env->me_psize));
		DPRINTF(("using meta page %d", (int) (meta->mm_txnid & 1)));
		DPRINTF(("depth: %u", db->md_depth));
		DPRINTF(("entries: %"Z"u", db->md_entries));
		DPRINTF(("branch pages: %"Z"u", db->md_branch_pages));
		DPRINTF(("leaf pages: %"Z"u", db->md_leaf_pages));
		DPRINTF(("overflow pages: %"Z"u", db->md_overflow_pages));
		DPRINTF(("root: %"Z"u", db->md_root));
	}
#endif

	return MDB_SUCCESS;
}
4478 
4479 
4480 /** Release a reader thread's slot in the reader lock table.
4481  * This function is called automatically when a thread exits.
4482  * @param[in] ptr This points to the slot in the reader lock table.
4483  */
/* NOTE(review): the name line was elided by extraction; upstream this is
 * mdb_env_reader_dest(void *ptr), the pthread TLS destructor.
 */
static void
{
	MDB_reader *reader = ptr;

#ifndef _WIN32
	if (reader->mr_pid == getpid()) /* catch pthread_exit() in child process */
#endif
		/* We omit the mutex, so do this atomically (i.e. skip mr_txnid) */
		reader->mr_pid = 0;
}
4495 
4496 #ifdef _WIN32
4497 /** Junk for arranging thread-specific callbacks on Windows. This is
4498  * necessarily platform and compiler-specific. Windows supports up
4499  * to 1088 keys. Let's assume nobody opens more than 64 environments
4500  * in a single process, for now. They can override this if needed.
4501  */
4502 #ifndef MAX_TLS_KEYS
4503 #define MAX_TLS_KEYS 64
4504 #endif
4505 static pthread_key_t mdb_tls_keys[MAX_TLS_KEYS];
4506 static int mdb_tls_nkeys;
4507 
/* TLS callback: on thread detach, release every reader slot registered
 * through the keys in mdb_tls_keys[].
 */
static void NTAPI mdb_tls_callback(PVOID module, DWORD reason, PVOID ptr)
{
	int i;
	switch(reason) {
	case DLL_PROCESS_ATTACH: break;
	case DLL_THREAD_ATTACH: break;
	case DLL_THREAD_DETACH:
		for (i=0; i<mdb_tls_nkeys; i++) {
			MDB_reader *r = pthread_getspecific(mdb_tls_keys[i]);
			if (r) {
				/* NOTE(review): the extraction dropped a line here;
				 * upstream calls mdb_env_reader_dest(r); — restore
				 * from the pristine source.
				 */
			}
		}
		break;
	case DLL_PROCESS_DETACH: break;
	}
}
4525 #ifdef __GNUC__
4526 #ifdef _WIN64
4527 const PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_callback;
4528 #else
4529 PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_callback;
4530 #endif
4531 #else
4532 #ifdef _WIN64
4533 /* Force some symbol references.
4534  * _tls_used forces the linker to create the TLS directory if not already done
4535  * mdb_tls_cbp prevents whole-program-optimizer from dropping the symbol.
4536  */
4537 #pragma comment(linker, "/INCLUDE:_tls_used")
4538 #pragma comment(linker, "/INCLUDE:mdb_tls_cbp")
4539 #pragma const_seg(".CRT$XLB")
4540 extern const PIMAGE_TLS_CALLBACK mdb_tls_cbp;
4541 const PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
4542 #pragma const_seg()
4543 #else /* _WIN32 */
4544 #pragma comment(linker, "/INCLUDE:__tls_used")
4545 #pragma comment(linker, "/INCLUDE:_mdb_tls_cbp")
4546 #pragma data_seg(".CRT$XLB")
4547 PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
4548 #pragma data_seg()
4549 #endif /* WIN 32/64 */
4550 #endif /* !__GNUC__ */
4551 #endif
4552 
4553 /** Downgrade the exclusive lock on the region back to shared */
/* NOTE(review): signature line elided by extraction; upstream this is
 * mdb_env_downgrade(MDB_env *env, int *excl).
 */
static int ESECT
{
	int rc = 0;
	MDB_meta *meta = mdb_env_pick_meta(env);

	/* Seed the shared txnid from the newest meta page before letting
	 * other processes in.
	 */
	env->me_txns->mti_txnid = meta->mm_txnid;

#ifdef _WIN32
	{
		OVERLAPPED ov;
		/* First acquire a shared lock. The Unlock will
		 * then release the existing exclusive lock.
		 */
		memset(&ov, 0, sizeof(ov));
		if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) {
			rc = ErrCode();
		} else {
			UnlockFile(env->me_lfd, 0, 0, 1, 0);
			*excl = 0;
		}
	}
#else
	{
		struct flock lock_info;
		/* The shared lock replaces the existing lock */
		memset((void *)&lock_info, 0, sizeof(lock_info));
		lock_info.l_type = F_RDLCK;
		lock_info.l_whence = SEEK_SET;
		lock_info.l_start = 0;
		lock_info.l_len = 1;
		/* Retry only on EINTR; any other failure is returned */
		while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) &&
				(rc = ErrCode()) == EINTR) ;
		*excl = rc ? -1 : 0;	/* error may mean we lost the lock */
	}
#endif

	return rc;
}
4593 
4594 /** Try to get exclusive lock, otherwise shared.
4595  * Maintain *excl = -1: no/unknown lock, 0: shared, 1: exclusive.
4596  */
/* NOTE(review): signature line elided by extraction; upstream this is
 * mdb_env_excl_lock(MDB_env *env, int *excl).
 */
static int ESECT
{
	int rc = 0;
#ifdef _WIN32
	/* Try exclusive first; fall back to a shared lock */
	if (LockFile(env->me_lfd, 0, 0, 1, 0)) {
		*excl = 1;
	} else {
		OVERLAPPED ov;
		memset(&ov, 0, sizeof(ov));
		if (LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) {
			*excl = 0;
		} else {
			rc = ErrCode();
		}
	}
#else
	struct flock lock_info;
	memset((void *)&lock_info, 0, sizeof(lock_info));
	/* Byte 0 of the lockfile carries the advisory lock */
	lock_info.l_type = F_WRLCK;
	lock_info.l_whence = SEEK_SET;
	lock_info.l_start = 0;
	lock_info.l_len = 1;
	/* Non-blocking attempt at the exclusive (write) lock */
	while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) &&
			(rc = ErrCode()) == EINTR) ;
	if (!rc) {
		*excl = 1;
	} else
# ifndef MDB_USE_POSIX_MUTEX
	if (*excl < 0) /* always true when MDB_USE_POSIX_MUTEX */
# endif
	{
		/* Someone else holds it; block until a shared lock succeeds */
		lock_info.l_type = F_RDLCK;
		while ((rc = fcntl(env->me_lfd, F_SETLKW, &lock_info)) &&
				(rc = ErrCode()) == EINTR) ;
		if (rc == 0)
			*excl = 0;
	}
#endif
	return rc;
}
4638 
4639 #ifdef MDB_USE_HASH
4640 /*
4641  * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code
4642  *
4643  * @(#) $Revision: 89498 $
4644  * @(#) $Id: mdb.c 89498 2020-04-03 14:10:47Z ivanov $
4645  * @(#) $Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v $
4646  *
4647  * http://www.isthe.com/chongo/tech/comp/fnv/index.html
4648  *
4649  ***
4650  *
4651  * Please do not copyright this code. This code is in the public domain.
4652  *
4653  * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
4654  * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO
4655  * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR
4656  * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
4657  * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
4658  * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
4659  * PERFORMANCE OF THIS SOFTWARE.
4660  *
4661  * By:
4662  * chongo <Landon Curt Noll> /\oo/\
4663  * http://www.isthe.com/chongo/
4664  *
4665  * Share and Enjoy! :-)
4666  */
4667 
4668 typedef unsigned long long mdb_hash_t;
4669 #define MDB_HASH_INIT ((mdb_hash_t)0xcbf29ce484222325ULL)
4670 
4671 /** perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer
4672  * @param[in] val value to hash
4673  * @param[in] hval initial value for hash
4674  * @return 64 bit hash
4675  *
4676  * NOTE: To use the recommended 64 bit FNV-1a hash, use MDB_HASH_INIT as the
4677  * hval arg on the first call.
4678  */
4679 static mdb_hash_t
4680 mdb_hash_val(MDB_val *val, mdb_hash_t hval)
4681 {
4682  unsigned char *s = (unsigned char *)val->mv_data; /* unsigned string */
4683  unsigned char *end = s + val->mv_size;
4684  /*
4685  * FNV-1a hash each octet of the string
4686  */
4687  while (s < end) {
4688  /* xor the bottom with the current octet */
4689  hval ^= (mdb_hash_t)*s++;
4690 
4691  /* multiply by the 64 bit FNV magic prime mod 2^64 */
4692  hval += (hval << 1) + (hval << 4) + (hval << 5) +
4693  (hval << 7) + (hval << 8) + (hval << 40);
4694  }
4695  /* return our new hash value */
4696  return hval;
4697 }
4698 
4699 /** Hash the string and output the encoded hash.
4700  * This uses modified RFC1924 Ascii85 encoding to accommodate systems with
4701  * very short name limits. We don't care about the encoding being reversible,
4702  * we just want to preserve as many bits of the input as possible in a
4703  * small printable string.
4704  * @param[in] str string to hash
4705  * @param[out] encbuf an array of 11 chars to hold the hash
4706  */
4707 static const char mdb_a85[]= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~";
4708 
4709 static void ESECT
4710 mdb_pack85(unsigned long l, char *out)
4711 {
4712  int i;
4713 
4714  for (i=0; i<5; i++) {
4715  *out++ = mdb_a85[l % 85];
4716  l /= 85;
4717  }
4718 }
4719 
4720 static void ESECT
4721 mdb_hash_enc(MDB_val *val, char *encbuf)
4722 {
4723  mdb_hash_t h = mdb_hash_val(val, MDB_HASH_INIT);
4724 
4725  mdb_pack85(h, encbuf);
4726  mdb_pack85(h>>32, encbuf+5);
4727  encbuf[10] = '\0';
4728 }
4729 #endif
4730 
4731 /** Open and/or initialize the lock region for the environment.
4732  * @param[in] env The LMDB environment.
4733  * @param[in] fname Filename + scratch area, from #mdb_fname_init().
4734  * @param[in] mode The Unix permissions for the file, if we create it.
4735  * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive
4736  * @return 0 on success, non-zero on failure.
4737  */
4738 static int ESECT
4739 mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl)
4740 {
4741 #ifdef _WIN32
4742 # define MDB_ERRCODE_ROFS ERROR_WRITE_PROTECT
4743 #else
4744 # define MDB_ERRCODE_ROFS EROFS
4745 #endif
4746  int rc;
4747  off_t size, rsize;
4748 
4749  rc = mdb_fopen(env, fname, MDB_O_LOCKS, mode, &env->me_lfd);
4750  if (rc) {
4751  /* Omit lockfile if read-only env on read-only filesystem */
4752  if (rc == MDB_ERRCODE_ROFS && (env->me_flags & MDB_RDONLY)) {
4753  return MDB_SUCCESS;
4754  }
4755  goto fail;
4756  }
4757 
4758  if (!(env->me_flags & MDB_NOTLS)) {
4759  rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest);
4760  if (rc)
4761  goto fail;
4762  env->me_flags |= MDB_ENV_TXKEY;
4763 #ifdef _WIN32
4764  /* Windows TLS callbacks need help finding their TLS info. */
4765  if (mdb_tls_nkeys >= MAX_TLS_KEYS) {
4766  rc = MDB_TLS_FULL;
4767  goto fail;
4768  }
4769  mdb_tls_keys[mdb_tls_nkeys++] = env->me_txkey;
4770 #endif
4771  }
4772 
4773  /* Try to get exclusive lock. If we succeed, then
4774  * nobody is using the lock region and we should initialize it.
4775  */
4776  if ((rc = mdb_env_excl_lock(env, excl))) goto fail;
4777 
4778 #ifdef _WIN32
4779  size = GetFileSize(env->me_lfd, NULL);
4780 #else
4781  size = lseek(env->me_lfd, 0, SEEK_END);
4782  if (size == -1) goto fail_errno;
4783 #endif
4784  rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo);
4785  if (size < rsize && *excl > 0) {
4786 #ifdef _WIN32
4787  if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != (DWORD)rsize
4788  || !SetEndOfFile(env->me_lfd))
4789  goto fail_errno;
4790 #else
4791  if (ftruncate(env->me_lfd, rsize) != 0) goto fail_errno;
4792 #endif
4793  } else {
4794  rsize = size;
4795  size = rsize - sizeof(MDB_txninfo);
4796  env->me_maxreaders = size/sizeof(MDB_reader) + 1;
4797  }
4798  {
4799 #ifdef _WIN32
4800  HANDLE mh;
4801  mh = CreateFileMapping(env->me_lfd, NULL, PAGE_READWRITE,
4802  0, 0, NULL);
4803  if (!mh) goto fail_errno;
4804  env->me_txns = MapViewOfFileEx(mh, FILE_MAP_WRITE, 0, 0, rsize, NULL);
4805  CloseHandle(mh);
4806  if (!env->me_txns) goto fail_errno;
4807 #else
4808  void *m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED,
4809  env->me_lfd, 0);
4810  if (m == MAP_FAILED) goto fail_errno;
4811  env->me_txns = m;
4812 #endif
4813  }
4814  if (*excl > 0) {
4815 #ifdef _WIN32
4816  BY_HANDLE_FILE_INFORMATION stbuf;
4817  struct {
4818  DWORD volume;
4819  DWORD nhigh;
4820  DWORD nlow;
4821  } idbuf;
4822  MDB_val val;
4823  char encbuf[11];
4824 
4825  if (!mdb_sec_inited) {
4826  InitializeSecurityDescriptor(&mdb_null_sd,
4827  SECURITY_DESCRIPTOR_REVISION);
4828  SetSecurityDescriptorDacl(&mdb_null_sd, TRUE, 0, FALSE);
4829  mdb_all_sa.nLength = sizeof(SECURITY_ATTRIBUTES);
4830  mdb_all_sa.bInheritHandle = FALSE;
4831  mdb_all_sa.lpSecurityDescriptor = &mdb_null_sd;
4832  mdb_sec_inited = 1;
4833  }
4834  if (!GetFileInformationByHandle(env->me_lfd, &stbuf)) goto fail_errno;
4835  idbuf.volume = stbuf.dwVolumeSerialNumber;
4836  idbuf.nhigh = stbuf.nFileIndexHigh;
4837  idbuf.nlow = stbuf.nFileIndexLow;
4838  val.mv_data = &idbuf;
4839  val.mv_size = sizeof(idbuf);
4840  mdb_hash_enc(&val, encbuf);
4841  sprintf(env->me_txns->mti_rmname, "Global\\MDBr%s", encbuf);
4842  sprintf(env->me_txns->mti_wmname, "Global\\MDBw%s", encbuf);
4843  env->me_rmutex = CreateMutexA(&mdb_all_sa, FALSE, env->me_txns->mti_rmname);
4844  if (!env->me_rmutex) goto fail_errno;
4845  env->me_wmutex = CreateMutexA(&mdb_all_sa, FALSE, env->me_txns->mti_wmname);
4846  if (!env->me_wmutex) goto fail_errno;
4847 #elif defined(MDB_USE_POSIX_SEM)
4848  struct stat stbuf;
4849  struct {
4850  dev_t dev;
4851  ino_t ino;
4852  } idbuf;
4853  MDB_val val;
4854  char encbuf[11];
4855 
4856 #if defined(__NetBSD__)
4857 #define MDB_SHORT_SEMNAMES 1 /* limited to 14 chars */
4858 #endif
4859  if (fstat(env->me_lfd, &stbuf)) goto fail_errno;
4860  idbuf.dev = stbuf.st_dev;
4861  idbuf.ino = stbuf.st_ino;
4862  val.mv_data = &idbuf;
4863  val.mv_size = sizeof(idbuf);
4864  mdb_hash_enc(&val, encbuf);
4865 #ifdef MDB_SHORT_SEMNAMES
4866  encbuf[9] = '\0'; /* drop name from 15 chars to 14 chars */
4867 #endif
4868  sprintf(env->me_txns->mti_rmname, "/MDBr%s", encbuf);
4869  sprintf(env->me_txns->mti_wmname, "/MDBw%s", encbuf);
4870  /* Clean up after a previous run, if needed: Try to
4871  * remove both semaphores before doing anything else.
4872  */
4873  sem_unlink(env->me_txns->mti_rmname);
4874  sem_unlink(env->me_txns->mti_wmname);
4875  env->me_rmutex = sem_open(env->me_txns->mti_rmname,
4876  O_CREAT|O_EXCL, mode, 1);
4877  if (env->me_rmutex == SEM_FAILED) goto fail_errno;
4878  env->me_wmutex = sem_open(env->me_txns->mti_wmname,
4879  O_CREAT|O_EXCL, mode, 1);
4880  if (env->me_wmutex == SEM_FAILED) goto fail_errno;
4881 #else /* MDB_USE_POSIX_MUTEX: */
4882  pthread_mutexattr_t mattr;
4883 
4884  /* Solaris needs this before initing a robust mutex. Otherwise
4885  * it may skip the init and return EBUSY "seems someone already
4886  * inited" or EINVAL "it was inited differently".
4887  */
4888  memset(env->me_txns->mti_rmutex, 0, sizeof(*env->me_txns->mti_rmutex));
4889  memset(env->me_txns->mti_wmutex, 0, sizeof(*env->me_txns->mti_wmutex));
4890 
4891  if ((rc = pthread_mutexattr_init(&mattr)))
4892  goto fail;
4893 
4894  rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
4895 #ifdef MDB_ROBUST_SUPPORTED
4896  if (!rc) rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST);
4897 #endif
4898  if (!rc) rc = pthread_mutex_init(env->me_txns->mti_rmutex, &mattr);
4899  if (!rc) rc = pthread_mutex_init(env->me_txns->mti_wmutex, &mattr);
4900  pthread_mutexattr_destroy(&mattr);
4901  if (rc)
4902  goto fail;
4903 #endif /* _WIN32 || MDB_USE_POSIX_SEM */
4904 
4905  env->me_txns->mti_magic = MDB_MAGIC;
4906  env->me_txns->mti_format = MDB_LOCK_FORMAT;
4907  env->me_txns->mti_txnid = 0;
4908  env->me_txns->mti_numreaders = 0;
4909 
4910  } else {
4911  if (env->me_txns->mti_magic != MDB_MAGIC) {
4912  DPUTS("lock region has invalid magic");
4913  rc = MDB_INVALID;
4914  goto fail;
4915  }
4916  if (env->me_txns->mti_format != MDB_LOCK_FORMAT) {
4917  DPRINTF(("lock region has format+version 0x%x, expected 0x%x",
4918  env->me_txns->mti_format, MDB_LOCK_FORMAT));
4919  rc = MDB_VERSION_MISMATCH;
4920  goto fail;
4921  }
4922  rc = ErrCode();
4923  if (rc && rc != EACCES && rc != EAGAIN) {
4924  goto fail;
4925  }
4926 #ifdef _WIN32
4927  env->me_rmutex = OpenMutexA(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname);
4928  if (!env->me_rmutex) goto fail_errno;
4929  env->me_wmutex = OpenMutexA(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname);
4930  if (!env->me_wmutex) goto fail_errno;
4931 #elif defined(MDB_USE_POSIX_SEM)
4932  env->me_rmutex = sem_open(env->me_txns->mti_rmname, 0);
4933  if (env->me_rmutex == SEM_FAILED) goto fail_errno;
4934  env->me_wmutex = sem_open(env->me_txns->mti_wmname, 0);
4935  if (env->me_wmutex == SEM_FAILED) goto fail_errno;
4936 #endif
4937  }
4938  return MDB_SUCCESS;
4939 
4940 fail_errno:
4941  rc = ErrCode();
4942 fail:
4943  return rc;
4944 }
4945 
 /** Only a subset of the @ref mdb_env flags can be changed
 * at runtime. Changing other flags requires closing the
 * environment and re-opening it with the new flags.
 */
/** Env flags that may safely be toggled after mdb_env_open()
 * (durability/zero-fill knobs only; they do not affect data layout).
 */
#define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT)
/** Env flags fixed for the lifetime of an open environment. */
#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY| \
 MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD)

/* Compile-time sanity check: the env flags above share the meta-page
 * mm_flags field with the persistent per-DB flags, so the two sets
 * must occupy disjoint bits.
 */
#if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS)
# error "Persistent DB flags & env flags overlap, but both go in mm_flags"
#endif
4957 
4958 int ESECT
4959 mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode)
4960 {
4961  int rc, excl = -1;
4962  MDB_name fname;
4963 
4964  if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS)))
4965  return EINVAL;
4966 
4967  flags |= env->me_flags;
4968 
4969  rc = mdb_fname_init(path, flags, &fname);
4970  if (rc)
4971  return rc;
4972 
4973  if (flags & MDB_RDONLY) {
4974  /* silently ignore WRITEMAP when we're only getting read access */
4975  flags &= ~MDB_WRITEMAP;
4976  } else {
4977  if (!((env->me_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)) &&
4978  (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2)))))
4979  rc = ENOMEM;
4980  }
4981  env->me_flags = flags |= MDB_ENV_ACTIVE;
4982  if (rc)
4983  goto leave;
4984 
4985  env->me_path = strdup(path);
4986  env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx));
4987  env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t));
4988  env->me_dbiseqs = calloc(env->me_maxdbs, sizeof(unsigned int));
4989  if (!(env->me_dbxs && env->me_path && env->me_dbflags && env->me_dbiseqs)) {
4990  rc = ENOMEM;
4991  goto leave;
4992  }
4993  env->me_dbxs[FREE_DBI].md_cmp = mdb_cmp_long; /* aligned MDB_INTEGERKEY */
4994 
4995  /* For RDONLY, get lockfile after we know datafile exists */
4996  if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) {
4997  rc = mdb_env_setup_locks(env, &fname, mode, &excl);
4998  if (rc)
4999  goto leave;
5000  }
5001 
5002  rc = mdb_fopen(env, &fname,
5004  mode, &env->me_fd);
5005  if (rc)
5006  goto leave;
5007 
5008  if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) {
5009  rc = mdb_env_setup_locks(env, &fname, mode, &excl);
5010  if (rc)
5011  goto leave;
5012  }
5013 
5014  if ((rc = mdb_env_open2(env)) == MDB_SUCCESS) {
5015  if (!(flags & (MDB_RDONLY|MDB_WRITEMAP))) {
5016  /* Synchronous fd for meta writes. Needed even with
5017  * MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset.
5018  */
5019  rc = mdb_fopen(env, &fname, MDB_O_META, mode, &env->me_mfd);
5020  if (rc)
5021  goto leave;
5022  }
5023  DPRINTF(("opened dbenv %p", (void *) env));
5024  if (excl > 0) {
5025  rc = mdb_env_share_locks(env, &excl);
5026  if (rc)
5027  goto leave;
5028  }
5029  if (!(flags & MDB_RDONLY)) {
5030  MDB_txn *txn;
5031  int tsize = sizeof(MDB_txn), size = tsize + env->me_maxdbs *
5032  (sizeof(MDB_db)+sizeof(MDB_cursor *)+sizeof(unsigned int)+1);
5033  if ((env->me_pbuf = calloc(1, env->me_psize)) &&
5034  (txn = calloc(1,