diff --git a/api/leveldb/Makefile.am b/api/leveldb/Makefile.am index 44aa69bbd48..2cfd9d945a5 100644 --- a/api/leveldb/Makefile.am +++ b/api/leveldb/Makefile.am @@ -16,7 +16,7 @@ leveldbincludedir = $(includedir)/wiredtiger/leveldb endif endif leveldbinclude_HEADERS = \ - wiredtiger_config.h \ + leveldb_wt_config.h \ leveldb/include/leveldb/cache.h \ leveldb/include/leveldb/comparator.h\ leveldb/include/leveldb/db.h \ diff --git a/api/leveldb/basho/perf_count.h b/api/leveldb/basho/perf_count.h index 0edf1b96549..b0f4abf9b66 100644 --- a/api/leveldb/basho/perf_count.h +++ b/api/leveldb/basho/perf_count.h @@ -23,7 +23,7 @@ #ifndef STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_ #define STORAGE_LEVELDB_INCLUDE_PERF_COUNT_H_ -#include "wiredtiger_config.h" +#include "leveldb_wt_config.h" #include #include diff --git a/api/leveldb/config.hin b/api/leveldb/config.hin new file mode 100644 index 00000000000..131b68969d3 --- /dev/null +++ b/api/leveldb/config.hin @@ -0,0 +1,22 @@ +/* api/leveldb/config.hin. Generated by autoheader, then hand-edited. */ + +/* Build the LevelDB API with Basho LevelDB support. */ +#undef HAVE_BASHOLEVELDB + +/* Snappy support automatically loaded. */ +#undef HAVE_BUILTIN_EXTENSION_SNAPPY + +/* Zlib support automatically loaded. */ +#undef HAVE_BUILTIN_EXTENSION_ZLIB + +/* Define to 1 for diagnostic tests. */ +#undef HAVE_DIAGNOSTIC + +/* Build the LevelDB API with HyperLevelDB support. */ +#undef HAVE_HYPERLEVELDB + +/* Define to 1 if you have the `snappy' library (-lsnappy). */ +#undef HAVE_LIBSNAPPY + +/* Build the LevelDB API with RocksDB support. */ +#undef HAVE_ROCKSDB diff --git a/api/leveldb/hyper_wt.cc b/api/leveldb/hyper_wt.cc index b147ff6fe75..95c82289e18 100644 --- a/api/leveldb/hyper_wt.cc +++ b/api/leveldb/hyper_wt.cc @@ -338,25 +338,6 @@ DbImpl::LiveBackup(const Slice& name) if ((t_ret = cursor->close(cursor)) != 0 && ret == 0) ret = t_ret; - // We only copied file contents that are on-disk. 
- // At this point we want to use a ReplayIterator to - // apply any in-memory operations. - DB* db; - leveldb::Options options; - ReplayIteratorImpl *iter = new ReplayIteratorImpl(context); - Status s = Open(options, backup, &db); - assert(s.ok()); - - while (iter->Valid()) { - if (iter->HasValue()) - s = db->Put(leveldb::WriteOptions(), - iter->key(), iter->value()); - else - s = db->Delete(leveldb::WriteOptions(), iter->key()); - iter->Next(); - } - delete iter; - delete db; return (WiredTigerErrorToStatus(ret)); } diff --git a/api/leveldb/hyperleveldb/replay_iterator.h b/api/leveldb/hyperleveldb/replay_iterator.h index 6e2f562c6c4..397acdfd889 100644 --- a/api/leveldb/hyperleveldb/replay_iterator.h +++ b/api/leveldb/hyperleveldb/replay_iterator.h @@ -5,7 +5,7 @@ #ifndef STORAGE_LEVELDB_INCLUDE_REPLAY_ITERATOR_H_ #define STORAGE_LEVELDB_INCLUDE_REPLAY_ITERATOR_H_ -#include "wiredtiger_config.h" +#include "leveldb_wt_config.h" #include "slice.h" #include "status.h" diff --git a/api/leveldb/leveldb/include/leveldb/cache.h b/api/leveldb/leveldb/include/leveldb/cache.h index 6ae25122133..94be8e919a8 100644 --- a/api/leveldb/leveldb/include/leveldb/cache.h +++ b/api/leveldb/leveldb/include/leveldb/cache.h @@ -18,7 +18,7 @@ #ifndef STORAGE_LEVELDB_INCLUDE_CACHE_H_ #define STORAGE_LEVELDB_INCLUDE_CACHE_H_ -#include "wiredtiger_config.h" +#include "leveldb_wt_config.h" #if defined(HAVE_ROCKSDB) && !defined(leveldb) #define leveldb rocksdb #endif diff --git a/api/leveldb/leveldb/include/leveldb/comparator.h b/api/leveldb/leveldb/include/leveldb/comparator.h index 23e0ba84559..78d83a4d08e 100644 --- a/api/leveldb/leveldb/include/leveldb/comparator.h +++ b/api/leveldb/leveldb/include/leveldb/comparator.h @@ -5,7 +5,7 @@ #ifndef STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_ #define STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_ -#include "wiredtiger_config.h" +#include "leveldb_wt_config.h" #if defined(HAVE_ROCKSDB) && !defined(leveldb) #define leveldb rocksdb #endif diff --git 
a/api/leveldb/leveldb/include/leveldb/db.h b/api/leveldb/leveldb/include/leveldb/db.h index c1818d28a7a..df8fcbbe9f8 100644 --- a/api/leveldb/leveldb/include/leveldb/db.h +++ b/api/leveldb/leveldb/include/leveldb/db.h @@ -5,7 +5,7 @@ #ifndef STORAGE_LEVELDB_INCLUDE_DB_H_ #define STORAGE_LEVELDB_INCLUDE_DB_H_ -#include "wiredtiger_config.h" +#include "leveldb_wt_config.h" #if defined(HAVE_ROCKSDB) && !defined(leveldb) #define leveldb rocksdb #endif @@ -292,6 +292,12 @@ class DB { // db->CompactRange(NULL, NULL); virtual void CompactRange(const Slice* begin, const Slice* end) = 0; + // Suspends the background compaction thread. This method + // returns once suspended. + virtual void SuspendCompactions() = 0; + // Resumes a suspended background compaction thread. + virtual void ResumeCompactions() = 0; + #ifdef HAVE_HYPERLEVELDB // Create a live backup of a live LevelDB instance. // The backup is stored in a directory named "backup-" under the top diff --git a/api/leveldb/leveldb/include/leveldb/env.h b/api/leveldb/leveldb/include/leveldb/env.h index 0d043307736..4ad67d36fea 100644 --- a/api/leveldb/leveldb/include/leveldb/env.h +++ b/api/leveldb/leveldb/include/leveldb/env.h @@ -13,7 +13,7 @@ #ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_ #define STORAGE_LEVELDB_INCLUDE_ENV_H_ -#include "wiredtiger_config.h" +#include "leveldb_wt_config.h" #if defined(HAVE_ROCKSDB) && !defined(leveldb) #define leveldb rocksdb #endif diff --git a/api/leveldb/leveldb/include/leveldb/filter_policy.h b/api/leveldb/leveldb/include/leveldb/filter_policy.h index 2d970e709d6..e434ef4b241 100644 --- a/api/leveldb/leveldb/include/leveldb/filter_policy.h +++ b/api/leveldb/leveldb/include/leveldb/filter_policy.h @@ -16,7 +16,7 @@ #ifndef STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_ #define STORAGE_LEVELDB_INCLUDE_FILTER_POLICY_H_ -#include "wiredtiger_config.h" +#include "leveldb_wt_config.h" #if defined(HAVE_ROCKSDB) && !defined(leveldb) #define leveldb rocksdb #endif diff --git
a/api/leveldb/leveldb/include/leveldb/iterator.h b/api/leveldb/leveldb/include/leveldb/iterator.h index 3845d553a4e..2d97d180b17 100644 --- a/api/leveldb/leveldb/include/leveldb/iterator.h +++ b/api/leveldb/leveldb/include/leveldb/iterator.h @@ -15,7 +15,7 @@ #ifndef STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ #define STORAGE_LEVELDB_INCLUDE_ITERATOR_H_ -#include "wiredtiger_config.h" +#include "leveldb_wt_config.h" #if defined(HAVE_ROCKSDB) && !defined(leveldb) #define leveldb rocksdb #endif diff --git a/api/leveldb/leveldb/include/leveldb/options.h b/api/leveldb/leveldb/include/leveldb/options.h index a14503fe086..9dcf73fc2a0 100644 --- a/api/leveldb/leveldb/include/leveldb/options.h +++ b/api/leveldb/leveldb/include/leveldb/options.h @@ -5,7 +5,7 @@ #ifndef STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ #define STORAGE_LEVELDB_INCLUDE_OPTIONS_H_ -#include "wiredtiger_config.h" +#include "leveldb_wt_config.h" #if defined(HAVE_ROCKSDB) && !defined(leveldb) #define leveldb rocksdb #endif diff --git a/api/leveldb/leveldb/include/leveldb/slice.h b/api/leveldb/leveldb/include/leveldb/slice.h index d7c20cfcaac..1eb66dd825f 100644 --- a/api/leveldb/leveldb/include/leveldb/slice.h +++ b/api/leveldb/leveldb/include/leveldb/slice.h @@ -15,7 +15,7 @@ #ifndef STORAGE_LEVELDB_INCLUDE_SLICE_H_ #define STORAGE_LEVELDB_INCLUDE_SLICE_H_ -#include "wiredtiger_config.h" +#include "leveldb_wt_config.h" #if defined(HAVE_ROCKSDB) && !defined(leveldb) #define leveldb rocksdb #endif @@ -82,7 +82,8 @@ class Slice { (memcmp(data_, x.data_, x.size_) == 0)); } - private: +// The LevelDB JNI layer peeks in here +// private: const char* data_; size_t size_; diff --git a/api/leveldb/leveldb/include/leveldb/status.h b/api/leveldb/leveldb/include/leveldb/status.h index 8b2cbb9b422..3c21f64462b 100644 --- a/api/leveldb/leveldb/include/leveldb/status.h +++ b/api/leveldb/leveldb/include/leveldb/status.h @@ -13,7 +13,7 @@ #ifndef STORAGE_LEVELDB_INCLUDE_STATUS_H_ #define STORAGE_LEVELDB_INCLUDE_STATUS_H_ -#include 
"wiredtiger_config.h" +#include "leveldb_wt_config.h" #if defined(HAVE_ROCKSDB) && !defined(leveldb) #define leveldb rocksdb #endif diff --git a/api/leveldb/leveldb/include/leveldb/write_batch.h b/api/leveldb/leveldb/include/leveldb/write_batch.h index 9184d42c24c..293b41ad818 100644 --- a/api/leveldb/leveldb/include/leveldb/write_batch.h +++ b/api/leveldb/leveldb/include/leveldb/write_batch.h @@ -21,7 +21,7 @@ #ifndef STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ #define STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_ -#include "wiredtiger_config.h" +#include "leveldb_wt_config.h" #if defined(HAVE_ROCKSDB) && !defined(leveldb) #define leveldb rocksdb #endif diff --git a/api/leveldb/leveldb_wt.cc b/api/leveldb/leveldb_wt.cc index cfeb0549db4..6425a5a8dfd 100644 --- a/api/leveldb/leveldb_wt.cc +++ b/api/leveldb/leveldb_wt.cc @@ -755,14 +755,9 @@ IteratorImpl::Next() int ret; WT_ITEM item; - if (!Status().ok()) + if (!Status().ok() || !valid_) return; - if (!valid_) { - SetError(EINVAL); - return; - } - ret = cursor_->next(cursor_); if (ret != 0) { if (ret != WT_NOTFOUND) @@ -791,14 +786,9 @@ IteratorImpl::Prev() { WT_ITEM item; - if (!Status().ok()) + if (!Status().ok() || !valid_) return; - if (!valid_) { - SetError(EINVAL); - return; - } - int ret = cursor_->prev(cursor_); if (ret != 0) { if (ret != WT_NOTFOUND) diff --git a/api/leveldb/leveldb_wt.h b/api/leveldb/leveldb_wt.h index 301fa250e85..683482ad23c 100644 --- a/api/leveldb/leveldb_wt.h +++ b/api/leveldb/leveldb_wt.h @@ -27,7 +27,7 @@ #ifndef _INCLUDE_LEVELDB_WT_H #define _INCLUDE_LEVELDB_WT_H 1 -#include "wiredtiger_config.h" +#include "leveldb_wt_config.h" #include "leveldb/cache.h" #include "leveldb/comparator.h" @@ -171,6 +171,7 @@ private: class CacheImpl : public Cache { public: CacheImpl(size_t capacity) : Cache(), capacity_(capacity) {} + virtual ~CacheImpl() {} virtual Handle* Insert(const Slice&, void*, size_t, void (*)(const Slice&, void*)) { return 0; } diff --git 
a/bench/wtperf/runners/shared-cache-stress.wtperf b/bench/wtperf/runners/shared-cache-stress.wtperf new file mode 100644 index 00000000000..87d14f4f5c1 --- /dev/null +++ b/bench/wtperf/runners/shared-cache-stress.wtperf @@ -0,0 +1,12 @@ +# Stress out the shared cache. +conn_config="statistics=(none),shared_cache=(name=wt-cache,size=536870912,reserve=10MB,chunk=20MB,)" +table_config="allocation_size=4KB,key_gap=10,split_pct=75,internal_page_max=4KB,internal_key_truncate=false,prefix_compression=false,leaf_item_max=1433,type=file,internal_item_max=1433,exclusive=true,leaf_page_max=4KB,block_compressor=," +checkpoint_interval=100 +checkpoint_threads=1 +icount=50000 +random_range=500000 +report_interval=5 +run_time=600 +populate_threads=1 +threads=((count=1,inserts=1),(count=1,reads=1)) +database_count=25 diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c index 41f95799c57..93182550dc8 100644 --- a/bench/wtperf/wtperf.c +++ b/bench/wtperf/wtperf.c @@ -83,7 +83,7 @@ static int execute_workload(CONFIG *); static int find_table_count(CONFIG *); static void *monitor(void *); static void *populate_thread(void *); -static void randomize_value(CONFIG *, char *); +static void randomize_value(CONFIG_THREAD *, char *); static int start_all_runs(CONFIG *); static int start_run(CONFIG *); static int start_threads(CONFIG *, @@ -91,7 +91,7 @@ static int start_threads(CONFIG *, static int stop_threads(CONFIG *, u_int, CONFIG_THREAD *); static void *thread_run_wtperf(void *); static void *worker(void *); -static uint64_t wtperf_rand(CONFIG *); +static uint64_t wtperf_rand(CONFIG_THREAD *); static uint64_t wtperf_value_range(CONFIG *); #define HELIUM_NAME "dev1" @@ -100,11 +100,12 @@ static uint64_t wtperf_value_range(CONFIG *); #define HELIUM_CONFIG ",type=helium" /* - * wtperf uses a couple of internal WiredTiger library routines for timing - * and generating random numbers. + * wtperf uses internal WiredTiger library routines for timing and generating + * random numbers. 
*/ extern int __wt_epoch(void *, struct timespec *); -extern uint32_t __wt_random(void); +extern uint32_t __wt_random(uint32_t *); +extern void __wt_random_init(uint32_t *); /* Retrieve an ID for the next insert operation. */ static inline uint64_t @@ -130,7 +131,7 @@ generate_key(CONFIG *cfg, char *key_buf, uint64_t keyno) } static void -randomize_value(CONFIG *cfg, char *value_buf) +randomize_value(CONFIG_THREAD *thread, char *value_buf) { uint8_t *vb; uint32_t i; @@ -140,13 +141,13 @@ randomize_value(CONFIG *cfg, char *value_buf) * randomly chosen byte (other than the trailing NUL). * Make sure we don't write a NUL: keep the value the same length. */ - i = __wt_random() % (cfg->value_sz - 1); + i = __wt_random(thread->rnd) % (thread->cfg->value_sz - 1); while (value_buf[i] == '\0' && i > 0) --i; if (i > 0) { vb = (uint8_t *)value_buf; - vb[0] = (__wt_random() % 255) + 1; - vb[i] = (__wt_random() % 255) + 1; + vb[0] = (__wt_random(thread->rnd) % 255) + 1; + vb[i] = (__wt_random(thread->rnd) % 255) + 1; } } @@ -317,13 +318,13 @@ worker_async(void *arg) case WORKER_INSERT: case WORKER_INSERT_RMW: if (cfg->random_range) - next_val = wtperf_rand(cfg); + next_val = wtperf_rand(thread); else next_val = cfg->icount + get_next_incr(cfg); break; case WORKER_READ: case WORKER_UPDATE: - next_val = wtperf_rand(cfg); + next_val = wtperf_rand(thread); /* * If the workload is started without a populate phase @@ -361,14 +362,14 @@ worker_async(void *arg) goto op_err; case WORKER_INSERT: if (cfg->random_value) - randomize_value(cfg, value_buf); + randomize_value(thread, value_buf); asyncop->set_value(asyncop, value_buf); if ((ret = asyncop->insert(asyncop)) == 0) break; goto op_err; case WORKER_UPDATE: if (cfg->random_value) - randomize_value(cfg, value_buf); + randomize_value(thread, value_buf); asyncop->set_value(asyncop, value_buf); if ((ret = asyncop->update(asyncop)) == 0) break; @@ -455,7 +456,7 @@ worker(void *arg) case WORKER_INSERT_RMW: trk = &thread->insert; if 
(cfg->random_range) - next_val = wtperf_rand(cfg); + next_val = wtperf_rand(thread); else next_val = cfg->icount + get_next_incr(cfg); break; @@ -465,7 +466,7 @@ worker(void *arg) case WORKER_UPDATE: if (*op == WORKER_UPDATE) trk = &thread->update; - next_val = wtperf_rand(cfg); + next_val = wtperf_rand(thread); /* * If the workload is started without a populate phase @@ -532,7 +533,7 @@ worker(void *arg) /* FALLTHROUGH */ case WORKER_INSERT: if (cfg->random_value) - randomize_value(cfg, value_buf); + randomize_value(thread, value_buf); cursor->set_value(cursor, value_buf); if ((ret = cursor->insert(cursor)) == 0) break; @@ -556,7 +557,7 @@ worker(void *arg) else value_buf[0] = 'a'; if (cfg->random_value) - randomize_value(cfg, value_buf); + randomize_value(thread, value_buf); cursor->set_value(cursor, value_buf); if ((ret = cursor->update(cursor)) == 0) break; @@ -812,7 +813,7 @@ populate_thread(void *arg) } cursor->set_key(cursor, key_buf); if (cfg->random_value) - randomize_value(cfg, value_buf); + randomize_value(thread, value_buf); cursor->set_value(cursor, value_buf); if ((ret = cursor->insert(cursor)) != 0) { lprintf(cfg, ret, 0, "Failed inserting"); @@ -941,7 +942,7 @@ populate_async(void *arg) generate_key(cfg, key_buf, op); asyncop->set_key(asyncop, key_buf); if (cfg->random_value) - randomize_value(cfg, value_buf); + randomize_value(thread, value_buf); asyncop->set_value(asyncop, value_buf); if ((ret = asyncop->insert(asyncop)) != 0) { lprintf(cfg, ret, 0, "Failed inserting"); @@ -1199,9 +1200,8 @@ execute_populate(CONFIG *cfg) struct timespec start, stop; CONFIG_THREAD *popth; WT_ASYNC_OP *asyncop; - double secs; size_t i; - uint64_t last_ops; + uint64_t last_ops, msecs; uint32_t interval, tables; int elapsed, ret; void *(*pfunc)(void *); @@ -1279,12 +1279,11 @@ execute_populate(CONFIG *cfg) } lprintf(cfg, 0, 1, "Finished load of %" PRIu32 " items", cfg->icount); - secs = stop.tv_sec + stop.tv_nsec / (double)BILLION; - secs -= start.tv_sec + 
start.tv_nsec / (double)BILLION; - if (secs == 0) - ++secs; + msecs = ns_to_ms(WT_TIMEDIFF(stop, start)); lprintf(cfg, 0, 1, - "Load time: %.2f\n" "load ops/sec: %.2f", secs, cfg->icount / secs); + "Load time: %.2f\n" "load ops/sec: %" PRIu64, + (double)msecs / (double)THOUSAND, + (uint64_t)((cfg->icount / msecs) / THOUSAND)); /* * If configured, compact to allow LSM merging to complete. We @@ -1324,9 +1323,9 @@ execute_populate(CONFIG *cfg) lprintf(cfg, ret, 0, "Get time failed in populate."); return (ret); } - secs = stop.tv_sec + stop.tv_nsec / (double)BILLION; - secs -= start.tv_sec + start.tv_nsec / (double)BILLION; - lprintf(cfg, 0, 1, "Compact completed in %.2f seconds", secs); + lprintf(cfg, 0, 1, + "Compact completed in %" PRIu64 " seconds", + (uint64_t)(ns_to_sec(WT_TIMEDIFF(stop, start)))); assert(tables == 0); } return (0); @@ -1747,6 +1746,8 @@ start_run(CONFIG *cfg) char helium_buf[256]; monitor_created = ret = 0; + /* [-Wconditional-uninitialized] */ + memset(&monitor_thread, 0, sizeof(monitor_thread)); if ((ret = setup_log_file(cfg)) != 0) goto err; @@ -1872,7 +1873,7 @@ err: if (ret == 0) if (cfg->conn != NULL && (t_ret = cfg->conn->close(cfg->conn, NULL)) != 0) { - lprintf(cfg, ret, 0, + lprintf(cfg, t_ret, 0, "Error closing connection to %s", cfg->home); if (ret == 0) ret = t_ret; @@ -2105,15 +2106,30 @@ err: config_free(cfg); static int start_threads(CONFIG *cfg, - WORKLOAD *workp, CONFIG_THREAD *thread, u_int num, void *(*func)(void *)) + WORKLOAD *workp, CONFIG_THREAD *base, u_int num, void *(*func)(void *)) { - u_int i; + CONFIG_THREAD *thread; + u_int i, j; int ret; - for (i = 0; i < num; ++i, ++thread) { + /* Initialize the threads. */ + for (i = 0, thread = base; i < num; ++i, ++thread) { thread->cfg = cfg; thread->workload = workp; + /* + * We don't want the threads executing in lock-step, move each + * new RNG state further along in the sequence. 
+ */ + if (i == 0) + __wt_random_init(thread->rnd); + else { + thread->rnd[0] = (thread - 1)->rnd[0]; + thread->rnd[1] = (thread - 1)->rnd[1]; + } + for (j = 0; j < 1000; ++j) + (void)__wt_random(thread->rnd); + /* * Every thread gets a key/data buffer because we don't bother * to distinguish between threads needing them and threads that @@ -2129,7 +2145,7 @@ start_threads(CONFIG *cfg, */ memset(thread->value_buf, 'a', cfg->value_sz - 1); if (cfg->random_value) - randomize_value(cfg, thread->value_buf); + randomize_value(thread, thread->value_buf); /* * Every thread gets tracking information and is initialized @@ -2140,13 +2156,16 @@ start_threads(CONFIG *cfg, thread->update.min_latency = UINT32_MAX; thread->ckpt.max_latency = thread->insert.max_latency = thread->read.max_latency = thread->update.max_latency = 0; + } + /* Start the threads. */ + for (i = 0, thread = base; i < num; ++i, ++thread) if ((ret = pthread_create( &thread->handle, NULL, func, thread)) != 0) { lprintf(cfg, ret, 0, "Error creating thread"); return (ret); } - } + return (0); } @@ -2190,16 +2209,19 @@ wtperf_value_range(CONFIG *cfg) } static uint64_t -wtperf_rand(CONFIG *cfg) +wtperf_rand(CONFIG_THREAD *thread) { + CONFIG *cfg; double S1, S2, U; uint64_t rval; + cfg = thread->cfg; + /* * Use WiredTiger's random number routine: it's lock-free and fairly * good. */ - rval = (uint64_t)__wt_random(); + rval = (uint64_t)__wt_random(thread->rnd); /* Use Pareto distribution to give 80/20 hot/cold values. 
*/ if (cfg->pareto) { diff --git a/bench/wtperf/wtperf.h b/bench/wtperf/wtperf.h index d2baa558af0..2845762f50b 100644 --- a/bench/wtperf/wtperf.h +++ b/bench/wtperf/wtperf.h @@ -104,13 +104,13 @@ struct __config { /* Configuration struction */ CONFIG_THREAD *ckptthreads, *popthreads; #define WORKLOAD_MAX 50 - CONFIG_THREAD *workers; /* Worker threads */ + CONFIG_THREAD *workers; /* Worker threads */ u_int workers_cnt; - WORKLOAD *workload; /* Workloads */ + WORKLOAD *workload; /* Workloads */ u_int workload_cnt; - uint32_t use_asyncops; /* Use async operations */ + uint32_t use_asyncops; /* Use async operations */ /* State tracking variables. */ uint64_t ckpt_ops; /* checkpoint operations */ @@ -191,6 +191,8 @@ typedef struct { struct __config_thread { /* Per-thread structure */ CONFIG *cfg; /* Enclosing configuration */ + uint32_t rnd[2]; /* Random number generation state */ + pthread_t handle; /* Handle */ char *key_buf, *value_buf; /* Key/value memory */ diff --git a/build_posix/configure.ac.in b/build_posix/configure.ac.in index dd5bce738ea..6352fa6d0df 100644 --- a/build_posix/configure.ac.in +++ b/build_posix/configure.ac.in @@ -9,6 +9,10 @@ AC_CONFIG_AUX_DIR([build_posix/gnu-support]) AC_CONFIG_MACRO_DIR([build_posix/aclocal]) AC_CONFIG_SRCDIR([RELEASE]) +# If CFLAGS/CXXFLAGS were not set on entry, default to "-O3 -g" +: ${CFLAGS=-O3 -g} +: ${CXXFLAGS=-O3 -g} + AM_INIT_AUTOMAKE([1.11 foreign parallel-tests subdir-objects]) m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([no])]) @@ -24,9 +28,6 @@ LT_PREREQ(2.2.6) LT_INIT([pic-only]) AC_SUBST([LIBTOOL_DEPS]) -# If CFLAGS was not set on entry, default to "-O3 -g" -: ${CFLAGS="-O3 -g"} - AC_PROG_CC(cc gcc) # AC_PROG_CXX(c++ g++) @@ -165,7 +166,7 @@ AC_CONFIG_HEADERS([wiredtiger_config.h:build_posix/config.hin]) # The LevelDB API needs some configuration knowledge AM_COND_IF([LEVELDB], - AC_CONFIG_HEADERS([api/leveldb/wiredtiger_config.h:build_posix/config.hin])) + 
AC_CONFIG_HEADERS([api/leveldb/leveldb_wt_config.h:api/leveldb/config.hin])) # BEGIN check existence -- maintained by reconf and Make.subdirs # END check existence diff --git a/dist/s_copyright.list b/dist/s_copyright.list index ca2ba425ad5..d66be5a1ba7 100644 --- a/dist/s_copyright.list +++ b/dist/s_copyright.list @@ -1,3 +1,4 @@ +skip api/leveldb/leveldb_wt_config.in skip dist/api_config.py skip dist/api_data.py skip dist/api_err.py diff --git a/dist/s_define.list b/dist/s_define.list index 593deb6e672..653ba6c6a8b 100644 --- a/dist/s_define.list +++ b/dist/s_define.list @@ -24,7 +24,6 @@ WT_HANDLE_CLOSED WT_HANDLE_NULLABLE WT_READ_BARRIER WT_REF_SIZE -WT_RET_TIMEDOUT_OK WT_SPINLOCK_MAX WT_STAT_ATOMIC_DECR WT_STAT_ATOMIC_DECRV diff --git a/dist/s_docs b/dist/s_docs index 815d27d7b11..c0c8885e1b5 100755 --- a/dist/s_docs +++ b/dist/s_docs @@ -113,7 +113,7 @@ valid_build() } classf=`ls ../docs/struct___* 2>/dev/null` for c in $classf; do - echo "$c: Need to add class to PREDEFINE in src/docs/Doxyfile" + echo "$c: Need to add class to PREDEFINED in src/docs/Doxyfile" done } diff --git a/dist/s_funcs.list b/dist/s_funcs.list index bf1d5156820..2bc87233084 100644 --- a/dist/s_funcs.list +++ b/dist/s_funcs.list @@ -14,6 +14,7 @@ __wt_cache_dump __wt_config_getone __wt_cursor_get_raw_value __wt_debug_addr +__wt_debug_addr_print __wt_debug_offset __wt_debug_set_verbose __wt_debug_tree diff --git a/dist/s_string.ok b/dist/s_string.ok index 9b1f6ffd9b1..4b381b68eed 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -512,6 +512,7 @@ fileop fileops filesize filesystem +fillms firstfit fixup flcs @@ -690,6 +691,7 @@ namespace namespaces nbits nbsp +nchunks nclr nd negint @@ -762,6 +764,7 @@ ps pse psp pthread +pushms putK putV pv diff --git a/dist/s_style b/dist/s_style index 56e4ddfc529..09b1b9460d2 100644 --- a/dist/s_style +++ b/dist/s_style @@ -6,26 +6,47 @@ trap 'rm -f $t; exit 0' 0 1 2 3 13 15 cd .. -# Returns in functions after a jump to the error label. 
-for f in `find examples ext src test -name '*.[ci]'`; do +# Turn a C file into a line per function so we can use grep on it. +file_parse() +{ sed -n \ - -e '/^{$/,/^}$/{=;p;}' $f | + -e '/^{$/,/^}$/{=;p;}' $1 | sed 'N;s/\n/:/' | sed -e '/./{H;/^[0-9][0-9]*:}$/!d;}' \ -e x \ -e 's/\n/ /g' \ -e p \ - -e '{s/.*//;x;}' | - egrep '(WT_ERR|WT_ERR_MSG|WT_ERR_NOTFOUND_OK|WT_ERR_TEST|WT_ILLEGAL_VALUE_ERR|WT_VERBOSE_ERR)\(.*(WT_ASSERT_RET|WT_ILLEGAL_VALUE|WT_RET|WT_RET_MSG|WT_RET_NOTFOUND_OK|WT_RET_TEST|WT_VERBOSE_RET|WT_VERBOSE_RETVAL)\(.*err:' | - sed 's/:.*//' > $t + -e '{s/.*//;x;}' +} + +# Returns in functions after a jump to the error label, or an infinite loop +# where there's a jump to the error label after the error label. +for f in `find bench examples ext src test -name '*.[ci]'`; do + file_parse $f | + egrep '(WT_ERR|WT_ERR_MSG|WT_ERR_NOTFOUND_OK|WT_ERR_TEST|WT_ILLEGAL_VALUE_ERR)\(.*(WT_ASSERT_RET|WT_ILLEGAL_VALUE|WT_RET|WT_RET_MSG|WT_RET_NOTFOUND_OK|WT_RET_TEST|WT_VERBOSE_RET|WT_VERBOSE_RETVAL)\(.*err:|[^a-z_]err:.*(WT_ERR|WT_ERR_MSG|WT_ERR_NOTFOUND_OK|WT_ERR_TEST|WT_ILLEGAL_VALUE_ERR)\(' | + sed 's/:.*//' > $t + test -s $t && { - echo "$f: function with return after a jump to an error label" + echo "$f: return after a jump to the error label or a jump to the error label after the error label" + sed 's/^/function @ line:/' < $t + } +done + +# Return of 0 in functions after a jump to the error label. 
+for f in `find bench examples ext src test -name '*.[ci]'`; do + file_parse $f | + egrep -v '[^a-z_]err:.*return \(ret|[^a-z_]err:.*WT_RET' | + egrep '[^a-z_]err:.*return \(0\);' | + sed 's/:.*//' > $t + + test -s $t && { + echo "$f: error label followed by a return of 0" sed 's/^/function @ line:/' < $t } done for f in \ - `find bench/wtperf examples ext src test -name '*.[chisy]' -o -name '*.in' | + `find bench examples ext src test -name '*.[chisy]' -o -name '*.in' | sed '/Makefile.in/d'`; do if grep "^[^}]*while (0);" $f > $t; then echo "$f: while (0) has trailing semi-colon" diff --git a/dist/stat_data.py b/dist/stat_data.py index 88eff5f1156..e5cf75cb59a 100644 --- a/dist/stat_data.py +++ b/dist/stat_data.py @@ -180,6 +180,13 @@ connection_stats = [ 'sleep for LSM checkpoint throttle'), Stat('lsm_merge_throttle', 'sleep for LSM merge throttle'), Stat('lsm_rows_merged', 'rows merged in an LSM tree'), + Stat('lsm_work_queue_app', 'LSM App work units currently queued', + 'no_clear,no_scale'), + Stat('lsm_work_queue_manager', 'LSM Merge work units currently queued', + 'no_clear,no_scale'), + Stat('lsm_work_queue_max', 'LSM tree queue hit maximum'), + Stat('lsm_work_queue_switch', 'LSM Switch work units currently queued', + 'no_clear,no_scale'), Stat('lsm_work_units_created', 'LSM tree maintenance operations scheduled'), Stat('lsm_work_units_discarded', diff --git a/examples/c/ex_all.c b/examples/c/ex_all.c index 8f9fba093de..ea5d26ce133 100644 --- a/examples/c/ex_all.c +++ b/examples/c/ex_all.c @@ -1055,8 +1055,8 @@ main(void) /*! [Statistics logging with a table] */ ret = wiredtiger_open(home, NULL, - "create," - "statistics_log=(sources=(\"table:table1\",\"table:table2\"))", + "create, statistics_log=(" + "sources=(\"lsm:table1\",\"lsm:table2\"), wait=5)", &conn); /*! [Statistics logging with a table] */ if (ret == 0) @@ -1064,7 +1064,7 @@ main(void) /*! 
[Statistics logging with all tables] */ ret = wiredtiger_open(home, NULL, - "create,statistics_log=(sources=(\"table:\"))", + "create, statistics_log=(sources=(\"lsm:\"), wait=5)", &conn); /*! [Statistics logging with all tables] */ if (ret == 0) diff --git a/ext/compressors/zlib/zlib_compress.c b/ext/compressors/zlib/zlib_compress.c index 33bb9bf8810..3532ecf16cd 100644 --- a/ext/compressors/zlib/zlib_compress.c +++ b/ext/compressors/zlib/zlib_compress.c @@ -225,8 +225,15 @@ zlib_compress_raw(WT_COMPRESSOR *compressor, WT_SESSION *session, * Strategy: take the available output size and compress that much * input. Continue until there is no input small enough or the * compression fails to fit. + * + * Don't let the compression ratio become insanely good (which can + * happen with synthetic workloads). Once we hit a limit, stop so that + * the in-memory size of pages isn't totally different to the on-disk + * size. Otherwise we can get into trouble where every update to a + * page results in forced eviction based on in-memory size, even though + * the data fits into a single on-disk block. */ - while (zs.avail_out > 0) { + while (zs.avail_out > 0 && zs.total_in <= zs.total_out * 20) { /* Find the slot we will try to compress up to. 
*/ if ((curr_slot = zlib_find_slot( zs.total_in + zs.avail_out, offsets, slots)) <= last_slot) diff --git a/lang/python/Makefile.am b/lang/python/Makefile.am index 0ac56138e29..03c65a57028 100644 --- a/lang/python/Makefile.am +++ b/lang/python/Makefile.am @@ -1,5 +1,5 @@ PYSRC = $(top_srcdir)/lang/python -PY_INCLUDE_DIRS = $(top_srcdir) +PYDIRS = -t $(abs_builddir) -I $(abs_top_srcdir):$(abs_top_builddir) -L $(abs_top_builddir)/.libs all-local: _wiredtiger.so # We keep generated Python sources under lang/python: that's where they live @@ -10,15 +10,19 @@ $(PYSRC)/wiredtiger_wrap.c: $(top_srcdir)/src/include/wiredtiger.in $(PYSRC)/wir mv wiredtiger.py wiredtiger/__init__.py) _wiredtiger.so: $(top_builddir)/libwiredtiger.la $(PYSRC)/wiredtiger_wrap.c - $(PYTHON) $(PYSRC)/setup.py build_ext -b . -t . -f -I $(PY_INCLUDE_DIRS) + (cd $(PYSRC) && \ + $(PYTHON) setup.py build_ext -f -b $(abs_builddir) $(PYDIRS)) install-exec-local: - $(PYTHON) $(PYSRC)/setup.py build_py -d build - $(PYTHON) $(PYSRC)/setup.py build_ext -b build -t . -f -I $(PY_INCLUDE_DIRS) - $(PYTHON) $(PYSRC)/setup.py install_lib -b build --skip-build $(PYTHON_INSTALL_ARG) + (cd $(PYSRC) && \ + $(PYTHON) setup.py build_py -d $(abs_builddir)/build && \ + $(PYTHON) setup.py build_ext -f -b $(abs_builddir)/build $(PYDIRS) && \ + $(PYTHON) setup.py install_lib -b $(abs_builddir)/build --skip-build $(PYTHON_INSTALL_ARG)) +# We build in different places for an install vs running from the tree: +# clean up both. Don't rely on "setup.py clean" -- everything that should +# be removed is created under the build directory. 
clean-local: - $(PYTHON) $(PYSRC)/setup.py clean - rm -rf _wiredtiger.so WT_TEST build wiredtiger + rm -rf build _wiredtiger.so wiredtiger_wrap.o WT_TEST TESTS = run-ex_access diff --git a/lang/python/setup.py b/lang/python/setup.py index 1c6ebc71387..1057006ce50 100644 --- a/lang/python/setup.py +++ b/lang/python/setup.py @@ -35,9 +35,7 @@ if not 'ARCHFLAGS' in os.environ: os.environ['ARCHFLAGS'] = '' # Suppress warnings building SWIG generated code -extra_cflags = [ - '-w', -] +extra_cflags = [ '-w' ] dir = os.path.dirname(__file__) @@ -50,12 +48,10 @@ wt_ver = '%d.%d' % (WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR) setup(name='wiredtiger', version=wt_ver, ext_modules=[Extension('_wiredtiger', - [os.path.join(dir, 'wiredtiger_wrap.c')], - include_dirs=['../..'], - library_dirs=['../../.libs'], + [os.path.join(dir, 'wiredtiger_wrap.c')], libraries=['wiredtiger'], extra_compile_args=extra_cflags, )], - package_dir={'' : dir}, + package_dir={'' : dir}, packages=['wiredtiger'], ) diff --git a/src/async/async_api.c b/src/async/async_api.c index 4f4958baf2a..4ae074c429a 100644 --- a/src/async/async_api.c +++ b/src/async/async_api.c @@ -102,17 +102,17 @@ err: * Find and allocate the next available async op handle. 
*/ static int -__async_new_op_alloc(WT_CONNECTION_IMPL *conn, const char *uri, +__async_new_op_alloc(WT_SESSION_IMPL *session, const char *uri, const char *config, WT_ASYNC_OP_IMPL **opp) { WT_ASYNC *async; WT_ASYNC_OP_IMPL *op; - WT_SESSION_IMPL *session; + WT_CONNECTION_IMPL *conn; uint32_t i, save_i, view; + conn = S2C(session); async = conn->async; - session = conn->default_session; - WT_STAT_FAST_CONN_INCR(conn->default_session, async_op_alloc); + WT_STAT_FAST_CONN_INCR(session, async_op_alloc); *opp = NULL; retry: @@ -154,7 +154,7 @@ retry: WT_STAT_FAST_CONN_INCR(session, async_alloc_race); goto retry; } - WT_STAT_FAST_CONN_INCRV(conn->default_session, async_alloc_view, view); + WT_STAT_FAST_CONN_INCRV(session, async_alloc_view, view); WT_RET(__async_get_format(conn, uri, config, op)); op->unique_id = WT_ATOMIC_ADD(async->op_id, 1); op->optype = WT_AOP_NONE; @@ -172,32 +172,24 @@ __async_config(WT_SESSION_IMPL *session, WT_CONNECTION_IMPL *conn, const char **cfg, int *runp) { WT_CONFIG_ITEM cval; - WT_DECL_RET; /* * The async configuration is off by default. */ - if ((ret = __wt_config_gets( - session, cfg, "async.enabled", &cval)) == 0) - *runp = cval.val != 0; - WT_RET_NOTFOUND_OK(ret); + WT_RET(__wt_config_gets(session, cfg, "async.enabled", &cval)); + *runp = cval.val != 0; /* * Even if async is turned off, we want to parse and store the * default values so that reconfigure can just enable them. 
*/ - if ((ret = __wt_config_gets( - session, cfg, "async.ops_max", &cval)) == 0) - conn->async_size = (uint32_t)cval.val; - WT_RET_NOTFOUND_OK(ret); + WT_RET(__wt_config_gets(session, cfg, "async.ops_max", &cval)); + conn->async_size = (uint32_t)cval.val; - if ((ret = __wt_config_gets( - session, cfg, "async.threads", &cval)) == 0) { - conn->async_workers = (uint32_t)cval.val; - /* Sanity check that api_data.py is in sync with async.h */ - WT_ASSERT(session, conn->async_workers <= WT_ASYNC_MAX_WORKERS); - } - WT_RET_NOTFOUND_OK(ret); + WT_RET(__wt_config_gets(session, cfg, "async.threads", &cval)); + conn->async_workers = (uint32_t)cval.val; + /* Sanity check that api_data.py is in sync with async.h */ + WT_ASSERT(session, conn->async_workers <= WT_ASYNC_MAX_WORKERS); return (0); } @@ -209,8 +201,8 @@ __async_config(WT_SESSION_IMPL *session, void __wt_async_stats_update(WT_SESSION_IMPL *session) { - WT_CONNECTION_IMPL *conn; WT_ASYNC *async; + WT_CONNECTION_IMPL *conn; WT_CONNECTION_STATS *stats; conn = S2C(session); @@ -224,27 +216,18 @@ __wt_async_stats_update(WT_SESSION_IMPL *session) } /* - * __wt_async_create -- - * Start the async subsystem and worker threads. + * __async_start -- + * Start the async subsystem. All configuration processing has + * already been done by the caller. */ -int -__wt_async_create(WT_CONNECTION_IMPL *conn, const char *cfg[]) +static int +__async_start(WT_SESSION_IMPL *session) { WT_ASYNC *async; - WT_SESSION_IMPL *session; - int run; + WT_CONNECTION_IMPL *conn; uint32_t i; - session = conn->default_session; - - /* Handle configuration. */ - run = 0; - WT_RET(__async_config(session, conn, cfg, &run)); - - /* If async is not configured, we're done. */ - if (!run) - return (0); - + conn = S2C(session); conn->async_cfg = 1; /* * Async is on, allocate the WT_ASYNC structure and initialize the ops. 
@@ -254,7 +237,7 @@ __wt_async_create(WT_CONNECTION_IMPL *conn, const char *cfg[]) STAILQ_INIT(&async->formatqh); WT_RET(__wt_spin_init(session, &async->ops_lock, "ops")); WT_RET(__wt_cond_alloc(session, "async flush", 0, &async->flush_cond)); - WT_RET(__wt_async_op_init(conn)); + WT_RET(__wt_async_op_init(session)); /* * Start up the worker threads. @@ -283,22 +266,43 @@ __wt_async_create(WT_CONNECTION_IMPL *conn, const char *cfg[]) return (0); } +/* + * __wt_async_create -- + * Start the async subsystem and worker threads. + */ +int +__wt_async_create(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_CONNECTION_IMPL *conn; + int run; + + conn = S2C(session); + + /* Handle configuration. */ + run = 0; + WT_RET(__async_config(session, conn, cfg, &run)); + + /* If async is not configured, we're done. */ + if (!run) + return (0); + return (__async_start(session)); +} + /* * __wt_async_reconfig -- * Start the async subsystem and worker threads. */ int -__wt_async_reconfig(WT_CONNECTION_IMPL *conn, const char *cfg[]) +__wt_async_reconfig(WT_SESSION_IMPL *session, const char *cfg[]) { WT_ASYNC *async; - WT_CONNECTION_IMPL tmp_conn; + WT_CONNECTION_IMPL *conn, tmp_conn; WT_DECL_RET; WT_SESSION *wt_session; - WT_SESSION_IMPL *session; int run; uint32_t i; - session = conn->default_session; + conn = S2C(session); async = conn->async; memset(&tmp_conn, 0, sizeof(tmp_conn)); tmp_conn.async_cfg = conn->async_cfg; @@ -332,13 +336,13 @@ __wt_async_reconfig(WT_CONNECTION_IMPL *conn, const char *cfg[]) */ if (conn->async_cfg > 0 && !run) { /* Case 1 */ - WT_TRET(__wt_async_flush(conn)); - ret = __wt_async_destroy(conn); + WT_TRET(__wt_async_flush(session)); + ret = __wt_async_destroy(session); conn->async_cfg = 0; return (ret); } else if (conn->async_cfg == 0 && run) /* Case 2 */ - return (__wt_async_create(conn, cfg)); + return (__async_start(session)); else if (conn->async_cfg == 0) /* Case 3 */ return (0); @@ -412,17 +416,17 @@ __wt_async_reconfig(WT_CONNECTION_IMPL *conn, 
const char *cfg[]) * Destroy the async worker threads and async subsystem. */ int -__wt_async_destroy(WT_CONNECTION_IMPL *conn) +__wt_async_destroy(WT_SESSION_IMPL *session) { WT_ASYNC *async; WT_ASYNC_FORMAT *af, *afnext; WT_ASYNC_OP *op; + WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION *wt_session; - WT_SESSION_IMPL *session; uint32_t i; - session = conn->default_session; + conn = S2C(session); async = conn->async; if (!conn->async_cfg) @@ -477,17 +481,17 @@ __wt_async_destroy(WT_CONNECTION_IMPL *conn) * Implementation of the WT_CONN->async_flush method. */ int -__wt_async_flush(WT_CONNECTION_IMPL *conn) +__wt_async_flush(WT_SESSION_IMPL *session) { WT_ASYNC *async; + WT_CONNECTION_IMPL *conn; WT_DECL_RET; - WT_SESSION_IMPL *session; + conn = S2C(session); if (!conn->async_cfg) return (0); async = conn->async; - session = conn->default_session; WT_STAT_FAST_CONN_INCR(session, async_flush); /* * We have to do several things. First we have to prevent @@ -518,13 +522,11 @@ retry: */ async->flush_count = 0; (void)WT_ATOMIC_ADD(async->flush_gen, 1); - WT_ASSERT(conn->default_session, - async->flush_op.state == WT_ASYNCOP_FREE); + WT_ASSERT(session, async->flush_op.state == WT_ASYNCOP_FREE); async->flush_op.state = WT_ASYNCOP_READY; - WT_ERR(__wt_async_op_enqueue(conn, &async->flush_op)); + WT_ERR(__wt_async_op_enqueue(session, &async->flush_op)); while (async->flush_state != WT_ASYNC_FLUSH_COMPLETE) - WT_ERR_TIMEDOUT_OK( - __wt_cond_wait(NULL, async->flush_cond, 100000)); + WT_ERR(__wt_cond_wait(NULL, async->flush_cond, 100000)); /* * Flush is done. Clear the flags. */ @@ -571,19 +573,22 @@ __async_runtime_config(WT_ASYNC_OP_IMPL *op, const char *cfg[]) * Implementation of the WT_CONN->async_new_op method. 
*/ int -__wt_async_new_op(WT_CONNECTION_IMPL *conn, const char *uri, +__wt_async_new_op(WT_SESSION_IMPL *session, const char *uri, const char *config, const char *cfg[], WT_ASYNC_CALLBACK *cb, WT_ASYNC_OP_IMPL **opp) { WT_ASYNC_OP_IMPL *op; + WT_CONNECTION_IMPL *conn; WT_DECL_RET; *opp = NULL; + + conn = S2C(session); if (!conn->async_cfg) return (ENOTSUP); op = NULL; - WT_ERR(__async_new_op_alloc(conn, uri, config, &op)); + WT_ERR(__async_new_op_alloc(session, uri, config, &op)); WT_ERR(__async_runtime_config(op, cfg)); op->cb = cb; *opp = op; diff --git a/src/async/async_op.c b/src/async/async_op.c index 1e9151e0f86..5cd05881fd9 100644 --- a/src/async/async_op.c +++ b/src/async/async_op.c @@ -91,7 +91,7 @@ static int __async_op_wrap(WT_ASYNC_OP_IMPL *op, WT_ASYNC_OPTYPE type) { op->optype = type; - return (__wt_async_op_enqueue(O2C(op), op)); + return (__wt_async_op_enqueue(O2S(op), op)); } /* @@ -254,20 +254,22 @@ __async_op_init(WT_CONNECTION_IMPL *conn, WT_ASYNC_OP_IMPL *op, uint32_t id) * Enqueue an operation onto the work queue. */ int -__wt_async_op_enqueue(WT_CONNECTION_IMPL *conn, WT_ASYNC_OP_IMPL *op) +__wt_async_op_enqueue(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op) { WT_ASYNC *async; + WT_CONNECTION_IMPL *conn; WT_DECL_RET; uint64_t cur_head, cur_tail, my_alloc, my_slot; #ifdef HAVE_DIAGNOSTIC WT_ASYNC_OP_IMPL *my_op; #endif + conn = S2C(session); async = conn->async; /* * Enqueue op at the tail of the work queue. */ - WT_ASSERT(conn->default_session, op->state == WT_ASYNCOP_READY); + WT_ASSERT(session, op->state == WT_ASYNCOP_READY); /* * We get our slot in the ring buffer to use. 
*/ @@ -287,7 +289,7 @@ __wt_async_op_enqueue(WT_CONNECTION_IMPL *conn, WT_ASYNC_OP_IMPL *op) #ifdef HAVE_DIAGNOSTIC WT_ORDERED_READ(my_op, async->async_queue[my_slot]); if (my_op != NULL) - return (__wt_panic(conn->default_session)); + return (__wt_panic(session)); #endif WT_PUBLISH(async->async_queue[my_slot], op); op->state = WT_ASYNCOP_ENQUEUED; @@ -311,14 +313,17 @@ __wt_async_op_enqueue(WT_CONNECTION_IMPL *conn, WT_ASYNC_OP_IMPL *op) * Initialize all the op handles. */ int -__wt_async_op_init(WT_CONNECTION_IMPL *conn) +__wt_async_op_init(WT_SESSION_IMPL *session) { WT_ASYNC *async; WT_ASYNC_OP_IMPL *op; + WT_CONNECTION_IMPL *conn; WT_DECL_RET; uint32_t i; + conn = S2C(session); async = conn->async; + /* * Initialize the flush op structure. */ @@ -330,13 +335,12 @@ __wt_async_op_init(WT_CONNECTION_IMPL *conn) * can never overlap the tail. Include extra for the flush op. */ async->async_qsize = conn->async_size + 2; - WT_RET(__wt_calloc_def(conn->default_session, - async->async_qsize, &async->async_queue)); + WT_RET(__wt_calloc_def( + session, async->async_qsize, &async->async_queue)); /* * Allocate and initialize all the user ops. 
*/ - WT_ERR(__wt_calloc_def(conn->default_session, - conn->async_size, &async->async_ops)); + WT_ERR(__wt_calloc_def(session, conn->async_size, &async->async_ops)); for (i = 0; i < conn->async_size; i++) { op = &async->async_ops[i]; WT_ERR(__async_op_init(conn, op, i)); @@ -344,11 +348,11 @@ __wt_async_op_init(WT_CONNECTION_IMPL *conn) return (0); err: if (async->async_ops != NULL) { - __wt_free(conn->default_session, async->async_ops); + __wt_free(session, async->async_ops); async->async_ops = NULL; } if (async->async_queue != NULL) { - __wt_free(conn->default_session, async->async_queue); + __wt_free(session, async->async_queue); async->async_queue = NULL; } return (ret); diff --git a/src/async/async_worker.c b/src/async/async_worker.c index 192af09a6e1..4c7ba6036ab 100644 --- a/src/async/async_worker.c +++ b/src/async/async_worker.c @@ -108,8 +108,7 @@ __async_flush_wait(WT_SESSION_IMPL *session, WT_ASYNC *async, uint64_t my_gen) while (async->flush_state == WT_ASYNC_FLUSHING && async->flush_gen == my_gen) - WT_ERR_TIMEDOUT_OK( - __wt_cond_wait(session, async->flush_cond, 10000)); + WT_ERR(__wt_cond_wait(session, async->flush_cond, 10000)); err: return (ret); } diff --git a/src/block/block_ckpt.c b/src/block/block_ckpt.c index 4227d2d7c03..b7ec8afff29 100644 --- a/src/block/block_ckpt.c +++ b/src/block/block_ckpt.c @@ -142,7 +142,7 @@ err: /* * allocated memory was in the service of verify, clean that up. */ if (block->verify) - WT_ERR(__wt_verify_ckpt_unload(session, block)); + WT_TRET(__wt_verify_ckpt_unload(session, block)); } /* Checkpoints don't need the original information, discard it. 
*/ diff --git a/src/block/block_session.c b/src/block/block_session.c index 17767fc815f..fa56b72f49b 100644 --- a/src/block/block_session.c +++ b/src/block/block_session.c @@ -29,7 +29,7 @@ __block_ext_alloc(WT_SESSION_IMPL *session, WT_EXT **extp) u_int skipdepth; - skipdepth = __wt_skip_choose_depth(); + skipdepth = __wt_skip_choose_depth(session); WT_RET(__wt_calloc(session, 1, sizeof(WT_EXT) + skipdepth * 2 * sizeof(WT_EXT *), &ext)); ext->depth = (uint8_t)skipdepth; diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index 8a069cc4bdf..5412286621e 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -62,7 +62,7 @@ __wt_debug_set_verbose(WT_SESSION_IMPL *session, const char *v) snprintf(buf, sizeof(buf), "verbose=[%s]", v); cfg[0] = buf; - return (__wt_conn_verbose_config(session, cfg)); + return (__wt_verbose_config(session, cfg)); } /* @@ -194,6 +194,24 @@ __dmsg(WT_DBG *ds, const char *fmt, ...) } } +/* + * __wt_debug_addr_print -- + * Print out an address. + */ +int +__wt_debug_addr_print( + WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size) +{ + WT_DECL_ITEM(buf); + + WT_RET(__wt_scr_alloc(session, 128, &buf)); + fprintf(stderr, "%s\n", + __wt_addr_string(session, addr, addr_size, buf)); + __wt_scr_free(&buf); + + return (0); +} + /* * __wt_debug_addr -- * Read and dump a disk page in debugging mode, using an addr/size pair. diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c index c402a1715d6..84f7571993b 100644 --- a/src/btree/bt_evict.c +++ b/src/btree/bt_evict.c @@ -191,8 +191,7 @@ __evict_server(void *arg) F_CLR(cache, WT_EVICT_ACTIVE); WT_ERR(__wt_verbose(session, WT_VERB_EVICTSERVER, "sleeping")); /* Don't rely on signals: check periodically. */ - WT_ERR_TIMEDOUT_OK( - __wt_cond_wait(session, cache->evict_cond, 100000)); + WT_ERR(__wt_cond_wait(session, cache->evict_cond, 100000)); WT_ERR(__wt_verbose(session, WT_VERB_EVICTSERVER, "waking")); } @@ -228,12 +227,14 @@ err: * Start the eviction server thread. 
*/ int -__wt_evict_create(WT_CONNECTION_IMPL *conn) +__wt_evict_create(WT_SESSION_IMPL *session) { - WT_SESSION_IMPL *session; + WT_CONNECTION_IMPL *conn; WT_EVICT_WORKER *workers; u_int i; + conn = S2C(session); + /* Set first, the thread might run before we finish up. */ F_SET(conn, WT_CONN_EVICTION_RUN); @@ -288,17 +289,17 @@ __wt_evict_create(WT_CONNECTION_IMPL *conn) * Destroy the eviction server thread. */ int -__wt_evict_destroy(WT_CONNECTION_IMPL *conn) +__wt_evict_destroy(WT_SESSION_IMPL *session) { WT_CACHE *cache; + WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_EVICT_WORKER *workers; WT_SESSION *wt_session; - WT_SESSION_IMPL *session; u_int i; + conn = S2C(session); cache = conn->cache; - session = conn->default_session; workers = conn->evict_workctx; F_CLR(conn, WT_CONN_EVICTION_RUN); diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c index 1816167d5c3..10366e91a0e 100644 --- a/src/btree/bt_slvg.c +++ b/src/btree/bt_slvg.c @@ -115,7 +115,7 @@ struct __wt_track { static int __slvg_cleanup(WT_SESSION_IMPL *, WT_STUFF *); static int __slvg_col_build_internal(WT_SESSION_IMPL *, uint32_t, WT_STUFF *); static int __slvg_col_build_leaf(WT_SESSION_IMPL *, WT_TRACK *, WT_REF *); -static int __slvg_col_merge_ovfl( +static int __slvg_col_ovfl( WT_SESSION_IMPL *, WT_TRACK *, WT_PAGE *, uint64_t, uint64_t); static int __slvg_col_range(WT_SESSION_IMPL *, WT_STUFF *); static int __slvg_col_range_missing(WT_SESSION_IMPL *, WT_STUFF *); @@ -126,13 +126,13 @@ static int __slvg_merge_block_free(WT_SESSION_IMPL *, WT_STUFF *); static int __slvg_ovfl_compare(const void *, const void *); static int __slvg_ovfl_discard(WT_SESSION_IMPL *, WT_STUFF *); static int __slvg_ovfl_reconcile(WT_SESSION_IMPL *, WT_STUFF *); -static int __slvg_ovfl_ref(WT_SESSION_IMPL *, WT_TRACK *); +static int __slvg_ovfl_ref(WT_SESSION_IMPL *, WT_TRACK *, int); static int __slvg_ovfl_ref_all(WT_SESSION_IMPL *, WT_TRACK *); static int __slvg_read(WT_SESSION_IMPL *, WT_STUFF *); static int 
__slvg_row_build_internal(WT_SESSION_IMPL *, uint32_t, WT_STUFF *); static int __slvg_row_build_leaf( WT_SESSION_IMPL *, WT_TRACK *, WT_REF *, WT_STUFF *); -static int __slvg_row_merge_ovfl( +static int __slvg_row_ovfl( WT_SESSION_IMPL *, WT_TRACK *, WT_PAGE *, uint32_t, uint32_t); static int __slvg_row_range(WT_SESSION_IMPL *, WT_STUFF *); static int __slvg_row_range_overlap( @@ -203,10 +203,37 @@ __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) /* * Step 3: - * Review the relationships between the pages and the overflow items. + * Discard any page referencing a non-existent overflow page. We do + * this before checking overlapping key ranges on the grounds that a + * bad key range we can use is better than a terrific key range that + * references pages we don't have. On the other hand, we subsequently + * discard key ranges where there are better overlapping ranges, and + * it would be better if we let the availability of an overflow value + * inform our choices as to the key ranges we select, ideally on a + * per-key basis. + * + * A complicating problem is found in variable-length column-store + * objects, where we potentially split key ranges within RLE units. + * For example, if there's a page with rows 15-20 and we later find + * row 17 with a larger LSN, the range splits into 3 chunks, 15-16, + * 17, and 18-20. If rows 15-20 were originally a single value (an + * RLE of 6), and that record is an overflow record, we end up with + * two chunks, both of which want to reference the same overflow value. + * + * Instead of the approach just described, we're first discarding any + * pages referencing non-existent overflow pages, then we're reviewing + * our key ranges and discarding any that overlap. 
We're doing it that + * way for a few reasons: absent corruption, missing overflow items are + * strong arguments the page was replaced (on the other hand, some kind + * of file corruption is probably why we're here); it's a significant + * amount of additional complexity to simultaneously juggle overlapping + * ranges and missing overflow items; finally, real-world applications + * usually don't have a lot of overflow items, as WiredTiger supports + * very large page sizes, overflow items shouldn't be common. * * Step 4: - * Add unreferenced overflow page blocks to the free list. + * Add unreferenced overflow page blocks to the free list so they are + * reused immediately. */ if (ss->ovfl_next != 0) { WT_ERR(__slvg_ovfl_reconcile(session, ss)); @@ -1239,7 +1266,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) /* Set the referenced flag on overflow pages we're using. */ if (page->type == WT_PAGE_COL_VAR && trk->trk_ovfl_cnt != 0) - WT_ERR(__slvg_col_merge_ovfl(session, trk, page, skip, take)); + WT_ERR(__slvg_col_ovfl(session, trk, page, skip, take)); /* * If we're missing some part of the range, the real start range is in @@ -1293,12 +1320,12 @@ err: WT_TRET(__wt_page_release(session, ref, 0)); } /* - * __slvg_col_merge_ovfl_single -- + * __slvg_col_ovfl_single -- * Find a single overflow record in the merge page's list, and mark it as * referenced. 
*/ static int -__slvg_col_merge_ovfl_single( +__slvg_col_ovfl_single( WT_SESSION_IMPL *session, WT_TRACK *trk, WT_CELL_UNPACK *unpack) { WT_TRACK *ovfl; @@ -1312,7 +1339,7 @@ __slvg_col_merge_ovfl_single( ovfl = trk->ss->ovfl[trk->trk_ovfl_slot[i]]; if (unpack->size == ovfl->trk_addr_size && memcmp(unpack->data, ovfl->trk_addr, unpack->size) == 0) - return (__slvg_ovfl_ref(session, ovfl)); + return (__slvg_ovfl_ref(session, ovfl, 0)); } WT_PANIC_RET(session, @@ -1320,16 +1347,17 @@ __slvg_col_merge_ovfl_single( } /* - * __slvg_col_merge_ovfl -- + * __slvg_col_ovfl -- * Mark overflow items referenced by the merged page. */ static int -__slvg_col_merge_ovfl(WT_SESSION_IMPL *session, +__slvg_col_ovfl(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_PAGE *page, uint64_t skip, uint64_t take) { WT_CELL_UNPACK unpack; WT_CELL *cell; WT_COL *cip; + WT_DECL_RET; uint64_t recno, start, stop; uint32_t i; @@ -1359,9 +1387,28 @@ __slvg_col_merge_ovfl(WT_SESSION_IMPL *session, * because stop is the last record wanted, if the record number * equals stop, we want the next record. */ - if (recno > start && unpack.type == WT_CELL_VALUE_OVFL) - WT_RET(__slvg_col_merge_ovfl_single( - session, trk, &unpack)); + if (recno > start && unpack.type == WT_CELL_VALUE_OVFL) { + ret = __slvg_col_ovfl_single(session, trk, &unpack); + + /* + * When handling overlapping ranges on variable-length + * column-store leaf pages, we split ranges without + * considering if we were splitting RLE units. (See + * note at the beginning of this file for explanation + * of the overall process.) If the RLE unit was on-page, + * we can simply write it again. If the RLE unit was an + * overflow value that's already been used by another + * row (from some other page created by a range split), + * there's not much to do, this row can't reference an + * overflow record we don't have: delete the row. 
+ */ + if (ret == EBUSY) { + __wt_cell_type_reset(session, + cell, WT_CELL_VALUE_OVFL, WT_CELL_DEL); + ret = 0; + } + WT_RET(ret); + } if (recno > stop) break; } @@ -1936,7 +1983,7 @@ __slvg_row_build_leaf( /* Set the referenced flag on overflow pages we're using. */ if (trk->trk_ovfl_cnt != 0) - WT_ERR(__slvg_row_merge_ovfl(session, + WT_ERR(__slvg_row_ovfl(session, trk, page, skip_start, page->pg_row_entries - skip_stop)); /* @@ -1984,13 +2031,12 @@ err: WT_TRET(__wt_page_release(session, ref, 0)); } /* - * __slvg_row_merge_ovfl_single -- + * __slvg_row_ovfl_single -- * Find a single overflow record in the merge page's list, and mark it as * referenced. */ static int -__slvg_row_merge_ovfl_single( - WT_SESSION_IMPL *session, WT_TRACK *trk, WT_CELL *cell) +__slvg_row_ovfl_single(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_CELL *cell) { WT_CELL_UNPACK unpack; WT_TRACK *ovfl; @@ -2010,7 +2056,7 @@ __slvg_row_merge_ovfl_single( ovfl = trk->ss->ovfl[trk->trk_ovfl_slot[i]]; if (unpack.size == ovfl->trk_addr_size && memcmp(unpack.data, ovfl->trk_addr, unpack.size) == 0) - return (__slvg_ovfl_ref(session, ovfl)); + return (__slvg_ovfl_ref(session, ovfl, 1)); } WT_PANIC_RET(session, @@ -2018,11 +2064,11 @@ __slvg_row_merge_ovfl_single( } /* - * __slvg_row_merge_ovfl -- + * __slvg_row_ovfl -- * Mark overflow items referenced by the merged page. 
*/ static int -__slvg_row_merge_ovfl(WT_SESSION_IMPL *session, +__slvg_row_ovfl(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_PAGE *page, uint32_t start, uint32_t stop) { WT_CELL *cell; @@ -2038,12 +2084,10 @@ __slvg_row_merge_ovfl(WT_SESSION_IMPL *session, (void)__wt_row_leaf_key_info( page, copy, NULL, &cell, NULL, NULL); if (cell != NULL) - WT_RET( - __slvg_row_merge_ovfl_single(session, trk, cell)); + WT_RET(__slvg_row_ovfl_single(session, trk, cell)); cell = __wt_row_leaf_value_cell(page, rip, NULL); if (cell != NULL) - WT_RET( - __slvg_row_merge_ovfl_single(session, trk, cell)); + WT_RET(__slvg_row_ovfl_single(session, trk, cell)); } return (0); } @@ -2113,18 +2157,6 @@ __slvg_ovfl_reconcile(WT_SESSION_IMPL *session, WT_STUFF *ss) slot = NULL; /* - * Discard any page referencing a non-existent overflow page. We do - * this before checking overlapping key ranges on the grounds that a - * bad key range we can use is better than a terrific key range that - * references pages we don't have. - * - * An alternative would be to discard only the on-page item referencing - * the missing overflow item. We're not doing that because: (1) absent - * corruption, a missing overflow item is a strong argument the page was - * replaced (but admittedly, corruption is probably why we're here); (2) - * it's a lot of work, and as WiredTiger supports very large page sizes, - * overflow items simply shouldn't be common. - * * If an overflow page is referenced more than once, discard leaf pages * with the lowest LSNs until overflow pages are only referenced once. * @@ -2314,12 +2346,15 @@ __slvg_merge_block_free(WT_SESSION_IMPL *session, WT_STUFF *ss) * Reference an overflow page, checking for multiple references. 
*/ static int -__slvg_ovfl_ref(WT_SESSION_IMPL *session, WT_TRACK *trk) +__slvg_ovfl_ref(WT_SESSION_IMPL *session, WT_TRACK *trk, int multi_panic) { - if (F_ISSET(trk, WT_TRACK_OVFL_REFD)) + if (F_ISSET(trk, WT_TRACK_OVFL_REFD)) { + if (!multi_panic) + return (EBUSY); WT_PANIC_RET(session, EINVAL, - "overflow record at row-store page merge referenced " - "multiple times"); + "overflow record unexpectedly referenced multiple times " + "during leaf page merge"); + } F_SET(trk, WT_TRACK_OVFL_REFD); return (0); @@ -2336,7 +2371,7 @@ __slvg_ovfl_ref_all(WT_SESSION_IMPL *session, WT_TRACK *trk) for (i = 0; i < trk->trk_ovfl_cnt; ++i) WT_RET(__slvg_ovfl_ref( - session, trk->ss->ovfl[trk->trk_ovfl_slot[i]])); + session, trk->ss->ovfl[trk->trk_ovfl_slot[i]], 1)); return (0); } diff --git a/src/btree/col_modify.c b/src/btree/col_modify.c index e2e3adbd714..3a4a2a2987d 100644 --- a/src/btree/col_modify.c +++ b/src/btree/col_modify.c @@ -121,7 +121,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, ins_head = *ins_headp; /* Choose a skiplist depth for this insert. */ - skipdepth = __wt_skip_choose_depth(); + skipdepth = __wt_skip_choose_depth(session); /* * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c index d367157b400..e4083e2282f 100644 --- a/src/btree/col_srch.c +++ b/src/btree/col_srch.c @@ -22,7 +22,7 @@ __wt_col_search(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head; WT_PAGE *page; WT_PAGE_INDEX *pindex; - WT_REF *child, *parent; + WT_REF *current, *descent; uint32_t base, indx, limit; int depth; @@ -35,68 +35,66 @@ __wt_col_search(WT_SESSION_IMPL *session, * page, not a full tree. */ if (leaf != NULL) { - child = leaf; + current = leaf; goto leaf_only; } /* Search the internal pages of the tree. 
*/ - parent = child = &btree->root; + current = &btree->root; for (depth = 2;; ++depth) { -restart: page = parent->page; +restart: page = current->page; if (page->type != WT_PAGE_COL_INT) break; - WT_ASSERT(session, parent->key.recno == page->pg_intl_recno); + WT_ASSERT(session, current->key.recno == page->pg_intl_recno); pindex = WT_INTL_INDEX_COPY(page); base = pindex->entries; - child = pindex->index[base - 1]; + descent = pindex->index[base - 1]; /* Fast path appends. */ - if (recno >= child->key.recno) + if (recno >= descent->key.recno) goto descend; /* Binary search of internal pages. */ for (base = 0, limit = pindex->entries - 1; limit != 0; limit >>= 1) { indx = base + (limit >> 1); - child = pindex->index[indx]; + descent = pindex->index[indx]; - if (recno == child->key.recno) + if (recno == descent->key.recno) break; - if (recno < child->key.recno) + if (recno < descent->key.recno) continue; base = indx + 1; --limit; } -descend: WT_ASSERT(session, child != NULL); - - /* +descend: /* * Reference the slot used for next step down the tree. * * Base is the smallest index greater than recno and may be the * (last + 1) index. The slot for descent is the one before * base. */ - if (recno != child->key.recno) { + if (recno != descent->key.recno) { /* * We don't have to correct for base == 0 because the * only way for base to be 0 is if recno is the page's * starting recno. */ WT_ASSERT(session, base > 0); - child = pindex->index[base - 1]; + descent = pindex->index[base - 1]; } /* - * Swap the parent page for the child page. If the page splits - * while we're retrieving it, restart the search in the parent + * Swap the current page for the child page. If the page splits + * while we're retrieving it, restart the search in the current * page; otherwise return on error, the swap call ensures we're * holding nothing on failure. 
*/ - switch (ret = __wt_page_swap(session, parent, child, 0)) { + switch (ret = __wt_page_swap(session, current, descent, 0)) { case 0: - parent = child; + current = descent; break; case WT_RESTART: goto restart; @@ -110,8 +108,8 @@ descend: WT_ASSERT(session, child != NULL); btree->maximum_depth = depth; leaf_only: - page = child->page; - cbt->ref = child; + page = current->page; + cbt->ref = current; cbt->recno = recno; cbt->compare = 0; diff --git a/src/btree/rec_track.c b/src/btree/rec_track.c index a4ef0aaa100..a02e6c32ec7 100644 --- a/src/btree/rec_track.c +++ b/src/btree/rec_track.c @@ -495,7 +495,7 @@ __wt_ovfl_reuse_add(WT_SESSION_IMPL *session, WT_PAGE *page, head = page->modify->ovfl_track->ovfl_reuse; /* Choose a skiplist depth for this insert. */ - skipdepth = __wt_skip_choose_depth(); + skipdepth = __wt_skip_choose_depth(session); /* * Allocate the WT_OVFL_REUSE structure, next pointers for the skip @@ -783,7 +783,7 @@ __wt_ovfl_txnc_add(WT_SESSION_IMPL *session, WT_PAGE *page, head = page->modify->ovfl_track->ovfl_txnc; /* Choose a skiplist depth for this insert. */ - skipdepth = __wt_skip_choose_depth(); + skipdepth = __wt_skip_choose_depth(session); /* * Allocate the WT_OVFL_TXNC structure, next pointers for the skip @@ -807,7 +807,7 @@ __wt_ovfl_txnc_add(WT_SESSION_IMPL *session, WT_PAGE *page, txnc->value_offset = WT_PTRDIFF32(p, txnc); txnc->value_size = WT_STORE_SIZE(value_size); memcpy(p, value, value_size); - txnc->current = __wt_txn_current_id(session); + txnc->current = __wt_txn_new_id(session); __wt_cache_page_inmem_incr(session, page, WT_OVFL_SIZE(WT_OVFL_TXNC) + addr_size + value_size); diff --git a/src/btree/rec_write.c b/src/btree/rec_write.c index d5564247afa..ca14b4d4db9 100644 --- a/src/btree/rec_write.c +++ b/src/btree/rec_write.c @@ -529,6 +529,52 @@ err: __wt_page_out(session, &next); return (ret); } +/* + * __rec_raw_compression_config -- + * Configure raw compression. 
+ */ +static inline int +__rec_raw_compression_config( + WT_SESSION_IMPL *session, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage) +{ + WT_BTREE *btree; + + btree = S2BT(session); + + /* Check if raw compression configured. */ + if (btree->compressor == NULL || + btree->compressor->compress_raw == NULL) + return (0); + + /* Only for row-store and variable-length column-store objects. */ + if (page->type == WT_PAGE_COL_FIX) + return (0); + + /* + * Raw compression cannot support dictionary compression. (Technically, + * we could still use the raw callback on column-store variable length + * internal pages with dictionary compression configured, because + * dictionary compression only applies to column-store leaf pages, but + * that seems an unlikely use case.) + */ + if (btree->dictionary != 0) + return (0); + + /* Raw compression cannot support prefix compression. */ + if (btree->prefix_compression != 0) + return (0); + + /* + * Raw compression is also turned off during salvage: we can't allow + * pages to split during salvage, raw compression has no point if it + * can't manipulate the page size. + */ + if (salvage != NULL) + return (0); + + return (1); +} + /* * __rec_write_init -- * Initialize the reconciliation structure. @@ -566,27 +612,9 @@ __rec_write_init(WT_SESSION_IMPL *session, /* Track if the page can be marked clean. */ r->leave_dirty = 0; - /* - * Raw compression, the application builds disk images: applicable only - * to row-and variable-length column-store objects. Dictionary and - * prefix compression must be turned off or we ignore raw-compression, - * raw compression can't support either one. (Technically, we could - * still use the raw callback on column-store variable length internal - * pages with dictionary compression configured, because dictionary - * compression only applies to column-store leaf pages, but that seems - * an unlikely use case.) 
- * - * Raw compression is also turned off during salvage: we can't allow - * pages to split during salvage, raw compression has no point if it - * can't manipulate the page size. - */ + /* Raw compression. */ r->raw_compression = - btree->compressor != NULL && - btree->compressor->compress_raw != NULL && - page->type != WT_PAGE_COL_FIX && - btree->dictionary == 0 && - btree->prefix_compression == 0 && - salvage == NULL; + __rec_raw_compression_config(session, page, salvage); r->raw_destination.flags = WT_ITEM_ALIGNED; /* Track overflow items. */ @@ -1951,6 +1979,30 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r) return (0); } +/* + * __rec_skipped_update_chk -- + * Return if a skipped update makes this a waste of time. + */ +static inline int +__rec_skipped_update_chk(WT_SESSION_IMPL *session, WT_RECONCILE *r) +{ + /* + * If we're doing an eviction, and we skipped an update, it only pays + * off to continue if we're writing multiple blocks, that is, we'll be + * able to evict something. This should be unlikely (why did eviction + * choose a recently written, small block), but it's possible. Our + * caller is responsible for calling us at the right moment, when all + * of the rows have been reviewed and we're about to finalize a write. + */ + if (F_ISSET(r, WT_SKIP_UPDATE_RESTORE) && + r->bnd_next == 0 && r->leave_dirty) { + WT_STAT_FAST_CONN_INCR(session, rec_skipped_update); + WT_STAT_FAST_DATA_INCR(session, rec_skipped_update); + return (EBUSY); + } + return (0); +} + /* * __rec_split_raw_worker -- * Handle the raw compression page reconciliation bookkeeping. @@ -2310,22 +2362,13 @@ no_slots: return (0); } + /* Check if a skipped update makes this a waste of time. */ + if (last_block) + WT_RET (__rec_skipped_update_chk(session, r)); + /* We have a block, update the boundary counter. 
*/ ++r->bnd_next; - /* - * If we're doing an eviction, and we skipped an update, it only pays - * off to continue if we're writing multiple blocks, that is, we'll be - * able to evict something. This should be unlikely (why did eviction - * choose a recently written, small block), but it's possible. - */ - if (r->bnd_next == 1 && last_block && - F_ISSET(r, WT_SKIP_UPDATE_RESTORE) && r->leave_dirty) { - WT_STAT_FAST_CONN_INCR(session, rec_skipped_update); - WT_STAT_FAST_DATA_INCR(session, rec_skipped_update); - return (EBUSY); - } - /* * If we are writing the whole page in our first/only attempt, it might * be a checkpoint (checkpoints are only a single page, by definition). @@ -2430,18 +2473,8 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) WT_ILLEGAL_VALUE(session); } - /* - * If we're doing an eviction, and we skipped an update, it only pays - * off to continue if we're writing multiple blocks, that is, we'll be - * able to evict something. This should be unlikely (why did eviction - * choose a recently written, small block), but it's possible. - */ - if (F_ISSET(r, WT_SKIP_UPDATE_RESTORE) && - r->bnd_next == 0 && r->leave_dirty) { - WT_STAT_FAST_CONN_INCR(session, rec_skipped_update); - WT_STAT_FAST_DATA_INCR(session, rec_skipped_update); - return (EBUSY); - } + /* Check if a skipped update makes this a waste of time. */ + WT_RET (__rec_skipped_update_chk(session, r)); /* * We only arrive here with no entries to write if the page was entirely @@ -2476,6 +2509,10 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) static inline int __rec_split_finish_raw(WT_SESSION_IMPL *session, WT_RECONCILE *r) { + /* Check if a skipped update makes this a waste of time. 
*/ + if (r->entries == 0) + WT_RET (__rec_skipped_update_chk(session, r)); + while (r->entries != 0) WT_RET(__rec_split_raw_worker(session, r, 1)); return (0); @@ -3831,7 +3868,7 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_REF *ref; size_t size; u_int vtype; - int hazard, onpage_ovfl, ovfl_key, state; + int hazard, key_onpage_ovfl, ovfl_key, state; const void *p; btree = S2BT(session); @@ -3879,11 +3916,12 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) ikey = __wt_ref_key_instantiated(ref); if (ikey == NULL || ikey->cell_offset == 0) { cell = NULL; - onpage_ovfl = 0; + key_onpage_ovfl = 0; } else { cell = WT_PAGE_REF_OFFSET(page, ikey->cell_offset); __wt_cell_unpack(cell, kpack); - onpage_ovfl = kpack->ovfl == 1 ? 1 : 0; + key_onpage_ovfl = + kpack->ovfl && kpack->raw != WT_CELL_KEY_OVFL_RM; } WT_ERR(__rec_child_modify(session, r, ref, &hazard, &state)); @@ -3900,7 +3938,7 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) * always instantiated. Don't worry about reuse, * reusing this key in this reconciliation is unlikely. */ - if (onpage_ovfl && kpack->raw != WT_CELL_KEY_OVFL_RM) + if (key_onpage_ovfl) WT_ERR(__wt_ovfl_discard_add( session, page, kpack->cell)); CHILD_RELEASE_ERR(session, hazard, ref); @@ -3926,8 +3964,7 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) * worry about reuse, reusing this key in this * reconciliation is unlikely. */ - if (onpage_ovfl && - kpack->raw != WT_CELL_KEY_OVFL_RM) + if (key_onpage_ovfl) WT_ERR(__wt_ovfl_discard_add( session, page, kpack->cell)); CHILD_RELEASE_ERR(session, hazard, ref); @@ -3942,8 +3979,7 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) * worry about reuse, reusing this key in this * reconciliation is unlikely. 
*/ - if (onpage_ovfl && - kpack->raw != WT_CELL_KEY_OVFL_RM) + if (key_onpage_ovfl) WT_ERR(__wt_ovfl_discard_add( session, page, kpack->cell)); @@ -3982,19 +4018,11 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) __rec_cell_build_addr(r, p, size, vtype, 0); CHILD_RELEASE_ERR(session, hazard, ref); - /* - * If the key is an overflow key, check to see if the backing - * blocks have been freed; in that case, we have to build a new - * key. - */ - if (onpage_ovfl && kpack->raw == WT_CELL_KEY_OVFL_RM) - onpage_ovfl = 0; - /* * Build key cell. * Truncate any 0th key, internal pages don't need 0th keys. */ - if (onpage_ovfl) { + if (key_onpage_ovfl) { key->buf.data = cell; key->buf.size = __wt_cell_total_len(kpack); key->cell_len = 0; @@ -4020,10 +4048,10 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) * case, we have to build the actual key now because we * are about to promote it. */ - if (onpage_ovfl) { + if (key_onpage_ovfl) { WT_ERR(__wt_buf_set(session, r->cur, WT_IKEY_DATA(ikey), ikey->size)); - onpage_ovfl = 0; + key_onpage_ovfl = 0; } WT_ERR(__rec_split(session, r)); } @@ -5395,7 +5423,7 @@ __rec_dictionary_init(WT_SESSION_IMPL *session, WT_RECONCILE *r, u_int slots) WT_RET(__wt_calloc(session, r->dictionary_slots, sizeof(WT_DICTIONARY *), &r->dictionary)); for (i = 0; i < r->dictionary_slots; ++i) { - depth = __wt_skip_choose_depth(); + depth = __wt_skip_choose_depth(session); WT_RET(__wt_calloc(session, 1, sizeof(WT_DICTIONARY) + depth * sizeof(WT_DICTIONARY *), &r->dictionary[i])); diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c index 03772e317b4..a87a93b05a1 100644 --- a/src/btree/row_modify.c +++ b/src/btree/row_modify.c @@ -118,7 +118,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, ins_head = *ins_headp; /* Choose a skiplist depth for this insert. 
*/ - skipdepth = __wt_skip_choose_depth(); + skipdepth = __wt_skip_choose_depth(session); /* * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c index 7e7bc788b4d..b190aaaded5 100644 --- a/src/btree/row_srch.c +++ b/src/btree/row_srch.c @@ -144,7 +144,7 @@ __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *item; WT_PAGE *page; WT_PAGE_INDEX *pindex; - WT_REF *child, *parent; + WT_REF *current, *descent; WT_ROW *rip; size_t match, skiphigh, skiplow; uint32_t base, indx, limit; @@ -182,15 +182,15 @@ __wt_row_search(WT_SESSION_IMPL *session, * page, not a full tree. */ if (leaf != NULL) { - child = leaf; + current = leaf; goto leaf_only; } /* Search the internal pages of the tree. */ cmp = -1; - parent = child = &btree->root; + current = &btree->root; for (depth = 2;; ++depth) { -restart: page = parent->page; +restart: page = current->page; if (page->type != WT_PAGE_ROW_INT) break; @@ -201,14 +201,14 @@ restart: page = parent->page; * the root page in new trees. */ if (pindex->entries == 1) { - child = pindex->index[0]; + descent = pindex->index[0]; goto descend; } /* Fast-path appends. 
*/ if (append_check) { - child = pindex->index[pindex->entries - 1]; - __wt_ref_key(page, child, &item->data, &item->size); + descent = pindex->index[pindex->entries - 1]; + __wt_ref_key(page, descent, &item->data, &item->size); WT_ERR(__wt_compare( session, collator, srch_key, item, &cmp)); if (cmp >= 0) @@ -240,9 +240,9 @@ restart: page = parent->page; if (collator == NULL) for (; limit != 0; limit >>= 1) { indx = base + (limit >> 1); - child = pindex->index[indx]; + descent = pindex->index[indx]; __wt_ref_key( - page, child, &item->data, &item->size); + page, descent, &item->data, &item->size); match = WT_MIN(skiplow, skiphigh); cmp = __wt_lex_compare_skip( @@ -259,9 +259,9 @@ restart: page = parent->page; else for (; limit != 0; limit >>= 1) { indx = base + (limit >> 1); - child = pindex->index[indx]; + descent = pindex->index[indx]; __wt_ref_key( - page, child, &item->data, &item->size); + page, descent, &item->data, &item->size); WT_ERR(__wt_compare( session, collator, srch_key, item, &cmp)); @@ -273,11 +273,11 @@ restart: page = parent->page; } /* - * Set the slot to descend the tree: child is already set if + * Set the slot to descend the tree: descent is already set if * there was an exact match on the page, otherwise, base is * the smallest index greater than key, possibly (last + 1). */ - child = pindex->index[base - 1]; + descent = pindex->index[base - 1]; /* * If we end up somewhere other than the last slot, it's not a @@ -287,14 +287,14 @@ restart: page = parent->page; descend_right = 0; descend: /* - * Swap the parent page for the child page. If the page splits - * while we're retrieving it, restart the search in the parent + * Swap the current page for the child page. If the page splits + * while we're retrieving it, restart the search in the current * page; otherwise return on error, the swap call ensures we're * holding nothing on failure. 
*/ - switch (ret = __wt_page_swap(session, parent, child, 0)) { + switch (ret = __wt_page_swap(session, current, descent, 0)) { case 0: - parent = child; + current = descent; break; case WT_RESTART: skiphigh = skiplow = 0; @@ -309,8 +309,8 @@ descend: /* btree->maximum_depth = depth; leaf_only: - page = child->page; - cbt->ref = child; + page = current->page; + cbt->ref = current; /* * In the case of a right-side tree descent during an insert, do a fast @@ -456,7 +456,7 @@ leaf_match: cbt->compare = 0; return (0); err: if (leaf != NULL) - WT_TRET(__wt_page_release(session, child, 0)); + WT_TRET(__wt_page_release(session, current, 0)); return (ret); } @@ -472,7 +472,7 @@ __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_INSERT *p, *t; WT_PAGE *page; WT_PAGE_INDEX *pindex; - WT_REF *child, *parent; + WT_REF *current, *descent; btree = S2BT(session); @@ -480,21 +480,22 @@ __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) restart: /* Walk the internal pages of the tree. */ - parent = child = &btree->root; + current = &btree->root; for (;;) { - page = parent->page; + page = current->page; if (page->type != WT_PAGE_ROW_INT) break; pindex = WT_INTL_INDEX_COPY(page); - child = pindex->index[__wt_random() % pindex->entries]; + descent = pindex->index[ + __wt_random(session->rnd) % pindex->entries]; /* * Swap the parent page for the child page; return on error, * the swap function ensures we're holding nothing on failure. */ - if ((ret = __wt_page_swap(session, parent, child, 0)) == 0) { - parent = child; + if ((ret = __wt_page_swap(session, current, descent, 0)) == 0) { + current = descent; continue; } /* @@ -503,7 +504,7 @@ restart: * it and restart the search from the top of the tree. 
*/ if (ret == WT_RESTART && - (ret = __wt_page_release(session, parent, 0)) == 0) + (ret = __wt_page_release(session, current, 0)) == 0) goto restart; return (ret); } @@ -517,11 +518,11 @@ restart: * or a tree with just one big page, that's not going to work, * check for that. */ - cbt->ref = child; + cbt->ref = current; cbt->compare = 0; pindex = WT_INTL_INDEX_COPY(btree->root.page); cbt->slot = pindex->entries < 2 ? - __wt_random() % page->pg_row_entries : 0; + __wt_random(session->rnd) % page->pg_row_entries : 0; return (__wt_row_leaf_key(session, page, page->pg_row_d + cbt->slot, &cbt->search_key, 0)); @@ -541,12 +542,12 @@ restart: break; t = WT_SKIP_NEXT(t); } - cbt->ref = child; + cbt->ref = current; cbt->compare = 0; cbt->ins = t; return (0); -err: WT_TRET(__wt_page_release(session, child, 0)); +err: WT_TRET(__wt_page_release(session, current, 0)); return (ret); } diff --git a/src/config/config_collapse.c b/src/config/config_collapse.c index 8d245680f69..2c469c25996 100644 --- a/src/config/config_collapse.c +++ b/src/config/config_collapse.c @@ -9,9 +9,24 @@ /* * __wt_config_collapse -- - * Given a NULL-terminated list of configuration strings, where the first - * one contains all the defaults, collapse them into newly allocated - * memory. + * Collapse a set of configuration strings into newly allocated memory. + * + * This function takes a NULL-terminated list of configuration strings (where + * the first one contains all the defaults and the values are in order from + * least to most preferred, that is, the default values are least preferred), + * and collapses them into newly allocated memory. The algorithm is to walk + * the first of the configuration strings, and for each entry, search all of + * the configuration strings for a final value, keeping the last value found. + * + * Notes: + * Any key not appearing in the first configuration string is discarded + * from the final result, because we'll never search for it. 
+ * + * Nested structures aren't parsed. For example, imagine a configuration + * string contains "key=(k2=v2,k3=v3)", and a subsequent string has + * "key=(k4=v4)", the result will be "key=(k4=v4)", as we search for and + * use the final value of "key", regardless of field overlap or missing + * fields in the nested value. */ int __wt_config_collapse( @@ -60,3 +75,306 @@ __wt_config_collapse( err: __wt_scr_free(&tmp); return (ret); } + +/* + * We need a character that can't appear in a key as a separator. + */ +#undef SEP /* separator key, character */ +#define SEP "[" +#undef SEPC +#define SEPC '[' + +/* + * Individual configuration entries, including a generation number used to make + * the qsort stable. + */ +typedef struct { + char *k, *v; /* key, value */ + size_t gen; /* generation */ +} WT_CONFIG_MERGE_ENTRY; + +/* + * The array of configuration entries. + */ +typedef struct { + size_t entries_allocated; /* allocated */ + size_t entries_next; /* next slot */ + + WT_CONFIG_MERGE_ENTRY *entries; /* array of entries */ +} WT_CONFIG_MERGE; + +/* + * __config_merge_scan -- + * Walk a configuration string, inserting entries into the merged array. + */ +static int +__config_merge_scan(WT_SESSION_IMPL *session, + const char *key, const char *value, WT_CONFIG_MERGE *cp) +{ + WT_CONFIG cparser; + WT_CONFIG_ITEM k, v; + WT_DECL_ITEM(kb); + WT_DECL_ITEM(vb); + WT_DECL_RET; + size_t len; + + WT_ERR(__wt_scr_alloc(session, 0, &kb)); + WT_ERR(__wt_scr_alloc(session, 0, &vb)); + + WT_ERR(__wt_config_init(session, &cparser, value)); + while ((ret = __wt_config_next(&cparser, &k, &v)) == 0) { + if (k.type != WT_CONFIG_ITEM_STRING && + k.type != WT_CONFIG_ITEM_ID) + WT_ERR_MSG(session, EINVAL, + "Invalid configuration key found: '%s'\n", k.str); + + /* Include the quotes around string keys/values. */ + if (k.type == WT_CONFIG_ITEM_STRING) { + --k.str; + k.len += 2; + } + if (v.type == WT_CONFIG_ITEM_STRING) { + --v.str; + v.len += 2; + } + + /* + * !!! 
+ * We're using a JSON quote character to separate the names we + * create for nested structures. That's not completely safe as + * it's possible to quote characters in JSON such that a quote + * character appears as a literal character in a key name. In + * a few cases, applications can create their own key namespace + * (for example, shared library extension names), and therefore + * it's possible for an application to confuse us. Error if we + * ever see a key with a magic character. + */ + for (len = 0; len < k.len; ++len) + if (k.str[len] == SEPC) + WT_ERR_MSG(session, EINVAL, + "key %.*s contains a '%c' separator " + "character", + (int)k.len, (char *)k.str, SEPC); + + /* Build the key/value strings. */ + WT_ERR(__wt_buf_fmt(session, + kb, "%s%s%.*s", + key == NULL ? "" : key, + key == NULL ? "" : SEP, + (int)k.len, k.str)); + WT_ERR(__wt_buf_fmt(session, + vb, "%.*s", (int)v.len, v.str)); + + /* + * If the value is a structure, recursively parse it. + * + * !!! + * Don't merge unless the structure has field names. WiredTiger + * stores checkpoint LSNs in the metadata file using nested + * structures without field names: "checkpoint_lsn=(1,0)", not + * "checkpoint_lsn=(file=1,offset=0)". The value type is still + * WT_CONFIG_ITEM_STRUCT, so we check for a field name in the + * value. + */ + if (v.type == WT_CONFIG_ITEM_STRUCT && + strchr(vb->data, '=') != NULL) { + WT_ERR(__config_merge_scan( + session, kb->data, vb->data, cp)); + continue; + } + + /* Insert the value into the array. 
*/ + WT_ERR(__wt_realloc_def(session, + &cp->entries_allocated, + cp->entries_next + 1, &cp->entries)); + WT_ERR(__wt_strndup(session, + kb->data, kb->size, &cp->entries[cp->entries_next].k)); + WT_ERR(__wt_strndup(session, + vb->data, vb->size, &cp->entries[cp->entries_next].v)); + cp->entries[cp->entries_next].gen = cp->entries_next; + ++cp->entries_next; + } + WT_ERR_NOTFOUND_OK(ret); + +err: __wt_scr_free(&kb); + __wt_scr_free(&vb); + return (ret); +} + +/* + * __strip_comma -- + * Strip a trailing comma. + */ +static inline void +__strip_comma(WT_ITEM *buf) +{ + if (buf->size != 0 && ((char *)buf->data)[buf->size - 1] == ',') + --buf->size; +} + +/* + * __config_merge_format_next -- + * Walk the array, building entries. + */ +static int +__config_merge_format_next(WT_SESSION_IMPL *session, const char *prefix, + size_t plen, size_t *enp, WT_CONFIG_MERGE *cp, WT_ITEM *build) +{ + WT_CONFIG_MERGE_ENTRY *ep; + size_t len1, len2, next; + char *p; + + for (; *enp < cp->entries_next; ++*enp) { + ep = &cp->entries[*enp]; + len1 = strlen(ep->k); + + /* + * The entries are in sorted order, take the last entry for any + * key. + */ + if (*enp < (cp->entries_next - 1)) { + len2 = strlen((ep + 1)->k); + + /* Choose the last of identical keys. */ + if (len1 == len2 && + memcmp(ep->k, (ep + 1)->k, len1) == 0) + continue; + + /* + * The test is complicated by matching empty entries + * "foo=" against nested structures "foo,bar=", where + * the latter is a replacement for the former. + */ + if (len2 > len1 && + (ep + 1)->k[len1] == SEPC && + memcmp(ep->k, (ep + 1)->k, len1) == 0) + continue; + } + + /* + * If we're skipping a prefix and this entry doesn't match it, + * back off one entry and pop up a level. + */ + if (plen != 0 && + (plen > len1 || memcmp(ep->k, prefix, plen) != 0)) { + --*enp; + break; + } + + /* + * If the entry introduces a new level, recurse through that + * new level. 
+ */ + if ((p = strchr(ep->k + plen, SEPC)) != NULL) { + next = WT_PTRDIFF(p, ep->k); + WT_RET(__wt_buf_catfmt(session, + build, "%.*s=(", (int)(next - plen), ep->k + plen)); + WT_RET(__config_merge_format_next( + session, ep->k, next + 1, enp, cp, build)); + __strip_comma(build); + WT_RET(__wt_buf_catfmt(session, build, "),")); + continue; + } + + /* Append the entry to the buffer. */ + WT_RET(__wt_buf_catfmt( + session, build, "%s=%s,", ep->k + plen, ep->v)); + } + + return (0); +} + +/* + * __config_merge_format -- + * Take the sorted array of entries, and format them into allocated memory. + */ +static int +__config_merge_format( + WT_SESSION_IMPL *session, WT_CONFIG_MERGE *cp, const char **config_ret) +{ + WT_DECL_ITEM(build); + WT_DECL_RET; + size_t entries; + + WT_RET(__wt_scr_alloc(session, 4 * 1024, &build)); + + entries = 0; + WT_ERR(__config_merge_format_next(session, "", 0, &entries, cp, build)); + + __strip_comma(build); + + ret = __wt_strndup(session, build->data, build->size, config_ret); + +err: __wt_scr_free(&build); + return (ret); +} + +/* + * __config_merge_cmp -- + * Qsort function: sort the config merge array. + */ +static int +__config_merge_cmp(const void *a, const void *b) +{ + WT_CONFIG_MERGE_ENTRY *ae, *be; + int cmp; + + ae = (WT_CONFIG_MERGE_ENTRY *)a; + be = (WT_CONFIG_MERGE_ENTRY *)b; + + if ((cmp = strcmp(ae->k, be->k)) != 0) + return (cmp); + return (ae->gen > be->gen ? 1 : -1); +} + +/* + * __wt_config_merge -- + * Merge a set of configuration strings into newly allocated memory. + * + * This function takes a NULL-terminated list of configuration strings (where + * the values are in order from least to most preferred), and merges them into + * newly allocated memory. The algorithm is to walk the configuration strings + * and build a table of each key/value pair. The pairs are sorted based on the + * name and the configuration string in which they were found, and a final + * configuration string is built from the result. 
+ * + * Note: + * Nested structures are parsed and merged. For example, if configuration + * strings "key=(k1=v1,k2=v2)" and "key=(k1=v2)" appear, the result will + * be "key=(k1=v2,k2=v2)" because the nested values are merged. + */ +int +__wt_config_merge( + WT_SESSION_IMPL *session, const char **cfg, const char **config_ret) +{ + WT_CONFIG_MERGE merge; + WT_DECL_RET; + size_t i; + + /* Start out with a reasonable number of entries. */ + WT_CLEAR(merge); + + WT_RET(__wt_realloc_def( + session, &merge.entries_allocated, 100, &merge.entries)); + + /* Scan the configuration strings, entering them into the array. */ + for (; *cfg != NULL; ++cfg) + WT_ERR(__config_merge_scan(session, NULL, *cfg, &merge)); + + /* + * Sort the array by key and, in the case of identical keys, by + * generation. + */ + qsort(merge.entries, merge.entries_next, + sizeof(WT_CONFIG_MERGE_ENTRY), __config_merge_cmp); + + /* Convert the array of entries into a string. */ + ret = __config_merge_format(session, &merge, config_ret); + +err: for (i = 0; i < merge.entries_next; ++i) { + __wt_free(session, merge.entries[i].k); + __wt_free(session, merge.entries[i].v); + } + __wt_free(session, merge.entries); + return (ret); +} diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index e7826e9fd56..bdac6a57006 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -306,16 +306,16 @@ err: if (ncoll != NULL) { /* * __wt_conn_remove_collator -- - * remove collator added by WT_CONNECTION->add_collator, - * only used internally. + * Remove collator added by WT_CONNECTION->add_collator, only used + * internally. */ int -__wt_conn_remove_collator(WT_CONNECTION_IMPL *conn, WT_NAMED_COLLATOR *ncoll) +__wt_conn_remove_collator(WT_SESSION_IMPL *session, WT_NAMED_COLLATOR *ncoll) { - WT_SESSION_IMPL *session; + WT_CONNECTION_IMPL *conn; WT_DECL_RET; - session = conn->default_session; + conn = S2C(session); /* Call any termination method. 
*/ if (ncoll->collator->terminate != NULL) @@ -370,17 +370,17 @@ err: if (ncomp != NULL) { /* * __wt_conn_remove_compressor -- - * remove compressor added by WT_CONNECTION->add_compressor, - * only used internally. + * remove compressor added by WT_CONNECTION->add_compressor, only used + * internally. */ int __wt_conn_remove_compressor( - WT_CONNECTION_IMPL *conn, WT_NAMED_COMPRESSOR *ncomp) + WT_SESSION_IMPL *session, WT_NAMED_COMPRESSOR *ncomp) { - WT_SESSION_IMPL *session; + WT_CONNECTION_IMPL *conn; WT_DECL_RET; - session = conn->default_session; + conn = S2C(session); /* Call any termination method. */ if (ncomp->compressor->terminate != NULL) @@ -438,12 +438,12 @@ err: if (ndsrc != NULL) { */ int __wt_conn_remove_data_source( - WT_CONNECTION_IMPL *conn, WT_NAMED_DATA_SOURCE *ndsrc) + WT_SESSION_IMPL *session, WT_NAMED_DATA_SOURCE *ndsrc) { WT_DECL_RET; - WT_SESSION_IMPL *session; + WT_CONNECTION_IMPL *conn; - session = conn->default_session; + conn = S2C(session); /* Call any termination method. 
*/ if (ndsrc->dsrc->terminate != NULL) @@ -494,7 +494,7 @@ __conn_async_flush(WT_CONNECTION *wt_conn) conn = (WT_CONNECTION_IMPL *)wt_conn; CONNECTION_API_CALL_NOCONF(conn, session, async_flush); - WT_ERR(__wt_async_flush(conn)); + WT_ERR(__wt_async_flush(session)); err: API_END_RET_NOTFOUND_MAP(session, ret); } @@ -514,7 +514,7 @@ __conn_async_new_op(WT_CONNECTION *wt_conn, const char *uri, const char *config, conn = (WT_CONNECTION_IMPL *)wt_conn; CONNECTION_API_CALL(conn, session, async_new_op, config, cfg); - WT_ERR(__wt_async_new_op(conn, uri, config, cfg, callback, &op)); + WT_ERR(__wt_async_new_op(session, uri, config, cfg, callback, &op)); *asyncopp = &op->iface; @@ -626,43 +626,47 @@ err: /* static int __conn_reconfigure(WT_CONNECTION *wt_conn, const char *config) { - WT_CONFIG_ITEM cval; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION_IMPL *session; - - /* - * Special version of cfg that doesn't include the default config: used - * to limit changes to values that the application sets explicitly. - * Note that any function using this value has to be prepared to handle - * not-found as a valid option return. - */ - const char *raw_cfg[] = { config, NULL }; + const char *p, *config_cfg[] = { NULL, NULL, NULL }; conn = (WT_CONNECTION_IMPL *)wt_conn; CONNECTION_API_CALL(conn, session, reconfigure, config, cfg); + WT_UNUSED(cfg); - WT_ERR(__wt_conn_cache_pool_config(session, cfg)); - WT_ERR(__wt_cache_config(conn, raw_cfg)); + /* Serialize reconfiguration. */ + __wt_spin_lock(session, &conn->reconfig_lock); - WT_ERR(__wt_async_reconfig(conn, raw_cfg)); - WT_ERR(__conn_statistics_config(session, raw_cfg)); - WT_ERR(__wt_conn_verbose_config(session, raw_cfg)); - WT_ERR(__wt_checkpoint_server_create(conn, cfg)); - WT_ERR(__wt_statlog_create(conn, cfg)); + /* + * The configuration argument has been checked for validity, replace the + * previous connection configuration. + * + * DO NOT merge the configuration before the reconfigure calls. 
Some + * of the underlying reconfiguration functions do explicit checks with + * the second element of the configuration array, knowing the defaults + * are in slot #1 and the application's modifications are in slot #2. + */ + config_cfg[0] = conn->cfg; + config_cfg[1] = config; - WT_ERR(__wt_config_gets( - session, cfg, "lsm_manager.worker_thread_max", &cval)); - if (cval.val) - conn->lsm_manager.lsm_workers_max = (uint32_t)cval.val; + WT_ERR(__conn_statistics_config(session, config_cfg)); + WT_ERR(__wt_async_reconfig(session, config_cfg)); + WT_ERR(__wt_cache_config(session, config_cfg)); + WT_ERR(__wt_cache_pool_config(session, config_cfg)); + WT_ERR(__wt_checkpoint_server_create(session, config_cfg)); + WT_ERR(__wt_lsm_manager_config(session, config_cfg)); + WT_ERR(__wt_statlog_create(session, config_cfg)); + WT_ERR(__wt_verbose_config(session, config_cfg)); - /* Wake up the cache pool server so any changes are noticed. */ - if (F_ISSET(conn, WT_CONN_CACHE_POOL)) - WT_ERR(__wt_cond_signal( - session, __wt_process.cache_pool->cache_pool_cond)); + WT_ERR(__wt_config_merge(session, config_cfg, &p)); + __wt_free(session, conn->cfg); + conn->cfg = p; -err: API_END_RET(session, ret); +err: __wt_spin_unlock(session, &conn->reconfig_lock); + + API_END_RET(session, ret); } /* @@ -1009,59 +1013,62 @@ __conn_statistics_config(WT_SESSION_IMPL *session, const char *cfg[]) WT_CONFIG_ITEM cval, sval; WT_CONNECTION_IMPL *conn; WT_DECL_RET; + uint32_t flags; int set; conn = S2C(session); - if ((ret = __wt_config_gets(session, cfg, "statistics", &cval)) != 0) - return (ret == WT_NOTFOUND ? 0 : ret); - - /* Configuring statistics clears any existing values. 
*/ - conn->stat_flags = 0; + WT_RET(__wt_config_gets(session, cfg, "statistics", &cval)); + flags = 0; set = 0; if ((ret = __wt_config_subgets( session, &cval, "none", &sval)) == 0 && sval.val != 0) { - FLD_SET(conn->stat_flags, WT_CONN_STAT_NONE); + LF_SET(WT_CONN_STAT_NONE); ++set; } WT_RET_NOTFOUND_OK(ret); if ((ret = __wt_config_subgets( session, &cval, "fast", &sval)) == 0 && sval.val != 0) { - FLD_SET(conn->stat_flags, WT_CONN_STAT_FAST); + LF_SET(WT_CONN_STAT_FAST); ++set; } WT_RET_NOTFOUND_OK(ret); if ((ret = __wt_config_subgets( session, &cval, "all", &sval)) == 0 && sval.val != 0) { - FLD_SET(conn->stat_flags, WT_CONN_STAT_ALL | WT_CONN_STAT_FAST); + LF_SET(WT_CONN_STAT_ALL | WT_CONN_STAT_FAST); ++set; } WT_RET_NOTFOUND_OK(ret); if ((ret = __wt_config_subgets( session, &cval, "clear", &sval)) == 0 && sval.val != 0) - FLD_SET(conn->stat_flags, WT_CONN_STAT_CLEAR); + LF_SET(WT_CONN_STAT_CLEAR); WT_RET_NOTFOUND_OK(ret); if (set > 1) WT_RET_MSG(session, EINVAL, "only one statistics configuration value may be specified"); + + /* Configuring statistics clears any existing values. */ + conn->stat_flags = flags; + return (0); } /* - * __wt_conn_verbose_config -- + * __wt_verbose_config -- * Set verbose configuration. */ int -__wt_conn_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) +__wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) { WT_CONFIG_ITEM cval, sval; WT_CONNECTION_IMPL *conn; WT_DECL_RET; + uint32_t flags; static const struct { const char *name; uint32_t flag; @@ -1092,14 +1099,14 @@ __wt_conn_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) conn = S2C(session); - if ((ret = __wt_config_gets(session, cfg, "verbose", &cval)) != 0) - return (ret == WT_NOTFOUND ? 
0 : ret); + WT_RET(__wt_config_gets(session, cfg, "verbose", &cval)); + flags = 0; for (ft = verbtypes; ft->name != NULL; ft++) { if ((ret = __wt_config_subgets( session, &cval, ft->name, &sval)) == 0 && sval.val != 0) { #ifdef HAVE_VERBOSE - FLD_SET(conn->verbose, ft->flag); + LF_SET(ft->flag); #else WT_RET_MSG(session, EINVAL, "Verbose option specified when WiredTiger built " @@ -1107,11 +1114,11 @@ __wt_conn_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) "configure command and rebuild to include support " "for verbose messages"); #endif - } else - FLD_CLR(conn->verbose, ft->flag); - + } WT_RET_NOTFOUND_OK(ret); } + + conn->verbose = flags; return (0); } @@ -1234,6 +1241,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, session = conn->default_session = &conn->dummy_session; session->iface.connection = &conn->iface; session->name = "wiredtiger_open"; + __wt_random_init(session->rnd); __wt_event_handler_set(session, event_handler); /* Remaining basic initialization of the connection structure. */ @@ -1319,7 +1327,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, if (cval.val) F_SET(conn, WT_CONN_CKPT_SYNC); - WT_ERR(__wt_conn_verbose_config(session, cfg)); + WT_ERR(__wt_verbose_config(session, cfg)); WT_ERR(__wt_config_gets(session, cfg, "buffer_alignment", &cval)); if (cval.val == -1) @@ -1399,13 +1407,12 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, */ WT_ERR(__wt_connection_workers(session, cfg)); + /* Merge the final configuration for later reconfiguration. */ + WT_ERR(__wt_config_merge(session, cfg, &conn->cfg)); + STATIC_ASSERT(offsetof(WT_CONNECTION_IMPL, iface) == 0); *wt_connp = &conn->iface; - /* - * Destroying the connection on error will destroy our session handle, - * cleanup using the session handle first, then discard the connection. 
- */ err: __wt_buf_free(session, &cbbuf); __wt_buf_free(session, &cubuf); diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c index 42e45a9c58b..079bd05ff1e 100644 --- a/src/conn/conn_cache.c +++ b/src/conn/conn_cache.c @@ -12,65 +12,52 @@ * Configure the underlying cache. */ int -__wt_cache_config(WT_CONNECTION_IMPL *conn, const char *cfg[]) +__wt_cache_config(WT_SESSION_IMPL *session, const char *cfg[]) { WT_CACHE *cache; WT_CONFIG_ITEM cval; - WT_DECL_RET; - WT_SESSION_IMPL *session; + WT_CONNECTION_IMPL *conn; - session = conn->default_session; + conn = S2C(session); cache = conn->cache; /* * If not using a shared cache configure the cache size, otherwise * check for a reserved size. */ - if (!F_ISSET(conn, WT_CONN_CACHE_POOL) && - (ret = __wt_config_gets(session, cfg, "cache_size", &cval)) == 0) + if (!F_ISSET(conn, WT_CONN_CACHE_POOL)) { + WT_RET(__wt_config_gets(session, cfg, "cache_size", &cval)); conn->cache_size = (uint64_t)cval.val; - - if (F_ISSET(conn, WT_CONN_CACHE_POOL) && - (ret = __wt_config_gets(session, cfg, - "shared_cache.reserve", &cval)) == 0 && cval.val != 0) + } else { + WT_RET(__wt_config_gets( + session, cfg, "shared_cache.reserve", &cval)); + if (cval.val == 0) + WT_RET(__wt_config_gets( + session, cfg, "shared_cache.chunk", &cval)); cache->cp_reserved = (uint64_t)cval.val; - else if ((ret = __wt_config_gets(session, cfg, - "shared_cache.chunk", &cval)) == 0) - cache->cp_reserved = (uint64_t)cval.val; - WT_RET_NOTFOUND_OK(ret); + } - if ((ret = - __wt_config_gets(session, cfg, "eviction_target", &cval)) == 0) - cache->eviction_target = (u_int)cval.val; - WT_RET_NOTFOUND_OK(ret); + WT_RET(__wt_config_gets(session, cfg, "eviction_target", &cval)); + cache->eviction_target = (u_int)cval.val; - if ((ret = - __wt_config_gets(session, cfg, "eviction_trigger", &cval)) == 0) - cache->eviction_trigger = (u_int)cval.val; - WT_RET_NOTFOUND_OK(ret); + WT_RET(__wt_config_gets(session, cfg, "eviction_trigger", &cval)); + cache->eviction_trigger 
= (u_int)cval.val; - if ((ret = __wt_config_gets( - session, cfg, "eviction_dirty_target", &cval)) == 0) - cache->eviction_dirty_target = (u_int)cval.val; - WT_RET_NOTFOUND_OK(ret); + WT_RET(__wt_config_gets(session, cfg, "eviction_dirty_target", &cval)); + cache->eviction_dirty_target = (u_int)cval.val; /* * The eviction thread configuration options include the main eviction * thread and workers. Our implementation splits them out. Adjust for * the difference when parsing the configuration. */ - if ((ret = __wt_config_gets( - session, cfg, "eviction.threads_max", &cval)) == 0) { - WT_ASSERT(session, cval.val > 0); - conn->evict_workers_max = (u_int)cval.val - 1; - } - WT_RET_NOTFOUND_OK(ret); - if ((ret = __wt_config_gets( - session, cfg, "eviction.threads_min", &cval)) == 0) { - WT_ASSERT(session, cval.val > 0); - conn->evict_workers_min = (u_int)cval.val - 1; - } - WT_RET_NOTFOUND_OK(ret); + WT_RET(__wt_config_gets(session, cfg, "eviction.threads_max", &cval)); + WT_ASSERT(session, cval.val > 0); + conn->evict_workers_max = (u_int)cval.val - 1; + + WT_RET(__wt_config_gets(session, cfg, "eviction.threads_min", &cval)); + WT_ASSERT(session, cval.val > 0); + conn->evict_workers_min = (u_int)cval.val - 1; if (conn->evict_workers_min > conn->evict_workers_max) WT_RET_MSG(session, EINVAL, @@ -85,13 +72,13 @@ __wt_cache_config(WT_CONNECTION_IMPL *conn, const char *cfg[]) * Create the underlying cache. */ int -__wt_cache_create(WT_CONNECTION_IMPL *conn, const char *cfg[]) +__wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]) { WT_CACHE *cache; + WT_CONNECTION_IMPL *conn; WT_DECL_RET; - WT_SESSION_IMPL *session; - session = conn->default_session; + conn = S2C(session); WT_ASSERT(session, conn->cache == NULL || (F_ISSET(conn, WT_CONN_CACHE_POOL) && conn->cache != NULL)); @@ -101,7 +88,7 @@ __wt_cache_create(WT_CONNECTION_IMPL *conn, const char *cfg[]) cache = conn->cache; /* Use a common routine for run-time configuration options. 
*/ - WT_RET(__wt_cache_config(conn, cfg)); + WT_RET(__wt_cache_config(session, cfg)); /* Add the configured cache to the cache pool. */ if (F_ISSET(conn, WT_CONN_CACHE_POOL)) @@ -133,7 +120,7 @@ __wt_cache_create(WT_CONNECTION_IMPL *conn, const char *cfg[]) __wt_cache_stats_update(session); return (0); -err: WT_RET(__wt_cache_destroy(conn)); +err: WT_RET(__wt_cache_destroy(session)); return (ret); } @@ -164,13 +151,13 @@ __wt_cache_stats_update(WT_SESSION_IMPL *session) * Discard the underlying cache. */ int -__wt_cache_destroy(WT_CONNECTION_IMPL *conn) +__wt_cache_destroy(WT_SESSION_IMPL *session) { WT_CACHE *cache; + WT_CONNECTION_IMPL *conn; WT_DECL_RET; - WT_SESSION_IMPL *session; - session = conn->default_session; + conn = S2C(session); cache = conn->cache; if (cache == NULL) diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c index cffb9af40f9..ba80ac15267 100644 --- a/src/conn/conn_cache_pool.c +++ b/src/conn/conn_cache_pool.c @@ -24,11 +24,11 @@ static int __cache_pool_assess(WT_SESSION_IMPL *, uint64_t *); static int __cache_pool_balance(WT_SESSION_IMPL *); /* - * __wt_conn_cache_pool_config -- + * __wt_cache_pool_config -- * Parse and setup the cache pool options. */ int -__wt_conn_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) +__wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) { WT_CACHE_POOL *cp; WT_CONFIG_ITEM cval; @@ -102,6 +102,7 @@ __wt_conn_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) pool_name); cp = __wt_process.cache_pool; + /* * The cache pool requires a reference count to avoid a race between * configuration/open and destroy. @@ -110,39 +111,54 @@ __wt_conn_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) ++cp->refs; /* - * Retrieve the pool configuration options. The values are optional if - * we are re-configuring. + * Cache pool configurations are optional when not creating. 
If + * values aren't being changed, retrieve the current value so that + * validation of settings works. */ - ret = __wt_config_gets(session, cfg, "shared_cache.size", &cval); - if (reconfiguring && ret == WT_NOTFOUND) - /* Not being changed; use the old value. */ - size = cp->size; - else { - WT_ERR(ret); + if (!created) { + if (__wt_config_gets(session, &cfg[1], + "shared_cache.size", &cval) == 0 && cval.val != 0) + size = (uint64_t)cval.val; + else + size = cp->size; + if (__wt_config_gets(session, &cfg[1], + "shared_cache.chunk", &cval) == 0 && cval.val != 0) + chunk = (uint64_t)cval.val; + else + chunk = cp->chunk; + } else { + /* + * The only time shared cache configuration uses default + * values is when we are creating the pool. + */ + WT_ERR(__wt_config_gets( + session, cfg, "shared_cache.size", &cval)); + WT_ASSERT(session, cval.val != 0); size = (uint64_t)cval.val; - } - ret = __wt_config_gets(session, cfg, "shared_cache.chunk", &cval); - if (reconfiguring && ret == WT_NOTFOUND) - /* Not being changed; use the old value. */ - chunk = cp->chunk; - else { - WT_ERR(ret); + WT_ERR(__wt_config_gets( + session, cfg, "shared_cache.chunk", &cval)); + WT_ASSERT(session, cval.val != 0); chunk = (uint64_t)cval.val; } + /* * Retrieve the reserve size here for validation of configuration. * Don't save it yet since the connections cache is not created if * we are opening. Cache configuration is responsible for saving the * setting. + * The reserved size is set under one of these conditions: + * - It's part of the user's configuration - use that value. + * - We are reconfiguring - keep the previous value. + * - We are joining a cache pool for the first time (including + * creating the pool) - use the chunk size; that's the default. */ - ret = __wt_config_gets(session, cfg, "shared_cache.reserve", &cval); - if (reconfiguring && ret == WT_NOTFOUND) - /* It is safe to access the cache during reconfigure.
*/ - reserve = conn->cache->cp_reserved; - else { - WT_ERR(ret); + if (__wt_config_gets(session, &cfg[1], + "shared_cache.reserve", &cval) == 0 && cval.val != 0) reserve = (uint64_t)cval.val; - } + else if (reconfiguring) + reserve = conn->cache->cp_reserved; + else + reserve = chunk; /* * Validate that size and reserve values don't cause the cache @@ -163,6 +179,11 @@ __wt_conn_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) cp->size = size; cp->chunk = chunk; + /* Wake up the cache pool server so any changes are noticed. */ + if (reconfiguring) + WT_ERR(__wt_cond_signal( + session, __wt_process.cache_pool->cache_pool_cond)); + WT_ERR(__wt_verbose(session, WT_VERB_SHARED_CACHE, "Configured cache pool %s. Size: %" PRIu64 ", chunk size: %" PRIu64, cp->name, cp->size, cp->chunk)); @@ -239,17 +260,16 @@ __wt_conn_cache_pool_open(WT_SESSION_IMPL *session) * if we were the last connection. */ int -__wt_conn_cache_pool_destroy(WT_CONNECTION_IMPL *conn) +__wt_conn_cache_pool_destroy(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_CACHE_POOL *cp; - WT_CONNECTION_IMPL *entry; + WT_CONNECTION_IMPL *conn, *entry; WT_DECL_RET; WT_SESSION *wt_session; - WT_SESSION_IMPL *session; int cp_locked, found; - session = conn->default_session; + conn = S2C(session); cache = conn->cache; cp_locked = found = 0; cp = __wt_process.cache_pool; @@ -488,26 +508,46 @@ __cache_pool_adjust(WT_SESSION_IMPL *session, if (cache->cp_skip_count > 0 && --cache->cp_skip_count > 0) continue; /* - * TODO: Use __wt_cache_bytes_inuse instead of eviction_target - * which doesn't do the right thing at the moment. + * If the entry is currently allocated less than the reserved + * size, increase its allocation.
This should only happen if: + * - It's the first time we've seen this member + * - The reserved size has been adjusted */ if (entry->cache_size < reserved) { grew = 1; adjusted = reserved - entry->cache_size; + /* + * Conditions for reducing the amount of resources for an + * entry: + * - If we are forcing and this entry has more than the + * minimum amount of space in use. + * - If the read pressure in this entry is below the + * threshold, other entries need more cache, the entry has + * more than the minimum space and there is no available + * space in the pool. + */ } else if ((force && entry->cache_size > reserved) || (read_pressure < WT_CACHE_POOL_REDUCE_THRESHOLD && highest > 1 && entry->cache_size > reserved && cp->currently_used >= cp->size)) { - /* - * If a connection isn't actively using it's assigned - * cache and is assigned a reasonable amount - reduce - * it. - */ grew = 0; - if (entry->cache_size - cp->chunk > reserved) + /* + * Shrink by a chunk size if that doesn't drop us + * below the reserved size. 
+ */ + if (entry->cache_size > cp->chunk + reserved) adjusted = cp->chunk; else adjusted = entry->cache_size - reserved; + /* + * Conditions for increasing the amount of resources for an + * entry: + * - There was some activity across the pool + * - This entry is using less than the entire cache pool + * - The connection is using enough cache to require eviction + * - There is space available in the pool + * - Additional cache would benefit the connection + */ } else if (highest > 1 && entry->cache_size < cp->size && cache->bytes_inmem >= @@ -527,6 +567,9 @@ __cache_pool_adjust(WT_SESSION_IMPL *session, } else { cache->cp_skip_count = WT_CACHE_POOL_REDUCE_SKIPS; + WT_ASSERT(session, + entry->cache_size >= adjusted && + cp->currently_used >= adjusted); entry->cache_size -= adjusted; cp->currently_used -= adjusted; } @@ -562,8 +605,8 @@ __wt_cache_pool_server(void *arg) while (F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) && F_ISSET(cache, WT_CACHE_POOL_RUN)) { if (cp->currently_used <= cp->size) - WT_ERR_TIMEDOUT_OK(__wt_cond_wait( - session, cp->cache_pool_cond, 1000000)); + WT_ERR(__wt_cond_wait(session, + cp->cache_pool_cond, 1000000)); /* * Re-check pool run flag - since we want to avoid getting the diff --git a/src/conn/conn_ckpt.c b/src/conn/conn_ckpt.c index 90560842c07..a7b8be0d083 100644 --- a/src/conn/conn_ckpt.c +++ b/src/conn/conn_ckpt.c @@ -20,6 +20,7 @@ __ckpt_server_config(WT_SESSION_IMPL *session, const char **cfg, int *startp) WT_CONNECTION_IMPL *conn; WT_DECL_ITEM(tmp); WT_DECL_RET; + char *p; conn = S2C(session); @@ -40,19 +41,26 @@ __ckpt_server_config(WT_SESSION_IMPL *session, const char **cfg, int *startp) } *startp = 1; + /* + * The application can specify a checkpoint name, which we ignore if + * it's our default. 
+ */ WT_RET(__wt_config_gets(session, cfg, "checkpoint.name", &cval)); + if (cval.len != 0 && + !WT_STRING_MATCH(WT_CHECKPOINT, cval.str, cval.len)) { + WT_RET(__wt_checkpoint_name_ok(session, cval.str, cval.len)); - if (!WT_STRING_MATCH(WT_CHECKPOINT, cval.str, cval.len)) { WT_RET(__wt_scr_alloc(session, cval.len + 20, &tmp)); - strcpy((char *)tmp->data, "name="); - strncat((char *)tmp->data, cval.str, cval.len); - ret = __wt_strndup(session, - tmp->data, strlen("name=") + cval.len, &conn->ckpt_config); - __wt_scr_free(&tmp); - WT_RET(ret); + WT_ERR(__wt_buf_fmt( + session, tmp, "name=%.*s", (int)cval.len, cval.str)); + WT_ERR(__wt_strdup(session, tmp->data, &p)); + + __wt_free(session, conn->ckpt_config); + conn->ckpt_config = p; } - return (0); +err: __wt_scr_free(&tmp); + return (ret); } /* @@ -86,7 +94,7 @@ __ckpt_server(void *arg) * NOTE: If the user only configured logsize, then usecs * will be 0 and this wait won't return until signalled. */ - WT_ERR_TIMEDOUT_OK( + WT_ERR( __wt_cond_wait(session, conn->ckpt_cond, conn->ckpt_usecs)); } @@ -139,17 +147,19 @@ __ckpt_server_start(WT_CONNECTION_IMPL *conn) * Configure and start the checkpoint server. */ int -__wt_checkpoint_server_create(WT_CONNECTION_IMPL *conn, const char *cfg[]) +__wt_checkpoint_server_create(WT_SESSION_IMPL *session, const char *cfg[]) { + WT_CONNECTION_IMPL *conn; int start; + conn = S2C(session); start = 0; /* If there is already a server running, shut it down. */ if (conn->ckpt_session != NULL) - WT_RET(__wt_checkpoint_server_destroy(conn)); + WT_RET(__wt_checkpoint_server_destroy(session)); - WT_RET(__ckpt_server_config(conn->default_session, cfg, &start)); + WT_RET(__ckpt_server_config(session, cfg, &start)); if (start) WT_RET(__ckpt_server_start(conn)); @@ -161,13 +171,13 @@ __wt_checkpoint_server_create(WT_CONNECTION_IMPL *conn, const char *cfg[]) * Destroy the checkpoint server thread. 
*/ int -__wt_checkpoint_server_destroy(WT_CONNECTION_IMPL *conn) +__wt_checkpoint_server_destroy(WT_SESSION_IMPL *session) { + WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION *wt_session; - WT_SESSION_IMPL *session; - session = conn->default_session; + conn = S2C(session); F_CLR(conn, WT_CONN_SERVER_CHECKPOINT); if (conn->ckpt_tid_set) { diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c index 51b9d0846fd..de716433598 100644 --- a/src/conn/conn_dhandle.c +++ b/src/conn/conn_dhandle.c @@ -575,6 +575,7 @@ __wt_conn_dhandle_discard_single( WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *save_dhandle; WT_DECL_RET; + WT_DECL_SPINLOCK_ID(id); /* Must appear last */ conn = S2C(session); @@ -594,12 +595,21 @@ __wt_conn_dhandle_discard_single( /* * Get the schema lock (required to remove entries from the data handle - * list), get the dhandle lock to block the eviction server from walking - * the list. + * list), get the dhandle lock to block the eviction server from + * walking the list. */ F_SET(session, WT_SESSION_SCHEMA_LOCKED); __wt_spin_lock(session, &conn->schema_lock); - __wt_spin_lock(session, &conn->dhandle_lock); + + /* + * If the eviction server is running, don't block waiting for it while + * holding the schema lock. The sweep server will try again. + */ + if (final) + __wt_spin_lock(session, &conn->dhandle_lock); + else if ((ret = + __wt_spin_trylock(session, &conn->dhandle_lock, &id)) != 0) + goto unlock; /* * Check if the handle was reacquired by a session while we waited; @@ -612,7 +622,8 @@ __wt_conn_dhandle_discard_single( SLIST_REMOVE(&conn->dhlh, dhandle, __wt_data_handle, l); __wt_spin_unlock(session, &conn->dhandle_lock); - __wt_spin_unlock(session, &conn->schema_lock); + +unlock: __wt_spin_unlock(session, &conn->schema_lock); F_CLR(session, WT_SESSION_SCHEMA_LOCKED); /* @@ -640,13 +651,13 @@ err: session->dhandle = save_dhandle; * Close/discard all data handles. 
*/ int -__wt_conn_dhandle_discard(WT_CONNECTION_IMPL *conn) +__wt_conn_dhandle_discard(WT_SESSION_IMPL *session) { + WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; - WT_SESSION_IMPL *session; - session = conn->default_session; + conn = S2C(session); /* * Close open data handles: first, everything but the metadata file diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c index 36c53133325..e4f0a6ddd73 100644 --- a/src/conn/conn_handle.c +++ b/src/conn/conn_handle.c @@ -45,12 +45,23 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_RET(__wt_spin_init(session, &conn->dhandle_lock, "data handle")); WT_RET(__wt_spin_init(session, &conn->fh_lock, "file list")); WT_RET(__wt_spin_init(session, &conn->hot_backup_lock, "hot backup")); + WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure")); WT_RET(__wt_spin_init(session, &conn->schema_lock, "schema")); WT_RET(__wt_calloc_def(session, WT_PAGE_LOCKS(conn), &conn->page_lock)); for (i = 0; i < WT_PAGE_LOCKS(conn); ++i) WT_RET( __wt_spin_init(session, &conn->page_lock[i], "btree page")); + /* Setup the spin locks for the LSM manager queues. */ + WT_RET(__wt_spin_init(session, + &conn->lsm_manager.app_lock, "LSM application queue lock")); + WT_RET(__wt_spin_init(session, + &conn->lsm_manager.manager_lock, "LSM manager queue lock")); + WT_RET(__wt_spin_init( + session, &conn->lsm_manager.switch_lock, "LSM switch queue lock")); + WT_RET(__wt_cond_alloc( + session, "LSM worker cond", 0, &conn->lsm_manager.work_cond)); + /* * Generation numbers. 
* @@ -114,12 +125,14 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) __wt_spin_destroy(session, &conn->dhandle_lock); __wt_spin_destroy(session, &conn->fh_lock); __wt_spin_destroy(session, &conn->hot_backup_lock); + __wt_spin_destroy(session, &conn->reconfig_lock); __wt_spin_destroy(session, &conn->schema_lock); for (i = 0; i < WT_PAGE_LOCKS(conn); ++i) __wt_spin_destroy(session, &conn->page_lock[i]); __wt_free(session, conn->page_lock); /* Free allocated memory. */ + __wt_free(session, conn->cfg); __wt_free(session, conn->home); __wt_free(session, conn->error_prefix); __wt_free(session, conn->sessions); diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c index ee4a2dd6b70..48218507d09 100644 --- a/src/conn/conn_log.c +++ b/src/conn/conn_log.c @@ -106,7 +106,7 @@ __log_archive_server(void *arg) "log_archive: Blocked due to open log " "cursor holding archive lock")); } - WT_ERR_TIMEDOUT_OK( + WT_ERR( __wt_cond_wait(session, conn->arch_cond, 1000000)); continue; } @@ -150,8 +150,7 @@ __log_archive_server(void *arg) WT_ERR(__wt_rwunlock(session, log->log_archive_lock)); /* Wait until the next event. */ - WT_ERR_TIMEDOUT_OK( - __wt_cond_wait(session, conn->arch_cond, 1000000)); + WT_ERR(__wt_cond_wait(session, conn->arch_cond, 1000000)); } if (0) { @@ -167,13 +166,13 @@ err: __wt_err(session, ret, "log archive server error"); * Start the log subsystem and archive server thread. */ int -__wt_logmgr_create(WT_CONNECTION_IMPL *conn, const char *cfg[]) +__wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_SESSION_IMPL *session; + WT_CONNECTION_IMPL *conn; WT_LOG *log; int run; - session = conn->default_session; + conn = S2C(session); /* Handle configuration. */ WT_RET(__logmgr_config(session, cfg, &run)); @@ -245,13 +244,13 @@ __wt_logmgr_create(WT_CONNECTION_IMPL *conn, const char *cfg[]) * Destroy the log archiving server thread and logging subsystem. 
*/ int -__wt_logmgr_destroy(WT_CONNECTION_IMPL *conn) +__wt_logmgr_destroy(WT_SESSION_IMPL *session) { + WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION *wt_session; - WT_SESSION_IMPL *session; - session = conn->default_session; + conn = S2C(session); if (!conn->logging) return (0); diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c index 3a2f1cb51a4..5bcf05975e7 100644 --- a/src/conn/conn_open.c +++ b/src/conn/conn_open.c @@ -55,13 +55,13 @@ __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[]) WT_WRITE_BARRIER(); /* Connect to a cache pool. */ - WT_RET(__wt_conn_cache_pool_config(session, cfg)); + WT_RET(__wt_cache_pool_config(session, cfg)); /* Create the cache. */ - WT_RET(__wt_cache_create(conn, cfg)); + WT_RET(__wt_cache_create(session, cfg)); /* Initialize transaction support. */ - WT_RET(__wt_txn_global_init(conn, cfg)); + WT_RET(__wt_txn_global_init(session, cfg)); return (0); } @@ -81,16 +81,30 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) WT_NAMED_COMPRESSOR *ncomp; WT_NAMED_DATA_SOURCE *ndsrc; WT_SESSION_IMPL *s, *session; + WT_TXN_GLOBAL *txn_global; u_int i; wt_conn = &conn->iface; + txn_global = &conn->txn_global; session = conn->default_session; - /* We're shutting down. Make sure everything gets freed. */ - __wt_txn_update_oldest(session); + /* + * We're shutting down. Make sure everything gets freed. + * + * It's possible that the eviction server is in the middle of a long + * operation, with a transaction ID pinned. In that case, we will loop + * here until the transaction ID is released, when the oldest + * transaction ID will catch up with the current ID. + */ + for (;;) { + __wt_txn_update_oldest(session); + if (txn_global->oldest_id == txn_global->current) + break; + __wt_yield(); + } /* Clear any pending async ops. 
*/ - WT_TRET(__wt_async_flush(conn)); + WT_TRET(__wt_async_flush(session)); /* * Shut down server threads other than the eviction server, which is @@ -99,14 +113,14 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) * exit before files are closed. */ F_CLR(conn, WT_CONN_SERVER_RUN); - WT_TRET(__wt_async_destroy(conn)); - WT_TRET(__wt_lsm_manager_destroy(conn)); - WT_TRET(__wt_checkpoint_server_destroy(conn)); - WT_TRET(__wt_statlog_destroy(conn, 1)); - WT_TRET(__wt_sweep_destroy(conn)); + WT_TRET(__wt_async_destroy(session)); + WT_TRET(__wt_lsm_manager_destroy(session)); + WT_TRET(__wt_checkpoint_server_destroy(session)); + WT_TRET(__wt_statlog_destroy(session, 1)); + WT_TRET(__wt_sweep_destroy(session)); /* Close open data handles. */ - WT_TRET(__wt_conn_dhandle_discard(conn)); + WT_TRET(__wt_conn_dhandle_discard(session)); /* * Now that all data handles are closed, tell logging that a checkpoint @@ -116,20 +130,20 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) if (conn->logging) { WT_TRET(__wt_txn_checkpoint_log( session, 1, WT_TXN_LOG_CKPT_STOP, NULL)); - WT_TRET(__wt_logmgr_destroy(conn)); + WT_TRET(__wt_logmgr_destroy(session)); } /* Free memory for collators */ while ((ncoll = TAILQ_FIRST(&conn->collqh)) != NULL) - WT_TRET(__wt_conn_remove_collator(conn, ncoll)); + WT_TRET(__wt_conn_remove_collator(session, ncoll)); /* Free memory for compressors */ while ((ncomp = TAILQ_FIRST(&conn->compqh)) != NULL) - WT_TRET(__wt_conn_remove_compressor(conn, ncomp)); + WT_TRET(__wt_conn_remove_compressor(session, ncomp)); /* Free memory for data sources */ while ((ndsrc = TAILQ_FIRST(&conn->dsrcqh)) != NULL) - WT_TRET(__wt_conn_remove_data_source(conn, ndsrc)); + WT_TRET(__wt_conn_remove_data_source(session, ndsrc)); /* * Complain if files weren't closed, ignoring the lock file, we'll @@ -146,16 +160,16 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) } /* Shut down the eviction server thread. 
*/ - WT_TRET(__wt_evict_destroy(conn)); + WT_TRET(__wt_evict_destroy(session)); /* Disconnect from shared cache - must be before cache destroy. */ - WT_TRET(__wt_conn_cache_pool_destroy(conn)); + WT_TRET(__wt_conn_cache_pool_destroy(session)); /* Discard the cache. */ - WT_TRET(__wt_cache_destroy(conn)); + WT_TRET(__wt_cache_destroy(session)); /* Discard transaction state. */ - __wt_txn_global_destroy(conn); + __wt_txn_global_destroy(session); /* Close extensions, first calling any unload entry point. */ while ((dlh = TAILQ_FIRST(&conn->dlhqh)) != NULL) { @@ -207,38 +221,34 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) int __wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_CONNECTION_IMPL *conn; - - conn = S2C(session); - /* * Start the eviction thread. */ - WT_RET(__wt_evict_create(conn)); + WT_RET(__wt_evict_create(session)); /* * Start the handle sweep thread. */ - WT_RET(__wt_sweep_create(conn)); + WT_RET(__wt_sweep_create(session)); /* * Start the optional statistics thread. Start statistics first so that * other optional threads can know if statistics are enabled or not. */ - WT_RET(__wt_statlog_create(conn, cfg)); + WT_RET(__wt_statlog_create(session, cfg)); /* Start the optional async threads. */ - WT_RET(__wt_async_create(conn, cfg)); + WT_RET(__wt_async_create(session, cfg)); /* * Start the optional logging/archive thread. * NOTE: The log manager must be started before checkpoints so that the * checkpoint server knows if logging is enabled. */ - WT_RET(__wt_logmgr_create(conn, cfg)); + WT_RET(__wt_logmgr_create(session, cfg)); /* Start the optional checkpoint thread. 
*/ - WT_RET(__wt_checkpoint_server_create(conn, cfg)); + WT_RET(__wt_checkpoint_server_create(session, cfg)); return (0); } diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c index 9ad1cddeb6e..fbd9b3835b4 100644 --- a/src/conn/conn_stat.c +++ b/src/conn/conn_stat.c @@ -18,6 +18,22 @@ #endif #endif +/* + * __stat_sources_free -- + * Free the array of statistics sources. + */ +static void +__stat_sources_free(WT_SESSION_IMPL *session, char ***sources) +{ + char **p; + + if ((p = (*sources)) != NULL) { + for (; *p != NULL; ++p) + __wt_free(session, *p); + __wt_free(session, *sources); + } +} + /* * __wt_conn_stat_init -- * Initialize the per-connection statistics. @@ -41,8 +57,10 @@ __statlog_config(WT_SESSION_IMPL *session, const char **cfg, int *runp) WT_CONNECTION_IMPL *conn; WT_DECL_RET; int cnt; + char **sources; conn = S2C(session); + sources = NULL; WT_RET(__wt_config_gets(session, cfg, "statistics_log.wait", &cval)); /* Only start the server if wait time is non-zero */ @@ -67,7 +85,7 @@ __statlog_config(WT_SESSION_IMPL *session, const char **cfg, int *runp) ; WT_RET_NOTFOUND_OK(ret); if (cnt != 0) { - WT_RET(__wt_calloc_def(session, cnt + 1, &conn->stat_sources)); + WT_RET(__wt_calloc_def(session, cnt + 1, &sources)); WT_RET(__wt_config_subinit(session, &objectconf, &cval)); for (cnt = 0; (ret = __wt_config_next(&objectconf, &k, &v)) == 0; ++cnt) { @@ -80,24 +98,28 @@ __statlog_config(WT_SESSION_IMPL *session, const char **cfg, int *runp) */ if (!WT_PREFIX_MATCH(k.str, "file:") && !WT_PREFIX_MATCH(k.str, "lsm:")) - WT_RET_MSG(session, EINVAL, + WT_ERR_MSG(session, EINVAL, "statistics_log sources configuration only " "supports objects of type \"file\" or " "\"lsm\""); - WT_RET(__wt_strndup(session, - k.str, k.len, &conn->stat_sources[cnt])); + WT_ERR( + __wt_strndup(session, k.str, k.len, &sources[cnt])); } - WT_RET_NOTFOUND_OK(ret); + WT_ERR_NOTFOUND_OK(ret); + + conn->stat_sources = sources; + sources = NULL; } - WT_RET(__wt_config_gets(session, cfg, 
"statistics_log.path", &cval)); - WT_RET(__wt_nfilename(session, cval.str, cval.len, &conn->stat_path)); + WT_ERR(__wt_config_gets(session, cfg, "statistics_log.path", &cval)); + WT_ERR(__wt_nfilename(session, cval.str, cval.len, &conn->stat_path)); - WT_RET(__wt_config_gets( + WT_ERR(__wt_config_gets( session, cfg, "statistics_log.timestamp", &cval)); - WT_RET(__wt_strndup(session, cval.str, cval.len, &conn->stat_format)); + WT_ERR(__wt_strndup(session, cval.str, cval.len, &conn->stat_format)); - return (0); +err: __stat_sources_free(session, &sources); + return (ret); } /* @@ -385,7 +407,7 @@ __statlog_server(void *arg) WT_ERR(__statlog_log_one(session, &path, &tmp)); /* Wait until the next event. */ - WT_ERR_TIMEDOUT_OK( + WT_ERR( __wt_cond_wait(session, conn->stat_cond, conn->stat_usecs)); } @@ -441,12 +463,12 @@ __statlog_start(WT_CONNECTION_IMPL *conn) * Start the statistics server thread. */ int -__wt_statlog_create(WT_CONNECTION_IMPL *conn, const char *cfg[]) +__wt_statlog_create(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_SESSION_IMPL *session; + WT_CONNECTION_IMPL *conn; int start; - session = conn->default_session; + conn = S2C(session); start = 0; /* @@ -455,9 +477,9 @@ __wt_statlog_create(WT_CONNECTION_IMPL *conn, const char *cfg[]) * configuration changes - but that makes our lives easier. */ if (conn->stat_session != NULL) - WT_RET(__wt_statlog_destroy(conn, 0)); + WT_RET(__wt_statlog_destroy(session, 0)); - WT_RET_NOTFOUND_OK(__statlog_config(session, cfg, &start)); + WT_RET(__statlog_config(session, cfg, &start)); if (start) WT_RET(__statlog_start(conn)); @@ -469,14 +491,13 @@ __wt_statlog_create(WT_CONNECTION_IMPL *conn, const char *cfg[]) * Destroy the statistics server thread. 
*/ int -__wt_statlog_destroy(WT_CONNECTION_IMPL *conn, int is_close) +__wt_statlog_destroy(WT_SESSION_IMPL *session, int is_close) { + WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION *wt_session; - WT_SESSION_IMPL *session; - char **p; - session = conn->default_session; + conn = S2C(session); F_CLR(conn, WT_CONN_SERVER_STATISTICS); if (conn->stat_tid_set) { @@ -491,11 +512,7 @@ __wt_statlog_destroy(WT_CONNECTION_IMPL *conn, int is_close) WT_TRET(__wt_cond_destroy(session, &conn->stat_cond)); - if ((p = conn->stat_sources) != NULL) { - for (; *p != NULL; ++p) - __wt_free(session, *p); - __wt_free(session, conn->stat_sources); - } + __stat_sources_free(session, &conn->stat_sources); __wt_free(session, conn->stat_path); __wt_free(session, conn->stat_format); diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c index 2d917f9117b..37039504952 100644 --- a/src/conn/conn_sweep.c +++ b/src/conn/conn_sweep.c @@ -109,7 +109,7 @@ __sweep_server(void *arg) F_ISSET(conn, WT_CONN_SERVER_SWEEP)) { /* Wait until the next event. */ - WT_ERR_TIMEDOUT_OK( + WT_ERR( __wt_cond_wait(session, conn->sweep_cond, 30 * WT_MILLION)); /* Sweep the handles. */ @@ -127,9 +127,11 @@ err: __wt_err(session, ret, "handle sweep server error"); * Start the handle sweep thread. */ int -__wt_sweep_create(WT_CONNECTION_IMPL *conn) +__wt_sweep_create(WT_SESSION_IMPL *session) { - WT_SESSION_IMPL *session; + WT_CONNECTION_IMPL *conn; + + conn = S2C(session); /* Set first, the thread might run before we finish up. */ F_SET(conn, WT_CONN_SERVER_SWEEP); @@ -159,13 +161,13 @@ __wt_sweep_create(WT_CONNECTION_IMPL *conn) * Destroy the handle-sweep thread. 
*/ int -__wt_sweep_destroy(WT_CONNECTION_IMPL *conn) +__wt_sweep_destroy(WT_SESSION_IMPL *session) { + WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION *wt_session; - WT_SESSION_IMPL *session; - session = conn->default_session; + conn = S2C(session); F_CLR(conn, WT_CONN_SERVER_SWEEP); if (conn->sweep_tid_set) { diff --git a/src/cursor/cur_backup.c b/src/cursor/cur_backup.c index fc3bd6e3abd..bef0b70b50c 100644 --- a/src/cursor/cur_backup.c +++ b/src/cursor/cur_backup.c @@ -103,11 +103,11 @@ __wt_curbackup_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, - NULL, /* get-key */ + __wt_cursor_get_key, /* get-key */ __wt_cursor_notsup, /* get-value */ __wt_cursor_notsup, /* set-key */ __wt_cursor_notsup, /* set-value */ - NULL, /* compare */ + __wt_cursor_notsup, /* compare */ __curbackup_next, /* next */ __wt_cursor_notsup, /* prev */ __curbackup_reset, /* reset */ diff --git a/src/cursor/cur_config.c b/src/cursor/cur_config.c index 9eb9a6ef74d..b7bd05b4e24 100644 --- a/src/cursor/cur_config.c +++ b/src/cursor/cur_config.c @@ -26,14 +26,14 @@ __wt_curconfig_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, - NULL, /* get-key */ - NULL, /* get-value */ - NULL, /* set-key */ - NULL, /* set-value */ - NULL, /* compare */ + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* get-value */ + __wt_cursor_set_key, /* set-key */ + __wt_cursor_set_value, /* set-value */ + __wt_cursor_notsup, /* compare */ __wt_cursor_notsup, /* next */ __wt_cursor_notsup, /* prev */ - __wt_cursor_notsup, /* reset */ + __wt_cursor_noop, /* reset */ __wt_cursor_notsup, /* search */ __wt_cursor_notsup, /* search-near */ __wt_cursor_notsup, /* insert */ diff --git a/src/cursor/cur_ds.c b/src/cursor/cur_ds.c index 69370f1fa6b..6fa10739856 100644 --- a/src/cursor/cur_ds.c +++ b/src/cursor/cur_ds.c @@ -448,10 +448,10 @@ __wt_curds_open( const char 
*cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, - NULL, /* get-key */ - NULL, /* get-value */ - NULL, /* set-key */ - NULL, /* set-value */ + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* get-value */ + __wt_cursor_set_key, /* set-key */ + __wt_cursor_set_value, /* set-value */ __curds_compare, /* compare */ __curds_next, /* next */ __curds_prev, /* prev */ diff --git a/src/cursor/cur_dump.c b/src/cursor/cur_dump.c index 484a0f15bd6..cec2350307e 100644 --- a/src/cursor/cur_dump.c +++ b/src/cursor/cur_dump.c @@ -350,7 +350,7 @@ __wt_curdump_create(WT_CURSOR *child, WT_CURSOR *owner, WT_CURSOR **cursorp) __curdump_get_value, /* get-value */ __curdump_set_key, /* set-key */ __curdump_set_value, /* set-value */ - NULL, /* compare */ + __wt_cursor_notsup, /* compare */ __curdump_next, /* next */ __curdump_prev, /* prev */ __curdump_reset, /* reset */ diff --git a/src/cursor/cur_file.c b/src/cursor/cur_file.c index e5a1d8a68b6..688419d1916 100644 --- a/src/cursor/cur_file.c +++ b/src/cursor/cur_file.c @@ -10,29 +10,22 @@ /* * WT_BTREE_CURSOR_SAVE_AND_RESTORE * Save the cursor's key/value data/size fields, call an underlying btree - * function, and then consistently handle failure and success. + * function, and then consistently handle failure and success. 
*/ #define WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, f, ret) do { \ - const void *__key_data = (cursor)->key.data; \ - const void *__value_data = (cursor)->value.data; \ + WT_ITEM __key_copy = (cursor)->key; \ uint64_t __recno = (cursor)->recno; \ - size_t __key_size = (cursor)->key.size; \ - size_t __value_size = (cursor)->value.size; \ + WT_ITEM __value_copy = (cursor)->value; \ if (((ret) = (f)) == 0) { \ F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT); \ F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); \ - } else if ((ret) == WT_NOTFOUND) \ - F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); \ - else { \ + } else { \ if (F_ISSET(cursor, WT_CURSTD_KEY_EXT)) { \ (cursor)->recno = __recno; \ - (cursor)->key.data = __key_data; \ - (cursor)->key.size = __key_size; \ - } \ - if (F_ISSET(cursor, WT_CURSTD_VALUE_EXT)) { \ - (cursor)->value.data = __value_data; \ - (cursor)->value.size = __value_size; \ + WT_ITEM_SET((cursor)->key, __key_copy); \ } \ + if (F_ISSET(cursor, WT_CURSTD_VALUE_EXT)) \ + WT_ITEM_SET((cursor)->value, __value_copy); \ F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); \ } \ } while (0) @@ -347,10 +340,10 @@ __wt_curfile_create(WT_SESSION_IMPL *session, WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, - NULL, /* get-key */ - NULL, /* get-value */ - NULL, /* set-key */ - NULL, /* set-value */ + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* get-value */ + __wt_cursor_set_key, /* set-key */ + __wt_cursor_set_value, /* set-value */ __curfile_compare, /* compare */ __curfile_next, /* next */ __curfile_prev, /* prev */ diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c index ae438618bbc..bf73b3612c1 100644 --- a/src/cursor/cur_index.c +++ b/src/cursor/cur_index.c @@ -337,11 +337,11 @@ __wt_curindex_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, - NULL, /* get-key */ + __wt_cursor_get_key, /* get-key */ 
__curindex_get_value, /* get-value */ - NULL, /* set-key */ + __wt_cursor_set_key, /* set-key */ __curindex_set_value, /* set-value */ - NULL, /* compare */ + __wt_cursor_notsup, /* compare */ __curindex_next, /* next */ __curindex_prev, /* prev */ __curindex_reset, /* reset */ diff --git a/src/cursor/cur_log.c b/src/cursor/cur_log.c index e67caa2f4d5..4ecbcae96dd 100644 --- a/src/cursor/cur_log.c +++ b/src/cursor/cur_log.c @@ -308,10 +308,10 @@ __wt_curlog_open(WT_SESSION_IMPL *session, { WT_CONNECTION_IMPL *conn; WT_CURSOR_STATIC_INIT(iface, - NULL, /* get-key */ - NULL, /* get-value */ - NULL, /* set-key */ - NULL, /* set-value */ + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* get-value */ + __wt_cursor_set_key, /* set-key */ + __wt_cursor_set_value, /* set-value */ __curlog_compare, /* compare */ __curlog_next, /* next */ __wt_cursor_notsup, /* prev */ diff --git a/src/cursor/cur_metadata.c b/src/cursor/cur_metadata.c index 39512b91612..30fe3b28625 100644 --- a/src/cursor/cur_metadata.c +++ b/src/cursor/cur_metadata.c @@ -403,10 +403,10 @@ __wt_curmetadata_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, - NULL, /* get-key */ - NULL, /* get-value */ - NULL, /* set-key */ - NULL, /* set-value */ + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* get-value */ + __wt_cursor_set_key, /* set-key */ + __wt_cursor_set_value, /* set-value */ __curmetadata_compare, /* compare */ __curmetadata_next, /* next */ __curmetadata_prev, /* prev */ diff --git a/src/cursor/cur_stat.c b/src/cursor/cur_stat.c index 3dcfa638a3d..fe4660ae0a3 100644 --- a/src/cursor/cur_stat.c +++ b/src/cursor/cur_stat.c @@ -483,7 +483,7 @@ __wt_curstat_open(WT_SESSION_IMPL *session, __curstat_get_value, /* get-value */ __curstat_set_key, /* set-key */ __curstat_set_value, /* set-value */ - NULL, /* compare */ + __wt_cursor_notsup, /* compare */ __curstat_next, /* next */ 
__curstat_prev, /* prev */ __curstat_reset, /* reset */ diff --git a/src/cursor/cur_std.c b/src/cursor/cur_std.c index 98d882d037f..a831afb59e2 100644 --- a/src/cursor/cur_std.c +++ b/src/cursor/cur_std.c @@ -435,19 +435,6 @@ err: cursor->saved_err = ret; API_END(session, ret); } -/* - * __cursor_search -- - * WT_CURSOR->search default implementation. - */ -static int -__cursor_search(WT_CURSOR *cursor) -{ - int exact; - - WT_RET(cursor->search_near(cursor, &exact)); - return ((exact == 0) ? 0 : WT_NOTFOUND); -} - /* * __wt_cursor_close -- * WT_CURSOR->close default implementation. @@ -551,48 +538,12 @@ int __wt_cursor_init(WT_CURSOR *cursor, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) { - WT_CURSOR *cdump; WT_CONFIG_ITEM cval; + WT_CURSOR *cdump; WT_SESSION_IMPL *session; session = (WT_SESSION_IMPL *)cursor->session; - /* - * Fill in unspecified cursor methods: get/set key/value, position - * duplication, search and reconfiguration are all standard, else - * if the method isn't set, assume it's unsupported. 
- */ - if (cursor->get_key == NULL) - cursor->get_key = __wt_cursor_get_key; - if (cursor->get_value == NULL) - cursor->get_value = __wt_cursor_get_value; - if (cursor->set_key == NULL) - cursor->set_key = __wt_cursor_set_key; - if (cursor->set_value == NULL) - cursor->set_value = __wt_cursor_set_value; - if (cursor->compare == NULL) - cursor->compare = (int (*) - (WT_CURSOR *, WT_CURSOR *, int *))__wt_cursor_notsup; - if (cursor->next == NULL) - cursor->next = __wt_cursor_notsup; - if (cursor->prev == NULL) - cursor->prev = __wt_cursor_notsup; - if (cursor->reset == NULL) - cursor->reset = __wt_cursor_noop; - if (cursor->search == NULL) - cursor->search = __cursor_search; - if (cursor->search_near == NULL) - cursor->search_near = - (int (*)(WT_CURSOR *, int *))__wt_cursor_notsup; - if (cursor->insert == NULL) - cursor->insert = __wt_cursor_notsup; - if (cursor->update == NULL) - cursor->update = __wt_cursor_notsup; - if (cursor->remove == NULL) - cursor->remove = __wt_cursor_notsup; - if (cursor->close == NULL) - WT_RET_MSG(session, EINVAL, "cursor lacks a close method"); - if (cursor->uri == NULL) WT_RET(__wt_strdup(session, uri, &cursor->uri)); @@ -620,8 +571,8 @@ __wt_cursor_init(WT_CURSOR *cursor, cursor->remove = __wt_cursor_notsup; } - /* dump */ /* + * dump * If an index cursor is opened with dump, then this * function is called on the index files, with the dump * config string, and with the index cursor as an owner. diff --git a/src/docs/Doxyfile b/src/docs/Doxyfile index 792b255f4ab..5492905f7e9 100644 --- a/src/docs/Doxyfile +++ b/src/docs/Doxyfile @@ -268,7 +268,7 @@ OPTIMIZE_OUTPUT_VHDL = NO # that for custom extensions you also need to set FILE_PATTERNS otherwise the # files are not read by doxygen. 
-EXTENSION_MAPPING = +EXTENSION_MAPPING = in=C # If MARKDOWN_SUPPORT is enabled (the default) then doxygen pre-processes all # comments according to the Markdown format, which allows for more readable diff --git a/src/docs/file-formats.dox b/src/docs/file-formats.dox index 46865da4811..bc747433172 100644 --- a/src/docs/file-formats.dox +++ b/src/docs/file-formats.dox @@ -3,7 +3,8 @@ @section file_formats_formats File formats WiredTiger supports two underlying file formats: row-store and -column-store, both are key/value stores. +column-store, where both are B+tree implementations of key/value stores. +WiredTiger also supports @ref lsm, implemented as a tree of B+trees. In a row-store, both keys and data are variable-length byte strings. In a column-store, keys are 64-bit record numbers (key_format type 'r'), @@ -28,14 +29,38 @@ deleting a value is the same as storing a value of 0. For the same reason, storing a value of 0 will cause cursor scans to skip the record. WiredTiger does not support duplicate data items: there can be only a -single value for any given key, and applications are responsible for -creating unique key/value pairs. +single value associated with any given key, and applications are +responsible for creating unique key/value pairs. WiredTiger allocates space from the underlying files in block units. The minimum file allocation unit WiredTiger supports is 512B and the maximum file allocation unit is 512MB. File block offsets are 64-bit (meaning the maximum file size is very, very large). +@section file_formats_choice Choosing a file format + +The row-store format is the default choice for most applications. When +the primary key is a record number, there are advantages to storing +columns in separate files, or the underlying data is a set of bits, +column-store format may be a better choice. 
+ +Both row- and column-store formats can maintain high volumes of writes, +but for data sets requiring sustained, extreme write throughput, @ref +lsm are usually a better choice. For applications that do not require +extreme write throughput, row- or column-store is likely to be a better +choice because the read throughput is better than with LSM trees (an +effect that becomes more pronounced as additional read threads are added). + +Applications with complex schemas may also benefit from using multiple +storage formats, that is, using a combination of different formats in +the database, and even in individual tables (for example, a sparse, wide +table configured with a column-store primary, where indexes are stored +in an LSM tree). + +Finally, as WiredTiger makes it easy to switch back-and-forth between +storage configurations, it's usually worthwhile benchmarking possible +configurations when there is any question. + @section file_formats_compression File formats and compression Row-stores support four types of compression: key prefix compression, diff --git a/src/docs/install.dox b/src/docs/install.dox index f3f5094eb5d..eae566f4291 100644 --- a/src/docs/install.dox +++ b/src/docs/install.dox @@ -21,15 +21,14 @@ First, clone the repository: git clone git://github.com/wiredtiger/wiredtiger.git @endcode -Second, run the \c build_posix/reconf script: +Second, run \c autogen.sh to create the \c configure script: @code cd wiredtiger -sh build_posix/reconf +sh autogen.sh @endcode -This creates the \c configure script, and you can now proceed with @ref -building. +Now proceed with @ref building. @section building Building WiredTiger diff --git a/src/docs/lsm.dox b/src/docs/lsm.dox index 0313862afdf..b71fccd7151 100644 --- a/src/docs/lsm.dox +++ b/src/docs/lsm.dox @@ -107,17 +107,10 @@ there are chunks in the tree for each cursor that is open on the LSM tree. The number of hazard pointers is configured with the \c "hazard_max" configuration key to ::wiredtiger_open. 
-@subsection lsm_tombstones Empty values - -Internally, WiredTiger's LSM trees use an empty value to represent a -record that has been removed (also known as a "tombstone"). For this -reason, applications cannot store records in LSM trees with empty values. - @subsection lsm_checkpoints Named checkpoints -Named checkpoints are not supported on LSM trees, and cursors cannot be opened -with a non-empty \c "checkpoint" configuration. - -We intend to address these limitations in future releases. +Named checkpoints are not supported on LSM trees, and cursors cannot be +opened with a non-empty \c "checkpoint" configuration (that is, only the +most recent standard checkpoint can be read). */ diff --git a/src/docs/programming.dox b/src/docs/programming.dox index 59eeab7705d..4add19c833b 100644 --- a/src/docs/programming.dox +++ b/src/docs/programming.dox @@ -18,8 +18,8 @@ each of which is ordered by one or more columns.

Storage options

- @subpage schema -- @subpage lsm - @subpage file_formats +- @subpage lsm - @subpage compression

Programming notes

diff --git a/src/docs/spell.ok b/src/docs/spell.ok index 95bdf58fc06..857f89cef05 100644 --- a/src/docs/spell.ok +++ b/src/docs/spell.ok @@ -88,6 +88,7 @@ ar archiver arg async +autogen atomicity autoconf automake @@ -339,7 +340,6 @@ realloc'd recno recnoN recnum -reconf recoverability recs rectype diff --git a/src/docs/tune-bulk-load.dox b/src/docs/tune-bulk-load.dox index 9e89fb7ceea..8ee1061c76c 100644 --- a/src/docs/tune-bulk-load.dox +++ b/src/docs/tune-bulk-load.dox @@ -11,7 +11,9 @@ be used on newly created objects, and an object being bulk-loaded is not accessible from other cursors. Cursors configured for bulk-load only support the WT_CURSOR::insert and -WT_CURSOR::close methods. +WT_CURSOR::close methods. Bulk load inserts are non-transactional: they +cannot be rolled back and ignore the transactional state of the WT_SESSION +in which they are opened. When bulk-loading row-store objects, keys must be loaded in sorted order. diff --git a/src/docs/tune-cursor-persist.dox b/src/docs/tune-cursor-persist.dox index b8420281c69..183a877434e 100644 --- a/src/docs/tune-cursor-persist.dox +++ b/src/docs/tune-cursor-persist.dox @@ -8,18 +8,11 @@ hold positions in objects, and therefore long-lived cursor positions can decrease performance by blocking page eviction or looking like a long-lived transaction. -One solution is to cache cursors, but use the WT_CURSOR::reset method +Best practices are to cache cursors, but use the WT_CURSOR::reset method to discard the cursor's position in the object when the position is no -longer needed. And, use the WT_CURSOR::insert method instead of the -WT_CURSOR::update method, when there's no reason to track a position in -the object, because the WT_CURSOR::insert method never maintains a cursor -position, so there's no need to call WT_CURSOR::reset. 
- -Additionally, cursors are automatically reset whenever a transaction -boundary is crossed; when a transaction is started with the -WT_SESSION::begin_transaction or ended with either -WT_SESSION::commit_transaction or WT_SESSION::rollback_transaction, all -open cursors are automatically reset, there is no need to call the -WT_CURSOR::reset method explicitly. +longer needed. Additionally, use the WT_CURSOR::insert method instead +of the WT_CURSOR::update method when there's no need to hold a position +in the object, because the WT_CURSOR::insert method never holds a cursor +position and there's no need to call WT_CURSOR::reset. */ diff --git a/src/include/btree.i b/src/include/btree.i index 8cd48d222f0..5a333da192d 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -1071,12 +1071,12 @@ __wt_page_hazard_check(WT_SESSION_IMPL *session, WT_PAGE *page) * Randomly choose a depth for a skiplist insert. */ static inline u_int -__wt_skip_choose_depth(void) +__wt_skip_choose_depth(WT_SESSION_IMPL *session) { u_int d; for (d = 1; d < WT_SKIP_MAXDEPTH && - __wt_random() < WT_SKIP_PROBABILITY; d++) + __wt_random(session->rnd) < WT_SKIP_PROBABILITY; d++) ; return (d); } diff --git a/src/include/connection.h b/src/include/connection.h index 03feef68e56..9af23f95cbf 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -70,9 +70,12 @@ struct __wt_connection_impl { WT_SESSION_IMPL *default_session; WT_SESSION_IMPL dummy_session; + const char *cfg; /* Connection configuration */ + WT_SPINLOCK api_lock; /* Connection API spinlock */ WT_SPINLOCK checkpoint_lock; /* Checkpoint spinlock */ WT_SPINLOCK fh_lock; /* File handle queue spinlock */ + WT_SPINLOCK reconfig_lock; /* Single thread reconfigure */ WT_SPINLOCK schema_lock; /* Schema operation spinlock */ /* diff --git a/src/include/dhandle.h b/src/include/dhandle.h index 9a05620c74c..5556627c74d 100644 --- a/src/include/dhandle.h +++ b/src/include/dhandle.h @@ -18,7 +18,7 @@ #define 
WT_SET_BTREE_IN_SESSION(s, b) ((s)->dhandle = b->dhandle) #define WT_CLEAR_BTREE_IN_SESSION(s) ((s)->dhandle = NULL) -#define WT_WITH_DHANDLE(s, d, e) do { \ +#define WT_WITH_DHANDLE(s, d, e) do { \ WT_DATA_HANDLE *__saved_dhandle = (s)->dhandle; \ (s)->dhandle = (d); \ e; \ diff --git a/src/include/error.h b/src/include/error.h index fda63ef8442..8f4d5c9f2e5 100644 --- a/src/include/error.h +++ b/src/include/error.h @@ -35,14 +35,6 @@ goto err; \ } \ } while (0) -#define WT_ERR_TIMEDOUT_OK(a) do { \ - if ((ret = (a)) != 0) { \ - if (ret == ETIMEDOUT) \ - ret = 0; \ - else \ - goto err; \ - } \ -} while (0) #define WT_ERR_TEST(a, v) do { \ if (a) { \ ret = (v); \ @@ -70,12 +62,6 @@ if ((__ret = (a)) != 0 && __ret != WT_NOTFOUND) \ return (__ret); \ } while (0) -#define WT_RET_TIMEDOUT_OK(a) do { \ - int __ret; \ - if ((__ret = (a)) != 0 && __ret != ETIMEDOUT) \ - return (__ret); \ -} while (0) - /* Set "ret" if not already set. */ #define WT_TRET(a) do { \ int __ret; \ diff --git a/src/include/extern.h b/src/include/extern.h index 27a53666a0e..8c9886f5917 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -1,19 +1,19 @@ /* DO NOT EDIT: automatically built by dist/s_prototypes. 
*/ extern void __wt_async_stats_update(WT_SESSION_IMPL *session); -extern int __wt_async_create(WT_CONNECTION_IMPL *conn, const char *cfg[]); -extern int __wt_async_reconfig(WT_CONNECTION_IMPL *conn, const char *cfg[]); -extern int __wt_async_destroy(WT_CONNECTION_IMPL *conn); -extern int __wt_async_flush(WT_CONNECTION_IMPL *conn); -extern int __wt_async_new_op(WT_CONNECTION_IMPL *conn, +extern int __wt_async_create(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_async_reconfig(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_async_destroy(WT_SESSION_IMPL *session); +extern int __wt_async_flush(WT_SESSION_IMPL *session); +extern int __wt_async_new_op(WT_SESSION_IMPL *session, const char *uri, const char *config, const char *cfg[], WT_ASYNC_CALLBACK *cb, WT_ASYNC_OP_IMPL **opp); -extern int __wt_async_op_enqueue(WT_CONNECTION_IMPL *conn, +extern int __wt_async_op_enqueue(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op); -extern int __wt_async_op_init(WT_CONNECTION_IMPL *conn); +extern int __wt_async_op_init(WT_SESSION_IMPL *session); extern void *__wt_async_worker(void *arg); extern int __wt_block_addr_to_buffer(WT_BLOCK *block, uint8_t **pp, @@ -277,6 +277,9 @@ extern int __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop); extern int __wt_btcur_close(WT_CURSOR_BTREE *cbt); extern int __wt_debug_set_verbose(WT_SESSION_IMPL *session, const char *v); +extern int __wt_debug_addr_print( WT_SESSION_IMPL *session, + const uint8_t *addr, + size_t addr_size); extern int __wt_debug_addr(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, @@ -320,8 +323,8 @@ extern void __wt_free_ref_index(WT_SESSION_IMPL *session, int free_pages); extern void __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_evict_server_wake(WT_SESSION_IMPL *session); -extern int __wt_evict_create(WT_CONNECTION_IMPL *conn); -extern int __wt_evict_destroy(WT_CONNECTION_IMPL *conn); +extern int 
__wt_evict_create(WT_SESSION_IMPL *session); +extern int __wt_evict_destroy(WT_SESSION_IMPL *session); extern int __wt_evict_page(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session); extern void __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session); @@ -575,6 +578,9 @@ extern int __wt_config_check(WT_SESSION_IMPL *session, extern int __wt_config_collapse( WT_SESSION_IMPL *session, const char **cfg, const char **config_ret); +extern int __wt_config_merge( WT_SESSION_IMPL *session, + const char **cfg, + const char **config_ret); extern int __wt_config_concat( WT_SESSION_IMPL *session, const char **cfg, const char **config_ret); @@ -594,26 +600,24 @@ extern int __wt_collator_config(WT_SESSION_IMPL *session, const char **cfg, WT_COLLATOR **collatorp, int *ownp); -extern int __wt_conn_remove_collator(WT_CONNECTION_IMPL *conn, +extern int __wt_conn_remove_collator(WT_SESSION_IMPL *session, WT_NAMED_COLLATOR *ncoll); -extern int __wt_conn_remove_compressor( WT_CONNECTION_IMPL *conn, +extern int __wt_conn_remove_compressor( WT_SESSION_IMPL *session, WT_NAMED_COMPRESSOR *ncomp); -extern int __wt_conn_remove_data_source( WT_CONNECTION_IMPL *conn, +extern int __wt_conn_remove_data_source( WT_SESSION_IMPL *session, WT_NAMED_DATA_SOURCE *ndsrc); -extern int __wt_conn_verbose_config(WT_SESSION_IMPL *session, - const char *cfg[]); -extern int __wt_cache_config(WT_CONNECTION_IMPL *conn, const char *cfg[]); -extern int __wt_cache_create(WT_CONNECTION_IMPL *conn, const char *cfg[]); +extern int __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_cache_config(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]); extern void __wt_cache_stats_update(WT_SESSION_IMPL *session); -extern int __wt_cache_destroy(WT_CONNECTION_IMPL *conn); -extern int __wt_conn_cache_pool_config(WT_SESSION_IMPL *session, - const char **cfg); +extern int 
__wt_cache_destroy(WT_SESSION_IMPL *session); +extern int __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg); extern int __wt_conn_cache_pool_open(WT_SESSION_IMPL *session); -extern int __wt_conn_cache_pool_destroy(WT_CONNECTION_IMPL *conn); +extern int __wt_conn_cache_pool_destroy(WT_SESSION_IMPL *session); extern void *__wt_cache_pool_server(void *arg); -extern int __wt_checkpoint_server_create(WT_CONNECTION_IMPL *conn, +extern int __wt_checkpoint_server_create(WT_SESSION_IMPL *session, const char *cfg[]); -extern int __wt_checkpoint_server_destroy(WT_CONNECTION_IMPL *conn); +extern int __wt_checkpoint_server_destroy(WT_SESSION_IMPL *session); extern int __wt_checkpoint_signal(WT_SESSION_IMPL *session, off_t logsize); extern int __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session); extern int __wt_conn_btree_get(WT_SESSION_IMPL *session, @@ -638,20 +642,20 @@ extern int __wt_conn_dhandle_close_all(WT_SESSION_IMPL *session, extern int __wt_conn_dhandle_discard_single( WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle, int final); -extern int __wt_conn_dhandle_discard(WT_CONNECTION_IMPL *conn); +extern int __wt_conn_dhandle_discard(WT_SESSION_IMPL *session); extern int __wt_connection_init(WT_CONNECTION_IMPL *conn); extern int __wt_connection_destroy(WT_CONNECTION_IMPL *conn); -extern int __wt_logmgr_create(WT_CONNECTION_IMPL *conn, const char *cfg[]); -extern int __wt_logmgr_destroy(WT_CONNECTION_IMPL *conn); +extern int __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_logmgr_destroy(WT_SESSION_IMPL *session); extern int __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[]); extern int __wt_connection_close(WT_CONNECTION_IMPL *conn); extern int __wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[]); extern void __wt_conn_stat_init(WT_SESSION_IMPL *session); extern int __wt_statlog_log_one(WT_SESSION_IMPL *session); -extern int __wt_statlog_create(WT_CONNECTION_IMPL *conn, const char 
*cfg[]); -extern int __wt_statlog_destroy(WT_CONNECTION_IMPL *conn, int is_close); -extern int __wt_sweep_create(WT_CONNECTION_IMPL *conn); -extern int __wt_sweep_destroy(WT_CONNECTION_IMPL *conn); +extern int __wt_statlog_create(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_statlog_destroy(WT_SESSION_IMPL *session, int is_close); +extern int __wt_sweep_create(WT_SESSION_IMPL *session); +extern int __wt_sweep_destroy(WT_SESSION_IMPL *session); extern int __wt_curbackup_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], @@ -946,17 +950,19 @@ extern int __wt_clsm_open(WT_SESSION_IMPL *session, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp); +extern int __wt_lsm_manager_config(WT_SESSION_IMPL *session, const char **cfg); extern int __wt_lsm_manager_start(WT_SESSION_IMPL *session); extern void __wt_lsm_manager_free_work_unit( WT_SESSION_IMPL *session, WT_LSM_WORK_UNIT *entry); -extern int __wt_lsm_manager_destroy(WT_CONNECTION_IMPL *conn); +extern int __wt_lsm_manager_destroy(WT_SESSION_IMPL *session); extern int __wt_lsm_manager_clear_tree( WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree); extern int __wt_lsm_manager_pop_entry( WT_SESSION_IMPL *session, uint32_t type, WT_LSM_WORK_UNIT **entryp); -extern int __wt_lsm_manager_push_entry( WT_SESSION_IMPL *session, +extern int __wt_lsm_manager_push_entry(WT_SESSION_IMPL *session, uint32_t type, + uint32_t flags, WT_LSM_TREE *lsm_tree); extern int __wt_lsm_merge_update_tree(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, @@ -1027,10 +1033,14 @@ extern int __wt_lsm_tree_worker(WT_SESSION_IMPL *session, int *), const char *cfg[], uint32_t open_flags); -extern int __wt_lsm_get_chunk_to_flush( WT_SESSION_IMPL *session, +extern int __wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, + int force, WT_LSM_CHUNK **chunkp); -extern int __wt_lsm_bloom_work(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree); +extern int __wt_lsm_work_switch( WT_SESSION_IMPL *session, 
+ WT_LSM_WORK_UNIT **entryp, + int *ran); +extern int __wt_lsm_work_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree); extern int __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk); @@ -1389,7 +1399,10 @@ extern int __wt_schema_range_truncate( WT_SESSION_IMPL *session, WT_CURSOR *stop); extern WT_DATA_SOURCE *__wt_schema_get_source(WT_SESSION_IMPL *session, const char *name); -extern int __wt_schema_name_check(WT_SESSION_IMPL *session, const char *uri); +extern int __wt_str_name_check(WT_SESSION_IMPL *session, const char *str); +extern int __wt_name_check(WT_SESSION_IMPL *session, + const char *str, + size_t len); extern int __wt_schema_worker(WT_SESSION_IMPL *session, const char *uri, int (*file_func)(WT_SESSION_IMPL *, @@ -1570,7 +1583,8 @@ extern uint32_t __wt_nlpo2(uint32_t v); extern uint32_t __wt_log2_int(uint32_t n); extern int __wt_ispo2(uint32_t v); extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2); -extern uint32_t __wt_random(void); +extern void __wt_random_init(uint32_t *rnd); +extern uint32_t __wt_random(uint32_t *rnd); extern int __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size); @@ -1617,8 +1631,11 @@ extern int __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_txn_init(WT_SESSION_IMPL *session); extern void __wt_txn_destroy(WT_SESSION_IMPL *session); -extern int __wt_txn_global_init(WT_CONNECTION_IMPL *conn, const char *cfg[]); -extern void __wt_txn_global_destroy(WT_CONNECTION_IMPL *conn); +extern int __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]); +extern void __wt_txn_global_destroy(WT_SESSION_IMPL *session); +extern int __wt_checkpoint_name_ok(WT_SESSION_IMPL *session, + const char *name, + size_t len); extern int __wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char 
*cfg[]); extern int __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]); @@ -1651,4 +1668,4 @@ extern int __wt_txn_truncate_log( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *stop); extern int __wt_txn_truncate_end(WT_SESSION_IMPL *session); extern int __wt_txn_printlog(WT_SESSION *wt_session, FILE *out); -extern int __wt_txn_recover(WT_SESSION_IMPL *default_session); +extern int __wt_txn_recover(WT_CONNECTION_IMPL *conn); diff --git a/src/include/lsm.h b/src/include/lsm.h index c6c68d1f901..666acddc124 100644 --- a/src/include/lsm.h +++ b/src/include/lsm.h @@ -87,7 +87,7 @@ struct __wt_lsm_chunk { #define WT_LSM_WORK_DROP 0x02 /* Drop unused chunks */ #define WT_LSM_WORK_FLUSH 0x04 /* Flush a chunk to disk */ #define WT_LSM_WORK_MERGE 0x08 /* Look for a tree merge */ -#define WT_LSM_WORK_SWITCH 0x10 /* Switch to a new in memory chunk */ +#define WT_LSM_WORK_SWITCH 0x10 /* Switch to new in-memory chunk */ /* * WT_LSM_WORK_UNIT -- @@ -95,7 +95,9 @@ struct __wt_lsm_chunk { */ struct __wt_lsm_work_unit { TAILQ_ENTRY(__wt_lsm_work_unit) q; /* Worker unit queue */ - uint32_t flags; /* The type of operation */ + uint32_t type; /* Type of operation */ +#define WT_LSM_WORK_FORCE 0x0001 /* Force operation */ + uint32_t flags; /* Flags for operation */ WT_LSM_TREE *lsm_tree; }; @@ -121,6 +123,7 @@ struct __wt_lsm_manager { WT_SPINLOCK switch_lock; /* Lock for switch queue */ WT_SPINLOCK app_lock; /* Lock for application queue */ WT_SPINLOCK manager_lock; /* Lock for manager queue */ + WT_CONDVAR *work_cond; /* Used to notify worker of activity */ uint32_t lsm_workers; /* Current number of LSM workers */ uint32_t lsm_workers_max; WT_LSM_WORKER_ARGS *lsm_worker_cookies; @@ -139,9 +142,9 @@ struct __wt_lsm_tree { const char *collator_name; int refcnt; /* Number of users of the tree */ +#define LSM_TREE_MAX_QUEUE 100 int queue_ref; WT_RWLOCK *rwlock; - WT_CONDVAR *work_cond; /* Used to notify worker of activity */ TAILQ_ENTRY(__wt_lsm_tree) q; WT_DSRC_STATS stats; /* 
LSM-level statistics */ @@ -190,11 +193,11 @@ struct __wt_lsm_tree { int freeing_old_chunks; /* Whether chunks are being freed */ uint32_t merge_aggressiveness; /* Increase amount of work per merge */ -#define WT_LSM_TREE_ACTIVE 0x01 /* Workers are active */ -#define WT_LSM_TREE_COMPACTING 0x02 /* Tree is being compacted */ -#define WT_LSM_TREE_NEED_SWITCH 0x04 /* A new chunk should be created */ -#define WT_LSM_TREE_OPEN 0x08 /* The tree is open */ -#define WT_LSM_TREE_THROTTLE 0x10 /* Throttle updates */ +#define WT_LSM_TREE_ACTIVE 0x01 /* Workers are active */ +#define WT_LSM_TREE_COMPACTING 0x02 /* Tree being compacted */ +#define WT_LSM_TREE_NEED_SWITCH 0x04 /* New chunk needs creating */ +#define WT_LSM_TREE_OPEN 0x08 /* The tree is open */ +#define WT_LSM_TREE_THROTTLE 0x10 /* Throttle updates */ uint32_t flags; #define WT_LSM_TREE_EXCLUSIVE 0x01 /* Tree is opened exclusively */ @@ -226,8 +229,9 @@ struct __wt_lsm_worker_cookie { * State for an LSM worker thread. */ struct __wt_lsm_worker_args { - WT_SESSION_IMPL *session; - pthread_t tid; - u_int id; - uint32_t flags; + WT_SESSION_IMPL *session; /* Session */ + WT_CONDVAR *work_cond; /* Owned by the manager */ + pthread_t tid; /* Thread id */ + u_int id; /* My manager slot id */ + uint32_t type; /* Types of operations handled */ }; diff --git a/src/include/misc.h b/src/include/misc.h index e50038b2c66..aae3cdb53df 100644 --- a/src/include/misc.h +++ b/src/include/misc.h @@ -148,11 +148,17 @@ /* Check if a string matches a prefix. */ #define WT_PREFIX_MATCH(str, pfx) \ - (strncmp((str), (pfx), strlen(pfx)) == 0) + (((const char *)str)[0] == ((const char *)pfx)[0] && \ + strncmp((str), (pfx), strlen(pfx)) == 0) +/* Check if a non-nul-terminated string matches a prefix. */ #define WT_PREFIX_MATCH_LEN(str, len, pfx) \ ((len) >= strlen(pfx) && WT_PREFIX_MATCH(str, pfx)) +/* Check if a string matches a prefix, and move past it. */ +#define WT_PREFIX_SKIP(str, pfx) \ + (WT_PREFIX_MATCH(str, pfx) ? 
((str) += strlen(pfx), 1) : 0) + /* * Check if a variable string equals a constant string. Inline the common * case for WiredTiger of a single byte string. This is required because not @@ -162,14 +168,10 @@ (sizeof(cs) == 2 ? (s)[0] == (cs)[0] && (s)[1] == '\0' : \ strcmp(s, cs) == 0) -/* Check if a string matches a prefix, and move past it. */ -#define WT_PREFIX_SKIP(str, pfx) \ - ((strncmp((str), (pfx), strlen(pfx)) == 0) ? \ - ((str) += strlen(pfx), 1) : 0) - /* Check if a string matches a byte string of len bytes. */ #define WT_STRING_MATCH(str, bytes, len) \ - (strncmp(str, bytes, len) == 0 && (str)[(len)] == '\0') + (((const char *)str)[0] == ((const char *)bytes)[0] && \ + strncmp(str, bytes, len) == 0 && (str)[(len)] == '\0') /* * Macro that produces a string literal that isn't wrapped in quotes, to avoid @@ -186,6 +188,12 @@ ((i)->mem != NULL && (i)->data >= (i)->mem && \ WT_PTRDIFF((i)->data, (i)->mem) < (i)->memsize) +/* Copy the data and size fields of an item. */ +#define WT_ITEM_SET(dst, src) do { \ + (dst).data = (src).data; \ + (dst).size = (src).size; \ +} while (0) + /* * In diagnostic mode we track the locations from which hazard pointers and * scratch buffers were acquired. diff --git a/src/include/schema.h b/src/include/schema.h index 038404f5ea5..e24a19b03ca 100644 --- a/src/include/schema.h +++ b/src/include/schema.h @@ -65,43 +65,37 @@ struct __wt_table { */ #define WT_COLGROUPS(t) WT_MAX((t)->ncolgroups, 1) +/* + * WT_WITH_SCHEMA_LOCK -- + * Acquire the schema lock, perform an operation, drop the lock. 
+ */ #define WT_WITH_SCHEMA_LOCK(session, op) do { \ - int __schema_locked = 0; \ - WT_DECL_SPINLOCK_ID(__id); /* Must appear last */ \ WT_ASSERT(session, \ F_ISSET(session, WT_SESSION_SCHEMA_LOCKED) || \ !F_ISSET(session, WT_SESSION_NO_SCHEMA_LOCK)); \ - while (!F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) \ - if (session->skip_schema_lock || __wt_spin_trylock( \ - session, &S2C(session)->schema_lock, &__id) == 0) { \ - F_SET(session, WT_SESSION_SCHEMA_LOCKED); \ - __schema_locked = 1; \ - } else \ - __wt_yield(); \ - (op); \ - if (__schema_locked) { \ + if (F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) { \ + (op); \ + } else { \ + __wt_spin_lock(session, &S2C(session)->schema_lock); \ + F_SET(session, WT_SESSION_SCHEMA_LOCKED); \ + (op); \ + __wt_spin_unlock(session, &S2C(session)->schema_lock); \ F_CLR(session, WT_SESSION_SCHEMA_LOCKED); \ - if (!session->skip_schema_lock) \ - __wt_spin_unlock( \ - session, &S2C(session)->schema_lock); \ } \ } while (0) -/* Drop the schema lock, and re-acquire after operation. */ +/* + * WT_WITHOUT_SCHEMA_LOCK -- + * Drop the schema lock, perform an operation, re-acquire the lock. 
+ */ #define WT_WITHOUT_SCHEMA_LOCK(session, op) do { \ - WT_DECL_SPINLOCK_ID(__id); /* Must appear last */ \ - if (!F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) \ - (op); \ - else { \ + if (F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) { \ __wt_spin_unlock(session, &S2C(session)->schema_lock); \ F_CLR(session, WT_SESSION_SCHEMA_LOCKED); \ (op); \ - while (!F_ISSET(session, WT_SESSION_SCHEMA_LOCKED)) { \ - if (__wt_spin_trylock(session, \ - &S2C(session)->schema_lock, &__id) == 0) \ - F_SET(session, WT_SESSION_SCHEMA_LOCKED);\ - else \ - __wt_yield(); \ - } \ + __wt_spin_lock(session, &S2C(session)->schema_lock); \ + F_SET(session, WT_SESSION_SCHEMA_LOCKED); \ + } else { \ + (op); \ } \ } while (0) diff --git a/src/include/session.h b/src/include/session.h index 5d566f8b62d..eace12844e9 100644 --- a/src/include/session.h +++ b/src/include/session.h @@ -52,6 +52,8 @@ struct __wt_session_impl { WT_CONDVAR *cond; /* Condition variable */ + uint32_t rnd[2]; /* Random number generation state */ + WT_EVENT_HANDLER *event_handler;/* Application's event handlers */ WT_DATA_HANDLE *dhandle; /* Current data handle */ @@ -113,8 +115,6 @@ struct __wt_session_impl { int (*reconcile_cleanup)(WT_SESSION_IMPL *); int compaction; /* Compaction did some work */ - int skip_schema_lock; /* Another thread holds the schema lock - * on our behalf */ /* * The split stash memory and hazard information persist past session diff --git a/src/include/stat.h b/src/include/stat.h index e2eedd76632..55332e34250 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -207,6 +207,10 @@ struct __wt_connection_stats { WT_STATS lsm_checkpoint_throttle; WT_STATS lsm_merge_throttle; WT_STATS lsm_rows_merged; + WT_STATS lsm_work_queue_app; + WT_STATS lsm_work_queue_manager; + WT_STATS lsm_work_queue_max; + WT_STATS lsm_work_queue_switch; WT_STATS lsm_work_units_created; WT_STATS lsm_work_units_discarded; WT_STATS lsm_work_units_done; diff --git a/src/include/txn.i b/src/include/txn.i index 
3854429f8e4..81559bfe490 100644 --- a/src/include/txn.i +++ b/src/include/txn.i @@ -179,7 +179,7 @@ __wt_txn_read(WT_SESSION_IMPL *session, WT_UPDATE *upd) /* * __wt_txn_autocommit_check -- - * If an auto-commit transaction is required, start one. + * If an auto-commit transaction is required, start one. */ static inline int __wt_txn_autocommit_check(WT_SESSION_IMPL *session) @@ -194,16 +194,6 @@ __wt_txn_autocommit_check(WT_SESSION_IMPL *session) return (0); } -/* - * __wt_txn_current_id -- - * Get the current transaction ID. - */ -static inline uint64_t -__wt_txn_current_id(WT_SESSION_IMPL *session) -{ - return (S2C(session)->txn_global.current); -} - /* * __wt_txn_new_id -- * Allocate a new transaction ID. @@ -211,7 +201,14 @@ __wt_txn_current_id(WT_SESSION_IMPL *session) static inline uint64_t __wt_txn_new_id(WT_SESSION_IMPL *session) { - return WT_ATOMIC_ADD(S2C(session)->txn_global.current, 1); + /* + * We want the global value to lead the allocated values, so that any + * allocated transaction ID eventually becomes globally visible. When + * there are no transactions running, the oldest_id will reach the + * global current ID, so we want post-increment semantics. Our atomic + * add primitive does pre-increment, so adjust the result here. + */ + return WT_ATOMIC_ADD(S2C(session)->txn_global.current, 1) - 1; } /* diff --git a/src/include/verify_build.h b/src/include/verify_build.h index 811602be34a..53c59f90bbd 100644 --- a/src/include/verify_build.h +++ b/src/include/verify_build.h @@ -53,12 +53,10 @@ __wt_verify_build(void) SIZE_CHECK(WT_REF, WT_REF_SIZE); /* - * We mix-and-match 32-bit unsigned values and size_t's, mostly because - * we allocate and handle 32-bit objects, and lots of the underlying C - * library expects size_t values for the length of memory objects. We - * check, just to be sure. + * The btree code encodes key/value pairs in size_t's, and requires at + * least 8B size_t's. 
*/ - STATIC_ASSERT(sizeof(size_t) >= sizeof(uint32_t)); + STATIC_ASSERT(sizeof(size_t) >= sizeof(int64_t)); /* * We require an off_t fit into an 8B chunk because 8B is the largest diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index c84c0d01ae1..58b53361dcc 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -3166,50 +3166,58 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_LSM_MERGE_THROTTLE 1078 /*! rows merged in an LSM tree */ #define WT_STAT_CONN_LSM_ROWS_MERGED 1079 +/*! LSM App work units currently queued */ +#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1080 +/*! LSM Merge work units currently queued */ +#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1081 +/*! LSM tree queue hit maximum */ +#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1082 +/*! LSM Switch work units currently queued */ +#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1083 /*! LSM tree maintenance operations scheduled */ -#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1080 +#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1084 /*! LSM tree maintenance operations discarded */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1081 +#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1085 /*! LSM tree maintenance operations executed */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1082 +#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1086 /*! memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1083 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1087 /*! memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1084 +#define WT_STAT_CONN_MEMORY_FREE 1088 /*! memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1085 +#define WT_STAT_CONN_MEMORY_GROW 1089 /*! total read I/Os */ -#define WT_STAT_CONN_READ_IO 1086 +#define WT_STAT_CONN_READ_IO 1090 /*! page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1087 +#define WT_STAT_CONN_REC_PAGES 1091 /*! 
page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1088 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1092 /*! reconciliation failed because an update could not be included */ -#define WT_STAT_CONN_REC_SKIPPED_UPDATE 1089 +#define WT_STAT_CONN_REC_SKIPPED_UPDATE 1093 /*! split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1090 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1094 /*! split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1091 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1095 /*! pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1092 +#define WT_STAT_CONN_RWLOCK_READ 1096 /*! pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1093 +#define WT_STAT_CONN_RWLOCK_WRITE 1097 /*! open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1094 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1098 /*! transactions */ -#define WT_STAT_CONN_TXN_BEGIN 1095 +#define WT_STAT_CONN_TXN_BEGIN 1099 /*! transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1096 +#define WT_STAT_CONN_TXN_CHECKPOINT 1100 /*! transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1097 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1101 /*! transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1098 +#define WT_STAT_CONN_TXN_COMMIT 1102 /*! transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1099 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1103 /*! transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1100 +#define WT_STAT_CONN_TXN_ROLLBACK 1104 /*! total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1101 +#define WT_STAT_CONN_WRITE_IO 1105 /*! 
* @} diff --git a/src/log/log.c b/src/log/log.c index 12a12f02375..c7d362973aa 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -246,7 +246,7 @@ __wt_log_open(WT_SESSION_IMPL *session) */ if (logcount > 0) { log->trunc_lsn = log->alloc_lsn; - WT_ERR(__wt_txn_recover(session)); + WT_ERR(__wt_txn_recover(conn)); } err: __wt_log_files_free(session, logfiles, logcount); diff --git a/src/log/log_slot.c b/src/log/log_slot.c index cc5dee721fa..8f72763b4e8 100644 --- a/src/log/log_slot.c +++ b/src/log/log_slot.c @@ -106,7 +106,7 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, log = conn->log; slot_grow_attempts = 0; find_slot: - allocated_slot = __wt_random() % SLOT_ACTIVE; + allocated_slot = __wt_random(session->rnd) % SLOT_ACTIVE; slot = log->slot_array[allocated_slot]; old_state = slot->slot_state; join_slot: diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index b4af1c0abf2..801a7469f11 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -50,7 +50,7 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm) WT_LSM_CHUNK *primary_chunk; WT_LSM_TREE *lsm_tree; WT_SESSION_IMPL *session; - int have_primary, need_signal, ovfl; + int have_primary, ovfl; lsm_tree = clsm->lsm_tree; if (clsm->nchunks == 0 || @@ -88,19 +88,14 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm) * when only one switch is required, creating very * small chunks. 
*/ - need_signal = 0; WT_RET(__wt_lsm_tree_lock(session, lsm_tree, 0)); if (clsm->dsk_gen == lsm_tree->dsk_gen && !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) { WT_RET(__wt_lsm_manager_push_entry( - session, WT_LSM_WORK_SWITCH, lsm_tree)); + session, WT_LSM_WORK_SWITCH, 0, lsm_tree)); F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH); - need_signal = 1; } WT_RET(__wt_lsm_tree_unlock(session, lsm_tree)); - if (need_signal) - WT_RET(__wt_cond_signal( - session, lsm_tree->work_cond)); ovfl = 0; } } else if (have_primary) @@ -422,7 +417,6 @@ __clsm_open_cursors( F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH); locked = 0; WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree)); - WT_ERR(__wt_cond_signal(session, lsm_tree->work_cond)); /* * Give the worker thread a chance to run before locking the @@ -706,7 +700,7 @@ __clsm_get_current( multiple = 0; WT_FORALL_CURSORS(clsm, c, i) { - if (!F_ISSET(c, WT_CURSTD_KEY_SET)) + if (!F_ISSET(c, WT_CURSTD_KEY_INT)) continue; if (current == NULL) { current = c; @@ -829,7 +823,7 @@ retry: /* if (F_ISSET(clsm, WT_CLSM_MULTIPLE)) { check = 0; WT_FORALL_CURSORS(clsm, c, i) { - if (!F_ISSET(c, WT_CURSTD_KEY_SET)) + if (!F_ISSET(c, WT_CURSTD_KEY_INT)) continue; if (check) { WT_ERR(WT_LSM_CURCMP(session, @@ -912,7 +906,7 @@ retry: /* if (F_ISSET(clsm, WT_CLSM_MULTIPLE)) { check = 0; WT_FORALL_CURSORS(clsm, c, i) { - if (!F_ISSET(c, WT_CURSTD_KEY_SET)) + if (!F_ISSET(c, WT_CURSTD_KEY_INT)) continue; if (check) { WT_ERR(WT_LSM_CURCMP(session, @@ -964,7 +958,7 @@ __clsm_reset_cursors(WT_CURSOR_LSM *clsm, WT_CURSOR *skip) WT_FORALL_CURSORS(clsm, c, i) { if (c == skip) continue; - if (F_ISSET(c, WT_CURSTD_KEY_SET)) + if (F_ISSET(c, WT_CURSTD_KEY_INT)) WT_TRET(c->reset(c)); } @@ -1448,10 +1442,10 @@ __wt_clsm_open(WT_SESSION_IMPL *session, { WT_CONFIG_ITEM cval; WT_CURSOR_STATIC_INIT(iface, - NULL, /* get-key */ - NULL, /* get-value */ - NULL, /* set-key */ - NULL, /* set-value */ + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* get-value */ + 
__wt_cursor_set_key, /* set-key */ + __wt_cursor_set_value, /* set-value */ __clsm_compare, /* compare */ __clsm_next, /* next */ __clsm_prev, /* prev */ diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c index b888ff36bed..fce030459a3 100644 --- a/src/lsm/lsm_manager.c +++ b/src/lsm/lsm_manager.c @@ -13,9 +13,29 @@ static int __lsm_manager_worker_setup(WT_SESSION_IMPL *); static void * __lsm_worker_manager(void *); +/* + * __wt_lsm_manager_config -- + * Re-configure the LSM manager. + */ +int +__wt_lsm_manager_config(WT_SESSION_IMPL *session, const char **cfg) +{ + WT_CONNECTION_IMPL *conn; + WT_CONFIG_ITEM cval; + + conn = S2C(session); + + WT_RET(__wt_config_gets( + session, cfg, "lsm_manager.worker_thread_max", &cval)); + if (cval.val) + conn->lsm_manager.lsm_workers_max = (uint32_t)cval.val; + return (0); +} + /* * __wt_lsm_manager_start -- - * Start the LSM management infrastructure. + * Start the LSM management infrastructure. Our queues and locks were + * initialized when the connection was initialized. */ int __wt_lsm_manager_start(WT_SESSION_IMPL *session) @@ -88,69 +108,72 @@ __wt_lsm_manager_free_work_unit( * Destroy the LSM manager threads and subsystem. */ int -__wt_lsm_manager_destroy(WT_CONNECTION_IMPL *conn) +__wt_lsm_manager_destroy(WT_SESSION_IMPL *session) { + WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LSM_MANAGER *manager; WT_LSM_WORK_UNIT *current, *next; WT_SESSION *wt_session; - WT_SESSION_IMPL *session; uint32_t i; uint64_t removed; - session = conn->default_session; + conn = S2C(session); manager = &conn->lsm_manager; removed = 0; - if (manager->lsm_worker_cookies == NULL) - return (0); + if (manager->lsm_worker_cookies != NULL) { + /* Wait for the server to notice and wrap up. */ + while (F_ISSET(conn, WT_CONN_SERVER_LSM)) + __wt_yield(); - /* Wait for the server to notice and wrap up. */ - while (F_ISSET(conn, WT_CONN_SERVER_LSM)) - __wt_yield(); + /* Clean up open LSM handles. 
*/ + ret = __wt_lsm_tree_close_all(session); - /* Clean up open LSM handles. */ - ret = __wt_lsm_tree_close_all(conn->default_session); + WT_TRET(__wt_thread_join( + session, manager->lsm_worker_cookies[0].tid)); + manager->lsm_worker_cookies[0].tid = 0; - WT_TRET(__wt_thread_join(session, manager->lsm_worker_cookies[0].tid)); - manager->lsm_worker_cookies[0].tid = 0; + /* Release memory from any operations left on the queue. */ + for (current = TAILQ_FIRST(&manager->switchqh); + current != NULL; current = next) { + next = TAILQ_NEXT(current, q); + TAILQ_REMOVE(&manager->switchqh, current, q); + ++removed; + __wt_lsm_manager_free_work_unit(session, current); + } + for (current = TAILQ_FIRST(&manager->appqh); + current != NULL; current = next) { + next = TAILQ_NEXT(current, q); + TAILQ_REMOVE(&manager->appqh, current, q); + ++removed; + __wt_lsm_manager_free_work_unit(session, current); + } + for (current = TAILQ_FIRST(&manager->managerqh); + current != NULL; current = next) { + next = TAILQ_NEXT(current, q); + TAILQ_REMOVE(&manager->managerqh, current, q); + ++removed; + __wt_lsm_manager_free_work_unit(session, current); + } - /* Release memory from any operations left on the queue. */ - for (current = TAILQ_FIRST(&manager->switchqh); - current != NULL; current = next) { - next = TAILQ_NEXT(current, q); - TAILQ_REMOVE(&manager->switchqh, current, q); - ++removed; - __wt_lsm_manager_free_work_unit(session, current); - } - for (current = TAILQ_FIRST(&manager->appqh); - current != NULL; current = next) { - next = TAILQ_NEXT(current, q); - TAILQ_REMOVE(&manager->appqh, current, q); - ++removed; - __wt_lsm_manager_free_work_unit(session, current); - } - for (current = TAILQ_FIRST(&manager->managerqh); - current != NULL; current = next) { - next = TAILQ_NEXT(current, q); - TAILQ_REMOVE(&manager->managerqh, current, q); - ++removed; - __wt_lsm_manager_free_work_unit(session, current); + /* Close all LSM worker sessions. 
*/ + for (i = 0; i < manager->lsm_workers_max; i++) { + wt_session = + &manager->lsm_worker_cookies[i].session->iface; + WT_TRET(wt_session->close(wt_session, NULL)); + } + + WT_STAT_FAST_CONN_INCRV(session, + lsm_work_units_discarded, removed); + __wt_free(session, manager->lsm_worker_cookies); } - /* Close all LSM worker sessions. */ - for (i = 0; i < manager->lsm_workers_max; i++) { - wt_session = &manager->lsm_worker_cookies[i].session->iface; - WT_TRET(wt_session->close(wt_session, NULL)); - } - - WT_STAT_FAST_CONN_INCRV(session, lsm_work_units_discarded, removed); - + /* Free resources that are allocated in connection initialize */ __wt_spin_destroy(session, &manager->switch_lock); __wt_spin_destroy(session, &manager->app_lock); __wt_spin_destroy(session, &manager->manager_lock); - - __wt_free(session, manager->lsm_worker_cookies); + WT_TRET(__wt_cond_destroy(session, &manager->work_cond)); return (ret); } @@ -164,7 +187,7 @@ __lsm_manager_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { struct timespec now; uint64_t chunk_wait, stallms; - u_int old_aggressive; + u_int new_aggressive; WT_RET(__wt_epoch(session, &now)); stallms = WT_TIMEDIFF(now, lsm_tree->last_flush_ts) / WT_MILLION; @@ -173,24 +196,28 @@ __lsm_manager_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) * been created by now. Use 10 seconds as a default if we don't have an * estimate. */ - chunk_wait = stallms / (lsm_tree->chunk_fill_ms == 0 ? - 10000 : lsm_tree->chunk_fill_ms); - old_aggressive = lsm_tree->merge_aggressiveness; - lsm_tree->merge_aggressiveness = - (u_int)(chunk_wait / lsm_tree->merge_min); + if (lsm_tree->nchunks > 1) + chunk_wait = stallms / (lsm_tree->chunk_fill_ms == 0 ? 
+ 10000 : lsm_tree->chunk_fill_ms); + else + chunk_wait = 0; + new_aggressive = (u_int)(chunk_wait / lsm_tree->merge_min); - if (lsm_tree->merge_aggressiveness > old_aggressive) + if (new_aggressive > lsm_tree->merge_aggressiveness) { WT_RET(__wt_verbose(session, WT_VERB_LSM, - "LSM merge got aggressive (%u), " - "%u / %" PRIu64, - lsm_tree->merge_aggressiveness, stallms, - lsm_tree->chunk_fill_ms)); + "LSM merge %s got aggressive (old %u new %u), " + "merge_min %d, %u / %" PRIu64, + lsm_tree->name, lsm_tree->merge_aggressiveness, + new_aggressive, lsm_tree->merge_min, stallms, + lsm_tree->chunk_fill_ms)); + lsm_tree->merge_aggressiveness = new_aggressive; + } return (0); } /* * __lsm_manager_worker_setup -- - * Do setup owned by the LSM manager thread includes starting the worker + * Do setup owned by the LSM manager thread including starting the worker * threads. */ static int @@ -204,18 +231,14 @@ __lsm_manager_worker_setup(WT_SESSION_IMPL *session) manager = &conn->lsm_manager; WT_ASSERT(session, manager->lsm_workers == 1); - - /* Setup the spin locks for the queues. */ - WT_RET(__wt_spin_init( - session, &manager->app_lock, "LSM application queue lock")); - WT_RET(__wt_spin_init( - session, &manager->manager_lock, "LSM manager queue lock")); - WT_RET(__wt_spin_init( - session, &manager->switch_lock, "LSM switch queue lock")); - + /* + * The LSM manager is worker[0]. The switch thread is worker[1]. + * Setup and start the switch/drop worker explicitly. + */ worker_args = &manager->lsm_worker_cookies[1]; + worker_args->work_cond = manager->work_cond; worker_args->id = manager->lsm_workers++; - worker_args->flags = WT_LSM_WORK_SWITCH; + worker_args->type = WT_LSM_WORK_DROP | WT_LSM_WORK_SWITCH; /* Start the switch thread. 
*/ WT_RET(__wt_lsm_worker_start(session, worker_args)); @@ -229,8 +252,9 @@ __lsm_manager_worker_setup(WT_SESSION_IMPL *session) /* Freed by the worker thread when it shuts down */ worker_args = &manager->lsm_worker_cookies[manager->lsm_workers]; + worker_args->work_cond = manager->work_cond; worker_args->id = manager->lsm_workers; - worker_args->flags = + worker_args->type = WT_LSM_WORK_BLOOM | WT_LSM_WORK_DROP | WT_LSM_WORK_FLUSH | @@ -242,7 +266,7 @@ __lsm_manager_worker_setup(WT_SESSION_IMPL *session) * least one thread capable of running merges. */ if (manager->lsm_workers % 2 == 1) - F_SET(worker_args, WT_LSM_WORK_MERGE); + FLD_SET(worker_args->type, WT_LSM_WORK_MERGE); WT_RET(__wt_lsm_worker_start(session, worker_args)); } return (0); @@ -267,6 +291,7 @@ __lsm_manager_worker_shutdown(WT_SESSION_IMPL *session) */ for (i = 1; i < manager->lsm_workers; i++) { WT_ASSERT(session, manager->lsm_worker_cookies[i].tid != 0); + WT_TRET(__wt_cond_signal(session, manager->work_cond)); WT_TRET(__wt_thread_join( session, manager->lsm_worker_cookies[i].tid)); } @@ -318,24 +343,35 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session) * to how often new chunks are being created add some * more. 
*/ - if ((!lsm_tree->modified && lsm_tree->nchunks > 1) || - lsm_tree->merge_aggressiveness > 3 || + if (lsm_tree->queue_ref >= LSM_TREE_MAX_QUEUE) + WT_STAT_FAST_CONN_INCR(session, + lsm_work_queue_max); + else if ((!lsm_tree->modified && + lsm_tree->nchunks > 1) || (lsm_tree->queue_ref == 0 && lsm_tree->nchunks > 1) || + (lsm_tree->merge_aggressiveness > 3 && + !F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) || pushms > fillms) { WT_RET(__wt_lsm_manager_push_entry( - session, WT_LSM_WORK_SWITCH, lsm_tree)); + session, WT_LSM_WORK_SWITCH, 0, lsm_tree)); WT_RET(__wt_lsm_manager_push_entry( - session, WT_LSM_WORK_FLUSH, lsm_tree)); + session, WT_LSM_WORK_DROP, 0, lsm_tree)); WT_RET(__wt_lsm_manager_push_entry( - session, WT_LSM_WORK_BLOOM, lsm_tree)); + session, WT_LSM_WORK_FLUSH, 0, lsm_tree)); WT_RET(__wt_lsm_manager_push_entry( - session, WT_LSM_WORK_MERGE, lsm_tree)); - } - if (lsm_tree->queue_ref == 0 && - lsm_tree->nold_chunks != 0) { + session, WT_LSM_WORK_BLOOM, 0, lsm_tree)); + WT_RET(__wt_verbose(session, WT_VERB_LSM, + "MGR %s: queue %d mod %d nchunks %d" + " flags 0x%x aggressive %d pushms %" PRIu64 + " fillms %" PRIu64, + lsm_tree->name, lsm_tree->queue_ref, + lsm_tree->modified, lsm_tree->nchunks, + lsm_tree->flags, + lsm_tree->merge_aggressiveness, + pushms, fillms)); WT_RET(__wt_lsm_manager_push_entry( - session, WT_LSM_WORK_DROP, lsm_tree)); + session, WT_LSM_WORK_MERGE, 0, lsm_tree)); } } } @@ -428,6 +464,25 @@ __wt_lsm_manager_clear_tree( return (0); } +/* + * We assume this is only called from __wt_lsm_manager_pop_entry and we + * have session, entry and type available to use. If the queue is empty + * we may return from the macro. 
+ */ +#define LSM_POP_ENTRY(qh, qlock, qlen) do { \ + if (TAILQ_EMPTY(qh)) \ + return (0); \ + __wt_spin_lock(session, qlock); \ + TAILQ_FOREACH(entry, (qh), q) { \ + if (FLD_ISSET(type, entry->type)) { \ + TAILQ_REMOVE(qh, entry, q); \ + WT_STAT_FAST_CONN_DECR(session, qlen); \ + break; \ + } \ + } \ + __wt_spin_unlock(session, (qlock)); \ +} while (0) + /* * __wt_lsm_manager_pop_entry -- * Retrieve the head of the queue, if it matches the requested work @@ -444,71 +499,43 @@ __wt_lsm_manager_pop_entry( *entryp = NULL; entry = NULL; - switch (type) { - case WT_LSM_WORK_SWITCH: - if (TAILQ_EMPTY(&manager->switchqh)) - return (0); - - __wt_spin_lock(session, &manager->switch_lock); - if (!TAILQ_EMPTY(&manager->switchqh)) { - entry = TAILQ_FIRST(&manager->switchqh); - WT_ASSERT(session, entry != NULL); - TAILQ_REMOVE(&manager->switchqh, entry, q); - } - __wt_spin_unlock(session, &manager->switch_lock); - break; - case WT_LSM_WORK_MERGE: - if (TAILQ_EMPTY(&manager->managerqh)) - return (0); - - __wt_spin_lock(session, &manager->manager_lock); - if (!TAILQ_EMPTY(&manager->managerqh)) { - entry = TAILQ_FIRST(&manager->managerqh); - WT_ASSERT(session, entry != NULL); - if (F_ISSET(entry, type)) - TAILQ_REMOVE(&manager->managerqh, entry, q); - else - entry = NULL; - } - - __wt_spin_unlock(session, &manager->manager_lock); - break; - default: - /* - * The app queue is the only one that has multiple different - * work unit types, allow a request for a variety. 
- */ - WT_ASSERT(session, FLD_ISSET(type, WT_LSM_WORK_BLOOM) || - FLD_ISSET(type, WT_LSM_WORK_DROP) || - FLD_ISSET(type, WT_LSM_WORK_FLUSH)); - if (TAILQ_EMPTY(&manager->appqh)) - return (0); - - __wt_spin_lock(session, &manager->app_lock); - if (!TAILQ_EMPTY(&manager->appqh)) { - entry = TAILQ_FIRST(&manager->appqh); - WT_ASSERT(session, entry != NULL); - if (FLD_ISSET(type, entry->flags)) - TAILQ_REMOVE(&manager->appqh, entry, q); - else - entry = NULL; - } - __wt_spin_unlock(session, &manager->app_lock); - break; - } + /* + * Pop the entry off the correct queue based on our work type. + */ + if (type == WT_LSM_WORK_SWITCH) + LSM_POP_ENTRY(&manager->switchqh, + &manager->switch_lock, lsm_work_queue_switch); + else if (type == WT_LSM_WORK_MERGE) + LSM_POP_ENTRY(&manager->managerqh, + &manager->manager_lock, lsm_work_queue_manager); + else + LSM_POP_ENTRY(&manager->appqh, + &manager->app_lock, lsm_work_queue_app); if (entry != NULL) WT_STAT_FAST_CONN_INCR(session, lsm_work_units_done); *entryp = entry; return (0); } +/* + * Push a work unit onto the appropriate queue. This macro assumes we are + * called from __wt_lsm_manager_push_entry and we have session and entry + * available for use. + */ +#define LSM_PUSH_ENTRY(qh, qlock, qlen) do { \ + __wt_spin_lock(session, qlock); \ + TAILQ_INSERT_TAIL((qh), entry, q); \ + WT_STAT_FAST_CONN_INCR(session, qlen); \ + __wt_spin_unlock(session, qlock); \ +} while (0) + /* * __wt_lsm_manager_push_entry -- * Add an entry to the end of the switch queue. 
*/ int -__wt_lsm_manager_push_entry( - WT_SESSION_IMPL *session, uint32_t type, WT_LSM_TREE *lsm_tree) +__wt_lsm_manager_push_entry(WT_SESSION_IMPL *session, + uint32_t type, uint32_t flags, WT_LSM_TREE *lsm_tree) { WT_LSM_MANAGER *manager; WT_LSM_WORK_UNIT *entry; @@ -518,31 +545,23 @@ __wt_lsm_manager_push_entry( WT_RET(__wt_epoch(session, &lsm_tree->work_push_ts)); WT_RET(__wt_calloc_def(session, 1, &entry)); - entry->flags = type; + entry->type = type; + entry->flags = flags; entry->lsm_tree = lsm_tree; (void)WT_ATOMIC_ADD(lsm_tree->queue_ref, 1); WT_STAT_FAST_CONN_INCR(session, lsm_work_units_created); - switch (type) { - case WT_LSM_WORK_SWITCH: - __wt_spin_lock(session, &manager->switch_lock); - TAILQ_INSERT_TAIL(&manager->switchqh, entry, q); - __wt_spin_unlock(session, &manager->switch_lock); - break; - case WT_LSM_WORK_BLOOM: - case WT_LSM_WORK_DROP: - case WT_LSM_WORK_FLUSH: - __wt_spin_lock(session, &manager->app_lock); - TAILQ_INSERT_TAIL(&manager->appqh, entry, q); - __wt_spin_unlock(session, &manager->app_lock); - break; - case WT_LSM_WORK_MERGE: - __wt_spin_lock(session, &manager->manager_lock); - TAILQ_INSERT_TAIL(&manager->managerqh, entry, q); - __wt_spin_unlock(session, &manager->manager_lock); - break; - WT_ILLEGAL_VALUE(session); - } + if (type == WT_LSM_WORK_SWITCH) + LSM_PUSH_ENTRY(&manager->switchqh, + &manager->switch_lock, lsm_work_queue_switch); + else if (type == WT_LSM_WORK_MERGE) + LSM_PUSH_ENTRY(&manager->managerqh, + &manager->manager_lock, lsm_work_queue_manager); + else + LSM_PUSH_ENTRY(&manager->appqh, + &manager->app_lock, lsm_work_queue_app); + + WT_RET(__wt_cond_signal(session, manager->work_cond)); return (0); } diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c index 70b1503377e..964aeb9529d 100644 --- a/src/lsm/lsm_merge.c +++ b/src/lsm/lsm_merge.c @@ -61,6 +61,7 @@ __wt_lsm_merge( uint32_t aggressive, generation, max_gap, max_gen, max_level, start_id; uint64_t insert_count, record_count, chunk_size; u_int dest_id, 
end_chunk, i, merge_max, merge_min, nchunks, start_chunk; + u_int verb; int create_bloom, locked, tret; const char *cfg[3]; const char *drop_cfg[] = @@ -72,16 +73,17 @@ __wt_lsm_merge( dest = src = NULL; locked = 0; start_id = 0; - aggressive = lsm_tree->merge_aggressiveness; /* - * If the tree is open read-only be very aggressive. Otherwise, we can - * spend a long time waiting for merges to start in read-only - * applications. + * If the tree is open read-only or we are compacting, be very + * aggressive. Otherwise, we can spend a long time waiting for merges + * to start in read-only applications. */ - if (!lsm_tree->modified) + if (!lsm_tree->modified || + F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) lsm_tree->merge_aggressiveness = 10; + aggressive = lsm_tree->merge_aggressiveness; merge_max = (aggressive > 5) ? 100 : lsm_tree->merge_min; merge_min = (aggressive > 5) ? 2 : lsm_tree->merge_min; max_gap = (aggressive + 4) / 5; @@ -249,10 +251,22 @@ __wt_lsm_merge( /* Allocate an ID for the merge. */ dest_id = WT_ATOMIC_ADD(lsm_tree->last, 1); - WT_RET(__wt_verbose(session, WT_VERB_LSM, - "Merging chunks %u-%u into %u (%" PRIu64 " records)" - ", generation %" PRIu32, - start_chunk, end_chunk, dest_id, record_count, generation)); + /* + * We only want to do the chunk loop if we're running with verbose, + * so we wrap these statements in the conditional. Avoid the loop + * in the normal path. 
+ */ + if (WT_VERBOSE_ISSET(session, WT_VERB_LSM)) { + WT_RET(__wt_verbose(session, WT_VERB_LSM, + "Merging %s chunks %u-%u into %u (%" PRIu64 " records)" + ", generation %" PRIu32, + lsm_tree->name, + start_chunk, end_chunk, dest_id, record_count, generation)); + for (verb = start_chunk; verb <= end_chunk; verb++) + WT_RET(__wt_verbose(session, WT_VERB_LSM, + "%s: Chunk[%u] id %u", + lsm_tree->name, verb, lsm_tree->chunk[verb]->id)); + } WT_RET(__wt_calloc_def(session, 1, &chunk)); chunk->id = dest_id; @@ -341,6 +355,13 @@ __wt_lsm_merge( F_CLR(session, WT_SESSION_NO_CACHE); + /* + * We're doing advisory reads to fault the new trees into cache. + * Don't block if the cache is full: our next unit of work may be to + * discard some trees to free space. + */ + F_SET(session, WT_SESSION_NO_CACHE_CHECK); + if (create_bloom) { if (ret == 0) WT_TRET(__wt_bloom_finalize(bloom)); @@ -419,7 +440,7 @@ __wt_lsm_merge( /* Schedule a pass to discard old chunks */ WT_ERR(__wt_lsm_manager_push_entry( - session, WT_LSM_WORK_DROP, lsm_tree)); + session, WT_LSM_WORK_DROP, 0, lsm_tree)); err: if (locked) WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree)); @@ -450,7 +471,7 @@ err: if (locked) else WT_TRET(__wt_verbose(session, WT_VERB_LSM, "Merge failed with %s", wiredtiger_strerror(ret))); - F_CLR(session, WT_SESSION_NO_CACHE); } + F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK); return (ret); } diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c index e0a977acf07..988c39607b6 100644 --- a/src/lsm/lsm_tree.c +++ b/src/lsm/lsm_tree.c @@ -35,7 +35,6 @@ __lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) __wt_free(session, lsm_tree->file_config); WT_TRET(__wt_rwlock_destroy(session, &lsm_tree->rwlock)); - WT_TRET(__wt_cond_destroy(session, &lsm_tree->work_cond)); for (i = 0; i < lsm_tree->nchunks; i++) { if ((chunk = lsm_tree->chunk[i]) == NULL) @@ -442,7 +441,6 @@ __lsm_tree_open( WT_RET(__wt_calloc_def(session, 1, &lsm_tree)); 
WT_ERR(__wt_rwlock_alloc(session, &lsm_tree->rwlock, "lsm tree")); - WT_ERR(__wt_cond_alloc(session, "lsm ckpt", 0, &lsm_tree->work_cond)); WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri)); WT_ERR(__wt_lsm_meta_read(session, lsm_tree)); @@ -735,7 +733,7 @@ __wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) /* Set the switch transaction in the previous chunk, if necessary. */ if (chunk != NULL && chunk->switch_txn == WT_TXN_NONE) - chunk->switch_txn = __wt_txn_current_id(session); + chunk->switch_txn = __wt_txn_new_id(session); /* Update the throttle time. */ __wt_lsm_tree_throttle(session, lsm_tree, 0); @@ -746,8 +744,8 @@ __wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) nchunks + 1, &lsm_tree->chunk)); WT_ERR(__wt_verbose(session, WT_VERB_LSM, - "Tree switch to: %" PRIu32 ", checkpoint throttle %ld, " - "merge throttle %ld", + "Tree %s switch to: %" PRIu32 ", checkpoint throttle %ld, " + "merge throttle %ld", lsm_tree->name, new_id, lsm_tree->ckpt_throttle, lsm_tree->merge_throttle)); WT_ERR(__wt_calloc_def(session, 1, &chunk)); @@ -771,7 +769,7 @@ err: WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree)); WT_PANIC_RET(session, ret, "Failed doing LSM switch"); else if (!first_switch) WT_RET(__wt_lsm_manager_push_entry( - session, WT_LSM_WORK_FLUSH, lsm_tree)); + session, WT_LSM_WORK_FLUSH, 0, lsm_tree)); return (ret); } @@ -1009,9 +1007,10 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip) WT_LSM_CHUNK *chunk; WT_LSM_TREE *lsm_tree; time_t begin, end; - int i, compacting, locked; + int i, compacting, flushing, locked, ref; - compacting = locked = 0; + compacting = flushing = locked = ref = 0; + chunk = NULL; /* * This function is applied to all matching sources: ignore anything * that is not an LSM tree. @@ -1030,6 +1029,19 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip) WT_ERR(__wt_seconds(session, &begin)); + /* + * Compacting has two distinct phases. + * 1. 
All in-memory chunks up to and including the current + * current chunk must be flushed. Normally, the flush code + * does not flush the last, in-use chunk, so we set a force + * flag to include that last chunk. We monitor the state of the + * last chunk and periodically push another forced flush work + * unit until it is complete. + * 2. After all flushing is done, we move onto the merging + * phase for compaction. Again, we monitor the state and + * continue to push merge work units until all merging is done. + */ + /* Lock the tree: single-thread compaction. */ WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1)); locked = 1; @@ -1038,39 +1050,90 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip) lsm_tree->merge_throttle = 0; lsm_tree->merge_aggressiveness = 0; - /* If another thread started compacting this tree, we're done. */ + /* If another thread started a compact on this tree, we're done. */ if (F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) goto err; - compacting = 1; - F_SET(lsm_tree, WT_LSM_TREE_COMPACTING); - /* * Set the switch transaction on the current chunk, if it * hasn't been set before. This prevents further writes, so it * can be flushed by the checkpoint worker. */ if (lsm_tree->nchunks > 0 && - (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) != NULL && - chunk->switch_txn == WT_TXN_NONE) - chunk->switch_txn = __wt_txn_current_id(session); + (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) != NULL) { + if (chunk->switch_txn == WT_TXN_NONE) + chunk->switch_txn = __wt_txn_new_id(session); + /* + * If we have a chunk, we want to look for it to be on-disk. + * So we need to add a reference to keep it available. + */ + (void)WT_ATOMIC_ADD(chunk->refcnt, 1); + ref = 1; + } locked = 0; WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree)); - /* Make sure the in-memory chunk gets flushed but not switched. 
*/ - WT_ERR(__wt_lsm_manager_push_entry( - session, WT_LSM_WORK_FLUSH, lsm_tree)); + if (chunk != NULL) { + WT_ERR(__wt_verbose(session, WT_VERB_LSM, + "Compact force flush %s flags 0x%" PRIx32 + " chunk %u flags 0x%" + PRIx32, name, lsm_tree->flags, chunk->id, chunk->flags)); + flushing = 1; + /* + * Make sure the in-memory chunk gets flushed do not push a + * switch, because we don't want to create a new in-memory + * chunk if the tree is being used read-only now. + */ + WT_ERR(__wt_lsm_manager_push_entry(session, + WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE, lsm_tree)); + } else { + /* + * If there is no chunk to flush, go straight to the + * compacting state. + */ + compacting = 1; + F_SET(lsm_tree, WT_LSM_TREE_COMPACTING); + WT_ERR(__wt_verbose(session, WT_VERB_LSM, + "COMPACT: Start compacting %s", lsm_tree->name)); + } /* Wait for the work unit queues to drain. */ while (F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) { + /* + * The flush flag is cleared when the chunk has been flushed. + * Continue to push forced flushes until the chunk is on disk. + * Once it is on disk move to the compacting phase. + */ + if (flushing) { + WT_ASSERT(session, chunk != NULL); + if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) { + WT_ERR(__wt_verbose(session, + WT_VERB_LSM, + "Compact flush done %s chunk %u. " + "Start compacting", + name, chunk->id)); + (void)WT_ATOMIC_SUB(chunk->refcnt, 1); + flushing = ref = 0; + compacting = 1; + F_SET(lsm_tree, WT_LSM_TREE_COMPACTING); + } else { + WT_ERR(__wt_verbose(session, WT_VERB_LSM, + "Compact flush retry %s chunk %u", + name, chunk->id)); + WT_ERR(__wt_lsm_manager_push_entry(session, + WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE, + lsm_tree)); + } + } + /* * The compacting flag is cleared when no merges can be done. * Ensure that we push through some aggressive merges before * stopping otherwise we might not do merges that would * span chunks with different generations. 
*/ - if (!F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) { + if (compacting && !F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) { if (lsm_tree->merge_aggressiveness < 10) { F_SET(lsm_tree, WT_LSM_TREE_COMPACTING); lsm_tree->merge_aggressiveness = 10; @@ -1088,21 +1151,29 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip) * done. If we are pushing merges, make sure they are * aggressive, to avoid duplicating effort. */ + if (compacting) #define COMPACT_PARALLEL_MERGES 5 - for (i = lsm_tree->queue_ref; - i < COMPACT_PARALLEL_MERGES; i++) { - lsm_tree->merge_aggressiveness = 10; - WT_ERR(__wt_lsm_manager_push_entry( - session, WT_LSM_WORK_MERGE, lsm_tree)); - } + for (i = lsm_tree->queue_ref; + i < COMPACT_PARALLEL_MERGES; i++) { + lsm_tree->merge_aggressiveness = 10; + WT_ERR(__wt_lsm_manager_push_entry( + session, WT_LSM_WORK_MERGE, 0, lsm_tree)); + } } -err: if (locked) - WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree)); - /* Ensure the compacting flag is cleared if we set it. */ +err: + /* Ensure anything we set is cleared. */ + if (ref) + (void)WT_ATOMIC_SUB(chunk->refcnt, 1); if (compacting) { F_CLR(lsm_tree, WT_LSM_TREE_COMPACTING); lsm_tree->merge_aggressiveness = 0; } + if (locked) + WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree)); + + WT_TRET(__wt_verbose(session, WT_VERB_LSM, + "Compact %s complete, return %d", name, ret)); + __wt_lsm_tree_release(session, lsm_tree); return (ret); diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c index fd40c0dac1f..050c364f5d8 100644 --- a/src/lsm/lsm_work_unit.c +++ b/src/lsm/lsm_work_unit.c @@ -66,10 +66,10 @@ err: WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree)); * Find and pin a chunk in the LSM tree that is likely to need flushing. 
*/ int -__wt_lsm_get_chunk_to_flush( - WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK **chunkp) +__wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session, + WT_LSM_TREE *lsm_tree, int force, WT_LSM_CHUNK **chunkp) { - u_int i; + u_int i, end; *chunkp = NULL; @@ -78,9 +78,19 @@ __wt_lsm_get_chunk_to_flush( if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) return (__wt_lsm_tree_unlock(session, lsm_tree)); - for (i = 0; i < lsm_tree->nchunks - 1; i++) { + /* + * Normally we don't want to force out the last chunk. But if we're + * doing a forced flush, likely from a compact call, then we want + * to include the final chunk. + */ + end = force ? lsm_tree->nchunks : lsm_tree->nchunks - 1; + for (i = 0; i < end; i++) { if (!F_ISSET(lsm_tree->chunk[i], WT_LSM_CHUNK_ONDISK)) { (void)WT_ATOMIC_ADD(lsm_tree->chunk[i]->refcnt, 1); + WT_RET(__wt_verbose(session, WT_VERB_LSM, + "Flush%s: return chunk %u of %u: %s", + force ? " w/ force" : "", i, end - 1, + lsm_tree->chunk[i]->uri)); *chunkp = lsm_tree->chunk[i]; break; } @@ -112,23 +122,53 @@ __lsm_unpin_chunks(WT_SESSION_IMPL *session, WT_LSM_WORKER_COOKIE *cookie) } /* - * __wt_lsm_bloom_work -- + * __wt_lsm_work_switch -- + * Do a switch if the LSM tree needs one. + */ +int +__wt_lsm_work_switch( + WT_SESSION_IMPL *session, WT_LSM_WORK_UNIT **entryp, int *ran) +{ + WT_DECL_RET; + WT_LSM_WORK_UNIT *entry; + + /* We've become responsible for freeing the work unit. */ + entry = *entryp; + *ran = 0; + *entryp = NULL; + + if (F_ISSET(entry->lsm_tree, WT_LSM_TREE_NEED_SWITCH)) { + WT_WITH_SCHEMA_LOCK(session, ret = + __wt_lsm_tree_switch(session, entry->lsm_tree)); + /* Failing to complete the switch is fine */ + if (ret == EBUSY) + ret = 0; + else + *ran = 1; + } + __wt_lsm_manager_free_work_unit(session, entry); + return (ret); +} + +/* + * __wt_lsm_work_bloom -- * Try to create a Bloom filter for the newest on-disk chunk that doesn't * have one. 
*/ int -__wt_lsm_bloom_work(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +__wt_lsm_work_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { WT_DECL_RET; WT_LSM_CHUNK *chunk; WT_LSM_WORKER_COOKIE cookie; - u_int i; + u_int i, merge; WT_CLEAR(cookie); WT_RET(__lsm_copy_chunks(session, lsm_tree, &cookie, 0)); /* Create bloom filters in all checkpointed chunks. */ + merge = 0; for (i = 0; i < cookie.nchunks; i++) { chunk = cookie.chunk_array[i]; @@ -147,14 +187,28 @@ __wt_lsm_bloom_work(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) * recheck that the chunk still needs a Bloom filter. */ if (WT_ATOMIC_CAS(chunk->bloom_busy, 0, 1)) { - if (!F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) + if (!F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) { ret = __lsm_bloom_create( session, lsm_tree, chunk, (u_int)i); + /* + * Record if we were successful so that we can + * later push a merge work unit. + */ + if (ret == 0) + merge = 1; + } chunk->bloom_busy = 0; break; } } + /* + * If we created any bloom filters, we push a merge work unit now. + */ + if (merge) + WT_ERR(__wt_lsm_manager_push_entry( + session, WT_LSM_WORK_MERGE, 0, lsm_tree)); +err: __lsm_unpin_chunks(session, &cookie); __wt_free(session, cookie.chunk_array); return (ret); @@ -186,16 +240,25 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, else WT_RET_MSG(session, ret, "discard handle"); } - if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) + if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) { + WT_RET(__wt_verbose(session, WT_VERB_LSM, + "LSM worker %s already on disk", + chunk->uri)); return (0); + } /* Stop if a running transaction needs the chunk. 
*/ __wt_txn_update_oldest(session); if (chunk->switch_txn == WT_TXN_NONE || - !__wt_txn_visible_all(session, chunk->switch_txn)) + !__wt_txn_visible_all(session, chunk->switch_txn)) { + WT_RET(__wt_verbose(session, WT_VERB_LSM, + "LSM worker %s: running transaction, return", + chunk->uri)); return (0); + } - WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker flushing")); + WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker flushing %s", + chunk->uri)); /* * Flush the file before checkpointing: this is the expensive part in @@ -218,7 +281,8 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, } WT_RET(ret); - WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing")); + WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing %s", + chunk->uri)); WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_worker(session, chunk->uri, @@ -259,15 +323,16 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, /* Make sure we aren't pinning a transaction ID. */ __wt_txn_release_snapshot(session); - WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointed")); + WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointed %s", + chunk->uri)); /* * Schedule a bloom filter create for our newly flushed chunk */ if (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF)) WT_RET(__wt_lsm_manager_push_entry( - session, WT_LSM_WORK_BLOOM, lsm_tree)); + session, WT_LSM_WORK_BLOOM, 0, lsm_tree)); else WT_RET(__wt_lsm_manager_push_entry( - session, WT_LSM_WORK_MERGE, lsm_tree)); + session, WT_LSM_WORK_MERGE, 0, lsm_tree)); return (0); } @@ -334,7 +399,15 @@ __lsm_bloom_create(WT_SESSION_IMPL *session, F_CLR(session, WT_SESSION_NO_CACHE); - /* Load the new Bloom filter into cache. */ + /* + * Load the new Bloom filter into cache. + * + * We're doing advisory reads to fault the new trees into cache. + * Don't block if the cache is full: our next unit of work may be to + * discard some trees to free space. 
+ */ + F_SET(session, WT_SESSION_NO_CACHE_CHECK); + WT_CLEAR(key); WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key)); @@ -355,7 +428,7 @@ __lsm_bloom_create(WT_SESSION_IMPL *session, err: if (bloom != NULL) WT_TRET(__wt_bloom_close(bloom)); - F_CLR(session, WT_SESSION_NO_CACHE); + F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK); return (ret); } diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c index d3018e4362a..3b662e2ebee 100644 --- a/src/lsm/lsm_worker.c +++ b/src/lsm/lsm_worker.c @@ -13,7 +13,7 @@ static void * __lsm_worker(void *); /* * __wt_lsm_worker_start -- - * A wrapper around the LSM worker thread start + * A wrapper around the LSM worker thread start. */ int __wt_lsm_worker_start(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args) @@ -23,7 +23,7 @@ __wt_lsm_worker_start(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args) /* * __lsm_worker_general_op -- - * Execute a single bloom, drop or flush work unit + * Execute a single bloom, drop or flush work unit. */ static int __lsm_worker_general_op( @@ -32,34 +32,43 @@ __lsm_worker_general_op( WT_DECL_RET; WT_LSM_CHUNK *chunk; WT_LSM_WORK_UNIT *entry; + int force; *completed = 0; - if (!F_ISSET(cookie, WT_LSM_WORK_FLUSH) && - !F_ISSET(cookie, WT_LSM_WORK_DROP) && - !F_ISSET(cookie, WT_LSM_WORK_BLOOM)) - return (WT_NOTFOUND); + /* + * Return if this thread cannot process a bloom, drop or flush. 
+ */ + if (!FLD_ISSET(cookie->type, + WT_LSM_WORK_BLOOM | WT_LSM_WORK_DROP | WT_LSM_WORK_FLUSH)) + return (WT_NOTFOUND); if ((ret = __wt_lsm_manager_pop_entry(session, - cookie->flags, &entry)) != 0 || entry == NULL) + cookie->type, &entry)) != 0 || entry == NULL) return (ret); - if (entry->flags == WT_LSM_WORK_FLUSH) { - WT_ERR(__wt_lsm_get_chunk_to_flush( - session, entry->lsm_tree, &chunk)); + if (entry->type == WT_LSM_WORK_FLUSH) { + force = F_ISSET(entry, WT_LSM_WORK_FORCE); + F_CLR(entry, WT_LSM_WORK_FORCE); + WT_ERR(__wt_lsm_get_chunk_to_flush(session, + entry->lsm_tree, force, &chunk)); + /* + * If we got a chunk to flush, checkpoint it. + */ if (chunk != NULL) { + WT_ERR(__wt_verbose(session, WT_VERB_LSM, + "Flush%s chunk %d %s", + force ? " w/ force" : "", + chunk->id, chunk->uri)); ret = __wt_lsm_checkpoint_chunk( session, entry->lsm_tree, chunk); WT_ASSERT(session, chunk->refcnt > 0); (void)WT_ATOMIC_SUB(chunk->refcnt, 1); WT_ERR(ret); } - } else if (entry->flags == WT_LSM_WORK_DROP) + } else if (entry->type == WT_LSM_WORK_DROP) WT_ERR(__wt_lsm_free_chunks(session, entry->lsm_tree)); - else if (entry->flags == WT_LSM_WORK_BLOOM) { - WT_ERR(__wt_lsm_bloom_work(session, entry->lsm_tree)); - WT_ERR(__wt_lsm_manager_push_entry( - session, WT_LSM_WORK_MERGE, entry->lsm_tree)); - } + else if (entry->type == WT_LSM_WORK_BLOOM) + WT_ERR(__wt_lsm_work_bloom(session, entry->lsm_tree)); *completed = 1; err: __wt_lsm_manager_free_work_unit(session, entry); @@ -78,7 +87,7 @@ __lsm_worker(void *arg) WT_LSM_WORK_UNIT *entry; WT_LSM_WORKER_ARGS *cookie; WT_SESSION_IMPL *session; - int ran; + int progress, ran; cookie = (WT_LSM_WORKER_ARGS *)arg; session = cookie->session; @@ -86,45 +95,42 @@ __lsm_worker(void *arg) entry = NULL; while (F_ISSET(conn, WT_CONN_SERVER_RUN)) { - /* Don't busy wait if there aren't any LSM trees. 
*/ - if (TAILQ_EMPTY(&conn->lsmqh)) { - __wt_sleep(0, 10000); - continue; - } + progress = 0; - /* Switches are always a high priority */ - while (F_ISSET(cookie, WT_LSM_WORK_SWITCH) && + /* + * Workers process the different LSM work queues. Some workers + * can handle several or all work unit types. So the code is + * prioritized so important operations happen first. + * Switches are the highest priority. + */ + while (FLD_ISSET(cookie->type, WT_LSM_WORK_SWITCH) && (ret = __wt_lsm_manager_pop_entry( session, WT_LSM_WORK_SWITCH, &entry)) == 0 && - entry != NULL) { - /* - * Don't exit the switch thread because a single - * switch fails. Keep trying until we are told to - * shut down. - */ - WT_WITH_SCHEMA_LOCK(session, ret = - __wt_lsm_tree_switch(session, entry->lsm_tree)); - - __wt_lsm_manager_free_work_unit(session, entry); - entry = NULL; - - if (ret == EBUSY) - ret = 0; - WT_ERR(ret); - } + entry != NULL) + WT_ERR( + __wt_lsm_work_switch(session, &entry, &progress)); /* Flag an error if the pop failed. */ WT_ERR(ret); + /* + * Next the general operations. + */ ret = __lsm_worker_general_op(session, cookie, &ran); if (ret == EBUSY || ret == WT_NOTFOUND) ret = 0; WT_ERR(ret); + progress = progress || ran; - if (F_ISSET(cookie, WT_LSM_WORK_MERGE) && + /* + * Finally see if there is any merge work we can do. This is + * last because the earlier operations may result in adding + * merge work to the queue. + */ + if (FLD_ISSET(cookie->type, WT_LSM_WORK_MERGE) && (ret = __wt_lsm_manager_pop_entry( session, WT_LSM_WORK_MERGE, &entry)) == 0 && entry != NULL) { - WT_ASSERT(session, entry->flags == WT_LSM_WORK_MERGE); + WT_ASSERT(session, entry->type == WT_LSM_WORK_MERGE); ret = __wt_lsm_merge(session, entry->lsm_tree, cookie->id); if (ret == WT_NOTFOUND) { @@ -136,9 +142,17 @@ __lsm_worker(void *arg) WT_CLEAR_BTREE_IN_SESSION(session); __wt_lsm_manager_free_work_unit(session, entry); entry = NULL; + progress = 1; } /* Flag an error if the pop failed. 
*/ WT_ERR(ret); + + /* Don't busy wait if there was any work to do. */ + if (!progress) { + WT_ERR( + __wt_cond_wait(session, cookie->work_cond, 10000)); + continue; + } } if (ret != 0) { diff --git a/src/os_posix/os_open.c b/src/os_posix/os_open.c index 9a0bff8df16..a826573d99f 100644 --- a/src/os_posix/os_open.c +++ b/src/os_posix/os_open.c @@ -46,12 +46,13 @@ __open_directory_sync(WT_SESSION_IMPL *session, char *path) err: WT_SYSCALL_RETRY(close(fd), ret); if (ret != 0) - WT_ERR_MSG(session, ret, "%s: close", path); + __wt_err(session, ret, "%s: close", path); + return (ret); #else WT_UNUSED(session); WT_UNUSED(path); -#endif return (0); +#endif } /* diff --git a/src/schema/schema_truncate.c b/src/schema/schema_truncate.c index 91f38206265..da526dfe5f4 100644 --- a/src/schema/schema_truncate.c +++ b/src/schema/schema_truncate.c @@ -88,7 +88,7 @@ __truncate_dsrc(WT_SESSION_IMPL *session, const char *uri) WT_ERR_NOTFOUND_OK(ret); err: WT_TRET(cursor->close(cursor)); - return (0); + return (ret); } /* diff --git a/src/schema/schema_util.c b/src/schema/schema_util.c index 90e5fb42dc1..263f56f1c41 100644 --- a/src/schema/schema_util.c +++ b/src/schema/schema_util.c @@ -23,11 +23,11 @@ __wt_schema_get_source(WT_SESSION_IMPL *session, const char *name) } /* - * __wt_schema_name_check -- + * __wt_str_name_check -- * Disallow any use of the WiredTiger name space. */ int -__wt_schema_name_check(WT_SESSION_IMPL *session, const char *uri) +__wt_str_name_check(WT_SESSION_IMPL *session, const char *str) { const char *name, *sep; int skipped; @@ -37,7 +37,7 @@ __wt_schema_name_check(WT_SESSION_IMPL *session, const char *uri) * "bad" if the application truncated the metadata file. Skip any * leading URI prefix, check and then skip over a table name. 
*/ - name = uri; + name = str; for (skipped = 0; skipped < 2; skipped++) { if ((sep = strchr(name, ':')) == NULL) break; @@ -62,3 +62,23 @@ __wt_schema_name_check(WT_SESSION_IMPL *session, const char *uri) return (0); } + +/* + * __wt_name_check -- + * Disallow any use of the WiredTiger name space. + */ +int +__wt_name_check(WT_SESSION_IMPL *session, const char *str, size_t len) +{ + WT_DECL_RET; + WT_DECL_ITEM(tmp); + + WT_RET(__wt_scr_alloc(session, len, &tmp)); + + WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)len, str)); + + ret = __wt_str_name_check(session, tmp->data); + +err: __wt_scr_free(&tmp); + return (ret); +} diff --git a/src/session/session_api.c b/src/session/session_api.c index e63e2c0284a..d4d5fe5e2c9 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -188,42 +188,85 @@ __wt_open_cursor(WT_SESSION_IMPL *session, { WT_COLGROUP *colgroup; WT_DATA_SOURCE *dsrc; - WT_DECL_RET; - if (WT_PREFIX_MATCH(uri, "backup:")) - ret = __wt_curbackup_open(session, uri, cfg, cursorp); - else if (WT_PREFIX_MATCH(uri, "colgroup:")) { - /* - * Column groups are a special case: open a cursor on the - * underlying data source. 
- */ - WT_RET(__wt_schema_get_colgroup(session, uri, NULL, &colgroup)); - ret = __wt_open_cursor( - session, colgroup->source, owner, cfg, cursorp); - } else if (WT_PREFIX_MATCH(uri, "config:")) - ret = __wt_curconfig_open(session, uri, cfg, cursorp); - else if (WT_PREFIX_MATCH(uri, "file:")) - ret = __wt_curfile_open(session, uri, owner, cfg, cursorp); - else if (WT_PREFIX_MATCH(uri, "lsm:")) - ret = __wt_clsm_open(session, uri, owner, cfg, cursorp); - else if (WT_PREFIX_MATCH(uri, WT_METADATA_URI)) - ret = __wt_curmetadata_open(session, uri, owner, cfg, cursorp); - else if (WT_PREFIX_MATCH(uri, "index:")) - ret = __wt_curindex_open(session, uri, owner, cfg, cursorp); - else if (WT_PREFIX_MATCH(uri, "log:")) - ret = __wt_curlog_open(session, uri, cfg, cursorp); - else if (WT_PREFIX_MATCH(uri, "statistics:")) - ret = __wt_curstat_open(session, uri, cfg, cursorp); - else if (WT_PREFIX_MATCH(uri, "table:")) - ret = __wt_curtable_open(session, uri, cfg, cursorp); - else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL) - ret = dsrc->open_cursor == NULL ? + /* + * Open specific cursor types we know about, or call the generic data + * source open function. + * + * Unwind a set of string comparisons into a switch statement hoping + * the compiler can make it fast, but list the common choices first + * instead of sorting so if/else patterns are still fast. + */ + switch (uri[0]) { + /* + * Common cursor types. + */ + case 't': + if (WT_PREFIX_MATCH(uri, "table:")) + return (__wt_curtable_open(session, uri, cfg, cursorp)); + break; + case 'c': + if (WT_PREFIX_MATCH(uri, "colgroup:")) { + /* + * Column groups are a special case: open a cursor on + * the underlying data source. 
+ */ + WT_RET(__wt_schema_get_colgroup( + session, uri, NULL, &colgroup)); + return (__wt_open_cursor( + session, colgroup->source, owner, cfg, cursorp)); + } + + if (WT_PREFIX_MATCH(uri, "config:")) + return ( + __wt_curconfig_open(session, uri, cfg, cursorp)); + break; + case 'i': + if (WT_PREFIX_MATCH(uri, "index:")) + return (__wt_curindex_open( + session, uri, owner, cfg, cursorp)); + break; + case 'l': + if (WT_PREFIX_MATCH(uri, "lsm:")) + return (__wt_clsm_open( + session, uri, owner, cfg, cursorp)); + + if (WT_PREFIX_MATCH(uri, "log:")) + return (__wt_curlog_open(session, uri, cfg, cursorp)); + break; + + /* + * Less common cursor types. + */ + case 'f': + if (WT_PREFIX_MATCH(uri, "file:")) + return (__wt_curfile_open( + session, uri, owner, cfg, cursorp)); + break; + case 'm': + if (WT_PREFIX_MATCH(uri, WT_METADATA_URI)) + return (__wt_curmetadata_open( + session, uri, owner, cfg, cursorp)); + break; + case 'b': + if (WT_PREFIX_MATCH(uri, "backup:")) + return ( + __wt_curbackup_open(session, uri, cfg, cursorp)); + break; + case 's': + if (WT_PREFIX_MATCH(uri, "statistics:")) + return (__wt_curstat_open(session, uri, cfg, cursorp)); + break; + default: + break; + } + + if ((dsrc = __wt_schema_get_source(session, uri)) != NULL) + return (dsrc->open_cursor == NULL ? __wt_object_unsupported(session, uri) : - __wt_curds_open(session, uri, owner, cfg, dsrc, cursorp); - else - ret = __wt_bad_object_type(session, uri); + __wt_curds_open(session, uri, owner, cfg, dsrc, cursorp)); - return (ret); + return (__wt_bad_object_type(session, uri)); } /* @@ -319,7 +362,7 @@ __session_create(WT_SESSION *wt_session, const char *uri, const char *config) WT_UNUSED(cfg); /* Disallow objects in the WiredTiger name space. */ - WT_ERR(__wt_schema_name_check(session, uri)); + WT_ERR(__wt_str_name_check(session, uri)); /* * Type configuration only applies to tables, column groups and indexes. 
@@ -387,8 +430,8 @@ __session_rename(WT_SESSION *wt_session, SESSION_API_CALL(session, rename, config, cfg); /* Disallow objects in the WiredTiger name space. */ - WT_ERR(__wt_schema_name_check(session, uri)); - WT_ERR(__wt_schema_name_check(session, newuri)); + WT_ERR(__wt_str_name_check(session, uri)); + WT_ERR(__wt_str_name_check(session, newuri)); WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_rename(session, uri, newuri, cfg)); @@ -408,7 +451,7 @@ __session_compact(WT_SESSION *wt_session, const char *uri, const char *config) session = (WT_SESSION_IMPL *)wt_session; /* Disallow objects in the WiredTiger name space. */ - WT_RET(__wt_schema_name_check(session, uri)); + WT_RET(__wt_str_name_check(session, uri)); if (!WT_PREFIX_MATCH(uri, "colgroup:") && !WT_PREFIX_MATCH(uri, "file:") && @@ -434,7 +477,7 @@ __session_drop(WT_SESSION *wt_session, const char *uri, const char *config) SESSION_API_CALL(session, drop, config, cfg); /* Disallow objects in the WiredTiger name space. */ - WT_ERR(__wt_schema_name_check(session, uri)); + WT_ERR(__wt_str_name_check(session, uri)); WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_drop(session, uri, cfg)); @@ -496,7 +539,7 @@ __session_truncate(WT_SESSION *wt_session, if (uri != NULL) { /* Disallow objects in the WiredTiger name space. */ - WT_ERR(__wt_schema_name_check(session, uri)); + WT_ERR(__wt_str_name_check(session, uri)); WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_truncate(session, uri, cfg)); @@ -876,6 +919,8 @@ __wt_open_session(WT_CONNECTION_IMPL *conn, WT_ERR(__wt_cond_alloc(session, "session", 0, &session_ret->cond)); + __wt_random_init(session_ret->rnd); + __wt_event_handler_set(session_ret, event_handler == NULL ? 
session->event_handler : event_handler); diff --git a/src/support/rand.c b/src/support/rand.c index 248f9c59ff1..b716eb8c58b 100644 --- a/src/support/rand.c +++ b/src/support/rand.c @@ -27,6 +27,22 @@ #include "wt_internal.h" +#undef M_W +#define M_W (rnd)[0] +#undef M_Z +#define M_Z (rnd)[1] + +/* + * __wt_random_init -- + * Initialize return of a 32-bit pseudo-random number. + */ +void +__wt_random_init(uint32_t *rnd) +{ + M_W = 521288629; + M_Z = 362436069; +} + /* * __wt_random -- * Return a 32-bit pseudo-random number. @@ -43,13 +59,11 @@ * forever. Take local copies of the shared values to avoid this. */ uint32_t -__wt_random(void) +__wt_random(uint32_t *rnd) { - static uint32_t m_w = 521288629; - static uint32_t m_z = 362436069; - uint32_t w = m_w, z = m_z; + uint32_t w = M_W, z = M_Z; - m_z = z = 36969 * (z & 65535) + (z >> 16); - m_w = w = 18000 * (w & 65535) + (w >> 16); + M_Z = z = 36969 * (z & 65535) + (z >> 16); + M_W = w = 18000 * (w & 65535) + (w >> 16); return (z << 16) + (w & 65535); } diff --git a/src/support/stat.c b/src/support/stat.c index 482e3d3923b..edf64b3f19b 100644 --- a/src/support/stat.c +++ b/src/support/stat.c @@ -412,6 +412,12 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats) "sleep for LSM checkpoint throttle"; stats->lsm_merge_throttle.desc = "sleep for LSM merge throttle"; stats->lsm_rows_merged.desc = "rows merged in an LSM tree"; + stats->lsm_work_queue_app.desc = "LSM App work units currently queued"; + stats->lsm_work_queue_manager.desc = + "LSM Merge work units currently queued"; + stats->lsm_work_queue_max.desc = "LSM tree queue hit maximum"; + stats->lsm_work_queue_switch.desc = + "LSM Switch work units currently queued"; stats->lsm_work_units_created.desc = "LSM tree maintenance operations scheduled"; stats->lsm_work_units_discarded.desc = @@ -526,6 +532,7 @@ __wt_stat_refresh_connection_stats(void *stats_arg) stats->lsm_checkpoint_throttle.v = 0; stats->lsm_merge_throttle.v = 0; stats->lsm_rows_merged.v = 0; 
+ stats->lsm_work_queue_max.v = 0; stats->lsm_work_units_created.v = 0; stats->lsm_work_units_discarded.v = 0; stats->lsm_work_units_done.v = 0; diff --git a/src/txn/txn.c b/src/txn/txn.c index 3e18eecc962..bbcb39e2dc2 100644 --- a/src/txn/txn.c +++ b/src/txn/txn.c @@ -474,15 +474,16 @@ __wt_txn_destroy(WT_SESSION_IMPL *session) * Initialize the global transaction state. */ int -__wt_txn_global_init(WT_CONNECTION_IMPL *conn, const char *cfg[]) +__wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) { - WT_SESSION_IMPL *session; + WT_CONNECTION_IMPL *conn; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *s; u_int i; WT_UNUSED(cfg); - session = conn->default_session; + conn = S2C(session); + txn_global = &conn->txn_global; txn_global->current = 1; txn_global->oldest_id = 1; @@ -501,12 +502,12 @@ __wt_txn_global_init(WT_CONNECTION_IMPL *conn, const char *cfg[]) * Destroy the global transaction state. */ void -__wt_txn_global_destroy(WT_CONNECTION_IMPL *conn) +__wt_txn_global_destroy(WT_SESSION_IMPL *session) { - WT_SESSION_IMPL *session; + WT_CONNECTION_IMPL *conn; WT_TXN_GLOBAL *txn_global; - session = conn->default_session; + conn = S2C(session); txn_global = &conn->txn_global; if (txn_global != NULL) diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 71f1c8bb2ae..0bebce927fe 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -7,6 +7,30 @@ #include "wt_internal.h" +/* + * __wt_checkpoint_name_ok -- + * Complain if the checkpoint name isn't acceptable. + */ +int +__wt_checkpoint_name_ok(WT_SESSION_IMPL *session, const char *name, size_t len) +{ + /* Check for characters we don't want to see in a metadata file. */ + WT_RET(__wt_name_check(session, name, len)); + + /* + * The internal checkpoint name is special, applications aren't allowed + * to use it. Be aggressive and disallow any matching prefix, it makes + * things easier when checking in other places. 
+ */ + if (len < strlen(WT_CHECKPOINT)) + return (0); + if (!WT_PREFIX_MATCH(name, WT_CHECKPOINT)) + return (0); + + WT_RET_MSG(session, EINVAL, + "the checkpoint name \"%s\" is reserved", WT_CHECKPOINT); +} + /* * __checkpoint_name_check -- * Check for an attempt to name a checkpoint that includes anything @@ -75,9 +99,11 @@ __checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[], target_list = 0; - /* Flag if this is a named checkpoint. */ - WT_ERR(__wt_config_gets(session, cfg, "name", &cval)); + /* Flag if this is a named checkpoint, and check if the name is OK. */ + WT_RET(__wt_config_gets(session, cfg, "name", &cval)); named = cval.len != 0; + if (named) + WT_RET(__wt_checkpoint_name_ok(session, cval.str, cval.len)); /* Step through the targets and optionally operate on each one. */ WT_ERR(__wt_config_gets(session, cfg, "target", &cval)); @@ -412,27 +438,6 @@ err: /* return (ret); } -/* - * __ckpt_name_ok -- - * Complain if our reserved checkpoint name is used. - */ -static int -__ckpt_name_ok(WT_SESSION_IMPL *session, const char *name, size_t len) -{ - /* - * The internal checkpoint name is special, applications aren't allowed - * to use it. Be aggressive and disallow any matching prefix, it makes - * things easier when checking in other places. - */ - if (len < strlen(WT_CHECKPOINT)) - return (0); - if (!WT_PREFIX_MATCH(name, WT_CHECKPOINT)) - return (0); - - WT_RET_MSG(session, EINVAL, - "the checkpoint name \"%s\" is reserved", WT_CHECKPOINT); -} - /* * __drop -- * Drop all checkpoints with a specific name. 
@@ -575,7 +580,7 @@ __checkpoint_worker( if (cval.len == 0) name = WT_CHECKPOINT; else { - WT_ERR(__ckpt_name_ok(session, cval.str, cval.len)); + WT_ERR(__wt_checkpoint_name_ok(session, cval.str, cval.len)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &name_alloc)); name = name_alloc; } @@ -588,12 +593,12 @@ __checkpoint_worker( WT_ERR(__wt_config_subinit(session, &dropconf, &cval)); while ((ret = __wt_config_next(&dropconf, &k, &v)) == 0) { - /* Disallow the reserved checkpoint name. */ + /* Disallow unsafe checkpoint names. */ if (v.len == 0) - WT_ERR(__ckpt_name_ok( + WT_ERR(__wt_checkpoint_name_ok( session, k.str, k.len)); else - WT_ERR(__ckpt_name_ok( + WT_ERR(__wt_checkpoint_name_ok( session, v.str, v.len)); if (v.len == 0) diff --git a/src/txn/txn_recover.c b/src/txn/txn_recover.c index d52a471449a..2fce335dee9 100644 --- a/src/txn/txn_recover.c +++ b/src/txn/txn_recover.c @@ -399,16 +399,14 @@ err: if (r->nfiles > r->max_fileid) * Run recovery. */ int -__wt_txn_recover(WT_SESSION_IMPL *default_session) +__wt_txn_recover(WT_CONNECTION_IMPL *conn) { - WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_RECOVERY r; WT_SESSION_IMPL *session; const char *config; int was_backup; - conn = S2C(default_session); WT_CLEAR(r); INIT_LSN(&r.ckpt_lsn); was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP) ? 
1 : 0; diff --git a/test/checkpoint/checkpointer.c b/test/checkpoint/checkpointer.c index 9f12e323878..57478350b52 100644 --- a/test/checkpoint/checkpointer.c +++ b/test/checkpoint/checkpointer.c @@ -188,7 +188,7 @@ verify_checkpoint(WT_SESSION *session) continue; t_ret = cursors[i]->next(cursors[i]); if (t_ret != 0 && t_ret != WT_NOTFOUND) { - (void)log_print_err("cursor->next", ret, 1); + (void)log_print_err("cursor->next", t_ret, 1); goto err; } @@ -224,7 +224,7 @@ err: for (i = 0; i < g.ntables; i++) { "verify_checkpoint:cursor close", ret, 1); } free(cursors); - return (0); + return (ret); } /* diff --git a/test/checkpoint/workers.c b/test/checkpoint/workers.c index 1a01dda649c..b7858cb8292 100644 --- a/test/checkpoint/workers.c +++ b/test/checkpoint/workers.c @@ -169,11 +169,14 @@ real_worker(void) { WT_CURSOR **cursors; WT_SESSION *session; + uint32_t rnd[2]; u_int i, keyno; int j, ret, t_ret; ret = t_ret = 0; + __wt_random_init(rnd); + if ((cursors = calloc( (size_t)(g.ntables), sizeof(WT_CURSOR *))) == NULL) return (log_print_err("malloc", ENOMEM, 1)); @@ -197,7 +200,7 @@ real_worker(void) "real_worker:begin_transaction", ret, 1); goto err; } - keyno = __wt_random() % g.nkeys + 1; + keyno = __wt_random(rnd) % g.nkeys + 1; for (j = 0; j < g.ntables; j++) { if ((ret = worker_op(cursors[j], keyno, i)) != 0) break; diff --git a/test/format/config.c b/test/format/config.c index 509dc7684ec..519c5db7890 100644 --- a/test/format/config.c +++ b/test/format/config.c @@ -243,26 +243,32 @@ config_compression(void) cp = config_find("compression", strlen("compression")); if (!(cp->flags & C_PERM)) { cstr = "compression=none"; - switch (MMRAND(1, 10)) { - case 1: case 2: case 3: /* 30% */ + switch (MMRAND(1, 20)) { + case 1: case 2: case 3: /* 30% no compression */ + case 4: case 5: case 6: break; - case 4: case 5: /* 20% */ + case 7: case 8: case 9: case 10: /* 20% bzip */ if (access(BZIP_PATH, R_OK) == 0) cstr = "compression=bzip"; break; - case 6: /* 10% */ + case 
11: /* 5% bzip-raw */ if (access(BZIP_PATH, R_OK) == 0) cstr = "compression=bzip-raw"; break; - case 7: case 8: /* 20% */ + case 12: case 13: case 14: case 15: /* 20% snappy */ if (access(SNAPPY_PATH, R_OK) == 0) cstr = "compression=snappy"; break; - case 9: case 10: /* 20% */ + case 16: case 17: case 18: case 19: /* 20% zlib */ if (access(ZLIB_PATH, R_OK) == 0) cstr = "compression=zlib"; break; + case 20: /* 5% zlib-no-raw */ + if (access(ZLIB_PATH, R_OK) == 0) + cstr = "compression=zlib-noraw"; + break; } + config_single(cstr, 0); } @@ -281,6 +287,7 @@ config_compression(void) die(0, "snappy library not found or not readable"); break; case COMPRESS_ZLIB: + case COMPRESS_ZLIB_NO_RAW: if (access(ZLIB_PATH, R_OK) != 0) die(0, "zlib library not found or not readable"); break; @@ -549,6 +556,8 @@ config_map_compression(const char *s, u_int *vp) *vp = COMPRESS_SNAPPY; else if (strcmp(s, "zlib") == 0) *vp = COMPRESS_ZLIB; + else if (strcmp(s, "zlib-noraw") == 0) + *vp = COMPRESS_ZLIB_NO_RAW; else die(EINVAL, "illegal compression configuration: %s", s); } diff --git a/test/format/config.h b/test/format/config.h index a32df3de95c..9852fafabf7 100644 --- a/test/format/config.h +++ b/test/format/config.h @@ -115,7 +115,8 @@ static CONFIG c[] = { C_BOOL, 10, 0, 0, &g.c_compact, NULL }, { "compression", - "type of compression (none | bzip | bzip-raw | lzo | snappy | zlib)", + "type of compression " + "(none | bzip | bzip-raw | lzo | snappy | zlib | zlib-noraw)", C_IGNORE|C_STRING, 1, 5, 5, NULL, &g.c_compression }, { "data_extend", diff --git a/test/format/format.h b/test/format/format.h index 0e45a28b3ef..1f2b363e9a4 100644 --- a/test/format/format.h +++ b/test/format/format.h @@ -205,6 +205,7 @@ typedef struct { #define COMPRESS_LZO 4 #define COMPRESS_SNAPPY 5 #define COMPRESS_ZLIB 6 +#define COMPRESS_ZLIB_NO_RAW 7 u_int c_compression_flag; /* Compression flag value */ #define ISOLATION_RANDOM 1 diff --git a/test/format/ops.c b/test/format/ops.c index 
fdf762289c4..e38b75f0deb 100644 --- a/test/format/ops.c +++ b/test/format/ops.c @@ -219,21 +219,14 @@ ops(void *arg) /* * We can't checkpoint or swap sessions/cursors while in a - * transaction, resolve any running transaction. Otherwise, - * reset the cursor: we may block waiting for a lock and there - * is no reason to keep pages pinned. + * transaction, resolve any running transaction. */ - if (cnt == ckpt_op || cnt == session_op) { - if (intxn) { - if ((ret = session->commit_transaction( - session, NULL)) != 0) - die(ret, "session.commit_transaction"); - ++tinfo->commit; - intxn = 0; - } - else if (cursor != NULL && - (ret = cursor->reset(cursor)) != 0) - die(ret, "cursor.reset"); + if (intxn && (cnt == ckpt_op || cnt == session_op)) { + if ((ret = session->commit_transaction( + session, NULL)) != 0) + die(ret, "session.commit_transaction"); + ++tinfo->commit; + intxn = 0; } /* Open up a new session and cursors. */ @@ -313,7 +306,7 @@ ops(void *arg) /* * If we're not single-threaded and we're not in a transaction, - * start a transaction 80% of the time. + * start a transaction 20% of the time. */ if (!SINGLETHREADED && !intxn && MMRAND(1, 10) >= 8) { if ((ret = @@ -371,13 +364,6 @@ ops(void *arg) if (g.append_cnt >= g.append_max) goto skip_insert; - /* - * Reset the standard cursor so it doesn't keep - * pages pinned. - */ - if ((ret = cursor->reset(cursor)) != 0) - die(ret, "cursor.reset"); - /* Insert, then reset the insert cursor. */ if (col_insert( cursor_insert, &key, &value, &keyno)) @@ -430,6 +416,10 @@ skip_insert: if (col_update(cursor, &key, &value, keyno)) if (read_row(cursor, &key, keyno)) goto deadlock; + /* Reset the cursor: there is no reason to keep pages pinned. */ + if (cursor != NULL && (ret = cursor->reset(cursor)) != 0) + die(ret, "cursor.reset"); + /* * If we're in the transaction, commit 40% of the time and * rollback 10% of the time. 
@@ -449,7 +439,8 @@ deadlock: ++tinfo->deadlock; } if ((ret = session->rollback_transaction( session, NULL)) != 0) - die(ret, "session.commit_transaction"); + die(ret, + "session.rollback_transaction"); ++tinfo->rollback; intxn = 0; break; diff --git a/test/format/wts.c b/test/format/wts.c index 1a83fa92894..e495956fd2e 100644 --- a/test/format/wts.c +++ b/test/format/wts.c @@ -272,6 +272,10 @@ wts_create(void) p += snprintf(p, (size_t)(end - p), ",block_compressor=\"zlib\""); break; + case COMPRESS_ZLIB_NO_RAW: + p += snprintf(p, (size_t)(end - p), + ",block_compressor=\"zlib-noraw\""); + break; } /* Configure Btree internal key truncation. */ diff --git a/test/suite/test_checkpoint01.py b/test/suite/test_checkpoint01.py index 153ea015cf5..ab4dbe18bd6 100644 --- a/test/suite/test_checkpoint01.py +++ b/test/suite/test_checkpoint01.py @@ -308,8 +308,8 @@ class test_checkpoint_last(wttest.WiredTigerTestCase): # Check we can't use the reserved name as an application checkpoint name. -class test_checkpoint_last_name(wttest.WiredTigerTestCase): - def test_checkpoint_last_name(self): +class test_checkpoint_illegal_name(wttest.WiredTigerTestCase): + def test_checkpoint_illegal_name(self): simple_populate(self, "file:checkpoint", 'key_format=S', 100) msg = '/the checkpoint name.*is reserved/' for conf in ( @@ -324,6 +324,12 @@ class test_checkpoint_last_name(wttest.WiredTigerTestCase): 'drop=(to=WiredTigerCheckpointX)'): self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: self.session.checkpoint(conf), msg) + msg = '/WiredTiger objects should not include grouping/' + for conf in ( + 'name=check{point', + 'name=check\\point'): + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.session.checkpoint(conf), msg) # Check we can't name checkpoints that include LSM tables. 
diff --git a/tools/stat_data.py b/tools/stat_data.py index 60319c6a846..2a7fbfbe884 100644 --- a/tools/stat_data.py +++ b/tools/stat_data.py @@ -9,6 +9,9 @@ no_scale_per_second_list = [ 'cache: pages currently held in the cache', 'files currently open', 'log: total log buffer size', + 'LSM App work units currently queued', + 'LSM Merge work units currently queued', + 'LSM Switch work units currently queued', 'split bytes currently awaiting free', 'split objects currently awaiting free', 'open cursor count',