Compare commits
37 Commits
mongodb-3.
...
mongodb-3.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cae5fcf57a | ||
|
|
2182e62919 | ||
|
|
547de638fd | ||
|
|
6bb9c16436 | ||
|
|
2cb69f635c | ||
|
|
6b51e483b4 | ||
|
|
f154aa4619 | ||
|
|
98fb64bacc | ||
|
|
b719a1bc73 | ||
|
|
1bfc894e37 | ||
|
|
969f940511 | ||
|
|
77721b97ec | ||
|
|
b5ab4a631b | ||
|
|
edf9dba6bb | ||
|
|
d72b3bd925 | ||
|
|
a1ddc5e616 | ||
|
|
deadcdaf1b | ||
|
|
27d0cbdf80 | ||
|
|
38369aebcf | ||
|
|
3051f3be9c | ||
|
|
ba0b7f26cf | ||
|
|
6feaa2812e | ||
|
|
1da2d3a517 | ||
|
|
42282959f7 | ||
|
|
0bc4f8f2dc | ||
|
|
0398515732 | ||
|
|
06a5c7b7a9 | ||
|
|
16a418b471 | ||
|
|
deb2d8109c | ||
|
|
66a111ec48 | ||
|
|
7b1398a1a6 | ||
|
|
155ad1a5f8 | ||
|
|
c819d2f9d3 | ||
|
|
00dfebc9b0 | ||
|
|
cb642366f1 | ||
|
|
01dbcf110a | ||
|
|
457da5c312 |
1
dist/flags.py
vendored
1
dist/flags.py
vendored
@@ -36,6 +36,7 @@ flags = {
|
||||
'page_read' : [
|
||||
'READ_CACHE',
|
||||
'READ_COMPACT',
|
||||
'READ_NO_EMPTY',
|
||||
'READ_NO_EVICT',
|
||||
'READ_NO_GEN',
|
||||
'READ_NO_WAIT',
|
||||
|
||||
3
dist/s_copyright
vendored
3
dist/s_copyright
vendored
@@ -1,5 +1,8 @@
|
||||
#! /bin/sh
|
||||
|
||||
# Only run when building a release
|
||||
test -z "$WT_RELEASE_BUILD" && exit 0
|
||||
|
||||
# Check the copyrights.
|
||||
|
||||
c1=__wt.1$$
|
||||
|
||||
@@ -426,7 +426,7 @@ __wt_block_manager_size(
|
||||
{
|
||||
wt_off_t filesize;
|
||||
|
||||
WT_RET(__wt_filesize_name(session, filename, &filesize));
|
||||
WT_RET(__wt_filesize_name(session, filename, false, &filesize));
|
||||
WT_STAT_SET(stats, block_size, filesize);
|
||||
|
||||
return (0);
|
||||
|
||||
@@ -214,10 +214,11 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
|
||||
|
||||
/*
|
||||
* __wt_delete_page_skip --
|
||||
* If iterating a cursor, skip deleted pages that are visible to us.
|
||||
* If iterating a cursor, skip deleted pages that are either visible to
|
||||
* us or globally visible.
|
||||
*/
|
||||
bool
|
||||
__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
|
||||
__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
|
||||
{
|
||||
bool skip;
|
||||
|
||||
@@ -245,9 +246,22 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
|
||||
if (!__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED))
|
||||
return (false);
|
||||
|
||||
skip = (ref->page_del == NULL ||
|
||||
skip = ref->page_del == NULL || (visible_all ?
|
||||
__wt_txn_visible_all(session, ref->page_del->txnid) :
|
||||
__wt_txn_visible(session, ref->page_del->txnid));
|
||||
|
||||
/*
|
||||
* The page_del structure can be freed as soon as the delete is stable:
|
||||
* it is only read when the ref state is WT_REF_DELETED. It is worth
|
||||
* checking every time we come through because once this is freed, we
|
||||
* no longer need synchronization to check the ref.
|
||||
*/
|
||||
if (skip && ref->page_del != NULL && (visible_all ||
|
||||
__wt_txn_visible_all(session, ref->page_del->txnid))) {
|
||||
__wt_free(session, ref->page_del->update_list);
|
||||
__wt_free(session, ref->page_del);
|
||||
}
|
||||
|
||||
WT_PUBLISH(ref->state, WT_REF_DELETED);
|
||||
return (skip);
|
||||
}
|
||||
|
||||
@@ -252,10 +252,7 @@ __wt_free_ref(
|
||||
}
|
||||
|
||||
/* Free any address allocation. */
|
||||
if (ref->addr != NULL && __wt_off_page(page, ref->addr)) {
|
||||
__wt_free(session, ((WT_ADDR *)ref->addr)->addr);
|
||||
__wt_free(session, ref->addr);
|
||||
}
|
||||
__wt_ref_addr_free(session, ref);
|
||||
|
||||
/* Free any page-deleted information. */
|
||||
if (ref->page_del != NULL) {
|
||||
|
||||
@@ -76,8 +76,12 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
|
||||
|
||||
for (force_attempts = 0, oldgen = false, wait_cnt = 0;;) {
|
||||
switch (ref->state) {
|
||||
case WT_REF_DISK:
|
||||
case WT_REF_DELETED:
|
||||
if (LF_ISSET(WT_READ_NO_EMPTY) &&
|
||||
__wt_delete_page_skip(session, ref, false))
|
||||
return (WT_NOTFOUND);
|
||||
/* FALLTHROUGH */
|
||||
case WT_REF_DISK:
|
||||
if (LF_ISSET(WT_READ_CACHE))
|
||||
return (WT_NOTFOUND);
|
||||
|
||||
|
||||
@@ -1299,9 +1299,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref)
|
||||
* would have been lost.) Clear the reference addr so eviction doesn't
|
||||
* free the underlying blocks.
|
||||
*/
|
||||
__wt_free(session, ((WT_ADDR *)ref->addr)->addr);
|
||||
__wt_free(session, ref->addr);
|
||||
ref->addr = NULL;
|
||||
__wt_ref_addr_free(session, ref);
|
||||
|
||||
/* Write the new version of the leaf page to disk. */
|
||||
WT_ERR(__slvg_modify_init(session, page));
|
||||
@@ -2008,9 +2006,7 @@ __slvg_row_build_leaf(
|
||||
* would have been lost.) Clear the reference addr so eviction doesn't
|
||||
* free the underlying blocks.
|
||||
*/
|
||||
__wt_free(session, ((WT_ADDR *)ref->addr)->addr);
|
||||
__wt_free(session, ref->addr);
|
||||
ref->addr = NULL;
|
||||
__wt_ref_addr_free(session, ref);
|
||||
|
||||
/* Write the new version of the leaf page to disk. */
|
||||
WT_ERR(__slvg_modify_init(session, page));
|
||||
|
||||
@@ -167,15 +167,12 @@ __split_safe_free(WT_SESSION_IMPL *session,
|
||||
* Return if we should deepen the tree.
|
||||
*/
|
||||
static bool
|
||||
__split_should_deepen(
|
||||
WT_SESSION_IMPL *session, WT_REF *ref, uint32_t *childrenp)
|
||||
__split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref)
|
||||
{
|
||||
WT_BTREE *btree;
|
||||
WT_PAGE *page;
|
||||
WT_PAGE_INDEX *pindex;
|
||||
|
||||
*childrenp = 0;
|
||||
|
||||
btree = S2BT(session);
|
||||
page = ref->page;
|
||||
pindex = WT_INTL_INDEX_GET_SAFE(page);
|
||||
@@ -193,10 +190,8 @@ __split_should_deepen(
|
||||
* we get a significant payback (in the case of a set of large keys,
|
||||
* splitting won't help).
|
||||
*/
|
||||
if (pindex->entries > btree->split_deepen_min_child) {
|
||||
*childrenp = pindex->entries / btree->split_deepen_per_child;
|
||||
if (pindex->entries > btree->split_deepen_min_child)
|
||||
return (true);
|
||||
}
|
||||
|
||||
/*
|
||||
* Don't allow a single page to put pressure on cache usage. The root
|
||||
@@ -207,10 +202,8 @@ __split_should_deepen(
|
||||
*/
|
||||
if (pindex->entries >= 100 &&
|
||||
(__wt_ref_is_root(ref) ||
|
||||
page->memory_footprint >= S2C(session)->cache_size / 4)) {
|
||||
*childrenp = pindex->entries / 10;
|
||||
page->memory_footprint >= S2C(session)->cache_size / 4))
|
||||
return (true);
|
||||
}
|
||||
|
||||
return (false);
|
||||
}
|
||||
@@ -377,8 +370,9 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
|
||||
* Split an internal page in-memory, deepening the tree.
|
||||
*/
|
||||
static int
|
||||
__split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children)
|
||||
__split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
|
||||
{
|
||||
WT_BTREE *btree;
|
||||
WT_DECL_RET;
|
||||
WT_PAGE *child;
|
||||
WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex;
|
||||
@@ -386,63 +380,91 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children)
|
||||
WT_REF *child_ref, **child_refp, *parent_ref, **parent_refp, *ref;
|
||||
size_t child_incr, parent_decr, parent_incr, size;
|
||||
uint64_t split_gen;
|
||||
uint32_t chunk, i, j, remain, slots;
|
||||
uint32_t children, chunk, i, j, moved_entries, new_entries, remain;
|
||||
uint32_t skip_leading, slots;
|
||||
bool panic;
|
||||
void *p;
|
||||
|
||||
WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen);
|
||||
WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen);
|
||||
|
||||
btree = S2BT(session);
|
||||
alloc_index = NULL;
|
||||
parent_incr = parent_decr = 0;
|
||||
panic = false;
|
||||
|
||||
pindex = WT_INTL_INDEX_GET_SAFE(parent);
|
||||
|
||||
WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen);
|
||||
WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen);
|
||||
/*
|
||||
* A prepending/appending workload will repeatedly deepen parts of the
|
||||
* tree that aren't changing, and appending workloads are not uncommon.
|
||||
* First, keep the first/last pages of the tree at their current level,
|
||||
* to catch simple workloads. Second, track the number of entries which
|
||||
* resulted from the last time we deepened this page, and if we refilled
|
||||
* this page without splitting into those slots, ignore them for this
|
||||
* split. It's not exact because an eviction might split into any part
|
||||
* of the page: if 80% of the splits are at the end of the page, assume
|
||||
* an append-style workload. Of course, the plan eventually fails: when
|
||||
* repeatedly deepening this page for an append-only workload, we will
|
||||
* progressively ignore more and more of the slots. When ignoring 90% of
|
||||
* the slots, deepen the entire page again.
|
||||
*
|
||||
* Figure out how many slots we're leaving at this level and how many
|
||||
* child pages we're creating.
|
||||
*/
|
||||
#undef skip_trailing
|
||||
#define skip_trailing 1
|
||||
skip_leading = 1;
|
||||
new_entries = pindex->entries - parent->pg_intl_deepen_split_last;
|
||||
if (parent->pg_intl_deepen_split_append > (new_entries * 8) / 10)
|
||||
skip_leading = parent->pg_intl_deepen_split_last;
|
||||
if (skip_leading > (pindex->entries * 9) * 10)
|
||||
skip_leading = 1;
|
||||
|
||||
/*
|
||||
* In a few (rare) cases we split pages with only a few entries, and in
|
||||
* those cases we keep it simple, 10 children, skip only first and last
|
||||
* entries. Otherwise, split into a lot of child pages.
|
||||
*/
|
||||
moved_entries = pindex->entries - (skip_leading + skip_trailing);
|
||||
children = moved_entries / btree->split_deepen_per_child;
|
||||
if (children < 10) {
|
||||
children = 10;
|
||||
skip_leading = 1;
|
||||
moved_entries =
|
||||
pindex->entries - (skip_leading + skip_trailing);
|
||||
}
|
||||
|
||||
WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
|
||||
"%p: %" PRIu32 " elements, splitting into %" PRIu32 " children",
|
||||
parent, pindex->entries, children));
|
||||
|
||||
/*
|
||||
* If the workload is prepending/appending to the tree, we could deepen
|
||||
* without bound. Don't let that happen, keep the first/last pages of
|
||||
* the tree at their current level.
|
||||
*
|
||||
* XXX
|
||||
* To improve this, we could track which pages were last merged into
|
||||
* this page by eviction, and leave those pages alone, to prevent any
|
||||
* sustained insert into the tree from deepening a single location.
|
||||
*/
|
||||
#undef SPLIT_CORRECT_1
|
||||
#define SPLIT_CORRECT_1 1 /* First page correction */
|
||||
#undef SPLIT_CORRECT_2
|
||||
#define SPLIT_CORRECT_2 2 /* First/last page correction */
|
||||
|
||||
/*
|
||||
* Allocate a new WT_PAGE_INDEX and set of WT_REF objects. Initialize
|
||||
* the first/last slots of the allocated WT_PAGE_INDEX to point to the
|
||||
* first/last pages we're keeping at the current level, and the rest of
|
||||
* the slots to point to new WT_REF objects.
|
||||
* Allocate a new WT_PAGE_INDEX and set of WT_REF objects. Initialize
|
||||
* the slots of the allocated WT_PAGE_INDEX to point to the pages we're
|
||||
* keeping at the current level, and the rest of the slots to point to
|
||||
* new WT_REF objects.
|
||||
*/
|
||||
size = sizeof(WT_PAGE_INDEX) +
|
||||
(children + SPLIT_CORRECT_2) * sizeof(WT_REF *);
|
||||
(children + skip_leading + skip_trailing) * sizeof(WT_REF *);
|
||||
WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
|
||||
parent_incr += size;
|
||||
alloc_index->index = (WT_REF **)(alloc_index + 1);
|
||||
alloc_index->entries = children + SPLIT_CORRECT_2;
|
||||
alloc_index->index[0] = pindex->index[0];
|
||||
alloc_index->entries = children + skip_leading + skip_trailing;
|
||||
for (alloc_refp = alloc_index->index,
|
||||
i = 0; i < skip_leading; ++alloc_refp, ++i)
|
||||
alloc_index->index[i] = pindex->index[i];
|
||||
for (i = 0; i < children; ++alloc_refp, ++i)
|
||||
WT_ERR(__wt_calloc_one(session, alloc_refp));
|
||||
parent_incr += children * sizeof(WT_REF);
|
||||
alloc_index->index[alloc_index->entries - 1] =
|
||||
pindex->index[pindex->entries - 1];
|
||||
for (alloc_refp = alloc_index->index + SPLIT_CORRECT_1,
|
||||
i = 0; i < children; ++alloc_refp, ++i) {
|
||||
WT_ERR(__wt_calloc_one(session, alloc_refp));
|
||||
parent_incr += sizeof(WT_REF);
|
||||
}
|
||||
|
||||
/* Allocate child pages, and connect them into the new page index. */
|
||||
chunk = (pindex->entries - SPLIT_CORRECT_2) / children;
|
||||
remain = (pindex->entries - SPLIT_CORRECT_2) - chunk * (children - 1);
|
||||
for (parent_refp = pindex->index + SPLIT_CORRECT_1,
|
||||
alloc_refp = alloc_index->index + SPLIT_CORRECT_1,
|
||||
chunk = moved_entries / children;
|
||||
remain = moved_entries - chunk * (children - 1);
|
||||
for (parent_refp = pindex->index + skip_leading,
|
||||
alloc_refp = alloc_index->index + skip_leading,
|
||||
i = 0; i < children; ++i) {
|
||||
slots = i == children - 1 ? remain : chunk;
|
||||
WT_ERR(__wt_page_alloc(
|
||||
@@ -480,7 +502,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children)
|
||||
* array, a thread might see a freed WT_REF. Set the eviction
|
||||
* transaction requirement for the newly created internal pages.
|
||||
*/
|
||||
child->modify->mod_split_txn = __wt_txn_new_id(session);
|
||||
child->modify->mod_split_txn = __wt_txn_id_alloc(session, false);
|
||||
|
||||
/*
|
||||
* The newly allocated child's page index references the same
|
||||
@@ -500,10 +522,11 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children)
|
||||
}
|
||||
__wt_cache_page_inmem_incr(session, child, child_incr);
|
||||
}
|
||||
WT_ASSERT(session, alloc_refp -
|
||||
alloc_index->index == alloc_index->entries - SPLIT_CORRECT_1);
|
||||
WT_ASSERT(session,
|
||||
parent_refp - pindex->index == pindex->entries - SPLIT_CORRECT_1);
|
||||
alloc_refp - alloc_index->index ==
|
||||
alloc_index->entries - skip_trailing);
|
||||
WT_ASSERT(session,
|
||||
parent_refp - pindex->index == pindex->entries - skip_trailing);
|
||||
|
||||
/*
|
||||
* Update the parent's index; this is the update which splits the page,
|
||||
@@ -527,6 +550,12 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children)
|
||||
#ifdef HAVE_DIAGNOSTIC
|
||||
__split_verify_intl_key_order(session, parent);
|
||||
#endif
|
||||
/*
|
||||
* Save the number of entries created by deepening the tree and reset
|
||||
* the count of splits into this page after that point.
|
||||
*/
|
||||
parent->pg_intl_deepen_split_append = 0;
|
||||
parent->pg_intl_deepen_split_last = alloc_index->entries;
|
||||
|
||||
/*
|
||||
* The moved reference structures now reference the wrong parent page,
|
||||
@@ -889,8 +918,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
|
||||
WT_REF **alloc_refp, *next_ref, *parent_ref;
|
||||
size_t parent_decr, size;
|
||||
uint64_t split_gen;
|
||||
uint32_t i, j;
|
||||
uint32_t children, deleted_entries, parent_entries, result_entries;
|
||||
uint32_t deleted_entries, i, j, parent_entries, result_entries;
|
||||
bool complete;
|
||||
|
||||
parent = ref->home;
|
||||
@@ -915,22 +943,32 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
|
||||
* reading thread will restart. Include the ref we are splitting in
|
||||
* the count to be deleted.
|
||||
*/
|
||||
for (i = 0, deleted_entries = 1; i < parent_entries; ++i) {
|
||||
for (deleted_entries = 1, i = 0; i < parent_entries; ++i) {
|
||||
next_ref = pindex->index[i];
|
||||
WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
|
||||
if (next_ref->state == WT_REF_DELETED &&
|
||||
__wt_delete_page_skip(session, next_ref) &&
|
||||
__wt_delete_page_skip(session, next_ref, true) &&
|
||||
__wt_atomic_casv32(
|
||||
&next_ref->state, WT_REF_DELETED, WT_REF_SPLIT))
|
||||
deleted_entries++;
|
||||
}
|
||||
|
||||
/*
|
||||
* The final entry count consists of: The original count, plus any
|
||||
* new pages, less any refs we are removing.
|
||||
* The final entry count consists of the original count, plus any new
|
||||
* pages, less any WT_REFs we're removing.
|
||||
*/
|
||||
result_entries = (parent_entries + new_entries) - deleted_entries;
|
||||
|
||||
/*
|
||||
* If the entire (sub)tree is empty, give up: we can't leave an empty
|
||||
* internal page. Mark it to be evicted soon and clean up any
|
||||
* references that have changed state.
|
||||
*/
|
||||
if (result_entries == 0) {
|
||||
__wt_page_evict_soon(parent);
|
||||
goto err;
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate and initialize a new page index array for the parent, then
|
||||
* copy references from the original index array, plus references from
|
||||
@@ -943,7 +981,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
|
||||
alloc_index->entries = result_entries;
|
||||
for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) {
|
||||
next_ref = pindex->index[i];
|
||||
if (next_ref == ref)
|
||||
if (next_ref == ref) {
|
||||
for (j = 0; j < new_entries; ++j) {
|
||||
ref_new[j]->home = parent;
|
||||
*alloc_refp++ = ref_new[j];
|
||||
@@ -955,11 +993,29 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
|
||||
*/
|
||||
ref_new[j] = NULL;
|
||||
}
|
||||
else if (next_ref->state != WT_REF_SPLIT)
|
||||
|
||||
/*
|
||||
* We detect append-style workloads to avoid repeatedly
|
||||
* deepening parts of the tree where no work is being
|
||||
* done by tracking if we're splitting after the slots
|
||||
* created by the last split to deepen this parent.
|
||||
*
|
||||
* Note the calculation: i is a 0-based array offset and
|
||||
* split-last is a count of entries, also either or both
|
||||
* i and split-last might be unsigned 0, don't decrement
|
||||
* either one.
|
||||
*/
|
||||
if (i > parent->pg_intl_deepen_split_last)
|
||||
parent->
|
||||
pg_intl_deepen_split_append += new_entries;
|
||||
} else if (next_ref->state != WT_REF_SPLIT)
|
||||
/* Skip refs we have marked for deletion. */
|
||||
*alloc_refp++ = next_ref;
|
||||
}
|
||||
|
||||
/* Check that we filled in all the entries. */
|
||||
WT_ASSERT(session, alloc_refp - alloc_index->index == result_entries);
|
||||
|
||||
/*
|
||||
* Update the parent page's index: this update makes the split visible
|
||||
* to threads descending the tree.
|
||||
@@ -973,6 +1029,16 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
|
||||
__split_verify_intl_key_order(session, parent);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Page-delete information is only read when the WT_REF state is
|
||||
* WT_REF_DELETED. The page-delete memory wasn't added to the
|
||||
* parent's footprint, ignore it here.
|
||||
*/
|
||||
if (ref->page_del != NULL) {
|
||||
__wt_free(session, ref->page_del->update_list);
|
||||
__wt_free(session, ref->page_del);
|
||||
}
|
||||
|
||||
/*
|
||||
* Reset the page's original WT_REF field to split. Threads cursoring
|
||||
* through the tree were blocked because that WT_REF state was set to
|
||||
@@ -995,9 +1061,9 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
|
||||
|
||||
WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
|
||||
"%s split into parent %" PRIu32 " -> %" PRIu32
|
||||
" (%" PRIu32 ")",
|
||||
__wt_page_type_string(ref->page->type), parent_entries,
|
||||
result_entries, result_entries - parent_entries));
|
||||
" (%" PRIu32 ")", ref->page == NULL ?
|
||||
"reverse" : __wt_page_type_string(ref->page->type),
|
||||
parent_entries, result_entries, result_entries - parent_entries));
|
||||
|
||||
/*
|
||||
* The new page index is in place, free the WT_REF we were splitting
|
||||
@@ -1034,19 +1100,15 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
|
||||
session, split_gen, 0, ikey, size));
|
||||
parent_decr += size;
|
||||
}
|
||||
/*
|
||||
* The page_del structure can be freed immediately: it
|
||||
* is only read when the ref state is WT_REF_DELETED.
|
||||
* The size of the structure wasn't added to the parent,
|
||||
* don't decrement.
|
||||
*/
|
||||
if (next_ref->page_del != NULL) {
|
||||
__wt_free(session,
|
||||
next_ref->page_del->update_list);
|
||||
__wt_free(session, next_ref->page_del);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* If this page was fast-truncated, any attached structure
|
||||
* should have been freed before now.
|
||||
*/
|
||||
WT_ASSERT(session, next_ref->page_del == NULL);
|
||||
|
||||
WT_TRET(__wt_ref_block_free(session, next_ref));
|
||||
WT_TRET(__split_safe_free(
|
||||
session, split_gen, 0, next_ref, sizeof(WT_REF)));
|
||||
parent_decr += sizeof(WT_REF);
|
||||
@@ -1086,17 +1148,24 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
|
||||
* are holding it locked.
|
||||
*/
|
||||
if (ret == 0 && !closing &&
|
||||
__split_should_deepen(session, parent_ref, &children))
|
||||
ret = __split_deepen(session, parent, children);
|
||||
__split_should_deepen(session, parent_ref))
|
||||
ret = __split_deepen(session, parent);
|
||||
|
||||
err: if (!complete)
|
||||
err: if (!complete) {
|
||||
for (i = 0; i < parent_entries; ++i) {
|
||||
next_ref = pindex->index[i];
|
||||
if (next_ref->state == WT_REF_SPLIT)
|
||||
next_ref->state = WT_REF_DELETED;
|
||||
}
|
||||
|
||||
__wt_free_ref_index(session, NULL, alloc_index, false);
|
||||
/* If we gave up on a reverse split, unlock the child. */
|
||||
if (ref_new == NULL) {
|
||||
WT_ASSERT(session, ref->state == WT_REF_LOCKED);
|
||||
ref->state = WT_REF_DELETED;
|
||||
}
|
||||
|
||||
__wt_free_ref_index(session, NULL, alloc_index, false);
|
||||
}
|
||||
|
||||
/*
|
||||
* A note on error handling: if we completed the split, return success,
|
||||
@@ -1150,21 +1219,30 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
|
||||
* The first page in the split is the current page, but we still have
|
||||
* to create a replacement WT_REF, the original WT_REF will be set to
|
||||
* split status and eventually freed.
|
||||
*/
|
||||
WT_ERR(__wt_calloc_one(session, &split_ref[0]));
|
||||
child = split_ref[0];
|
||||
*child = *ref;
|
||||
|
||||
/*
|
||||
*
|
||||
* The new WT_REF is not quite identical: we have to instantiate a key,
|
||||
* and the new reference is visible to readers once the split completes.
|
||||
*
|
||||
* The key-instantiation code checks for races, clear the key fields so
|
||||
* we don't trigger them.
|
||||
* The key-instantiation code checks for races, leave the key fields
|
||||
* zeroed so we don't trigger them.
|
||||
*
|
||||
* Don't copy any deleted page state: we may be splitting a page that
|
||||
* was instantiated after a truncate and that history should not be
|
||||
* carried onto these new child pages.
|
||||
*/
|
||||
child->key.recno = 0;
|
||||
child->key.ikey = NULL;
|
||||
WT_ERR(__wt_calloc_one(session, &split_ref[0]));
|
||||
child = split_ref[0];
|
||||
child->page = ref->page;
|
||||
child->home = ref->home;
|
||||
child->pindex_hint = ref->pindex_hint;
|
||||
child->state = WT_REF_MEM;
|
||||
child->addr = ref->addr;
|
||||
|
||||
/*
|
||||
* The address has moved to the replacement WT_REF. Make sure it isn't
|
||||
* freed when the original ref is discarded.
|
||||
*/
|
||||
ref->addr = NULL;
|
||||
|
||||
/*
|
||||
* Copy the first key from the original page into first ref in the new
|
||||
@@ -1320,7 +1398,7 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
|
||||
* threads will not try to forcibly evict the page again until
|
||||
* all concurrent transactions commit.
|
||||
*/
|
||||
page->modify->inmem_split_txn = __wt_txn_new_id(session);
|
||||
page->modify->inmem_split_txn = __wt_txn_id_alloc(session, false);
|
||||
|
||||
/*
|
||||
* Update the page accounting.
|
||||
@@ -1366,6 +1444,11 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
|
||||
return (0);
|
||||
|
||||
err: if (split_ref[0] != NULL) {
|
||||
/*
|
||||
* The address was moved to the replacement WT_REF, restore it.
|
||||
*/
|
||||
ref->addr = split_ref[0]->addr;
|
||||
|
||||
__wt_free(session, split_ref[0]->key.ikey);
|
||||
__wt_free(session, split_ref[0]);
|
||||
}
|
||||
@@ -1396,6 +1479,24 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
|
||||
return (ret);
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_split_reverse --
|
||||
* We have a locked ref that is empty and we want to rewrite the index in
|
||||
* its parent.
|
||||
*/
|
||||
int
|
||||
__wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
|
||||
{
|
||||
WT_DECL_RET;
|
||||
WT_PAGE *parent;
|
||||
bool hazard;
|
||||
|
||||
WT_RET(__split_parent_lock(session, ref, &parent, &hazard));
|
||||
ret = __split_parent(session, ref, NULL, 0, 0, 0);
|
||||
WT_TRET(__split_parent_unlock(session, parent, hazard));
|
||||
return (ret);
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_split_rewrite --
|
||||
* Rewrite an in-memory page with a new version.
|
||||
|
||||
@@ -22,16 +22,17 @@ __sync_file(WT_SESSION_IMPL *session, int syncop)
|
||||
WT_PAGE_MODIFY *mod;
|
||||
WT_REF *walk;
|
||||
WT_TXN *txn;
|
||||
uint64_t internal_bytes, leaf_bytes;
|
||||
uint64_t internal_pages, leaf_pages;
|
||||
uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages;
|
||||
uint64_t saved_snap_min;
|
||||
uint32_t flags;
|
||||
bool evict_reset;
|
||||
|
||||
btree = S2BT(session);
|
||||
|
||||
flags = WT_READ_CACHE | WT_READ_NO_GEN;
|
||||
walk = NULL;
|
||||
txn = &session->txn;
|
||||
saved_snap_min = WT_SESSION_TXN_STATE(session)->snap_min;
|
||||
flags = WT_READ_CACHE | WT_READ_NO_GEN;
|
||||
|
||||
internal_bytes = leaf_bytes = 0;
|
||||
internal_pages = leaf_pages = 0;
|
||||
@@ -79,6 +80,19 @@ __sync_file(WT_SESSION_IMPL *session, int syncop)
|
||||
}
|
||||
break;
|
||||
case WT_SYNC_CHECKPOINT:
|
||||
/*
|
||||
* If we are flushing a file at read-committed isolation, which
|
||||
* is of particular interest for flushing the metadata to make
|
||||
* schema-changing operations durable, get a transactional
|
||||
* snapshot now.
|
||||
*
|
||||
* All changes committed up to this point should be included.
|
||||
* We don't update the snapshot in between pages because (a)
|
||||
* the metadata shouldn't be that big, and (b) if we do ever
|
||||
*/
|
||||
if (txn->isolation == WT_ISO_READ_COMMITTED)
|
||||
__wt_txn_get_snapshot(session);
|
||||
|
||||
/*
|
||||
* We cannot check the tree modified flag in the case of a
|
||||
* checkpoint, the checkpoint code has already cleared it.
|
||||
@@ -185,7 +199,12 @@ err: /* On error, clear any left-over tree walk. */
|
||||
if (walk != NULL)
|
||||
WT_TRET(__wt_page_release(session, walk, flags));
|
||||
|
||||
if (txn->isolation == WT_ISO_READ_COMMITTED && session->ncursors == 0)
|
||||
/*
|
||||
* If we got a snapshot in order to write pages, and there was no
|
||||
* snapshot active when we started, release it.
|
||||
*/
|
||||
if (txn->isolation == WT_ISO_READ_COMMITTED &&
|
||||
saved_snap_min == WT_TXN_NONE)
|
||||
__wt_txn_release_snapshot(session);
|
||||
|
||||
if (btree->checkpointing) {
|
||||
|
||||
@@ -94,6 +94,9 @@ __wt_tree_walk(WT_SESSION_IMPL *session,
|
||||
*/
|
||||
WT_ENTER_PAGE_INDEX(session);
|
||||
|
||||
/* Walk should never instantiate deleted pages. */
|
||||
LF_SET(WT_READ_NO_EMPTY);
|
||||
|
||||
/*
|
||||
* !!!
|
||||
* Fast-truncate currently only works on row-store trees.
|
||||
@@ -174,9 +177,10 @@ ascend: /*
|
||||
|
||||
/*
|
||||
* If we got all the way through an internal page and
|
||||
* all of the child pages were deleted, evict it.
|
||||
* all of the child pages were deleted, mark it for
|
||||
* eviction.
|
||||
*/
|
||||
if (empty_internal) {
|
||||
if (empty_internal && pindex->entries > 1) {
|
||||
__wt_page_evict_soon(ref->page);
|
||||
empty_internal = false;
|
||||
}
|
||||
@@ -240,7 +244,8 @@ ascend: /*
|
||||
* If we see any child states other than deleted, the
|
||||
* page isn't empty.
|
||||
*/
|
||||
if (ref->state != WT_REF_DELETED)
|
||||
if (ref->state != WT_REF_DELETED &&
|
||||
!LF_ISSET(WT_READ_TRUNCATE))
|
||||
empty_internal = false;
|
||||
|
||||
if (LF_ISSET(WT_READ_CACHE)) {
|
||||
@@ -257,7 +262,7 @@ ascend: /*
|
||||
* to delete it again.
|
||||
*/
|
||||
if (ref->state == WT_REF_DELETED &&
|
||||
__wt_delete_page_skip(session, ref))
|
||||
__wt_delete_page_skip(session, ref, false))
|
||||
break;
|
||||
/*
|
||||
* If deleting a range, try to delete the page
|
||||
@@ -266,6 +271,7 @@ ascend: /*
|
||||
WT_ERR(__wt_delete_page(session, ref, &skip));
|
||||
if (skip)
|
||||
break;
|
||||
empty_internal = false;
|
||||
} else if (LF_ISSET(WT_READ_COMPACT)) {
|
||||
/*
|
||||
* Skip deleted pages, rewriting them doesn't
|
||||
@@ -294,7 +300,7 @@ ascend: /*
|
||||
* Try to skip deleted pages visible to us.
|
||||
*/
|
||||
if (ref->state == WT_REF_DELETED &&
|
||||
__wt_delete_page_skip(session, ref))
|
||||
__wt_delete_page_skip(session, ref, false))
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -302,7 +308,7 @@ ascend: /*
|
||||
|
||||
/*
|
||||
* Not-found is an expected return when only walking
|
||||
* in-cache pages.
|
||||
* in-cache pages, or if we see a deleted page.
|
||||
*/
|
||||
if (ret == WT_NOTFOUND) {
|
||||
ret = 0;
|
||||
|
||||
@@ -735,11 +735,16 @@ __wt_config_gets_def(WT_SESSION_IMPL *session,
|
||||
|
||||
*value = false_value;
|
||||
value->val = def;
|
||||
|
||||
if (cfg == NULL || cfg[0] == NULL || cfg[1] == NULL)
|
||||
return (0);
|
||||
else if (cfg[2] == NULL)
|
||||
|
||||
if (cfg[2] == NULL) {
|
||||
WT_RET_NOTFOUND_OK(
|
||||
__wt_config_getones(session, cfg[1], key, value));
|
||||
return (0);
|
||||
}
|
||||
|
||||
return (__wt_config_gets(session, cfg, key, value));
|
||||
}
|
||||
|
||||
|
||||
@@ -122,6 +122,9 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
|
||||
/* Close open data handles. */
|
||||
WT_TRET(__wt_conn_dhandle_discard(session));
|
||||
|
||||
/* Shut down metadata tracking. */
|
||||
WT_TRET(__wt_meta_track_destroy(session));
|
||||
|
||||
/*
|
||||
* Now that all data handles are closed, tell logging that a checkpoint
|
||||
* has completed then shut down the log manager (only after closing
|
||||
@@ -254,6 +257,9 @@ __wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[])
|
||||
*/
|
||||
WT_RET(__wt_logmgr_open(session));
|
||||
|
||||
/* Initialize metadata tracking, required before creating tables. */
|
||||
WT_RET(__wt_meta_track_init(session));
|
||||
|
||||
/* Start the optional checkpoint thread. */
|
||||
WT_RET(__wt_checkpoint_server_create(session, cfg));
|
||||
|
||||
|
||||
@@ -81,7 +81,8 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop)
|
||||
case WT_SYNC_DISCARD:
|
||||
WT_ASSERT(session,
|
||||
__wt_page_can_evict(session, page, 0, NULL));
|
||||
__wt_evict_page_clean_update(session, ref);
|
||||
WT_ERR(
|
||||
__wt_evict_page_clean_update(session, ref, true));
|
||||
break;
|
||||
case WT_SYNC_DISCARD_FORCE:
|
||||
/*
|
||||
@@ -97,8 +98,9 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop)
|
||||
}
|
||||
|
||||
F_SET(session, WT_SESSION_DISCARD_FORCE);
|
||||
__wt_evict_page_clean_update(session, ref);
|
||||
ret = __wt_evict_page_clean_update(session, ref, true);
|
||||
F_CLR(session, WT_SESSION_DISCARD_FORCE);
|
||||
WT_ERR(ret);
|
||||
break;
|
||||
WT_ILLEGAL_VALUE_ERR(session);
|
||||
}
|
||||
|
||||
@@ -36,6 +36,10 @@ __evict_read_gen(const WT_EVICT_ENTRY *entry)
|
||||
|
||||
page = entry->ref->page;
|
||||
|
||||
/* Any page set to the oldest generation should be discarded. */
|
||||
if (page->read_gen == WT_READGEN_OLDEST)
|
||||
return (WT_READGEN_OLDEST);
|
||||
|
||||
/* Any empty page (leaf or internal) is a good choice. */
|
||||
if (__wt_page_is_empty(page))
|
||||
return (WT_READGEN_OLDEST);
|
||||
@@ -1221,6 +1225,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags)
|
||||
* eviction, skip anything that isn't marked.
|
||||
*/
|
||||
if (LF_ISSET(WT_EVICT_PASS_WOULD_BLOCK) &&
|
||||
page->memory_footprint < btree->maxmempage &&
|
||||
page->read_gen != WT_READGEN_OLDEST)
|
||||
continue;
|
||||
|
||||
|
||||
@@ -110,7 +110,8 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
|
||||
if (__wt_ref_is_root(ref))
|
||||
__wt_ref_out(session, ref);
|
||||
else
|
||||
__wt_evict_page_clean_update(session, ref);
|
||||
WT_ERR(__wt_evict_page_clean_update(
|
||||
session, ref, closing));
|
||||
|
||||
WT_STAT_FAST_CONN_INCR(session, cache_eviction_clean);
|
||||
WT_STAT_FAST_DATA_INCR(session, cache_eviction_clean);
|
||||
@@ -141,22 +142,75 @@ done: if ((inmem_split || (forced_eviction && ret == EBUSY)) &&
|
||||
|
||||
return (ret);
|
||||
}
|
||||
/*
|
||||
* __evict_delete_ref --
|
||||
* Mark a page reference deleted and check if the parent can reverse
|
||||
* split.
|
||||
*/
|
||||
static int
|
||||
__evict_delete_ref(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
|
||||
{
|
||||
WT_DECL_RET;
|
||||
WT_PAGE *parent;
|
||||
WT_PAGE_INDEX *pindex;
|
||||
uint32_t ndeleted;
|
||||
|
||||
if (__wt_ref_is_root(ref))
|
||||
return (0);
|
||||
|
||||
/*
|
||||
* Avoid doing reverse splits when closing the file, it is
|
||||
* wasted work and some structure may already have been freed.
|
||||
*/
|
||||
if (!closing) {
|
||||
parent = ref->home;
|
||||
WT_INTL_INDEX_GET(session, parent, pindex);
|
||||
ndeleted = __wt_atomic_addv32(&pindex->deleted_entries, 1);
|
||||
|
||||
/*
|
||||
* If more than 10% of the parent references are deleted, try a
|
||||
* reverse split. Don't bother if there is a single deleted
|
||||
* reference: the internal page is empty and we have to wait
|
||||
* for eviction to notice.
|
||||
*
|
||||
* This will consume the deleted ref (and eventually free it).
|
||||
* If the reverse split can't get the access it needs because
|
||||
* something is busy, be sure that the page still ends up
|
||||
* marked deleted.
|
||||
*/
|
||||
if (ndeleted > pindex->entries / 10 && pindex->entries > 1 &&
|
||||
(ret = __wt_split_reverse(session, ref)) != EBUSY)
|
||||
return (ret);
|
||||
}
|
||||
|
||||
WT_PUBLISH(ref->state, WT_REF_DELETED);
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_evict_page_clean_update --
|
||||
* Update a clean page's reference on eviction.
|
||||
*/
|
||||
void
|
||||
__wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref)
|
||||
int
|
||||
__wt_evict_page_clean_update(
|
||||
WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
|
||||
{
|
||||
WT_DECL_RET;
|
||||
|
||||
/*
|
||||
* Discard the page and update the reference structure; if the page has
|
||||
* an address, it's a disk page; if it has no address, it's a deleted
|
||||
* page re-instantiated (for example, by searching) and never written.
|
||||
*/
|
||||
__wt_ref_out(session, ref);
|
||||
WT_PUBLISH(ref->state,
|
||||
ref->addr == NULL ? WT_REF_DELETED : WT_REF_DISK);
|
||||
if (ref->addr == NULL) {
|
||||
WT_WITH_PAGE_INDEX(session,
|
||||
ret = __evict_delete_ref(session, ref, closing));
|
||||
WT_RET_BUSY_OK(ret);
|
||||
} else
|
||||
WT_PUBLISH(ref->state, WT_REF_DISK);
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -167,19 +221,15 @@ static int
|
||||
__evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
|
||||
{
|
||||
WT_ADDR *addr;
|
||||
WT_PAGE *parent;
|
||||
WT_DECL_RET;
|
||||
WT_PAGE_MODIFY *mod;
|
||||
|
||||
parent = ref->home;
|
||||
mod = ref->page->modify;
|
||||
|
||||
WT_ASSERT(session, ref->addr == NULL);
|
||||
|
||||
switch (mod->rec_result) {
|
||||
case WT_PM_REC_EMPTY: /* Page is empty */
|
||||
if (ref->addr != NULL && __wt_off_page(parent, ref->addr)) {
|
||||
__wt_free(session, ((WT_ADDR *)ref->addr)->addr);
|
||||
__wt_free(session, ref->addr);
|
||||
}
|
||||
|
||||
/*
|
||||
* Update the parent to reference a deleted page. The fact that
|
||||
* reconciliation left the page "empty" means there's no older
|
||||
@@ -194,7 +244,9 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
|
||||
*/
|
||||
__wt_ref_out(session, ref);
|
||||
ref->addr = NULL;
|
||||
WT_PUBLISH(ref->state, WT_REF_DELETED);
|
||||
WT_WITH_PAGE_INDEX(session,
|
||||
ret = __evict_delete_ref(session, ref, closing));
|
||||
WT_RET_BUSY_OK(ret);
|
||||
break;
|
||||
case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
|
||||
/*
|
||||
@@ -204,11 +256,6 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
|
||||
WT_RET(__wt_split_multi(session, ref, closing));
|
||||
break;
|
||||
case WT_PM_REC_REPLACE: /* 1-for-1 page swap */
|
||||
if (ref->addr != NULL && __wt_off_page(parent, ref->addr)) {
|
||||
__wt_free(session, ((WT_ADDR *)ref->addr)->addr);
|
||||
__wt_free(session, ref->addr);
|
||||
}
|
||||
|
||||
/*
|
||||
* Update the parent to reference the replacement page.
|
||||
*
|
||||
|
||||
@@ -375,9 +375,8 @@ struct __wt_page {
|
||||
/*
|
||||
* Internal pages (both column- and row-store).
|
||||
*
|
||||
* The page record number is only used by column-store, but it
|
||||
* makes some things simpler and it doesn't cost us any memory,
|
||||
* other structures in this union are still as large.
|
||||
* The page record number is only used by column-store, but it's
|
||||
* simpler having only one kind of internal page.
|
||||
*
|
||||
* In-memory internal pages have an array of pointers to child
|
||||
* structures, maintained in collated order. When a page is
|
||||
@@ -409,12 +408,27 @@ struct __wt_page {
|
||||
|
||||
struct __wt_page_index {
|
||||
uint32_t entries;
|
||||
uint32_t deleted_entries;
|
||||
WT_REF **index;
|
||||
} * volatile __index; /* Collated children */
|
||||
|
||||
/*
|
||||
* When splitting to deepen the tree, track the number
|
||||
* of entries in the newly created parent, and how many
|
||||
* subsequent splits follow the initial set of entries.
|
||||
* If future splits into the page are generally after
|
||||
* the initial set of items, perform future deepening
|
||||
* splits in this page to optimize for an append-style
|
||||
* workload.
|
||||
*/
|
||||
uint32_t deepen_split_append;
|
||||
uint32_t deepen_split_last;
|
||||
} intl;
|
||||
#undef pg_intl_recno
|
||||
#define pg_intl_recno u.intl.recno
|
||||
#define pg_intl_parent_ref u.intl.parent_ref
|
||||
#define pg_intl_deepen_split_append u.intl.deepen_split_append
|
||||
#define pg_intl_deepen_split_last u.intl.deepen_split_last
|
||||
|
||||
/*
|
||||
* Macros to copy/set the index because the name is obscured to ensure
|
||||
|
||||
@@ -946,7 +946,8 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
|
||||
WT_BTREE *btree;
|
||||
WT_INSERT_HEAD *ins_head;
|
||||
WT_INSERT *ins;
|
||||
int i;
|
||||
size_t size;
|
||||
int count;
|
||||
|
||||
btree = S2BT(session);
|
||||
|
||||
@@ -976,25 +977,91 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
|
||||
return (false);
|
||||
|
||||
/*
|
||||
* There is no point splitting if the list is small, no deep items is
|
||||
* our heuristic for that. A 1/4 probability of adding a new skiplist
|
||||
* level, with level-0 always created, means there will be a 5th level
|
||||
* entry for roughly every 1024 entries in the list. If there are at
|
||||
* least 4 5th level entries (4K items), the list is large enough.
|
||||
* There is no point doing an in-memory split unless there is a lot of
|
||||
* data in the last skiplist on the page. Split if there are enough
|
||||
* items and the skiplist does not fit within a single disk page.
|
||||
*
|
||||
* Rather than scanning the whole list, walk a higher level, which
|
||||
* gives a sample of the items -- at level 0 we have all the items, at
|
||||
* level 1 we have 1/4 and at level 2 we have 1/16th. If we see more
|
||||
* than 30 items and more data than would fit in a disk page, split.
|
||||
*/
|
||||
#define WT_MIN_SPLIT_SKIPLIST_DEPTH WT_MIN(5, WT_SKIP_MAXDEPTH - 1)
|
||||
#define WT_MIN_SPLIT_DEPTH 2
|
||||
#define WT_MIN_SPLIT_COUNT 30
|
||||
#define WT_MIN_SPLIT_MULTIPLIER 16 /* At level 2, we see 1/16th entries */
|
||||
|
||||
ins_head = page->pg_row_entries == 0 ?
|
||||
WT_ROW_INSERT_SMALLEST(page) :
|
||||
WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1);
|
||||
if (ins_head == NULL)
|
||||
return (false);
|
||||
for (i = 0, ins = ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH];
|
||||
ins != NULL; ins = ins->next[WT_MIN_SPLIT_SKIPLIST_DEPTH])
|
||||
if (++i == 4)
|
||||
for (count = 0, size = 0, ins = ins_head->head[WT_MIN_SPLIT_DEPTH];
|
||||
ins != NULL; ins = ins->next[WT_MIN_SPLIT_DEPTH]) {
|
||||
count += WT_MIN_SPLIT_MULTIPLIER;
|
||||
size += WT_MIN_SPLIT_MULTIPLIER *
|
||||
(WT_INSERT_KEY_SIZE(ins) + WT_UPDATE_MEMSIZE(ins->upd));
|
||||
if (count > WT_MIN_SPLIT_COUNT &&
|
||||
size > (size_t)btree->maxleafpage)
|
||||
return (true);
|
||||
}
|
||||
return (false);
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_ref_addr_free --
|
||||
* Free the address in a reference, if necessary.
|
||||
*/
|
||||
static inline void
|
||||
__wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref)
|
||||
{
|
||||
if (ref->addr == NULL)
|
||||
return;
|
||||
|
||||
if (ref->home == NULL || __wt_off_page(ref->home, ref->addr)) {
|
||||
__wt_free(session, ((WT_ADDR *)ref->addr)->addr);
|
||||
__wt_free(session, ref->addr);
|
||||
}
|
||||
ref->addr = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_btree_block_free --
|
||||
* Helper function to free a block from the current tree.
|
||||
*/
|
||||
static inline int
|
||||
__wt_btree_block_free(
|
||||
WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
|
||||
{
|
||||
WT_BM *bm;
|
||||
WT_BTREE *btree;
|
||||
|
||||
btree = S2BT(session);
|
||||
bm = btree->bm;
|
||||
|
||||
return (bm->free(bm, session, addr, addr_size));
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_ref_block_free --
|
||||
* Free the on-disk block for a reference and clear the address.
|
||||
*/
|
||||
static inline int
|
||||
__wt_ref_block_free(WT_SESSION_IMPL *session, WT_REF *ref)
|
||||
{
|
||||
const uint8_t *addr;
|
||||
size_t addr_size;
|
||||
|
||||
if (ref->addr == NULL)
|
||||
return (0);
|
||||
|
||||
WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
|
||||
WT_RET(__wt_btree_block_free(session, addr, addr_size));
|
||||
|
||||
/* Clear the address (so we don't free it twice). */
|
||||
__wt_ref_addr_free(session, ref);
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_page_can_evict --
|
||||
* Check whether a page can be evicted.
|
||||
@@ -1196,13 +1263,9 @@ __wt_page_swap_func(WT_SESSION_IMPL *session, WT_REF *held,
|
||||
#endif
|
||||
);
|
||||
|
||||
/* An expected failure: WT_NOTFOUND when doing a cache-only read. */
|
||||
if (LF_ISSET(WT_READ_CACHE) && ret == WT_NOTFOUND)
|
||||
return (WT_NOTFOUND);
|
||||
|
||||
/* An expected failure: WT_RESTART */
|
||||
if (ret == WT_RESTART)
|
||||
return (WT_RESTART);
|
||||
/* Expected failures: page not found or restart. */
|
||||
if (ret == WT_NOTFOUND || ret == WT_RESTART)
|
||||
return (ret);
|
||||
|
||||
/* Discard the original held page. */
|
||||
acquired = ret == 0;
|
||||
|
||||
@@ -334,6 +334,8 @@ struct __wt_connection_impl {
|
||||
uint32_t log_prealloc; /* Log file pre-allocation */
|
||||
uint32_t txn_logsync; /* Log sync configuration */
|
||||
|
||||
WT_SESSION_IMPL *meta_ckpt_session;/* Metadata checkpoint session */
|
||||
|
||||
WT_SESSION_IMPL *sweep_session; /* Handle sweep session */
|
||||
wt_thread_t sweep_tid; /* Handle sweep thread */
|
||||
int sweep_tid_set; /* Handle sweep thread set */
|
||||
|
||||
@@ -114,7 +114,7 @@ extern int __wt_debug_tree(WT_SESSION_IMPL *session, WT_PAGE *page, const char *
|
||||
extern int __wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile);
|
||||
extern int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp);
|
||||
extern void __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref);
|
||||
extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref);
|
||||
extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all);
|
||||
extern int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref);
|
||||
extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref);
|
||||
extern void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep);
|
||||
@@ -153,6 +153,7 @@ extern void __wt_split_stash_discard(WT_SESSION_IMPL *session);
|
||||
extern void __wt_split_stash_discard_all( WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session);
|
||||
extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp);
|
||||
extern int __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref);
|
||||
extern int __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref);
|
||||
extern int __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref);
|
||||
extern int __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing);
|
||||
extern int __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst);
|
||||
@@ -315,7 +316,7 @@ extern int __wt_evict_lru_page(WT_SESSION_IMPL *session, bool is_server);
|
||||
extern int __wt_cache_wait(WT_SESSION_IMPL *session, int full);
|
||||
extern void __wt_cache_dump(WT_SESSION_IMPL *session);
|
||||
extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing);
|
||||
extern void __wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref);
|
||||
extern int __wt_evict_page_clean_update( WT_SESSION_IMPL *session, WT_REF *ref, bool closing);
|
||||
extern int __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn);
|
||||
extern int __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn);
|
||||
extern int __wt_log_needs_recovery(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn, bool *recp);
|
||||
@@ -442,6 +443,8 @@ extern int __wt_meta_track_update(WT_SESSION_IMPL *session, const char *key);
|
||||
extern int __wt_meta_track_fileop( WT_SESSION_IMPL *session, const char *olduri, const char *newuri);
|
||||
extern int __wt_meta_track_drop( WT_SESSION_IMPL *session, const char *filename);
|
||||
extern int __wt_meta_track_handle_lock(WT_SESSION_IMPL *session, bool created);
|
||||
extern int __wt_meta_track_init(WT_SESSION_IMPL *session);
|
||||
extern int __wt_meta_track_destroy(WT_SESSION_IMPL *session);
|
||||
extern int __wt_turtle_init(WT_SESSION_IMPL *session);
|
||||
extern int __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, char **valuep);
|
||||
extern int __wt_turtle_update( WT_SESSION_IMPL *session, const char *key, const char *value);
|
||||
@@ -462,7 +465,7 @@ extern int __wt_exist(WT_SESSION_IMPL *session, const char *filename, bool *exis
|
||||
extern void __wt_fallocate_config(WT_SESSION_IMPL *session, WT_FH *fh);
|
||||
extern int __wt_fallocate( WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, wt_off_t len);
|
||||
extern int __wt_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *sizep);
|
||||
extern int __wt_filesize_name( WT_SESSION_IMPL *session, const char *filename, wt_off_t *sizep);
|
||||
extern int __wt_filesize_name(WT_SESSION_IMPL *session, const char *filename, bool silent, wt_off_t *sizep);
|
||||
extern int __wt_bytelock(WT_FH *fhp, wt_off_t byte, bool lock);
|
||||
extern int __wt_directory_sync_fh(WT_SESSION_IMPL *session, WT_FH *fh);
|
||||
extern int __wt_directory_sync(WT_SESSION_IMPL *session, char *path);
|
||||
|
||||
@@ -32,13 +32,14 @@
|
||||
#define WT_LOG_FSYNC 0x00000004
|
||||
#define WT_READ_CACHE 0x00000001
|
||||
#define WT_READ_COMPACT 0x00000002
|
||||
#define WT_READ_NO_EVICT 0x00000004
|
||||
#define WT_READ_NO_GEN 0x00000008
|
||||
#define WT_READ_NO_WAIT 0x00000010
|
||||
#define WT_READ_PREV 0x00000020
|
||||
#define WT_READ_SKIP_INTL 0x00000040
|
||||
#define WT_READ_TRUNCATE 0x00000080
|
||||
#define WT_READ_WONT_NEED 0x00000100
|
||||
#define WT_READ_NO_EMPTY 0x00000004
|
||||
#define WT_READ_NO_EVICT 0x00000008
|
||||
#define WT_READ_NO_GEN 0x00000010
|
||||
#define WT_READ_NO_WAIT 0x00000020
|
||||
#define WT_READ_PREV 0x00000040
|
||||
#define WT_READ_SKIP_INTL 0x00000080
|
||||
#define WT_READ_TRUNCATE 0x00000100
|
||||
#define WT_READ_WONT_NEED 0x00000200
|
||||
#define WT_SESSION_CAN_WAIT 0x00000001
|
||||
#define WT_SESSION_CLEAR_EVICT_WALK 0x00000002
|
||||
#define WT_SESSION_DISCARD_FORCE 0x00000004
|
||||
|
||||
@@ -34,6 +34,7 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_txn_state {
|
||||
};
|
||||
|
||||
struct __wt_txn_global {
|
||||
WT_SPINLOCK id_lock;
|
||||
volatile uint64_t current; /* Current transaction ID. */
|
||||
|
||||
/* The oldest running transaction ID (may race). */
|
||||
|
||||
@@ -187,6 +187,13 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id)
|
||||
session->dhandle == session->meta_dhandle)
|
||||
return (true);
|
||||
|
||||
/*
|
||||
* If we don't have a transactional snapshot, only make stable updates
|
||||
* visible.
|
||||
*/
|
||||
if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT))
|
||||
return (__wt_txn_visible_all(session, id));
|
||||
|
||||
/* Transactions see their own changes. */
|
||||
if (id == txn->id)
|
||||
return (true);
|
||||
@@ -272,23 +279,6 @@ __wt_txn_autocommit_check(WT_SESSION_IMPL *session)
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_txn_new_id --
|
||||
* Allocate a new transaction ID.
|
||||
*/
|
||||
static inline uint64_t
|
||||
__wt_txn_new_id(WT_SESSION_IMPL *session)
|
||||
{
|
||||
/*
|
||||
* We want the global value to lead the allocated values, so that any
|
||||
* allocated transaction ID eventually becomes globally visible. When
|
||||
* there are no transactions running, the oldest_id will reach the
|
||||
* global current ID, so we want post-increment semantics. Our atomic
|
||||
* add primitive does pre-increment, so adjust the result here.
|
||||
*/
|
||||
return (__wt_atomic_addv64(&S2C(session)->txn_global.current, 1) - 1);
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_txn_idle_cache_check --
|
||||
* If there is no transaction active in this thread and we haven't checked
|
||||
@@ -315,6 +305,54 @@ __wt_txn_idle_cache_check(WT_SESSION_IMPL *session)
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_txn_id_alloc --
|
||||
* Allocate a new transaction ID.
|
||||
*/
|
||||
static inline uint64_t
|
||||
__wt_txn_id_alloc(WT_SESSION_IMPL *session, bool publish)
|
||||
{
|
||||
WT_TXN_GLOBAL *txn_global;
|
||||
uint64_t id;
|
||||
|
||||
txn_global = &S2C(session)->txn_global;
|
||||
|
||||
/*
|
||||
* Allocating transaction IDs involves several steps.
|
||||
*
|
||||
* Firstly, we do an atomic increment to allocate a unique ID. The
|
||||
* field we increment is not used anywhere else.
|
||||
*
|
||||
* Then we optionally publish the allocated ID into the global
|
||||
* transaction table. It is critical that this becomes visible before
|
||||
* the global current value moves past our ID, or some concurrent
|
||||
* reader could get a snapshot that makes our changes visible before we
|
||||
* commit.
|
||||
*
|
||||
* Lastly, we spin to update the current ID. This is the only place
|
||||
* that the current ID is updated, and it is in the same cache line as
|
||||
* the field we allocate from, so we should usually succeed on the
|
||||
* first try.
|
||||
*
|
||||
* We want the global value to lead the allocated values, so that any
|
||||
* allocated transaction ID eventually becomes globally visible. When
|
||||
* there are no transactions running, the oldest_id will reach the
|
||||
* global current ID, so we want post-increment semantics. Our atomic
|
||||
* add primitive does pre-increment, so adjust the result here.
|
||||
*/
|
||||
__wt_spin_lock(session, &txn_global->id_lock);
|
||||
id = txn_global->current;
|
||||
|
||||
if (publish) {
|
||||
session->txn.id = id;
|
||||
WT_PUBLISH(WT_SESSION_TXN_STATE(session)->id, id);
|
||||
}
|
||||
|
||||
++txn_global->current;
|
||||
__wt_spin_unlock(session, &txn_global->id_lock);
|
||||
return (id);
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_txn_id_check --
|
||||
* A transaction is going to do an update, start an auto commit
|
||||
@@ -323,57 +361,27 @@ __wt_txn_idle_cache_check(WT_SESSION_IMPL *session)
|
||||
static inline int
|
||||
__wt_txn_id_check(WT_SESSION_IMPL *session)
|
||||
{
|
||||
WT_CONNECTION_IMPL *conn;
|
||||
WT_TXN *txn;
|
||||
WT_TXN_GLOBAL *txn_global;
|
||||
WT_TXN_STATE *txn_state;
|
||||
|
||||
txn = &session->txn;
|
||||
|
||||
WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));
|
||||
|
||||
if (F_ISSET(txn, WT_TXN_HAS_ID))
|
||||
return (0);
|
||||
|
||||
/* If the transaction is idle, check that the cache isn't full. */
|
||||
WT_RET(__wt_txn_idle_cache_check(session));
|
||||
|
||||
if (!F_ISSET(txn, WT_TXN_HAS_ID)) {
|
||||
conn = S2C(session);
|
||||
txn_global = &conn->txn_global;
|
||||
txn_state = WT_SESSION_TXN_STATE(session);
|
||||
(void)__wt_txn_id_alloc(session, true);
|
||||
|
||||
WT_ASSERT(session, txn_state->id == WT_TXN_NONE);
|
||||
|
||||
/*
|
||||
* Allocate a transaction ID.
|
||||
*
|
||||
* We use an atomic compare and swap to ensure that we get a
|
||||
* unique ID that is published before the global counter is
|
||||
* updated.
|
||||
*
|
||||
* If two threads race to allocate an ID, only the latest ID
|
||||
* will proceed. The winning thread can be sure its snapshot
|
||||
* contains all of the earlier active IDs. Threads that race
|
||||
* and get an earlier ID may not appear in the snapshot, but
|
||||
* they will loop and allocate a new ID before proceeding to
|
||||
* make any updates.
|
||||
*
|
||||
* This potentially wastes transaction IDs when threads race to
|
||||
* begin transactions: that is the price we pay to keep this
|
||||
* path latch free.
|
||||
*/
|
||||
do {
|
||||
txn_state->id = txn->id = txn_global->current;
|
||||
} while (!__wt_atomic_casv64(
|
||||
&txn_global->current, txn->id, txn->id + 1) ||
|
||||
WT_TXNID_LT(txn->id, txn_global->last_running));
|
||||
|
||||
/*
|
||||
* If we have used 64-bits of transaction IDs, there is nothing
|
||||
* more we can do.
|
||||
*/
|
||||
if (txn->id == WT_TXN_ABORTED)
|
||||
WT_RET_MSG(session, ENOMEM, "Out of transaction IDs");
|
||||
F_SET(txn, WT_TXN_HAS_ID);
|
||||
}
|
||||
/*
|
||||
* If we have used 64-bits of transaction IDs, there is nothing
|
||||
* more we can do.
|
||||
*/
|
||||
if (txn->id == WT_TXN_ABORTED)
|
||||
WT_RET_MSG(session, ENOMEM, "Out of transaction IDs");
|
||||
F_SET(txn, WT_TXN_HAS_ID);
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
@@ -213,7 +213,7 @@ __wt_lsm_tree_set_chunk_size(
|
||||
if (!WT_PREFIX_SKIP(filename, "file:"))
|
||||
WT_RET_MSG(session, EINVAL,
|
||||
"Expected a 'file:' URI: %s", chunk->uri);
|
||||
WT_RET(__wt_filesize_name(session, filename, &size));
|
||||
WT_RET(__wt_filesize_name(session, filename, false, &size));
|
||||
|
||||
chunk->size = (uint64_t)size;
|
||||
|
||||
@@ -858,7 +858,7 @@ __wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
|
||||
*/
|
||||
if (last_chunk != NULL && last_chunk->switch_txn == WT_TXN_NONE &&
|
||||
!F_ISSET(last_chunk, WT_LSM_CHUNK_ONDISK))
|
||||
last_chunk->switch_txn = __wt_txn_new_id(session);
|
||||
last_chunk->switch_txn = __wt_txn_id_alloc(session, false);
|
||||
|
||||
/*
|
||||
* If a maximum number of chunks is configured, drop any chunks
|
||||
@@ -1257,7 +1257,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp)
|
||||
if (lsm_tree->nchunks > 0 &&
|
||||
(chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) != NULL) {
|
||||
if (chunk->switch_txn == WT_TXN_NONE)
|
||||
chunk->switch_txn = __wt_txn_new_id(session);
|
||||
chunk->switch_txn = __wt_txn_id_alloc(session, false);
|
||||
/*
|
||||
* If we have a chunk, we want to look for it to be on-disk.
|
||||
* So we need to add a reference to keep it available.
|
||||
|
||||
@@ -261,6 +261,7 @@ __wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll)
|
||||
{
|
||||
WT_DECL_RET;
|
||||
WT_META_TRACK *trk, *trk_orig;
|
||||
WT_SESSION_IMPL *ckpt_session;
|
||||
|
||||
WT_ASSERT(session,
|
||||
WT_META_TRACKING(session) && session->meta_track_nest > 0);
|
||||
@@ -304,8 +305,18 @@ __wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll)
|
||||
session, false, WT_TXN_LOG_CKPT_SYNC, NULL));
|
||||
WT_RET(ret);
|
||||
} else {
|
||||
WT_WITH_DHANDLE(session, session->meta_dhandle,
|
||||
ret = __wt_checkpoint(session, NULL));
|
||||
WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA));
|
||||
ckpt_session = S2C(session)->meta_ckpt_session;
|
||||
/*
|
||||
* If this operation is part of a running transaction, that
|
||||
* should be included in the checkpoint.
|
||||
*/
|
||||
ckpt_session->txn.id = session->txn.id;
|
||||
F_SET(ckpt_session, WT_SESSION_LOCKED_SCHEMA);
|
||||
WT_WITH_DHANDLE(ckpt_session, session->meta_dhandle, ret =
|
||||
__wt_checkpoint(ckpt_session, NULL));
|
||||
F_CLR(ckpt_session, WT_SESSION_LOCKED_SCHEMA);
|
||||
ckpt_session->txn.id = WT_TXN_NONE;
|
||||
WT_RET(ret);
|
||||
WT_WITH_DHANDLE(session, session->meta_dhandle,
|
||||
ret = __wt_checkpoint_sync(session, NULL));
|
||||
@@ -473,3 +484,52 @@ __wt_meta_track_handle_lock(WT_SESSION_IMPL *session, bool created)
|
||||
trk->created = created;
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_meta_track_init --
|
||||
* Initialize metadata tracking.
|
||||
*/
|
||||
int
|
||||
__wt_meta_track_init(WT_SESSION_IMPL *session)
|
||||
{
|
||||
WT_CONNECTION_IMPL *conn;
|
||||
|
||||
conn = S2C(session);
|
||||
if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) {
|
||||
WT_RET(__wt_open_internal_session(conn,
|
||||
"metadata-ckpt", false, false,
|
||||
&conn->meta_ckpt_session));
|
||||
|
||||
/*
|
||||
* Sessions default to read-committed isolation, we rely on
|
||||
* that for the correctness of metadata checkpoints.
|
||||
*/
|
||||
WT_ASSERT(session, conn->meta_ckpt_session->txn.isolation ==
|
||||
WT_ISO_READ_COMMITTED);
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_meta_track_destroy --
|
||||
* Release resources allocated for metadata tracking.
|
||||
*/
|
||||
int
|
||||
__wt_meta_track_destroy(WT_SESSION_IMPL *session)
|
||||
{
|
||||
WT_CONNECTION_IMPL *conn;
|
||||
WT_DECL_RET;
|
||||
WT_SESSION *wt_session;
|
||||
|
||||
conn = S2C(session);
|
||||
|
||||
/* Close the session used for metadata checkpoints. */
|
||||
if (conn->meta_ckpt_session != NULL) {
|
||||
wt_session = &conn->meta_ckpt_session->iface;
|
||||
WT_TRET(wt_session->close(wt_session, NULL));
|
||||
conn->meta_ckpt_session = NULL;
|
||||
}
|
||||
|
||||
return (ret);
|
||||
}
|
||||
|
||||
@@ -34,8 +34,8 @@ __wt_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *sizep)
|
||||
* Return the size of a file in bytes, given a file name.
|
||||
*/
|
||||
int
|
||||
__wt_filesize_name(
|
||||
WT_SESSION_IMPL *session, const char *filename, wt_off_t *sizep)
|
||||
__wt_filesize_name(WT_SESSION_IMPL *session,
|
||||
const char *filename, bool silent, wt_off_t *sizep)
|
||||
{
|
||||
struct stat sb;
|
||||
WT_DECL_RET;
|
||||
@@ -52,5 +52,11 @@ __wt_filesize_name(
|
||||
return (0);
|
||||
}
|
||||
|
||||
WT_RET_MSG(session, ret, "%s: fstat", filename);
|
||||
/*
|
||||
* Some callers of this function expect failure if the file doesn't
|
||||
* exist, and don't want an error message logged.
|
||||
*/
|
||||
if (!silent)
|
||||
WT_RET_MSG(session, ret, "%s: fstat", filename);
|
||||
return (ret);
|
||||
}
|
||||
|
||||
@@ -15,8 +15,8 @@
|
||||
int
|
||||
__wt_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *sizep)
|
||||
{
|
||||
WT_DECL_RET;
|
||||
LARGE_INTEGER size;
|
||||
WT_DECL_RET;
|
||||
|
||||
WT_RET(__wt_verbose(
|
||||
session, WT_VERB_FILEOPS, "%s: GetFileSizeEx", fh->name));
|
||||
@@ -34,11 +34,11 @@ __wt_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *sizep)
|
||||
* Return the size of a file in bytes, given a file name.
|
||||
*/
|
||||
int
|
||||
__wt_filesize_name(
|
||||
WT_SESSION_IMPL *session, const char *filename, wt_off_t *sizep)
|
||||
__wt_filesize_name(WT_SESSION_IMPL *session,
|
||||
const char *filename, bool silent, wt_off_t *sizep)
|
||||
{
|
||||
WT_DECL_RET;
|
||||
WIN32_FILE_ATTRIBUTE_DATA data;
|
||||
WT_DECL_RET;
|
||||
char *path;
|
||||
|
||||
WT_RET(__wt_filename(session, filename, &path));
|
||||
@@ -53,5 +53,12 @@ __wt_filesize_name(
|
||||
return (0);
|
||||
}
|
||||
|
||||
WT_RET_MSG(session, __wt_errno(), "%s: GetFileAttributesEx", filename);
|
||||
/*
|
||||
* Some callers of this function expect failure if the file doesn't
|
||||
* exist, and don't want an error message logged.
|
||||
*/
|
||||
ret = __wt_errno();
|
||||
if (!silent)
|
||||
WT_RET_MSG(session, ret, "%s: GetFileAttributesEx", filename);
|
||||
return (ret);
|
||||
}
|
||||
|
||||
@@ -820,7 +820,7 @@ __wt_ovfl_txnc_add(WT_SESSION_IMPL *session, WT_PAGE *page,
|
||||
txnc->value_offset = WT_PTRDIFF32(p, txnc);
|
||||
txnc->value_size = WT_STORE_SIZE(value_size);
|
||||
memcpy(p, value, value_size);
|
||||
txnc->current = __wt_txn_new_id(session);
|
||||
txnc->current = __wt_txn_id_alloc(session, false);
|
||||
|
||||
__wt_cache_page_inmem_incr(
|
||||
session, page, WT_OVFL_SIZE(txnc, WT_OVFL_TXNC));
|
||||
|
||||
@@ -653,6 +653,7 @@ __rec_write_init(WT_SESSION_IMPL *session,
|
||||
r->flags = flags;
|
||||
|
||||
/* Track if the page can be marked clean. */
|
||||
r->max_txn = WT_TXN_NONE;
|
||||
r->leave_dirty = false;
|
||||
|
||||
/* Raw compression. */
|
||||
@@ -1065,10 +1066,7 @@ static int
|
||||
__rec_child_deleted(WT_SESSION_IMPL *session,
|
||||
WT_RECONCILE *r, WT_REF *ref, WT_CHILD_STATE *statep)
|
||||
{
|
||||
WT_BM *bm;
|
||||
WT_PAGE_DELETED *page_del;
|
||||
size_t addr_size;
|
||||
const uint8_t *addr;
|
||||
|
||||
page_del = ref->page_del;
|
||||
|
||||
@@ -1116,17 +1114,8 @@ __rec_child_deleted(WT_SESSION_IMPL *session,
|
||||
*/
|
||||
if (ref->addr != NULL &&
|
||||
(page_del == NULL ||
|
||||
__wt_txn_visible_all(session, page_del->txnid))) {
|
||||
WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
|
||||
bm = S2BT(session)->bm;
|
||||
WT_RET(bm->free(bm, session, addr, addr_size));
|
||||
|
||||
if (__wt_off_page(ref->home, ref->addr)) {
|
||||
__wt_free(session, ((WT_ADDR *)ref->addr)->addr);
|
||||
__wt_free(session, ref->addr);
|
||||
}
|
||||
ref->addr = NULL;
|
||||
}
|
||||
__wt_txn_visible_all(session, page_del->txnid)))
|
||||
WT_RET(__wt_ref_block_free(session, ref));
|
||||
|
||||
/*
|
||||
* If the original page is gone, we can skip the slot on the internal
|
||||
@@ -4789,13 +4778,11 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
|
||||
static int
|
||||
__rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page)
|
||||
{
|
||||
WT_BM *bm;
|
||||
WT_DECL_RET;
|
||||
WT_PAGE_MODIFY *mod;
|
||||
WT_MULTI *multi;
|
||||
uint32_t i;
|
||||
|
||||
bm = S2BT(session)->bm;
|
||||
mod = page->modify;
|
||||
|
||||
/*
|
||||
@@ -4815,7 +4802,7 @@ __rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page)
|
||||
if (multi->addr.reuse)
|
||||
multi->addr.addr = NULL;
|
||||
else {
|
||||
WT_RET(bm->free(bm, session,
|
||||
WT_RET(__wt_btree_block_free(session,
|
||||
multi->addr.addr, multi->addr.size));
|
||||
__wt_free(session, multi->addr.addr);
|
||||
}
|
||||
@@ -4861,8 +4848,6 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
|
||||
WT_MULTI *multi;
|
||||
WT_PAGE_MODIFY *mod;
|
||||
WT_REF *ref;
|
||||
size_t addr_size;
|
||||
const uint8_t *addr;
|
||||
|
||||
btree = S2BT(session);
|
||||
bm = btree->bm;
|
||||
@@ -4887,21 +4872,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
|
||||
*/
|
||||
if (__wt_ref_is_root(ref))
|
||||
break;
|
||||
if (ref->addr != NULL) {
|
||||
/*
|
||||
* Free the page and clear the address (so we don't free
|
||||
* it twice).
|
||||
*/
|
||||
WT_RET(__wt_ref_info(
|
||||
session, ref, &addr, &addr_size, NULL));
|
||||
WT_RET(bm->free(bm, session, addr, addr_size));
|
||||
if (__wt_off_page(ref->home, ref->addr)) {
|
||||
__wt_free(
|
||||
session, ((WT_ADDR *)ref->addr)->addr);
|
||||
__wt_free(session, ref->addr);
|
||||
}
|
||||
ref->addr = NULL;
|
||||
}
|
||||
WT_RET(__wt_ref_block_free(session, ref));
|
||||
break;
|
||||
case WT_PM_REC_EMPTY: /* Page deleted */
|
||||
break;
|
||||
@@ -4920,7 +4891,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
|
||||
* are checkpoints, and must be explicitly dropped.
|
||||
*/
|
||||
if (!__wt_ref_is_root(ref))
|
||||
WT_RET(bm->free(bm, session,
|
||||
WT_RET(__wt_btree_block_free(session,
|
||||
mod->mod_replace.addr, mod->mod_replace.size));
|
||||
|
||||
/* Discard the replacement page's address. */
|
||||
@@ -5125,14 +5096,12 @@ err: __wt_scr_free(session, &tkey);
|
||||
static int
|
||||
__rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
|
||||
{
|
||||
WT_BM *bm;
|
||||
WT_BOUNDARY *bnd;
|
||||
WT_DECL_RET;
|
||||
WT_MULTI *multi;
|
||||
WT_PAGE_MODIFY *mod;
|
||||
uint32_t i;
|
||||
|
||||
bm = S2BT(session)->bm;
|
||||
mod = page->modify;
|
||||
|
||||
/*
|
||||
@@ -5163,7 +5132,7 @@ __rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
|
||||
if (bnd->addr.reuse)
|
||||
bnd->addr.addr = NULL;
|
||||
else {
|
||||
WT_TRET(bm->free(bm, session,
|
||||
WT_TRET(__wt_btree_block_free(session,
|
||||
bnd->addr.addr, bnd->addr.size));
|
||||
__wt_free(session, bnd->addr.addr);
|
||||
}
|
||||
|
||||
@@ -52,6 +52,67 @@ err: __wt_scr_free(session, &buf);
|
||||
return (ret);
|
||||
}
|
||||
|
||||
/*
|
||||
* __curstat_size_only --
|
||||
* For very simple tables we can avoid getting table handles if
|
||||
* configured to only retrieve the size. It's worthwhile because
|
||||
* workloads that create and drop a lot of tables can put a lot of
|
||||
* pressure on the table list lock.
|
||||
*/
|
||||
static int
|
||||
__curstat_size_only(WT_SESSION_IMPL *session,
|
||||
const char *uri, bool *was_fast,WT_CURSOR_STAT *cst)
|
||||
{
|
||||
WT_CONFIG cparser;
|
||||
WT_CONFIG_ITEM ckey, colconf, cval;
|
||||
WT_DECL_RET;
|
||||
WT_ITEM namebuf;
|
||||
wt_off_t filesize;
|
||||
char *tableconf;
|
||||
|
||||
WT_CLEAR(namebuf);
|
||||
*was_fast = false;
|
||||
|
||||
/* Retrieve the metadata for this table. */
|
||||
WT_RET(__wt_metadata_search(session, uri, &tableconf));
|
||||
|
||||
/*
|
||||
* The fast path only works if the table consists of a single file
|
||||
* and does not have any indexes. The absence of named columns is how
|
||||
* we determine that neither of those conditions can be satisfied.
|
||||
*/
|
||||
WT_ERR(__wt_config_getones(session, tableconf, "columns", &colconf));
|
||||
WT_ERR(__wt_config_subinit(session, &cparser, &colconf));
|
||||
if ((ret = __wt_config_next(&cparser, &ckey, &cval)) == 0)
|
||||
goto err;
|
||||
|
||||
/* Build up the file name from the table URI. */
|
||||
WT_ERR(__wt_buf_fmt(
|
||||
session, &namebuf, "%s.wt", uri + strlen("table:")));
|
||||
|
||||
/*
|
||||
* Get the size of the underlying file. This will fail for anything
|
||||
* other than simple tables (LSM for example) and will fail if there
|
||||
* are concurrent schema level operations (for example drop). That is
|
||||
* fine - failing here results in falling back to the slow path of
|
||||
* opening the handle.
|
||||
* !!! Deliberately discard the return code from a failed call - the
|
||||
* error is flagged by not setting fast to true.
|
||||
*/
|
||||
if (__wt_filesize_name(session, namebuf.data, true, &filesize) == 0) {
|
||||
/* Setup and populate the statistics structure */
|
||||
__wt_stat_init_dsrc_stats(&cst->u.dsrc_stats);
|
||||
WT_STAT_SET(&cst->u.dsrc_stats, block_size, filesize);
|
||||
__wt_curstat_dsrc_final(cst);
|
||||
*was_fast = true;
|
||||
}
|
||||
|
||||
err: __wt_free(session, tableconf);
|
||||
__wt_buf_free(session, &namebuf);
|
||||
|
||||
return (ret);
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_curstat_table_init --
|
||||
* Initialize the statistics for a table.
|
||||
@@ -67,6 +128,17 @@ __wt_curstat_table_init(WT_SESSION_IMPL *session,
|
||||
WT_TABLE *table;
|
||||
u_int i;
|
||||
const char *name;
|
||||
bool was_fast;
|
||||
|
||||
/*
|
||||
* If only gathering table size statistics, try a fast path that
|
||||
* avoids the schema and table list locks.
|
||||
*/
|
||||
if (F_ISSET(cst, WT_CONN_STAT_SIZE)) {
|
||||
WT_RET(__curstat_size_only(session, uri, &was_fast, cst));
|
||||
if (was_fast)
|
||||
return (0);
|
||||
}
|
||||
|
||||
name = uri + strlen("table:");
|
||||
WT_RET(__wt_schema_get_table(
|
||||
|
||||
@@ -597,6 +597,9 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[])
|
||||
txn_global->current = txn_global->last_running =
|
||||
txn_global->oldest_id = WT_TXN_FIRST;
|
||||
|
||||
WT_RET(__wt_spin_init(session,
|
||||
&txn_global->id_lock, "transaction id lock"));
|
||||
|
||||
WT_RET(__wt_calloc_def(
|
||||
session, conn->session_size, &txn_global->states));
|
||||
for (i = 0, s = txn_global->states; i < conn->session_size; i++, s++)
|
||||
@@ -618,6 +621,6 @@ __wt_txn_global_destroy(WT_SESSION_IMPL *session)
|
||||
conn = S2C(session);
|
||||
txn_global = &conn->txn_global;
|
||||
|
||||
if (txn_global != NULL)
|
||||
__wt_free(session, txn_global->states);
|
||||
__wt_spin_destroy(session, &txn_global->id_lock);
|
||||
__wt_free(session, txn_global->states);
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#! /bin/sh
|
||||
|
||||
# Smoke-test format as part of running "make check".
|
||||
args="-1 -c "." data_source=table ops=100000 rows=10000 threads=4 compression=none logging_compression=none"
|
||||
args="-1 -c "." data_source=table ops=100000 rows=10000 threads=4 compression=none"
|
||||
|
||||
./t $args file_type=fix || exit 1
|
||||
./t $args file_type=row || exit 1
|
||||
|
||||
@@ -37,20 +37,24 @@ from helper import complex_populate, complex_populate_lsm, simple_populate
|
||||
class test_stat_cursor_config(wttest.WiredTigerTestCase):
|
||||
pfx = 'test_stat_cursor_config'
|
||||
uri = [
|
||||
('1', dict(uri='file:' + pfx, pop=simple_populate)),
|
||||
('2', dict(uri='table:' + pfx, pop=simple_populate)),
|
||||
('3', dict(uri='table:' + pfx, pop=complex_populate)),
|
||||
('4', dict(uri='table:' + pfx, pop=complex_populate_lsm))
|
||||
('file', dict(uri='file:' + pfx, pop=simple_populate, cfg='')),
|
||||
('table', dict(uri='table:' + pfx, pop=simple_populate, cfg='')),
|
||||
('table-lsm',
|
||||
dict(uri='table:' + pfx, pop=simple_populate, cfg=',type=lsm')),
|
||||
('complex', dict(uri='table:' + pfx, pop=complex_populate, cfg='')),
|
||||
('complex-lsm',
|
||||
dict(uri='table:' + pfx, pop=complex_populate_lsm, cfg=''))
|
||||
]
|
||||
data_config = [
|
||||
('none', dict(data_config='none', ok=[])),
|
||||
( 'all', dict(data_config='all', ok=['empty', 'fast', 'all'])),
|
||||
('fast', dict(data_config='fast', ok=['empty', 'fast']))
|
||||
( 'all', dict(data_config='all', ok=['empty', 'fast', 'all', 'size'])),
|
||||
('fast', dict(data_config='fast', ok=['empty', 'fast', 'size']))
|
||||
]
|
||||
cursor_config = [
|
||||
('empty', dict(cursor_config='empty')),
|
||||
( 'all', dict(cursor_config='all')),
|
||||
('fast', dict(cursor_config='fast'))
|
||||
('fast', dict(cursor_config='fast')),
|
||||
('size', dict(cursor_config='size'))
|
||||
]
|
||||
|
||||
scenarios = number_scenarios(
|
||||
@@ -67,7 +71,7 @@ class test_stat_cursor_config(wttest.WiredTigerTestCase):
|
||||
# For each database/cursor configuration, confirm the right combinations
|
||||
# succeed or fail.
|
||||
def test_stat_cursor_config(self):
|
||||
self.pop(self, self.uri, 'key_format=S', 100)
|
||||
self.pop(self, self.uri, 'key_format=S' + self.cfg, 100)
|
||||
config = 'statistics=('
|
||||
if self.cursor_config != 'empty':
|
||||
config = config + self.cursor_config
|
||||
|
||||
89
test/suite/test_stat05.py
Normal file
89
test/suite/test_stat05.py
Normal file
@@ -0,0 +1,89 @@
|
||||
#!/usr/bin/env python
|
||||
#
|
||||
# Public Domain 2014-2015 MongoDB, Inc.
|
||||
# Public Domain 2008-2014 WiredTiger, Inc.
|
||||
#
|
||||
# This is free and unencumbered software released into the public domain.
|
||||
#
|
||||
# Anyone is free to copy, modify, publish, use, compile, sell, or
|
||||
# distribute this software, either in source code form or as a compiled
|
||||
# binary, for any purpose, commercial or non-commercial, and by any
|
||||
# means.
|
||||
#
|
||||
# In jurisdictions that recognize copyright laws, the author or authors
|
||||
# of this software dedicate any and all copyright interest in the
|
||||
# software to the public domain. We make this dedication for the benefit
|
||||
# of the public at large and to the detriment of our heirs and
|
||||
# successors. We intend this dedication to be an overt act of
|
||||
# relinquishment in perpetuity of all present and future rights to this
|
||||
# software under copyright law.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
# OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
import itertools, wiredtiger, wttest
|
||||
from suite_subprocess import suite_subprocess
|
||||
from wtscenario import multiply_scenarios, number_scenarios
|
||||
from wiredtiger import stat
|
||||
from helper import complex_populate, complex_populate_lsm, simple_populate
|
||||
from helper import complex_value_populate, key_populate, value_populate
|
||||
|
||||
# test_stat05.py
|
||||
# Statistics cursor using size only
|
||||
class test_stat_cursor_config(wttest.WiredTigerTestCase):
|
||||
pfx = 'test_stat_cursor_size'
|
||||
uri = [
|
||||
('file', dict(uri='file:' + pfx, pop=simple_populate, cfg='')),
|
||||
('table', dict(uri='table:' + pfx, pop=simple_populate, cfg='')),
|
||||
('table-lsm', dict(uri='table:' + pfx, pop=simple_populate,
|
||||
cfg=',type=lsm,lsm=(chunk_size=1MB,merge_min=2)')),
|
||||
('complex', dict(uri='table:' + pfx, pop=complex_populate, cfg='')),
|
||||
('complex-lsm',
|
||||
dict(uri='table:' + pfx, pop=complex_populate_lsm,
|
||||
cfg=',lsm=(chunk_size=1MB,merge_min=2)')),
|
||||
]
|
||||
|
||||
scenarios = number_scenarios(uri)
|
||||
|
||||
# Override WiredTigerTestCase to enable statistics
|
||||
def setUpConnectionOpen(self, dir):
|
||||
conn = wiredtiger.wiredtiger_open(dir,
|
||||
'create,' +
|
||||
'statistics=(fast),' +
|
||||
'error_prefix="%s: "' % self.shortid())
|
||||
return conn
|
||||
|
||||
def openAndWalkStatCursor(self):
|
||||
c = self.session.open_cursor(
|
||||
'statistics:' + self.uri, None, 'statistics=(size)')
|
||||
count = 0
|
||||
while c.next() == 0:
|
||||
count += 1
|
||||
c.close()
|
||||
|
||||
|
||||
# Open a size-only statistics cursor on various table types. Ensure that
|
||||
# the cursor open succeeds. Insert enough data that LSM tables need to
|
||||
# switch and merge.
|
||||
def test_stat_cursor_size(self):
|
||||
self.pop(self, self.uri, 'key_format=S' + self.cfg, 100)
|
||||
self.openAndWalkStatCursor()
|
||||
cursor = self.session.open_cursor(self.uri, None)
|
||||
for i in range(100, 40000 + 1):
|
||||
if i % 100 == 0:
|
||||
self.openAndWalkStatCursor()
|
||||
if self.pop == simple_populate:
|
||||
cursor[key_populate(cursor, i)] = value_populate(cursor, i)
|
||||
else:
|
||||
v = complex_value_populate(cursor, i)
|
||||
cursor[key_populate(cursor, i)] = (v[0], v[1], v[2], v[3])
|
||||
cursor.close()
|
||||
self.openAndWalkStatCursor()
|
||||
|
||||
if __name__ == '__main__':
|
||||
wttest.run()
|
||||
Reference in New Issue
Block a user