Merge pull request #2260 from wiredtiger/reverse-splits
SERVER-21027 Reverse split if there are many deleted pages
This commit is contained in:
1
dist/flags.py
vendored
1
dist/flags.py
vendored
@@ -36,6 +36,7 @@ flags = {
|
||||
'page_read' : [
|
||||
'READ_CACHE',
|
||||
'READ_COMPACT',
|
||||
'READ_NO_EMPTY',
|
||||
'READ_NO_EVICT',
|
||||
'READ_NO_GEN',
|
||||
'READ_NO_WAIT',
|
||||
|
||||
@@ -214,10 +214,11 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
|
||||
|
||||
/*
|
||||
* __wt_delete_page_skip --
|
||||
* If iterating a cursor, skip deleted pages that are visible to us.
|
||||
* If iterating a cursor, skip deleted pages that are either visible to
|
||||
* us or globally visible.
|
||||
*/
|
||||
bool
|
||||
__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
|
||||
__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
|
||||
{
|
||||
bool skip;
|
||||
|
||||
@@ -245,8 +246,9 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
|
||||
if (!__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED))
|
||||
return (false);
|
||||
|
||||
skip = ref->page_del == NULL ||
|
||||
__wt_txn_visible(session, ref->page_del->txnid);
|
||||
skip = ref->page_del == NULL || (visible_all ?
|
||||
__wt_txn_visible_all(session, ref->page_del->txnid) :
|
||||
__wt_txn_visible(session, ref->page_del->txnid));
|
||||
|
||||
WT_PUBLISH(ref->state, WT_REF_DELETED);
|
||||
return (skip);
|
||||
|
||||
@@ -448,8 +448,12 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
|
||||
for (oldgen = stalled = false,
|
||||
force_attempts = 0, sleep_cnt = wait_cnt = 0;;) {
|
||||
switch (ref->state) {
|
||||
case WT_REF_DISK:
|
||||
case WT_REF_DELETED:
|
||||
if (LF_ISSET(WT_READ_NO_EMPTY) &&
|
||||
__wt_delete_page_skip(session, ref, false))
|
||||
return (WT_NOTFOUND);
|
||||
/* FALLTHROUGH */
|
||||
case WT_REF_DISK:
|
||||
if (LF_ISSET(WT_READ_CACHE))
|
||||
return (WT_NOTFOUND);
|
||||
|
||||
|
||||
@@ -1010,11 +1010,12 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
|
||||
* reading thread will restart. Include the ref we are splitting in
|
||||
* the count to be deleted.
|
||||
*/
|
||||
for (i = 0, deleted_entries = 1; i < parent_entries; ++i) {
|
||||
deleted_entries = ref_new != NULL ? 1 : 0;
|
||||
for (i = 0; i < parent_entries; ++i) {
|
||||
next_ref = pindex->index[i];
|
||||
WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
|
||||
if (next_ref->state == WT_REF_DELETED &&
|
||||
__wt_delete_page_skip(session, next_ref) &&
|
||||
__wt_delete_page_skip(session, next_ref, true) &&
|
||||
__wt_atomic_casv32(
|
||||
&next_ref->state, WT_REF_DELETED, WT_REF_SPLIT))
|
||||
deleted_entries++;
|
||||
@@ -1026,6 +1027,18 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
|
||||
*/
|
||||
result_entries = (parent_entries + new_entries) - deleted_entries;
|
||||
|
||||
/*
|
||||
* If the entire (sub)tree is empty, leave the first ref in place,
|
||||
* deleted.
|
||||
*/
|
||||
if (result_entries == 0) {
|
||||
next_ref = pindex->index[0];
|
||||
WT_ASSERT(session, next_ref->state == WT_REF_SPLIT);
|
||||
next_ref->state = WT_REF_DELETED;
|
||||
--deleted_entries;
|
||||
result_entries = 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate and initialize a new page index array for the parent, then
|
||||
* copy references from the original index array, plus references from
|
||||
@@ -1507,6 +1520,29 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
|
||||
return (ret);
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_split_reverse --
|
||||
* Lock, then reverse split an internal page (remove deleted refs).
|
||||
* The page operated on is ref->page. The return value is whatever
* __split_parent reports, or an error from taking or releasing the
* parent lock.
*/
|
||||
int
|
||||
__wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
|
||||
{
|
||||
WT_DECL_RET;
|
||||
WT_PAGE *parent;
|
||||
WT_REF dummy_child;
|
||||
bool hazard;
|
||||
|
||||
/*
* Build a stand-in child reference whose home is the page we want to
* reverse split. The parent lock/split helpers take a child ref, so
* handing them this dummy makes ref->page play the role of "parent"
* (the assertion after locking checks exactly that).
*/
WT_CLEAR(dummy_child);
|
||||
dummy_child.home = dummy_child.page = ref->page;
|
||||
dummy_child.state = WT_REF_MEM;
|
||||
|
||||
/* Bail out immediately if the lock cannot be acquired. */
WT_RET(__split_parent_lock(session, &dummy_child, &parent, &hazard));
|
||||
WT_ASSERT(session, parent == ref->page);
|
||||
/*
* No replacement refs are supplied (NULL and zero counts), so nothing
* is inserted into the parent's index.
* NOTE(review): presumably this leaves __split_parent only dropping
* deleted child refs; confirm against its full parameter list.
*/
ret = __split_parent(session, &dummy_child, NULL, 0, 0, 0);
|
||||
/*
* Always release the lock, even when the split failed.
* NOTE(review): WT_TRET presumably keeps the first error in ret --
* confirm.
*/
WT_TRET(__split_parent_unlock(session, parent, hazard));
|
||||
return (ret);
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_split_rewrite --
|
||||
* Rewrite an in-memory page with a new version.
|
||||
|
||||
@@ -94,6 +94,9 @@ __wt_tree_walk(WT_SESSION_IMPL *session,
|
||||
*/
|
||||
WT_ENTER_PAGE_INDEX(session);
|
||||
|
||||
/* Walk should never instantiate deleted pages. */
|
||||
LF_SET(WT_READ_NO_EMPTY);
|
||||
|
||||
/*
|
||||
* !!!
|
||||
* Fast-truncate currently only works on row-store trees.
|
||||
@@ -174,9 +177,10 @@ ascend: /*
|
||||
|
||||
/*
|
||||
* If we got all the way through an internal page and
|
||||
* all of the child pages were deleted, evict it.
|
||||
* all of the child pages were deleted, mark it for
|
||||
* eviction.
|
||||
*/
|
||||
if (empty_internal) {
|
||||
if (empty_internal && pindex->entries > 1) {
|
||||
__wt_page_evict_soon(ref->page);
|
||||
empty_internal = false;
|
||||
}
|
||||
@@ -257,7 +261,7 @@ ascend: /*
|
||||
* to delete it again.
|
||||
*/
|
||||
if (ref->state == WT_REF_DELETED &&
|
||||
__wt_delete_page_skip(session, ref))
|
||||
__wt_delete_page_skip(session, ref, false))
|
||||
break;
|
||||
/*
|
||||
* If deleting a range, try to delete the page
|
||||
@@ -294,7 +298,7 @@ ascend: /*
|
||||
* Try to skip deleted pages visible to us.
|
||||
*/
|
||||
if (ref->state == WT_REF_DELETED &&
|
||||
__wt_delete_page_skip(session, ref))
|
||||
__wt_delete_page_skip(session, ref, false))
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -302,7 +306,7 @@ ascend: /*
|
||||
|
||||
/*
|
||||
* Not-found is an expected return when only walking
|
||||
* in-cache pages.
|
||||
* in-cache pages, or if we see a deleted page.
|
||||
*/
|
||||
if (ret == WT_NOTFOUND) {
|
||||
ret = 0;
|
||||
|
||||
@@ -142,6 +142,28 @@ done: if (((inmem_split && ret == 0) || (forced_eviction && ret == EBUSY)) &&
|
||||
|
||||
return (ret);
|
||||
}
|
||||
/*
|
||||
* __evict_reverse_split_check --
|
||||
* Check if an internal page needs a reverse split.
|
||||
* Counts one more deleted child in the page index of ref's home page
* and, once the count exceeds a tenth of that index's entries, asks
* for a reverse split of the home page. Returns 0 when no split is
* attempted, otherwise whatever WT_RET propagates from
* __wt_split_reverse.
*/
|
||||
static int
|
||||
__evict_reverse_split_check(WT_SESSION_IMPL *session, WT_REF *ref)
|
||||
{
|
||||
WT_PAGE *parent;
|
||||
WT_PAGE_INDEX *pindex;
|
||||
uint32_t deleted_entries;
|
||||
|
||||
/* Nothing is counted or split for the root ref. */
if (__wt_ref_is_root(ref))
|
||||
return (0);
|
||||
|
||||
parent = ref->home;
|
||||
WT_INTL_INDEX_GET(session, parent, pindex);
|
||||
/*
* Bump the shared per-index counter atomically.
* NOTE(review): presumably the helper returns the post-increment
* value -- confirm.
*/
deleted_entries = __wt_atomic_addv32(&pindex->deleted_entries, 1);
|
||||
/*
* Integer division: for an index with fewer than 10 entries the
* threshold is 0, so the first counted deletion already requests a
* reverse split.
* NOTE(review): confirm that is intended.
*/
if (deleted_entries > pindex->entries / 10)
|
||||
WT_RET(__wt_split_reverse(session, parent->pg_intl_parent_ref));
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* __wt_evict_page_clean_update --
|
||||
@@ -151,6 +173,8 @@ int
|
||||
__wt_evict_page_clean_update(
|
||||
WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
|
||||
{
|
||||
WT_DECL_RET;
|
||||
|
||||
/*
|
||||
* If doing normal system eviction, but only in the service of reducing
|
||||
* the number of dirty pages, leave the clean page in cache.
|
||||
@@ -164,8 +188,13 @@ __wt_evict_page_clean_update(
|
||||
* page re-instantiated (for example, by searching) and never written.
|
||||
*/
|
||||
__wt_ref_out(session, ref);
|
||||
WT_PUBLISH(ref->state,
|
||||
ref->addr == NULL ? WT_REF_DELETED : WT_REF_DISK);
|
||||
if (ref->addr == NULL) {
|
||||
WT_PUBLISH(ref->state, WT_REF_DELETED);
|
||||
WT_WITH_PAGE_INDEX(session,
|
||||
ret = __evict_reverse_split_check(session, ref));
|
||||
WT_RET_BUSY_OK(ret);
|
||||
} else
|
||||
WT_PUBLISH(ref->state, WT_REF_DISK);
|
||||
|
||||
return (0);
|
||||
}
|
||||
@@ -178,6 +207,7 @@ static int
|
||||
__evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
|
||||
{
|
||||
WT_ADDR *addr;
|
||||
WT_DECL_RET;
|
||||
WT_PAGE *parent;
|
||||
WT_PAGE_MODIFY *mod;
|
||||
|
||||
@@ -207,6 +237,9 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
|
||||
__wt_ref_out(session, ref);
|
||||
ref->addr = NULL;
|
||||
WT_PUBLISH(ref->state, WT_REF_DELETED);
|
||||
WT_WITH_PAGE_INDEX(session,
|
||||
ret = __evict_reverse_split_check(session, ref));
|
||||
WT_RET_BUSY_OK(ret);
|
||||
break;
|
||||
case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
|
||||
/*
|
||||
|
||||
@@ -432,6 +432,7 @@ struct __wt_page {
|
||||
|
||||
struct __wt_page_index {
|
||||
uint32_t entries;
|
||||
uint32_t deleted_entries;
|
||||
WT_REF **index;
|
||||
} * volatile __index; /* Collated children */
|
||||
|
||||
|
||||
@@ -1263,13 +1263,9 @@ __wt_page_swap_func(WT_SESSION_IMPL *session, WT_REF *held,
|
||||
#endif
|
||||
);
|
||||
|
||||
/* An expected failure: WT_NOTFOUND when doing a cache-only read. */
|
||||
if (LF_ISSET(WT_READ_CACHE) && ret == WT_NOTFOUND)
|
||||
return (WT_NOTFOUND);
|
||||
|
||||
/* An expected failure: WT_RESTART */
|
||||
if (ret == WT_RESTART)
|
||||
return (WT_RESTART);
|
||||
/* Expected failures: page not found or restart. */
|
||||
if (ret == WT_NOTFOUND || ret == WT_RESTART)
|
||||
return (ret);
|
||||
|
||||
/* Discard the original held page. */
|
||||
acquired = ret == 0;
|
||||
|
||||
@@ -116,7 +116,7 @@ extern int __wt_debug_tree( WT_SESSION_IMPL *session, WT_BTREE *btree, WT_PAGE *
|
||||
extern int __wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile);
|
||||
extern int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp);
|
||||
extern void __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref);
|
||||
extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref);
|
||||
extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all);
|
||||
extern int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref);
|
||||
extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref);
|
||||
extern void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep);
|
||||
@@ -155,6 +155,7 @@ extern void __wt_split_stash_discard(WT_SESSION_IMPL *session);
|
||||
extern void __wt_split_stash_discard_all( WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session);
|
||||
extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp);
|
||||
extern int __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref);
|
||||
extern int __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref);
|
||||
extern int __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref);
|
||||
extern int __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing);
|
||||
extern int __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst);
|
||||
|
||||
@@ -36,13 +36,14 @@
|
||||
#define WT_LOG_FSYNC 0x00000008
|
||||
#define WT_READ_CACHE 0x00000001
|
||||
#define WT_READ_COMPACT 0x00000002
|
||||
#define WT_READ_NO_EVICT 0x00000004
|
||||
#define WT_READ_NO_GEN 0x00000008
|
||||
#define WT_READ_NO_WAIT 0x00000010
|
||||
#define WT_READ_PREV 0x00000020
|
||||
#define WT_READ_SKIP_INTL 0x00000040
|
||||
#define WT_READ_TRUNCATE 0x00000080
|
||||
#define WT_READ_WONT_NEED 0x00000100
|
||||
#define WT_READ_NO_EMPTY 0x00000004
|
||||
#define WT_READ_NO_EVICT 0x00000008
|
||||
#define WT_READ_NO_GEN 0x00000010
|
||||
#define WT_READ_NO_WAIT 0x00000020
|
||||
#define WT_READ_PREV 0x00000040
|
||||
#define WT_READ_SKIP_INTL 0x00000080
|
||||
#define WT_READ_TRUNCATE 0x00000100
|
||||
#define WT_READ_WONT_NEED 0x00000200
|
||||
#define WT_SESSION_CAN_WAIT 0x00000001
|
||||
#define WT_SESSION_CLEAR_EVICT_WALK 0x00000002
|
||||
#define WT_SESSION_INTERNAL 0x00000004
|
||||
|
||||
Reference in New Issue
Block a user