Merge pull request #2271 from wiredtiger/reverse-split-fix

SERVER-21027 Fix reverse splits to keep the original child ref locked
This commit is contained in:
Alex Gorrod
2015-10-29 16:47:19 +11:00
3 changed files with 53 additions and 40 deletions

View File

@@ -1010,8 +1010,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
* reading thread will restart. Include the ref we are splitting in
* the count to be deleted.
*/
deleted_entries = ref_new != NULL ? 1 : 0;
for (i = 0; i < parent_entries; ++i) {
for (deleted_entries = 1, i = 0; i < parent_entries; ++i) {
next_ref = pindex->index[i];
WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
if (next_ref->state == WT_REF_DELETED &&
@@ -1033,7 +1032,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
*/
if (result_entries == 0) {
next_ref = pindex->index[0];
WT_ASSERT(session, next_ref->state == WT_REF_SPLIT);
WT_ASSERT(session, next_ref->state == WT_REF_SPLIT ||
(next_ref == ref && ref->state == WT_REF_LOCKED));
next_ref->state = WT_REF_DELETED;
--deleted_entries;
result_entries = 1;
@@ -1119,9 +1119,9 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
"%s split into parent %" PRIu32 " -> %" PRIu32
" (%" PRIu32 ")",
__wt_page_type_string(ref->page->type), parent_entries,
result_entries, result_entries - parent_entries));
" (%" PRIu32 ")", ref->page == NULL ?
"reverse" : __wt_page_type_string(ref->page->type),
parent_entries, result_entries, result_entries - parent_entries));
/*
* The new page index is in place, free the WT_REF we were splitting
@@ -1522,23 +1522,18 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
/*
* __wt_split_reverse --
* Lock, then reverse split an internal page (remove deleted refs).
* We have a locked ref that is empty and we want to rewrite the index in
* its parent.
*
* Locks the parent through __split_parent_lock, calls __split_parent with
* no replacement refs (NULL, 0, 0, 0), then releases the parent through
* __split_parent_unlock. The value returned is ret after the unlock status
* has been passed through WT_TRET (presumably merging the two; confirm
* against the macro definition).
*
* NOTE(review): this span appears to come from a rendered diff whose +/-
* markers were lost. The dummy_child statements look like the superseded
* implementation and the statements that pass ref directly look like their
* replacement; confirm against the repository before relying on either.
*/
int
__wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
{
WT_DECL_RET;
WT_PAGE *parent;
WT_REF dummy_child;
bool hazard;
/*
* Variant using a stand-in child: a zeroed WT_REF whose home and page are
* both ref->page and whose state is WT_REF_MEM is handed to the lock and
* split calls in place of a real child.
*/
WT_CLEAR(dummy_child);
dummy_child.home = dummy_child.page = ref->page;
dummy_child.state = WT_REF_MEM;
WT_RET(__split_parent_lock(session, &dummy_child, &parent, &hazard));
WT_ASSERT(session, parent == ref->page);
ret = __split_parent(session, &dummy_child, NULL, 0, 0, 0);
/*
* Variant using the caller's ref: the ref itself is handed to the lock and
* split calls, so the parent found is the parent of ref.
*/
WT_RET(__split_parent_lock(session, ref, &parent, &hazard));
ret = __split_parent(session, ref, NULL, 0, 0, 0);
/*
* The split result is kept in ret rather than returned immediately, so the
* unlock below runs whether or not the split succeeded.
*/
WT_TRET(__split_parent_unlock(session, parent, hazard));
return (ret);
}

View File

@@ -32,6 +32,9 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop)
/* Make sure the oldest transaction ID is up-to-date. */
__wt_txn_update_oldest(session, true);
if (txn->isolation == WT_ISO_READ_COMMITTED)
__wt_txn_get_snapshot(session);
/* Walk the tree, discarding pages. */
next_ref = NULL;
WT_ERR(__wt_tree_walk(session, &next_ref, NULL,
@@ -59,11 +62,12 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop)
* and the write will fail with EBUSY. Our caller handles that
* error, retrying later.
*/
if (syncop == WT_SYNC_CLOSE && __wt_page_is_modified(page)) {
if (txn->isolation == WT_ISO_READ_COMMITTED)
__wt_txn_get_snapshot(session);
if (syncop == WT_SYNC_CLOSE && __wt_page_is_modified(page))
WT_ERR(__wt_reconcile(session, ref, NULL, WT_EVICTING));
}
/* Update our snapshot for each new page. */
if (txn->isolation == WT_ISO_READ_COMMITTED)
__wt_txn_get_snapshot(session);
/*
* We can't evict the page just returned to us (it marks our

View File

@@ -143,25 +143,47 @@ done: if (((inmem_split && ret == 0) || (forced_eviction && ret == EBUSY)) &&
return (ret);
}
/*
* __evict_reverse_split_check --
* Check if an internal page needs a reverse split.
* __evict_delete_ref --
* Mark a page reference deleted and check if the parent can reverse
* split.
*
* session: the session handle passed through to the helpers called here.
* ref: the page reference being marked deleted; nothing is done for the
* root reference.
* closing: when true, the deleted-entry accounting and the reverse split
* attempt are skipped and the ref is only marked deleted.
*
* Returns 0 for the root reference and after publishing WT_REF_DELETED;
* otherwise returns whatever __wt_split_reverse returned, unless that was
* EBUSY, in which case the ref is still marked deleted and 0 is returned.
*
* NOTE(review): this span appears to come from a rendered diff whose +/-
* markers were lost. The __evict_reverse_split_check name and signature,
* the deleted_entries local and the first, unguarded accounting block look
* like the superseded version; the __evict_delete_ref signature, ndeleted
* and the "if (!closing)" block look like their replacement. Confirm
* against the repository.
*/
static int
__evict_reverse_split_check(WT_SESSION_IMPL *session, WT_REF *ref)
__evict_delete_ref(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
{
WT_DECL_RET;
WT_PAGE *parent;
WT_PAGE_INDEX *pindex;
uint32_t deleted_entries;
uint32_t ndeleted;
/* The root reference is left untouched. */
if (__wt_ref_is_root(ref))
return (0);
/*
* Unguarded variant: count this ref as deleted in the parent's index and,
* past the 10% threshold, reverse split via the parent's own parent ref.
*/
parent = ref->home;
WT_INTL_INDEX_GET(session, parent, pindex);
deleted_entries = __wt_atomic_addv32(&pindex->deleted_entries, 1);
if (deleted_entries > pindex->entries / 10)
WT_RET(__wt_split_reverse(session, parent->pg_intl_parent_ref));
/*
* Avoid doing reverse splits when closing the file, it is
* wasted work and some structures may already have been freed.
*/
if (!closing) {
parent = ref->home;
WT_INTL_INDEX_GET(session, parent, pindex);
ndeleted = __wt_atomic_addv32(&pindex->deleted_entries, 1);
/*
* If more than 10% of the parent references are deleted, try a
* reverse split. Don't bother if the parent index holds only a
* single entry (the check below requires pindex->entries > 1):
* the internal page is empty and we have to wait for eviction
* to notice.
*
* This will consume the deleted ref (and eventually free it).
* If the reverse split can't get the access it needs because
* something is busy, be sure that the page still ends up
* marked deleted.
*/
if (ndeleted > pindex->entries / 10 && pindex->entries > 1 &&
(ret = __wt_split_reverse(session, ref)) != EBUSY)
return (ret);
}
/*
* Reached when closing, when no reverse split was attempted, or when the
* attempt returned EBUSY: publish the deleted state ourselves.
*/
WT_PUBLISH(ref->state, WT_REF_DELETED);
return (0);
}
@@ -189,16 +211,9 @@ __wt_evict_page_clean_update(
*/
__wt_ref_out(session, ref);
if (ref->addr == NULL) {
WT_PUBLISH(ref->state, WT_REF_DELETED);
/*
* Avoid doing reverse splits when closing the file, it is
* wasted work and some structure may already have been freed.
*/
if (!closing) {
WT_WITH_PAGE_INDEX(session,
ret = __evict_reverse_split_check(session, ref));
WT_RET_BUSY_OK(ret);
}
WT_WITH_PAGE_INDEX(session,
ret = __evict_delete_ref(session, ref, closing));
WT_RET_BUSY_OK(ret);
} else
WT_PUBLISH(ref->state, WT_REF_DISK);
@@ -242,9 +257,8 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
*/
__wt_ref_out(session, ref);
ref->addr = NULL;
WT_PUBLISH(ref->state, WT_REF_DELETED);
WT_WITH_PAGE_INDEX(session,
ret = __evict_reverse_split_check(session, ref));
ret = __evict_delete_ref(session, ref, closing));
WT_RET_BUSY_OK(ret);
break;
case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */