Files
mongo/src/btree/bt_sync.c
2016-06-03 14:23:11 +10:00

302 lines
8.9 KiB
C

/*-
* Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
* See the file LICENSE for redistribution information.
*/
#include "wt_internal.h"
/*
 * __sync_file --
 *	Flush pages for a specific file.
 *
 *	session: the session whose current btree handle is flushed.
 *	syncop:  WT_SYNC_WRITE_LEAVES writes immediately available dirty leaf
 *		 pages whose last update is older than the oldest transaction
 *		 ID read at the start of the walk; WT_SYNC_CHECKPOINT writes
 *		 dirty in-cache pages, skipping leaf pages that only became
 *		 dirty after the checkpoint's snapshot. Any other value is
 *		 rejected as an illegal value.
 *
 *	Returns 0 on success and a non-zero error code otherwise.
 */
static int
__sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
{
	struct timespec end, start;
	WT_BTREE *btree;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	WT_REF *walk;
	WT_TXN *txn;
	uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages;
	uint64_t oldest_id, saved_snap_min;
	uint32_t flags;
	bool flush_locked;

	conn = S2C(session);
	btree = S2BT(session);
	walk = NULL;
	txn = &session->txn;
	saved_snap_min = WT_SESSION_TXN_STATE(session)->snap_min;
	flags = WT_READ_CACHE | WT_READ_NO_GEN;

	/*
	 * Track whether this call owns the flush lock: some error paths reach
	 * the cleanup code before the lock is acquired and must not release
	 * it.
	 */
	flush_locked = false;

	internal_bytes = leaf_bytes = 0;
	internal_pages = leaf_pages = 0;
	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
		WT_RET(__wt_epoch(session, &start));

	switch (syncop) {
	case WT_SYNC_WRITE_LEAVES:
		/*
		 * Write all immediately available, dirty in-cache leaf pages.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock, serialize so multiple threads don't walk the tree at
		 * the same time.
		 *
		 * The modified flag is checked once without the lock as a
		 * cheap early exit, then again after acquiring it in case
		 * another thread flushed the tree in the meantime.
		 */
		if (!btree->modified)
			return (0);
		__wt_spin_lock(session, &btree->flush_lock);
		if (!btree->modified) {
			__wt_spin_unlock(session, &btree->flush_lock);
			return (0);
		}
		flush_locked = true;

		/*
		 * Save the oldest transaction ID we need to keep around.
		 * Otherwise, in a busy system, we could be updating pages so
		 * fast that write leaves never catches up. We deliberately
		 * have no transaction running at this point that would keep
		 * the oldest ID from moving forwards as we walk the tree.
		 */
		oldest_id = __wt_txn_oldest_id(session);

		flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL;
		for (walk = NULL;;) {
			WT_ERR(__wt_tree_walk(session, &walk, flags));
			if (walk == NULL)
				break;

			/*
			 * Write dirty pages if nobody beat us to it. Don't
			 * try to write hot pages (defined as pages that have
			 * been updated since the write phase leaves started):
			 * checkpoint will have to visit them anyway.
			 */
			page = walk->page;
			if (__wt_page_is_modified(page) &&
			    WT_TXNID_LT(page->modify->update_txn, oldest_id)) {
				if (txn->isolation == WT_ISO_READ_COMMITTED)
					WT_ERR(__wt_txn_get_snapshot(session));
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
				WT_ERR(__wt_reconcile(session, walk, NULL, 0));
			}
		}
		break;
	case WT_SYNC_CHECKPOINT:
		/*
		 * If we are flushing a file at read-committed isolation, which
		 * is of particular interest for flushing the metadata to make
		 * schema-changing operation durable, get a transactional
		 * snapshot now.
		 *
		 * All changes committed up to this point should be included.
		 * We don't update the snapshot in between pages because the
		 * metadata shouldn't have many pages. Instead, read-committed
		 * isolation ensures that all metadata updates completed before
		 * the checkpoint are included.
		 *
		 * A failure here jumps to the cleanup code before the flush
		 * lock has been taken.
		 */
		if (txn->isolation == WT_ISO_READ_COMMITTED)
			WT_ERR(__wt_txn_get_snapshot(session));

		/*
		 * We cannot check the tree modified flag in the case of a
		 * checkpoint, the checkpoint code has already cleared it.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock, serialize so multiple threads don't walk the tree at
		 * the same time. We're holding the schema lock, but need the
		 * lower-level lock as well.
		 */
		__wt_spin_lock(session, &btree->flush_lock);
		flush_locked = true;

		/*
		 * In the final checkpoint pass, child pages cannot be evicted
		 * from underneath internal pages nor can underlying blocks be
		 * freed until the checkpoint's block lists are stable. Also,
		 * we cannot split child pages into parents unless we know the
		 * final pass will write a consistent view of that namespace.
		 * Set the checkpointing flag to block such actions and wait for
		 * any problematic eviction or page splits to complete.
		 */
		WT_PUBLISH(btree->checkpointing, WT_CKPT_PREPARE);

		WT_ERR(__wt_evict_file_exclusive_on(session));
		__wt_evict_file_exclusive_off(session);

		WT_PUBLISH(btree->checkpointing, WT_CKPT_RUNNING);

		/* Write all dirty in-cache pages. */
		flags |= WT_READ_NO_EVICT;
		for (walk = NULL;;) {
			WT_ERR(__wt_tree_walk(session, &walk, flags));
			if (walk == NULL)
				break;

			/* Skip clean pages. */
			if (!__wt_page_is_modified(walk->page))
				continue;

			/*
			 * Take a local reference to the page modify structure
			 * now that we know the page is dirty. It needs to be
			 * done in this order otherwise the page modify
			 * structure could have been created between taking the
			 * reference and checking modified.
			 */
			page = walk->page;
			mod = page->modify;

			/*
			 * Write dirty pages, unless we can be sure they only
			 * became dirty after the checkpoint started.
			 *
			 * We can skip dirty pages if:
			 * (1) they are leaf pages;
			 * (2) there is a snapshot transaction active (which
			 *     is the case in ordinary application checkpoints
			 *     but not all internal cases); and
			 * (3) the first dirty update on the page is
			 *     sufficiently recent that the checkpoint
			 *     transaction would skip them.
			 *
			 * Mark the tree dirty: the checkpoint marked it clean
			 * and we can't skip future checkpoints until this page
			 * is written.
			 */
			if (!WT_PAGE_IS_INTERNAL(page) &&
			    F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) &&
			    WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn)) {
				__wt_page_modify_set(session, page);
				continue;
			}

			/* Account for the page in the verbose statistics. */
			if (WT_PAGE_IS_INTERNAL(page)) {
				internal_bytes += page->memory_footprint;
				++internal_pages;
			} else {
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
			}
			WT_ERR(__wt_reconcile(session, walk, NULL, 0));
		}
		break;
	case WT_SYNC_CLOSE:
	case WT_SYNC_DISCARD:
	/* Not handled here; this path never acquires the flush lock. */
	WT_ILLEGAL_VALUE_ERR(session);
	}

	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
		WT_ERR(__wt_epoch(session, &end));
		WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
		    "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64
		    " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64
		    " bytes, %" PRIu64 " pages of internal\n\t"
		    "Took: %" PRIu64 "ms",
		    syncop == WT_SYNC_WRITE_LEAVES ?
		    "WRITE_LEAVES" : "CHECKPOINT",
		    leaf_bytes, leaf_pages, internal_bytes, internal_pages,
		    WT_TIMEDIFF_MS(end, start)));
	}

err:	/* On error, clear any left-over tree walk. */
	if (walk != NULL)
		WT_TRET(__wt_page_release(session, walk, flags));

	/*
	 * If we got a snapshot in order to write pages, and there was no
	 * snapshot active when we started, release it.
	 */
	if (txn->isolation == WT_ISO_READ_COMMITTED &&
	    saved_snap_min == WT_TXN_NONE)
		__wt_txn_release_snapshot(session);

	/*
	 * NOTE(review): this cleanup also runs on the error paths that never
	 * took the flush lock; confirm the checkpointing state cannot have
	 * been set by another thread in that case.
	 */
	if (btree->checkpointing != WT_CKPT_OFF) {
		/*
		 * Update the checkpoint generation for this handle so visible
		 * updates newer than the checkpoint can be evicted.
		 *
		 * This has to be published before eviction is enabled again,
		 * so that eviction knows that the checkpoint has completed.
		 */
		WT_PUBLISH(btree->checkpoint_gen,
		    conn->txn_global.checkpoint_gen);
		WT_STAT_FAST_DATA_SET(session,
		    btree_checkpoint_generation, btree->checkpoint_gen);

		/*
		 * Clear the checkpoint flag and push the change; not required,
		 * but publishing the change means stalled eviction gets moving
		 * as soon as possible.
		 */
		btree->checkpointing = WT_CKPT_OFF;
		WT_FULL_BARRIER();

		/*
		 * If this tree was being skipped by the eviction server during
		 * the checkpoint, clear the wait.
		 */
		btree->evict_walk_period = 0;

		/*
		 * Wake the eviction server, in case application threads have
		 * stalled while the eviction server decided it couldn't make
		 * progress. Without this, application threads will be stalled
		 * until the eviction server next wakes.
		 */
		WT_TRET(__wt_evict_server_wake(session));
	}

	/* Only release the flush lock if this call actually acquired it. */
	if (flush_locked)
		__wt_spin_unlock(session, &btree->flush_lock);

	/*
	 * Leaves are written before a checkpoint (or as part of a file close,
	 * before checkpointing the file). Start a flush to stable storage,
	 * but don't wait for it.
	 */
	if (ret == 0 &&
	    syncop == WT_SYNC_WRITE_LEAVES && F_ISSET(conn, WT_CONN_CKPT_SYNC))
		WT_RET(btree->bm->sync(btree->bm, session, false));

	return (ret);
}
/*
 * __wt_cache_op --
 *	Cache operations: dispatch a sync operation on the session's current
 *	tree to the routine that implements it.
 *
 *	Checkpoint and write-leaves operations are handled by __sync_file;
 *	close and discard operations are handled by __wt_evict_file. The
 *	return value is whatever the selected routine returns.
 */
int
__wt_cache_op(WT_SESSION_IMPL *session, WT_CACHE_OP op)
{
	switch (op) {
	case WT_SYNC_CHECKPOINT:
		/*
		 * Reconciliation needs the checkpoint reference to be set.
		 * Checking it here is ugly, but threading a parameter from
		 * our callers down to the reconciliation of the tree's root
		 * page would be worse.
		 */
		WT_ASSERT(session, S2BT(session)->ckpt != NULL);
		return (__sync_file(session, op));
	case WT_SYNC_WRITE_LEAVES:
		return (__sync_file(session, op));
	case WT_SYNC_CLOSE:
		/* Same checkpoint-reference requirement as a checkpoint. */
		WT_ASSERT(session, S2BT(session)->ckpt != NULL);
		return (__wt_evict_file(session, op));
	case WT_SYNC_DISCARD:
		return (__wt_evict_file(session, op));
	/* Fallback for any other value (presumably a default case). */
	WT_ILLEGAL_VALUE(session);
	}
}