Files
mongo/src/block/block_snap.c
Keith Bostic c6103a38b1 Initialize the snapshot-avail list after cleaning it out, else we'll try and
print a NULL pointer in VERBOSE mode.
2012-06-06 11:26:09 -04:00

707 lines
22 KiB
C

/*-
* Copyright (c) 2008-2012 WiredTiger, Inc.
* All rights reserved.
*
* See the file LICENSE for redistribution information.
*/
#include "wt_internal.h"
/* Forward declarations for the file-local helpers defined later in this file. */
static int __snapshot_process(WT_SESSION_IMPL *, WT_BLOCK *, WT_SNAPSHOT *);
static int __snapshot_string(
WT_SESSION_IMPL *, WT_BLOCK *, const uint8_t *, WT_ITEM *);
static int __snapshot_update(WT_SESSION_IMPL *,
WT_BLOCK *, WT_SNAPSHOT *, WT_BLOCK_SNAPSHOT *, uint64_t, int);
/*
 * __wt_block_snap_init --
 *	Initialize a snapshot structure.
 *
 *	session: the calling session.
 *	block:   the block manager handle; only its live_lock/live_load fields
 *	         are touched, and only when is_live is set.
 *	si:      the snapshot structure to initialize (cleared first).
 *	name:    prefix handed to the extent-list initializer for each list.
 *	is_live: non-zero when initializing the live snapshot.
 *
 *	Returns 0 on success, EINVAL if a live snapshot is already loaded, or
 *	the error returned by __wt_block_extlist_init.  On failure the
 *	structure's extent lists are released and, for the live snapshot, the
 *	"loaded" flag is cleared again so a later load can be attempted.
 */
int
__wt_block_snap_init(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_BLOCK_SNAPSHOT *si, const char *name, int is_live)
{
	WT_DECL_RET;

	/*
	 * If we're loading a new live snapshot, there shouldn't be one already
	 * loaded.  The btree engine should prevent this from ever happening,
	 * but paranoia is a healthy thing.
	 */
	if (is_live) {
		__wt_spin_lock(session, &block->live_lock);
		if (block->live_load)
			ret = EINVAL;
		else
			block->live_load = 1;
		__wt_spin_unlock(session, &block->live_lock);
		if (ret)
			WT_RET_MSG(session, EINVAL, "snapshot already loaded");
	}

	memset(si, 0, sizeof(*si));

	si->root_offset = WT_BLOCK_INVALID_OFFSET;
	WT_ERR(__wt_block_extlist_init(session, &si->alloc, name, "alloc"));
	WT_ERR(__wt_block_extlist_init(session, &si->avail, name, "avail"));
	WT_ERR(__wt_block_extlist_init(session, &si->discard, name, "discard"));

	si->file_size = WT_BLOCK_DESC_SECTOR;
	WT_ERR(__wt_block_extlist_init(
	    session, &si->snapshot_avail, name, "snapshot_avail"));

	return (0);

	/*
	 * Initialization failed part way through: release whatever lists were
	 * set up (the structure was zeroed above, so untouched lists are in a
	 * cleared state) and give back the live "loaded" claim taken above,
	 * otherwise every subsequent live load would fail with EINVAL.
	 */
err:	__wt_block_snap_destroy(session, si);
	if (is_live) {
		__wt_spin_lock(session, &block->live_lock);
		block->live_load = 0;
		__wt_spin_unlock(session, &block->live_lock);
	}
	return (ret);
}
/*
 * __wt_block_snapshot_load --
 *	Load a snapshot.
 *
 *	session:   the calling session.
 *	block:     the block manager handle; its live snapshot is initialized.
 *	dsk:       receives the root page, if any; dsk->size is 0 when there is
 *	           no root page to return.
 *	addr:      the snapshot address cookie, or NULL when there is none.
 *	addr_size: the cookie's length; 0 is treated like a missing cookie.
 *	readonly:  when zero, the avail list is read and the file is truncated
 *	           to the snapshot's recorded file size.
 *
 *	Returns 0 on success.  On error after the live snapshot has been
 *	initialized, the snapshot is unloaded again before returning.
 */
int
__wt_block_snapshot_load(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_ITEM *dsk, const uint8_t *addr, uint32_t addr_size,
    int readonly)
{
	WT_BLOCK_SNAPSHOT *si;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;

	/*
	 * Sometimes we don't find a root page (we weren't given a snapshot,
	 * or the referenced snapshot was empty).  In that case we return a
	 * root page size of 0.  Set that up now.
	 */
	dsk->size = 0;

	si = &block->live;
	WT_RET(__wt_block_snap_init(session, block, si, "live", 1));

	if (WT_VERBOSE_ISSET(session, snapshot)) {
		if (addr != NULL) {
			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
			WT_ERR(__snapshot_string(session, block, addr, tmp));
		}
		WT_VERBOSE_ERR(session, snapshot,
		    "%s: load-snapshot: %s", block->name,
		    addr == NULL ? "[Empty]" : (char *)tmp->data);
	}

	/*
	 * If not loading a snapshot from disk, we're done.  Leave through the
	 * common exit so a scratch buffer allocated for the verbose message
	 * above is released.
	 */
	if (addr == NULL || addr_size == 0)
		goto done;

	/* Crack the snapshot cookie. */
	WT_ERR(__wt_block_buffer_to_snapshot(session, block, addr, si));

	/* Verify sets up next. */
	if (block->verify)
		WT_ERR(__wt_verify_snap_load(session, block, si));

	/* Read, and optionally verify, any root page. */
	if (si->root_offset != WT_BLOCK_INVALID_OFFSET) {
		WT_ERR(__wt_block_read_off(session, block,
		    dsk, si->root_offset, si->root_size, si->root_cksum));
		if (block->verify) {
			/* The string may already exist from verbose mode. */
			if (tmp == NULL) {
				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
				WT_ERR(__snapshot_string(
				    session, block, addr, tmp));
			}
			WT_ERR(
			    __wt_verify_dsk(session, (char *)tmp->data, dsk));
		}
	}

	/*
	 * Rolling a snapshot forward requires the avail list, the blocks from
	 * which we can allocate.
	 */
	if (!readonly)
		WT_ERR(__wt_block_extlist_read(session, block, &si->avail));

	/*
	 * If the snapshot can be written, that means anything written after
	 * the snapshot is no longer interesting.  Truncate the file.
	 */
	if (!readonly) {
		WT_VERBOSE_ERR(session, snapshot,
		    "truncate file to %" PRIuMAX, (uintmax_t)si->file_size);
		WT_ERR(__wt_ftruncate(session, block->fh, si->file_size));
	}

	if (0) {
		/* Error path only: undo the live snapshot initialization. */
err:		(void)__wt_block_snapshot_unload(session, block);
	}

done:	__wt_scr_free(&tmp);
	return (ret);
}
/*
 * __wt_block_snapshot_unload --
 *	Unload the live snapshot: run the verify cleanup when verification is
 *	active, discard the live snapshot's extent lists and clear the
 *	"live snapshot loaded" flag.  Returns the accumulated error, if any.
 */
int
__wt_block_snapshot_unload(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
	WT_DECL_RET;

	WT_VERBOSE_RETVAL(
	    session, snapshot, ret, "%s: unload snapshot", block->name);

	/* Verify cleanup; the teardown below runs regardless of its result. */
	if (block->verify)
		WT_TRET(__wt_verify_snap_unload(session, block, &block->live));

	__wt_block_snap_destroy(session, &block->live);

	block->live_load = 0;

	return (ret);
}
/*
 * __wt_block_snap_destroy --
 *	Clear a snapshot structure by releasing each of its four extent lists
 *	(alloc, avail, discard and snapshot_avail), in that order.
 */
void
__wt_block_snap_destroy(WT_SESSION_IMPL *session, WT_BLOCK_SNAPSHOT *si)
{
	WT_EXTLIST *lists[4];
	size_t i;

	lists[0] = &si->alloc;
	lists[1] = &si->avail;
	lists[2] = &si->discard;
	lists[3] = &si->snapshot_avail;

	/* Discard the extent lists. */
	for (i = 0; i < sizeof(lists) / sizeof(lists[0]); ++i)
		__wt_block_extlist_free(session, lists[i]);
}
/*
 * __wt_block_snapshot --
 *	Create a new snapshot: write the root page (if there is one), process
 *	the snapshot list and, unless the connection is configured not to
 *	sync, flush the file.
 *
 *	buf is the root page to write, or NULL for a snapshot of an empty
 *	tree.  snapbase is the snapshot list handed to __snapshot_process.
 */
int
__wt_block_snapshot(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_ITEM *buf, WT_SNAPSHOT *snapbase)
{
	WT_BLOCK_SNAPSHOT *live;

	live = &block->live;
	live->version = WT_BM_SNAPSHOT_VERSION;

	/*
	 * A snapshot of an empty tree has no root page; record an illegal
	 * root offset in that case, otherwise write the root page out.
	 *
	 * XXX
	 * We happen to know that snapshots are single-threaded above us in
	 * the btree engine.  That's probably something we want to guarantee
	 * for any WiredTiger block manager.
	 */
	if (buf != NULL) {
		WT_RET(__wt_block_write_off(session, block, buf,
		    &live->root_offset, &live->root_size, &live->root_cksum,
		    0));
	} else {
		live->root_offset = WT_BLOCK_INVALID_OFFSET;
		live->root_cksum = 0;
		live->root_size = 0;
	}

	/* Walk the snapshot list, deleting and updating as required. */
	WT_RET(__snapshot_process(session, block, snapbase));

	/*
	 * Snapshots have to hit disk (it would be reasonable to configure for
	 * lazy snapshots, but we don't support them yet).  Regardless, we're
	 * not holding any locks, other writers can proceed while we wait.
	 */
	if (F_ISSET(S2C(session), WT_CONN_NOSYNC))
		return (0);
	return (__wt_fsync(session, block->fh));
}
/*
 * __snapshot_extlist_fblocks --
 *	Give the file space backing an extent list to the live avail list.
 *	A list whose offset is WT_BLOCK_INVALID_OFFSET has no such space, and
 *	nothing is done for it.
 */
static inline int
__snapshot_extlist_fblocks(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el)
{
	int ret;

	ret = 0;
	if (el->offset != WT_BLOCK_INVALID_OFFSET)
		ret = __wt_block_insert_ext(
		    session, &block->live.avail, el->offset, el->size);
	return (ret);
}
/*
 * __snapshot_process --
 *	Process the list of snapshots.
 *
 *	The work is done in phases:
 *	  1. without the live lock, load the extent lists of every snapshot
 *	     being deleted and of the snapshot following a deleted one;
 *	  2. with the live lock held, roll each deleted snapshot's lists into
 *	     its successor (possibly the live system) and find re-usable
 *	     blocks;
 *	  3. re-write the snapshots marked for update;
 *	  4. write the live system's information into the added snapshot and
 *	     reset the live alloc and discard lists.
 *
 *	Returns 0 on success.  On every path the live lock is released and the
 *	extent lists loaded into the snapshots' bpriv structures are released;
 *	the bpriv memory itself is not freed here (presumably the owner of the
 *	snapshot list does that -- confirm against the caller).
 */
static int
__snapshot_process(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snapbase)
{
	WT_BLOCK_SNAPSHOT *a, *b, *si;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_SNAPSHOT *snap;
	uint64_t snapshot_size;
	int deleting, locked;

	si = &block->live;
	locked = 0;

	/*
	 * We've allocated our last page, update the snapshot size.  We need to
	 * calculate the live system's snapshot size before reading and merging
	 * snapshot allocation and discard information from the snapshots we're
	 * deleting, those operations will change the underlying byte counts.
	 */
	snapshot_size = si->snapshot_size;
	snapshot_size += si->alloc.bytes;
	snapshot_size -= si->discard.bytes;

	/*
	 * Extents that become newly available as a result of deleting previous
	 * snapshots are added to a list of extents.  The list should be empty,
	 * but there's no explicit "free the snapshot information" call into the
	 * block manager; if there was an error in an upper level resulting in
	 * the snapshot never being "resolved", the list might not be empty.
	 *
	 * XXX
	 * This isn't sufficient, actually: we're going to leak all the blocks
	 * that were written as part of the last snapshot because it was never
	 * resolved.
	 */
	__wt_block_extlist_free(session, &si->snapshot_avail);
	WT_RET(__wt_block_extlist_init(
	    session, &si->snapshot_avail, "live", "snapshot_avail"));

	/*
	 * To delete a snapshot, we'll need snapshot information for it, and we
	 * have to read that from the disk.
	 */
	deleting = 0;
	WT_SNAPSHOT_FOREACH(snapbase, snap) {
		/*
		 * To delete a snapshot, we'll need snapshot information for it
		 * and the subsequent snapshot.  The test is tricky, we have to
		 * load the current snapshot's information if it's marked for
		 * deletion, or if it follows a snapshot marked for deletion,
		 * where the boundary cases are the first snapshot in the list
		 * and the last snapshot in the list: if we're deleting the last
		 * snapshot in the list, there's no next snapshot, the snapshot
		 * will be merged into the live tree.
		 */
		if (!F_ISSET(snap, WT_SNAP_DELETE) &&
		    (snap == snapbase ||
		    F_ISSET(snap, WT_SNAP_ADD) ||
		    !F_ISSET(snap - 1, WT_SNAP_DELETE)))
			continue;
		deleting = 1;

		/*
		 * Allocate a snapshot structure, crack the cookie and read the
		 * snapshot's extent lists.
		 *
		 * Ignore the avail list: snapshot avail lists are only useful
		 * if we are rolling forward from the particular snapshot and
		 * they represent our best understanding of what blocks can be
		 * allocated.  If we are not operating on the live snapshot,
		 * subsequent snapshots might have allocated those blocks, and
		 * the avail list is useless.  We don't discard it, because it
		 * is useful as part of verification, but we don't re-write it
		 * either.
		 */
		WT_ERR(__wt_calloc(
		    session, 1, sizeof(WT_BLOCK_SNAPSHOT), &snap->bpriv));
		si = snap->bpriv;
		WT_ERR(__wt_block_snap_init(session, block, si, snap->name, 0));
		WT_ERR(__wt_block_buffer_to_snapshot(
		    session, block, snap->raw.data, si));
		WT_ERR(__wt_block_extlist_read(session, block, &si->alloc));
		WT_ERR(__wt_block_extlist_read(session, block, &si->discard));
	}

	/*
	 * Hold a lock so the live extent lists and the file size can't change
	 * underneath us.  I suspect we'll tighten this if snapshots take too
	 * much time away from real work: we read historic snapshot information
	 * without a lock, but we could also merge and re-write the delete
	 * snapshot information without a lock, except for ranges merged into
	 * the live tree.
	 */
	__wt_spin_lock(session, &block->live_lock);
	locked = 1;

	/* Skip the additional processing if we aren't deleting snapshots. */
	if (!deleting)
		goto live_update;

	/*
	 * Delete any no-longer-needed snapshots: we do this first as it frees
	 * blocks to the live lists, and the freed blocks will then be included
	 * when writing the live extent lists.
	 */
	WT_SNAPSHOT_FOREACH(snapbase, snap) {
		if (!F_ISSET(snap, WT_SNAP_DELETE))
			continue;

		if (WT_VERBOSE_ISSET(session, snapshot)) {
			if (tmp == NULL)
				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
			WT_ERR(__snapshot_string(
			    session, block, snap->raw.data, tmp));
			WT_VERBOSE_ERR(session, snapshot,
			    "%s: delete-snapshot: %s: %s",
			    block->name, snap->name, (char *)tmp->data);
		}

		/*
		 * Set the from/to snapshot structures, where the "to" value
		 * may be the live tree.
		 */
		a = snap->bpriv;
		if (F_ISSET(snap + 1, WT_SNAP_ADD))
			b = &block->live;
		else
			b = (snap + 1)->bpriv;

		/*
		 * Free the root page: there's nothing special about this free,
		 * the root page is allocated using normal rules, that is, it
		 * may have been taken from the avail list, and was entered on
		 * the live system's alloc list at that time.  We free it into
		 * the snapshot's discard list, however, not the live system's
		 * list because it appears on the snapshot's alloc list and so
		 * must be paired in the snapshot.
		 */
		if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
			WT_ERR(__wt_block_insert_ext(session,
			    &a->discard, a->root_offset, a->root_size));

		/*
		 * Free the blocks used to hold the "from" snapshot's extent
		 * lists directly to the live system's avail list, they were
		 * never on any alloc list.  Include the "from" snapshot's
		 * avail list, it's going away.
		 */
		WT_ERR(__snapshot_extlist_fblocks(session, block, &a->alloc));
		WT_ERR(__snapshot_extlist_fblocks(session, block, &a->avail));
		WT_ERR(__snapshot_extlist_fblocks(session, block, &a->discard));

		/*
		 * Roll the "from" alloc and discard extent lists into the "to"
		 * snapshot's lists.
		 */
		if (a->alloc.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, &a->alloc, &b->alloc));
		if (a->discard.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, &a->discard, &b->discard));

		/*
		 * If the "to" snapshot is also being deleted, we're done with
		 * it, it's merged into some other snapshot in the next loop.
		 * This means the extent lists may aggregate over a number of
		 * snapshots, but that's OK, they're disjoint sets of ranges.
		 */
		if (F_ISSET(snap + 1, WT_SNAP_DELETE))
			continue;

		/*
		 * Find blocks for re-use: wherever the "to" snapshot's allocate
		 * and discard lists overlap is fair game, move ranges appearing
		 * on both lists to the live snapshot's newly available list.
		 */
		WT_ERR(__wt_block_extlist_overlap(session, block, b));

		/*
		 * If we're updating the live system's information, we're done.
		 */
		if (F_ISSET(snap + 1, WT_SNAP_ADD))
			continue;

		/*
		 * We have to write the "to" snapshot's extent lists out in new
		 * blocks, and update its cookie.
		 *
		 * Free the blocks used to hold the "to" snapshot's extent lists
		 * directly to the live system's avail list, they were never on
		 * any alloc list.  Do not include the "to" snapshot's avail
		 * list, it's not changing.
		 */
		WT_ERR(__snapshot_extlist_fblocks(session, block, &b->alloc));
		WT_ERR(__snapshot_extlist_fblocks(session, block, &b->discard));

		F_SET(snap + 1, WT_SNAP_UPDATE);
	}

	/* Update snapshots marked for update. */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if (F_ISSET(snap, WT_SNAP_UPDATE)) {
			WT_ASSERT(session, !F_ISSET(snap, WT_SNAP_ADD));
			WT_ERR(__snapshot_update(
			    session, block, snap, snap->bpriv, 0, 0));
		}

live_update:
	/* The loops above re-used "si"; point it back at the live system. */
	si = &block->live;

	/* Truncate the file if that's possible. */
	WT_ERR(__wt_block_extlist_truncate(session, block, &si->avail));

	/* Update the final, added snapshot based on the live system. */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if (F_ISSET(snap, WT_SNAP_ADD)) {
			WT_ERR(__snapshot_update(
			    session, block, snap, si, snapshot_size, 1));

			/*
			 * XXX
			 * Our caller wants two pieces of information: the time
			 * the snapshot was taken and the final snapshot size.
			 * This violates layering but the alternative is a call
			 * for the btree layer to crack the snapshot cookie into
			 * its components, and that's a fair amount of work.
			 * (We could just read the system time in the session
			 * layer when updating the metadata file, but that won't
			 * work for the snapshot size, and so we do both here.)
			 */
			snap->snapshot_size = si->snapshot_size;
			WT_ERR(__wt_epoch(session, &snap->sec, NULL));
		}

	/*
	 * Reset the live system's alloc and discard extent lists, leave the
	 * avail list alone.
	 */
	__wt_block_extlist_free(session, &si->alloc);
	WT_ERR(__wt_block_extlist_init(session, &si->alloc, "live", "alloc"));
	__wt_block_extlist_free(session, &si->discard);
	WT_ERR(
	    __wt_block_extlist_init(session, &si->discard, "live", "discard"));

#ifdef HAVE_DIAGNOSTIC
	/*
	 * The first snapshot in the system should always have an empty discard
	 * list.  If we've read that snapshot and/or created it, check.
	 */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if (!F_ISSET(snap, WT_SNAP_DELETE))
			break;
	if ((a = snap->bpriv) == NULL)
		a = &block->live;
	if (a->discard.entries != 0) {
		__wt_errx(session,
		    "snapshot incorrectly has blocks on the discard list");
		WT_ERR(WT_ERROR);
	}
#endif

err:	if (locked)
		__wt_spin_unlock(session, &block->live_lock);

	/*
	 * Discard any snapshot information we loaded, we no longer need it.
	 * Use the common destroy routine so that every extent list set up by
	 * __wt_block_snap_init is released, including snapshot_avail.
	 */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if ((si = snap->bpriv) != NULL)
			__wt_block_snap_destroy(session, si);

	__wt_scr_free(&tmp);
	return (ret);
}
/*
 * __snapshot_update --
 *	Update a snapshot: write its extent lists to the file and rebuild the
 *	address cookie stored in the snapshot array entry.
 *
 *	snap:          the snapshot array entry whose raw cookie is rebuilt.
 *	si:            the block-manager snapshot information to write.
 *	snapshot_size: stored into si only when is_live is set.
 *	is_live:       non-zero when si is the live system; only then are the
 *	               avail list, file size and snapshot size updated.
 */
static int
__snapshot_update(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snap,
    WT_BLOCK_SNAPSHOT *si, uint64_t snapshot_size, int is_live)
{
	WT_DECL_ITEM(vbuf);
	WT_DECL_RET;
	uint8_t *cookie_end;

#ifdef HAVE_DIAGNOSTIC
	/* Check the extent list combinations for overlaps. */
	WT_RET(__wt_block_extlist_check(session, &si->alloc, &si->avail));
	WT_RET(__wt_block_extlist_check(session, &si->discard, &si->avail));
	WT_RET(__wt_block_extlist_check(session, &si->alloc, &si->discard));
#endif

	/*
	 * Write the snapshot's extent lists.  An avail list is only written
	 * for the live system: other snapshots' avail lists are static and
	 * never change.  The live avail list is written as two lists, the
	 * current avail list plus the blocks being made available as of the
	 * new snapshot; the second list can't be merged into the first yet
	 * because it's not truly available until the new snapshot location
	 * has been saved to the metadata.
	 */
	WT_RET(__wt_block_extlist_write(session, block, &si->alloc, NULL));
	if (is_live)
		WT_RET(__wt_block_extlist_write(
		    session, block, &si->avail, &si->snapshot_avail));
	WT_RET(__wt_block_extlist_write(session, block, &si->discard, NULL));

	/*
	 * Set the file size and the snapshot size for the live system.
	 *
	 * XXX
	 * We do NOT set the file size when re-writing snapshots because we want
	 * to test the snapshot's blocks against a reasonable maximum file size
	 * during verification.  This is not good: imagine a snapshot appearing
	 * early in the file, re-written, and then the snapshot requires blocks
	 * at the end of the file, blocks after the listed file size.  If the
	 * application opens that snapshot for writing (discarding subsequent
	 * snapshots), we would truncate the file to the early chunk, discarding
	 * the re-written snapshot information.  The alternative, updating the
	 * file size has its own problems, in that case we'd work correctly, but
	 * we'd lose all of the blocks between the original snapshot and the
	 * re-written snapshot.  Currently, there's no API to roll-forward
	 * intermediate snapshots, if there ever is, this will need to be fixed.
	 */
	if (is_live) {
		WT_RET(__wt_filesize(session, block->fh, &si->file_size));
		si->snapshot_size = snapshot_size;
	}

	/* Pack the snapshot information into the array entry's cookie. */
	WT_RET(__wt_buf_init(session, &snap->raw, WT_BTREE_MAX_ADDR_COOKIE));
	cookie_end = snap->raw.mem;
	WT_RET(__wt_block_snapshot_to_buffer(session, block, &cookie_end, si));
	snap->raw.size = WT_PTRDIFF32(cookie_end, snap->raw.mem);

	/* Everything below only produces the verbose message. */
	if (!WT_VERBOSE_ISSET(session, snapshot))
		return (0);

	WT_RET(__wt_scr_alloc(session, 0, &vbuf));
	WT_ERR(__snapshot_string(session, block, snap->raw.data, vbuf));
	WT_VERBOSE_ERR(session, snapshot,
	    "%s: create-snapshot: %s: %s",
	    block->name, snap->name, (char *)vbuf->data);

err:	__wt_scr_free(&vbuf);
	return (ret);
}
/*
 * __wt_block_snapshot_resolve --
 *	Resolve a snapshot: fold the list of blocks made available by the
 *	snapshot into the live system's avail list, then discard that list.
 *	The snapbase argument is unused.
 */
int
__wt_block_snapshot_resolve(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snapbase)
{
	WT_EXTLIST *pending;
	WT_DECL_RET;

	WT_UNUSED(snapbase);

	pending = &block->live.snapshot_avail;

	/*
	 * Snapshots are a two-step process.  In the first step we write the
	 * new snapshot to disk (including the new extent lists for modified
	 * snapshots and the live system), build a list of file blocks newly
	 * available for re-allocation because snapshots were deleted, and
	 * return the location of the new snapshot information to our caller.
	 * The caller has to save that information in stable storage.
	 *
	 * Blocks on the newly available list can't be allocated before that
	 * save completes: if we crashed first, we'd have overwritten blocks
	 * still referenced by snapshots in the system.  This function is the
	 * second step, called after the caller has saved the snapshot
	 * information, and it moves the newly available blocks onto the live
	 * system's avail list.
	 */
	__wt_spin_lock(session, &block->live_lock);
	ret = __wt_block_extlist_merge(session, pending, &block->live.avail);
	__wt_spin_unlock(session, &block->live_lock);

	/* Discard the list. */
	__wt_block_extlist_free(session, pending);

	return (ret);
}
/*
 * __snapshot_string --
 *	Return a printable string representation of a snapshot address cookie.
 *
 *	addr: the snapshot address cookie to crack.
 *	buf:  receives the formatted text: version, root, alloc, avail and
 *	      discard ranges (or "[Empty]"), file size and write generation.
 *
 *	Returns 0 on success or the first error from cracking or formatting.
 *	The temporary snapshot structure used to crack the cookie is released
 *	on every path once it has been initialized.
 */
static int
__snapshot_string(WT_SESSION_IMPL *session,
    WT_BLOCK *block, const uint8_t *addr, WT_ITEM *buf)
{
	WT_BLOCK_SNAPSHOT *si, _si;
	WT_DECL_RET;

	/* Initialize the snapshot, crack the cookie. */
	si = &_si;
	WT_RET(__wt_block_snap_init(session, block, si, "string", 0));
	WT_ERR(__wt_block_buffer_to_snapshot(session, block, addr, si));

	WT_ERR(__wt_buf_fmt(session, buf,
	    "version=%d",
	    si->version));
	if (si->root_offset == WT_BLOCK_INVALID_OFFSET)
		WT_ERR(__wt_buf_catfmt(session, buf, ", root=[Empty]"));
	else
		WT_ERR(__wt_buf_catfmt(session, buf,
		    ", root=[%"
		    PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
		    (uintmax_t)si->root_offset,
		    (uintmax_t)(si->root_offset + si->root_size),
		    si->root_size, si->root_cksum));
	if (si->alloc.offset == WT_BLOCK_INVALID_OFFSET)
		WT_ERR(__wt_buf_catfmt(session, buf, ", alloc=[Empty]"));
	else
		WT_ERR(__wt_buf_catfmt(session, buf,
		    ", alloc=[%"
		    PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
		    (uintmax_t)si->alloc.offset,
		    (uintmax_t)(si->alloc.offset + si->alloc.size),
		    si->alloc.size, si->alloc.cksum));
	if (si->avail.offset == WT_BLOCK_INVALID_OFFSET)
		WT_ERR(__wt_buf_catfmt(session, buf, ", avail=[Empty]"));
	else
		WT_ERR(__wt_buf_catfmt(session, buf,
		    ", avail=[%"
		    PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
		    (uintmax_t)si->avail.offset,
		    (uintmax_t)(si->avail.offset + si->avail.size),
		    si->avail.size, si->avail.cksum));
	if (si->discard.offset == WT_BLOCK_INVALID_OFFSET)
		WT_ERR(__wt_buf_catfmt(session, buf, ", discard=[Empty]"));
	else
		WT_ERR(__wt_buf_catfmt(session, buf,
		    ", discard=[%"
		    PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
		    (uintmax_t)si->discard.offset,
		    (uintmax_t)(si->discard.offset + si->discard.size),
		    si->discard.size, si->discard.cksum));
	WT_ERR(__wt_buf_catfmt(session, buf,
	    ", file size=%" PRIuMAX
	    ", write generation=%" PRIu64,
	    (uintmax_t)si->file_size,
	    si->write_gen));

	/*
	 * The snapshot structure was only needed to crack the cookie: release
	 * the extent lists its initialization set up, on success and on error.
	 */
err:	__wt_block_snap_destroy(session, si);
	return (ret);
}