Compare commits

...

108 Commits

Author SHA1 Message Date
Michael Cahill
025aacb645 Fix the 2.7.0 release date. 2015-12-08 16:26:14 +11:00
Alex Gorrod
15c6015019 Bump WiredTiger 2.7.0 release 2015-12-08 14:45:52 +11:00
Alex Gorrod
67a0f8dc87 Update wtstats graphing tool for 2.7.0 release 2015-12-08 14:45:30 +11:00
Michael Cahill
57c3e84ce6 Merge pull request #2370 from wiredtiger/2.7.0_release
Cut WiredTiger 2.7.0 release
2015-12-08 14:29:17 +11:00
Michael Cahill
59864bee3b Add WT-2260 to the changelog. 2015-12-08 14:22:59 +11:00
Michael Cahill
04e0c89bea Merge branch 'develop' into 2.7.0_release 2015-12-08 14:21:57 +11:00
Michael Cahill
4c49043f5c Merge pull request #2361 from wiredtiger/wt-2260-dont-evict-internal
WT-2260 Avoid adding internal pages to the eviction queue.
2015-12-08 14:21:15 +11:00
Michael Cahill
f1b011ab23 Turn git hashes and jira issues in NEWS into links in the generated docs. 2015-12-08 14:08:21 +11:00
Michael Cahill
dc38ce52ec Exclude git hashes from spell checking. 2015-12-08 13:42:58 +11:00
Alex Gorrod
b3c1d5ce3f Editing pass on 2.7.0 changelog 2015-12-08 02:26:21 +00:00
Alex Gorrod
656a1b7578 Edits to changelog and updates to spell checker 2015-12-07 23:55:18 +00:00
Alex Gorrod
ebe74287d9 Add commit references to statistics changes 2015-12-07 06:23:57 +00:00
Michael Cahill
d4717fbc6e Edit pass through API change log. 2015-12-07 17:08:17 +11:00
Alex Gorrod
0a24dc9321 Fixup stats in changelog. 2015-12-07 06:04:44 +00:00
Alex Gorrod
b12f6dfb13 Edit API changes section of change log. 2015-12-07 05:37:26 +00:00
Michael Cahill
3ff5b3d4e3 First pass through 2.7.0 changelog issues. 2015-12-07 16:17:33 +11:00
Alex Gorrod
debe1b23f3 Add first pass release changelog for 2.7.0 2015-12-07 04:20:24 +00:00
Alex Gorrod
2d27b44ad1 Merge branch 'develop' into wt-2260-dont-evict-internal 2015-12-07 12:05:49 +11:00
sueloverso
b2517d049d Merge pull request #2365 from wiredtiger/wtperf-whitespace
Enforce whitespace guidelines on bench/wtperf.
2015-12-04 13:51:19 -05:00
Keith Bostic
c48b42748c Enforce whitespace guidelines on bench/wtperf. 2015-12-04 08:48:33 -05:00
Alex Gorrod
8d27ecbc5c WT-2260 Avoid adding internal pages to the eviction queue.
We added code to make eviction fairer when there are lots of files
open, but it had the effect that when there are only a few files
open eviction could add lots of internal pages to the eviction
queue, since it focused on a small area of the tree.
2015-12-04 16:22:36 +11:00
Alex Gorrod
7cc881e987 Merge pull request #2360 from wiredtiger/WT-2245
WT-2265 - increase mongodb-oplog wtperf workload throttle to compensate for fix
2015-12-04 15:57:12 +11:00
David Hows
d5092c4dd0 WT-2265 - increase mongodb-oplog wtperf workload throttle to compensate for fix 2015-12-04 11:54:05 +11:00
Susan LoVerso
7b1d722bd7 WT-2257 Account for removal of quotes for arg parsing. 2015-12-03 14:46:39 -05:00
Susan LoVerso
9b46f3f8e4 WT-2257 Print out args to script 2015-12-03 14:42:15 -05:00
Susan LoVerso
76349d6cd8 WT-2257 Print test and args to runner script 2015-12-03 14:24:19 -05:00
Susan LoVerso
f36b1a7aa5 WT-2257 Print out throttle setting in workload info. 2015-12-03 14:20:19 -05:00
Susan LoVerso
e53e2795e1 WT-2225 Allow script to take a command line arg to pass to wtperf. 2015-12-03 09:58:10 -05:00
sueloverso
2df5658da3 Merge pull request #2356 from wiredtiger/wtperf-2257
WT-2257 Fixes when given multiple thread workload configs.
2015-12-03 09:46:27 -05:00
Susan LoVerso
884c8e114c WT-2257 Updated docs 2015-12-03 09:37:27 -05:00
Susan LoVerso
7f5b70d1d7 WT-2257 Add more text to multiple workload configs. 2015-12-03 09:34:08 -05:00
Susan LoVerso
fc6db4dccc Merge branch 'develop' into wtperf-2257 2015-12-03 08:58:28 -05:00
sueloverso
c1013cee06 Merge pull request #2357 from wiredtiger/throttle-2256
WT-2256 Fix interval timer for wtperf throttling.
2015-12-02 19:24:39 -05:00
Alex Gorrod
fb7fc2f35e Merge pull request #2342 from wiredtiger/wtperf_truncate_multiplier
WT-2245 - Add a multiplier to the truncate stones to deal with very high throughput cases
2015-12-03 10:49:37 +11:00
Susan LoVerso
d04d2ba924 WT-2256 Fix interval timer for wtperf throttling. 2015-12-02 15:37:12 -05:00
Susan LoVerso
be3bc4918a WT-2257 Fixes when given multiple thread workload configs. 2015-12-02 15:30:19 -05:00
David Hows
04a09e7795 WT-2244 - Add comment explaining max multiplier 2015-12-02 14:33:54 +11:00
Alex Gorrod
bff6525c83 Merge pull request #2352 from wiredtiger/SERVER-21553-free-deleted-blocks
SERVER-21553 Free blocks during reverse splits.
2015-12-02 14:23:27 +11:00
Michael Cahill
97b549e75f Merge branch 'develop' into SERVER-21553-free-deleted-blocks 2015-12-02 13:23:10 +11:00
Michael Cahill
c70b0973e1 Merge pull request #2351 from wiredtiger/wt-2253-readgen-oldest-eviction
WT-2553: prioritize WT_READGEN_OLDEST pages for eviction.
2015-12-02 13:22:33 +11:00
Michael Cahill
5fe8c70e33 SERVER-21553 Check that ref->addr is NULL, don't try to free it.
During review of this change, noticed that we are trying to free an address in
a path where it should never be set (or we would leak blocks).  Assert that the
address is NULL instead of checking whether it needs to be freed.
2015-12-02 13:19:51 +11:00
Michael Cahill
745eb56977 SERVER-21553 If truncate leaves an internal page empty, evict it asap.
(Note: works in conjunction with the change to LRU policy for internal pages in WT-2553).
2015-12-02 13:10:01 +11:00
Michael Cahill
6c82703fd0 Merge branch 'develop' into SERVER-21553-free-deleted-blocks 2015-12-02 12:35:39 +11:00
Alex Gorrod
4fc3e3982e Merge pull request #2353 from wiredtiger/WT-2553-evict-split-pages
WT-2553 Evict pages left behind by in-memory splits.
2015-12-02 12:35:06 +11:00
Michael Cahill
5ebfd92119 SERVER-21553 Review feedback. 2015-12-02 12:30:39 +11:00
Michael Cahill
fa28552449 SERVER-21553 Remove no-longer-used variables. 2015-12-02 11:36:47 +11:00
Michael Cahill
8cf3e9bbaf SERVER-21553 Finish rename to __wt_ref_addr_free. 2015-12-02 11:29:46 +11:00
Michael Cahill
7a1050dbdd WT-2553 Include pages resulting from in-memory splits when in the "would block" phase of eviction. 2015-12-02 11:22:35 +11:00
Michael Cahill
f6a2db06a9 SERVER-21553 Free blocks during reverse splits. 2015-12-02 11:17:54 +11:00
Keith Bostic
479818af02 WT-2553: If a leaf or internal page's read generation is set to
WT_READGEN_OLDEST, prioritize it for eviction.
2015-12-01 18:29:20 -05:00
Alex Gorrod
f1a93162f2 Merge branch 'develop' into wtperf_truncate_multiplier 2015-12-01 22:53:21 +00:00
Alex Gorrod
e731ef8ab8 Merge pull request #2350 from wiredtiger/WT-2251-ref-addr-leak 2015-12-01 17:05:36 +11:00
Michael Cahill
e0f7961e07 WT-2251 Clear the original ref->addr before an in-memory split to avoid use-after-free. 2015-12-01 16:21:47 +11:00
Michael Cahill
81b1d09a0f Merge branch 'develop' into WT-2251-ref-addr-leak 2015-12-01 14:45:17 +11:00
Michael Cahill
f192c3903f WT-2251 Cleanup: no longer need to clear ref->addr after freeing. 2015-12-01 14:41:15 +11:00
Alex Gorrod
0e93d60d0d Merge pull request #2349 from wiredtiger/SERVER-21691
SERVER-21691 Avoid insert stalls
2015-12-01 14:33:35 +11:00
Michael Cahill
e2a91fe5f6 WT-2251 Free addresses when we discard deleted page references.
There was a lot of repeated code to check for offpage and free ref->addr, create an inlined function.
2015-12-01 14:32:50 +11:00
Michael Cahill
cf62c714ce SERVER-21691 Don't retry eviction of a page if transaction state hasn't changed.
This check was moved out of __wt_page_can_evict, which meant it only applied to LRU eviction.  Move it back so that we don't repeatedly try forced eviction that has no chance of succeeding.
2015-12-01 12:33:06 +11:00
Michael Cahill
8cb4ecce11 SERVER-21691 Avoid blocking in-memory splits.
Only trylock the page's reconciliation lock (so in-memory splits aren't blocked by slow splits of siblings), don't give up as soon as a checkpoint starts.
2015-12-01 12:31:36 +11:00
Michael Cahill
264ec216ef Merge pull request #2346 from wiredtiger/WT-2249-eviction-stuck
WT-2249 Keep eviction stuck until cache usage is under 100%.
2015-12-01 12:29:50 +11:00
Michael Cahill
7880ced1b7 WT-2249 Review fix. 2015-12-01 12:28:29 +11:00
Keith Bostic
dca1411e73 Merge pull request #2347 from wiredtiger/WT-2250
WT-2250 Minor fix.  Use SET instead of INCRV for stat.
2015-11-30 15:43:50 -05:00
Susan LoVerso
7c66f601b4 WT-2250 Minor fix. Use SET instead of INCRV for stat. 2015-11-30 15:05:30 -05:00
Keith Bostic
f721883c06 Don't complain if the long-running-transaction config is turned off for
fixed-length column stores.
2015-11-30 07:44:43 -05:00
Michael Cahill
4415f79afe WT-2249 Keep eviction stuck until cache usage is under 100%.
Fix a real bug: STUCK is a cache flag, not a cache->state flag.

Don't try to do eviction when dumping the cache, and only kill processes if no
pages have been evicted since the STUCK flag was first set.
2015-11-30 17:51:30 +11:00
David Hows
f2fa6b9283 WT-2245 - Use WT_MIN and WT_MAX for increments 2015-11-30 17:47:35 +11:00
Michael Cahill
4c49948727 Merge pull request #2340 from wiredtiger/WT-2244
WT-2244 - Trigger in-memory splits sooner.
2015-11-30 14:36:32 +11:00
Michael Cahill
8e9cc8d32c Merge branch 'develop' into WT-2244 2015-11-30 10:02:08 +11:00
Michael Cahill
9f2e4f395e Merge pull request #2344 from wiredtiger/wt-2248-session-close
WT-2248: WT_SESSION.close is updating WT_CONNECTION_IMPL.default_session
2015-11-30 10:00:59 +11:00
Keith Bostic
6d7d76f65d WT-2248: The code in WT_SESSION.close does work using the WT_CONNECTION_IMPL
default session, but that means the API_END_RET_NOTFOUND_MAP session cleanup
happens in the context of the default session, which isn't correct.
2015-11-28 15:57:39 -05:00
Keith Bostic
abb07da300 Update a comment. 2015-11-27 20:01:28 -05:00
Keith Bostic
37655b96df Fix a few comments around cursor in-use count support. 2015-11-27 19:59:30 -05:00
Keith Bostic
494accec89 Rename __wt_page_can_split to be __wt_leaf_page_can_split, we split
internal pages a lot more these days. (It's really row-store leaf
page, but that's a bug we'll eventually fix.)
2015-11-27 14:58:48 -05:00
Keith Bostic
29aea6835a Merge branch 'develop' into WT-2244 2015-11-27 14:53:55 -05:00
Keith Bostic
2dcf7b18d5 Fix a comment, we no longer check a transaction value. 2015-11-27 14:53:31 -05:00
Keith Bostic
2e3a02cc52 __wt_page_modify_clear calls __wt_page_is_modified internally. 2015-11-27 14:14:46 -05:00
Michael Cahill
05a3b8f3d8 whitespace 2015-11-27 22:09:00 +11:00
Michael Cahill
6c65c86bb9 WT-2244 Don't wait for transactions after splits.
Now that we split before pages become completely full, and we can block splits
outright with a flag, don't use transaction visibility to block subsequent
splits.  This had the effect of limiting each page to a single split during
long-running transactions including checkpoints.
2015-11-27 21:59:40 +11:00
Michael Cahill
4aa7ba0d85 WT-2244 Refine eviction during checkpoints.
In particular, take the checkpoint generation into account when tracking the
oldest ID in eviction, so that we retry eviction as soon as a checkpoint moves
forward.
2015-11-27 21:56:39 +11:00
Michael Cahill
770ccf05c4 Merge branch 'develop' into WT-2244 2015-11-27 16:43:11 +11:00
Michael Cahill
a6da10e9fe Merge pull request #2341 from wiredtiger/SERVER-21553
SERVER-21553 Enable fast-path truncate after splits.
2015-11-27 16:39:13 +11:00
Michael Cahill
5dd8d4dc2e SERVER-21553 Fix a line removed by mistake: set the address during splits. 2015-11-27 15:54:30 +11:00
David Hows
4d7c9cef69 WT-2245 - Code Review changes 2015-11-27 15:27:25 +11:00
David Hows
bac122021c WT-2245 - Code Review changes 2015-11-27 15:25:43 +11:00
David Hows
a5b4ace6e5 WTPERF - Add a multiplier to the truncate stones to deal with very high throughput cases 2015-11-27 14:45:42 +11:00
Michael Cahill
6995565214 Merge branch 'develop' into SERVER-21553 2015-11-27 14:31:35 +11:00
Alex Gorrod
39dfd21030 Merge pull request #2339 from wiredtiger/WT-2243
WT-2243 Don't keep transaction IDs pinned for reading from checkpoints.
2015-11-27 13:49:33 +11:00
Michael Cahill
ad2500b3d8 SERVER-21553 Enable fast-path truncate after splits.
A truncate operation attempts to mark leaf pages deleted without reading them
into cache.  One of the conditions that has to be met for that fast-path
truncate of pages is that the leaf page not contain overflow items (or we would
need to read it in order to delete the overflow items).

The "no overflow" flag was not being preserved across internal page splits, so
recent changes to splits were defeating fast-path truncation.

Add tracking of the "no overflow" flag for in-memory page addresses so
fast-path truncates work after internal pages are split.
2015-11-27 12:29:17 +11:00
Michael Cahill
982c5862fb Fix some OS X clang warnings.
ext/extractors/csv/csv_extractor.c:155:25: error: implicit conversion loses integer precision: 'long' to 'int' [-Werror,-Wshorten-64-to-32]
src/cursor/cur_join.c:1022:16: error: implicit conversion loses integer precision: 'size_t' (aka 'unsigned long') to 'u_int' (aka 'unsigned int') [-Werror,-Wshorten-64-to-32]
2015-11-27 09:20:25 +11:00
Michael Cahill
8fe7bb1ece WT-2243 Only clear btree cursor flags associated with position.
Previously, we were clearing the new "no transaction" flag every time a cursor was repositioned.
2015-11-26 16:16:18 +11:00
Alex Gorrod
4e1844c6a2 Merge pull request #2324 from wiredtiger/wt-2230-multi-split-error-path
WT-2230: multi-split error path
2015-11-26 15:47:10 +11:00
Alex Gorrod
cace179242 Merge pull request #2320 from wiredtiger/wt-2228-raw-compression
WT-2228: avoid unnecessary raw-compression calls.
2015-11-26 15:42:35 +11:00
Michael Cahill
3b70b692f0 WT-2244 - Trigger in-memory splits sooner.
Specifically, do an in-memory split when we hit 80% of memory_page_max.
2015-11-26 15:26:45 +11:00
Michael Cahill
3f306ce74f WT-2243 Don't keep transaction IDs pinned for reading from checkpoints. 2015-11-26 14:47:53 +11:00
Alex Gorrod
890ee34474 Merge pull request #2336 from wiredtiger/server-21619-dont-split-dead-tree
SERVER-21619 Don't do internal page splits after a tree is marked DEAD.
2015-11-26 12:18:33 +11:00
Michael Cahill
9ecf70c9c0 SERVER-21619 Revert an assertion change. 2015-11-26 11:52:39 +11:00
Michael Cahill
6c7338f2e6 Merge pull request #2337 from wiredtiger/WT-2241
WT-2241 Use a lock to protect transaction ID allocation.
2015-11-26 11:41:03 +11:00
Michael Cahill
354c0314cd SERVER-21619 Push down where we mark pages clean so we don't have to repeat that logic.
Switch a few boolean values from 0/1 to false/true.
2015-11-26 11:09:04 +11:00
Don Anderson
978c237f01 Merge pull request #2335 from wiredtiger/server-21641-join-coverity
WT-2234: Coverity analysis warnings
2015-11-25 09:24:07 -05:00
Keith Bostic
bc1301ad7b WT-2234: Coverity 1339897: Resource leaks (RESOURCE_LEAK) 2015-11-25 08:57:55 -05:00
Keith Bostic
1e094eeee8 Revert "SERVER-21641. Resolve Coverity complaint."
This reverts commit c9907c6289.
2015-11-25 08:54:51 -05:00
Michael Cahill
5a51b154c6 WT-2241 Use a lock to protect transaction ID allocation.
We still need to make sure that transaction IDs are published in the state
table before the current ID is incremented so that snapshot reads don't see
uncommitted updates.  However, a lock simplifies the code and performs better
in testing than the initial fix.
2015-11-25 22:08:59 +11:00
Alex Gorrod
d55a5b1a03 SERVER-21619 Don't do internal page splits after a tree is marked DEAD.
It leads to problems where eviction attempts to write back to a
file after the block manager is already closed.
2015-11-25 16:39:40 +11:00
Don Anderson
c9907c6289 SERVER-21641. Resolve Coverity complaint.
Presumably __wt_config_gets_def's (conditional) reference of cfg[2] means
that cfg strings are de facto required to have at least 3 entries.
2015-11-24 11:29:28 -05:00
Keith Bostic
714ae53068 WT-2230: incomplete change, __wt_free_ref can't reference ref->page,
use ref->home.
2015-11-20 12:56:41 -05:00
Keith Bostic
d66e84a16a WT-2230: comment typo. 2015-11-20 10:52:35 -05:00
Keith Bostic
c3d02dc409 WT-2230: The split_multi_inmem_fail function calls wt_free_ref with
ref->page as its page argument, which is used by wt_free_ref to
check the page type and then free row-store keys instantiated in
the WT_REF.  That's an error, and we'd drop core because ref->page
is freed by wt_free_ref before the page type is checked.

Instead, pass in a page type to resolve questions about WT_REF.key,
and use WT_REF.home to resolve questions about WT_REF.addr.
2015-11-20 10:39:47 -05:00
Keith Bostic
801fcc687c WT-2228: The WiredTiger reconciliation process calls raw compression
(for example, if the engine is configured with zlib), even when the data
chunk is smaller than the allocation size and compression is known to
be a waste of time. When the allocation size is a significant percentage
of the maximum block size, this can be half or more of the calls to raw
compression.
2015-11-19 10:20:50 -05:00
43 changed files with 663 additions and 363 deletions

231
NEWS
View File

@@ -1,3 +1,228 @@
WiredTiger release 2.7.0, 2015-12-08
------------------------------------
The WiredTiger 2.7.0 release contains new features, minor API changes and bug
fixes.
New features and API changes; refer to the API documentation for full details:
* 959376c WT-147: Create indexes on non-empty tables.
* 4368d39 WT-1315: Add an implementation of cursor joins via a new WT_SESSION::join API.
* 944ccd1 WT-1350: Add a new configuration option to ::wiredtiger_open and
WT_CONNECTION::reconfigure called "eviction_dirty_trigger" that causes eviction to start evicting
dirty pages from cache once the given threshold has been reached.
* ab5a8fb WT-1728: Add a WT_SESSION::reset method to release resources held by a session.
* 263c5b7 WT-1930: Allow setting "file_manager=(close_idle_time=0)" to ::wiredtiger_open and
WT_CONNECTION::reconfigure to disable closing idle handles.
* 6310c3f WT-1959: Change verify to distinguish between warnings and errors. Add a new strict mode
to verify that causes warnings to be reported as errors. Use strict mode to match earlier
behavior. See the upgrading documentation for more information.
* e0d6229 WT-1980: Add a new "metadata:create" URI to WT_SESSION::open_cursor for metadata cursors
that return strings useful for passing to WT_SESSION::create.
* 292712e WT-2065: Add a new configuration option to ::wiredtiger_open and
WT_CONNECTION::reconfigure called "shared_cache=(quota)" that limits the amount of shared cache a
participant can be assigned.
* 4d0ebf4 WT-2104: Add a method to flush log files via a new WT_SESSION::log_flush API. Made
WT_SESSION::commit_transaction configuration options match WT_SESSION::log_flush. Change the
default WT_SESSION::transaction_sync timeout to 20 minutes rather than infinity.
* 21b8330 WT-2151: Enhance logging configuration to allow reconfiguration and add a new
"log=(zero_fill)" configuration option that causes WiredTiger to zero-fill log files on creation.
* 368b307 WT-2200: Add a new configuration option to ::wiredtiger_open called "write_through" that
causes WiredTiger to specify the FILE_FLAG_WRITE_THROUGH on Windows when writing files (default
false, including when "direct_io" is configured).
* 08c0fcd WT-2217: After a successful call to WT_CURSOR::insert, the key and value will be
cleared from the cursor. See the upgrading documentation for more information.
* d4fc69a SERVER-17078: Add a "statistics=(size)" mode to statistics cursors, which allows for
retrieving file size only.
* b83b901 SERVER-18356: Changed the handling of the "config_base" option to ::wiredtiger_open. See
upgrading documentation for more information.
The following statistics were removed:
* f1ed3b9 WT-1481: connection dhandles swept.
* f1ed3b9 WT-1481: connection candidate referenced.
* 4ba4518 WT-1481: failed to find a slot large enough for record.
* 28563af WT-1989: log buffer size increases.
* f81c70d WT-1989: slots selected for switching that were unavailable.
* df4f69c WT-2094: log records written directly.
* df4f69c WT-2094: record size exceeded maximum.
* d68e078 WT-2182: pages split during eviction.
Lookaside table:
* 6a5a461 WT-1967: Allow eviction of updates required by old readers.
* 87592ec WT-2074: Fix a race between lookaside table reconciliation and checkpoints.
* 0390b29 WT-2149: Fix the order of creation of the lookaside table.
* 7518a69 WT-2190: Fix transaction visibility test that is applied to the lookaside table.
* 2cf57a6 SERVER-21585: Don't use the lookaside file until the cache is stuck full.
Issues fixed in MongoDB:
* d57dc26 SERVER-18829: Have pages start in the middle of the LRU queue for eviction.
* b847ccc SERVER-18838: During drops, don't remove files until the metadata is durable.
* 8f7da9a SERVER-18875: Clean up deleted pages.
* d04083d SERVER-18899: Add unit test to simulate fsyncLock.
* 3ec45a7 SERVER-19340: Avoid type aliasing in the random number generator.
* 907c0ca SERVER-19445: Have the oldest transaction update the oldest tracked ID.
* fb8739f SERVER-19522: Try to evict internal pages with no useful child pages.
* 4545a8b SERVER-19573: Change row-store inserts to avoid page locking.
* b52d2d3 SERVER-19751: Retry pthread_create on EAGAIN or EINTR.
* 46b4ad5 SERVER-19954: Don't scan tracked handles during checkpoints.
* 65abd20 SERVER-19989: Add a write barrier before data handles are added to shared lists.
* 3e46e79 SERVER-19990: Don't assert on eviction of live updates from dead trees.
* 38dad39 SERVER-20008: Don't reset eviction walks when hitting a busy page.
* 3b72361 SERVER-20159: Make all readers wait while the cache is full.
* 8be547b SERVER-20193: Fix obsolete transaction check.
* ad56c6a SERVER-20303: Tune in-memory splits when inserting large objects.
* 7505a02 SERVER-20385: Make WT_CURSOR::next(random) more random.
* 35d46c3 SERVER-21027: Reverse split if there are many deleted pages.
* a6da10e SERVER-21553: Enable fast-path truncate after splits.
* 890ee34 SERVER-21619: Don't do internal page splits after a tree is marked DEAD.
* 0e93d60 SERVER-21691: Avoid insert stalls.
Other noteworthy changes since the previous release:
* bc2aa57 WT-1744: Throttle worker threads based on eviction targets.
* 55a989e WT-1845: Allow read only transactions to commit after failure.
* df625dc WT-1869: Avoid doing in memory splits while checkpointing a tree.
* ddac54f WT-1942: Add atomic implementations for PPC64 architecture.
* 3866fa6 WT-1962: Make the hot_backup_lock a read/write lock.
* 58f9e99 WT-1963: Fix backup cursor Java API.
* 4e0fe59 WT-1964: Fix a bug in the Java API when closing handles from a different thread.
* 60e2150 WT-1966: Change how the shared cache assigns priority to participants.
* 76d2e73 WT-1975: Ensure previous log files are complete for forced sync.
* e43b22a WT-1977: Improve performance of getting snapshots with many sessions.
* 5eaf63e WT-1978: Better checking and tests for index cursor comparison.
* 1602a4b WT-1981: Fix a signed 32-bit integer unpacking bug.
* cd1704d WT-1982: Fix a bug where cached overflow items were freed too early.
* 57a9f38 WT-1985: Integer packing and other fixes for Python and Java.
* 9897eb2 WT-1986: Fix a race renaming temporary log files.
* b10bff9 WT-1989: Improve scalability of log writes.
* f8dc12b WT-1996: Fix a bug where we would free the first update during a page rewrite on error.
* 144a383 WT-1998: Fixes for indexes with some rarely used key/value formats.
* 8af8b8a WT-2002: Fix a bug in verify where it would panic when encountering a corrupted file.
* e1d8bc7 WT-2007: Statically allocate log slot buffers to a maximum size.
* 911158c WT-2008: Fix a bug in recovery where a file create went missing.
* 3e2e7e6 WT-2009: Apply tracked metadata operations post-commit.
* 1255cb2 WT-2012: Fix a bug updating the oldest ID.
* ef9d56f WT-2013: Add gcc asm definitions for ARM64.
* c8633e6 WT-2014: Fix a bug in checkpoints where files could be flushed in the wrong order.
* 9b09e69 WT-2015: Fix a bug in error handling during block open.
* 4938b8d WT-2017: Once an eviction server thread is started keep it running.
* 298f86c WT-2019: Fix a logic bug tracking the maximum transaction ID in clean trees.
* 7d6075c WT-2020: Clarify checksum error failure messages.
* 7b302d3 WT-2021: Fix a bug moving the oldest ID forward (introduced by WT-1967).
* 9df72d7 WT-2022: Fix a bug not releasing a handle when opening a non-existent index cursor.
* 81ffc2d WT-2023: Improve locking primitives: simplify read-write lock operations.
* 6b84722 WT-2029: Improve scalability of statistics.
* f97cfe9 WT-2031: Log slot revamp.
* bee11c3 WT-2032: Improve next_random cursors to work with small trees.
* cf53696 WT-2034: Improve shared cache balancing algorithm.
* aee1c94 WT-2035: For index cursors, keep track of which column groups need to be positioned.
* 36310d4 WT-2036: Make handle sweeps more robust.
* c948fbb WT-2037: Only write a checkpoint to the log on close if it wasn't.
* e25e615 WT-2038: Avoid long scans holding the handle list lock.
* 75a4655 WT-2039: Add error check and unit test for log records over 4 GB.
* 5ab26af WT-2042: Only try to evict tombstones that are visible to all readers.
* ce223ac WT-2045: Don't let the eviction server do slow reconciliation, it can stall eviction.
* 6665618 WT-2046: Add a statistic for search restarts.
* 98b4a28 WT-2047: Fix a bug in the random generator code to handle an uninitialized state.
* 258e2e1 WT-2050: Show size with memory allocation errors.
* 2e1471c WT-2053: Fix a bug in disk verify messages.
* e316e61 WT-2056: Reorder btree cursor close so stats are maintained correctly.
* 70f9100 WT-2057: Remove the verbose configuration when writing the base configuration file.
* 41b6fb8 WT-2058: Fix an alignment bug in the mutex and log-slot code.
* d72012b WT-2059: Include non-aggregated stats in cursor results.
* 3e0c7bf WT-2062: Try harder to make progress on in-memory splits.
* 66757f7 WT-2064: Don't spin indefinitely waiting for the handle list lock in eviction.
* 8f42f02 WT-2066: Update the oldest transaction ID from eviction.
* e167592 WT-2068: Protect discarding handles with the handle list lock.
* fd72a09 WT-2075: Fix a hang in logging with parallel workload.
* 11c0fa0 WT-2078: Fix a bug in error handling with statistics cursors.
* 9734d85 WT-2081: Make verify progress reporting less verbose.
* 6008b41 WT-2085: Run some of the log_server threads operations more frequently.
* 39a69ec WT-2086: Add a statistic to track when eviction finds a page that can be split.
* 334e103 WT-2089: Relax restrictions on multiblock eviction and in-memory splits.
* f13b788 WT-2090: Fix a bug in the Windows OS layer that swallowed error returns.
* 83b8db7 WT-2092: Free log condition variables after all threads are joined.
* d9391c0 WT-2093: Use the C99 bool type to clarify when functions return true/false.
* f883d27 WT-2094: Eliminate direct write and record unbuffered log records.
* 9008260 WT-2097: Reintroduce immediate waits when forced eviction is necessary.
* ff1da28 WT-2100: Rename evict to evict_queue so it's easier to search for.
* 41db2ee WT-2101: Don't update the logging ckpt_lsn on clean shutdown.
* e1d6886 WT-2102: Fix a hang in log slot join when forcing log writes.
* 0e96683 WT-2105: Fix a bug where we could reference an invalid memory address if a file is
corrupted on disk.
* 6a565bc WT-2108: Rework in-memory page rewrite support (WT_PM_REC_REWRITE).
* dcb0ddb WT-2114: Make application eviction fairer.
* 10c2f15 WT-2115: Don't skip truncated pages that are part of a checkpoint.
* cd6ce97 WT-2116: Add diagnostic checks for stuck cache and dump the state.
* 51cf672 WT-2119: Don't evict clean multiblock pages with overflow items during checkpoints.
* 346ad40 WT-2126: Clean up if there is an error during splits.
* 6831485 WT-2127: Deepen the tree more regularly to avoid wide internal pages.
* a0b5d2b WT-2128: When decoding huffman encoding during salvage it's possible to have fewer bits
than the symbol length during decoding, if the value has been corrupted.
* 79f74e5 WT-2131: Switch to using a lock to control page splits to avoid starvation.
* 02a3d9f WT-2132: Make debug dump function more robust to errors.
* 8c223e4 WT-2134: Flush all buffered log records in log_flush.
* d1b5e7f WT-2135: Fix log_only setting for backup cursor. Fix initialization.
* aab8101 WT-2137: Check the sync_lsn is in the correct file before moving it forward.
* 323af84 WT-2139: Fix a transaction visibility bug in read-uncommitted transactions.
* 751c628 WT-2146: Improve performance when searching for short keys.
* 62998ce WT-2148: Fix a compiler warning in encoding functions.
* 6c16fdd WT-2153: Fix bug. Now we always need to start the log_server thread.
* 6a5fca3 WT-2154: Make btree dump safer.
* 0d74bc6 WT-2155: Remove last use of F_CAS_ATOMIC and the associated macro.
* cc42bda WT-2156: Allow eviction workers to restart.
* bf1d359 WT-2157: Fix a bug where a failed page split could lead to incomplete checkpoints.
* ce9d265 WT-2159: Don't check the config twice in one path.
* 544f27d WT-2162: Add null pointer check, needed after an index is dropped.
* 0d85ebe WT-2164: Prevent another LSM chunk checkpoint while the first is still in progress.
* a81aae8 WT-2165: Stop using FALLOC_FL_KEEP_SIZE flag when pre-allocating files.
* 2865a76 WT-2167: Switch recovery to using an internal session.
* 5d4c952 WT-2170: Protect the turtle file with a lock.
* 497b744 WT-2174: Avoid the table list lock when creating a size only statistics cursor.
* fdfa804 WT-2178: In-memory storage engine support.
* b9bd01f WT-2179: Added decorator to mark txn13 as part of the --long test suite.
* be544dd WT-2180: Remove cursor.{search,search-near,remove} key size validation.
* be412b5 WT-2182: When internal pages grow large enough, split them into their parents.
* c27e78e WT-2184: Fix log scan bug when final record has many trailing zeros.
* 9584be3 WT-2185: Don't do reverse splits when closing a file.
* f6b12d3 WT-2187: Add flag for flushing a slot.
* a4545bf WT-2189: Update flag set and clear macros to be less error prone.
* 30ab327 WT-2191: In-memory disk image no longer the same as saved updates.
* 4ba5698 WT-2192: Fix the logic around checking whether internal page is evictable.
* 2f0b3e2 WT-2193: Handle read-committed metadata checkpoints during snapshot transactions.
* 9b1febc WT-2194: Java close callbacks should handle cursors that Java code did not open.
* 438f455 WT-2195: Fix a hang after giving up on a reverse split.
* ff27fe9 WT-2196: Fix error handling in size only statistics.
* 0a1ee34 WT-2199: Fix transaction sync inconsistency.
* 2ff1fd6 WT-2203: Release an allocated page on error.
* 3b3cf2a WT-2204: Don't take a local copy of page->modify until we know the page is dirty.
* 179d4d0 WT-2206: Change cache operations from flags to an enumeration.
* 82514ca WT-2207: Track whenever a session has a handle exclusive.
* 78bd4ac WT-2210: Raw compression fails if row-store recovery precedes column-store recovery.
* c360d53 WT-2212: Add a "use_environment" config to ::wiredtiger_open.
* a72ddb7 WT-2218: Add truncate stats.
* ce8c091 WT-2219: Enhancements to in-memory testing.
* e2f1130 WT-2220: Update time comparison macros.
* 59857f9 WT-2222: Add statistics for named snapshots.
* fb9cebe WT-2224: Track which deleted refs are discarded by a split.
* cace179 WT-2228: Avoid unnecessary raw-compression calls.
* 0a52a80 WT-2237: Have threads publish unique transaction IDs so that updates always become
visible immediately on commit.
* 6c7338f WT-2241: Use a lock to protect transaction ID allocation.
* 39dfd21 WT-2243: Don't keep transaction IDs pinned for reading from checkpoints.
* 4c49948 WT-2244: Trigger in-memory splits sooner.
* 9f2e4f3 WT-2248: WT_SESSION::close is updating WT_CONNECTION_IMPL.default_session.
* 264ec21 WT-2249: Keep eviction stuck until cache usage is under 100%.
* dca1411 WT-2250: Minor fix. Use SET instead of increment for stat.
* e731ef8 WT-2251: Free addresses when we discard deleted page references.
* 4fc3e39 WT-2253: Evict pages left behind by in-memory splits.
* 2df5658 WT-2257: Fixes when given multiple thread workload configurations.
* 4c49043 WT-2260: Avoid adding internal pages to the eviction queue.
WiredTiger release 2.6.1, 2015-05-13
------------------------------------
@@ -255,7 +480,7 @@ API and behavior changes:
* Update configuration string parsing to always be case sensitive. See
upgrading documentation for more information.
* Change the statistics cursor WT_CURSOR.reset method to re-load statistics
* Change the statistics cursor WT_CURSOR::reset method to re-load statistics
values. See upgrading documentation for more information.
refs WT-1533
@@ -465,7 +690,7 @@ New features and API changes:
See API documentation for more information.
refs #1381
* Add a new WT_SESSION.strerror method, a thread-safe alternative to
* Add a new WT_SESSION::strerror method, a thread-safe alternative to
::wiredtiger_strerror.
refs #1516
@@ -1271,7 +1496,7 @@ This is primarily a bugfix and performance tuning release. The main changes are:
* The default behavior of the wt utility's load command has been changed to
overwrite existing data.
* Add a WT_SESSION.create prefix_compression_min configuration option with a
* Add a WT_SESSION::create prefix_compression_min configuration option with a
default value of 4. [#624] and [#624]
* Fix "make install" of Python API. [#598]

2
README
View File

@@ -1,4 +1,4 @@
WiredTiger 2.7.0: (November 19, 2015)
WiredTiger 2.7.0: (December 8, 2015)
This is version 2.7.0 of WiredTiger.

View File

@@ -46,7 +46,6 @@ static void config_opt_usage(void);
#define STRING_MATCH(str, bytes, len) \
(strncmp(str, bytes, len) == 0 && (str)[(len)] == '\0')
/*
* config_assign --
* Assign the src config to the dest, any storage allocated in dest is
@@ -181,6 +180,16 @@ config_threads(CONFIG *cfg, const char *config, size_t len)
int ret;
group = scan = NULL;
if (cfg->workload != NULL) {
/*
* This call overrides an earlier call. Free and
* reset everything.
*/
free(cfg->workload);
cfg->workload = NULL;
cfg->workload_cnt = 0;
cfg->workers_cnt = 0;
}
/* Allocate the workload array. */
if ((cfg->workload = calloc(WORKLOAD_MAX, sizeof(WORKLOAD))) == NULL)
return (enomem(cfg));
@@ -201,7 +210,7 @@ config_threads(CONFIG *cfg, const char *config, size_t len)
if ((ret = wiredtiger_config_parser_open(
NULL, groupk.str, groupk.len, &scan)) != 0)
goto err;
/* Move to the next workload slot. */
if (cfg->workload_cnt == WORKLOAD_MAX) {
fprintf(stderr,
@@ -308,7 +317,7 @@ err: if (group != NULL)
(void)group->close(group);
if (scan != NULL)
(void)scan->close(scan);
fprintf(stderr,
"invalid thread configuration or scan error: %.*s\n",
(int)len, config);
@@ -677,7 +686,7 @@ config_print(CONFIG *cfg)
for (i = 0, workp = cfg->workload;
i < cfg->workload_cnt; ++i, ++workp)
printf("\t\t%" PRId64 " threads (inserts=%" PRId64
", reads=%" PRId64 ", updates=%" PRId64
", reads=%" PRId64 ", updates=%" PRId64
", truncates=% " PRId64 ")\n",
workp->threads,
workp->insert, workp->read,

View File

@@ -8,4 +8,4 @@ run_time=500
populate_threads=1
# Setup three threads to insert into the oplog
# Setup one thread to be doing truncates from the oplog
threads=((count=3,inserts=1,throttle=2000),(count=1,truncate=1,truncate_pct=10,truncate_count=50000))
threads=((count=3,inserts=1,throttle=4000),(count=1,truncate=1,truncate_pct=10,truncate_count=50000))

View File

@@ -12,16 +12,27 @@
# This script should be invoked with the pathname of the wtperf test
# config to run and the number of runs.
#
if test "$#" -ne "2"; then
if test "$#" -lt "2"; then
echo "Must specify wtperf test to run and number of runs"
exit 1
fi
wttest=$1
runmax=$2
# Jenkins removes the quotes from the passed in arg so we may
# have 3 or 4 args.
wtarg=""
wtarg2=""
if test "$#" -gt "2"; then
wtarg=$3
if test "$#" -eq "4"; then
wtarg2=$4
fi
fi
home=./WT_TEST
outfile=./wtperf.out
rm -f $outfile
echo "Parsed $# args: test: $wttest runmax: $runmax args: $wtarg $wtarg2" >> $outfile
# Each of these has an entry for each op in ops below.
avg=(0 0 0 0)
@@ -77,7 +88,7 @@ run=1
while test "$run" -le "$runmax"; do
rm -rf $home
mkdir $home
LD_PRELOAD=/usr/lib64/libjemalloc.so.1 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib ./wtperf -O $wttest
LD_PRELOAD=/usr/lib64/libjemalloc.so.1 LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib ./wtperf -O $wttest $wtarg $wtarg2
if test "$?" -ne "0"; then
exit 1
fi

View File

@@ -1534,8 +1534,10 @@ execute_workload(CONFIG *cfg)
lprintf(cfg, 0, 1,
"Starting workload #%d: %" PRId64 " threads, inserts=%"
PRId64 ", reads=%" PRId64 ", updates=%" PRId64
", truncate=%" PRId64, i + 1, workp->threads, workp->insert,
workp->read, workp->update, workp->truncate);
", truncate=%" PRId64 ", throttle=%" PRId64,
i + 1, workp->threads, workp->insert,
workp->read, workp->update, workp->truncate,
workp->throttle);
/* Figure out the workload's schedule. */
if ((ret = run_mix_schedule(cfg, workp)) != 0)
@@ -1906,7 +1908,7 @@ start_run(CONFIG *cfg)
monitor_created = ret = 0;
/* [-Wconditional-uninitialized] */
memset(&monitor_thread, 0, sizeof(monitor_thread));
if ((ret = setup_log_file(cfg)) != 0)
goto err;
@@ -2427,6 +2429,11 @@ worker_throttle(int64_t throttle, int64_t *ops, struct timespec *interval)
if (usecs_to_complete < USEC_PER_SEC)
(void)usleep((useconds_t)(USEC_PER_SEC - usecs_to_complete));
/*
* After sleeping, set the interval to the current time.
*/
if (__wt_epoch(NULL, &now) != 0)
return;
*ops = 0;
*interval = now;
}

View File

@@ -116,6 +116,7 @@ struct __truncate_struct {
uint64_t last_total_inserts;
uint64_t num_stones;
uint64_t last_key;
uint64_t catchup_multiplier;
};
/* Queue entry for use with the Truncate Logic */

View File

@@ -164,8 +164,8 @@ DEF_OPT_AS_STRING(threads, "", "workload configuration: each 'count' "
"'update' entries are the ratios of insert, read and update operations "
"done by each worker thread; If a throttle value is provided each thread "
"will do a maximum of that number of operations per second; multiple "
"workload configurations may be "
"specified; for example, a more complex threads configuration might be "
"workload configurations may be specified per threads configuration; "
"for example, a more complex threads configuration might be "
"'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' "
"which would create 2 threads doing nothing but reads and 8 threads "
"each doing 50% inserts and 25% reads and updates. Allowed configuration "

View File

@@ -54,6 +54,12 @@ setup_truncate(CONFIG *cfg, CONFIG_THREAD *thread, WT_SESSION *session) {
session, cfg->uris[0], NULL, NULL, &cursor)) != 0)
goto err;
/*
* If we find the workload getting behind we multiply the number of
* records to be truncated.
*/
trunc_cfg->catchup_multiplier = 1;
/* How many entries between each stone. */
trunc_cfg->stone_gap =
(workload->truncate_count * workload->truncate_pct) / 100;
@@ -133,6 +139,7 @@ run_truncate(CONFIG *cfg, CONFIG_THREAD *thread,
TRUNCATE_QUEUE_ENTRY *truncate_item;
char *truncate_key;
int ret, t_ret;
uint64_t used_stone_gap;
ret = 0;
trunc_cfg = &thread->trunc_cfg;
@@ -145,11 +152,32 @@ run_truncate(CONFIG *cfg, CONFIG_THREAD *thread,
trunc_cfg->last_total_inserts = trunc_cfg->total_inserts;
/* We are done if there isn't enough data to trigger a new milestone. */
if (trunc_cfg->expected_total <= trunc_cfg->needed_stones)
if (trunc_cfg->expected_total <= thread->workload->truncate_count)
return (0);
/*
* If we are falling behind and using more than one stone per lap we
* should widen the stone gap for this lap to try and catch up quicker.
*/
if (trunc_cfg->expected_total >
thread->workload->truncate_count + trunc_cfg->stone_gap) {
/*
* Increase the multiplier until we create stones that are
* almost large enough to truncate the whole expected table size
* in one operation.
*/
trunc_cfg->catchup_multiplier =
WT_MIN(trunc_cfg->catchup_multiplier + 1,
trunc_cfg->needed_stones - 1);
} else {
/* Back off if we start seeing an improvement */
trunc_cfg->catchup_multiplier =
WT_MAX(trunc_cfg->catchup_multiplier - 1, 1);
}
used_stone_gap = trunc_cfg->stone_gap * trunc_cfg->catchup_multiplier;
while (trunc_cfg->num_stones < trunc_cfg->needed_stones) {
trunc_cfg->last_key += trunc_cfg->stone_gap;
trunc_cfg->last_key += used_stone_gap;
truncate_key = calloc(cfg->key_sz, 1);
if (truncate_key == NULL) {
lprintf(cfg, ENOMEM, 0,
@@ -165,7 +193,7 @@ run_truncate(CONFIG *cfg, CONFIG_THREAD *thread,
}
generate_key(cfg, truncate_key, trunc_cfg->last_key);
truncate_item->key = truncate_key;
truncate_item->diff = trunc_cfg->stone_gap;
truncate_item->diff = used_stone_gap;
TAILQ_INSERT_TAIL(&cfg->stone_head, truncate_item, q);
trunc_cfg->num_stones++;
}
@@ -189,7 +217,6 @@ run_truncate(CONFIG *cfg, CONFIG_THREAD *thread,
goto err;
}
*truncatedp = 1;
trunc_cfg->expected_total -= truncate_item->diff;

View File

@@ -3,7 +3,7 @@ dnl build by dist/s_version
VERSION_MAJOR=2
VERSION_MINOR=7
VERSION_PATCH=0
VERSION_STRING='"WiredTiger 2.7.0: (November 19, 2015)"'
VERSION_STRING='"WiredTiger 2.7.0: (December 8, 2015)"'
AC_SUBST(VERSION_MAJOR)
AC_SUBST(VERSION_MINOR)

3
dist/s_docs vendored
View File

@@ -22,7 +22,8 @@ changelog()
(echo "WiredTiger Change Log"
echo "====================="
echo
cat ../NEWS) > ../src/docs/changelog.md
sed -e 's, \([0-9a-f]\{7\}\) , [\1](https://github.com/wiredtiger/wiredtiger/commit/\1) ,g' \
-e 's,\(\(WT\|SERVER\)-[0-9]*\),[\1](https://jira.mongodb.org/browse/\1),g' ../NEWS) > ../src/docs/changelog.md
}
wtperf_config()

3
dist/s_string vendored
View File

@@ -30,7 +30,8 @@ replace() {
# check:
# Check the spelling of an individual file.
check() {
aspell --lang=en $1 list < ../$2 |
# Strip out git hashes, which are seven character hex strings.
sed 's/ [0-9a-f]\{7\} / /g' ../$2 | aspell --lang=en $1 list |
sort -u |
comm -23 /dev/stdin s_string.ok > $t
test -s $t && {

5
dist/s_string.ok vendored
View File

@@ -102,6 +102,7 @@ Encryptor
Encryptors
Enqueue
Eron
FALLOC
FALLTHROUGH
FH
FLD
@@ -224,6 +225,7 @@ Obama
Outfmt
PARAM
POSIX
PPC
PREDEFINE
PRIu
PRNG
@@ -358,6 +360,7 @@ arg
argc
args
argv
asm
async
asyncopp
asyncops
@@ -594,6 +597,7 @@ free'd
fscanf
fstat
fsync
fsyncLock
fsyncs
ftruncate
func
@@ -876,6 +880,7 @@ runtime
rwlock
sH
sHQ
scalability
sched
scr
sd

2
dist/s_whitespace vendored
View File

@@ -32,7 +32,7 @@ for f in `find dist -name '*.py' -name 's_*'`; do
done
# C-language sources.
for f in `find examples ext src test \
for f in `find bench examples ext src test \
-name '*.[chi]' -o \
-name '*.dox' -o \
-name '*.in' -o \

View File

@@ -97,8 +97,10 @@ csv_extract(WT_EXTRACTOR *extractor, WT_SESSION *session,
strncpy(copy, p, len);
copy[len] = '\0';
if (csv_extractor->format_isnum) {
if ((val = atoi(copy)) < 0)
if ((val = atoi(copy)) < 0) {
free(copy);
return (EINVAL);
}
result_cursor->set_key(result_cursor, val);
} else
result_cursor->set_key(result_cursor, copy);
@@ -150,7 +152,7 @@ csv_customize(WT_EXTRACTOR *extractor, WT_SESSION *session,
return (errno);
*csv_extractor = *orig;
csv_extractor->field = field_num;
csv_extractor->field = (int)field_num;
csv_extractor->format_isnum = (format.str[0] == 'i');
*customp = (WT_EXTRACTOR *)csv_extractor;
return (0);

View File

@@ -77,7 +77,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
}
(void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1);
ret = __wt_evict(session, ref, 0);
ret = __wt_evict(session, ref, false);
(void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1);
WT_RET_BUSY_OK(ret);
}
@@ -99,25 +99,18 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
/*
* We cannot fast-delete pages that have overflow key/value items as
* the overflow blocks have to be discarded. The way we figure that
* out is to check the on-page cell type for the page, cells for leaf
* pages that have no overflow items are special.
*
* In some cases, the reference address may not reference an on-page
* cell (for example, some combination of page splits), in which case
* we can't check the original cell value and we fail.
* out is to check the page's cell type, cells for leaf pages without
* overflow items are special.
*
* To look at an on-page cell, we need to look at the parent page, and
* that's dangerous, our parent page could change without warning if
* the parent page were to split, deepening the tree. It's safe: the
* page's reference will always point to some valid page, and if we find
* any problems we simply fail the fast-delete optimization.
*
* !!!
* I doubt it's worth the effort, but we could copy the cell's type into
* the reference structure, and then we wouldn't need an on-page cell.
*/
parent = ref->home;
if (__wt_off_page(parent, ref->addr) ||
if (__wt_off_page(parent, ref->addr) ?
((WT_ADDR *)ref->addr)->type != WT_ADDR_LEAF_NO :
__wt_cell_type_raw(ref->addr) != WT_CELL_ADDR_LEAF_NO)
goto err;

View File

@@ -50,15 +50,18 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep)
page = *pagep;
*pagep = NULL;
if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
__wt_page_modify_clear(session, page);
/*
* We should never discard ...
* We should never discard:
* - a dirty page,
* - a page queued for eviction, or
* - a locked page.
*/
WT_ASSERT( /* ... a dirty page */
session, !__wt_page_is_modified(page));
WT_ASSERT( /* ... a page queued for LRU eviction */
session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU));
WT_ASSERT( /* ... a locked page */
session, !__wt_fair_islocked(session, &page->page_lock));
WT_ASSERT(session, !__wt_page_is_modified(page));
WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU));
WT_ASSERT(session, !__wt_fair_islocked(session, &page->page_lock));
#ifdef HAVE_DIAGNOSTIC
{
@@ -227,7 +230,7 @@ __free_page_int(WT_SESSION_IMPL *session, WT_PAGE *page)
*/
void
__wt_free_ref(
WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref, bool free_pages)
WT_SESSION_IMPL *session, WT_REF *ref, int page_type, bool free_pages)
{
WT_IKEY *ikey;
@@ -246,8 +249,15 @@ __wt_free_ref(
__wt_page_out(session, &ref->page);
}
/* Free any key allocation. */
switch (page->type) {
/*
* Optionally free row-store WT_REF key allocation. Historic versions of
* this code looked in a passed-in page argument, but that is dangerous,
* some of our error-path callers create WT_REF structures without ever
* setting WT_REF.home or having a parent page to which the WT_REF will
* be linked. Those WT_REF structures invariably have instantiated keys,
* (they obviously cannot be on-page keys), and we must free the memory.
*/
switch (page_type) {
case WT_PAGE_ROW_INT:
case WT_PAGE_ROW_LEAF:
if ((ikey = __wt_ref_key_instantiated(ref)) != NULL)
@@ -255,11 +265,11 @@ __wt_free_ref(
break;
}
/* Free any address allocation. */
if (ref->addr != NULL && __wt_off_page(page, ref->addr)) {
__wt_free(session, ((WT_ADDR *)ref->addr)->addr);
__wt_free(session, ref->addr);
}
/*
* Free any address allocation; if there's no linked WT_REF page, it
* must be allocated.
*/
__wt_ref_addr_free(session, ref);
/* Free any page-deleted information. */
if (ref->page_del != NULL) {
@@ -272,7 +282,7 @@ __wt_free_ref(
/*
* __wt_free_ref_index --
* Discard a page index and it's references.
* Discard a page index and its references.
*/
void
__wt_free_ref_index(WT_SESSION_IMPL *session,
@@ -284,7 +294,8 @@ __wt_free_ref_index(WT_SESSION_IMPL *session,
return;
for (i = 0; i < pindex->entries; ++i)
__wt_free_ref(session, page, pindex->index[i], free_pages);
__wt_free_ref(
session, pindex->index[i], page->type, free_pages);
__wt_free(session, pindex);
}

View File

@@ -696,6 +696,13 @@ __btree_page_sizes(WT_SESSION_IMPL *session)
WT_MIN(btree->maxmempage, cache_size / 4);
}
/*
* Try in-memory splits once we hit 80% of the maximum in-memory page
* size. This gives multi-threaded append workloads a better chance of
* not stalling.
*/
btree->splitmempage = 8 * btree->maxmempage / 10;
/*
* Get the split percentage (reconciliation splits pages into smaller
* than the maximum page size chunks so we don't split every time a

View File

@@ -307,10 +307,6 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref)
btree = S2BT(session);
page = ref->page;
/* Pages are usually small enough, check that first. */
if (page->memory_footprint < btree->maxmempage)
return (0);
/* Leaf pages only. */
if (WT_PAGE_IS_INTERNAL(page))
return (0);
@@ -322,6 +318,12 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref)
if (page->modify == NULL)
return (0);
/* Pages are usually small enough, check that first. */
if (page->memory_footprint < btree->splitmempage)
return (0);
else if (page->memory_footprint < btree->maxmempage)
return (__wt_leaf_page_can_split(session, page));
/* Trigger eviction on the next page release. */
__wt_page_evict_soon(page);
@@ -329,7 +331,7 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref)
__wt_txn_update_oldest(session, false);
/* If eviction cannot succeed, don't try. */
return (__wt_page_can_evict(session, ref, true, NULL));
return (__wt_page_can_evict(session, ref, NULL));
}
/*

View File

@@ -326,7 +326,7 @@ __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[])
*/
if (ss->root_ref.page != NULL) {
btree->ckpt = ckptbase;
ret = __wt_evict(session, &ss->root_ref, 1);
ret = __wt_evict(session, &ss->root_ref, true);
ss->root_ref.page = NULL;
btree->ckpt = NULL;
}
@@ -1290,9 +1290,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref)
* would have been lost.) Clear the reference addr so eviction doesn't
* free the underlying blocks.
*/
__wt_free(session, ((WT_ADDR *)ref->addr)->addr);
__wt_free(session, ref->addr);
ref->addr = NULL;
__wt_ref_addr_free(session, ref);
/* Write the new version of the leaf page to disk. */
WT_ERR(__slvg_modify_init(session, page));
@@ -1304,7 +1302,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref)
ret = __wt_page_release(session, ref, 0);
if (ret == 0)
ret = __wt_evict(session, ref, 1);
ret = __wt_evict(session, ref, true);
if (0) {
err: WT_TRET(__wt_page_release(session, ref, 0));
@@ -2013,9 +2011,7 @@ __slvg_row_build_leaf(
* would have been lost.) Clear the reference addr so eviction doesn't
* free the underlying blocks.
*/
__wt_free(session, ((WT_ADDR *)ref->addr)->addr);
__wt_free(session, ref->addr);
ref->addr = NULL;
__wt_ref_addr_free(session, ref);
/* Write the new version of the leaf page to disk. */
WT_ERR(__slvg_modify_init(session, page));
@@ -2030,7 +2026,7 @@ __slvg_row_build_leaf(
*/
ret = __wt_page_release(session, ref, 0);
if (ret == 0)
ret = __wt_evict(session, ref, 1);
ret = __wt_evict(session, ref, true);
if (0) {
err: WT_TRET(__wt_page_release(session, ref, 0));

View File

@@ -340,8 +340,18 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
return (ret);
}
addr->size = (uint8_t)unpack.size;
addr->type =
unpack.raw == WT_CELL_ADDR_INT ? WT_ADDR_INT : WT_ADDR_LEAF;
switch (unpack.raw) {
case WT_CELL_ADDR_INT:
addr->type = WT_ADDR_INT;
break;
case WT_CELL_ADDR_LEAF:
addr->type = WT_ADDR_LEAF;
break;
case WT_CELL_ADDR_LEAF_NO:
addr->type = WT_ADDR_LEAF_NO;
break;
WT_ILLEGAL_VALUE(session);
}
ref->addr = addr;
}
@@ -399,17 +409,8 @@ __split_ref_move_final(
WT_DECL_RET;
WT_PAGE *child;
WT_REF *ref, *child_ref;
uint64_t txn_new_id;
uint32_t i;
/*
* When creating new internal pages as part of a split, we set a field
* in those pages modify structure to prevent them from being evicted
* until all threads are known to have exited the index of the page that
* previously "owned" the WT_REF. Set that field to a safe value.
*/
txn_new_id = __wt_txn_id_alloc(session, false);
/*
* The WT_REF structures moved to newly allocated child pages reference
* the wrong parent page and we have to fix that up. The problem is
@@ -461,8 +462,6 @@ __split_ref_move_final(
if (child_ref->home != child) {
child_ref->home = child;
child_ref->pindex_hint = 0;
child->modify->mod_split_txn = txn_new_id;
}
} WT_INTL_FOREACH_END;
WT_LEAVE_PAGE_INDEX(session);
@@ -896,6 +895,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
*/
WT_ASSERT(session, next_ref->page_del == NULL);
WT_TRET(__wt_ref_block_free(session, next_ref));
WT_TRET(__split_safe_free(
session, split_gen, exclusive, next_ref, sizeof(WT_REF)));
parent_decr += sizeof(WT_REF);
@@ -1183,8 +1183,8 @@ err: /*
* Lock an internal page.
*/
static int
__split_internal_lock(
WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE **parentp, bool *hazardp)
__split_internal_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool trylock,
WT_PAGE **parentp, bool *hazardp)
{
WT_DECL_RET;
WT_PAGE *parent;
@@ -1202,7 +1202,7 @@ __split_internal_lock(
* loop until the exclusive lock is resolved). If we want to split
* the parent, give up to avoid that deadlock.
*/
if (S2BT(session)->checkpointing != WT_CKPT_OFF)
if (!trylock && S2BT(session)->checkpointing != WT_CKPT_OFF)
return (EBUSY);
/*
@@ -1227,7 +1227,10 @@ __split_internal_lock(
if (F_ISSET_ATOMIC(parent, WT_PAGE_SPLIT_BLOCK))
return (EBUSY);
WT_RET(__wt_fair_lock(session, &parent->page_lock));
if (trylock)
WT_RET(__wt_fair_trylock(session, &parent->page_lock));
else
WT_RET(__wt_fair_lock(session, &parent->page_lock));
if (parent == ref->home)
break;
WT_RET(__wt_fair_unlock(session, &parent->page_lock));
@@ -1371,7 +1374,7 @@ __split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard)
* locks, lock-coupling up the tree.
*/
WT_ERR(__split_internal_lock(
session, ref, &parent, &parent_hazard));
session, ref, true, &parent, &parent_hazard));
ret = __split_internal(session, parent, page);
WT_TRET(__split_internal_unlock(session, page, page_hazard));
@@ -1527,7 +1530,7 @@ __split_multi_inmem_final(WT_PAGE *orig, WT_MULTI *multi)
* Discard allocated pages after failure.
*/
static void
__split_multi_inmem_fail(WT_SESSION_IMPL *session, WT_REF *ref)
__split_multi_inmem_fail(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_REF *ref)
{
/*
* We failed creating new in-memory pages. For error-handling reasons,
@@ -1537,7 +1540,7 @@ __split_multi_inmem_fail(WT_SESSION_IMPL *session, WT_REF *ref)
*/
if (ref->page != NULL) {
F_SET_ATOMIC(ref->page, WT_PAGE_UPDATE_IGNORE);
__wt_free_ref(session, ref->page, ref, true);
__wt_free_ref(session, ref, orig->type, true);
}
}
@@ -1635,7 +1638,7 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
*
* Note this page has already been through an in-memory split.
*/
WT_ASSERT(session, __wt_page_can_split(session, page));
WT_ASSERT(session, __wt_leaf_page_can_split(session, page));
WT_ASSERT(session, __wt_page_is_modified(page));
F_SET_ATOMIC(page, WT_PAGE_SPLIT_INSERT);
@@ -1668,6 +1671,12 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
child->state = WT_REF_MEM;
child->addr = ref->addr;
/*
* The address has moved to the replacement WT_REF. Make sure it isn't
* freed when the original ref is discarded.
*/
ref->addr = NULL;
/*
* Copy the first key from the original page into first ref in the new
* parent. Pages created in memory always have a "smallest" insert
@@ -1817,13 +1826,6 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
WT_ASSERT(session, ins != moved_ins);
#endif
/*
* Save the transaction ID when the split happened. Application
* threads will not try to forcibly evict the page again until
* all concurrent transactions commit.
*/
page->modify->inmem_split_txn = __wt_txn_id_alloc(session, false);
/*
* Update the page accounting.
*
@@ -1864,6 +1866,11 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
return (0);
err: if (split_ref[0] != NULL) {
/*
* The address was moved to the replacement WT_REF, restore it.
*/
ref->addr = split_ref[0]->addr;
__wt_free(session, split_ref[0]->key.ikey);
__wt_free(session, split_ref[0]);
}
@@ -1891,7 +1898,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
WT_RET(__wt_verbose(
session, WT_VERB_SPLIT, "%p: split-insert", ref->page));
WT_RET(__split_internal_lock(session, ref, &parent, &hazard));
WT_RET(__split_internal_lock(session, ref, true, &parent, &hazard));
if ((ret = __split_insert(session, ref)) != 0) {
WT_TRET(__split_internal_unlock(session, parent, hazard));
return (ret);
@@ -1962,7 +1969,7 @@ __split_multi(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
if (0) {
err: for (i = 0; i < new_entries; ++i)
__split_multi_inmem_fail(session, ref_new[i]);
__split_multi_inmem_fail(session, page, ref_new[i]);
}
__wt_free(session, ref_new);
@@ -1983,7 +1990,7 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
WT_RET(__wt_verbose(
session, WT_VERB_SPLIT, "%p: split-multi", ref->page));
WT_RET(__split_internal_lock(session, ref, &parent, &hazard));
WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard));
if ((ret = __split_multi(session, ref, closing)) != 0 || closing) {
WT_TRET(__split_internal_unlock(session, parent, hazard));
return (ret);
@@ -2012,7 +2019,7 @@ __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
WT_RET(__wt_verbose(
session, WT_VERB_SPLIT, "%p: reverse-split", ref->page));
WT_RET(__split_internal_lock(session, ref, &parent, &hazard));
WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard));
ret = __split_parent(session, ref, NULL, 0, 0, false, true);
WT_TRET(__split_internal_unlock(session, parent, hazard));
return (ret);
@@ -2072,6 +2079,6 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref)
return (0);
err: __split_multi_inmem_fail(session, &new);
err: __split_multi_inmem_fail(session, page, &new);
return (ret);
}

View File

@@ -244,7 +244,8 @@ ascend: /*
* If we see any child states other than deleted, the
* page isn't empty.
*/
if (ref->state != WT_REF_DELETED)
if (ref->state != WT_REF_DELETED &&
!LF_ISSET(WT_READ_TRUNCATE))
empty_internal = false;
if (LF_ISSET(WT_READ_CACHE)) {
@@ -270,6 +271,7 @@ ascend: /*
WT_ERR(__wt_delete_page(session, ref, &skip));
if (skip)
break;
empty_internal = false;
} else if (LF_ISSET(WT_READ_COMPACT)) {
/*
* Skip deleted pages, rewriting them doesn't

View File

@@ -379,7 +379,7 @@ __curfile_close(WT_CURSOR *cursor)
* updated correctly.
*/
if (session->dhandle != NULL) {
/* Increment the data-source's in-use counter. */
/* Decrement the data-source's in-use counter. */
__wt_cursor_dhandle_decr_use(session);
WT_TRET(__wt_session_release_btree(session));
}
@@ -439,6 +439,9 @@ __wt_curfile_create(WT_SESSION_IMPL *session,
cursor->value_format = btree->value_format;
cbt->btree = btree;
if (session->dhandle->checkpoint != NULL)
F_SET(cbt, WT_CBT_NO_TXN);
if (bulk) {
F_SET(cursor, WT_CURSTD_BULK);

View File

@@ -6,12 +6,12 @@ WiredTiger is an high performance, scalable, production quality, NoSQL,
@section releases Releases
<table>
@row{<b>WiredTiger 2.7.0</b> (previous),
<a href="releases/wiredtiger-2.7.0.tar.bz2"><b>[Release package]</b></a>,
<a href="2.7.0/index.html"><b>[Documentation]</b></a>}
@row{<b>WiredTiger 2.6.1</b> (current),
<a href="releases/wiredtiger-2.6.1.tar.bz2"><b>[Release package]</b></a>,
<a href="2.6.1/index.html"><b>[Documentation]</b></a>}
@row{<b>WiredTiger 2.5.3</b> (previous),
<a href="releases/wiredtiger-2.5.3.tar.bz2"><b>[Release package]</b></a>,
<a href="2.5.3/index.html"><b>[Documentation]</b></a>}
@row{<b>Development branch</b>,
<a href="https://github.com/wiredtiger/wiredtiger"><b>[Source code]</b></a>,
<a href="develop/index.html"><b>[Documentation]</b></a>}

View File

@@ -230,8 +230,9 @@ threads, and the 'insert', 'read' and 'update' entries are the ratios
of insert, read and update operations done by each worker thread; If a
throttle value is provided each thread will do a maximum of that
number of operations per second; multiple workload configurations may
be specified; for example, a more complex threads configuration might
be 'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))'
be specified per threads configuration; for example, a more complex
threads configuration might be
'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))'
which would create 2 threads doing nothing but reads and 8 threads
each doing 50% inserts and 25% reads and updates. Allowed
configuration values are 'count', 'throttle', 'reads', 'inserts',

View File

@@ -76,22 +76,16 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
/*
* Evict the page.
*/
WT_ERR(__wt_evict(session, ref, 1));
WT_ERR(__wt_evict(session, ref, true));
break;
case WT_SYNC_DISCARD:
/*
* Dead handles may reference dirty pages; clean the
* page, both to keep statistics correct, and to let
* the page-discard function assert no dirty page is
* ever discarded.
* Discard the page regardless of whether it is dirty.
*/
if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
__wt_page_modify_clear(session, page);
WT_ASSERT(session,
F_ISSET(session->dhandle, WT_DHANDLE_DEAD) ||
__wt_page_can_evict(session, ref, false, NULL));
__wt_evict_page_clean_update(session, ref, 1);
__wt_page_can_evict(session, ref, NULL));
__wt_evict_page_clean_update(session, ref, true);
break;
WT_ILLEGAL_VALUE_ERR(session);
}

View File

@@ -36,6 +36,10 @@ __evict_read_gen(const WT_EVICT_ENTRY *entry)
page = entry->ref->page;
/* Any page set to the oldest generation should be discarded. */
if (page->read_gen == WT_READGEN_OLDEST)
return (WT_READGEN_OLDEST);
/* Any empty page (leaf or internal) is a good choice. */
if (__wt_page_is_empty(page))
return (WT_READGEN_OLDEST);
@@ -159,7 +163,8 @@ __evict_server(void *arg)
WT_DECL_RET;
WT_SESSION_IMPL *session;
#ifdef HAVE_DIAGNOSTIC
struct timespec now, stuck_ts = { 0, 0 };
struct timespec now, stuck_ts;
uint64_t pages_evicted = 0;
#endif
u_int spins;
@@ -204,10 +209,11 @@ __evict_server(void *arg)
/* Next time we wake up, reverse the sweep direction. */
cache->flags ^= WT_CACHE_WALK_REVERSE;
#ifdef HAVE_DIAGNOSTIC
stuck_ts.tv_sec = 0;
} else if (stuck_ts.tv_sec == 0)
pages_evicted = 0;
} else if (pages_evicted != cache->pages_evict) {
WT_ERR(__wt_epoch(session, &stuck_ts));
else {
pages_evicted = cache->pages_evict;
} else {
/* After being stuck for 5 minutes, give up. */
WT_ERR(__wt_epoch(session, &now));
if (WT_TIMEDIFF_SEC(now, stuck_ts) > 300) {
@@ -466,6 +472,15 @@ __evict_update_work(WT_SESSION_IMPL *session)
if (!F_ISSET(conn, WT_CONN_EVICTION_RUN))
return (false);
/*
* Set up the number of refs to consider in each handle, depending
* on how many handles are open. We want to consider fewer candidates
* from each file as more files are open. Handle the case where there
* are no files open by adding 1.
*/
cache->evict_max_refs_per_file =
WT_MAX(100, WT_MILLION / (conn->open_file_count + 1));
/*
* Page eviction overrides the dirty target and other types of eviction,
* that is, we don't care where we are with respect to the dirty target
@@ -481,6 +496,13 @@ __evict_update_work(WT_SESSION_IMPL *session)
goto done;
}
/*
* If the cache has been stuck and is now under control, clear the
* stuck flag.
*/
if (bytes_inuse < bytes_max)
F_CLR(cache, WT_CACHE_STUCK);
dirty_inuse = __wt_cache_dirty_inuse(cache);
if (dirty_inuse > (cache->eviction_dirty_target * bytes_max) / 100) {
FLD_SET(cache->state, WT_EVICT_PASS_DIRTY);
@@ -498,6 +520,7 @@ __evict_update_work(WT_SESSION_IMPL *session)
F_CLR(cache, WT_CACHE_WOULD_BLOCK);
goto done;
}
return (false);
done: if (F_ISSET(cache, WT_CACHE_STUCK))
@@ -1169,7 +1192,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp)
uint64_t pages_walked;
uint32_t walk_flags;
int internal_pages, restarts;
bool enough, modified;
bool enough, modified, would_split;
conn = S2C(session);
btree = S2BT(session);
@@ -1202,7 +1225,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp)
evict < end && !enough && (ret == 0 || ret == WT_NOTFOUND);
ret = __wt_tree_walk(
session, &btree->evict_ref, &pages_walked, walk_flags)) {
enough = pages_walked > WT_EVICT_MAX_PER_FILE;
enough = pages_walked > cache->evict_max_refs_per_file;
if ((ref = btree->evict_ref) == NULL) {
if (++restarts == 2 || enough)
break;
@@ -1237,6 +1260,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp)
* eviction, skip anything that isn't marked.
*/
if (FLD_ISSET(cache->state, WT_EVICT_PASS_WOULD_BLOCK) &&
page->memory_footprint < btree->splitmempage &&
page->read_gen != WT_READGEN_OLDEST)
continue;
@@ -1254,9 +1278,15 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp)
page->read_gen = __wt_cache_read_gen_new(session);
fast: /* If the page can't be evicted, give up. */
if (!__wt_page_can_evict(session, ref, true, NULL))
if (!__wt_page_can_evict(session, ref, &would_split))
continue;
/*
* Note: take care with ordering: if we detected that
* the page is modified above, we expect mod != NULL.
*/
mod = page->modify;
/*
* Additional tests if eviction is likely to succeed.
*
@@ -1269,12 +1299,6 @@ fast: /* If the page can't be evicted, give up. */
*/
if (!FLD_ISSET(cache->state,
WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK)) {
/*
* Note: take care with ordering: if we detected that
* the page is modified above, we expect mod != NULL.
*/
mod = page->modify;
/*
* If the page is clean but has modifications that
* appear too new to evict, skip it.
@@ -1282,19 +1306,6 @@ fast: /* If the page can't be evicted, give up. */
if (!modified && mod != NULL &&
!__wt_txn_visible_all(session, mod->rec_max_txn))
continue;
/*
* If the oldest transaction hasn't changed since the
* last time this page was written, it's unlikely we
* can make progress. Similarly, if the most recent
* update on the page is not yet globally visible,
* eviction will fail. These heuristics attempt to
* avoid repeated attempts to evict the same page.
*/
if (modified &&
(mod->disk_snap_min == conn->txn_global.oldest_id ||
!__wt_txn_visible_all(session, mod->update_txn)))
continue;
}
WT_ASSERT(session, evict->ref == NULL);
@@ -1419,7 +1430,6 @@ static int
__evict_page(WT_SESSION_IMPL *session, bool is_server)
{
WT_BTREE *btree;
WT_CACHE *cache;
WT_DECL_RET;
WT_PAGE *page;
WT_REF *ref;
@@ -1454,26 +1464,10 @@ __evict_page(WT_SESSION_IMPL *session, bool is_server)
if (page->read_gen != WT_READGEN_OLDEST)
page->read_gen = __wt_cache_read_gen_bump(session);
/*
* If we are evicting in a dead tree, don't write dirty pages.
*
* Force pages clean to keep statistics correct and to let the
* page-discard function assert that no dirty pages are ever
* discarded.
*/
if (F_ISSET(btree->dhandle, WT_DHANDLE_DEAD))
__wt_page_modify_clear(session, page);
WT_WITH_BTREE(session, btree, ret = __wt_evict(session, ref, 0));
WT_WITH_BTREE(session, btree, ret = __wt_evict(session, ref, false));
(void)__wt_atomic_subv32(&btree->evict_busy, 1);
WT_RET(ret);
cache = S2C(session)->cache;
if (F_ISSET(cache, WT_CACHE_STUCK))
F_CLR(cache, WT_CACHE_STUCK);
return (ret);
}
@@ -1617,8 +1611,8 @@ __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile)
next_walk = NULL;
session->dhandle = dhandle;
while (__wt_tree_walk(session,
&next_walk, NULL, WT_READ_CACHE | WT_READ_NO_WAIT) == 0 &&
while (__wt_tree_walk(session, &next_walk, NULL,
WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT) == 0 &&
next_walk != NULL) {
page = next_walk->page;
size = page->memory_footprint;

View File

@@ -55,7 +55,7 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
WT_DECL_RET;
WT_PAGE *page;
WT_PAGE_MODIFY *mod;
bool forced_eviction, inmem_split;
bool clean_page, forced_eviction, inmem_split, tree_dead;
conn = S2C(session);
@@ -65,6 +65,7 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
page = ref->page;
forced_eviction = page->read_gen == WT_READGEN_OLDEST;
inmem_split = false;
tree_dead = F_ISSET(session->dhandle, WT_DHANDLE_DEAD);
WT_RET(__wt_verbose(session, WT_VERB_EVICT,
"page %p (%s)", page, __wt_page_type_string(page->type)));
@@ -105,24 +106,26 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
if (page->memory_footprint > conn->cache->evict_max_page_size)
conn->cache->evict_max_page_size = page->memory_footprint;
/* Update the reference and discard the page. */
if ((mod == NULL || mod->rec_result == 0) &&
!F_ISSET(conn, WT_CONN_IN_MEMORY)) {
if (__wt_ref_is_root(ref))
__wt_ref_out(session, ref);
else
WT_ERR(__wt_evict_page_clean_update(
session, ref, closing));
/* Figure out whether reconciliation was done on the page */
clean_page = mod == NULL || mod->rec_result == 0;
/* Update the reference and discard the page. */
if (__wt_ref_is_root(ref))
__wt_ref_out(session, ref);
else if (tree_dead || (clean_page && !F_ISSET(conn, WT_CONN_IN_MEMORY)))
/*
* Pages that belong to dead trees never write back to disk
* and can't support page splits.
*/
WT_ERR(__wt_evict_page_clean_update(
session, ref, tree_dead || closing));
else
WT_ERR(__evict_page_dirty_update(session, ref, closing));
if (clean_page) {
WT_STAT_FAST_CONN_INCR(session, cache_eviction_clean);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_clean);
} else {
if (__wt_ref_is_root(ref))
__wt_ref_out(session, ref);
else
WT_ERR(__evict_page_dirty_update(
session, ref, closing));
WT_STAT_FAST_CONN_INCR(session, cache_eviction_dirty);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_dirty);
}
@@ -238,20 +241,14 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
{
WT_ADDR *addr;
WT_DECL_RET;
WT_PAGE *parent;
WT_PAGE_MODIFY *mod;
parent = ref->home;
mod = ref->page->modify;
WT_ASSERT(session, ref->addr == NULL);
switch (mod->rec_result) {
case WT_PM_REC_EMPTY: /* Page is empty */
/* Discard the parent's address. */
if (ref->addr != NULL && __wt_off_page(parent, ref->addr)) {
__wt_free(session, ((WT_ADDR *)ref->addr)->addr);
__wt_free(session, ref->addr);
}
/*
* Update the parent to reference a deleted page. The fact that
* reconciliation left the page "empty" means there's no older
@@ -304,12 +301,6 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
if (!closing && __wt_eviction_dirty_target(session))
return (EBUSY);
/* Discard the parent's address. */
if (ref->addr != NULL && __wt_off_page(parent, ref->addr)) {
__wt_free(session, ((WT_ADDR *)ref->addr)->addr);
__wt_free(session, ref->addr);
}
/*
* Update the parent to reference the replacement page.
*
@@ -399,6 +390,13 @@ __evict_review(
WT_RET(ret);
}
/*
* It is always OK to evict pages from dead trees if they don't have
* children.
*/
if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
return (0);
/*
* Retrieve the modified state of the page. This must happen after the
* check for evictable internal pages otherwise there is a race where a
@@ -424,7 +422,7 @@ __evict_review(
if (modified)
__wt_txn_update_oldest(session, true);
if (!__wt_page_can_evict(session, ref, false, inmem_splitp))
if (!__wt_page_can_evict(session, ref, inmem_splitp))
return (EBUSY);
/*

View File

@@ -198,20 +198,9 @@ struct __wt_ovfl_txnc {
* When a page is modified, there's additional information to maintain.
*/
struct __wt_page_modify {
/*
* Track the highest transaction ID at which the page was written to
* disk. This can be used to avoid trying to write the page multiple
* times if a snapshot is keeping old versions pinned (e.g., in a
* checkpoint).
*/
uint64_t disk_snap_min;
/* The first unwritten transaction ID (approximate). */
uint64_t first_dirty_txn;
/* In-memory split transaction ID. */
uint64_t inmem_split_txn;
/* Avoid checking for obsolete updates during checkpoints. */
uint64_t obsolete_check_txn;
@@ -221,10 +210,8 @@ struct __wt_page_modify {
/* The largest update transaction ID (approximate). */
uint64_t update_txn;
#ifdef HAVE_DIAGNOSTIC
/* Check that transaction time moves forward. */
uint64_t last_oldest_id;
#endif
/* Dirty bytes added to the cache. */
size_t bytes_dirty;
@@ -313,17 +300,8 @@ struct __wt_page_modify {
* so they can be discarded when no longer needed.
*/
WT_PAGE *root_split; /* Linked list of root split pages */
/*
* When we deepen the tree, newly created internal pages cannot
* be evicted until all threads have exited the original page
* index structure. We set a transaction value during the split
* that's checked during eviction.
*/
uint64_t split_txn; /* Split eviction transaction value */
} intl;
#define mod_root_split u2.intl.root_split
#define mod_split_txn u2.intl.split_txn
struct {
/*
* Appended items to column-stores: there is only a single one

View File

@@ -88,7 +88,8 @@ struct __wt_btree {
uint32_t maxleafpage; /* Leaf page max size */
uint32_t maxleafkey; /* Leaf page max key size */
uint32_t maxleafvalue; /* Leaf page max value size */
uint64_t maxmempage; /* In memory page max size */
uint64_t maxmempage; /* In-memory page max size */
uint64_t splitmempage; /* In-memory split trigger size */
void *huffman_key; /* Key huffman encoding */
void *huffman_value; /* Value huffman encoding */

View File

@@ -37,6 +37,23 @@ __wt_page_is_modified(WT_PAGE *page)
return (page->modify != NULL && page->modify->write_gen != 0);
}
/*
 * __wt_btree_block_free --
 *	Free a block from the session's current tree by passing the block's
 *	address and address size to the free method of that tree's block
 *	manager. Returns whatever the block manager's free method returns.
 */
static inline int
__wt_btree_block_free(
    WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
{
	WT_BM *block_mgr;

	/* The block manager handle is reached through the session's btree. */
	block_mgr = S2BT(session)->bm;

	return (block_mgr->free(block_mgr, session, addr, addr_size));
}
/*
* __wt_cache_page_inmem_incr --
* Increment a page's memory footprint in the cache.
@@ -330,6 +347,8 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
{
uint64_t last_running;
WT_ASSERT(session, !F_ISSET(session->dhandle, WT_DHANDLE_DEAD));
last_running = 0;
if (page->modify->write_gen == 0)
last_running = S2C(session)->txn_global.last_running;
@@ -346,13 +365,6 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
if (__wt_atomic_add32(&page->modify->write_gen, 1) == 1) {
__wt_cache_dirty_incr(session, page);
/*
* The page can never end up with changes older than the oldest
* running transaction.
*/
if (F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT))
page->modify->disk_snap_min = session->txn.snap_min;
/*
* We won the race to dirty the page, but another thread could
* have committed in the meantime, and the last_running field
@@ -470,6 +482,23 @@ __wt_off_page(WT_PAGE *page, const void *p)
p >= (void *)((uint8_t *)page->dsk + page->dsk->mem_size));
}
/*
 * __wt_ref_addr_free --
 *	Discard the address held by a reference, if there is one. The
 *	address memory is freed only when the reference has no home page or
 *	__wt_off_page reports the address as off the home page; in every
 *	case the reference's address pointer is cleared.
 */
static inline void
__wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref)
{
	bool release;

	if (ref->addr != NULL) {
		release =
		    ref->home == NULL || __wt_off_page(ref->home, ref->addr);
		if (release) {
			/* Free the nested address first, then its holder. */
			__wt_free(session, ((WT_ADDR *)ref->addr)->addr);
			__wt_free(session, ref->addr);
		}
		ref->addr = NULL;
	}
}
/*
* __wt_ref_key --
* Return a reference to a row-store internal page key as cheaply as
@@ -968,11 +997,32 @@ __wt_ref_info(WT_SESSION_IMPL *session,
}
/*
* __wt_page_can_split --
* __wt_ref_block_free --
* Free the on-disk block for a reference and clear the address.
*/
static inline int
__wt_ref_block_free(WT_SESSION_IMPL *session, WT_REF *ref)
{
	size_t block_addr_len;
	const uint8_t *block_addr;

	/* A reference without an address has no block to release. */
	if (ref->addr != NULL) {
		/* Look up the block's address, then free that block. */
		WT_RET(__wt_ref_info(
		    session, ref, &block_addr, &block_addr_len, NULL));
		WT_RET(__wt_btree_block_free(
		    session, block_addr, block_addr_len));

		/* Drop the address so the block can't be freed twice. */
		__wt_ref_addr_free(session, ref);
	}
	return (0);
}
/*
* __wt_leaf_page_can_split --
* Check whether a page can be split in memory.
*/
static inline bool
__wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
__wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
{
WT_BTREE *btree;
WT_INSERT_HEAD *ins_head;
@@ -1003,7 +1053,7 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
* reconciliation will be wrong, so we can't evict immediately).
*/
if (page->type != WT_PAGE_ROW_LEAF ||
page->memory_footprint < btree->maxmempage ||
page->memory_footprint < btree->splitmempage ||
!__wt_page_is_modified(page))
return (false);
@@ -1046,13 +1096,12 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
* Check whether a page can be evicted.
*/
static inline bool
__wt_page_can_evict(WT_SESSION_IMPL *session,
WT_REF *ref, bool check_splits, bool *inmem_splitp)
__wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp)
{
WT_BTREE *btree;
WT_PAGE *page;
WT_PAGE_MODIFY *mod;
WT_TXN_GLOBAL *txn_global;
bool modified;
if (inmem_splitp != NULL)
*inmem_splitp = false;
@@ -1071,20 +1120,21 @@ __wt_page_can_evict(WT_SESSION_IMPL *session,
* detailed eviction tests. We don't need further tests since the page
* won't be written or discarded from the cache.
*/
if (__wt_page_can_split(session, page)) {
if (__wt_leaf_page_can_split(session, page)) {
if (inmem_splitp != NULL)
*inmem_splitp = true;
return (true);
}
modified = __wt_page_is_modified(page);
/*
* If the file is being checkpointed, we can't evict dirty pages:
* if we write a page and free the previous version of the page, that
* previous version might be referenced by an internal page that has already
* been written in the checkpoint, leaving the checkpoint inconsistent.
*/
if (btree->checkpointing != WT_CKPT_OFF &&
__wt_page_is_modified(page)) {
if (btree->checkpointing != WT_CKPT_OFF && modified) {
WT_STAT_FAST_CONN_INCR(session, cache_eviction_checkpoint);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_checkpoint);
return (false);
@@ -1105,28 +1155,24 @@ __wt_page_can_evict(WT_SESSION_IMPL *session,
* pages cannot be evicted until all threads are known to have exited
* the original parent page's index, because evicting an internal page
* discards its WT_REF array, and a thread traversing the original
* parent page index might see a freed WT_REF. During the split we set
* a transaction value, we can evict the created page as soon as that
* transaction value is globally visible.
* parent page index might see a freed WT_REF.
*/
if (check_splits && WT_PAGE_IS_INTERNAL(page) &&
(F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK) ||
!__wt_txn_visible_all(session, mod->mod_split_txn)))
if (WT_PAGE_IS_INTERNAL(page) &&
F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK))
return (false);
/*
* If the page was recently split in-memory, don't evict it immediately:
* we want to give application threads that are appending a chance to
* move to the new leaf page created by the split.
*
* Note the check here is similar to __wt_txn_visible_all, but ignores
* the checkpoint's transaction.
* If the oldest transaction hasn't changed since the last time
* this page was written, it's unlikely we can make progress.
* Similarly, if the most recent update on the page is not yet
* globally visible, eviction will fail. These heuristics
* attempt to avoid repeated attempts to evict the same page.
*/
if (check_splits) {
txn_global = &S2C(session)->txn_global;
if (WT_TXNID_LE(txn_global->oldest_id, mod->inmem_split_txn))
return (false);
}
if (modified &&
!F_ISSET(S2C(session)->cache, WT_CACHE_STUCK) &&
(mod->last_oldest_id == __wt_txn_oldest_id(session) ||
!__wt_txn_visible_all(session, mod->update_txn)))
return (false);
return (true);
}
@@ -1162,7 +1208,7 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref)
(void)__wt_atomic_addv32(&btree->evict_busy, 1);
too_big = page->memory_footprint > btree->maxmempage;
if ((ret = __wt_evict(session, ref, 0)) == 0) {
if ((ret = __wt_evict(session, ref, false)) == 0) {
if (too_big)
WT_STAT_FAST_CONN_INCR(session, cache_eviction_force);
else
@@ -1221,7 +1267,7 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
LF_ISSET(WT_READ_NO_EVICT) ||
F_ISSET(session, WT_SESSION_NO_EVICTION) ||
F_ISSET(btree, WT_BTREE_NO_EVICTION) ||
!__wt_page_can_evict(session, ref, true, NULL))
!__wt_page_can_evict(session, ref, NULL))
return (__wt_hazard_clear(session, page));
WT_RET_BUSY_OK(__wt_page_release_evict(session, ref));

View File

@@ -14,7 +14,6 @@
pages by this many increments of the
read generation. */
#define WT_EVICT_WALK_PER_FILE 10 /* Pages to queue per file */
#define WT_EVICT_MAX_PER_FILE 100 /* Max pages to visit per file */
#define WT_EVICT_WALK_BASE 300 /* Pages tracked across file visits */
#define WT_EVICT_WALK_INCR 100 /* Pages added each walk */
@@ -107,6 +106,7 @@ struct __wt_cache {
uint32_t evict_slots; /* LRU list eviction slots */
WT_DATA_HANDLE
*evict_file_next; /* LRU next file to search */
uint32_t evict_max_refs_per_file;/* LRU pages per file per pass */
/*
* Cache pool information.

View File

@@ -197,7 +197,14 @@ struct __wt_cursor_btree {
#define WT_CBT_ITERATE_NEXT 0x04 /* Next iteration configuration */
#define WT_CBT_ITERATE_PREV 0x08 /* Prev iteration configuration */
#define WT_CBT_MAX_RECORD 0x10 /* Col-store: past end-of-table */
#define WT_CBT_SEARCH_SMALLEST 0x20 /* Row-store: small-key insert list */
#define WT_CBT_NO_TXN 0x20 /* Non-transactional cursor
(e.g. on a checkpoint) */
#define WT_CBT_SEARCH_SMALLEST 0x40 /* Row-store: small-key insert list */
#define WT_CBT_POSITION_MASK /* Flags associated with position */ \
(WT_CBT_ITERATE_APPEND | WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV | \
WT_CBT_MAX_RECORD | WT_CBT_SEARCH_SMALLEST)
uint8_t flags;
};
@@ -302,7 +309,7 @@ struct __wt_cursor_join_entry {
WT_CURSOR_JOIN_ENDPOINT *ends; /* reference endpoints */
size_t ends_allocated;
size_t ends_next;
u_int ends_next;
WT_JOIN_STATS stats; /* Join statistics */
};

View File

@@ -41,11 +41,7 @@ __cursor_pos_clear(WT_CURSOR_BTREE *cbt)
cbt->cip_saved = NULL;
cbt->rip_saved = NULL;
/*
* Don't clear the active flag, it's owned by the cursor enter/leave
* functions.
*/
F_CLR(cbt, ~WT_CBT_ACTIVE);
F_CLR(cbt, WT_CBT_POSITION_MASK);
}
/*
@@ -93,7 +89,8 @@ __curfile_enter(WT_CURSOR_BTREE *cbt)
session = (WT_SESSION_IMPL *)cbt->iface.session;
WT_RET(__cursor_enter(session));
if (!F_ISSET(cbt, WT_CBT_NO_TXN))
WT_RET(__cursor_enter(session));
F_SET(cbt, WT_CBT_ACTIVE);
return (0);
}
@@ -112,7 +109,8 @@ __curfile_leave(WT_CURSOR_BTREE *cbt)
/* If the cursor was active, deactivate it. */
if (F_ISSET(cbt, WT_CBT_ACTIVE)) {
__cursor_leave(session);
if (!F_ISSET(cbt, WT_CBT_NO_TXN))
__cursor_leave(session);
F_CLR(cbt, WT_CBT_ACTIVE);
}
@@ -204,7 +202,7 @@ err: return (ret);
/*
* __wt_cursor_dhandle_incr_use --
* Increment the in-use counter in cursor's data source.
* Increment the in-use counter in the cursor's data source.
*/
static inline void
__wt_cursor_dhandle_incr_use(WT_SESSION_IMPL *session)
@@ -221,7 +219,7 @@ __wt_cursor_dhandle_incr_use(WT_SESSION_IMPL *session)
/*
* __wt_cursor_dhandle_decr_use --
* Decrement the in-use counter in cursor's data source.
* Decrement the in-use counter in the cursor's data source.
*/
static inline void
__wt_cursor_dhandle_decr_use(WT_SESSION_IMPL *session)
@@ -262,7 +260,13 @@ __cursor_func_init(WT_CURSOR_BTREE *cbt, bool reenter)
if (!F_ISSET(cbt, WT_CBT_ACTIVE))
WT_RET(__curfile_enter(cbt));
__wt_txn_cursor_op(session);
/*
* If this is an ordinary transactional cursor, make sure we are set up
* to read.
*/
if (!F_ISSET(cbt, WT_CBT_NO_TXN))
__wt_txn_cursor_op(session);
return (0);
}

View File

@@ -122,7 +122,7 @@ extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool vi
extern int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref);
extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref);
extern void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep);
extern void __wt_free_ref( WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref, bool free_pages);
extern void __wt_free_ref( WT_SESSION_IMPL *session, WT_REF *ref, int page_type, bool free_pages);
extern void __wt_free_ref_index(WT_SESSION_IMPL *session, WT_PAGE *page, WT_PAGE_INDEX *pindex, bool free_pages);
extern void __wt_free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd);
extern int __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]);

View File

@@ -70,7 +70,7 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_txn_state {
};
struct __wt_txn_global {
uint64_t alloc; /* Transaction ID to allocate. */
WT_SPINLOCK id_lock;
volatile uint64_t current; /* Current transaction ID. */
/* The oldest running transaction ID (may race). */

View File

@@ -323,7 +323,6 @@ __wt_txn_id_alloc(WT_SESSION_IMPL *session, bool publish)
{
WT_TXN_GLOBAL *txn_global;
uint64_t id;
u_int i;
txn_global = &S2C(session)->txn_global;
@@ -350,20 +349,16 @@ __wt_txn_id_alloc(WT_SESSION_IMPL *session, bool publish)
* global current ID, so we want post-increment semantics. Our atomic
* add primitive does pre-increment, so adjust the result here.
*/
id = __wt_atomic_addv64(&S2C(session)->txn_global.alloc, 1) - 1;
__wt_spin_lock(session, &txn_global->id_lock);
id = txn_global->current;
if (publish) {
session->txn.id = id;
WT_SESSION_TXN_STATE(session)->id = id;
WT_PUBLISH(WT_SESSION_TXN_STATE(session)->id, id);
}
for (i = 0; txn_global->current != id; i++)
if (i < 100)
WT_PAUSE();
else
__wt_yield();
WT_PUBLISH(txn_global->current, id + 1);
++txn_global->current;
__wt_spin_unlock(session, &txn_global->id_lock);
return (id);
}

View File

@@ -293,7 +293,7 @@ __wt_log_slot_init(WT_SESSION_IMPL *session)
&log->slot_pool[i].slot_buf, log->slot_buf_size));
F_SET(&log->slot_pool[i], WT_SLOT_INIT_FLAGS);
}
WT_STAT_FAST_CONN_INCRV(session,
WT_STAT_FAST_CONN_SET(session,
log_buffer_size, log->slot_buf_size * WT_SLOT_POOL);
/*
* Set up the available slot from the pool the first time.

View File

@@ -351,6 +351,7 @@ __wt_reconcile(WT_SESSION_IMPL *session,
WT_PAGE *page;
WT_PAGE_MODIFY *mod;
WT_RECONCILE *r;
uint64_t oldest_id;
page = ref->page;
mod = page->modify;
@@ -361,21 +362,14 @@ __wt_reconcile(WT_SESSION_IMPL *session,
/* We shouldn't get called with a clean page, that's an error. */
WT_ASSERT(session, __wt_page_is_modified(page));
#ifdef HAVE_DIAGNOSTIC
{
/*
* Check that transaction time always moves forward for a given page.
* If this check fails, reconciliation can free something that a future
* reconciliation will need.
*/
uint64_t oldest_id = __wt_txn_oldest_id(session);
oldest_id = __wt_txn_oldest_id(session);
WT_ASSERT(session, WT_TXNID_LE(mod->last_oldest_id, oldest_id));
mod->last_oldest_id = oldest_id;
}
#endif
/* Record the most recent transaction ID we will *not* write. */
mod->disk_snap_min = session->txn.snap_min;
/* Initialize the reconciliation structure for each new run. */
WT_RET(__rec_write_init(
@@ -990,23 +984,6 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, bool destroy)
}
}
/*
 * __rec_block_free --
 *	Free a block by calling the free method of the block manager that
 *	belongs to the session's btree, returning that method's result.
 */
static int
__rec_block_free(
    WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
{
	WT_BM *mgr;

	mgr = S2BT(session)->bm;
	return (mgr->free(mgr, session, addr, addr_size));
}
/*
* __rec_update_save --
* Save a WT_UPDATE list for later restoration.
@@ -1349,8 +1326,6 @@ __rec_child_deleted(WT_SESSION_IMPL *session,
WT_RECONCILE *r, WT_REF *ref, WT_CHILD_STATE *statep)
{
WT_PAGE_DELETED *page_del;
size_t addr_size;
const uint8_t *addr;
page_del = ref->page_del;
@@ -1398,16 +1373,8 @@ __rec_child_deleted(WT_SESSION_IMPL *session,
*/
if (ref->addr != NULL &&
(page_del == NULL ||
__wt_txn_visible_all(session, page_del->txnid))) {
WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
WT_RET(__rec_block_free(session, addr, addr_size));
if (__wt_off_page(ref->home, ref->addr)) {
__wt_free(session, ((WT_ADDR *)ref->addr)->addr);
__wt_free(session, ref->addr);
}
ref->addr = NULL;
}
__wt_txn_visible_all(session, page_del->txnid)))
WT_RET(__wt_ref_block_free(session, ref));
/*
* If the original page is gone, we can skip the slot on the internal
@@ -2944,8 +2911,8 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r)
break;
case SPLIT_TRACKING_RAW:
/*
* We were configured for raw compression, but never actually
* wrote anything.
* We were configured for raw compression, and either we never
* wrote anything, or there's a remaindered block of data.
*/
break;
WT_ILLEGAL_VALUE(session);
@@ -2998,14 +2965,27 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r)
static int
__rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r)
{
/* We're done reconciling - write the final page */
if (r->raw_compression && r->entries != 0) {
while (r->entries != 0)
WT_RET(__rec_split_raw_worker(session, r, 0, true));
} else
WT_RET(__rec_split_finish_std(session, r));
WT_BTREE *btree;
size_t data_size;
return (0);
btree = S2BT(session);
/*
* We're done reconciling, write the final page. Call raw compression
* until/unless there's not enough data to compress.
*/
if (r->raw_compression && r->entries != 0) {
while (r->entries != 0) {
data_size =
WT_PTRDIFF32(r->first_free, r->disk_image.mem);
if (data_size <= btree->allocsize)
break;
WT_RET(__rec_split_raw_worker(session, r, 0, true));
}
if (r->entries == 0)
return (0);
}
return (__rec_split_finish_std(session, r));
}
/*
@@ -5310,7 +5290,7 @@ __rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page)
if (multi->addr.reuse)
multi->addr.addr = NULL;
else {
WT_RET(__rec_block_free(session,
WT_RET(__wt_btree_block_free(session,
multi->addr.addr, multi->addr.size));
__wt_free(session, multi->addr.addr);
}
@@ -5393,8 +5373,6 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
WT_BTREE *btree;
WT_PAGE_MODIFY *mod;
WT_REF *ref;
size_t addr_size;
const uint8_t *addr;
btree = S2BT(session);
bm = btree->bm;
@@ -5419,21 +5397,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
*/
if (__wt_ref_is_root(ref))
break;
if (ref->addr != NULL) {
/*
* Free the page and clear the address (so we don't free
* it twice).
*/
WT_RET(__wt_ref_info(
session, ref, &addr, &addr_size, NULL));
WT_RET(__rec_block_free(session, addr, addr_size));
if (__wt_off_page(ref->home, ref->addr)) {
__wt_free(
session, ((WT_ADDR *)ref->addr)->addr);
__wt_free(session, ref->addr);
}
ref->addr = NULL;
}
WT_RET(__wt_ref_block_free(session, ref));
break;
case WT_PM_REC_EMPTY: /* Page deleted */
break;
@@ -5451,7 +5415,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
* are checkpoints, and must be explicitly dropped.
*/
if (!__wt_ref_is_root(ref))
WT_RET(__rec_block_free(session,
WT_RET(__wt_btree_block_free(session,
mod->mod_replace.addr, mod->mod_replace.size));
/* Discard the replacement page's address. */
@@ -5615,7 +5579,7 @@ __rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
if (bnd->addr.reuse)
bnd->addr.addr = NULL;
else {
WT_TRET(__rec_block_free(session,
WT_TRET(__wt_btree_block_free(session,
bnd->addr.addr, bnd->addr.size));
__wt_free(session, bnd->addr.addr);
}

View File

@@ -206,6 +206,9 @@ __session_close(WT_SESSION *wt_session, const char *config)
__wt_spin_unlock(session, &conn->api_lock);
/* We no longer have a session, don't try to update it. */
session = NULL;
err: API_END_RET_NOTFOUND_MAP(session, ret);
}

View File

@@ -712,9 +712,11 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[])
conn = S2C(session);
txn_global = &conn->txn_global;
txn_global->alloc = txn_global->current =
txn_global->last_running = txn_global->oldest_id = WT_TXN_FIRST;
txn_global->current = txn_global->last_running =
txn_global->oldest_id = WT_TXN_FIRST;
WT_RET(__wt_spin_init(session,
&txn_global->id_lock, "transaction id lock"));
WT_RET(__wt_rwlock_alloc(session,
&txn_global->nsnap_rwlock, "named snapshot lock"));
txn_global->nsnap_oldest_id = WT_TXN_NONE;
@@ -747,6 +749,7 @@ __wt_txn_global_destroy(WT_SESSION_IMPL *session)
if (txn_global == NULL)
return (0);
__wt_spin_destroy(session, &txn_global->id_lock);
WT_TRET(__wt_rwlock_destroy(session, &txn_global->nsnap_rwlock));
__wt_free(session, txn_global->states);

View File

@@ -394,7 +394,7 @@ config_lrt(void)
* stores.
*/
if (g.type == FIX) {
if (config_is_perm("long_running_txn"))
if (g.c_long_running_txn && config_is_perm("long_running_txn"))
die(EINVAL,
"long_running_txn not supported with fixed-length "
"column store");

File diff suppressed because one or more lines are too long